| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-04-07 21:27:03 +0100 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-04-07 21:27:03 +0100 |
| commit | 7be62d3a0ce426a8828e482cbe4917f2b6bef9bb (patch) | |
| tree | 6ae70f8e2a1b5e7f8e34ebbf624d2efd7c568afd | |
| parent | log filename created (diff) | |
| parent | Simplify the code (diff) | |
Merge branch 'master' of https://github.com/GjjvdBurg/arxiv2remarkable
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 3 |
| -rwxr-xr-x | arxiv2remarkable.py | 112 |

2 files changed, 91 insertions, 24 deletions
```diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -5,12 +5,13 @@ following sources:
 
 - an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``)
 - a PubMed Central url (either to the HTML or the PDF)
+- an ACM citation page url (``https://dl.acm.org/citation.cfm?id=...``)
 - a url to a PDF file
 - a local file.
 
 The script takes the source and:
 
-1. Downloads it if necessary
+1. Downloads the pdf if necessary
 2. Removes the arXiv timestamp
 3. Crops the pdf to remove unnecessary borders
 4. Shrinks the pdf file to reduce the filesize
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index ccd1621..a4449f8 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -34,6 +34,8 @@ import urllib.parse
 
 from loguru import logger
 
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
@@ -72,7 +74,14 @@ def arxiv_url(url):
 
 
 def pmc_url(url):
-    m = re.fullmatch("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url)
+    m = re.fullmatch(
+        "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url
+    )
+    return not m is None
+
+
+def acm_url(url):
+    m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url)
     return not m is None
 
 
@@ -122,12 +131,45 @@ def get_pmc_urls(url):
     return pdf_url, abs_url
 
 
+def get_acm_pdf_url(url):
+    page = get_page_with_retry(url)
+    soup = bs4.BeautifulSoup(page, "html.parser")
+    thea = None
+    for a in soup.find_all("a"):
+        if a.get("name") == "FullTextPDF":
+            thea = a
+            break
+    if thea is None:
+        return None
+    href = thea.get("href")
+    if href.startswith("http"):
+        return href
+    else:
+        return "https://dl.acm.org/" + href
+
+
+def get_acm_urls(url):
+    if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url):
+        abs_url = url
+        pdf_url = get_acm_pdf_url(url)
+        if pdf_url is None:
+            exception("Couldn't extract PDF url from ACM citation page.")
+    else:
+        exception(
+            "Couldn't figure out ACM urls, please provide a URL of the "
+            "format: http(s)://dl.acm.org/citation.cfm?id=..."
+        )
+    return pdf_url, abs_url
+
+
 def get_page_with_retry(url):
     """Get the content of an url, retrying up to five times on failure. """
 
     def retry(url, count):
         if count < 5:
-            logger.info("Caught error for url %s. Retrying in 5 seconds." % url)
+            logger.info(
+                "Caught error for url %s. Retrying in 5 seconds." % url
+            )
             time.sleep(5)
         else:
             exception("Failed to download url: %s" % url)
@@ -237,11 +279,13 @@ def shrink_pdf(filepath, gs_path="gs"):
 
 
 def get_paper_info_arxiv(url):
+    """ Extract the paper's authors, title, and publication year """
     logger.info("Getting paper info from arXiv")
     page = get_page_with_retry(url)
     soup = bs4.BeautifulSoup(page, "html.parser")
     authors = [
-        x["content"] for x in soup.find_all("meta", {"name": "citation_author"})
+        x["content"]
+        for x in soup.find_all("meta", {"name": "citation_author"})
     ]
     authors = [x.split(",")[0].strip() for x in authors]
     title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
@@ -270,6 +314,30 @@ def get_paper_info_pmc(url):
     return dict(title=title, date=date, authors=authors)
 
 
+def get_paper_info_acm(url):
+    """ Extract the paper's authors, title, and publication year """
+    logger.info("Getting paper info from ACM")
+    page = get_page_with_retry(url)
+    soup = bs4.BeautifulSoup(page, "html.parser")
+    authors = [
+        x["content"]
+        for x in soup.find_all("meta", {"name": "citation_authors"})
+    ]
+    # We only use last names, and this method is a guess. I'm open to more
+    # advanced approaches.
+    authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")]
+    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+    if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
+        logger.warning(
+            "Couldn't extract year from ACM page, please raise an "
+            "issue on GitHub so I can fix it: %s",
+            GITHUB_URL,
+        )
+    date = date.strip().split("/")[-1]
+    return dict(title=title, date=date, authors=authors)
+
+
 def generate_filename(info):
     """ Generate a nice filename for a paper given the info dict """
     # we assume that the list of authors is lastname only.
@@ -364,9 +432,13 @@ def main():
         mode = "arxiv_url"
     elif pmc_url(args.input):
         mode = "pmc_url"
+    elif acm_url(args.input):
+        mode = "acm_url"
     elif valid_url(args.input):
         if args.filename is None:
-            exception("Filename must be provided with pdf url (use --filename)")
+            exception(
+                "Filename must be provided with pdf url (use --filename)"
+            )
         mode = "pdf_url"
     else:
         exception("Input not a valid url, arxiv url, or existing file.")
@@ -383,33 +455,27 @@ def main():
         clean_filename = args.filename if args.filename else filename
 
     os.chdir(working_dir)
-    if mode == "arxiv_url":
-        pdf_url, abs_url = get_arxiv_urls(args.input)
+    if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]:
         filename = "paper.pdf"
-        download_url(pdf_url, filename)
-        if args.filename:
-            clean_filename = args.filename
-        else:
+        if mode == "arxiv_url":
+            pdf_url, abs_url = get_arxiv_urls(args.input)
             paper_info = get_paper_info_arxiv(abs_url)
-            clean_filename = generate_filename(paper_info)
-
-    if mode == "pmc_url":
-        pdf_url, abs_url = get_pmc_urls(args.input)
-        filename = "paper.pdf"
+        elif mode == "pmc_url":
+            pdf_url, abs_url = get_pmc_urls(args.input)
+            paper_info = get_paper_info_pmc(abs_url)
+        elif mode == "acm_url":
+            pdf_url, abs_url = get_acm_urls(args.input)
+            paper_info = get_paper_info_acm(abs_url)
+        else:
+            pdf_url = args.input
         download_url(pdf_url, filename)
+        if not check_file_is_pdf(filename):
+            exception("Downloaded file isn't a valid pdf file.")
         if args.filename:
             clean_filename = args.filename
         else:
-            paper_info = get_paper_info_pmc(abs_url)
             clean_filename = generate_filename(paper_info)
 
-    if mode == "pdf_url":
-        filename = "paper.pdf"
-        download_url(args.input, filename)
-        if not check_file_is_pdf(filename):
-            exception("Input url doesn't point to valid pdf file.")
-        clean_filename = args.filename
-
     dearxived = dearxiv(filename, pdftk_path=args.pdftk)
     cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
     shrinked = shrink_pdf(cropped)
```
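For readers who want to try the ACM handling outside the full script, below is a minimal self-contained sketch of the two pieces this commit adds: detecting an ACM citation page URL and extracting the full-text PDF link. It mirrors the `acm_url` and `get_acm_pdf_url` functions from the diff but is not the script itself: it assumes `requests` and `bs4` are installed, omits the script's retry logic, custom request headers, and logging, and uses a placeholder paper id; ACM's `citation.cfm` pages are as they were at the time of this 2019 commit and may have changed since.

```python
import re

import bs4
import requests


def acm_url(url):
    """Return True if the url looks like an ACM citation page."""
    return re.fullmatch(r"https?://dl\.acm\.org/citation\.cfm\?id=\d+", url) is not None


def get_acm_pdf_url(url):
    """Find the anchor named 'FullTextPDF' on a citation page and return its target."""
    page = requests.get(url).text
    soup = bs4.BeautifulSoup(page, "html.parser")
    link = soup.find("a", attrs={"name": "FullTextPDF"})
    if link is None:
        return None
    href = link.get("href")
    # The link can be relative, in which case we prepend the ACM host.
    return href if href.startswith("http") else "https://dl.acm.org/" + href


if __name__ == "__main__":
    url = "https://dl.acm.org/citation.cfm?id=1234567"  # placeholder id, for illustration only
    if acm_url(url):
        print(get_acm_pdf_url(url))
```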
