| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-04-07 21:27:03 +0100 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-04-07 21:27:03 +0100 |
| commit | 7be62d3a0ce426a8828e482cbe4917f2b6bef9bb (patch) | |
| tree | 6ae70f8e2a1b5e7f8e34ebbf624d2efd7c568afd | |
| parent | log filename created (diff) | |
| parent | Simplify the code (diff) | |
Merge branch 'master' of https://github.com/GjjvdBurg/arxiv2remarkable
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 3 |
| -rwxr-xr-x | arxiv2remarkable.py | 112 |

2 files changed, 91 insertions, 24 deletions
```diff
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -5,12 +5,13 @@ following sources:
 
 - an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``)
 - a PubMed Central url (either to the HTML or the PDF)
+- an ACM citation page url (``https://dl.acm.org/citation.cfm?id=...``)
 - a url to a PDF file
 - a local file.
 
 The script takes the source and:
 
-1. Downloads it if necessary
+1. Downloads the pdf if necessary
 2. Removes the arXiv timestamp
 3. Crops the pdf to remove unnecessary borders
 4. Shrinks the pdf file to reduce the filesize
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index ccd1621..a4449f8 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -34,6 +34,8 @@ import urllib.parse
 
 from loguru import logger
 
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
@@ -72,7 +74,14 @@ def arxiv_url(url):
 
 
 def pmc_url(url):
-    m = re.fullmatch("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url)
+    m = re.fullmatch(
+        "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url
+    )
+    return not m is None
+
+
+def acm_url(url):
+    m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url)
     return not m is None
 
 
@@ -122,12 +131,45 @@ def get_pmc_urls(url):
     return pdf_url, abs_url
 
 
+def get_acm_pdf_url(url):
+    page = get_page_with_retry(url)
+    soup = bs4.BeautifulSoup(page, "html.parser")
+    thea = None
+    for a in soup.find_all("a"):
+        if a.get("name") == "FullTextPDF":
+            thea = a
+            break
+    if thea is None:
+        return None
+    href = thea.get("href")
+    if href.startswith("http"):
+        return href
+    else:
+        return "https://dl.acm.org/" + href
+
+
+def get_acm_urls(url):
+    if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url):
+        abs_url = url
+        pdf_url = get_acm_pdf_url(url)
+        if pdf_url is None:
+            exception("Couldn't extract PDF url from ACM citation page.")
+    else:
+        exception(
+            "Couldn't figure out ACM urls, please provide a URL of the "
+            "format: http(s)://dl.acm.org/citation.cfm?id=..."
+        )
+    return pdf_url, abs_url
+
+
 def get_page_with_retry(url):
     """Get the content of an url, retrying up to five times on failure. """
 
     def retry(url, count):
         if count < 5:
-            logger.info("Caught error for url %s. Retrying in 5 seconds." % url)
+            logger.info(
+                "Caught error for url %s. Retrying in 5 seconds." % url
+            )
             time.sleep(5)
         else:
             exception("Failed to download url: %s" % url)
@@ -237,11 +279,13 @@ def shrink_pdf(filepath, gs_path="gs"):
 
 
 def get_paper_info_arxiv(url):
+    """ Extract the paper's authors, title, and publication year """
     logger.info("Getting paper info from arXiv")
     page = get_page_with_retry(url)
     soup = bs4.BeautifulSoup(page, "html.parser")
     authors = [
-        x["content"] for x in soup.find_all("meta", {"name": "citation_author"})
+        x["content"]
+        for x in soup.find_all("meta", {"name": "citation_author"})
     ]
     authors = [x.split(",")[0].strip() for x in authors]
     title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
@@ -270,6 +314,30 @@ def get_paper_info_pmc(url):
     return dict(title=title, date=date, authors=authors)
 
 
+def get_paper_info_acm(url):
+    """ Extract the paper's authors, title, and publication year """
+    logger.info("Getting paper info from ACM")
+    page = get_page_with_retry(url)
+    soup = bs4.BeautifulSoup(page, "html.parser")
+    authors = [
+        x["content"]
+        for x in soup.find_all("meta", {"name": "citation_authors"})
+    ]
+    # We only use last names, and this method is a guess. I'm open to more
+    # advanced approaches.
+    authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")]
+    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+    if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
+        logger.warning(
+            "Couldn't extract year from ACM page, please raise an "
+            "issue on GitHub so I can fix it: %s",
+            GITHUB_URL,
+        )
+    date = date.strip().split("/")[-1]
+    return dict(title=title, date=date, authors=authors)
+
+
 def generate_filename(info):
     """ Generate a nice filename for a paper given the info dict """
     # we assume that the list of authors is lastname only.
@@ -364,9 +432,13 @@ def main():
         mode = "arxiv_url"
     elif pmc_url(args.input):
         mode = "pmc_url"
+    elif acm_url(args.input):
+        mode = "acm_url"
     elif valid_url(args.input):
         if args.filename is None:
-            exception("Filename must be provided with pdf url (use --filename)")
+            exception(
+                "Filename must be provided with pdf url (use --filename)"
+            )
         mode = "pdf_url"
     else:
         exception("Input not a valid url, arxiv url, or existing file.")
@@ -383,33 +455,27 @@ def main():
         clean_filename = args.filename if args.filename else filename
 
     os.chdir(working_dir)
-    if mode == "arxiv_url":
-        pdf_url, abs_url = get_arxiv_urls(args.input)
+    if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]:
         filename = "paper.pdf"
-        download_url(pdf_url, filename)
-        if args.filename:
-            clean_filename = args.filename
-        else:
+        if mode == "arxiv_url":
+            pdf_url, abs_url = get_arxiv_urls(args.input)
             paper_info = get_paper_info_arxiv(abs_url)
-            clean_filename = generate_filename(paper_info)
-
-    if mode == "pmc_url":
-        pdf_url, abs_url = get_pmc_urls(args.input)
-        filename = "paper.pdf"
+        elif mode == "pmc_url":
+            pdf_url, abs_url = get_pmc_urls(args.input)
+            paper_info = get_paper_info_pmc(abs_url)
+        elif mode == "acm_url":
+            pdf_url, abs_url = get_acm_urls(args.input)
+            paper_info = get_paper_info_acm(abs_url)
+        else:
+            pdf_url = args.input
         download_url(pdf_url, filename)
+        if not check_file_is_pdf(filename):
+            exception("Downloaded file isn't a valid pdf file.")
         if args.filename:
             clean_filename = args.filename
         else:
-            paper_info = get_paper_info_pmc(abs_url)
             clean_filename = generate_filename(paper_info)
 
-    if mode == "pdf_url":
-        filename = "paper.pdf"
-        download_url(args.input, filename)
-        if not check_file_is_pdf(filename):
-            exception("Input url doesn't point to valid pdf file.")
-        clean_filename = args.filename
-
     dearxived = dearxiv(filename, pdftk_path=args.pdftk)
     cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
     shrinked = shrink_pdf(cropped)
```
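For readers who want to try the ACM handling outside the full script, below is a minimal self-contained sketch of the two pieces this commit adds: detecting an ACM citation page URL and extracting the full-text PDF link. It mirrors the `acm_url` and `get_acm_pdf_url` functions from the diff but is not the script itself: it assumes `requests` and `bs4` are installed, omits the script's retry logic, custom request headers, and logging, and uses a placeholder paper id; ACM's `citation.cfm` pages are as they were at the time of this 2019 commit and may have changed since.

```python
import re

import bs4
import requests


def acm_url(url):
    """Return True if the url looks like an ACM citation page."""
    return re.fullmatch(r"https?://dl\.acm\.org/citation\.cfm\?id=\d+", url) is not None


def get_acm_pdf_url(url):
    """Find the anchor named 'FullTextPDF' on a citation page and return its target."""
    page = requests.get(url).text
    soup = bs4.BeautifulSoup(page, "html.parser")
    link = soup.find("a", attrs={"name": "FullTextPDF"})
    if link is None:
        return None
    href = link.get("href")
    # The link can be relative, in which case we prepend the ACM host.
    return href if href.startswith("http") else "https://dl.acm.org/" + href


if __name__ == "__main__":
    url = "https://dl.acm.org/citation.cfm?id=1234567"  # placeholder id, for illustration only
    if acm_url(url):
        print(get_acm_pdf_url(url))
```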
