| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-02-03 18:14:17 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-02-03 18:14:17 +0000 |
| commit | b0d6247ce59a54d0b9930b9caacd34cd2075eee8 | |
| tree | a30deafc062065d81441ee4f086ff574c008b881 | |
Initial commit
Diffstat (limited to 'arxiv2remarkable.py')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | arxiv2remarkable.py | 304 |
1 file changed, 304 insertions, 0 deletions
```
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
new file mode 100644
index 0000000..b48d7a7
--- /dev/null
+++ b/arxiv2remarkable.py
@@ -0,0 +1,304 @@
```

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Given an arXiv paper url this script:

1. Downloads the paper
2. Strips the timestamp
3. Crops the pdf to remove unnecessary borders
4. Shrinks the pdf to reduce the filesize
5. Renames it using the format:
   '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf
6. Uploads it to the reMarkable using rMapi.

Author: G.J.J. van den Burg
Date: 2019-02-02
License: MIT

"""

import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time

import bs4
import requests

from loguru import logger

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
    "Safari/537.36"
}


def exception(msg):
    print("ERROR: " + msg, file=sys.stderr)
    print("Error occurred. Exiting.", file=sys.stderr)
    raise SystemExit(1)


def validate_url(url):
    """Check if the url is to an arXiv page.

    >>> validate_url("https://arxiv.org/abs/1811.11242")
    True
    >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
    True
    >>> validate_url("http://arxiv.org/abs/1811.11242")
    True
    >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
    True
    >>> validate_url("https://arxiv.org/abs/1811.11242v1")
    True
    >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
    True
    >>> validate_url("https://gertjanvandenburg.com")
    False
    """
    m = re.match(
        r"https?://arxiv\.org/(abs|pdf)/\d{4}\.\d{5}(v\d+)?(\.pdf)?", url
    )
    return m is not None


def get_urls(url):
    """Get the pdf and abs urls from any given arXiv url"""
    if re.match(r"https?://arxiv\.org/abs/\d{4}\.\d{5}(v\d+)?", url):
        abs_url = url
        pdf_url = url.replace("abs", "pdf") + ".pdf"
    elif re.match(r"https?://arxiv\.org/pdf/\d{4}\.\d{5}(v\d+)?\.pdf", url):
        abs_url = url[:-4].replace("pdf", "abs")
        pdf_url = url
    else:
        exception("Couldn't figure out arXiv urls.")
    return pdf_url, abs_url


def get_page_with_retry(url):
    """Get the content of an url, retrying up to five times on failure."""
    count = 0
    while True:
        res = requests.get(url, headers=HEADERS)
        if res.ok:
            logger.info("Downloading url: %s" % url)
            return res.content
        count += 1
        if count < 5:
            logger.info(
                "Caught error for url %s. Retrying in 5 seconds." % url
            )
            time.sleep(5)
        else:
            exception("Failed to download url: %s" % url)


def download_url(url, filename):
    """Download the content of an url and save it to a filename"""
    logger.info("Downloading file at url: %s" % url)
    content = get_page_with_retry(url)
    with open(filename, "wb") as fid:
        fid.write(content)


def dearxiv(input_file, pdftk_path="pdftk"):
    """Remove the arXiv timestamp from a pdf"""
    logger.info("Removing arXiv timestamp")
    basename = os.path.splitext(input_file)[0]
    uncompress_file = basename + "_uncompress.pdf"

    # Uncompress the pdf so the stamp is visible in the raw data
    status = subprocess.call(
        [pdftk_path, input_file, "output", uncompress_file, "uncompress"]
    )
    if status != 0:
        exception("pdftk failed to uncompress the pdf.")

    with open(uncompress_file, "rb") as fid:
        data = fid.read()

    # Remove the text element of the stamp
    data = re.sub(
        rb"\(arXiv:\d{4}\.\d{5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
        b"()Tj",
        data,
    )
    # Remove the URL element of the stamp
    data = re.sub(
        rb"<<\n/URI \(http://arxiv\.org/abs/\d{4}\.\d{5}v\d\)\n/S /URI\n>>\n",
        b"",
        data,
    )

    removed_file = basename + "_removed.pdf"
    with open(removed_file, "wb") as oid:
        oid.write(data)

    # Compress the pdf again
    output_file = basename + "_dearxiv.pdf"
    status = subprocess.call(
        [pdftk_path, removed_file, "output", output_file, "compress"]
    )
    if status != 0:
        exception("pdftk failed to compress the pdf.")

    return output_file


def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
    """Crop the pdf margins using pdfcrop"""
    logger.info("Cropping pdf file")
    status = subprocess.call(
        [pdfcrop_path, "--margins", "15 40 15 15", filepath],
        stdout=subprocess.DEVNULL,
    )
    if status != 0:
        logger.warning("Failed to crop the pdf file at: %s" % filepath)
        return filepath
    cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
    if not os.path.exists(cropped_file):
        logger.warning(
            "Can't find cropped file '%s' where expected." % cropped_file
        )
        return filepath
    return cropped_file


def shrink_pdf(filepath, gs_path="gs"):
    """Shrink the pdf filesize using Ghostscript"""
    logger.info("Shrinking pdf file")
    output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
    status = subprocess.call(
        [
            gs_path,
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPDFSETTINGS=/printer",
            "-dNOPAUSE",
            "-dBATCH",
            "-dQUIET",
            "-sOutputFile=%s" % output_file,
            filepath,
        ]
    )
    if status != 0:
        logger.warning("Failed to shrink the pdf file")
        return filepath
    return output_file


def get_paper_info(url):
    """Extract the authors, title, and date from the arXiv abstract page"""
    logger.info("Getting paper info from arXiv")
    page = get_page_with_retry(url)
    soup = bs4.BeautifulSoup(page, "html.parser")
    authors = [
        x["content"]
        for x in soup.find_all("meta", {"name": "citation_author"})
    ]
    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
    return dict(title=title, date=date, authors=authors)


def generate_filename(info):
    """Generate the output filename from the paper info"""
    logger.info("Generating output filename")
    if len(info["authors"]) > 3:
        author_part = info["authors"][0].split(",")[0] + "_et_al"
    else:
        author_part = "_".join([x.split(",")[0] for x in info["authors"]])
    author_part = author_part.replace(" ", "_")
    title_part = info["title"].replace(",", "").replace(" ", "_")
    year_part = info["date"].split("/")[0]
    return author_part + "_-_" + title_part + "_" + year_part + ".pdf"


def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
    """Upload a file to the reMarkable using rMapi"""
    logger.info("Starting upload to reMarkable")
    status = subprocess.call(
        [rmapi_path, "put", filepath, remarkable_dir],
        stdout=subprocess.DEVNULL,
    )
    if status != 0:
        exception("Uploading file %s to reMarkable failed" % filepath)
    logger.info("Upload successful.")


def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-v", "--verbose", help="be verbose", action="store_true"
    )
    parser.add_argument(
        "-n",
        "--no-upload",
        help="don't upload to the reMarkable, save the output in current working dir",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="debug mode, doesn't upload to reMarkable",
        action="store_true",
    )
    parser.add_argument(
        "--rmapi", help="path to rmapi executable", default="rmapi"
    )
    parser.add_argument(
        "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
    )
    parser.add_argument(
        "--pdftk", help="path to pdftk executable", default="pdftk"
    )
    parser.add_argument("--gs", help="path to gs executable", default="gs")
    parser.add_argument(
        "input", help="url to an arxiv paper or existing pdf file"
    )
    return parser.parse_args()


@logger.catch
def main():
    args = parse_args()

    if not (os.path.exists(args.input) or validate_url(args.input)):
        exception("Input not a file or arXiv url.")

    if not args.verbose:
        logger.remove(0)

    start_wd = os.getcwd()

    with tempfile.TemporaryDirectory() as working_dir:
        if os.path.exists(args.input):
            shutil.copy(args.input, working_dir)
            filename = os.path.basename(args.input)
            clean_filename = os.path.splitext(filename)[0] + "_cropped.pdf"

        os.chdir(working_dir)
        if validate_url(args.input):
            pdf_url, abs_url = get_urls(args.input)
            filename = "paper.pdf"
            download_url(pdf_url, filename)
            paper_info = get_paper_info(abs_url)
            clean_filename = generate_filename(paper_info)

        dearxived = dearxiv(filename, pdftk_path=args.pdftk)
        cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
        shrinked = shrink_pdf(cropped, gs_path=args.gs)
        shutil.move(shrinked, clean_filename)

        if args.debug:
            print("Paused in debug mode in dir: %s" % working_dir)
            print("Press enter to exit.")
            return input()

        if args.no_upload:
            shutil.move(clean_filename, start_wd)
        else:
            upload_to_rm(clean_filename, rmapi_path=args.rmapi)


if __name__ == "__main__":
    main()
```
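As a quick sanity check of the renaming step, here is a minimal sketch of `generate_filename` applied to hypothetical metadata. The dict mirrors the shape returned by `get_paper_info` (arXiv's `citation_author` values come as "Lastname, Firstname"), but the title and author names below are made up:

```python
from arxiv2remarkable import generate_filename

# Hypothetical metadata in the shape returned by get_paper_info()
info = {
    "title": "A Great Paper, Really",
    "date": "2019/02/02",
    "authors": ["Burg, Gertjan van den", "Doe, Jane"],
}

# Last names are joined with underscores (more than three authors
# collapse to "First_et_al"), commas are stripped from the title,
# and the year is taken from the date:
print(generate_filename(info))
# Burg_Doe_-_A_Great_Paper_Really_2019.pdf
```

The full pipeline is driven from the command line, e.g. `python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242` (the identifier is the one from the script's own doctests), with `-n`/`--no-upload` to keep the result locally instead of sending it to the reMarkable.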
