author    Gertjan van den Burg <gertjanvandenburg@gmail.com>    2019-02-03 18:14:17 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com>    2019-02-03 18:14:17 +0000
commit    b0d6247ce59a54d0b9930b9caacd34cd2075eee8 (patch)
tree      a30deafc062065d81441ee4f086ff574c008b881 /arxiv2remarkable.py
Initial commit
Diffstat (limited to 'arxiv2remarkable.py')
-rw-r--r--  arxiv2remarkable.py  304
1 file changed, 304 insertions(+), 0 deletions(-)
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
new file mode 100644
index 0000000..b48d7a7
--- /dev/null
+++ b/arxiv2remarkable.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Given an arXiv paper url, this script:
+
+1. Downloads the paper
+2. Strips the timestamp
+3. Crops the pdf to remove unnecessary borders
+4. Shrinks the pdf to reduce the filesize
+5. Renames it using the format:
+   '_'.join(author_lastnames) + '_-_' + title + '_' + year + '.pdf'
+   (with '<first_author>_et_al' when there are more than three authors)
+6. Uploads it to the reMarkable using rMapi.
+
+Author: G.J.J. van den Burg
+Date: 2019-02-02
+License: MIT
+
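+Example usage (assumes pdftk, pdfcrop, Ghostscript, and rmapi are on the PATH):
+
+    python arxiv2remarkable.py https://arxiv.org/abs/1811.11242
+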
+"""
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+
+import bs4
+import requests
+from loguru import logger
+
+HEADERS = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+ "Safari/537.36"
+}
+
+
+def exception(msg):
+    """Print an error message and exit with a non-zero status."""
+    print("ERROR: " + msg, file=sys.stderr)
+    print("Error occurred. Exiting.", file=sys.stderr)
+    raise SystemExit(1)
+
+
+def validate_url(url):
+ """Check if the url is to an arXiv page.
+
+ >>> validate_url("https://arxiv.org/abs/1811.11242")
+ True
+ >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
+ True
+ >>> validate_url("http://arxiv.org/abs/1811.11242")
+ True
+ >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
+ True
+ >>> validate_url("https://arxiv.org/abs/1811.11242v1")
+ True
+ >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
+ True
+ >>> validate_url("https://gertjanvandenburg.com")
+ False
+ """
+    m = re.match(
+        r"https?://arxiv\.org/(abs|pdf)/\d{4}\.\d{5}(v\d+)?(\.pdf)?", url
+    )
+    return m is not None
+
+
+def get_urls(url):
+ """Get the pdf and abs url from any given url
+ """
+    if re.match(r"https?://arxiv\.org/abs/\d{4}\.\d{5}(v\d+)?", url):
+        abs_url = url
+        pdf_url = url.replace("abs", "pdf") + ".pdf"
+    elif re.match(r"https?://arxiv\.org/pdf/\d{4}\.\d{5}(v\d+)?\.pdf", url):
+ abs_url = url[:-4].replace("pdf", "abs")
+ pdf_url = url
+ else:
+ exception("Couldn't figure out arXiv urls.")
+ return pdf_url, abs_url
+
+
+def get_page_with_retry(url):
+    """Get the content of a url, retrying up to five times on failure."""
+    count = 0
+    while True:
+        res = requests.get(url, headers=HEADERS)
+        if res.ok:
+            logger.info("Downloaded url: %s" % url)
+            return res.content
+        # Without this increment the loop would retry forever
+        count += 1
+        if count < 5:
+            logger.info(
+                "Caught error for url %s. Retrying in 5 seconds." % url
+            )
+            time.sleep(5)
+        else:
+            exception("Failed to download url: %s" % url)
+
+
+def download_url(url, filename):
+ """Download the content of an url and save it to a filename """
+ logger.info("Downloading file at url: %s" % url)
+ content = get_page_with_retry(url)
+ with open(filename, "wb") as fid:
+ fid.write(content)
+
+
+def dearxiv(input_file, pdftk_path="pdftk"):
+ """Remove the arXiv timestamp from a pdf"""
+ logger.info("Removing arXiv timestamp")
+ basename = os.path.splitext(input_file)[0]
+ uncompress_file = basename + "_uncompress.pdf"
+
+ status = subprocess.call(
+ [pdftk_path, input_file, "output", uncompress_file, "uncompress"]
+ )
+    if status != 0:
+        exception("pdftk failed to uncompress the pdf.")
+
+ with open(uncompress_file, "rb") as fid:
+ data = fid.read()
+ # Remove the text element
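+    # (matches a stamp like b"(arXiv:1811.11242v1 [cs.LG]  1 Feb 2019)Tj";
+    # the ID and date here are illustrative)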
+    data = re.sub(
+        rb"\(arXiv:\d{4}\.\d{5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+        b"()Tj",
+        data,
+    )
+    # Remove the URL element
+    data = re.sub(
+        rb"<<\n/URI \(http://arxiv\.org/abs/\d{4}\.\d{5}v\d\)\n/S /URI\n>>\n",
+        b"",
+        data,
+    )
+
+ removed_file = basename + "_removed.pdf"
+ with open(removed_file, "wb") as oid:
+ oid.write(data)
+
+ output_file = basename + "_dearxiv.pdf"
+ status = subprocess.call(
+ [pdftk_path, removed_file, "output", output_file, "compress"]
+ )
+    if status != 0:
+        exception("pdftk failed to compress the pdf.")
+
+ return output_file
+
+
+def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
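+    """Crop the pdf margins with pdfcrop; fall back to the input on failure."""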
+ logger.info("Cropping pdf file")
+ status = subprocess.call(
+ [pdfcrop_path, "--margins", "15 40 15 15", filepath],
+ stdout=subprocess.DEVNULL
+ )
+    if status != 0:
+ logger.warning("Failed to crop the pdf file at: %s" % filepath)
+ return filepath
+ cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+ if not os.path.exists(cropped_file):
+ logger.warning(
+ "Can't find cropped file '%s' where expected." % cropped_file
+ )
+ return filepath
+ return cropped_file
+
+
+def shrink_pdf(filepath, gs_path="gs"):
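+    """Shrink the pdf with Ghostscript; fall back to the input on failure."""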
+ logger.info("Shrinking pdf file")
+ output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+    status = subprocess.call(
+        [
+            gs_path,
+            "-sDEVICE=pdfwrite",
+            "-dCompatibilityLevel=1.4",
+            "-dPDFSETTINGS=/printer",
+            "-dNOPAUSE",
+            "-dBATCH",
+            "-dQUIET",
+            "-sOutputFile=%s" % output_file,
+            filepath,
+        ]
+    )
+    if status != 0:
+        logger.warning("Failed to shrink the pdf file")
+        return filepath
+    return output_file
+
+
+def get_paper_info(url):
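+    """Scrape the title, date, and authors from the arXiv abstract page."""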
+ logger.info("Getting paper info from arXiv")
+ page = get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ authors = [
+ x["content"] for x in soup.find_all("meta", {"name": "citation_author"})
+ ]
+ title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+ date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+ return dict(title=title, date=date, authors=authors)
+
+
+def generate_filename(info):
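+    """Generate the output filename from the paper info.
+
+    An illustrative example (hypothetical author and title):
+
+    >>> generate_filename(dict(authors=["Burg, Gertjan van den"],
+    ...                        title="An Example Title", date="2019/02/02"))
+    'Burg_-_An_Example_Title_2019.pdf'
+    """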
+ logger.info("Generating output filename")
+ if len(info["authors"]) > 3:
+ author_part = info["authors"][0].split(",")[0] + "_et_al"
+ else:
+ author_part = "_".join([x.split(",")[0] for x in info["authors"]])
+ author_part = author_part.replace(" ", "_")
+ title_part = info["title"].replace(",", "").replace(" ", "_")
+ year_part = info["date"].split("/")[0]
+ return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+
+
+def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
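+    """Upload the file to the given directory on the reMarkable using rmapi."""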
+ logger.info("Starting upload to reMarkable")
+    status = subprocess.call(
+        [rmapi_path, "put", filepath, remarkable_dir],
+        stdout=subprocess.DEVNULL,
+    )
+    if status != 0:
+ exception("Uploading file %s to remarkable failed" % filepath)
+ logger.info("Upload successful.")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ parser.add_argument(
+ "-v", "--verbose", help="be verbose", action="store_true"
+ )
+ parser.add_argument(
+ "-n",
+ "--no-upload",
+ help="don't upload to the reMarkable, save the output in current working dir",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-d",
+ "--debug",
+ help="debug mode, doesn't upload to reMarkable",
+ action="store_true",
+ )
+ parser.add_argument(
+ "--rmapi", help="path to rmapi executable", default="rmapi"
+ )
+ parser.add_argument(
+ "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
+ )
+ parser.add_argument(
+ "--pdftk", help="path to pdftk executable", default="pdftk"
+ )
+ parser.add_argument("--gs", help="path to gs executable", default="gs")
+ parser.add_argument(
+ "input", help="url to an arxiv paper or existing pdf file"
+ )
+ return parser.parse_args()
+
+
+@logger.catch
+def main():
+ args = parse_args()
+
+ if not (os.path.exists(args.input) or validate_url(args.input)):
+ exception("Input not a file or arXiv url.")
+
+ if not args.verbose:
+ logger.remove(0)
+
+ start_wd = os.getcwd()
+
+ with tempfile.TemporaryDirectory() as working_dir:
+ if os.path.exists(args.input):
+ shutil.copy(args.input, working_dir)
+ filename = os.path.basename(args.input)
+ clean_filename = os.path.splitext(filename)[0] + "_cropped.pdf"
+
+ os.chdir(working_dir)
+ if validate_url(args.input):
+ pdf_url, abs_url = get_urls(args.input)
+ filename = "paper.pdf"
+ download_url(pdf_url, filename)
+ paper_info = get_paper_info(abs_url)
+ clean_filename = generate_filename(paper_info)
+
+ dearxived = dearxiv(filename, pdftk_path=args.pdftk)
+ cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
+        # Pass args.gs through, otherwise the --gs flag has no effect
+        shrunk = shrink_pdf(cropped, gs_path=args.gs)
+        shutil.move(shrunk, clean_filename)
+
+        if args.debug:
+            print("Paused in debug mode in dir: %s" % working_dir)
+            input("Press enter to exit.")
+            return
+
+ if args.no_upload:
+ shutil.move(clean_filename, start_wd)
+ else:
+ upload_to_rm(clean_filename, rmapi_path=args.rmapi)
+
+
+if __name__ == "__main__":
+ main()