 README.md           |  79
 arxiv2remarkable.py | 304
 poetry.lock         | 147
 pyproject.toml      |  18
 4 files changed, 548 insertions(+), 0 deletions(-)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4687012
--- /dev/null
+++ b/README.md
@@ -0,0 +1,79 @@
+# arxiv2remarkable.py
+
+This script takes a URL to an arXiv paper, and:
+
+1. Downloads it
+2. Removes the arXiv timestamp
+3. Crops the pdf to remove unnecessary borders
+4. Shrinks the pdf to reduce the file size
+5. Generates a nice filename based on the authors, title, and year of the paper
+6. Uploads it to the reMarkable
+
+Optionally, you can download a paper without uploading it to the
+reMarkable (using the ``-n`` switch), or provide an existing pdf file (in
+which case only steps 2 - 6 are run).
+
+Here's the full help text of the script:
+
+```bash
+[arxiv2remarkable] $ python arxiv2remarkable.py -h
+usage: arxiv2remarkable.py [-h] [-v] [-n] [-d] [--rmapi RMAPI]
+                           [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS]
+                           input
+
+positional arguments:
+  input              url to an arxiv paper or existing pdf file
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -v, --verbose      be verbose (default: False)
+  -n, --no-upload    don't upload to the reMarkable, save the output in the
+                     current working dir (default: False)
+  -d, --debug        debug mode, doesn't upload to reMarkable (default: False)
+  --rmapi RMAPI      path to rmapi executable (default: rmapi)
+  --pdfcrop PDFCROP  path to pdfcrop executable (default: pdfcrop)
+  --pdftk PDFTK      path to pdftk executable (default: pdftk)
+  --gs GS            path to gs executable (default: gs)
+```
+
+And here's an example with verbose mode enabled that shows everything the
+script does:
+```bash
+$ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242
+2019-02-03 18:11:41.816 | INFO | __main__:download_url:106 - Downloading file at url: https://arxiv.org/pdf/1811.11242v1.pdf
+2019-02-03 18:11:46.833 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/pdf/1811.11242v1.pdf
+2019-02-03 18:11:46.835 | INFO | __main__:get_paper_info:194 - Getting paper info from arXiv
+2019-02-03 18:11:47.496 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/abs/1811.11242v1
+2019-02-03 18:11:47.508 | INFO | __main__:generate_filename:206 - Generating output filename
+2019-02-03 18:11:47.508 | INFO | __main__:dearxiv:114 - Removing arXiv timestamp
+2019-02-03 18:11:49.221 | INFO | __main__:crop_pdf:154 - Cropping pdf file
+2019-02-03 18:11:53.247 | INFO | __main__:shrink_pdf:172 - Shrinking pdf file
+2019-02-03 18:11:54.802 | INFO | __main__:upload_to_rm:218 - Starting upload to reMarkable
+2019-02-03 18:11:57.767 | INFO | __main__:upload_to_rm:223 - Upload successful.
+```
+
+## Dependencies
+
+The script requires the following external programs to be available:
+
+- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
+- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a
+  LaTeX installation.
+- [Ghostscript](https://www.ghostscript.com/)
+- [rMAPI](https://github.com/juruen/rmapi)
+
+If these programs are not available on your PATH, you can point the script
+at them with the corresponding command-line options.
+
+The script also needs the following Python packages:
+
+- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/)
+- [requests](https://pypi.org/project/requests/)
+- [loguru](https://pypi.org/project/loguru/)
+
+
+## Notes
+
+License: MIT
+
+Author: G.J.J. van den Burg
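
The six steps in the README above map almost one-to-one onto the four external tools. As a minimal sketch of the equivalent manual pipeline — using the same tool invocations that appear in `arxiv2remarkable.py` below, and assuming `pdftk`, `pdfcrop`, `gs`, and `rmapi` are on the PATH — with illustrative filenames:

```python
import subprocess

# Step 2 prep: uncompress the pdf streams so the arXiv stamp becomes editable text
subprocess.check_call(["pdftk", "paper.pdf", "output", "paper_uncompress.pdf", "uncompress"])
# (the stamp itself is removed by a regex substitution; see dearxiv() below)
# Step 3: crop the borders, keeping a wider 40pt top margin
subprocess.check_call(["pdfcrop", "--margins", "15 40 15 15", "paper_dearxiv.pdf"])
# Step 4: shrink the result with Ghostscript's pdfwrite device
subprocess.check_call([
    "gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
    "-dPDFSETTINGS=/printer", "-dNOPAUSE", "-dBATCH", "-dQUIET",
    "-sOutputFile=paper_shrink.pdf", "paper_dearxiv-crop.pdf",
])
# Step 6: upload to the reMarkable root folder
subprocess.check_call(["rmapi", "put", "paper_shrink.pdf", "/"])
```
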
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
new file mode 100644
index 0000000..b48d7a7
--- /dev/null
+++ b/arxiv2remarkable.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Given an arXiv paper url this script:
+
+1. Downloads the paper
+2. Strips the timestamp
+3. Crops the pdf to remove unnecessary borders
+4. Shrinks the pdf to reduce the file size
+5. Renames it using the format:
+   '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf
+6. Uploads it to the reMarkable using rMapi.
+
+Author: G.J.J. van den Burg
+Date: 2019-02-02
+License: MIT
+
+"""
+
+import argparse
+import bs4
+import os
+import re
+import requests
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+
+from loguru import logger
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+    "Safari/537.36"
+}
+
+
+def exception(msg):
+    print("ERROR: " + msg, file=sys.stderr)
+    print("Error occurred. Exiting.", file=sys.stderr)
+    # exit with a non-zero status so callers can detect the failure
+    raise SystemExit(1)
+
+
+def validate_url(url):
+    """Check if the url is to an arXiv page.
+
+    >>> validate_url("https://arxiv.org/abs/1811.11242")
+    True
+    >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
+    True
+    >>> validate_url("http://arxiv.org/abs/1811.11242")
+    True
+    >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
+    True
+    >>> validate_url("https://arxiv.org/abs/1811.11242v1")
+    True
+    >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
+    True
+    >>> validate_url("https://gertjanvandenburg.com")
+    False
+    """
+    m = re.match(
+        r"https?://arxiv.org/(abs|pdf)/\d{4}\.\d{5}(v\d+)?(\.pdf)?", url
+    )
+    return m is not None
+
+
+def get_urls(url):
+    """Get the pdf and abs url from any given url"""
+    if re.match(r"https?://arxiv.org/abs/\d{4}\.\d{5}(v\d+)?", url):
+        abs_url = url
+        pdf_url = url.replace("abs", "pdf") + ".pdf"
+    elif re.match(r"https?://arxiv.org/pdf/\d{4}\.\d{5}(v\d+)?\.pdf", url):
+        abs_url = url[:-4].replace("pdf", "abs")
+        pdf_url = url
+    else:
+        exception("Couldn't figure out arXiv urls.")
+    return pdf_url, abs_url
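
To make the two accepted URL forms concrete, here is an illustrative use of `get_urls`; it assumes the function above is in scope (e.g. in a REPL after importing the script):

```python
# Either accepted arXiv URL form yields both the pdf and the abstract link.
pdf_url, abs_url = get_urls("https://arxiv.org/abs/1811.11242v1")
assert pdf_url == "https://arxiv.org/pdf/1811.11242v1.pdf"
assert abs_url == "https://arxiv.org/abs/1811.11242v1"

pdf_url, abs_url = get_urls("https://arxiv.org/pdf/1811.11242v1.pdf")
assert abs_url == "https://arxiv.org/abs/1811.11242v1"
```
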
+
+
+def get_page_with_retry(url):
+    """Get the content of an url, retrying up to five times on failure."""
+    count = 0
+    while True:
+        res = requests.get(url, headers=HEADERS)
+        if res.ok:
+            logger.info("Downloading url: %s" % url)
+            return res.content
+        # count this attempt so we give up after five failures
+        count += 1
+        if count < 5:
+            logger.info(
+                "Caught error for url %s. Retrying in 5 seconds." % url
+            )
+            time.sleep(5)
+        else:
+            exception("Failed to download url: %s" % url)
+
+
+def download_url(url, filename):
+    """Download the content of an url and save it to a filename"""
+    logger.info("Downloading file at url: %s" % url)
+    content = get_page_with_retry(url)
+    with open(filename, "wb") as fid:
+        fid.write(content)
+
+
+def dearxiv(input_file, pdftk_path="pdftk"):
+    """Remove the arXiv timestamp from a pdf"""
+    logger.info("Removing arXiv timestamp")
+    basename = os.path.splitext(input_file)[0]
+    uncompress_file = basename + "_uncompress.pdf"
+
+    status = subprocess.call(
+        [pdftk_path, input_file, "output", uncompress_file, "uncompress"]
+    )
+    if not status == 0:
+        exception("pdftk failed to uncompress the pdf.")
+
+    with open(uncompress_file, "rb") as fid:
+        data = fid.read()
+    # Remove the text element
+    data = re.sub(
+        rb"\(arXiv:\d{4}\.\d{5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+        b"()Tj",
+        data,
+    )
+    # Remove the URL element
+    data = re.sub(
+        rb"<<\n/URI \(http://arxiv\.org/abs/\d{4}\.\d{5}v\d\)\n/S /URI\n>>\n",
+        b"",
+        data,
+    )
+
+    removed_file = basename + "_removed.pdf"
+    with open(removed_file, "wb") as oid:
+        oid.write(data)
+
+    output_file = basename + "_dearxiv.pdf"
+    status = subprocess.call(
+        [pdftk_path, removed_file, "output", output_file, "compress"]
+    )
+    if not status == 0:
+        exception("pdftk failed to compress the pdf.")
+
+    return output_file
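
The first substitution in `dearxiv` targets the pdf text-showing operator that draws the arXiv stamp. A small self-contained demo, with an illustrative stamp string that matches the pattern the function removes:

```python
import re

# A text-showing operator as it appears in the uncompressed pdf stream; the
# stamp contents are made up but have the shape dearxiv() looks for.
sample = b"(arXiv:1811.11242v1  [cs.LG]  27 Nov 2018)Tj"
cleaned = re.sub(
    rb"\(arXiv:\d{4}\.\d{5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
    b"()Tj",
    sample,
)
print(cleaned)  # b'()Tj' -- the stamp text is emptied, the layout is untouched
```
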
+
+
+def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
+    logger.info("Cropping pdf file")
+    status = subprocess.call(
+        [pdfcrop_path, "--margins", "15 40 15 15", filepath],
+        stdout=subprocess.DEVNULL,
+    )
+    if not status == 0:
+        logger.warning("Failed to crop the pdf file at: %s" % filepath)
+        return filepath
+    cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+    if not os.path.exists(cropped_file):
+        logger.warning(
+            "Can't find cropped file '%s' where expected." % cropped_file
+        )
+        return filepath
+    return cropped_file
+
+
+def shrink_pdf(filepath, gs_path="gs"):
+    logger.info("Shrinking pdf file")
+    output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+    status = subprocess.call(
+        [
+            gs_path,
+            "-sDEVICE=pdfwrite",
+            "-dCompatibilityLevel=1.4",
+            "-dPDFSETTINGS=/printer",
+            "-dNOPAUSE",
+            "-dBATCH",
+            "-dQUIET",
+            "-sOutputFile=%s" % output_file,
+            filepath,
+        ]
+    )
+    if not status == 0:
+        logger.warning("Failed to shrink the pdf file")
+        return filepath
+    return output_file
+
+
+def get_paper_info(url):
+    logger.info("Getting paper info from arXiv")
+    page = get_page_with_retry(url)
+    soup = bs4.BeautifulSoup(page, "html.parser")
+    authors = [
+        x["content"]
+        for x in soup.find_all("meta", {"name": "citation_author"})
+    ]
+    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+    return dict(title=title, date=date, authors=authors)
+
+
+def generate_filename(info):
+    logger.info("Generating output filename")
+    if len(info["authors"]) > 3:
+        author_part = info["authors"][0].split(",")[0] + "_et_al"
+    else:
+        author_part = "_".join([x.split(",")[0] for x in info["authors"]])
+    author_part = author_part.replace(" ", "_")
+    title_part = info["title"].replace(",", "").replace(" ", "_")
+    year_part = info["date"].split("/")[0]
+    return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
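
A quick illustration of the naming scheme `generate_filename` produces. The metadata here is hypothetical but has the shape `get_paper_info` returns (the function itself must be in scope, e.g. after importing the script):

```python
# Hypothetical metadata in the shape returned by get_paper_info()
info = {
    "authors": ["Doe, Jane", "Smith, John"],
    "title": "A Tutorial Example",
    "date": "2019/02/03",
}
print(generate_filename(info))  # Doe_Smith_-_A_Tutorial_Example_2019.pdf
```

With more than three authors, only the first last name is kept and `_et_al` is appended.
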
+
+
+def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
+    logger.info("Starting upload to reMarkable")
+    status = subprocess.call(
+        [rmapi_path, "put", filepath, remarkable_dir],
+        stdout=subprocess.DEVNULL,
+    )
+    if not status == 0:
+        exception("Uploading file %s to remarkable failed" % filepath)
+    logger.info("Upload successful.")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="be verbose", action="store_true"
+    )
+    parser.add_argument(
+        "-n",
+        "--no-upload",
+        help="don't upload to the reMarkable, save the output in the current working dir",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-d",
+        "--debug",
+        help="debug mode, doesn't upload to reMarkable",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--rmapi", help="path to rmapi executable", default="rmapi"
+    )
+    parser.add_argument(
+        "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
+    )
+    parser.add_argument(
+        "--pdftk", help="path to pdftk executable", default="pdftk"
+    )
+    parser.add_argument("--gs", help="path to gs executable", default="gs")
+    parser.add_argument(
+        "input", help="url to an arxiv paper or existing pdf file"
+    )
+    return parser.parse_args()
+
+
+@logger.catch
+def main():
+    args = parse_args()
+
+    if not (os.path.exists(args.input) or validate_url(args.input)):
+        exception("Input not a file or arXiv url.")
+
+    if not args.verbose:
+        logger.remove(0)
+
+    start_wd = os.getcwd()
+
+    with tempfile.TemporaryDirectory() as working_dir:
+        if os.path.exists(args.input):
+            shutil.copy(args.input, working_dir)
+            filename = os.path.basename(args.input)
+            clean_filename = os.path.splitext(filename)[0] + "_cropped.pdf"
+
+        os.chdir(working_dir)
+        if validate_url(args.input):
+            pdf_url, abs_url = get_urls(args.input)
+            filename = "paper.pdf"
+            download_url(pdf_url, filename)
+            paper_info = get_paper_info(abs_url)
+            clean_filename = generate_filename(paper_info)
+
+        dearxived = dearxiv(filename, pdftk_path=args.pdftk)
+        cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
+        shrunk = shrink_pdf(cropped, gs_path=args.gs)
+        shutil.move(shrunk, clean_filename)
+
+        if args.debug:
+            print("Paused in debug mode in dir: %s" % working_dir)
+            print("Press enter to exit.")
+            return input()
+
+        if args.no_upload:
+            shutil.move(clean_filename, start_wd)
+        else:
+            upload_to_rm(clean_filename, rmapi_path=args.rmapi)
+
+
+if __name__ == "__main__":
+    main()
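
The same chain can also be driven from Python directly rather than through the CLI. A hypothetical session — it assumes the script is importable as a module named `arxiv2remarkable` and that the external tools are installed:

```python
import shutil

# Hypothetical programmatic use of the functions above, skipping the upload step.
from arxiv2remarkable import (
    get_urls, download_url, get_paper_info, generate_filename,
    dearxiv, crop_pdf, shrink_pdf,
)

pdf_url, abs_url = get_urls("https://arxiv.org/abs/1811.11242")
download_url(pdf_url, "paper.pdf")
name = generate_filename(get_paper_info(abs_url))
# each step returns the path of its output, so the calls chain naturally
final = shrink_pdf(crop_pdf(dearxiv("paper.pdf")))
shutil.move(final, name)  # keep the cleaned, cropped, shrunk pdf under its nice name
```
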
+name = "urllib3" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" +version = "1.24.1" + +[metadata] +content-hash = "b92b4b1d2c4f9d3181044c1ad99fd9bfa49e8618c6ff5de7bd64c557bcc27e39" +python-versions = "^3.5" + +[metadata.hashes] +ansimarkup = ["06365e3ef89a12734fc408b2449cb4642d5fe2e603e95e7296eff9e98a0fe0b4", "174d920481416cec8d5a707af542d6fba25a1df1c21d8996479c32ba453649a4"] +beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"] +better-exceptions-fork = ["5f0983da51e956dbdaf8b9a3d10e2774b382ce6c6ff2e54685c33e2dbe8f1472"] +bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] +certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] +chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] +colorama = ["05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", "f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"] +idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] +loguru = ["68297d9f23064c2f4764bb5d0c5c767f3ed7f9fc1218244841878f5fc7c94add", "ebac59630946721fd6207264679b267a8bdc290b086226067d6aad86830e3123"] +pygments = ["5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", "e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"] +requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] +soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] +urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6f67ecd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[tool.poetry] +name = "arxiv2remarkable" +version = "0.1.0" +description = "Download an arXiv paper and send it to reMarkable" +authors = ["Gertjan van den Burg <gertjanvandenburg@gmail.com>"] +license = "MIT" + +[tool.poetry.dependencies] +python = "^3.5" +bs4 = "^0.0.1" +requests = "^2.21" +loguru = "^0.2.5" + +[tool.poetry.dev-dependencies] + +[build-system] +requires = ["poetry>=0.12"] +build-backend = "poetry.masonry.api" |
