diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-03-04 18:22:40 -0500 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-03-04 18:22:40 -0500 |
| commit | a839d252b22124b3fb0570fadea881ea9ebaef46 (patch) | |
| tree | f6c16f4fe8035eb66d6d8086bdc773f8178ba6e1 | |
| parent | If pdf file supplied, use that as rM filename (diff) | |
| download | paper2remarkable-a839d252b22124b3fb0570fadea881ea9ebaef46.tar.gz paper2remarkable-a839d252b22124b3fb0570fadea881ea9ebaef46.zip | |
Add ability to download a pdf from a URL
| -rw-r--r-- | README.md | 57 | ||||
| -rwxr-xr-x | arxiv2remarkable.py | 63 |
2 files changed, 89 insertions, 31 deletions
@@ -1,39 +1,50 @@ # arxiv2remarkable.py -This script takes an URL to an arXiv paper, and: +This script makes it as easy to get a PDF on your reMarkable from any of the +following sources: -1. Downloads it +- an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``) +- a url to a PDF file +- a local file. + +The script takes the source and: + +1. Downloads it if necessary 2. Removes the arXiv timestamp 3. Crops the pdf to remove unnecessary borders -4. Shrinks the pdf to reduce filesize -5. Generates a nice filename based on author/title/year of the paper -6. Uploads it to the reMarkable +4. Shrinks the pdf file to reduce the filesize +5. Generates a nice filename based on author/title/year of the paper (arXiv + only) +6. Uploads it to your reMarkable using ``rMapi``. Optionally, you can download a paper but not have it uploaded to the -reMarkable (using the ``-n`` switch), or provide an existing pdf file (to use -only steps 2 - 6). +reMarkable using the ``-n`` switch. Also, the ``--filename`` parameter to the +script can be used to provide an explicit filename for on the reMarkable. Here's the full help of the script: ```bash -[arxiv2remarkable] $ python arxiv2remarkable.py -h -usage: arxiv2remarkable.py [-h] [-v] [-n] [-d] [--rmapi RMAPI] - [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS] +usage: arxiv2remarkable.py [-h] [-v] [-n] [-d] [--filename FILENAME] + [--rmapi RMAPI] [--pdfcrop PDFCROP] [--pdftk PDFTK] + [--gs GS] input positional arguments: - input url to an arxiv paper or existing pdf file + input url to an arxiv paper, url to pdf, or existing pdf file optional arguments: - -h, --help show this help message and exit - -v, --verbose be verbose (default: False) - -n, --no-upload don't upload to the reMarkable, save the output in - current working dir (default: False) - -d, --debug debug mode, doesn't upload to reMarkable (default: False) - --rmapi RMAPI path to rmapi executable (default: rmapi) - --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) - --pdftk PDFTK path to pdftk executable (default: pdftk) - --gs GS path to gs executable (default: gs) + -h, --help show this help message and exit + -v, --verbose be verbose (default: False) + -n, --no-upload don't upload to the reMarkable, save the output in + current working dir (default: False) + -d, --debug debug mode, doesn't upload to reMarkable (default: + False) + --filename FILENAME Filename to use for the file on reMarkable (default: + None) + --rmapi RMAPI path to rmapi executable (default: rmapi) + --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) + --pdftk PDFTK path to pdftk executable (default: pdftk) + --gs GS path to gs executable (default: gs) ``` And here's an example with verbose mode enabled that shows everything the @@ -70,7 +81,13 @@ The script also needs the following Python packages: - [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/) - [requests](https://pypi.org/project/requests/) - [loguru](https://pypi.org/project/loguru/) +- [PyPDF2](https://github.com/mstamy2/PyPDF2) + +You can use this line: +```bash +pip install --user bs4 requests loguru PyPDF2 +``` # Notes diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 8c7b0ea..5ba357c 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -18,6 +18,7 @@ License: MIT """ +import PyPDF2 import argparse import bs4 import os @@ -28,6 +29,7 @@ import subprocess import sys import tempfile import time +import urllib.parse from loguru import logger @@ -44,7 +46,7 @@ def exception(msg): raise SystemExit(1) -def validate_url(url): +def arxiv_url(url): """Check if the url is to an arXiv page. >>> validate_url("https://arxiv.org/abs/1811.11242") @@ -68,7 +70,23 @@ def validate_url(url): return not m is None -def get_urls(url): +def valid_url(url): + try: + result = urllib.parse.urlparse(url) + return all([result.scheme, result.netloc, result.path]) + except: + return False + + +def check_file_is_pdf(filename): + try: + PyPDF2.PdfFileReader(open(filename, "rb")) + return True + except PyPDF2.utils.PdfReadError: + return False + + +def get_arxiv_urls(url): """Get the pdf and abs url from any given url """ if re.match("https?://arxiv.org/abs/\d{4}\.\d{5}(v\d+)?", url): abs_url = url @@ -242,6 +260,11 @@ def parse_args(): action="store_true", ) parser.add_argument( + "--filename", + help="Filename to use for the file on reMarkable", + default=None, + ) + parser.add_argument( "--rmapi", help="path to rmapi executable", default="rmapi" ) parser.add_argument( @@ -252,7 +275,7 @@ def parse_args(): ) parser.add_argument("--gs", help="path to gs executable", default="gs") parser.add_argument( - "input", help="url to an arxiv paper or existing pdf file" + "input", help="url to an arxiv paper, url to pdf, or existing pdf file" ) return parser.parse_args() @@ -261,8 +284,16 @@ def parse_args(): def main(): args = parse_args() - if not (os.path.exists(args.input) or validate_url(args.input)): - exception("Input not a file or arXiv url.") + if os.path.exists(args.input): + mode = "local_file" + elif arxiv_url(args.input): + mode = "arxiv_url" + elif valid_url(args.input): + if args.filename is None: + exception("Filename must be provided with pdf url (use --filename)") + mode = "pdf_url" + else: + exception("Input not a valid url, arxiv url, or existing file.") if not args.verbose: logger.remove(0) @@ -270,18 +301,28 @@ def main(): start_wd = os.getcwd() with tempfile.TemporaryDirectory() as working_dir: - if os.path.exists(args.input): + if mode == "local_file": shutil.copy(args.input, working_dir) filename = os.path.basename(args.input) - clean_filename = filename + clean_filename = args.filename if args.filename else filename os.chdir(working_dir) - if validate_url(args.input): - pdf_url, abs_url = get_urls(args.input) + if mode == "arxiv_url": + pdf_url, abs_url = get_arxiv_urls(args.input) filename = "paper.pdf" download_url(pdf_url, filename) - paper_info = get_paper_info(abs_url) - clean_filename = generate_filename(paper_info) + if args.filename: + clean_filename = args.filename + else: + paper_info = get_paper_info(abs_url) + clean_filename = generate_filename(paper_info) + + if mode == "pdf_url": + filename = "paper.pdf" + download_url(args.input, filename) + if not check_file_is_pdf(filename): + exception("Input url doesn't point to valid pdf file.") + clean_filename = args.filename dearxived = dearxiv(filename, pdftk_path=args.pdftk) cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop) |
