Add ability to download a pdf from a URL

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-03-04 18:22:40 -0500
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-03-04 18:22:40 -0500
commit: a839d252b22124b3fb0570fadea881ea9ebaef46 (patch)
tree: f6c16f4fe8035eb66d6d8086bdc773f8178ba6e1
parent: If pdf file supplied, use that as rM filename (diff)
download: paper2remarkable-a839d252b22124b3fb0570fadea881ea9ebaef46.tar.gz
paper2remarkable-a839d252b22124b3fb0570fadea881ea9ebaef46.zip
2 files changed, 89 insertions, 31 deletions
diff --git a/README.md b/README.md
index 4687012..00c45c4 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,50 @@
 # arxiv2remarkable.py
 
-This script takes an URL to an arXiv paper, and:
+This script makes it easy to get a PDF on your reMarkable from any of the 
+following sources:
 
-1. Downloads it
+- an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``)
+- a url to a PDF file
+- a local file.
+
+The script takes the source and:
+
+1. Downloads it if necessary
 2. Removes the arXiv timestamp
 3. Crops the pdf to remove unnecessary borders
-4. Shrinks the pdf to reduce filesize
-5. Generates a nice filename based on author/title/year of the paper
-6. Uploads it to the reMarkable
+4. Shrinks the pdf file to reduce the filesize
+5. Generates a nice filename based on author/title/year of the paper (arXiv 
+   only)
+6. Uploads it to your reMarkable using ``rMapi``.
 
 Optionally, you can download a paper but not have it uploaded to the 
-reMarkable (using the ``-n`` switch), or provide an existing pdf file (to use 
-only steps 2 - 6).
+reMarkable using the ``-n`` switch. Also, the ``--filename`` parameter to the 
+script can be used to provide an explicit filename to use on the reMarkable.
 
 Here's the full help of the script:
 
 ```bash
-[arxiv2remarkable] $ python arxiv2remarkable.py -h
-usage: arxiv2remarkable.py [-h] [-v] [-n] [-d] [--rmapi RMAPI]
-                           [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS]
+usage: arxiv2remarkable.py [-h] [-v] [-n] [-d] [--filename FILENAME]
+                           [--rmapi RMAPI] [--pdfcrop PDFCROP] [--pdftk PDFTK]
+                           [--gs GS]
                            input
 
 positional arguments:
-  input              url to an arxiv paper or existing pdf file
+  input                url to an arxiv paper, url to pdf, or existing pdf file
 
 optional arguments:
-  -h, --help         show this help message and exit
-  -v, --verbose      be verbose (default: False)
-  -n, --no-upload    don't upload to the reMarkable, save the output in
-                     current working dir (default: False)
-  -d, --debug        debug mode, doesn't upload to reMarkable (default: False)
-  --rmapi RMAPI      path to rmapi executable (default: rmapi)
-  --pdfcrop PDFCROP  path to pdfcrop executable (default: pdfcrop)
-  --pdftk PDFTK      path to pdftk executable (default: pdftk)
-  --gs GS            path to gs executable (default: gs)
+  -h, --help           show this help message and exit
+  -v, --verbose        be verbose (default: False)
+  -n, --no-upload      don't upload to the reMarkable, save the output in
+                       current working dir (default: False)
+  -d, --debug          debug mode, doesn't upload to reMarkable (default:
+                       False)
+  --filename FILENAME  Filename to use for the file on reMarkable (default:
+                       None)
+  --rmapi RMAPI        path to rmapi executable (default: rmapi)
+  --pdfcrop PDFCROP    path to pdfcrop executable (default: pdfcrop)
+  --pdftk PDFTK        path to pdftk executable (default: pdftk)
+  --gs GS              path to gs executable (default: gs)
 ```
 
 And here's an example with verbose mode enabled that shows everything the 
@@ -70,7 +81,13 @@ The script also needs the following Python packages:
 - [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/)
 - [requests](https://pypi.org/project/requests/)
 - [loguru](https://pypi.org/project/loguru/)
+- [PyPDF2](https://github.com/mstamy2/PyPDF2)
+
+You can use this line:
 
+```bash
+pip install --user bs4 requests loguru PyPDF2
+```
 
 # Notes
 
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 8c7b0ea..5ba357c 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -18,6 +18,7 @@ License: MIT
 
 """
 
+import PyPDF2
 import argparse
 import bs4
 import os
@@ -28,6 +29,7 @@ import subprocess
 import sys
 import tempfile
 import time
+import urllib.parse
 
 from loguru import logger
 
@@ -44,7 +46,7 @@ def exception(msg):
     raise SystemExit(1)
 
 
-def validate_url(url):
+def arxiv_url(url):
     """Check if the url is to an arXiv page.
 
     >>> validate_url("https://arxiv.org/abs/1811.11242")
@@ -68,7 +70,23 @@ def validate_url(url):
     return not m is None
 
 
-def get_urls(url):
+def valid_url(url):
+    try:
+        result = urllib.parse.urlparse(url)
+        return all([result.scheme, result.netloc, result.path])
+    except:
+        return False
+
+
+def check_file_is_pdf(filename):
+    try:
+        PyPDF2.PdfFileReader(open(filename, "rb"))
+        return True
+    except PyPDF2.utils.PdfReadError:
+        return False
+
+
+def get_arxiv_urls(url):
     """Get the pdf and abs url from any given url """
     if re.match("https?://arxiv.org/abs/\d{4}\.\d{5}(v\d+)?", url):
         abs_url = url
@@ -242,6 +260,11 @@ def parse_args():
         action="store_true",
     )
     parser.add_argument(
+        "--filename",
+        help="Filename to use for the file on reMarkable",
+        default=None,
+    )
+    parser.add_argument(
         "--rmapi", help="path to rmapi executable", default="rmapi"
     )
     parser.add_argument(
@@ -252,7 +275,7 @@ def parse_args():
     )
     parser.add_argument("--gs", help="path to gs executable", default="gs")
     parser.add_argument(
-        "input", help="url to an arxiv paper or existing pdf file"
+        "input", help="url to an arxiv paper, url to pdf, or existing pdf file"
     )
     return parser.parse_args()
 
@@ -261,8 +284,16 @@ def parse_args():
 def main():
     args = parse_args()
 
-    if not (os.path.exists(args.input) or validate_url(args.input)):
-        exception("Input not a file or arXiv url.")
+    if os.path.exists(args.input):
+        mode = "local_file"
+    elif arxiv_url(args.input):
+        mode = "arxiv_url"
+    elif valid_url(args.input):
+        if args.filename is None:
+            exception("Filename must be provided with pdf url (use --filename)")
+        mode = "pdf_url"
+    else:
+        exception("Input not a valid url, arxiv url, or existing file.")
 
     if not args.verbose:
         logger.remove(0)
@@ -270,18 +301,28 @@ def main():
     start_wd = os.getcwd()
 
     with tempfile.TemporaryDirectory() as working_dir:
-        if os.path.exists(args.input):
+        if mode == "local_file":
             shutil.copy(args.input, working_dir)
             filename = os.path.basename(args.input)
-            clean_filename = filename
+            clean_filename = args.filename if args.filename else filename
 
         os.chdir(working_dir)
-        if validate_url(args.input):
-            pdf_url, abs_url = get_urls(args.input)
+        if mode == "arxiv_url":
+            pdf_url, abs_url = get_arxiv_urls(args.input)
             filename = "paper.pdf"
             download_url(pdf_url, filename)
-            paper_info = get_paper_info(abs_url)
-            clean_filename = generate_filename(paper_info)
+            if args.filename:
+                clean_filename = args.filename
+            else:
+                paper_info = get_paper_info(abs_url)
+                clean_filename = generate_filename(paper_info)
+
+        if mode == "pdf_url":
+            filename = "paper.pdf"
+            download_url(args.input, filename)
+            if not check_file_is_pdf(filename):
+                exception("Input url doesn't point to valid pdf file.")
+            clean_filename = args.filename
 
         dearxived = dearxiv(filename, pdftk_path=args.pdftk)
         cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-03-04 18:22:40 -0500
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-03-04 18:22:40 -0500
commit	a839d252b22124b3fb0570fadea881ea9ebaef46 (patch)
tree	f6c16f4fe8035eb66d6d8086bdc773f8178ba6e1
parent	If pdf file supplied, use that as rM filename (diff)
download	paper2remarkable-a839d252b22124b3fb0570fadea881ea9ebaef46.tar.gz paper2remarkable-a839d252b22124b3fb0570fadea881ea9ebaef46.zip