[wip] First commit of new code structure

Probably not fully functional yet.
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-24 14:02:36 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-24 14:02:36 +0100
commit: 83df50f47426cefb71c2f4fde161c8fad934dba3 (patch)
tree: 904bbc056ae2f4ac8a11fee0c4255ed23aa7295c
parent: Move tests to separate directory (diff)
download: paper2remarkable-83df50f47426cefb71c2f4fde161c8fad934dba3.tar.gz
paper2remarkable-83df50f47426cefb71c2f4fde161c8fad934dba3.zip
17 files changed, 1179 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ed2d040
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,62 @@
+# Makefile for easier installation and cleanup.
+#
+# Uses self-documenting macros from here:
+# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+
+# The recipes below use `source`, which is a bashism that the default /bin/sh
+# is not guaranteed to provide.
+SHELL := bash
+
+PACKAGE := paper2remarkable
+DOC_DIR := ./docs/
+# No quotes and no trailing slash: this value is also used to build a target
+# name, and Make treats quote characters as part of the file name (so the
+# target would never be found on disk and the venv would always be rebuilt).
+VENV_DIR := /tmp/p2r_venv
+
+# All targets that are commands rather than files.
+.PHONY: help release install test clean dist docs doc venv
+
+.DEFAULT_GOAL := help
+
+# Print every target that carries a "## description" comment.
+help:
+	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\
+		 awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\
+		 %s\n", $$1, $$2}'
+
+release: ## Make a release
+	python make_release.py
+
+
+install: ## Install for the current user using the default python command
+	python setup.py build_ext --inplace
+	python setup.py install --user
+
+
+test: venv ## Run unit tests
+	source $(VENV_DIR)/bin/activate && green -v ./tests/test_unit
+
+
+clean: ## Clean build dist and egg directories left after install
+	rm -rf ./dist
+	rm -rf ./build
+	rm -rf ./$(PACKAGE).egg-info
+	rm -rf $(VENV_DIR)
+	rm -f MANIFEST
+	find . -type f -iname '*.pyc' -delete
+	find . -type d -name '__pycache__' -empty -delete
+
+dist: ## Make Python source distribution
+	python setup.py sdist
+	python setup.py bdist_wheel --universal
+
+docs: doc
+doc: install ## Build documentation with Sphinx
+	m2r README.md && mv README.rst $(DOC_DIR)
+	m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR)
+	cd $(DOC_DIR) && \
+		rm -f source/* && \
+		sphinx-apidoc -H '$(PACKAGE) API Documentation' -o source ../$(PACKAGE) && \
+		touch source/AUTOGENERATED
+	$(MAKE) -C $(DOC_DIR) html
+
+
+
+venv: $(VENV_DIR)/bin/activate
+
+# The activate script doubles as the stamp file for the virtual environment.
+$(VENV_DIR)/bin/activate:
+	test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
+	source $(VENV_DIR)/bin/activate && pip install -q -e '.[dev]'
+	touch $(VENV_DIR)/bin/activate
diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/paper2remarkable/__init__.py
diff --git a/paper2remarkable/__main__.py b/paper2remarkable/__main__.py
new file mode 100644
index 0000000..b97d538
--- /dev/null
+++ b/paper2remarkable/__main__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+"""
+Caller for the command line application
+"""
+
+import sys
+
+def main():
+    """Run the command line interface and exit with its return value.
+
+    The real entry point is imported inside the function body rather than at
+    module level.
+    """
+    from .ui import main as realmain
+
+    exit_code = realmain()
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py
new file mode 100644
index 0000000..5bee2af
--- /dev/null
+++ b/paper2remarkable/__version__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+# Version as a tuple of integers: (major, minor, patch)
+VERSION = (0, 4, 0)
+
+# Dotted string form of VERSION
+__version__ = ".".join(str(part) for part in VERSION)
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
new file mode 100644
index 0000000..b25b178
--- /dev/null
+++ b/paper2remarkable/crop.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+
+"""Code for cropping a PDF file
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import PyPDF2
+import os
+import subprocess
+import pdfplumber
+
+RM_WIDTH = 1404
+RM_HEIGHT = 1872
+
+
+class Cropper(object):
+    """Crop or center the pages of a PDF file using ``pdfcrop``
+
+    Each page of the input file is exported to a temporary single-page PDF in
+    the current working directory, a bounding box is computed for it, and the
+    ``pdfcrop`` executable is called with that bounding box. The processed
+    pages are collected in ``self.writer`` and written to ``output_file``.
+
+    Note that ``self.reader`` is only created when ``input_file`` is given and
+    ``self.output_file`` only when ``output_file`` is given.
+    """
+
+    def __init__(
+        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+    ):
+        if input_file is not None:
+            self.input_file = os.path.abspath(input_file)
+            self.reader = PyPDF2.PdfFileReader(self.input_file)
+        if output_file is not None:
+            self.output_file = os.path.abspath(output_file)
+        self.pdfcrop_path = pdfcrop_path
+
+        self.writer = PyPDF2.PdfFileWriter()
+
+    def crop(self, margins=1):
+        """Crop all pages to their content, returns 0 on success """
+        return self.process_file(self.crop_page, margins=margins)
+
+    def center(self, padding=15):
+        """Center all pages for the reMarkable, returns 0 on success """
+        return self.process_file(self.center_page, padding=padding)
+
+    def process_file(self, page_func, *args, **kwargs):
+        """Apply ``page_func`` to every page and write the output file
+
+        Stops at the first page for which ``page_func`` returns a non-zero
+        status and returns that status; in that case no output file is
+        written. Returns 0 when all pages were processed.
+        """
+        for page_idx in range(self.reader.getNumPages()):
+            status = page_func(page_idx, *args, **kwargs)
+            if status != 0:
+                return status
+        with open(self.output_file, "wb") as fp:
+            self.writer.write(fp)
+        return 0
+
+    def center_page(self, page_idx, padding):
+        """Center a single page, see ``get_center_bbox`` """
+        return self.process_page(
+            page_idx, self.get_center_bbox, padding=padding
+        )
+
+    def crop_page(self, page_idx, margins):
+        """Crop a single page, see ``get_bbox`` """
+        return self.process_page(page_idx, self.get_bbox, margins=margins)
+
+    def export_page(self, page_idx):
+        """Helper function that exports a single page given by index """
+        page = self.reader.getPage(page_idx)
+        writer = PyPDF2.PdfFileWriter()
+        writer.addPage(page)
+        tmpfname = "./page.pdf"
+        with open(tmpfname, "wb") as fp:
+            writer.write(fp)
+        return tmpfname
+
+    def process_page(self, page_idx, bbox_func, *args, **kwargs):
+        """Process a single page and add it to the writer
+
+        ``bbox_func`` is called with the filename of the exported page and
+        the remaining arguments, and must return the bounding box that is
+        passed to pdfcrop. Returns the exit status of pdfcrop if it is
+        non-zero, and 0 otherwise.
+        """
+        tmpfname = self.export_page(page_idx)
+        tmpfout = "./output.pdf"
+        try:
+            bbox = bbox_func(tmpfname, *args, **kwargs)
+            status = subprocess.call(
+                [
+                    self.pdfcrop_path,
+                    "--bbox",
+                    " ".join(map(str, bbox)),
+                    tmpfname,
+                    tmpfout,
+                ],
+                stdout=subprocess.DEVNULL,
+            )
+            if status != 0:
+                return status
+            reader = PyPDF2.PdfFileReader(tmpfout)
+            page = reader.getPage(0)
+            self.writer.addPage(page)
+        finally:
+            # Remove the temporary files on failure as well, so a failed
+            # page doesn't leave stale files behind in the working directory.
+            for path in (tmpfname, tmpfout):
+                if os.path.exists(path):
+                    os.unlink(path)
+        return 0
+
+    def get_bbox(self, filename, margins=1, resolution=72):
+        """Get the bounding box, with optional margins
+
+        if margins is a single number, used for all margins, else
+        margins = [left, top, right, bottom]
+
+        We get the bounding box by finding the smallest rectangle that is
+        completely surrounded by white pixels. For a page that is entirely
+        white the bounding box of the full page is returned.
+
+        The box is computed in pixels of the image rendered at ``resolution``
+        and returned as [x0, y0, x1, y1] with the origin in the bottom left.
+        """
+        if isinstance(margins, (int, float)):
+            margins = [margins for _ in range(4)]
+        pdf = pdfplumber.open(filename)
+        try:
+            im = pdf.pages[0].to_image(resolution=resolution)
+        finally:
+            pdf.close()
+
+        pixels = list(im.original.getdata())
+        W, H = im.original.size
+
+        # M is a list of H lists with each W integers that equal the sum of the
+        # pixel values
+        M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
+
+        # Sum of a fully white row/column; this treats a pixel as white when
+        # its values sum to 3 * 255 (assumes three 8-bit channels per pixel).
+        white_row = W * 255 * 3
+        white_col = H * 255 * 3
+
+        left, top, bottom, right = 0, 0, 0, 0
+        while top < H and sum(M[top]) == white_row:
+            top += 1
+        if top == H:
+            # Completely white page: there is no content to crop to, and
+            # continuing would give an inverted box. Keep the full page.
+            return [0, 0, W, H]
+        while bottom < H and sum(M[H - 1 - bottom]) == white_row:
+            bottom += 1
+
+        # Transpose M
+        M = list(zip(*M))
+        while left < W and sum(M[left]) == white_col:
+            left += 1
+        while right < W and sum(M[W - 1 - right]) == white_col:
+            right += 1
+
+        left -= margins[0]
+        top -= margins[1]
+        right -= margins[2]
+        bottom -= margins[3]
+
+        # This is the bounding box in PIL format: (0, 0) top left
+        x0, y0, x1, y1 = left, top, W - right, H - bottom
+
+        # Get the bbox in Ghostscript format: (0, 0) bottom left
+        a0, b0, a1, b1 = x0, H - y1, x1, H - y0
+        return [a0, b0, a1, b1]
+
+    def get_center_bbox(self, filename, padding=15):
+        """Compute a bounding box that will center the page file on the
+        reMarkable
+
+        The content box is first determined without margins, and the extra
+        left or top margin needed to reach the aspect ratio RM_HEIGHT /
+        RM_WIDTH is added on top of ``padding``.
+        """
+        bbox = self.get_bbox(filename, margins=0)
+
+        h = bbox[3] - bbox[1]
+        w = bbox[2] - bbox[0]
+
+        # we want some minimal padding all around, because it is visually more
+        # pleasing.
+        h_prime = h + 2 * padding
+        w_prime = w + 2 * padding
+
+        # if the document is wider than the remarkable, we add top-padding to
+        # center it, otherwise we add left-padding
+        x, y = 0, 0
+        if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
+            y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
+        else:
+            x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
+
+        margins = [padding + x, padding + y, padding, padding]
+        return self.get_bbox(filename, margins=margins)
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
new file mode 100644
index 0000000..361c11e
--- /dev/null
+++ b/paper2remarkable/providers/__init__.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Import the base class first so that it is bound as an attribute of this
+# package: several provider modules below use ``from . import Provider``,
+# which fails with an ImportError if the package doesn't expose the name yet.
+from ._base import Provider
+from .arxiv import Arxiv
+from .pubmed import Pubmed
+from .acm import ACM
+from .openreview import OpenReview
+from .springer import Springer
+from .local import LocalFile
+from .pdf_url import PdfUrl
+
+# All available provider classes, in their original order.
+providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
new file mode 100644
index 0000000..05fc0b7
--- /dev/null
+++ b/paper2remarkable/providers/_base.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+
+"""Base for the Provider class
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import PyPDF2
+import abc
+import bs4
+import datetime
+import os
+import re
+import requests
+import shutil
+import string
+import subprocess
+import tempfile
+import time
+import titlecase
+import unidecode
+
+from ..crop import Cropper
+from ..utils import exception
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+    "Safari/537.36"
+}
+
+
+class Provider(metaclass=abc.ABCMeta):
+    """ ABC for providers of pdf sources
+
+    A provider retrieves the metadata and the pdf file for a given source,
+    runs a sequence of pdf operations on the file, and uploads or saves the
+    result (see ``run``).
+
+    Subclasses must implement ``validate``. The methods ``retrieve_pdf`` and
+    ``get_paper_info`` call ``self.get_abs_pdf_urls(src)``, which is not
+    defined on this class; subclasses that don't override those two methods
+    have to supply it and return the pair ``(abs_url, pdf_url)``.
+    """
+
+    # Names of the html meta tags that hold the paper information
+    meta_author_key = "citation_author"
+    meta_title_key = "citation_title"
+    meta_date_key = "citation_date"
+
+    def __init__(
+        self,
+        verbose=False,
+        upload=True,
+        debug=False,
+        center=False,
+        blank=False,
+        remarkable_dir="/",
+        rmapi_path="rmapi",
+        pdfcrop_path="pdfcrop",
+        pdftk_path="pdftk",
+        gs_path="gs",
+    ):
+        self.verbose = verbose
+        self.upload = upload
+        self.debug = debug
+        self.center = center
+        self.blank = blank
+        self.remarkable_dir = remarkable_dir
+        self.rmapi_path = rmapi_path
+        self.pdfcrop_path = pdfcrop_path
+        self.pdftk_path = pdftk_path
+        self.gs_path = gs_path
+
+        self.log("Starting %s" % type(self).__name__)
+
+    def log(self, msg, mode="info"):
+        """ Print a timestamped message, only when verbose is enabled
+
+        ``mode`` must be "info" or "warning", otherwise a ValueError is
+        raised. Nothing is printed or checked when ``self.verbose`` is False.
+        """
+        if not self.verbose:
+            return
+        if mode not in ["info", "warning"]:
+            raise ValueError("unknown logging mode.")
+        now = datetime.datetime.now()
+        print(
+            now.strftime("%Y-%m-%d %H:%M:%S")
+            + " - "
+            + mode.upper()
+            + " - "
+            + msg
+        )
+
+    def warn(self, msg):
+        """ Log a message with mode "warning" """
+        self.log(msg, mode="warning")
+
+    @staticmethod
+    @abc.abstractmethod
+    def validate(src):
+        """ Validate whether ``src`` is appropriate for this provider """
+
+    def retrieve_pdf(self, src, filename):
+        """ Download pdf from src and save to filename """
+        _, pdf_url = self.get_abs_pdf_urls(src)
+        self.download_url(pdf_url, filename)
+
+    def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
+        """ Format the author list retrieved by bs4
+
+        ``op`` is applied to the list first (identity by default), then each
+        entry is split on ``sep`` and the part at index ``idx`` is kept.
+        """
+        op = (lambda x: x) if op is None else op
+        return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
+
+    def get_authors(self, soup):
+        """ Get the formatted authors from the ``meta_author_key`` tags """
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": self.meta_author_key})
+        ]
+        return self._format_authors(authors)
+
+    def get_title(self, soup):
+        """ Get the title from the first ``meta_title_key`` tag """
+        target = soup.find_all("meta", {"name": self.meta_title_key})
+        return target[0]["content"]
+
+    def _format_date(self, soup_date):
+        """ Hook for subclasses to reformat the date, identity here """
+        return soup_date
+
+    def get_date(self, soup):
+        """ Get the formatted date from the first ``meta_date_key`` tag """
+        date = soup.find_all("meta", {"name": self.meta_date_key})[0][
+            "content"
+        ]
+        return self._format_date(date)
+
+    def get_paper_info(
+        self,
+        src,
+        author_key="citation_author",
+        title_key="citation_title",
+        date_key="citation_date",
+    ):
+        """ Retrieve the title/author (surnames)/year information
+
+        Returns a dict with the keys "title", "date", and "authors". The
+        ``*_key`` arguments are not used by this implementation; the
+        ``meta_*_key`` class attributes determine which tags are read.
+        """
+        abs_url, _ = self.get_abs_pdf_urls(src)
+        self.log("Getting paper info")
+        page = self.get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = self.get_authors(soup)
+        title = self.get_title(soup)
+        date = self.get_date(soup)
+        return dict(title=title, date=date, authors=authors)
+
+    def string_clean(self, s):
+        """ Clean a string to replace accented characters with equivalents and
+        keep only the allowed characters """
+        normalized = unidecode.unidecode(s)
+        allowed = string.ascii_letters + string.digits + "_ ."
+        cleaned = "".join(c if c in allowed else "_" for c in normalized)
+        return cleaned
+
+    def create_filename(self, info, filename=None):
+        """ Generate filename using the info dict or filename if provided """
+        if filename is not None:
+            return filename
+        # we assume that the list of authors is surname only.
+        self.log("Generating output filename")
+
+        if len(info["authors"]) > 3:
+            author_part = info["authors"][0] + "_et_al"
+        else:
+            author_part = "_".join(info["authors"])
+        author_part = self.string_clean(author_part)
+
+        title_part = self.string_clean(info["title"])
+        title_part = titlecase.titlecase(title_part).replace(" ", "_")
+
+        # the year is taken to be the part of the date before the first "/"
+        year_part = info["date"].split("/")[0]
+
+        name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+        name = unidecode.unidecode(name)
+        self.log("Created filename: %s" % name)
+        return name
+
+    def blank_pdf(self, filepath):
+        """ Add a blank page after every page, if ``self.blank`` is set
+
+        Returns the path of the new file, or ``filepath`` when disabled.
+        """
+        if not self.blank:
+            return filepath
+
+        self.log("Adding blank pages")
+        input_pdf = PyPDF2.PdfFileReader(filepath)
+        output_pdf = PyPDF2.PdfFileWriter()
+        for page in input_pdf.pages:
+            output_pdf.addPage(page)
+            output_pdf.addBlankPage()
+
+        output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
+        with open(output_file, "wb") as fp:
+            output_pdf.write(fp)
+        return output_file
+
+    def crop_pdf(self, filepath):
+        """ Crop the pdf with a margin of 15
+
+        Returns the path of the cropped file, or ``filepath`` if cropping
+        failed (a warning is logged in that case).
+        """
+        self.log("Cropping pdf file")
+        cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+        cropper = Cropper(
+            filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
+        )
+        status = cropper.crop(margins=15)
+
+        if status != 0:
+            self.warn("Failed to crop the pdf file at: %s" % filepath)
+            return filepath
+        if not os.path.exists(cropped_file):
+            self.warn(
+                "Can't find cropped file '%s' where expected." % cropped_file
+            )
+            return filepath
+        return cropped_file
+
+    def center_pdf(self, filepath):
+        """ Center the pdf on the page, if ``self.center`` is set
+
+        Returns the path of the centered file, or ``filepath`` when disabled
+        or when centering failed (a warning is logged in that case).
+        """
+        if not self.center:
+            return filepath
+
+        self.log("Centering pdf file")
+        centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
+        cropper = Cropper(
+            filepath, centered_file, pdfcrop_path=self.pdfcrop_path
+        )
+        status = cropper.center()
+        if status != 0:
+            self.warn("Failed to center the pdf file at: %s" % filepath)
+            return filepath
+        if not os.path.exists(centered_file):
+            self.warn(
+                "Can't find centered file '%s' where expected." % centered_file
+            )
+            return filepath
+        return centered_file
+
+    def shrink_pdf(self, filepath):
+        """ Rewrite the pdf with Ghostscript to reduce the file size
+
+        Returns the path of the new file, or ``filepath`` if Ghostscript
+        exited with a non-zero status.
+        """
+        self.log("Shrinking pdf file")
+        output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+        status = subprocess.call(
+            [
+                self.gs_path,
+                "-sDEVICE=pdfwrite",
+                "-dCompatibilityLevel=1.4",
+                "-dPDFSETTINGS=/printer",
+                "-dNOPAUSE",
+                "-dBATCH",
+                "-dQUIET",
+                "-sOutputFile=%s" % output_file,
+                filepath,
+            ]
+        )
+        if status != 0:
+            self.warn("Failed to shrink the pdf file")
+            return filepath
+        return output_file
+
+    def check_file_is_pdf(self, filename):
+        """ Check that PyPDF2 can read the file, returns True if so
+
+        When the file can't be read the error is reported through
+        ``exception``.
+        """
+        try:
+            # the context manager closes the file on a read error as well
+            with open(filename, "rb") as fp:
+                PyPDF2.PdfFileReader(fp, strict=False)
+            return True
+        except PyPDF2.utils.PdfReadError:
+            exception("Downloaded file isn't a valid pdf file.")
+
+    def download_url(self, url, filename):
+        """Download the content of an url and save it to a filename """
+        self.log("Downloading file at url: %s" % url)
+        content = self.get_page_with_retry(url)
+        with open(filename, "wb") as fid:
+            fid.write(content)
+
+    def get_page_with_retry(self, url, tries=5):
+        """ Get the content of the url, trying at most ``tries`` times
+
+        A try fails on a connection error or a response that isn't ok, and
+        is followed by a pause of 5 seconds. If no try succeeds the failure
+        is reported through ``exception``, instead of silently returning
+        None and failing later at the caller.
+        """
+        count = 0
+        while count < tries:
+            count += 1
+            error = False
+            try:
+                res = requests.get(url, headers=HEADERS)
+            except requests.exceptions.ConnectionError:
+                error = True
+            # ``res`` is only inspected when the request itself succeeded
+            if error or not res.ok:
+                self.warn("Error getting url %s. Retrying in 5 seconds" % url)
+                time.sleep(5)
+                continue
+            self.log("Downloading url: %s" % url)
+            return res.content
+        exception("Failed to retrieve url %s after %i tries." % (url, tries))
+
+    def upload_to_rm(self, filepath):
+        """ Upload the file to the reMarkable using rmapi
+
+        The target directory is created first, unless it is the root. Both
+        failures are reported through ``exception``.
+        """
+        remarkable_dir = self.remarkable_dir.rstrip("/")
+        self.log("Starting upload to reMarkable")
+        if remarkable_dir:
+            status = subprocess.call(
+                [self.rmapi_path, "mkdir", remarkable_dir + "/"],
+                stdout=subprocess.DEVNULL,
+            )
+            if status != 0:
+                exception(
+                    "Creating directory %s on reMarkable failed"
+                    % remarkable_dir
+                )
+        status = subprocess.call(
+            [self.rmapi_path, "put", filepath, remarkable_dir + "/"],
+            stdout=subprocess.DEVNULL,
+        )
+        if status != 0:
+            exception("Uploading file %s to reMarkable failed" % filepath)
+        self.log("Upload successful.")
+
+    def dearxiv(self, input_file):
+        """Remove the arXiv timestamp from a pdf
+
+        The pdf is uncompressed with pdftk, the timestamp text and the link
+        to the arXiv abstract page are removed from the raw data, and the
+        result is compressed again. Returns the path of the output file.
+        """
+        self.log("Removing arXiv timestamp")
+        basename = os.path.splitext(input_file)[0]
+        uncompress_file = basename + "_uncompress.pdf"
+
+        status = subprocess.call(
+            [
+                self.pdftk_path,
+                input_file,
+                "output",
+                uncompress_file,
+                "uncompress",
+            ]
+        )
+        if status != 0:
+            exception("pdftk failed to uncompress the pdf.")
+
+        with open(uncompress_file, "rb") as fid:
+            data = fid.read()
+            # Remove the text element
+            data = re.sub(
+                rb"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+                b"()Tj",
+                data,
+            )
+            # Remove the URL element
+            data = re.sub(
+                rb"<<\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\n\/S /URI\n>>\n",
+                b"",
+                data,
+            )
+
+        removed_file = basename + "_removed.pdf"
+        with open(removed_file, "wb") as oid:
+            oid.write(data)
+
+        output_file = basename + "_dearxiv.pdf"
+        status = subprocess.call(
+            [self.pdftk_path, removed_file, "output", output_file, "compress"]
+        )
+        if status != 0:
+            exception("pdftk failed to compress the pdf.")
+
+        return output_file
+
+    def run(self, src, filename=None):
+        """ Retrieve, process, and upload or save the paper at ``src``
+
+        All processing happens inside a temporary working directory. In
+        debug mode the method waits for user input before the directory is
+        removed. When uploading is disabled the file is moved to the
+        directory the method was called from, and its path is returned.
+        """
+        info = self.get_paper_info(src)
+        clean_filename = self.create_filename(info, filename)
+        tmp_filename = "paper.pdf"
+
+        self.initial_dir = os.getcwd()
+        with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
+            os.chdir(working_dir)
+            try:
+                self.retrieve_pdf(src, tmp_filename)
+                self.check_file_is_pdf(tmp_filename)
+
+                ops = [
+                    self.dearxiv,
+                    self.crop_pdf,
+                    self.center_pdf,
+                    self.blank_pdf,
+                    self.shrink_pdf,
+                ]
+                intermediate_fname = tmp_filename
+                for op in ops:
+                    intermediate_fname = op(intermediate_fname)
+                shutil.move(intermediate_fname, clean_filename)
+
+                if self.debug:
+                    print("Paused in debug mode in dir: %s" % working_dir)
+                    print("Press enter to exit.")
+                    return input()
+
+                if self.upload:
+                    return self.upload_to_rm(clean_filename)
+
+                # don't overwrite existing files, append "_" until unused
+                target_path = os.path.join(self.initial_dir, clean_filename)
+                while os.path.exists(target_path):
+                    base = os.path.splitext(target_path)[0]
+                    target_path = base + "_.pdf"
+                shutil.move(clean_filename, target_path)
+                return target_path
+            finally:
+                # Leave the working directory before it is removed, so the
+                # process doesn't end up in a directory that no longer exists.
+                os.chdir(self.initial_dir)
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
new file mode 100644
index 0000000..be98e16
--- /dev/null
+++ b/paper2remarkable/providers/acm.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for ACM
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import bs4
+import re
+
+from . import Provider
+from ..utils import exception
+
+# TODO: put this somewhere central, now multiply defined
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+
+class ACM(Provider):
+    """ Provider for citation pages on dl.acm.org """
+
+    meta_author_key = "citation_authors"
+
+    # raw string, so the regex escapes aren't invalid string escapes
+    re_abs = r"https?://dl\.acm\.org/citation\.cfm\?id=\d+"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_acm_pdf_url(self, url):
+        """ Get the pdf url from the citation page at ``url``
+
+        The url is taken from the first link named "FullTextPDF". Returns
+        None if there is no such link or if it has no href.
+        """
+        page = self.get_page_with_retry(url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        thea = None
+        for a in soup.find_all("a"):
+            if a.get("name") == "FullTextPDF":
+                thea = a
+                break
+        if thea is None:
+            return None
+        href = thea.get("href")
+        if href is None:
+            return None
+        if href.startswith("http"):
+            return href
+        return "https://dl.acm.org/" + href
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract url from an ACM citation url """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = self.get_acm_pdf_url(url)
+            if pdf_url is None:
+                exception(
+                    "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
+                )
+        else:
+            exception(
+                "Couldn't figure out ACM urls, please provide a URL of the "
+                "format: http(s)://dl.acm.org/citation.cfm?id=..."
+            )
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """ Check if the url is exactly an ACM citation url """
+        m = re.fullmatch(ACM.re_abs, src)
+        return m is not None
+
+    def _format_authors(self, soup_authors):
+        # the first entry holds all authors separated by ";", and the part
+        # before the "," of each author is kept
+        op = lambda x: x[0].split(";")
+        return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+
+    def _format_date(self, soup_date):
+        """ Return the part of the date after the last "/"
+
+        A warning is logged if the date isn't of the form dd/dd/dddd.
+        """
+        if not re.match(r"\d{2}/\d{2}/\d{4}", soup_date.strip()):
+            self.warn(
+                "Couldn't extract year from ACM page, please raise an "
+                "issue on GitHub so it can be fixed: %s" % GITHUB_URL
+            )
+        return soup_date.strip().split("/")[-1]
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
new file mode 100644
index 0000000..fc5c004
--- /dev/null
+++ b/paper2remarkable/providers/arxiv.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for arxiv.org
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ..utils import exception
+
+
+class Arxiv(Provider):
+    """ Provider for abstract and pdf urls on arxiv.org """
+
+    # raw strings, so the regex escapes aren't invalid string escapes
+    re_abs = r"https?://arxiv\.org/abs/\d{4}\.\d{4,5}(v\d+)?"
+    re_pdf = r"https?://arxiv\.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and abs url from any given arXiv url """
+        # Only the first occurrence is replaced, which is the path component
+        # for any url that matches the patterns above.
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.replace("abs", "pdf", 1) + ".pdf"
+        elif re.match(self.re_pdf, url):
+            abs_url = url[:-4].replace("pdf", "abs", 1)
+            pdf_url = url
+        else:
+            exception("Couldn't figure out arXiv urls.")
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """Check if the url is to an arXiv page. """
+        return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src)
diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
new file mode 100644
index 0000000..68ce030
--- /dev/null
+++ b/paper2remarkable/providers/local.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for local files
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import os
+import shutil
+
+from . import Provider
+
+
+class LocalFile(Provider):
+    """ Provider for pdf files on the local file system """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @staticmethod
+    def validate(src):
+        """ Check if ``src`` is an existing file """
+        # a directory exists as well, but can't be copied by retrieve_pdf
+        return os.path.isfile(src)
+
+    def retrieve_pdf(self, src, filename):
+        """ Copy the file at ``src`` to ``filename``
+
+        A relative ``src`` is resolved against ``self.initial_dir``.
+        """
+        source = os.path.join(self.initial_dir, src)
+        shutil.copy(source, filename)
+
+    def get_paper_info(self, src):
+        """ Return the info dict, which holds only the source filename """
+        return {"filename": src}
+
+    def create_filename(self, info, filename=None):
+        """ Use ``filename`` if provided, else the basename of the source """
+        if filename is not None:
+            return filename
+        return os.path.basename(info["filename"])
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
new file mode 100644
index 0000000..b7e1d77
--- /dev/null
+++ b/paper2remarkable/providers/openreview.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for OpenReview
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from . import Provider
+from ..utils import exception
+
+
+class OpenReview(Provider):
+    """ Provider for forum and pdf urls on openreview.net """
+
+    meta_date_key = "citation_publication_date"
+
+    # raw strings, so the regex escapes aren't invalid string escapes
+    re_abs = r"https?://openreview\.net/forum\?id=[A-Za-z0-9]+"
+    re_pdf = r"https?://openreview\.net/pdf\?id=[A-Za-z0-9]+"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract url from a OpenReview url """
+        # Only the first occurrence is replaced (the path component), so an
+        # id that happens to contain "pdf" or "forum" is left intact.
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.replace("forum", "pdf", 1)
+        elif re.match(self.re_pdf, url):
+            abs_url = url.replace("pdf", "forum", 1)
+            pdf_url = url
+        else:
+            exception("Couldn't figure out OpenReview urls.")
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """ Check if the url is a valid OpenReview url. """
+        return re.match(OpenReview.re_abs, src) or re.match(
+            OpenReview.re_pdf, src
+        )
+
+    def _format_authors(self, soup_authors):
+        # keep the last space-separated part of each author
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
new file mode 100644
index 0000000..56427d3
--- /dev/null
+++ b/paper2remarkable/providers/pdf_url.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for generic PDF url
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import urllib
+
+from . import Provider
+from ..utils import exception
+
+
+class PdfUrl(Provider):
+    """ Provider for a direct url to a pdf file """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @staticmethod
+    def validate(src):
+        """ Check if ``src`` parses as a url with scheme, host, and path """
+        # ``import urllib`` alone doesn't guarantee that the ``parse``
+        # submodule is loaded, so import it explicitly.
+        import urllib.parse
+
+        try:
+            result = urllib.parse.urlparse(src)
+            return all([result.scheme, result.netloc, result.path])
+        except (ValueError, TypeError, AttributeError):
+            # not parseable as a url; don't swallow unrelated exceptions
+            # such as KeyboardInterrupt
+            return False
+
+    def retrieve_pdf(self, url, filename):
+        """ Download the pdf at ``url`` to ``filename`` """
+        self.download_url(url, filename)
+
+    def get_paper_info(self, src):
+        """ No paper info is available for a plain url, returns None """
+        return None
+
+    def create_filename(self, info, filename=None):
+        """ Return ``filename``, which is required for this provider """
+        if filename is None:
+            exception(
+                "Filename must be provided with PDFUrlProvider (use --filename)"
+            )
+        return filename
diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py
new file mode 100644
index 0000000..29bdb31
--- /dev/null
+++ b/paper2remarkable/providers/pubmed.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for PubMed
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from . import Provider
+from ..utils import exception
+
+class Pubmed(Provider):
+    """ Provider for article and pdf urls on PubMed Central """
+
+    meta_author_key = "citation_authors"
+
+    # raw strings, so the regex escapes aren't invalid string escapes
+    re_abs = r"https?://www\.ncbi\.nlm\.nih\.gov/pmc/articles/PMC\d+/?"
+    re_pdf = (
+        r"https?://www\.ncbi\.nlm\.nih\.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and html url from a given PMC url """
+        if re.match(self.re_pdf, url):
+            # the html url is everything before the "/pdf" component
+            idx = url.index("pdf")
+            abs_url = url[: idx - 1]
+            pdf_url = url
+        elif re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.rstrip("/") + "/pdf"  # it redirects, usually
+        else:
+            exception("Couldn't figure out PMC urls.")
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """ Check if the url is a PMC article or pdf url """
+        return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src)
+
+    def _format_authors(self, soup_authors):
+        # the first entry holds all authors separated by ",", and the last
+        # space-separated part of each author is kept
+        op = lambda x: x[0].split(",")
+        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+    def _format_date(self, soup_date):
+        """ Return the year for dates like "word dddd", otherwise the date
+        with spaces replaced by underscores """
+        if re.match(r"\w+ \d{4}", soup_date):
+            return soup_date.split(" ")[-1]
+        return soup_date.replace(" ", "_")
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
new file mode 100644
index 0000000..ce16007
--- /dev/null
+++ b/paper2remarkable/providers/springer.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Springer
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+import urllib
+
+from . import Provider
+from ..utils import exception
+
+
+class Springer(Provider):
+    """ Provider for article and pdf urls on link.springer.com """
+
+    meta_date_key = "citation_online_date"
+
+    # raw strings, so the regex escapes aren't invalid string escapes
+    re_abs = r"https?://link\.springer\.com/article/10\.\d{4}/[a-z0-9\-]+"
+    re_pdf = r"https?://link\.springer\.com/content/pdf/10\.\d{4}(%2F|/)[a-z0-9\-]+\.pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract urls from a Springer url """
+        # ``import urllib`` alone doesn't guarantee that the ``parse``
+        # submodule is loaded, so import it explicitly.
+        import urllib.parse
+
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.replace("article", "content/pdf")
+        elif re.match(self.re_pdf, url):
+            # The pdf url may contain a percent-encoded slash (%2F). Decode
+            # it first, so the abstract url is derived from the decoded form
+            # as well.
+            pdf_url = urllib.parse.unquote(url)
+            abs_url = pdf_url.replace("content/pdf", "article")[
+                : -len(".pdf")
+            ]
+        else:
+            exception("Couldn't figure out Springer urls.")
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """ Check if the url is a Springer article or pdf url """
+        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
+
+    def _format_authors(self, soup_authors):
+        # keep the last space-separated part of each author
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
new file mode 100644
index 0000000..71fc655
--- /dev/null
+++ b/paper2remarkable/ui.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+"""Command line interface
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import argparse
+
+from .providers import providers
+from .utils import exception
+
+
+def parse_args():
+    """Build the command line parser and parse ``sys.argv``.
+
+    Returns the ``argparse.Namespace`` holding the flags ``blank``,
+    ``verbose``, ``no_upload``, ``debug`` and ``center``, the options
+    ``filename``, ``remarkable_dir``, ``rmapi``, ``pdfcrop``, ``pdftk`` and
+    ``gs``, and the positional ``input``.
+    """
+    # One (flags, settings) pair per argument, listed in the order in which
+    # they are registered with the parser.
+    argument_table = (
+        (
+            ("-b", "--blank"),
+            {
+                "help": "Add a blank page after every page of the PDF",
+                "action": "store_true",
+            },
+        ),
+        (
+            ("-v", "--verbose"),
+            {"help": "be verbose", "action": "store_true"},
+        ),
+        (
+            ("-n", "--no-upload"),
+            {
+                "help": "don't upload to the reMarkable, save the output in current working dir",
+                "action": "store_true",
+            },
+        ),
+        (
+            ("-d", "--debug"),
+            {
+                "help": "debug mode, doesn't upload to reMarkable",
+                "action": "store_true",
+            },
+        ),
+        (
+            ("-c", "--center"),
+            {
+                "help": "Center the PDF on the page, instead of left align",
+                "action": "store_true",
+            },
+        ),
+        (
+            ("--filename",),
+            {
+                "help": "Filename to use for the file on reMarkable",
+                "default": None,
+            },
+        ),
+        (
+            ("-p", "--remarkable-path"),
+            {
+                "help": "directory on reMarkable to put the file (created if missing)",
+                "dest": "remarkable_dir",
+                "default": "/",
+            },
+        ),
+        (
+            ("--rmapi",),
+            {"help": "path to rmapi executable", "default": "rmapi"},
+        ),
+        (
+            ("--pdfcrop",),
+            {"help": "path to pdfcrop executable", "default": "pdfcrop"},
+        ),
+        (
+            ("--pdftk",),
+            {"help": "path to pdftk executable", "default": "pdftk"},
+        ),
+        (
+            ("--gs",),
+            {"help": "path to gs executable", "default": "gs"},
+        ),
+        (
+            ("input",),
+            {"help": "URL to a paper or the path of a local PDF file"},
+        ),
+    )
+
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    for flags, settings in argument_table:
+        cli.add_argument(*flags, **settings)
+    return cli.parse_args()
+
+
+def main():
+    """Command line entry point.
+
+    Parses the arguments, selects the first provider whose ``validate``
+    accepts the input, instantiates it with the command line settings and
+    runs it on the input.
+    """
+    cli_args = parse_args()
+    source = cli_args.input
+
+    # The first provider that accepts the source wins; later ones are not
+    # asked at all.
+    chosen = None
+    for candidate in providers:
+        if candidate.validate(source):
+            chosen = candidate
+            break
+    if chosen is None:
+        exception("Input not valid, no provider can handle this source.")
+
+    settings = {
+        "verbose": cli_args.verbose,
+        # the command line flag is negative, the provider option positive
+        "upload": not cli_args.no_upload,
+        "debug": cli_args.debug,
+        "center": cli_args.center,
+        "blank": cli_args.blank,
+        "remarkable_dir": cli_args.remarkable_dir,
+        "rmapi_path": cli_args.rmapi,
+        "pdfcrop_path": cli_args.pdfcrop,
+        "pdftk_path": cli_args.pdftk,
+        "gs_path": cli_args.gs,
+    }
+    handler = chosen(**settings)
+
+    handler.run(source, filename=cli_args.filename)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
new file mode 100644
index 0000000..af19d22
--- /dev/null
+++ b/paper2remarkable/utils.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+"""Utility functions for paper2remarkable
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+
+import sys
+
+# Issue tracker location shown to the user when an error is reported.  This
+# matches the project URL declared in setup.py.
+GITHUB_URL = "https://github.com/GjjvdBurg/paper2remarkable"
+
+
+def exception(msg):
+    """Report a fatal error on stderr and terminate the program.
+
+    Parameters
+    ----------
+    msg : str
+        Description of the error; printed with an ``ERROR: `` prefix.
+
+    Raises
+    ------
+    SystemExit
+        Always, with exit code 1.
+    """
+    print("ERROR: " + msg, file=sys.stderr)
+    print("Error occurred. Exiting.", file=sys.stderr)
+    print("", file=sys.stderr)
+    # The bug-report hint belongs to the error output as well, so it goes to
+    # stderr like the lines above instead of polluting stdout.
+    print(
+        "If you think this might be a bug, please raise an issue on GitHub: %s"
+        % GITHUB_URL,
+        file=sys.stderr,
+    )
+    raise SystemExit(1)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e5a697e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import io
+import os
+
+from setuptools import find_packages, setup
+
+# Package meta-data.
+AUTHOR = "Gertjan van den Burg"
+DESCRIPTION = "Easily download an academic paper and send it to the reMarkable"
+EMAIL = "gertjanvandenburg@gmail.com"
+LICENSE = "MIT"
+LICENSE_TROVE = "License :: OSI Approved :: MIT License"
+NAME = "paper2remarkable"
+REQUIRES_PYTHON = ">=3.5.0"
+URL = "https://github.com/GjjvdBurg/paper2remarkable"
+# Left as None so that the version is read from the package's
+# __version__.py further below.
+VERSION = None
+
+# What packages are required for this module to be executed?
+# Note: the distribution that provides the ``bs4`` module is called
+# ``beautifulsoup4``; the ``bs4`` project on PyPI is only a 0.0.x shim, so a
+# ``bs4>=4.8.0`` requirement can never be satisfied.
+REQUIRED = [
+    "beautifulsoup4>=4.8.0",
+    "requests>=2.21",
+    "pdfplumber>=0.5.12",
+    "unidecode>=1.1",
+]
+
+docs_require = []
+test_require = []
+dev_require = []
+
+# What packages are optional?
+EXTRAS = {
+    "docs": docs_require,
+    "tests": test_require,
+    "dev": docs_require + test_require + dev_require,
+}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# The README doubles as the long description; when it cannot be found the
+# short description is used instead.
+# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
+readme_path = os.path.join(here, "README.md")
+try:
+    with io.open(readme_path, encoding="utf-8") as readme_file:
+        readme_text = readme_file.read()
+except FileNotFoundError:
+    long_description = DESCRIPTION
+else:
+    long_description = "\n" + readme_text
+
+# Collect the version in a dictionary: either the explicit VERSION above or
+# whatever the package's __version__.py defines when executed.
+about = {}
+if VERSION:
+    about["__version__"] = VERSION
+else:
+    project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
+    version_path = os.path.join(here, project_slug, "__version__.py")
+    with open(version_path) as version_file:
+        exec(version_file.read(), about)
+
+# Where the magic happens:
+
+# Trove classifiers
+# Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+TROVE_CLASSIFIERS = [
+    LICENSE_TROVE,
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.6",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "Topic :: Education",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Utilities",
+]
+
+# Installs the ``p2r`` command, pointing at ``main`` in the package's
+# ``__main__`` module.
+CONSOLE_SCRIPTS = ["p2r = paper2remarkable.__main__:main"]
+
+setup(
+    name=NAME,
+    version=about["__version__"],
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author=AUTHOR,
+    author_email=EMAIL,
+    url=URL,
+    license=LICENSE,
+    python_requires=REQUIRES_PYTHON,
+    packages=find_packages(
+        exclude=["tests", "*.tests", "*.tests.*", "tests.*"]
+    ),
+    include_package_data=True,
+    install_requires=REQUIRED,
+    extras_require=EXTRAS,
+    ext_modules=[],
+    entry_points={"console_scripts": CONSOLE_SCRIPTS},
+    classifiers=TROVE_CLASSIFIERS,
+)
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-10-24 14:02:36 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-10-24 14:02:36 +0100
commit	83df50f47426cefb71c2f4fde161c8fad934dba3 (patch)
tree	904bbc056ae2f4ac8a11fee0c4255ed23aa7295c
parent	Move tests to separate directory (diff)
download	paper2remarkable-83df50f47426cefb71c2f4fde161c8fad934dba3.tar.gz paper2remarkable-83df50f47426cefb71c2f4fde161c8fad934dba3.zip