From 0105cd484921ca854a1489abcaa35d0167c85ceb Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:01:27 +0100
Subject: Move tests to separate directory

---
 test.py       | 106 ----------------------------------------------------------
 tests/test.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 106 deletions(-)
 delete mode 100644 test.py
 create mode 100644 tests/test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index 83c74af..0000000
--- a/test.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__author__ = "G.J.J. van den Burg"
-
-"""Tests"""
-
-import unittest
-import tempfile
-import hashlib
-import shutil
-import os
-
-from arxiv2remarkable import (
-    ACM,
-    Arxiv,
-    LocalFile,
-    OpenReview,
-    PdfUrl,
-    Pubmed,
-    Springer,
-)
-
-VERBOSE = False
-
-
-def md5sum(filename):
-    blocksize = 65536
-    hasher = hashlib.md5()
-    with open(filename, "rb") as fid:
-        buf = fid.read(blocksize)
-        while len(buf) > 0:
-            hasher.update(buf)
-            buf = fid.read(blocksize)
-    return hasher.hexdigest()
-
-
-class Tests(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.original_dir = os.getcwd()
-
-    def setUp(self):
-        self.test_dir = tempfile.mkdtemp()
-        os.chdir(self.test_dir)
-
-    def tearDown(self):
-        os.chdir(self.original_dir)
-        shutil.rmtree(self.test_dir)
-
-    def test_arxiv(self):
-        prov = Arxiv(upload=False, verbose=VERBOSE)
-        url = "https://arxiv.org/abs/1811.11242v1"
-        exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_pmc(self):
-        prov = Pubmed(upload=False, verbose=VERBOSE)
-        url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
-        exp_filename = (
-            "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf"
-        )
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_acm(self):
-        prov = ACM(upload=False, verbose=VERBOSE)
-        url = "https://dl.acm.org/citation.cfm?id=3025626"
-        exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_openreview(self):
-        prov = OpenReview(upload=False, verbose=VERBOSE)
-        url = "https://openreview.net/forum?id=S1x4ghC9tQ"
-        exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_springer(self):
-        prov = Springer(upload=False, verbose=VERBOSE)
-        url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
-        exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_local(self):
-        local_filename = "test.pdf"
-        with open(local_filename, "w") as fp:
-            fp.write(
-                "%PDF-1.1\n%¥±ë\n\n1 0 obj\n  << /Type /Catalog\n     /Pages 2 0 R\n  >>\nendobj\n\n2 0 obj\n  << /Type /Pages\n     /Kids [3 0 R]\n     /Count 1\n     /MediaBox [0 0 300 144]\n  >>\nendobj\n\n3 0 obj\n  <<  /Type /Page\n      /Parent 2 0 R\n      /Resources\n       << /Font\n           << /F1\n               << /Type /Font\n                  /Subtype /Type1\n                  /BaseFont /Times-Roman\n               >>\n           >>\n       >>\n      /Contents 4 0 R\n  >>\nendobj\n\n4 0 obj\n  << /Length 55 >>\nstream\n  BT\n    /F1 18 Tf\n    0 0 Td\n    (Hello World) Tj\n  ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n  <<  /Root 1 0 R\n      /Size 5\n  >>\nstartxref\n565\n%%EOF"
-            )
-        prov = LocalFile(upload=False, verbose=VERBOSE)
-        filename = prov.run(local_filename)
-        self.assertEqual("test_.pdf", os.path.basename(filename))
-
-    def test_pdfurl(self):
-        prov = PdfUrl(upload=False, verbose=VERBOSE)
-        url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
-        filename = prov.run(url, filename="test.pdf")
-        self.assertEqual("test.pdf", os.path.basename(filename))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test.py b/tests/test.py
new file mode 100644
index 0000000..83c74af
--- /dev/null
+++ b/tests/test.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__author__ = "G.J.J. van den Burg"
+
+"""Tests"""
+
+import unittest
+import tempfile
+import hashlib
+import shutil
+import os
+
+from arxiv2remarkable import (
+    ACM,
+    Arxiv,
+    LocalFile,
+    OpenReview,
+    PdfUrl,
+    Pubmed,
+    Springer,
+)
+
+VERBOSE = False
+
+
+def md5sum(filename):
+    """Return the hexadecimal MD5 digest of the file ``filename``."""
+    chunk_size = 65536
+    digest = hashlib.md5()
+    with open(filename, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read (end of file).
+        for chunk in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+class Tests(unittest.TestCase):
+    """Provider tests; every test runs in its own temporary directory."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls.original_dir = os.getcwd()
+
+    def setUp(self):
+        self.test_dir = tempfile.mkdtemp()
+        os.chdir(self.test_dir)
+
+    def tearDown(self):
+        os.chdir(self.original_dir)
+        shutil.rmtree(self.test_dir)
+
+    def test_arxiv(self):
+        prov = Arxiv(upload=False, verbose=VERBOSE)
+        url = "https://arxiv.org/abs/1811.11242v1"
+        exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_pmc(self):
+        prov = Pubmed(upload=False, verbose=VERBOSE)
+        url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
+        exp_filename = "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_acm(self):
+        prov = ACM(upload=False, verbose=VERBOSE)
+        url = "https://dl.acm.org/citation.cfm?id=3025626"
+        exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_openreview(self):
+        prov = OpenReview(upload=False, verbose=VERBOSE)
+        url = "https://openreview.net/forum?id=S1x4ghC9tQ"
+        exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_springer(self):
+        prov = Springer(upload=False, verbose=VERBOSE)
+        url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
+        exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_local(self):
+        local_filename = "test.pdf"
+        # The xref table puts object 1 at byte 18, which only holds when the
+        # three non-ASCII header characters take two bytes each (UTF-8).
+        with open(local_filename, "w", encoding="utf-8") as fp:
+            fp.write("%PDF-1.1\n%¥±ë\n\n1 0 obj\n  << /Type /Catalog\n     /Pages 2 0 R\n  >>\nendobj\n\n2 0 obj\n  << /Type /Pages\n     /Kids [3 0 R]\n     /Count 1\n     /MediaBox [0 0 300 144]\n  >>\nendobj\n\n3 0 obj\n  <<  /Type /Page\n      /Parent 2 0 R\n      /Resources\n       << /Font\n           << /F1\n               << /Type /Font\n                  /Subtype /Type1\n                  /BaseFont /Times-Roman\n               >>\n           >>\n       >>\n      /Contents 4 0 R\n  >>\nendobj\n\n4 0 obj\n  << /Length 55 >>\nstream\n  BT\n    /F1 18 Tf\n    0 0 Td\n    (Hello World) Tj\n  ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n  <<  /Root 1 0 R\n      /Size 5\n  >>\nstartxref\n565\n%%EOF")
+        prov = LocalFile(upload=False, verbose=VERBOSE)
+        filename = prov.run(local_filename)
+        self.assertEqual("test_.pdf", os.path.basename(filename))
+
+    def test_pdfurl(self):
+        prov = PdfUrl(upload=False, verbose=VERBOSE)
+        url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
+        filename = prov.run(url, filename="test.pdf")
+        self.assertEqual("test.pdf", os.path.basename(filename))
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
cgit v1.2.3


From 83df50f47426cefb71c2f4fde161c8fad934dba3 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:02:36 +0100
Subject: [wip] First commit of new code structure

Not fully functional yet probably
---
 Makefile                                 |  62 +++++
 paper2remarkable/__init__.py             |   0
 paper2remarkable/__main__.py             |  15 ++
 paper2remarkable/__version__.py          |   5 +
 paper2remarkable/crop.py                 | 160 +++++++++++++
 paper2remarkable/providers/__init__.py   |  11 +
 paper2remarkable/providers/_base.py      | 380 +++++++++++++++++++++++++++++++
 paper2remarkable/providers/acm.py        |  74 ++++++
 paper2remarkable/providers/arxiv.py      |  39 ++++
 paper2remarkable/providers/local.py      |  34 +++
 paper2remarkable/providers/openreview.py |  46 ++++
 paper2remarkable/providers/pdf_url.py    |  39 ++++
 paper2remarkable/providers/pubmed.py     |  51 +++++
 paper2remarkable/providers/springer.py   |  44 ++++
 paper2remarkable/ui.py                   |  96 ++++++++
 paper2remarkable/utils.py                |  25 ++
 setup.py                                 |  98 ++++++++
 17 files changed, 1179 insertions(+)
 create mode 100644 Makefile
 create mode 100644 paper2remarkable/__init__.py
 create mode 100644 paper2remarkable/__main__.py
 create mode 100644 paper2remarkable/__version__.py
 create mode 100644 paper2remarkable/crop.py
 create mode 100644 paper2remarkable/providers/__init__.py
 create mode 100644 paper2remarkable/providers/_base.py
 create mode 100644 paper2remarkable/providers/acm.py
 create mode 100644 paper2remarkable/providers/arxiv.py
 create mode 100644 paper2remarkable/providers/local.py
 create mode 100644 paper2remarkable/providers/openreview.py
 create mode 100644 paper2remarkable/providers/pdf_url.py
 create mode 100644 paper2remarkable/providers/pubmed.py
 create mode 100644 paper2remarkable/providers/springer.py
 create mode 100644 paper2remarkable/ui.py
 create mode 100644 paper2remarkable/utils.py
 create mode 100644 setup.py

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ed2d040
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,62 @@
+# Makefile for easier installation and cleanup.
+#
+# Uses self-documenting macros from here:
+# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+
+PACKAGE=paper2remarkable
+DOC_DIR='./docs/'
+# Unquoted: VENV_DIR is part of a target name and make keeps quotes literally.
+VENV_DIR=/tmp/p2r_venv
+
+.PHONY: help release install test clean dist docs doc venv cover
+
+.DEFAULT_GOAL := help
+
+help:
+	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\
+		 awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\
+		 %s\n", $$1, $$2}'
+
+release: ## Make a release
+	python make_release.py
+
+install: ## Install for the current user using the default python command
+	python setup.py build_ext --inplace
+	python setup.py install --user
+
+# Recipes run under /bin/sh, so the venv is activated with "." (not "source").
+test: venv ## Run unit tests
+	. $(VENV_DIR)/bin/activate && green -v ./tests/test_unit
+
+
+clean: ## Clean build dist and egg directories left after install
+	rm -rf ./dist
+	rm -rf ./build
+	rm -rf ./$(PACKAGE).egg-info
+	rm -rf $(VENV_DIR)
+	rm -f MANIFEST
+	find . -type f -iname '*.pyc' -delete
+	find . -type d -name '__pycache__' -empty -delete
+
+dist: ## Make Python source distribution
+	python setup.py sdist
+	python setup.py bdist_wheel --universal
+
+docs: doc
+doc: install ## Build documentation with Sphinx
+	m2r README.md && mv README.rst $(DOC_DIR)
+	m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR)
+	cd $(DOC_DIR) && \
+		rm source/* && \
+		sphinx-apidoc -H '$(PACKAGE) API Documentation' -o source ../$(PACKAGE) && \
+		touch source/AUTOGENERATED
+	$(MAKE) -C $(DOC_DIR) html
+
+
+
+venv: $(VENV_DIR)/bin/activate
+
+$(VENV_DIR)/bin/activate:
+	test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
+	. $(VENV_DIR)/bin/activate && pip install -q -e .[dev]
+	touch $(VENV_DIR)/bin/activate
diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/paper2remarkable/__main__.py b/paper2remarkable/__main__.py
new file mode 100644
index 0000000..b97d538
--- /dev/null
+++ b/paper2remarkable/__main__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+"""
+Caller for the command line application
+"""
+
+import sys
+
+def main():
+    """Run the command line interface and exit with its return value."""
+    from . import ui  # imported on call, not when this module is loaded
+    sys.exit(ui.main())
+
+if __name__ == '__main__':
+    main()
diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py
new file mode 100644
index 0000000..5bee2af
--- /dev/null
+++ b/paper2remarkable/__version__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+VERSION = (0, 4, 0)
+
+__version__ = '.'.join(map(str, VERSION))
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
new file mode 100644
index 0000000..b25b178
--- /dev/null
+++ b/paper2remarkable/crop.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+
+"""Code for cropping a PDF file
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import PyPDF2
+import os
+import subprocess
+import pdfplumber
+
+RM_WIDTH = 1404
+RM_HEIGHT = 1872
+
+
+class Cropper(object):
+    def __init__(
+        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+    ):
+        """Create the writer; open input_file / record output_file if given."""
+        self.pdfcrop_path = pdfcrop_path
+        self.writer = PyPDF2.PdfFileWriter()
+        if input_file is not None:
+            self.input_file = os.path.abspath(input_file)
+            self.reader = PyPDF2.PdfFileReader(self.input_file)
+        if output_file is not None:
+            self.output_file = os.path.abspath(output_file)
+
+    def crop(self, margins=1):  # crop every page; 0 means success
+        return self.process_file(self.crop_page, margins=margins)
+
+    def center(self, padding=15):  # center every page; 0 means success
+        return self.process_file(self.center_page, padding=padding)
+
+    def process_file(self, page_func, *args, **kwargs):
+        for page_idx in range(self.reader.getNumPages()):
+            status = page_func(page_idx, *args, **kwargs)
+            if not status == 0:  # stop at the first failure; nothing written
+                return status
+        with open(self.output_file, "wb") as fp:  # all pages succeeded
+            self.writer.write(fp)
+        return 0
+
+    def center_page(self, page_idx, padding):
+        """Re-box page ``page_idx`` using ``get_center_bbox``."""
+        bbox_func = self.get_center_bbox
+        return self.process_page(page_idx, bbox_func, padding=padding)
+
+    def crop_page(self, page_idx, margins):  # re-box one page via get_bbox
+        return self.process_page(page_idx, self.get_bbox, margins=margins)
+
+    def export_page(self, page_idx):
+        """Write page ``page_idx`` to "./page.pdf" and return that path."""
+        single_page_path = "./page.pdf"
+        single = PyPDF2.PdfFileWriter()
+        single.addPage(self.reader.getPage(page_idx))
+        with open(single_page_path, "wb") as handle:
+            single.write(handle)
+        # The path is relative, so the file lands in the working directory.
+        return single_page_path
+
+    def process_page(self, page_idx, bbox_func, *args, **kwargs):
+        """Crop a single page with pdfcrop and add it to the writer.
+
+        Returns 0 on success, otherwise the non-zero status of pdfcrop. The
+        temporary page files are removed on every exit path.
+        """
+        tmpfname = self.export_page(page_idx)
+        tmpfout = "./output.pdf"
+        try:
+            bbox = " ".join(map(str, bbox_func(tmpfname, *args, **kwargs)))
+            status = subprocess.call(
+                [self.pdfcrop_path, "--bbox", bbox, tmpfname, tmpfout],
+                stdout=subprocess.DEVNULL,
+            )
+            if status == 0:
+                # pdfcrop succeeded: pick up its single-page output.
+                cropped = PyPDF2.PdfFileReader(tmpfout).getPage(0)
+                self.writer.addPage(cropped)
+            return status
+        finally:
+            for path in (tmpfname, tmpfout):
+                if os.path.exists(path):
+                    os.unlink(path)
+
+    def get_bbox(self, filename, margins=1, resolution=72):
+        """Return the content bounding box of the first page of ``filename``.
+
+        The page is rendered at ``resolution`` and the rows and columns at
+        each edge that consist of white pixels only are stripped.
+        ``margins`` is a single number used for all sides, or a sequence
+        ``[left, top, right, bottom]``; every side is moved outwards by its
+        margin. The result is ``[x0, y0, x1, y1]`` with (0, 0) at the bottom
+        left. A page without any non-white pixel yields the whole page.
+        """
+        if isinstance(margins, (int, float)):
+            margins = [margins] * 4
+        pdf = pdfplumber.open(filename)
+        im = pdf.pages[0].to_image(resolution=resolution)
+        pdf.close()
+
+        pixels = list(im.original.getdata())
+        W, H = im.original.size
+        # Channel sum of one white pixel; assumes 8-bit RGB pixel tuples.
+        white = 255 * 3
+        # M holds H rows of W channel sums.
+        M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
+
+        def blank_run(lines, full):
+            # Number of leading entries of ``lines`` that sum to ``full``.
+            count = 0
+            while count < len(lines) and sum(lines[count]) == full:
+                count += 1
+            return count
+
+        top = blank_run(M, W * white)
+        if top == H:
+            # Entirely white page: the stripped sides would overlap and give
+            # an inverted box, so keep the full page instead.
+            return [0, 0, W, H]
+        bottom = blank_run(M[::-1], W * white)
+        columns = list(zip(*M))
+        left = blank_run(columns, H * white)
+        right = blank_run(columns[::-1], H * white)
+
+        # Bounding box in PIL coordinates: (0, 0) is the top left corner.
+        x0, y0 = left - margins[0], top - margins[1]
+        x1, y1 = W - (right - margins[2]), H - (bottom - margins[3])
+
+        # Ghostscript format: (0, 0) is the bottom left corner.
+        return [x0, H - y1, x1, H - y0]
+
+    def get_center_bbox(self, filename, padding=15):
+        """Return a bounding box that centers the page on the reMarkable.
+
+        The tight content box is padded by ``padding`` on all sides; extra
+        space is then added at the top (page relatively wider than the
+        device) or at the left (otherwise), using the RM_WIDTH x RM_HEIGHT
+        aspect ratio.
+        """
+        a0, b0, a1, b1 = self.get_bbox(filename, margins=0)
+
+        # Padded content size; a minimal border is visually more pleasing.
+        padded_h = (b1 - b0) + 2 * padding
+        padded_w = (a1 - a0) + 2 * padding
+
+        dx, dy = 0, 0
+        if padded_h / padded_w < RM_HEIGHT / RM_WIDTH:
+            dy = ((RM_HEIGHT / RM_WIDTH) * padded_w - padded_h) / 2
+        else:
+            dx = ((RM_WIDTH / RM_HEIGHT) * padded_h - padded_w) / 2
+
+        return self.get_bbox(
+            filename,
+            margins=[padding + dx, padding + dy, padding, padding],
+        )
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
new file mode 100644
index 0000000..361c11e
--- /dev/null
+++ b/paper2remarkable/providers/__init__.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Import the base class first: provider modules such as .acm load it with
+# "from . import Provider", which needs the name to exist on this package.
+from ._base import Provider
+from .arxiv import Arxiv
+from .pubmed import Pubmed
+from .acm import ACM
+from .openreview import OpenReview
+from .springer import Springer
+from .local import LocalFile
+from .pdf_url import PdfUrl
+
+providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
new file mode 100644
index 0000000..05fc0b7
--- /dev/null
+++ b/paper2remarkable/providers/_base.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+
+"""Base for the Provider class
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import PyPDF2
+import abc
+import bs4
+import datetime
+import os
+import re
+import requests
+import shutil
+import string
+import subprocess
+import tempfile
+import time
+import titlecase
+import unidecode
+
+from ..crop import Cropper
+from ..utils import exception
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+    "Safari/537.36"
+}
+
+
+class Provider(metaclass=abc.ABCMeta):
+    """ ABC for providers of pdf sources """
+
+    meta_author_key = "citation_author"
+    meta_title_key = "citation_title"
+    meta_date_key = "citation_date"
+
+    def __init__(
+        self,
+        verbose=False,
+        upload=True,
+        debug=False,
+        center=False,
+        blank=False,
+        remarkable_dir="/",
+        rmapi_path="rmapi",
+        pdfcrop_path="pdfcrop",
+        pdftk_path="pdftk",
+        gs_path="gs",
+    ):
+        """Store the options and external tool paths on the instance."""
+        # behaviour switches
+        self.verbose, self.upload, self.debug = verbose, upload, debug
+        self.center, self.blank = center, blank
+        self.remarkable_dir = remarkable_dir
+        # external executables
+        self.rmapi_path = rmapi_path
+        self.pdfcrop_path = pdfcrop_path
+        self.pdftk_path = pdftk_path
+        self.gs_path = gs_path
+
+        self.log("Starting %s" % type(self).__name__)
+
+    def log(self, msg, mode="info"):
+        """Print a timestamped message to stdout when ``self.verbose`` is set.
+
+        ``mode`` must be "info" or "warning", otherwise ValueError is raised;
+        the check is only reached in verbose mode.
+        """
+        if not self.verbose:
+            return
+        if mode not in ("info", "warning"):
+            raise ValueError("unknown logging mode.")
+        # Line layout: "<YYYY-mm-dd HH:MM:SS> - <MODE> - <msg>"
+        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        print(" - ".join([stamp, mode.upper(), msg]))
+
+    def warn(self, msg):  # shorthand for log() in "warning" mode
+        self.log(msg, mode="warning")
+
+    @staticmethod
+    @abc.abstractmethod
+    def validate(src):
+        """Validate whether ``src`` is appropriate for this provider."""
+
+    def retrieve_pdf(self, src, filename):
+        """Download the pdf for ``src`` and save it to ``filename``."""
+        _, pdf_url = self.get_abs_pdf_urls(src)  # second item is the pdf url
+        self.download_url(pdf_url, filename)
+
+    def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
+        """Return field ``idx`` of each author string split on ``sep``."""
+        entries = soup_authors if op is None else op(soup_authors)
+        return [e.strip().split(sep)[idx].strip() for e in entries]
+
+    def get_authors(self, soup):
+        """Return the formatted authors found in the meta tags of ``soup``."""
+        tags = soup.find_all("meta", {"name": self.meta_author_key})
+        # Subclasses may override _format_authors to parse other layouts.
+        raw_authors = [tag["content"] for tag in tags]
+        return self._format_authors(raw_authors)
+
+    def get_title(self, soup):  # content of the first title <meta> tag
+        target = soup.find_all("meta", {"name": self.meta_title_key})
+        return target[0]["content"]
+
+    def _format_date(self, soup_date):  # no-op hook for provider-specific dates
+        return soup_date
+
+    def get_date(self, soup):
+        """Return the formatted content of the first date <meta> tag."""
+        date_tags = soup.find_all("meta", {"name": self.meta_date_key})
+        raw_date = date_tags[0]["content"]
+        return self._format_date(raw_date)
+
+    def get_paper_info(
+        self,
+        src,
+        author_key="citation_author",
+        title_key="citation_title",
+        date_key="citation_date",
+    ):
+        """Return title/date/authors of ``src``; ``*_key`` args are unused."""
+        abs_url, _pdf_url = self.get_abs_pdf_urls(src)
+        self.log("Getting paper info")
+        html = self.get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(html, "html.parser")
+        authors = self.get_authors(soup)
+        title = self.get_title(soup)
+        date = self.get_date(soup)
+        return {"title": title, "date": date, "authors": authors}
+
+    def string_clean(self, s):
+        """Transliterate ``s`` to ASCII and mask disallowed characters.
+
+        Characters other than ASCII letters, digits and "_ ." become "_".
+        """
+        keep = set(string.ascii_letters + string.digits + "_ .")
+        return "".join(c if c in keep else "_" for c in unidecode.unidecode(s))
+
+    def create_filename(self, info, filename=None):
+        """Return ``filename`` if given, else build one from ``info``.
+
+        The generated name is "<authors>_-_<Title>_<year>.pdf"; with more
+        than three authors only the first is used, followed by "_et_al".
+        ``info["authors"]`` is assumed to hold surnames only.
+        """
+        if filename is not None:
+            return filename
+        self.log("Generating output filename")
+
+        names = info["authors"]
+        raw = names[0] + "_et_al" if len(names) > 3 else "_".join(names)
+        author_part = self.string_clean(raw)
+        title_part = titlecase.titlecase(self.string_clean(info["title"]))
+        title_part = title_part.replace(" ", "_")
+        # The first "/"-separated field of the date is used as the year.
+        year_part = info["date"].split("/")[0]
+        stem = author_part + "_-_" + title_part + "_" + year_part
+        name = unidecode.unidecode(stem + ".pdf")
+        self.log("Created filename: %s" % name)
+        return name
+
+    def blank_pdf(self, filepath):
+        """Insert a blank page after every page when ``self.blank`` is set."""
+        if not self.blank:
+            return filepath
+
+        self.log("Adding blank pages")
+        source = PyPDF2.PdfFileReader(filepath)
+        interleaved = PyPDF2.PdfFileWriter()
+        for page in source.pages:
+            interleaved.addPage(page)
+            interleaved.addBlankPage()
+        output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
+        with open(output_file, "wb") as handle:
+            interleaved.write(handle)
+        return output_file
+
+    def crop_pdf(self, filepath):
+        """Crop the pdf; return the cropped file, or ``filepath`` on failure."""
+        self.log("Cropping pdf file")
+        cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+        cropper = Cropper(
+            filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
+        )
+
+        if cropper.crop(margins=15) != 0:
+            self.warn("Failed to crop the pdf file at: %s" % filepath)
+            return filepath
+        if os.path.exists(cropped_file):
+            return cropped_file
+        self.warn(
+            "Can't find cropped file '%s' where expected." % cropped_file
+        )
+        return filepath
+
+    def center_pdf(self, filepath):
+        """Center the pdf when ``self.center`` is set (input on failure)."""
+        if not self.center:
+            return filepath
+
+        self.log("Centering pdf file")
+        centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
+        cropper = Cropper(
+            filepath, centered_file, pdfcrop_path=self.pdfcrop_path
+        )
+        if cropper.center() != 0:
+            self.warn("Failed to center the pdf file at: %s" % filepath)
+            return filepath
+        if os.path.exists(centered_file):
+            return centered_file
+        self.warn(
+            "Can't find centered file '%s' where expected." % centered_file
+        )
+        return filepath
+
+    def shrink_pdf(self, filepath):
+        """Rewrite the pdf with Ghostscript; return the input on failure."""
+        self.log("Shrinking pdf file")
+        output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+        command = [
+            self.gs_path,
+            "-sDEVICE=pdfwrite",
+            "-dCompatibilityLevel=1.4",
+            "-dPDFSETTINGS=/printer",
+            "-dNOPAUSE",
+            "-dBATCH",
+            "-dQUIET",
+            "-sOutputFile=%s" % output_file,
+            filepath,
+        ]
+
+        if subprocess.call(command) != 0:
+            self.warn("Failed to shrink the pdf file")
+            return filepath
+        return output_file
+
+    def check_file_is_pdf(self, filename):
+        """Return True when PyPDF2 can parse ``filename``."""
+        try:
+            # The with-block closes the file even when parsing fails.
+            with open(filename, "rb") as fp:
+                PyPDF2.PdfFileReader(fp, strict=False)
+            return True
+        except PyPDF2.utils.PdfReadError:
+            exception("Downloaded file isn't a valid pdf file.")
+
+    def download_url(self, url, filename):
+        """Fetch ``url`` (with retries) and write the bytes to ``filename``."""
+        self.log("Downloading file at url: %s" % url)
+        payload = self.get_page_with_retry(url)
+        with open(filename, "wb") as target:
+            target.write(payload)
+
+    def get_page_with_retry(self, url, tries=5):
+        """Return the content of ``url``, making at most ``tries`` attempts."""
+        for _ in range(tries):
+            # ConnectionError and non-ok responses are both retried.
+            try:
+                res = requests.get(url, headers=HEADERS)
+            except requests.exceptions.ConnectionError:
+                res = None
+            if res is not None and res.ok:
+                self.log("Downloading url: %s" % url)
+                return res.content
+            self.warn("Error getting url %s. Retrying in 5 seconds" % url)
+            time.sleep(5)
+        # Every attempt failed: report it rather than silently returning None.
+        exception("Failed to retrieve url %s after %i tries" % (url, tries))
+
+    def upload_to_rm(self, filepath):
+        """Upload ``filepath`` into ``self.remarkable_dir`` using rmapi.
+
+        Unless the directory is empty after stripping trailing slashes, it is
+        created first with rmapi's ``mkdir`` command.
+        """
+        remarkable_dir = self.remarkable_dir.rstrip("/")
+        self.log("Starting upload to reMarkable")
+
+        if remarkable_dir:
+            mkdir_cmd = [self.rmapi_path, "mkdir", remarkable_dir + "/"]
+            if subprocess.call(mkdir_cmd, stdout=subprocess.DEVNULL) != 0:
+                exception(
+                    "Creating directory %s on reMarkable failed"
+                    % remarkable_dir
+                )
+        put_cmd = [self.rmapi_path, "put", filepath, remarkable_dir + "/"]
+        if subprocess.call(put_cmd, stdout=subprocess.DEVNULL) != 0:
+            exception("Uploading file %s to reMarkable failed" % filepath)
+        self.log("Upload successful.")
+
+    def dearxiv(self, input_file):
+        """Remove the arXiv timestamp from a pdf.
+
+        The pdf is uncompressed with pdftk, the stamp text and its link are
+        blanked with regular expressions, and the result is compressed again.
+        Returns the path of the new "<name>_dearxiv.pdf" file.
+        """
+        self.log("Removing arXiv timestamp")
+        basename = os.path.splitext(input_file)[0]
+        uncompress_file = basename + "_uncompress.pdf"
+
+        status = subprocess.call(
+            [
+                self.pdftk_path,
+                input_file,
+                "output",
+                uncompress_file,
+                "uncompress",
+            ]
+        )
+        if not status == 0:
+            exception("pdftk failed to uncompress the pdf.")
+
+        # The patterns are raw bytes literals: the backslashes are regex
+        # syntax, and sequences such as "\d" or "\(" are invalid escapes in a
+        # normal literal (deprecated since Python 3.6).
+        stamp_text = rb"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj"
+        stamp_link = rb"<<\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\n\/S /URI\n>>\n"
+        with open(uncompress_file, "rb") as fid:
+            data = fid.read()
+        data = re.sub(stamp_text, b"()Tj", data)  # empty the text element
+        data = re.sub(stamp_link, b"", data)  # drop the URL element
+
+        removed_file = basename + "_removed.pdf"
+        with open(removed_file, "wb") as oid:
+            oid.write(data)
+
+        output_file = basename + "_dearxiv.pdf"
+        status = subprocess.call(
+            [self.pdftk_path, removed_file, "output", output_file, "compress"]
+        )
+        if not status == 0:
+            exception("pdftk failed to compress the pdf.")
+
+        return output_file
+
+    def run(self, src, filename=None):
+        """Download, process and upload or store the pdf for ``src``.
+
+        The work is done inside a temporary directory; the previous working
+        directory is restored afterwards, also when an error occurs.
+        """
+        info = self.get_paper_info(src)
+        clean_filename = self.create_filename(info, filename)
+        tmp_filename = "paper.pdf"
+        self.initial_dir = os.getcwd()
+        with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
+            os.chdir(working_dir)
+            try:
+                self.retrieve_pdf(src, tmp_filename)
+                self.check_file_is_pdf(tmp_filename)
+                intermediate_fname = tmp_filename
+                ops = [self.dearxiv, self.crop_pdf, self.center_pdf]
+                ops += [self.blank_pdf, self.shrink_pdf]
+                for op in ops:
+                    intermediate_fname = op(intermediate_fname)
+                shutil.move(intermediate_fname, clean_filename)
+
+                if self.debug:
+                    print("Paused in debug mode in dir: %s" % working_dir)
+                    print("Press enter to exit.")
+                    return input()
+                if self.upload:
+                    return self.upload_to_rm(clean_filename)
+                target_path = os.path.join(self.initial_dir, clean_filename)
+                while os.path.exists(target_path):
+                    base = os.path.splitext(target_path)[0]
+                    target_path = base + "_.pdf"
+                shutil.move(clean_filename, target_path)
+                return target_path
+            finally:
+                # Never stay inside the directory that is about to be deleted.
+                os.chdir(self.initial_dir)
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
new file mode 100644
index 0000000..be98e16
--- /dev/null
+++ b/paper2remarkable/providers/acm.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for ACM
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import bs4
+import re
+
+from . import Provider
+from ..utils import exception
+
+# TODO: put this somewhere central, now multiply defined
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+
+class ACM(Provider):
+    """Provider for citation pages on the ACM Digital Library (dl.acm.org)."""
+    meta_author_key = "citation_authors"
+
+    # Raw string with escaped dots: "." must only match a literal dot.
+    re_abs = r"https?://dl\.acm\.org/citation\.cfm\?id=\d+"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_acm_pdf_url(self, url):
+        """Return the full-text PDF url found on the page at ``url``, or None."""
+        page = self.get_page_with_retry(url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        thea = soup.find("a", attrs={"name": "FullTextPDF"})
+        href = None if thea is None else thea.get("href")
+        if not href:  # no such anchor, or an anchor without a usable href
+            return None
+        if href.startswith("http"):
+            return href
+        return "https://dl.acm.org/" + href
+
+    def get_abs_pdf_urls(self, url):
+        """Return ``(abs_url, pdf_url)`` for an ACM citation url."""
+        if not re.match(self.re_abs, url):
+            exception(
+                "Couldn't figure out ACM urls, please provide a URL of the "
+                "format: http(s)://dl.acm.org/citation.cfm?id=..."
+            )
+        pdf_url = self.get_acm_pdf_url(url)
+        if pdf_url is None:
+            exception(
+                "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
+            )
+        return url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """Check whether ``src`` is exactly an ACM citation url."""
+        return re.fullmatch(ACM.re_abs, src) is not None
+
+    def _format_authors(self, soup_authors):
+        """Format authors with an op that splits the first entry on ";"."""
+        op = lambda x: x[0].split(";")
+        return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+
+    def _format_date(self, soup_date):
+        """Return the part after the last "/" (the year of NN/NN/YYYY)."""
+        date = soup_date.strip()
+        if not re.match(r"\d{2}/\d{2}/\d{4}", date):
+            self.warn(
+                "Couldn't extract year from ACM page, please raise an "
+                "issue on GitHub so it can be fixed: %s" % GITHUB_URL
+            )
+        return date.split("/")[-1]
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
new file mode 100644
index 0000000..fc5c004
--- /dev/null
+++ b/paper2remarkable/providers/arxiv.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for arxiv.org
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ..utils import exception
+
+
+class Arxiv(Provider):
+    """Provider for arxiv.org abstract and pdf urls."""
+    re_abs = r"https?://arxiv\.org/abs/\d{4}\.\d{4,5}(v\d+)?"
+    re_pdf = r"https?://arxiv\.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and abs url from any given arXiv url """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.replace("abs", "pdf") + ".pdf"
+        elif re.match(self.re_pdf, url):
+            abs_url = url[:-4].replace("pdf", "abs")
+            pdf_url = url
+        else:
+            exception("Couldn't figure out arXiv urls.")
+        return abs_url, pdf_url
+
+    def validate(src):
+        """Check if the url is to an arXiv page. """
+        return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src)
diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
new file mode 100644
index 0000000..68ce030
--- /dev/null
+++ b/paper2remarkable/providers/local.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for local files
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import os
+import shutil
+
+from . import Provider
+
+
+class LocalFile(Provider):
+    """Provider for pdf files that already exist on the local disk."""
+
+    @staticmethod
+    def validate(src):
+        return os.path.isfile(src)  # a directory is not a usable source
+
+    def retrieve_pdf(self, src, filename):
+        """Copy ``src``, resolved against ``self.initial_dir``, to ``filename``."""
+        shutil.copy(os.path.join(self.initial_dir, src), filename)
+
+    def get_paper_info(self, src):
+        return {"filename": src}
+
+    def create_filename(self, info, filename=None):
+        if filename is not None:
+            return filename
+        return os.path.basename(info["filename"])
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
new file mode 100644
index 0000000..b7e1d77
--- /dev/null
+++ b/paper2remarkable/providers/openreview.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for OpenReview
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from . import Provider
+from ..utils import exception
+
+
+class OpenReview(Provider):
+    """Provider for forum and pdf urls on openreview.net."""
+
+    meta_date_key = "citation_publication_date"
+
+    # Raw strings, with the dot of the host name escaped to match literally.
+    re_abs = r"https?://openreview\.net/forum\?id=[A-Za-z0-9]+"
+    re_pdf = r"https?://openreview\.net/pdf\?id=[A-Za-z0-9]+"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract url from a OpenReview url """
+        if re.match(self.re_abs, url):
+            return url, url.replace("forum", "pdf")
+        if re.match(self.re_pdf, url):
+            return url.replace("pdf", "forum"), url
+        exception("Couldn't figure out OpenReview urls.")
+
+    @staticmethod
+    def validate(src):
+        """ Check if the url is a valid OpenReview url. """
+        return re.match(OpenReview.re_abs, src) or re.match(
+            OpenReview.re_pdf, src
+        )
+
+    def _format_authors(self, soup_authors):
+        """Format the authors, passing sep=" " and idx=-1 to the base class."""
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
new file mode 100644
index 0000000..56427d3
--- /dev/null
+++ b/paper2remarkable/providers/pdf_url.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for generic PDF url
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import urllib
+
+from . import Provider
+from ..utils import exception
+
+
+class PdfUrl(Provider):
+    """Provider for a direct url to a PDF file; needs an explicit filename."""
+    @staticmethod
+    def validate(src):
+        """Return True if ``src`` parses to a url with scheme, host and path."""
+        try:
+            result = urllib.parse.urlparse(src)
+        except ValueError:  # urlparse rejects some malformed urls
+            return False
+        return all([result.scheme, result.netloc, result.path])
+
+    def retrieve_pdf(self, url, filename):
+        self.download_url(url, filename)
+
+    def get_paper_info(self, src):
+        return None  # nothing is looked up for a bare PDF url
+
+    def create_filename(self, info, filename=None):
+        if filename is None:
+            exception(
+                "Filename must be provided with PDFUrlProvider (use --filename)"
+            )
+        return filename
diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py
new file mode 100644
index 0000000..29bdb31
--- /dev/null
+++ b/paper2remarkable/providers/pubmed.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for PubMed
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from . import Provider
+from ..utils import exception
+
+class Pubmed(Provider):
+    """Provider for PubMed Central (PMC) article and pdf urls."""
+
+    meta_author_key = "citation_authors"
+
+    # Raw strings, with the dots of the host name escaped to match literally.
+    re_abs = r"https?://www\.ncbi\.nlm\.nih\.gov/pmc/articles/PMC\d+/?"
+    re_pdf = (
+        r"https?://www\.ncbi\.nlm\.nih\.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
+    )
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and html url from a given PMC url """
+        if re.match(self.re_pdf, url):
+            # The html url is everything in front of the "/pdf" component.
+            idx = url.index("pdf")
+            return url[: idx - 1], url
+        if re.match(self.re_abs, url):
+            return url, url.rstrip("/") + "/pdf"  # it redirects, usually
+        exception("Couldn't figure out PMC urls.")
+
+    @staticmethod
+    def validate(src):
+        """Check if ``src`` starts like a PMC article or pdf url."""
+        return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src)
+
+    def _format_authors(self, soup_authors):
+        """Format authors with an op that splits the first entry on ","."""
+        op = lambda x: x[0].split(",")
+        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+    def _format_date(self, soup_date):
+        """Return the last token of a "<word> <4 digits>" date, else use "_"."""
+        if re.match(r"\w+ \d{4}", soup_date):
+            return soup_date.split(" ")[-1]
+        return soup_date.replace(" ", "_")
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
new file mode 100644
index 0000000..ce16007
--- /dev/null
+++ b/paper2remarkable/providers/springer.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Springer
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+import urllib
+
+from . import Provider
+from ..utils import exception
+
+
+class Springer(Provider):
+    """Provider for link.springer.com article and pdf urls."""
+
+    meta_date_key = "citation_online_date"
+
+    # Raw strings, with the dots of the host name escaped to match literally.
+    re_abs = r"https?://link\.springer\.com/article/10\.\d{4}/[a-z0-9\-]+"
+    re_pdf = r"https?://link\.springer\.com/content/pdf/10\.\d{4}(%2F|/)[a-z0-9\-]+\.pdf"
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract urls from a Springer url """
+        if re.match(self.re_abs, url):
+            return url, url.replace("article", "content/pdf")
+        if re.match(self.re_pdf, url):
+            # Decode "%2F" first so both returned urls use a plain "/".
+            pdf_url = urllib.parse.unquote(url)
+            abs_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")]
+            return abs_url, pdf_url
+        exception("Couldn't figure out Springer urls.")
+
+    @staticmethod
+    def validate(src):
+        """Check if ``src`` starts like a Springer article or pdf url."""
+        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
new file mode 100644
index 0000000..71fc655
--- /dev/null
+++ b/paper2remarkable/ui.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+"""Command line interface
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import argparse
+
+from .providers import providers
+from .utils import exception
+
+
+def parse_args():
+    """Build the command line parser and return the parsed arguments."""
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    add = parser.add_argument  # short alias for the declarations below
+
+    # Switches
+    add(
+        "-b",
+        "--blank",
+        help="Add a blank page after every page of the PDF",
+        action="store_true",
+    )
+    add("-v", "--verbose", help="be verbose", action="store_true")
+    add(
+        "-n",
+        "--no-upload",
+        help="don't upload to the reMarkable, save the output in current working dir",
+        action="store_true",
+    )
+    add(
+        "-d",
+        "--debug",
+        help="debug mode, doesn't upload to reMarkable",
+        action="store_true",
+    )
+    add(
+        "-c",
+        "--center",
+        help="Center the PDF on the page, instead of left align",
+        action="store_true",
+    )
+
+    # Name and location of the file on the reMarkable
+    add(
+        "--filename",
+        help="Filename to use for the file on reMarkable",
+        default=None,
+    )
+    add(
+        "-p",
+        "--remarkable-path",
+        help="directory on reMarkable to put the file (created if missing)",
+        dest="remarkable_dir",
+        default="/",
+    )
+
+    # Paths of the external executables
+    add("--rmapi", help="path to rmapi executable", default="rmapi")
+    add("--pdfcrop", help="path to pdfcrop executable", default="pdfcrop")
+    add("--pdftk", help="path to pdftk executable", default="pdftk")
+    add("--gs", help="path to gs executable", default="gs")
+
+    # The paper to process
+    add("input", help="URL to a paper or the path of a local PDF file")
+    return parser.parse_args()
+
+
+def main():
+    """Run the first provider whose ``validate`` accepts the input."""
+    args = parse_args()
+    for provider in providers:
+        if provider.validate(args.input):
+            break
+    else:
+        exception("Input not valid, no provider can handle this source.")
+    options = dict(
+        verbose=args.verbose,
+        upload=not args.no_upload,
+        debug=args.debug,
+        center=args.center,
+        blank=args.blank,
+        remarkable_dir=args.remarkable_dir,
+        rmapi_path=args.rmapi,
+        pdfcrop_path=args.pdfcrop,
+        pdftk_path=args.pdftk,
+        gs_path=args.gs,
+    )
+    provider(**options).run(args.input, filename=args.filename)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
new file mode 100644
index 0000000..af19d22
--- /dev/null
+++ b/paper2remarkable/utils.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+"""Utility functions for a2r
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+
+import sys
+
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+
+
+def exception(msg):
+    """Report ``msg`` and a bug-report hint on stderr, then exit with status 1."""
+    print("ERROR: " + msg, file=sys.stderr)
+    print("Error occurred. Exiting.\n", file=sys.stderr)
+    sys.stderr.write(
+        "If you think this might be a bug, please raise an issue on GitHub: %s\n"
+        % GITHUB_URL
+    )
+    raise SystemExit(1)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e5a697e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import io
+import os
+
+from setuptools import find_packages, setup
+
+# Package meta-data.
+AUTHOR = "Gertjan van den Burg"
+DESCRIPTION = "Easily download an academic paper and send it to the reMarkable"
+EMAIL = "gertjanvandenburg@gmail.com"
+LICENSE = "MIT"
+LICENSE_TROVE = "License :: OSI Approved :: MIT License"
+NAME = "paper2remarkable"
+REQUIRES_PYTHON = ">=3.5.0"
+URL = "https://github.com/GjjvdBurg/paper2remarkable"
+VERSION = None
+
+REQUIRED = [  # packages required for this module to be executed
+    "beautifulsoup4>=4.8.0",  # PyPI project name; "bs4" is the import name
+    "requests>=2.21",
+    "pdfplumber>=0.5.12",
+    "unidecode>=1.1",
+    "PyPDF2",  # imported by the package but previously not listed
+    "titlecase",  # imported by the package but previously not listed
+]
+docs_require = []
+test_require = []
+dev_require = []
+
+# What packages are optional?
+EXTRAS = {
+    "docs": docs_require,
+    "tests": test_require,
+    "dev": docs_require + test_require + dev_require,
+}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
+try:
+    with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
+        long_description = "\n" + f.read()
+except FileNotFoundError:
+    long_description = DESCRIPTION
+
+# Load the package's __version__.py module as a dictionary.
+about = {}
+if not VERSION:
+    project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
+    with open(os.path.join(here, project_slug, "__version__.py")) as f:
+        exec(f.read(), about)
+else:
+    about["__version__"] = VERSION
+
+# Where the magic happens:
+setup(
+    name=NAME,
+    version=about["__version__"],
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author=AUTHOR,
+    author_email=EMAIL,
+    python_requires=REQUIRES_PYTHON,
+    url=URL,
+    packages=find_packages(
+        exclude=["tests", "*.tests", "*.tests.*", "tests.*"]
+    ),
+    install_requires=REQUIRED,
+    extras_require=EXTRAS,
+    include_package_data=True,
+    license=LICENSE,
+    ext_modules=[],
+    entry_points={"console_scripts": ["p2r = paper2remarkable.__main__:main"]},
+    classifiers=[
+        # Trove classifiers
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        LICENSE_TROVE,
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: Implementation :: CPython",
+        "Programming Language :: Python :: Implementation :: PyPy",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "Topic :: Education",
+        "Topic :: Scientific/Engineering",
+        "Topic :: Utilities",
+    ],
+)
-- 
cgit v1.2.3


From 1bb2edea5723c8987de60f8783ba645df8e0cfd5 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:22:00 +0100
Subject: Define operations in the init function

This gives cleaner code and allows operations to
be defined and registered by specific providers,
such as the dearxiv functionality.
---
 paper2remarkable/providers/_base.py | 72 +++++--------------------------------
 paper2remarkable/providers/arxiv.py | 51 ++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 63 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 05fc0b7..77413a9 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -13,7 +13,6 @@ import abc
 import bs4
 import datetime
 import os
-import re
 import requests
 import shutil
 import string
@@ -56,14 +55,20 @@ class Provider(metaclass=abc.ABCMeta):
         self.verbose = verbose
         self.upload = upload
         self.debug = debug
-        self.center = center
-        self.blank = blank
         self.remarkable_dir = remarkable_dir
         self.rmapi_path = rmapi_path
         self.pdfcrop_path = pdfcrop_path
         self.pdftk_path = pdftk_path
         self.gs_path = gs_path
 
+        # Define the operations to run on the pdf. Providers can add others
+        self.operations = [("crop", self.crop_pdf)]
+        if center:
+            self.operations.append(("center", self.center_pdf))
+        if blank:
+            self.operations.append(("blank", self.blank_pdf))
+        self.operations.append(("shrink", self.shrink_pdf))
+
         self.log("Starting %s" % type(self).__name__)
 
     def log(self, msg, mode="info"):
@@ -167,9 +172,6 @@ class Provider(metaclass=abc.ABCMeta):
         return name
 
     def blank_pdf(self, filepath):
-        if not self.blank:
-            return filepath
-
         self.log("Adding blank pages")
         input_pdf = PyPDF2.PdfFileReader(filepath)
         output_pdf = PyPDF2.PdfFileWriter()
@@ -201,9 +203,6 @@ class Provider(metaclass=abc.ABCMeta):
         return cropped_file
 
     def center_pdf(self, filepath):
-        if not self.center:
-            return filepath
-
         self.log("Centering pdf file")
         centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
         cropper = Cropper(
@@ -295,52 +294,6 @@ class Provider(metaclass=abc.ABCMeta):
             exception("Uploading file %s to reMarkable failed" % filepath)
         self.log("Upload successful.")
 
-    def dearxiv(self, input_file):
-        """Remove the arXiv timestamp from a pdf"""
-        self.log("Removing arXiv timestamp")
-        basename = os.path.splitext(input_file)[0]
-        uncompress_file = basename + "_uncompress.pdf"
-
-        status = subprocess.call(
-            [
-                self.pdftk_path,
-                input_file,
-                "output",
-                uncompress_file,
-                "uncompress",
-            ]
-        )
-        if not status == 0:
-            exception("pdftk failed to uncompress the pdf.")
-
-        with open(uncompress_file, "rb") as fid:
-            data = fid.read()
-            # Remove the text element
-            data = re.sub(
-                b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
-                b"()Tj",
-                data,
-            )
-            # Remove the URL element
-            data = re.sub(
-                b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
-                b"",
-                data,
-            )
-
-        removed_file = basename + "_removed.pdf"
-        with open(removed_file, "wb") as oid:
-            oid.write(data)
-
-        output_file = basename + "_dearxiv.pdf"
-        status = subprocess.call(
-            [self.pdftk_path, removed_file, "output", output_file, "compress"]
-        )
-        if not status == 0:
-            exception("pdftk failed to compress the pdf.")
-
-        return output_file
-
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
@@ -352,15 +305,8 @@ class Provider(metaclass=abc.ABCMeta):
             self.retrieve_pdf(src, tmp_filename)
             self.check_file_is_pdf(tmp_filename)
 
-            ops = [
-                self.dearxiv,
-                self.crop_pdf,
-                self.center_pdf,
-                self.blank_pdf,
-                self.shrink_pdf,
-            ]
             intermediate_fname = tmp_filename
-            for op in ops:
+            for _, op in self.operations:  # entries are (name, function) pairs
                 intermediate_fname = op(intermediate_fname)
             shutil.move(intermediate_fname, clean_filename)
 
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index fc5c004..b1982f4 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -8,7 +8,9 @@ Copyright: 2019, G.J.J. van den Burg
 
 """
 
+import os
 import re
+import subprocess
 
 from ._base import Provider
 from ..utils import exception
@@ -22,6 +24,9 @@ class Arxiv(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
+        # register the dearxiv operation
+        self.operations.insert(0, ("dearxiv", self.dearxiv))
+
     def get_abs_pdf_urls(self, url):
         """Get the pdf and abs url from any given arXiv url """
         if re.match(self.re_abs, url):
@@ -37,3 +42,49 @@ class Arxiv(Provider):
     def validate(src):
         """Check if the url is to an arXiv page. """
         return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src)
+
+    def dearxiv(self, input_file):
+        """Remove the arXiv timestamp from a pdf
+
+        The file is uncompressed with pdftk, the stamp's text element and URL
+        element are removed with regular expressions, and the result is
+        compressed again. Returns the path of the resulting pdf.
+        """
+        self.log("Removing arXiv timestamp")
+        basename = os.path.splitext(input_file)[0]
+        uncompress_file = basename + "_uncompress.pdf"
+        removed_file = basename + "_removed.pdf"
+        output_file = basename + "_dearxiv.pdf"
+
+        cmd = [self.pdftk_path, input_file, "output", uncompress_file]
+        status = subprocess.call(cmd + ["uncompress"])
+        if status != 0:
+            exception("pdftk failed to uncompress the pdf.")
+
+        with open(uncompress_file, "rb") as fid:
+            data = fid.read()
+
+        # The patterns are raw bytes literals so that "\d", "\(" and friends
+        # are regex escapes rather than (invalid) string escape sequences.
+        # Remove the text element
+        data = re.sub(
+            rb"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+            b"()Tj",
+            data,
+        )
+        # Remove the URL element
+        data = re.sub(
+            rb"<<\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\n\/S /URI\n>>\n",
+            b"",
+            data,
+        )
+
+        with open(removed_file, "wb") as oid:
+            oid.write(data)
+
+        cmd = [self.pdftk_path, removed_file, "output", output_file, "compress"]
+        status = subprocess.call(cmd)
+        if status != 0:
+            exception("pdftk failed to compress the pdf.")
+
+        return output_file
-- 
cgit v1.2.3


From 1ac27a769c1fabd3f2339f7f929c4d39cf20564e Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:49:13 +0100
Subject: Move pdf operations to a separate module

---
 paper2remarkable/pdf_ops.py         | 97 +++++++++++++++++++++++++++++++++++++
 paper2remarkable/providers/_base.py | 82 +++++--------------------------
 2 files changed, 109 insertions(+), 70 deletions(-)
 create mode 100644 paper2remarkable/pdf_ops.py

diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
new file mode 100644
index 0000000..d1eae40
--- /dev/null
+++ b/paper2remarkable/pdf_ops.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+"""Operations on PDF files
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+
+import PyPDF2
+import logging
+import os
+import subprocess
+
+from .crop import Cropper
+
+
+def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
+    """Crop the pdf file using Cropper; on failure the input path is returned.
+    """
+    logging.info("Cropping pdf file")
+    stem = os.path.splitext(filepath)[0]
+    cropped_file = stem + "-crop.pdf"
+    status = Cropper(
+        filepath, cropped_file, pdfcrop_path=pdfcrop_path
+    ).crop(margins=15)
+    if status != 0:
+        logging.warning("Failed to crop the pdf file at: %s" % filepath)
+    elif not os.path.exists(cropped_file):
+        logging.warning(
+            "Can't find cropped file '%s' where expected." % cropped_file
+        )
+    else:
+        return cropped_file
+    return filepath
+
+
+def center_pdf(filepath, pdfcrop_path="pdfcrop"):
+    """Center the pdf file using Cropper; on failure the input path is returned.
+    """
+    logging.info("Centering pdf file")
+    stem = os.path.splitext(filepath)[0]
+    centered_file = stem + "-center.pdf"
+    status = Cropper(
+        filepath, centered_file, pdfcrop_path=pdfcrop_path
+    ).center()
+    if status != 0:
+        logging.warning("Failed to center the pdf file at: %s" % filepath)
+    elif not os.path.exists(centered_file):
+        logging.warning(
+            "Can't find centered file '%s' where expected." % centered_file
+        )
+    else:
+        return centered_file
+    return filepath
+
+
+def blank_pdf(filepath):
+    """Write a copy of the PDF with a blank page added after every page.
+
+    Returns the path of the new "-blank.pdf" file.
+    """
+    logging.info("Adding blank pages")
+    writer = PyPDF2.PdfFileWriter()
+    for page in PyPDF2.PdfFileReader(filepath).pages:
+        writer.addPage(page)
+        writer.addBlankPage()
+    output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
+    with open(output_file, "wb") as handle:
+        writer.write(handle)
+    return output_file
+
+
+def shrink_pdf(filepath, gs_path="gs"):
+    """Shrink the PDF file size using Ghostscript
+
+    Returns the "-shrink.pdf" path, or ``filepath`` if gs exits non-zero.
+    """
+    logging.info("Shrinking pdf file")
+    output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+    # Ghostscript options; the input file is appended as the last argument.
+    gs_options = [
+        "-sDEVICE=pdfwrite",
+        "-dCompatibilityLevel=1.4",
+        "-dPDFSETTINGS=/printer",
+        "-dNOPAUSE",
+        "-dBATCH",
+        "-dQUIET",
+        "-sOutputFile=%s" % output_file,
+    ]
+    returncode = subprocess.call([gs_path] + gs_options + [filepath])
+    if returncode == 0:
+        return output_file
+    logging.warning("Failed to shrink the pdf file")
+    return filepath
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 77413a9..d427f9e 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -22,7 +22,7 @@ import time
 import titlecase
 import unidecode
 
-from ..crop import Cropper
+from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
 from ..utils import exception
 
 HEADERS = {
@@ -66,7 +66,7 @@ class Provider(metaclass=abc.ABCMeta):
         if center:
             self.operations.append(("center", self.center_pdf))
         if blank:
-            self.operations.append(("blank", self.blank_pdf))
+            self.operations.append(("blank", blank_pdf))
         self.operations.append(("shrink", self.shrink_pdf))
 
         self.log("Starting %s" % type(self).__name__)
@@ -93,6 +93,16 @@ class Provider(metaclass=abc.ABCMeta):
     def validate(src):
         """ Validate whether ``src`` is appropriate for this provider """
 
+    # Wrappers for pdf operations that have additional arguments
+    def crop_pdf(self, filepath):
+        return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+
+    def center_pdf(self, filepath):
+        return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+
+    def shrink_pdf(self, filepath):
+        return shrink_pdf(filepath, gs_path=self.gs_path)
+
     def retrieve_pdf(self, src, filename):
         """ Download pdf from src and save to filename """
         _, pdf_url = self.get_abs_pdf_urls(src)
@@ -171,74 +181,6 @@ class Provider(metaclass=abc.ABCMeta):
         self.log("Created filename: %s" % name)
         return name
 
-    def blank_pdf(self, filepath):
-        self.log("Adding blank pages")
-        input_pdf = PyPDF2.PdfFileReader(filepath)
-        output_pdf = PyPDF2.PdfFileWriter()
-        for page in input_pdf.pages:
-            output_pdf.addPage(page)
-            output_pdf.addBlankPage()
-
-        output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
-        with open(output_file, "wb") as fp:
-            output_pdf.write(fp)
-        return output_file
-
-    def crop_pdf(self, filepath):
-        self.log("Cropping pdf file")
-        cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
-        cropper = Cropper(
-            filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
-        )
-        status = cropper.crop(margins=15)
-
-        if not status == 0:
-            self.warn("Failed to crop the pdf file at: %s" % filepath)
-            return filepath
-        if not os.path.exists(cropped_file):
-            self.warn(
-                "Can't find cropped file '%s' where expected." % cropped_file
-            )
-            return filepath
-        return cropped_file
-
-    def center_pdf(self, filepath):
-        self.log("Centering pdf file")
-        centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
-        cropper = Cropper(
-            filepath, centered_file, pdfcrop_path=self.pdfcrop_path
-        )
-        status = cropper.center()
-        if not status == 0:
-            self.warn("Failed to center the pdf file at: %s" % filepath)
-            return filepath
-        if not os.path.exists(centered_file):
-            self.warn(
-                "Can't find centered file '%s' where expected." % centered_file
-            )
-            return filepath
-        return centered_file
-
-    def shrink_pdf(self, filepath):
-        self.log("Shrinking pdf file")
-        output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
-        status = subprocess.call(
-            [
-                self.gs_path,
-                "-sDEVICE=pdfwrite",
-                "-dCompatibilityLevel=1.4",
-                "-dPDFSETTINGS=/printer",
-                "-dNOPAUSE",
-                "-dBATCH",
-                "-dQUIET",
-                "-sOutputFile=%s" % output_file,
-                filepath,
-            ]
-        )
-        if not status == 0:
-            self.warn("Failed to shrink the pdf file")
-            return filepath
-        return output_file
 
     def check_file_is_pdf(self, filename):
         try:
-- 
cgit v1.2.3


From febe13fc7006db65f3a90bbb8e30d646fd0b72af Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:49:34 +0100
Subject: Move pdf file check to utils

---
 paper2remarkable/providers/_base.py | 11 -----------
 paper2remarkable/utils.py           | 12 ++++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index d427f9e..8e9223e 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -181,17 +181,6 @@ class Provider(metaclass=abc.ABCMeta):
         self.log("Created filename: %s" % name)
         return name
 
-
-    def check_file_is_pdf(self, filename):
-        try:
-            fp = open(filename, "rb")
-            pdf = PyPDF2.PdfFileReader(fp, strict=False)
-            fp.close()
-            del pdf
-            return True
-        except PyPDF2.utils.PdfReadError:
-            exception("Downloaded file isn't a valid pdf file.")
-
     def download_url(self, url, filename):
         """Download the content of an url and save it to a filename """
         self.log("Downloading file at url: %s" % url)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index af19d22..5188afb 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -9,6 +9,7 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 
+import PyPDF2
 import sys
 
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
@@ -23,3 +24,14 @@ def exception(msg):
         % GITHUB_URL
     )
     raise SystemExit(1)
+
+
+def check_file_is_pdf(filename):
+    try:
+        fp = open(filename, "rb")
+        pdf = PyPDF2.PdfFileReader(fp, strict=False)
+        fp.close()
+        del pdf
+        return True
+    except PyPDF2.utils.PdfReadError:
+        exception("Downloaded file isn't a valid pdf file.")
-- 
cgit v1.2.3


From 283cc289655448f3d3685f57c8adfb84af2f6d69 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:50:01 +0100
Subject: Switch to logging module throughout

---
 paper2remarkable/providers/_base.py | 45 +++++++++++++------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 8e9223e..3692924 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -8,10 +8,9 @@ Copyright: 2019, G.J.J. van den Burg
 
 """
 
-import PyPDF2
 import abc
 import bs4
-import datetime
+import logging
 import os
 import requests
 import shutil
@@ -52,7 +51,6 @@ class Provider(metaclass=abc.ABCMeta):
         pdftk_path="pdftk",
         gs_path="gs",
     ):
-        self.verbose = verbose
         self.upload = upload
         self.debug = debug
         self.remarkable_dir = remarkable_dir
@@ -61,32 +59,19 @@ class Provider(metaclass=abc.ABCMeta):
         self.pdftk_path = pdftk_path
         self.gs_path = gs_path
 
+        if not self.verbose:
+            logging.disable()
+
         # Define the operations to run on the pdf. Providers can add others
         self.operations = [("crop", self.crop_pdf)]
         if center:
             self.operations.append(("center", self.center_pdf))
+
         if blank:
             self.operations.append(("blank", blank_pdf))
         self.operations.append(("shrink", self.shrink_pdf))
 
-        self.log("Starting %s" % type(self).__name__)
-
-    def log(self, msg, mode="info"):
-        if not self.verbose:
-            return
-        if not mode in ["info", "warning"]:
-            raise ValueError("unknown logging mode.")
-        now = datetime.datetime.now()
-        print(
-            now.strftime("%Y-%m-%d %H:%M:%S")
-            + " - "
-            + mode.upper()
-            + " - "
-            + msg
-        )
-
-    def warn(self, msg):
-        self.log(msg, mode="warning")
+        logging.info("Starting %s" % type(self).__name__)
 
     @staticmethod
     @abc.abstractmethod
@@ -142,7 +127,7 @@ class Provider(metaclass=abc.ABCMeta):
     ):
         """ Retrieve the title/author (surnames)/year information """
         abs_url, _ = self.get_abs_pdf_urls(src)
-        self.log("Getting paper info")
+        logging.info("Getting paper info")
         page = self.get_page_with_retry(abs_url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = self.get_authors(soup)
@@ -163,7 +148,7 @@ class Provider(metaclass=abc.ABCMeta):
         if not filename is None:
             return filename
         # we assume that the list of authors is surname only.
-        self.log("Generating output filename")
+        logging.info("Generating output filename")
 
         if len(info["authors"]) > 3:
             author_part = info["authors"][0] + "_et_al"
@@ -178,12 +163,12 @@ class Provider(metaclass=abc.ABCMeta):
 
         name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
         name = unidecode.unidecode(name)
-        self.log("Created filename: %s" % name)
+        logging.info("Created filename: %s" % name)
         return name
 
     def download_url(self, url, filename):
         """Download the content of an url and save it to a filename """
-        self.log("Downloading file at url: %s" % url)
+        logging.info("Downloading file at url: %s" % url)
         content = self.get_page_with_retry(url)
         with open(filename, "wb") as fid:
             fid.write(content)
@@ -198,15 +183,17 @@ class Provider(metaclass=abc.ABCMeta):
             except requests.exceptions.ConnectionError:
                 error = True
             if error or not res.ok:
-                self.warn("Error getting url %s. Retrying in 5 seconds" % url)
+                logging.warning(
+                    "Error getting url %s. Retrying in 5 seconds" % url
+                )
                 time.sleep(5)
                 continue
-            self.log("Downloading url: %s" % url)
+            logging.info("Downloading url: %s" % url)
             return res.content
 
     def upload_to_rm(self, filepath):
         remarkable_dir = self.remarkable_dir.rstrip("/")
-        self.log("Starting upload to reMarkable")
+        logging.info("Starting upload to reMarkable")
         if remarkable_dir:
             status = subprocess.call(
                 [self.rmapi_path, "mkdir", remarkable_dir + "/"],
@@ -223,7 +210,7 @@ class Provider(metaclass=abc.ABCMeta):
         )
         if not status == 0:
             exception("Uploading file %s to reMarkable failed" % filepath)
-        self.log("Upload successful.")
+        logging.info("Upload successful.")
 
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
-- 
cgit v1.2.3


From 2b8289495ff5910d75013b903d82085bcd7742a1 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:56:53 +0100
Subject: Move upload functionality to utils

---
 paper2remarkable/providers/_base.py | 30 ++++++------------------------
 paper2remarkable/utils.py           | 27 +++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 3692924..85415a9 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -15,14 +15,13 @@ import os
 import requests
 import shutil
 import string
-import subprocess
 import tempfile
 import time
 import titlecase
 import unidecode
 
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
-from ..utils import exception
+from ..utils import upload_to_remarkable, check_file_is_pdf
 
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -191,27 +190,6 @@ class Provider(metaclass=abc.ABCMeta):
             logging.info("Downloading url: %s" % url)
             return res.content
 
-    def upload_to_rm(self, filepath):
-        remarkable_dir = self.remarkable_dir.rstrip("/")
-        logging.info("Starting upload to reMarkable")
-        if remarkable_dir:
-            status = subprocess.call(
-                [self.rmapi_path, "mkdir", remarkable_dir + "/"],
-                stdout=subprocess.DEVNULL,
-            )
-            if not status == 0:
-                exception(
-                    "Creating directory %s on reMarkable failed"
-                    % remarkable_dir
-                )
-        status = subprocess.call(
-            [self.rmapi_path, "put", filepath, remarkable_dir + "/"],
-            stdout=subprocess.DEVNULL,
-        )
-        if not status == 0:
-            exception("Uploading file %s to reMarkable failed" % filepath)
-        logging.info("Upload successful.")
-
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
@@ -234,7 +212,11 @@ class Provider(metaclass=abc.ABCMeta):
                 return input()
 
             if self.upload:
-                return self.upload_to_rm(clean_filename)
+                return upload_to_remarkable(
+                    clean_filename,
+                    remarkable_dir=self.remarkable_dir,
+                    rmapi_path=self.rmapi_path,
+                )
 
             target_path = os.path.join(self.initial_dir, clean_filename)
             while os.path.exists(target_path):
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 5188afb..26b024e 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -10,6 +10,8 @@ Copyright: 2019, G.J.J. van den Burg
 
 
 import PyPDF2
+import logging
+import subprocess
 import sys
 
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
@@ -35,3 +37,28 @@ def check_file_is_pdf(filename):
         return True
     except PyPDF2.utils.PdfReadError:
         exception("Downloaded file isn't a valid pdf file.")
+
+
+def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
+    logging.info("Starting upload to reMarkable")
+
+    # Create the reMarkable dir if it doesn't exist
+    remarkable_dir = remarkable_dir.rstrip("/")
+    if remarkable_dir:
+        status = subprocess.call(
+            [rmapi_path, "mkdir", remarkable_dir + "/"],
+            stdout=subprocess.DEVNULL,
+        )
+        if not status == 0:
+            exception(
+                "Creating directory %s on reMarkable failed" % remarkable_dir
+            )
+
+    # Upload the file
+    status = subprocess.call(
+        [rmapi_path, "put", filepath, remarkable_dir + "/"],
+        stdout=subprocess.DEVNULL,
+    )
+    if not status == 0:
+        exception("Uploading file %s to reMarkable failed" % filepath)
+    logging.info("Upload successful.")
-- 
cgit v1.2.3


From 61807b2ce2d1d4c70016a114c77a8fe5da9fbcdb Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 14:57:19 +0100
Subject: Minor fixes to check_file_is_pdf

---
 paper2remarkable/providers/_base.py | 2 +-
 paper2remarkable/utils.py           | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 85415a9..f703874 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -199,7 +199,7 @@ class Provider(metaclass=abc.ABCMeta):
         with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
             os.chdir(working_dir)
             self.retrieve_pdf(src, tmp_filename)
-            self.check_file_is_pdf(tmp_filename)
+            check_file_is_pdf(tmp_filename)
 
             intermediate_fname = tmp_filename
             for op in self.operations:
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 26b024e..110453b 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -29,6 +29,10 @@ def exception(msg):
 
 
 def check_file_is_pdf(filename):
+    """Check that a given file is a PDF file.
+
+    This is done by trying to open it using PyPDF2.
+    """
     try:
         fp = open(filename, "rb")
         pdf = PyPDF2.PdfFileReader(fp, strict=False)
@@ -36,7 +40,7 @@ def check_file_is_pdf(filename):
         del pdf
         return True
     except PyPDF2.utils.PdfReadError:
-        exception("Downloaded file isn't a valid pdf file.")
+        exception("File %s isn't a valid pdf file." % filename)
 
 
 def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
-- 
cgit v1.2.3


From 7551591bf876f005c47a5fe98618e0ec6e2412d2 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 15:01:32 +0100
Subject: Move download functionality to utils

---
 paper2remarkable/providers/_base.py | 44 +++++++------------------------------
 paper2remarkable/utils.py           | 36 ++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index f703874..4354776 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -12,22 +12,19 @@ import abc
 import bs4
 import logging
 import os
-import requests
 import shutil
 import string
 import tempfile
-import time
 import titlecase
 import unidecode
 
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
-from ..utils import upload_to_remarkable, check_file_is_pdf
-
-HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
-    "Safari/537.36"
-}
+from ..utils import (
+    upload_to_remarkable,
+    check_file_is_pdf,
+    download_url,
+    get_page_with_retry,
+)
 
 
 class Provider(metaclass=abc.ABCMeta):
@@ -90,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta):
     def retrieve_pdf(self, src, filename):
         """ Download pdf from src and save to filename """
         _, pdf_url = self.get_abs_pdf_urls(src)
-        self.download_url(pdf_url, filename)
+        download_url(pdf_url, filename)
 
     def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
         op = (lambda x: x) if op is None else op
@@ -127,7 +124,7 @@ class Provider(metaclass=abc.ABCMeta):
         """ Retrieve the title/author (surnames)/year information """
         abs_url, _ = self.get_abs_pdf_urls(src)
         logging.info("Getting paper info")
-        page = self.get_page_with_retry(abs_url)
+        page = get_page_with_retry(abs_url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = self.get_authors(soup)
         title = self.get_title(soup)
@@ -165,31 +162,6 @@ class Provider(metaclass=abc.ABCMeta):
         logging.info("Created filename: %s" % name)
         return name
 
-    def download_url(self, url, filename):
-        """Download the content of an url and save it to a filename """
-        logging.info("Downloading file at url: %s" % url)
-        content = self.get_page_with_retry(url)
-        with open(filename, "wb") as fid:
-            fid.write(content)
-
-    def get_page_with_retry(self, url, tries=5):
-        count = 0
-        while count < tries:
-            count += 1
-            error = False
-            try:
-                res = requests.get(url, headers=HEADERS)
-            except requests.exceptions.ConnectionError:
-                error = True
-            if error or not res.ok:
-                logging.warning(
-                    "Error getting url %s. Retrying in 5 seconds" % url
-                )
-                time.sleep(5)
-                continue
-            logging.info("Downloading url: %s" % url)
-            return res.content
-
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 110453b..e2a714b 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -13,9 +13,17 @@ import PyPDF2
 import logging
 import subprocess
 import sys
+import requests
+import time
 
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
 
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+    "Safari/537.36"
+}
+
 
 def exception(msg):
     print("ERROR: " + msg, file=sys.stderr)
@@ -43,6 +51,34 @@ def check_file_is_pdf(filename):
         exception("File %s isn't a valid pdf file." % filename)
 
 
+def download_url(url, filename):
+    """Download the content of an url and save it to a filename """
+    logging.info("Downloading file at url: %s" % url)
+    content = get_page_with_retry(url)
+    with open(filename, "wb") as fid:
+        fid.write(content)
+
+
+def get_page_with_retry(url, tries=5):
+    count = 0
+    while count < tries:
+        count += 1
+        error = False
+        try:
+            res = requests.get(url, headers=HEADERS)
+        except requests.exceptions.ConnectionError:
+            error = True
+        if error or not res.ok:
+            logging.warning(
+                "(%i/%i) Error getting url %s. Retrying in 5 seconds." % 
+                (count, tries, url)
+            )
+            time.sleep(5)
+            continue
+        logging.info("Downloading url: %s" % url)
+        return res.content
+
+
 def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
     logging.info("Starting upload to reMarkable")
 
-- 
cgit v1.2.3


From eadbd95da32057e01c1b4d5f2cb554e4c0c0b292 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 15:10:47 +0100
Subject: Move string cleaning to utils

---
 paper2remarkable/providers/_base.py | 15 ++++-----------
 paper2remarkable/utils.py           | 19 +++++++++++++++----
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 4354776..db13434 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -13,17 +13,17 @@ import bs4
 import logging
 import os
 import shutil
-import string
 import tempfile
 import titlecase
 import unidecode
 
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
 from ..utils import (
-    upload_to_remarkable,
     check_file_is_pdf,
+    clean_string,
     download_url,
     get_page_with_retry,
+    upload_to_remarkable,
 )
 
 
@@ -131,13 +131,6 @@ class Provider(metaclass=abc.ABCMeta):
         date = self.get_date(soup)
         return dict(title=title, date=date, authors=authors)
 
-    def string_clean(self, s):
-        """ Clean a string to replace accented characters with equivalents and 
-        keep only the allowed characters """
-        normalized = unidecode.unidecode(s)
-        allowed = string.ascii_letters + string.digits + "_ ."
-        cleaned = "".join(c if c in allowed else "_" for c in normalized)
-        return cleaned
 
     def create_filename(self, info, filename=None):
         """ Generate filename using the info dict or filename if provided """
@@ -150,9 +143,9 @@ class Provider(metaclass=abc.ABCMeta):
             author_part = info["authors"][0] + "_et_al"
         else:
             author_part = "_".join(info["authors"])
-        author_part = self.string_clean(author_part)
+        author_part = clean_string(author_part)
 
-        title_part = self.string_clean(info["title"])
+        title_part = clean_string(info["title"])
         title_part = titlecase.titlecase(title_part).replace(" ", "_")
 
         year_part = info["date"].split("/")[0]
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index e2a714b..15cac95 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -8,13 +8,14 @@ Copyright: 2019, G.J.J. van den Burg
 
 """
 
-
 import PyPDF2
 import logging
+import requests
+import string
 import subprocess
 import sys
-import requests
 import time
+import unidecode
 
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
 
@@ -36,6 +37,16 @@ def exception(msg):
     raise SystemExit(1)
 
 
+def clean_string(s):
+    """ Clean a string by replacing accented characters with equivalents and 
+    keeping only the allowed characters (ascii letters, digits, underscore, 
+    space, and period)"""
+    normalized = unidecode.unidecode(s)
+    allowed = string.ascii_letters + string.digits + "_ ."
+    cleaned = "".join(c if c in allowed else "_" for c in normalized)
+    return cleaned
+
+
 def check_file_is_pdf(filename):
     """Check that a given file is a PDF file.
 
@@ -70,8 +81,8 @@ def get_page_with_retry(url, tries=5):
             error = True
         if error or not res.ok:
             logging.warning(
-                "(%i/%i) Error getting url %s. Retrying in 5 seconds." % 
-                (count, tries, url)
+                "(%i/%i) Error getting url %s. Retrying in 5 seconds."
+                % (count, tries, url)
             )
             time.sleep(5)
             continue
-- 
cgit v1.2.3


From 5a8b1f64445f55201999e3355589b83c01f05ba4 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 15:11:16 +0100
Subject: Simplify filename code

---
 paper2remarkable/providers/_base.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index db13434..8b454b0 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -131,11 +131,8 @@ class Provider(metaclass=abc.ABCMeta):
         date = self.get_date(soup)
         return dict(title=title, date=date, authors=authors)
 
-
-    def create_filename(self, info, filename=None):
+    def create_filename(self, info):
         """ Generate filename using the info dict or filename if provided """
-        if not filename is None:
-            return filename
         # we assume that the list of authors is surname only.
         logging.info("Generating output filename")
 
@@ -157,7 +154,7 @@ class Provider(metaclass=abc.ABCMeta):
 
     def run(self, src, filename=None):
         info = self.get_paper_info(src)
-        clean_filename = self.create_filename(info, filename)
+        clean_filename = filename or self.create_filename(info)
         tmp_filename = "paper.pdf"
 
         self.initial_dir = os.getcwd()
-- 
cgit v1.2.3


From 221a27aaf0b5e7746a790610fe568ed33dcfbd7a Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 22:56:18 +0100
Subject: Rename unit test file

---
 tests/test.py           | 106 ------------------------------------------------
 tests/test_providers.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 106 deletions(-)
 delete mode 100644 tests/test.py
 create mode 100644 tests/test_providers.py

diff --git a/tests/test.py b/tests/test.py
deleted file mode 100644
index 83c74af..0000000
--- a/tests/test.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__author__ = "G.J.J. van den Burg"
-
-"""Tests"""
-
-import unittest
-import tempfile
-import hashlib
-import shutil
-import os
-
-from arxiv2remarkable import (
-    ACM,
-    Arxiv,
-    LocalFile,
-    OpenReview,
-    PdfUrl,
-    Pubmed,
-    Springer,
-)
-
-VERBOSE = False
-
-
-def md5sum(filename):
-    blocksize = 65536
-    hasher = hashlib.md5()
-    with open(filename, "rb") as fid:
-        buf = fid.read(blocksize)
-        while len(buf) > 0:
-            hasher.update(buf)
-            buf = fid.read(blocksize)
-    return hasher.hexdigest()
-
-
-class Tests(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.original_dir = os.getcwd()
-
-    def setUp(self):
-        self.test_dir = tempfile.mkdtemp()
-        os.chdir(self.test_dir)
-
-    def tearDown(self):
-        os.chdir(self.original_dir)
-        shutil.rmtree(self.test_dir)
-
-    def test_arxiv(self):
-        prov = Arxiv(upload=False, verbose=VERBOSE)
-        url = "https://arxiv.org/abs/1811.11242v1"
-        exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_pmc(self):
-        prov = Pubmed(upload=False, verbose=VERBOSE)
-        url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
-        exp_filename = (
-            "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf"
-        )
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_acm(self):
-        prov = ACM(upload=False, verbose=VERBOSE)
-        url = "https://dl.acm.org/citation.cfm?id=3025626"
-        exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_openreview(self):
-        prov = OpenReview(upload=False, verbose=VERBOSE)
-        url = "https://openreview.net/forum?id=S1x4ghC9tQ"
-        exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_springer(self):
-        prov = Springer(upload=False, verbose=VERBOSE)
-        url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
-        exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp_filename, os.path.basename(filename))
-
-    def test_local(self):
-        local_filename = "test.pdf"
-        with open(local_filename, "w") as fp:
-            fp.write(
-                "%PDF-1.1\n%¥±ë\n\n1 0 obj\n  << /Type /Catalog\n     /Pages 2 0 R\n  >>\nendobj\n\n2 0 obj\n  << /Type /Pages\n     /Kids [3 0 R]\n     /Count 1\n     /MediaBox [0 0 300 144]\n  >>\nendobj\n\n3 0 obj\n  <<  /Type /Page\n      /Parent 2 0 R\n      /Resources\n       << /Font\n           << /F1\n               << /Type /Font\n                  /Subtype /Type1\n                  /BaseFont /Times-Roman\n               >>\n           >>\n       >>\n      /Contents 4 0 R\n  >>\nendobj\n\n4 0 obj\n  << /Length 55 >>\nstream\n  BT\n    /F1 18 Tf\n    0 0 Td\n    (Hello World) Tj\n  ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n  <<  /Root 1 0 R\n      /Size 5\n  >>\nstartxref\n565\n%%EOF"
-            )
-        prov = LocalFile(upload=False, verbose=VERBOSE)
-        filename = prov.run(local_filename)
-        self.assertEqual("test_.pdf", os.path.basename(filename))
-
-    def test_pdfurl(self):
-        prov = PdfUrl(upload=False, verbose=VERBOSE)
-        url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
-        filename = prov.run(url, filename="test.pdf")
-        self.assertEqual("test.pdf", os.path.basename(filename))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_providers.py b/tests/test_providers.py
new file mode 100644
index 0000000..bb793b3
--- /dev/null
+++ b/tests/test_providers.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+__author__ = "G.J.J. van den Burg"
+
+"""Tests"""
+
+import unittest
+import tempfile
+import hashlib
+import shutil
+import os
+
+from paper2remarkable.providers import (
+    ACM,
+    Arxiv,
+    LocalFile,
+    OpenReview,
+    PdfUrl,
+    PubMed,
+    Springer,
+)
+
+VERBOSE = True
+
+
+def md5sum(filename):
+    blocksize = 65536
+    hasher = hashlib.md5()
+    with open(filename, "rb") as fid:
+        buf = fid.read(blocksize)
+        while len(buf) > 0:
+            hasher.update(buf)
+            buf = fid.read(blocksize)
+    return hasher.hexdigest()
+
+
+class Tests(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.original_dir = os.getcwd()
+
+    def setUp(self):
+        self.test_dir = tempfile.mkdtemp()
+        os.chdir(self.test_dir)
+
+    def tearDown(self):
+        os.chdir(self.original_dir)
+        shutil.rmtree(self.test_dir)
+
+    def test_arxiv(self):
+        prov = Arxiv(upload=False, verbose=VERBOSE)
+        url = "https://arxiv.org/abs/1811.11242v1"
+        exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_pmc(self):
+        prov = PubMed(upload=False, verbose=VERBOSE)
+        url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
+        exp_filename = (
+            "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf"
+        )
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_acm(self):
+        prov = ACM(upload=False, verbose=VERBOSE)
+        url = "https://dl.acm.org/citation.cfm?id=3025626"
+        exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_openreview(self):
+        prov = OpenReview(upload=False, verbose=VERBOSE)
+        url = "https://openreview.net/forum?id=S1x4ghC9tQ"
+        exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_springer(self):
+        prov = Springer(upload=False, verbose=VERBOSE)
+        url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
+        exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
+    def test_local(self):
+        local_filename = "test.pdf"
+        with open(local_filename, "w") as fp:
+            fp.write(
+                "%PDF-1.1\n%¥±ë\n\n1 0 obj\n  << /Type /Catalog\n     /Pages 2 0 R\n  >>\nendobj\n\n2 0 obj\n  << /Type /Pages\n     /Kids [3 0 R]\n     /Count 1\n     /MediaBox [0 0 300 144]\n  >>\nendobj\n\n3 0 obj\n  <<  /Type /Page\n      /Parent 2 0 R\n      /Resources\n       << /Font\n           << /F1\n               << /Type /Font\n                  /Subtype /Type1\n                  /BaseFont /Times-Roman\n               >>\n           >>\n       >>\n      /Contents 4 0 R\n  >>\nendobj\n\n4 0 obj\n  << /Length 55 >>\nstream\n  BT\n    /F1 18 Tf\n    0 0 Td\n    (Hello World) Tj\n  ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n  <<  /Root 1 0 R\n      /Size 5\n  >>\nstartxref\n565\n%%EOF"
+            )
+        prov = LocalFile(upload=False, verbose=VERBOSE)
+        filename = prov.run(local_filename)
+        self.assertEqual("test_.pdf", os.path.basename(filename))
+
+    def test_pdfurl(self):
+        prov = PdfUrl(upload=False, verbose=VERBOSE)
+        url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
+        filename = prov.run(url, filename="test.pdf")
+        self.assertEqual("test.pdf", os.path.basename(filename))
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
cgit v1.2.3


From 058589548a6b91350e240468f5ddaa47e7a10abf Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 22:57:44 +0100
Subject: Move paper info functionality to Informer class

---
 paper2remarkable/__init__.py             |   3 +
 paper2remarkable/providers/__init__.py   |   4 +-
 paper2remarkable/providers/_base.py      | 100 +++++-------------------------
 paper2remarkable/providers/_info.py      | 103 +++++++++++++++++++++++++++++++
 paper2remarkable/providers/acm.py        |  41 ++++++------
 paper2remarkable/providers/arxiv.py      |   9 ++-
 paper2remarkable/providers/local.py      |  26 ++++----
 paper2remarkable/providers/openreview.py |  15 +++--
 paper2remarkable/providers/pdf_url.py    |  27 ++++----
 paper2remarkable/providers/pubmed.py     |  30 +++++----
 paper2remarkable/providers/springer.py   |  15 +++--
 paper2remarkable/utils.py                |   6 +-
 12 files changed, 221 insertions(+), 158 deletions(-)
 create mode 100644 paper2remarkable/providers/_info.py

diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py
index e69de29..71c1105 100644
--- a/paper2remarkable/__init__.py
+++ b/paper2remarkable/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 361c11e..f6f93f9 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 
 from .arxiv import Arxiv
-from .pubmed import Pubmed
+from .pubmed import PubMed
 from .acm import ACM
 from .openreview import OpenReview
 from .springer import Springer
 from .local import LocalFile
 from .pdf_url import PdfUrl
 
-providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
+providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 8b454b0..ca6ab70 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -9,31 +9,19 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 import abc
-import bs4
 import logging
 import os
 import shutil
 import tempfile
-import titlecase
-import unidecode
 
+from ._info import Informer
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
-from ..utils import (
-    check_file_is_pdf,
-    clean_string,
-    download_url,
-    get_page_with_retry,
-    upload_to_remarkable,
-)
+from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable
 
 
 class Provider(metaclass=abc.ABCMeta):
     """ ABC for providers of pdf sources """
 
-    meta_author_key = "citation_author"
-    meta_title_key = "citation_title"
-    meta_date_key = "citation_date"
-
     def __init__(
         self,
         verbose=False,
@@ -54,11 +42,14 @@ class Provider(metaclass=abc.ABCMeta):
         self.pdfcrop_path = pdfcrop_path
         self.pdftk_path = pdftk_path
         self.gs_path = gs_path
+        self.informer = Informer()
 
-        if not self.verbose:
+        # disable logging unless verbose output is requested
+        logging.basicConfig(level=logging.INFO)
+        if not verbose:
             logging.disable()
 
-        # Define the operations to run on the pdf. Providers can add others
+        # Define the operations to run on the pdf. Providers can add others.
         self.operations = [("crop", self.crop_pdf)]
         if center:
             self.operations.append(("center", self.center_pdf))
@@ -84,87 +75,24 @@ class Provider(metaclass=abc.ABCMeta):
     def shrink_pdf(self, filepath):
         return shrink_pdf(filepath, gs_path=self.gs_path)
 
-    def retrieve_pdf(self, src, filename):
+    def retrieve_pdf(self, pdf_url, filename):
         """ Download pdf from src and save to filename """
-        _, pdf_url = self.get_abs_pdf_urls(src)
+        # This must exist so that the LocalFile provider can override it
         download_url(pdf_url, filename)
 
-    def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
-        op = (lambda x: x) if op is None else op
-        # format the author list retrieved by bs4
-        return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
-
-    def get_authors(self, soup):
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": self.meta_author_key})
-        ]
-        return self._format_authors(authors)
-
-    def get_title(self, soup):
-        target = soup.find_all("meta", {"name": self.meta_title_key})
-        return target[0]["content"]
-
-    def _format_date(self, soup_date):
-        return soup_date
-
-    def get_date(self, soup):
-        date = soup.find_all("meta", {"name": self.meta_date_key})[0][
-            "content"
-        ]
-        return self._format_date(date)
-
-    def get_paper_info(
-        self,
-        src,
-        author_key="citation_author",
-        title_key="citation_title",
-        date_key="citation_date",
-    ):
-        """ Retrieve the title/author (surnames)/year information """
-        abs_url, _ = self.get_abs_pdf_urls(src)
-        logging.info("Getting paper info")
-        page = get_page_with_retry(abs_url)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = self.get_authors(soup)
-        title = self.get_title(soup)
-        date = self.get_date(soup)
-        return dict(title=title, date=date, authors=authors)
-
-    def create_filename(self, info):
-        """ Generate filename using the info dict or filename if provided """
-        # we assume that the list of authors is surname only.
-        logging.info("Generating output filename")
-
-        if len(info["authors"]) > 3:
-            author_part = info["authors"][0] + "_et_al"
-        else:
-            author_part = "_".join(info["authors"])
-        author_part = clean_string(author_part)
-
-        title_part = clean_string(info["title"])
-        title_part = titlecase.titlecase(title_part).replace(" ", "_")
-
-        year_part = info["date"].split("/")[0]
-
-        name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
-        name = unidecode.unidecode(name)
-        logging.info("Created filename: %s" % name)
-        return name
-
     def run(self, src, filename=None):
-        info = self.get_paper_info(src)
-        clean_filename = filename or self.create_filename(info)
+        abs_url, pdf_url = self.get_abs_pdf_urls(src)
+        clean_filename = filename or self.informer.get_filename(abs_url)
         tmp_filename = "paper.pdf"
 
         self.initial_dir = os.getcwd()
         with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
             os.chdir(working_dir)
-            self.retrieve_pdf(src, tmp_filename)
-            check_file_is_pdf(tmp_filename)
+            self.retrieve_pdf(pdf_url, tmp_filename)
+            assert_file_is_pdf(tmp_filename)
 
             intermediate_fname = tmp_filename
-            for op in self.operations:
+            for opname, op in self.operations:
                 intermediate_fname = op(intermediate_fname)
             shutil.move(intermediate_fname, clean_filename)
 
diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py
new file mode 100644
index 0000000..04efcb1
--- /dev/null
+++ b/paper2remarkable/providers/_info.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+"""Functionality for retrieving paper info
+"""
+
+import logging
+import titlecase
+import unidecode
+import bs4
+
+from ..utils import clean_string, get_page_with_retry
+
+
+class Informer:
+    """Base class for the informers.
+
+    The "informer" class is used to retrieve the title, authors, and year of 
+    publication of the provided paper.
+
+    This base class provides the main functionality, but because various 
+    outlets use different conventions to embed author, title, and publication 
+    year information, we expect that individual providers will subclass this 
+    class and override some of the methods.
+    """
+
+    meta_author_key = "citation_author"
+    meta_title_key = "citation_title"
+    meta_date_key = "citation_date"
+
+    def __init__(self, title=None, authors=None, year=None):
+        self.title = title
+        self.authors = authors or []
+        self.year = year
+
+    def get_filename(self, abs_url):
+        """ Generate nice filename using the paper information
+
+        The provided url must be to an HTML page where this information can be
+        found, not to the PDF file itself.
+        """
+        logging.info("Generating output filename")
+
+        # Retrieve the paper information
+        self.get_info(abs_url)
+
+        # we assume that the list of authors is surname only.
+        if len(self.authors) > 3:
+            authors = self.authors[0] + "_et_al"
+        else:
+            authors = "_".join(self.authors)
+        authors = clean_string(authors)
+
+        # Clean the title and make it titlecase
+        title = clean_string(self.title)
+        title = titlecase.titlecase(title)
+        title = title.replace(" ", "_")
+
+        year = str(self.year)
+
+        name = authors + "_-_" + title + "_" + year + ".pdf"
+        name = unidecode.unidecode(name)
+        logging.info("Created filename: %s" % name)
+        return name
+
+    def get_info(self, url):
+        logging.info("Getting paper info")
+        page = get_page_with_retry(url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        self.authors = self.authors or self.get_authors(soup)
+        self.title = self.title or self.get_title(soup)
+        self.year = self.year or self.get_year(soup)
+
+    ## Title
+
+    def get_title(self, soup):
+        target = soup.find_all("meta", {"name": self.meta_title_key})
+        return target[0]["content"]
+
+    ## Authors
+
+    def get_authors(self, soup):
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": self.meta_author_key})
+        ]
+        return self._format_authors(authors)
+
+    def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
+        op = (lambda x: x) if op is None else op
+        # format the author list retrieved by bs4
+        return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
+
+    ## Year
+
+    def _format_year(self, soup_date):
+        return soup_date.split("/")[0]
+
+    def get_year(self, soup):
+        """ Retrieve the contents of the meta_date_key field and format it """
+        date = soup.find_all("meta", {"name": self.meta_date_key})[0][
+            "content"
+        ]
+        return self._format_year(date)
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
index be98e16..e14efa7 100644
--- a/paper2remarkable/providers/acm.py
+++ b/paper2remarkable/providers/acm.py
@@ -11,23 +11,38 @@ Copyright: 2019, G.J.J. van den Burg
 import bs4
 import re
 
-from . import Provider
-from ..utils import exception
+from ._base import Provider
+from ._info import Informer
+from .. import GITHUB_URL
+from ..utils import exception, get_page_with_retry
 
-# TODO: put this somewhere central, now multiply defined
-GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
-
-class ACM(Provider):
 
+class ACMInformer(Informer):
     meta_author_key = "citation_authors"
 
+    def _format_authors(self, soup_authors):
+        op = lambda x: x[0].split(";")
+        return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+
+    def _format_year(self, soup_date):
+        if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
+            self.warn(
+                "Couldn't extract year from ACM page, please raise an "
+                "issue on GitHub so it can be fixed: %s" % GITHUB_URL
+            )
+        return soup_date.strip().split("/")[-1]
+
+
+class ACM(Provider):
+
     re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = ACMInformer()
 
     def get_acm_pdf_url(self, url):
-        page = self.get_page_with_retry(url)
+        page = get_page_with_retry(url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         thea = None
         for a in soup.find_all("a"):
@@ -60,15 +75,3 @@ class ACM(Provider):
     def validate(src):
         m = re.fullmatch(ACM.re_abs, src)
         return not m is None
-
-    def _format_authors(self, soup_authors):
-        op = lambda x: x[0].split(";")
-        return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
-
-    def _format_date(self, soup_date):
-        if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
-            self.warn(
-                "Couldn't extract year from ACM page, please raise an "
-                "issue on GitHub so it can be fixed: %s" % GITHUB_URL
-            )
-        return soup_date.strip().split("/")[-1]
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index b1982f4..d950e47 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -11,11 +11,17 @@ Copyright: 2019, G.J.J. van den Burg
 import os
 import re
 import subprocess
+import logging
 
+from ._info import Informer
 from ._base import Provider
 from ..utils import exception
 
 
+class ArxivInformer(Informer):
+    pass
+
+
 class Arxiv(Provider):
 
     re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
@@ -23,6 +29,7 @@ class Arxiv(Provider):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = ArxivInformer()
 
         # register the dearxiv operation
         self.operations.insert(0, ("dearxiv", self.dearxiv))
@@ -45,7 +52,7 @@ class Arxiv(Provider):
 
     def dearxiv(self, input_file):
         """Remove the arXiv timestamp from a pdf"""
-        self.log("Removing arXiv timestamp")
+        logging.info("Removing arXiv timestamp")
         basename = os.path.splitext(input_file)[0]
         uncompress_file = basename + "_uncompress.pdf"
 
diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
index 68ce030..b1201d3 100644
--- a/paper2remarkable/providers/local.py
+++ b/paper2remarkable/providers/local.py
@@ -11,24 +11,28 @@ Copyright: 2019, G.J.J. van den Burg
 import os
 import shutil
 
-from . import Provider
+from ._base import Provider
+from ._info import Informer
+
+
+class LocalFileInformer(Informer):
+    def get_filenames(self, abs_url):
+        return os.path.basename(abs_url)
 
 
 class LocalFile(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = LocalFileInformer()
+
+    def get_abs_pdf_url(self, url):
+        # The 'url' is the path to the local file. We use this as abs_url and
+        # pdf_url.
+        return url, url
 
     def validate(src):
         return os.path.exists(src)
 
-    def retrieve_pdf(self, src, filename):
-        source = os.path.join(self.initial_dir, src)
+    def retrieve_pdf(self, pdf_url, filename):
+        source = os.path.join(self.initial_dir, pdf_url)
         shutil.copy(source, filename)
-
-    def get_paper_info(self, src):
-        return {"filename": src}
-
-    def create_filename(self, info, filename=None):
-        if not filename is None:
-            return filename
-        return os.path.basename(info["filename"])
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
index b7e1d77..bfb139d 100644
--- a/paper2remarkable/providers/openreview.py
+++ b/paper2remarkable/providers/openreview.py
@@ -10,19 +10,27 @@ Copyright: 2019, G.J.J. van den Burg
 
 import re
 
-from . import Provider
+from ._base import Provider
+from ._info import Informer
 from ..utils import exception
 
 
-class OpenReview(Provider):
+class OpenReviewInformer(Informer):
 
     meta_date_key = "citation_publication_date"
 
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class OpenReview(Provider):
+
     re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+"
     re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = OpenReviewInformer()
 
     def get_abs_pdf_urls(self, url):
         """ Get the pdf and abstract url from a OpenReview url """
@@ -41,6 +49,3 @@ class OpenReview(Provider):
         return re.match(OpenReview.re_abs, src) or re.match(
             OpenReview.re_pdf, src
         )
-
-    def _format_authors(self, soup_authors):
-        return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
index 56427d3..f28c742 100644
--- a/paper2remarkable/providers/pdf_url.py
+++ b/paper2remarkable/providers/pdf_url.py
@@ -10,13 +10,25 @@ Copyright: 2019, G.J.J. van den Burg
 
 import urllib
 
-from . import Provider
+from ._base import Provider
+from ._info import Informer
+
 from ..utils import exception
 
+class PdfUrlInformer(Informer):
+
+    def get_filename(self, abs_url):
+        # if this is called, no filename was provided by the caller
+        exception(
+                "Filename must be provided with PDFUrlProvider (use --filename)"
+            )
+
+
 
 class PdfUrl(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = PdfUrlInformer()
 
     def validate(src):
         try:
@@ -24,16 +36,3 @@ class PdfUrl(Provider):
             return all([result.scheme, result.netloc, result.path])
         except:
             return False
-
-    def retrieve_pdf(self, url, filename):
-        self.download_url(url, filename)
-
-    def get_paper_info(self, src):
-        return None
-
-    def create_filename(self, info, filename=None):
-        if filename is None:
-            exception(
-                "Filename must be provided with PDFUrlProvider (use --filename)"
-            )
-        return filename
diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py
index 29bdb31..ba4cca0 100644
--- a/paper2remarkable/providers/pubmed.py
+++ b/paper2remarkable/providers/pubmed.py
@@ -10,13 +10,27 @@ Copyright: 2019, G.J.J. van den Burg
 
 import re
 
-from . import Provider
+from ._base import Provider
+from ._info import Informer
 from ..utils import exception
 
-class Pubmed(Provider):
+
+class PubMedInformer(Informer):
 
     meta_author_key = "citation_authors"
 
+    def _format_authors(self, soup_authors):
+        op = lambda x: x[0].split(",")
+        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+    def _format_year(self, soup_date):
+        if re.match("\w+\ \d{4}", soup_date):
+            return soup_date.split(" ")[-1]
+        return soup_date.replace(" ", "_")
+
+
+class PubMed(Provider):
+
     re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?"
     re_pdf = (
         "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
@@ -24,6 +38,7 @@ class Pubmed(Provider):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = PubMedInformer()
 
     def get_abs_pdf_urls(self, url):
         """Get the pdf and html url from a given PMC url """
@@ -39,13 +54,4 @@ class Pubmed(Provider):
         return abs_url, pdf_url
 
     def validate(src):
-        return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src)
-
-    def _format_authors(self, soup_authors):
-        op = lambda x: x[0].split(",")
-        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
-
-    def _format_date(self, soup_date):
-        if re.match("\w+\ \d{4}", soup_date):
-            return soup_date.split(" ")[-1]
-        return soup_date.replace(" ", "_")
+        return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src)
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index ce16007..ce4acdd 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -11,19 +11,27 @@ Copyright: 2019, G.J.J. van den Burg
 import re
 import urllib
 
-from . import Provider
+from ._base import Provider
+from ._info import Informer
 from ..utils import exception
 
 
-class Springer(Provider):
+class SpringerInformer(Informer):
 
     meta_date_key = "citation_online_date"
 
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class Springer(Provider):
+
     re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
     re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.informer = SpringerInformer()
 
     def get_abs_pdf_urls(self, url):
         """ Get the pdf and abstract urls from a Springer url """
@@ -39,6 +47,3 @@ class Springer(Provider):
 
     def validate(src):
         return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
-
-    def _format_authors(self, soup_authors):
-        return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 15cac95..2bed231 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -17,7 +17,7 @@ import sys
 import time
 import unidecode
 
-GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
+from . import GITHUB_URL
 
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -47,8 +47,8 @@ def clean_string(s):
     return cleaned
 
 
-def check_file_is_pdf(filename):
-    """Check that a given file is a PDF file.
+def assert_file_is_pdf(filename):
+    """Assert that a given file is a PDF file.
 
     This is done by trying to open it using PyPDF2.
     """
-- 
cgit v1.2.3


From 552fdeff2832bfe6dc71ebdfdaf92387f5cb98b0 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 22:57:50 +0100
Subject: fix dependencies

---
 setup.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index e5a697e..3d1fbc5 100644
--- a/setup.py
+++ b/setup.py
@@ -19,15 +19,19 @@ VERSION = None
 
 # What packages are required for this module to be executed?
 REQUIRED = [
-        "bs4>=4.8.0",
-        "requests>=2.21",
-        "pdfplumber>=0.5.12",
-        "unidecode>=1.1"
+    "beautifulsoup4>=4.8",
+    "requests>=2.21",
+    "pdfplumber>=0.5",
+    "unidecode>=1.1",
+    "titlecase>=0.12",
+    "PyPDF2>=1.26"
+
 ]
 
 docs_require = []
 test_require = []
-dev_require = []
+dev_require = [
+        'green']
 
 # What packages are optional?
 EXTRAS = {
-- 
cgit v1.2.3


From db56f74e5430ac1f1a1b255db3dc3fe799bffbbb Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 22:58:08 +0100
Subject: minor makefile fixes

---
 Makefile | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index ed2d040..baccb92 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@
 
 PACKAGE=paper2remarkable
 DOC_DIR='./docs/'
-VENV_DIR='/tmp/p2r_venv/'
+VENV_DIR=/tmp/p2r_venv/
 
 .PHONY: help cover dist
 
@@ -26,7 +26,7 @@ install: ## Install for the current user using the default python command
 
 
 test: venv ## Run unit tests
-	source $(VENV_DIR)/bin/activate && green -v ./tests/test_unit
+	source $(VENV_DIR)/bin/activate && green -v ./tests
 
 
 clean: ## Clean build dist and egg directories left after install
@@ -52,11 +52,9 @@ doc: install ## Build documentation with Sphinx
 		touch source/AUTOGENERATED
 	$(MAKE) -C $(DOC_DIR) html
 
-
-
 venv: $(VENV_DIR)/bin/activate
 
 $(VENV_DIR)/bin/activate:
 	test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
-	source $(VENV_DIR)/bin/activate && pip install -q -e .[dev]
+	source $(VENV_DIR)/bin/activate && pip install -e .[dev]
 	touch $(VENV_DIR)/bin/activate
-- 
cgit v1.2.3


From 754ae016ae27a337bf230d162abf6ea1b423bd7d Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 22:58:31 +0100
Subject: Remove poetry stuff

---
 poetry.lock    | 183 ---------------------------------------------------------
 pyproject.toml |  19 ------
 2 files changed, 202 deletions(-)
 delete mode 100644 poetry.lock
 delete mode 100644 pyproject.toml

diff --git a/poetry.lock b/poetry.lock
deleted file mode 100644
index 272967c..0000000
--- a/poetry.lock
+++ /dev/null
@@ -1,183 +0,0 @@
-[[package]]
-category = "main"
-description = "Screen-scraping library"
-name = "beautifulsoup4"
-optional = false
-python-versions = "*"
-version = "4.7.1"
-
-[package.dependencies]
-soupsieve = ">=1.2"
-
-[[package]]
-category = "main"
-description = "Dummy package for Beautiful Soup"
-name = "bs4"
-optional = false
-python-versions = "*"
-version = "0.0.1"
-
-[package.dependencies]
-beautifulsoup4 = "*"
-
-[[package]]
-category = "main"
-description = "Python package for providing Mozilla's CA Bundle."
-name = "certifi"
-optional = false
-python-versions = "*"
-version = "2018.11.29"
-
-[[package]]
-category = "main"
-description = "Universal encoding detector for Python 2 and 3"
-name = "chardet"
-optional = false
-python-versions = "*"
-version = "3.0.4"
-
-[[package]]
-category = "main"
-description = "Internationalized Domain Names in Applications (IDNA)"
-name = "idna"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "2.8"
-
-[[package]]
-category = "main"
-description = "PDF parser and analyzer"
-name = "pdfminer.six"
-optional = false
-python-versions = "*"
-version = "20181108"
-
-[package.dependencies]
-pycryptodome = "*"
-six = "*"
-sortedcontainers = "*"
-
-[[package]]
-category = "main"
-description = "Plumb a PDF for detailed information about each char, rectangle, and line."
-name = "pdfplumber"
-optional = false
-python-versions = "*"
-version = "0.5.12"
-
-[package.dependencies]
-chardet = "*"
-"pdfminer.six" = "20181108"
-pillow = ">=3.0.0"
-pycryptodome = "*"
-unicodecsv = ">=0.14.1"
-wand = "*"
-
-[[package]]
-category = "main"
-description = "Python Imaging Library (Fork)"
-name = "pillow"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
-version = "6.0.0"
-
-[[package]]
-category = "main"
-description = "Cryptographic library for Python"
-name = "pycryptodome"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "3.8.2"
-
-[[package]]
-category = "main"
-description = "Python HTTP for Humans."
-name = "requests"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "2.21.0"
-
-[package.dependencies]
-certifi = ">=2017.4.17"
-chardet = ">=3.0.2,<3.1.0"
-idna = ">=2.5,<2.9"
-urllib3 = ">=1.21.1,<1.25"
-
-[[package]]
-category = "main"
-description = "Python 2 and 3 compatibility utilities"
-name = "six"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*"
-version = "1.12.0"
-
-[[package]]
-category = "main"
-description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
-name = "sortedcontainers"
-optional = false
-python-versions = "*"
-version = "2.1.0"
-
-[[package]]
-category = "main"
-description = "A CSS4 selector implementation for Beautiful Soup."
-name = "soupsieve"
-optional = false
-python-versions = "*"
-version = "1.7.3"
-
-[[package]]
-category = "main"
-description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*."
-name = "unicodecsv"
-optional = false
-python-versions = "*"
-version = "0.14.1"
-
-[[package]]
-category = "main"
-description = "ASCII transliterations of Unicode text"
-name = "unidecode"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "1.1.1"
-
-[[package]]
-category = "main"
-description = "HTTP library with thread-safe connection pooling, file post, and more."
-name = "urllib3"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
-version = "1.24.1"
-
-[[package]]
-category = "main"
-description = "Ctypes-based simple MagickWand API binding for Python"
-name = "wand"
-optional = false
-python-versions = "*"
-version = "0.5.4"
-
-[metadata]
-content-hash = "51a0dc0e8f6e6e23395cd5aca6a81e9b3aa121ec86f120f1304f2142eb2b65b0"
-python-versions = "^3.5"
-
-[metadata.hashes]
-beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"]
-bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"]
-certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"]
-chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"]
-idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"]
-"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"]
-pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"]
-pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", "10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", "258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", 
"9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", "c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", "d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"]
-pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", "a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", "c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"]
-requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"]
-six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"]
-sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"]
-soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"]
-unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"]
-unidecode = ["1d7a042116536098d05d599ef2b8616759f02985c85b4fef50c78a5aaf10822a", "2b6aab710c2a1647e928e36d69c21e76b453cd455f4e2621000e54b2a9b8cce8"]
-urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"]
-wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"]
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 7e9c629..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,19 +0,0 @@
-[tool.poetry]
-name = "arxiv2remarkable"
-version = "0.1.0"
-description = "Download an arXiv paper and send it to reMarkable"
-authors = ["Gertjan van den Burg <gertjanvandenburg@gmail.com>"]
-license = "MIT"
-
-[tool.poetry.dependencies]
-python = "^3.5"
-bs4 = "^0.0.1"
-requests = "^2.21"
-pdfplumber = "^0.5.12"
-unidecode = "^1.1"
-
-[tool.poetry.dev-dependencies]
-
-[build-system]
-requires = ["poetry>=0.12"]
-build-backend = "poetry.masonry.api"
-- 
cgit v1.2.3


From 317e79cc6aaa9572e4090dad653df8fd6eff9563 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 23:03:41 +0100
Subject: Remove old script as it's no longer needed

---
 arxiv2remarkable.py | 859 ----------------------------------------------------
 1 file changed, 859 deletions(-)
 delete mode 100755 arxiv2remarkable.py

diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
deleted file mode 100755
index 5694e1b..0000000
--- a/arxiv2remarkable.py
+++ /dev/null
@@ -1,859 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__version__ = "0.3.5"
-__author__ = "G.J.J. van den Burg"
-
-"""
-Download a paper from various sources and send it to the reMarkable.
-
-Author: G.J.J. van den Burg
-Date: 2019-02-02
-License: MIT
-
-"""
-
-import PyPDF2
-import abc
-import argparse
-import bs4
-import datetime
-import os
-import pdfplumber
-import re
-import requests
-import shutil
-import string
-import subprocess
-import sys
-import tempfile
-import time
-import titlecase
-import unidecode
-import urllib.parse
-
-GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
-
-HEADERS = {
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
-    "Safari/537.36"
-}
-
-RM_WIDTH = 1404
-RM_HEIGHT = 1872
-
-
-class Provider(metaclass=abc.ABCMeta):
-    """ ABC for providers of pdf sources """
-
-    meta_author_key = "citation_author"
-    meta_title_key = "citation_title"
-    meta_date_key = "citation_date"
-
-    def __init__(
-        self,
-        verbose=False,
-        upload=True,
-        debug=False,
-        center=False,
-        blank=False,
-        remarkable_dir="/",
-        rmapi_path="rmapi",
-        pdfcrop_path="pdfcrop",
-        pdftk_path="pdftk",
-        gs_path="gs",
-    ):
-        self.verbose = verbose
-        self.upload = upload
-        self.debug = debug
-        self.center = center
-        self.blank = blank
-        self.remarkable_dir = remarkable_dir
-        self.rmapi_path = rmapi_path
-        self.pdfcrop_path = pdfcrop_path
-        self.pdftk_path = pdftk_path
-        self.gs_path = gs_path
-
-        self.log("Starting %s" % type(self).__name__)
-
-    def log(self, msg, mode="info"):
-        if not self.verbose:
-            return
-        if not mode in ["info", "warning"]:
-            raise ValueError("unknown logging mode.")
-        now = datetime.datetime.now()
-        print(
-            now.strftime("%Y-%m-%d %H:%M:%S")
-            + " - "
-            + mode.upper()
-            + " - "
-            + msg
-        )
-
-    def warn(self, msg):
-        self.log(msg, mode="warning")
-
-    @staticmethod
-    @abc.abstractmethod
-    def validate(src):
-        """ Validate whether ``src`` is appropriate for this provider """
-
-    def retrieve_pdf(self, src, filename):
-        """ Download pdf from src and save to filename """
-        _, pdf_url = self.get_abs_pdf_urls(src)
-        self.download_url(pdf_url, filename)
-
-    def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
-        op = (lambda x: x) if op is None else op
-        # format the author list retrieved by bs4
-        return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
-
-    def get_authors(self, soup):
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": self.meta_author_key})
-        ]
-        return self._format_authors(authors)
-
-    def get_title(self, soup):
-        target = soup.find_all("meta", {"name": self.meta_title_key})
-        return target[0]["content"]
-
-    def _format_date(self, soup_date):
-        return soup_date
-
-    def get_date(self, soup):
-        date = soup.find_all("meta", {"name": self.meta_date_key})[0][
-            "content"
-        ]
-        return self._format_date(date)
-
-    def get_paper_info(
-        self,
-        src,
-        author_key="citation_author",
-        title_key="citation_title",
-        date_key="citation_date",
-    ):
-        """ Retrieve the title/author (surnames)/year information """
-        abs_url, _ = self.get_abs_pdf_urls(src)
-        self.log("Getting paper info")
-        page = self.get_page_with_retry(abs_url)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = self.get_authors(soup)
-        title = self.get_title(soup)
-        date = self.get_date(soup)
-        return dict(title=title, date=date, authors=authors)
-
-    def string_clean(self, s):
-        """ Clean a string to replace accented characters with equivalents and 
-        keep only the allowed characters """
-        normalized = unidecode.unidecode(s)
-        allowed = string.ascii_letters + string.digits + "_ ."
-        cleaned = "".join(c if c in allowed else "_" for c in normalized)
-        return cleaned
-
-    def create_filename(self, info, filename=None):
-        """ Generate filename using the info dict or filename if provided """
-        if not filename is None:
-            return filename
-        # we assume that the list of authors is surname only.
-        self.log("Generating output filename")
-
-        if len(info["authors"]) > 3:
-            author_part = info["authors"][0] + "_et_al"
-        else:
-            author_part = "_".join(info["authors"])
-        author_part = self.string_clean(author_part)
-
-        title_part = self.string_clean(info["title"])
-        title_part = titlecase.titlecase(title_part).replace(" ", "_")
-
-        year_part = info["date"].split("/")[0]
-
-        name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
-        name = unidecode.unidecode(name)
-        self.log("Created filename: %s" % name)
-        return name
-
-    def blank_pdf(self, filepath):
-        if not self.blank:
-            return filepath
-
-        self.log("Adding blank pages")
-        input_pdf = PyPDF2.PdfFileReader(filepath)
-        output_pdf = PyPDF2.PdfFileWriter()
-        for page in input_pdf.pages:
-            output_pdf.addPage(page)
-            output_pdf.addBlankPage()
-
-        output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
-        with open(output_file, "wb") as fp:
-            output_pdf.write(fp)
-        return output_file
-
-    def crop_pdf(self, filepath):
-        self.log("Cropping pdf file")
-        cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
-        cropper = Cropper(
-            filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
-        )
-        status = cropper.crop(margins=15)
-
-        if not status == 0:
-            self.warn("Failed to crop the pdf file at: %s" % filepath)
-            return filepath
-        if not os.path.exists(cropped_file):
-            self.warn(
-                "Can't find cropped file '%s' where expected." % cropped_file
-            )
-            return filepath
-        return cropped_file
-
-    def center_pdf(self, filepath):
-        if not self.center:
-            return filepath
-
-        self.log("Centering pdf file")
-        centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
-        cropper = Cropper(
-            filepath, centered_file, pdfcrop_path=self.pdfcrop_path
-        )
-        status = cropper.center()
-        if not status == 0:
-            self.warn("Failed to center the pdf file at: %s" % filepath)
-            return filepath
-        if not os.path.exists(centered_file):
-            self.warn(
-                "Can't find centered file '%s' where expected." % centered_file
-            )
-            return filepath
-        return centered_file
-
-    def shrink_pdf(self, filepath):
-        self.log("Shrinking pdf file")
-        output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
-        status = subprocess.call(
-            [
-                self.gs_path,
-                "-sDEVICE=pdfwrite",
-                "-dCompatibilityLevel=1.4",
-                "-dPDFSETTINGS=/printer",
-                "-dNOPAUSE",
-                "-dBATCH",
-                "-dQUIET",
-                "-sOutputFile=%s" % output_file,
-                filepath,
-            ]
-        )
-        if not status == 0:
-            self.warn("Failed to shrink the pdf file")
-            return filepath
-        return output_file
-
-    def check_file_is_pdf(self, filename):
-        try:
-            fp = open(filename, "rb")
-            pdf = PyPDF2.PdfFileReader(fp, strict=False)
-            fp.close()
-            del pdf
-            return True
-        except PyPDF2.utils.PdfReadError:
-            exception("Downloaded file isn't a valid pdf file.")
-
-    def download_url(self, url, filename):
-        """Download the content of an url and save it to a filename """
-        self.log("Downloading file at url: %s" % url)
-        content = self.get_page_with_retry(url)
-        with open(filename, "wb") as fid:
-            fid.write(content)
-
-    def get_page_with_retry(self, url, tries=5):
-        count = 0
-        while count < tries:
-            count += 1
-            error = False
-            try:
-                res = requests.get(url, headers=HEADERS)
-            except requests.exceptions.ConnectionError:
-                error = True
-            if error or not res.ok:
-                self.warn("Error getting url %s. Retrying in 5 seconds" % url)
-                time.sleep(5)
-                continue
-            self.log("Downloading url: %s" % url)
-            return res.content
-
-    def upload_to_rm(self, filepath):
-        remarkable_dir = self.remarkable_dir.rstrip("/")
-        self.log("Starting upload to reMarkable")
-        if remarkable_dir:
-            status = subprocess.call(
-                [self.rmapi_path, "mkdir", remarkable_dir + "/"],
-                stdout=subprocess.DEVNULL,
-            )
-            if not status == 0:
-                exception(
-                    "Creating directory %s on reMarkable failed"
-                    % remarkable_dir
-                )
-        status = subprocess.call(
-            [self.rmapi_path, "put", filepath, remarkable_dir + "/"],
-            stdout=subprocess.DEVNULL,
-        )
-        if not status == 0:
-            exception("Uploading file %s to reMarkable failed" % filepath)
-        self.log("Upload successful.")
-
-    def dearxiv(self, input_file):
-        """Remove the arXiv timestamp from a pdf"""
-        self.log("Removing arXiv timestamp")
-        basename = os.path.splitext(input_file)[0]
-        uncompress_file = basename + "_uncompress.pdf"
-
-        status = subprocess.call(
-            [
-                self.pdftk_path,
-                input_file,
-                "output",
-                uncompress_file,
-                "uncompress",
-            ]
-        )
-        if not status == 0:
-            exception("pdftk failed to uncompress the pdf.")
-
-        with open(uncompress_file, "rb") as fid:
-            data = fid.read()
-            # Remove the text element
-            data = re.sub(
-                b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
-                b"()Tj",
-                data,
-            )
-            # Remove the URL element
-            data = re.sub(
-                b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
-                b"",
-                data,
-            )
-
-        removed_file = basename + "_removed.pdf"
-        with open(removed_file, "wb") as oid:
-            oid.write(data)
-
-        output_file = basename + "_dearxiv.pdf"
-        status = subprocess.call(
-            [self.pdftk_path, removed_file, "output", output_file, "compress"]
-        )
-        if not status == 0:
-            exception("pdftk failed to compress the pdf.")
-
-        return output_file
-
-    def run(self, src, filename=None):
-        info = self.get_paper_info(src)
-        clean_filename = self.create_filename(info, filename)
-        tmp_filename = "paper.pdf"
-
-        self.initial_dir = os.getcwd()
-        with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
-            os.chdir(working_dir)
-            self.retrieve_pdf(src, tmp_filename)
-            self.check_file_is_pdf(tmp_filename)
-
-            ops = [
-                self.dearxiv,
-                self.crop_pdf,
-                self.center_pdf,
-                self.blank_pdf,
-                self.shrink_pdf,
-            ]
-            intermediate_fname = tmp_filename
-            for op in ops:
-                intermediate_fname = op(intermediate_fname)
-            shutil.move(intermediate_fname, clean_filename)
-
-            if self.debug:
-                print("Paused in debug mode in dir: %s" % working_dir)
-                print("Press enter to exit.")
-                return input()
-
-            if self.upload:
-                return self.upload_to_rm(clean_filename)
-
-            target_path = os.path.join(self.initial_dir, clean_filename)
-            while os.path.exists(target_path):
-                base = os.path.splitext(target_path)[0]
-                target_path = base + "_.pdf"
-            shutil.move(clean_filename, target_path)
-            return target_path
-
-
-class Arxiv(Provider):
-
-    re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
-    re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def get_abs_pdf_urls(self, url):
-        """Get the pdf and abs url from any given arXiv url """
-        if re.match(self.re_abs, url):
-            abs_url = url
-            pdf_url = url.replace("abs", "pdf") + ".pdf"
-        elif re.match(self.re_pdf, url):
-            abs_url = url[:-4].replace("pdf", "abs")
-            pdf_url = url
-        else:
-            exception("Couldn't figure out arXiv urls.")
-        return abs_url, pdf_url
-
-    def validate(src):
-        """Check if the url is to an arXiv page. """
-        return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src)
-
-
-class Pubmed(Provider):
-
-    meta_author_key = "citation_authors"
-
-    re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?"
-    re_pdf = (
-        "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
-    )
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def get_abs_pdf_urls(self, url):
-        """Get the pdf and html url from a given PMC url """
-        if re.match(self.re_pdf, url):
-            idx = url.index("pdf")
-            abs_url = url[: idx - 1]
-            pdf_url = url
-        elif re.match(self.re_abs, url):
-            abs_url = url
-            pdf_url = url.rstrip("/") + "/pdf"  # it redirects, usually
-        else:
-            exception("Couldn't figure out PMC urls.")
-        return abs_url, pdf_url
-
-    def validate(src):
-        return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src)
-
-    def _format_authors(self, soup_authors):
-        op = lambda x: x[0].split(",")
-        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
-
-    def _format_date(self, soup_date):
-        if re.match("\w+\ \d{4}", soup_date):
-            return soup_date.split(" ")[-1]
-        return soup_date.replace(" ", "_")
-
-
-class ACM(Provider):
-
-    meta_author_key = "citation_authors"
-
-    re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+"
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def get_acm_pdf_url(self, url):
-        page = self.get_page_with_retry(url)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        thea = None
-        for a in soup.find_all("a"):
-            if a.get("name") == "FullTextPDF":
-                thea = a
-                break
-        if thea is None:
-            return None
-        href = thea.get("href")
-        if href.startswith("http"):
-            return href
-        else:
-            return "https://dl.acm.org/" + href
-
-    def get_abs_pdf_urls(self, url):
-        if re.match(self.re_abs, url):
-            abs_url = url
-            pdf_url = self.get_acm_pdf_url(url)
-            if pdf_url is None:
-                exception(
-                    "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
-                )
-        else:
-            exception(
-                "Couldn't figure out ACM urls, please provide a URL of the "
-                "format: http(s)://dl.acm.org/citation.cfm?id=..."
-            )
-        return abs_url, pdf_url
-
-    def validate(src):
-        m = re.fullmatch(ACM.re_abs, src)
-        return not m is None
-
-    def _format_authors(self, soup_authors):
-        op = lambda x: x[0].split(";")
-        return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
-
-    def _format_date(self, soup_date):
-        if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
-            self.warn(
-                "Couldn't extract year from ACM page, please raise an "
-                "issue on GitHub so it can be fixed: %s" % GITHUB_URL
-            )
-        return soup_date.strip().split("/")[-1]
-
-
-class OpenReview(Provider):
-
-    meta_date_key = "citation_publication_date"
-
-    re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+"
-    re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+"
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def get_abs_pdf_urls(self, url):
-        """ Get the pdf and abstract url from a OpenReview url """
-        if re.match(self.re_abs, url):
-            abs_url = url
-            pdf_url = url.replace("forum", "pdf")
-        elif re.match(self.re_pdf, url):
-            abs_url = url.replace("pdf", "forum")
-            pdf_url = url
-        else:
-            exception("Couldn't figure out OpenReview urls.")
-        return abs_url, pdf_url
-
-    def validate(src):
-        """ Check if the url is a valid OpenReview url. """
-        return re.match(OpenReview.re_abs, src) or re.match(
-            OpenReview.re_pdf, src
-        )
-
-    def _format_authors(self, soup_authors):
-        return super()._format_authors(soup_authors, sep=" ", idx=-1)
-
-
-class Springer(Provider):
-
-    meta_date_key = "citation_online_date"
-
-    re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
-    re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def get_abs_pdf_urls(self, url):
-        """ Get the pdf and abstract urls from a Springer url """
-        if re.match(self.re_abs, url):
-            abs_url = url
-            pdf_url = url.replace("article", "content/pdf")
-        elif re.match(self.re_pdf, url):
-            abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
-            pdf_url = urllib.parse.unquote(url)
-        else:
-            exception("Couldn't figure out Springer urls.")
-        return abs_url, pdf_url
-
-    def validate(src):
-        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
-
-    def _format_authors(self, soup_authors):
-        return super()._format_authors(soup_authors, sep=" ", idx=-1)
-
-
-class LocalFile(Provider):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def validate(src):
-        return os.path.exists(src)
-
-    def retrieve_pdf(self, src, filename):
-        source = os.path.join(self.initial_dir, src)
-        shutil.copy(source, filename)
-
-    def get_paper_info(self, src):
-        return {"filename": src}
-
-    def create_filename(self, info, filename=None):
-        if not filename is None:
-            return filename
-        return os.path.basename(info["filename"])
-
-
-class PdfUrl(Provider):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def validate(src):
-        try:
-            result = urllib.parse.urlparse(src)
-            return all([result.scheme, result.netloc, result.path])
-        except:
-            return False
-
-    def retrieve_pdf(self, url, filename):
-        self.download_url(url, filename)
-
-    def get_paper_info(self, src):
-        return None
-
-    def create_filename(self, info, filename=None):
-        if filename is None:
-            exception(
-                "Filename must be provided with PDFUrlProvider (use --filename)"
-            )
-        return filename
-
-
-class Cropper(object):
-    def __init__(
-        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
-    ):
-        if not input_file is None:
-            self.input_file = os.path.abspath(input_file)
-            self.reader = PyPDF2.PdfFileReader(self.input_file)
-        if not output_file is None:
-            self.output_file = os.path.abspath(output_file)
-        self.pdfcrop_path = pdfcrop_path
-
-        self.writer = PyPDF2.PdfFileWriter()
-
-    def crop(self, margins=1):
-        return self.process_file(self.crop_page, margins=margins)
-
-    def center(self, padding=15):
-        return self.process_file(self.center_page, padding=padding)
-
-    def process_file(self, page_func, *args, **kwargs):
-        for page_idx in range(self.reader.getNumPages()):
-            status = page_func(page_idx, *args, **kwargs)
-            if not status == 0:
-                return status
-        with open(self.output_file, "wb") as fp:
-            self.writer.write(fp)
-        return 0
-
-    def center_page(self, page_idx, padding):
-        return self.process_page(
-            page_idx, self.get_center_bbox, padding=padding
-        )
-
-    def crop_page(self, page_idx, margins):
-        return self.process_page(page_idx, self.get_bbox, margins=margins)
-
-    def export_page(self, page_idx):
-        """Helper function that exports a single page given by index """
-        page = self.reader.getPage(page_idx)
-        writer = PyPDF2.PdfFileWriter()
-        writer.addPage(page)
-        tmpfname = "./page.pdf"
-        with open(tmpfname, "wb") as fp:
-            writer.write(fp)
-        return tmpfname
-
-    def process_page(self, page_idx, bbox_func, *args, **kwargs):
-        """Process a single page and add it to the writer """
-        tmpfname = self.export_page(page_idx)
-        tmpfout = "./output.pdf"
-        bbox = bbox_func(tmpfname, *args, **kwargs)
-        status = subprocess.call(
-            [
-                self.pdfcrop_path,
-                "--bbox",
-                " ".join(map(str, bbox)),
-                tmpfname,
-                tmpfout,
-            ],
-            stdout=subprocess.DEVNULL,
-        )
-        if not status == 0:
-            return status
-        reader = PyPDF2.PdfFileReader(tmpfout)
-        page = reader.getPage(0)
-        self.writer.addPage(page)
-        os.unlink(tmpfname)
-        os.unlink(tmpfout)
-        return 0
-
-    def get_bbox(self, filename, margins=1, resolution=72):
-        """Get the bounding box, with optional margins
-
-        if margins is integer, used for all margins, else
-        margins = [left, top, right, bottom]
-
-        We get the bounding box by finding the smallest rectangle that is 
-        completely surrounded by white pixels.
-        """
-        if isinstance(margins, int):
-            margins = [margins for _ in range(4)]
-        pdf = pdfplumber.open(filename)
-        im = pdf.pages[0].to_image(resolution=resolution)
-        pdf.close()
-
-        pixels = list(im.original.getdata())
-        W, H = im.original.size
-
-        # M is a list of H lists with each W integers that equal the sum of the
-        # pixel values
-        M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
-
-        left, top, bottom, right = 0, 0, 0, 0
-        while top < H and sum(M[top]) == W * 255 * 3:
-            top += 1
-        while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3:
-            bottom += 1
-
-        # Transpose M
-        M = list(zip(*M))
-        while left < W and sum(M[left]) == H * 255 * 3:
-            left += 1
-        while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
-            right += 1
-
-        left -= margins[0]
-        top -= margins[1]
-        right -= margins[2]
-        bottom -= margins[3]
-
-        # This is the bounding box in PIL format: (0, 0) top left
-        x0, y0, x1, y1 = left, top, W - right, H - bottom
-
-        # Get the bbox in Ghostscript format: (0, 0) bottom left
-        a0, b0, a1, b1 = x0, H - y1, x1, H - y0
-        return [a0, b0, a1, b1]
-
-    def get_center_bbox(self, filename, padding=15):
-        """Compute a bounding box that will center the page file on the 
-        reMarkable
-        """
-        bbox = self.get_bbox(filename, margins=0)
-
-        h = bbox[3] - bbox[1]
-        w = bbox[2] - bbox[0]
-
-        # we want some minimal padding all around, because it is visually more
-        # pleasing.
-        h_prime = h + 2 * padding
-        w_prime = w + 2 * padding
-
-        # if the document is wider than the remarkable, we add top-padding to
-        # center it, otherwise we add left-padding
-        x, y = 0, 0
-        if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
-            y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
-        else:
-            x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
-
-        margins = [padding + x, padding + y, padding, padding]
-        return self.get_bbox(filename, margins=margins)
-
-
-def exception(msg):
-    print("ERROR: " + msg, file=sys.stderr)
-    print("Error occurred. Exiting.", file=sys.stderr)
-    print("", file=sys.stderr)
-    print(
-        "If you think this might be a bug, please raise an issue on GitHub: %s"
-        % GITHUB_URL
-    )
-    raise SystemExit(1)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        "-b",
-        "--blank",
-        help="Add a blank page after every page of the PDF",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-v", "--verbose", help="be verbose", action="store_true"
-    )
-    parser.add_argument(
-        "-n",
-        "--no-upload",
-        help="don't upload to the reMarkable, save the output in current working dir",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-d",
-        "--debug",
-        help="debug mode, doesn't upload to reMarkable",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-c",
-        "--center",
-        help="Center the PDF on the page, instead of left align",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--filename",
-        help="Filename to use for the file on reMarkable",
-        default=None,
-    )
-    parser.add_argument(
-        "-p",
-        "--remarkable-path",
-        help="directory on reMarkable to put the file (created if missing)",
-        dest="remarkable_dir",
-        default="/",
-    )
-    parser.add_argument(
-        "--rmapi", help="path to rmapi executable", default="rmapi"
-    )
-    parser.add_argument(
-        "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
-    )
-    parser.add_argument(
-        "--pdftk", help="path to pdftk executable", default="pdftk"
-    )
-    parser.add_argument("--gs", help="path to gs executable", default="gs")
-    parser.add_argument(
-        "input", help="URL to a paper or the path of a local PDF file"
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-
-    providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
-
-    provider = next((p for p in providers if p.validate(args.input)), None)
-    if provider is None:
-        exception("Input not valid, no provider can handle this source.")
-
-    prov = provider(
-        verbose=args.verbose,
-        upload=not args.no_upload,
-        debug=args.debug,
-        center=args.center,
-        blank=args.blank,
-        remarkable_dir=args.remarkable_dir,
-        rmapi_path=args.rmapi,
-        pdfcrop_path=args.pdfcrop,
-        pdftk_path=args.pdftk,
-        gs_path=args.gs,
-    )
-
-    prov.run(args.input, filename=args.filename)
-
-
-if __name__ == "__main__":
-    main()
-- 
cgit v1.2.3


From b0b3b177dd2ee5555fb5a6a68c529d5673df83bb Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Thu, 24 Oct 2019 23:39:03 +0100
Subject: Switch to a simple logger singleton

The logging module gave problems because one
of the pdf packages is based on a package
that extensively used the info level of the
logging module, and this seemed like the
easiest solution.
---
 paper2remarkable/log.py               | 56 +++++++++++++++++++++++++++++++++++
 paper2remarkable/pdf_ops.py           | 21 ++++++-------
 paper2remarkable/providers/_base.py   |  9 +++---
 paper2remarkable/providers/_info.py   | 10 ++++---
 paper2remarkable/providers/acm.py     |  5 +++-
 paper2remarkable/providers/arxiv.py   |  6 ++--
 paper2remarkable/providers/pdf_url.py |  1 -
 paper2remarkable/utils.py             | 15 ++++++----
 8 files changed, 95 insertions(+), 28 deletions(-)
 create mode 100644 paper2remarkable/log.py

diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py
new file mode 100644
index 0000000..bae1cbf
--- /dev/null
+++ b/paper2remarkable/log.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+"""Just a simple logger
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+# NOTE: I know about the logging module, but this was easier because one of the
+# dependencies was using that and it became complicated. This one is obviously
+# not thread-safe and is very simple.
+
+import datetime
+import sys
+
+
+class Singleton(type):
+    # https://stackoverflow.com/q/6760685
+    _instances = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(Singleton, cls).__call__(
+                *args, **kwargs
+            )
+        return cls._instances[cls]
+
+
+class Logger(metaclass=Singleton):
+    def __init__(self):
+        self.enabled = True
+
+    def enable(self):
+        self.enabled = True
+
+    def disable(self):
+        self.enabled = False
+
+    def _log(self, msg, mode):
+        if not self.enabled:
+            return
+        if not mode in ("info", "warn"):
+            raise ValueError("Unknown logging mode: %s" % mode)
+        file = sys.stdout if mode == "info" else sys.stderr
+        now = datetime.datetime.now()
+        nowstr = now.strftime("%Y-%m-%d %H:%M:%S")
+        print("%s - %s - %s" % (nowstr, mode.upper(), msg), file=file)
+        file.flush()
+
+    def info(self, msg):
+        self._log(msg, "info")
+
+    def warning(self, msg):
+        self._log(msg, "warn")
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index d1eae40..8636017 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -10,27 +10,28 @@ Copyright: 2019, The Alan Turing Institute
 
 
 import PyPDF2
-import logging
 import os
 import subprocess
 
 from .crop import Cropper
+from .log import Logger
 
+logger = Logger()
 
 def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
     """Crop the pdf file using Cropper
     """
-    logging.info("Cropping pdf file")
+    logger.info("Cropping pdf file")
     cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
 
     cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path)
     status = cropper.crop(margins=15)
 
     if not status == 0:
-        logging.warning("Failed to crop the pdf file at: %s" % filepath)
+        logger.warning("Failed to crop the pdf file at: %s" % filepath)
         return filepath
     if not os.path.exists(cropped_file):
-        logging.warning(
+        logger.warning(
             "Can't find cropped file '%s' where expected." % cropped_file
         )
         return filepath
@@ -40,17 +41,17 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
 def center_pdf(filepath, pdfcrop_path="pdfcrop"):
     """Center the pdf file on the reMarkable
     """
-    logging.info("Centering pdf file")
+    logger.info("Centering pdf file")
     centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
 
     cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path)
     status = cropper.center()
 
     if not status == 0:
-        logging.warning("Failed to center the pdf file at: %s" % filepath)
+        logger.warning("Failed to center the pdf file at: %s" % filepath)
         return filepath
     if not os.path.exists(centered_file):
-        logging.warning(
+        logger.warning(
             "Can't find centered file '%s' where expected." % centered_file
         )
         return filepath
@@ -60,7 +61,7 @@ def center_pdf(filepath, pdfcrop_path="pdfcrop"):
 def blank_pdf(filepath):
     """Add blank pages to PDF
     """
-    logging.info("Adding blank pages")
+    logger.info("Adding blank pages")
     input_pdf = PyPDF2.PdfFileReader(filepath)
     output_pdf = PyPDF2.PdfFileWriter()
     for page in input_pdf.pages:
@@ -76,7 +77,7 @@ def blank_pdf(filepath):
 def shrink_pdf(filepath, gs_path="gs"):
     """Shrink the PDF file size using Ghostscript
     """
-    logging.info("Shrinking pdf file")
+    logger.info("Shrinking pdf file")
     output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
     status = subprocess.call(
         [
@@ -92,6 +93,6 @@ def shrink_pdf(filepath, gs_path="gs"):
         ]
     )
     if not status == 0:
-        logging.warning("Failed to shrink the pdf file")
+        logger.warning("Failed to shrink the pdf file")
         return filepath
     return output_file
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index ca6ab70..5432d48 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -9,7 +9,6 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 import abc
-import logging
 import os
 import shutil
 import tempfile
@@ -17,6 +16,9 @@ import tempfile
 from ._info import Informer
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
 from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable
+from ..log import Logger
+
+logger = Logger()
 
 
 class Provider(metaclass=abc.ABCMeta):
@@ -45,9 +47,8 @@ class Provider(metaclass=abc.ABCMeta):
         self.informer = Informer()
 
         # disable logging if requested
-        logging.basicConfig(level=logging.INFO)
         if not verbose:
-            logging.disable()
+            logger.disable()
 
         # Define the operations to run on the pdf. Providers can add others.
         self.operations = [("crop", self.crop_pdf)]
@@ -58,7 +59,7 @@ class Provider(metaclass=abc.ABCMeta):
             self.operations.append(("blank", blank_pdf))
         self.operations.append(("shrink", self.shrink_pdf))
 
-        logging.info("Starting %s" % type(self).__name__)
+        logger.info("Starting %s" % type(self).__name__)
 
     @staticmethod
     @abc.abstractmethod
diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py
index 04efcb1..9130e34 100644
--- a/paper2remarkable/providers/_info.py
+++ b/paper2remarkable/providers/_info.py
@@ -3,12 +3,14 @@
 """Functionality for retrieving paper info
 """
 
-import logging
 import titlecase
 import unidecode
 import bs4
 
 from ..utils import clean_string, get_page_with_retry
+from ..log import Logger
+
+logger = Logger()
 
 
 class Informer:
@@ -38,7 +40,7 @@ class Informer:
         The provided url must be to a HTMl page where this information can be 
         found, not to the PDF file itself.
         """
-        logging.info("Generating output filename")
+        logger.info("Generating output filename")
 
         # Retrieve the paper information
         self.get_info(abs_url)
@@ -59,11 +61,11 @@ class Informer:
 
         name = authors + "_-_" + title + "_" + year + ".pdf"
         name = unidecode.unidecode(name)
-        logging.info("Created filename: %s" % name)
+        logger.info("Created filename: %s" % name)
         return name
 
     def get_info(self, url):
-        logging.info("Getting paper info")
+        logger.info("Getting paper info")
         page = get_page_with_retry(url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         self.authors = self.authors or self.get_authors(soup)
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
index e14efa7..a0d79bd 100644
--- a/paper2remarkable/providers/acm.py
+++ b/paper2remarkable/providers/acm.py
@@ -15,6 +15,9 @@ from ._base import Provider
 from ._info import Informer
 from .. import GITHUB_URL
 from ..utils import exception, get_page_with_retry
+from ..log import Logger
+
+logger = Logger()
 
 
 class ACMInformer(Informer):
@@ -26,7 +29,7 @@ class ACMInformer(Informer):
 
     def _format_year(self, soup_date):
         if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
-            self.warn(
+            logger.warning(
                 "Couldn't extract year from ACM page, please raise an "
                 "issue on GitHub so it can be fixed: %s" % GITHUB_URL
             )
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index d950e47..e022658 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -11,11 +11,13 @@ Copyright: 2019, G.J.J. van den Burg
 import os
 import re
 import subprocess
-import logging
 
 from ._info import Informer
 from ._base import Provider
 from ..utils import exception
+from ..log import Logger
+
+logger = Logger()
 
 
 class ArxivInformer(Informer):
@@ -52,7 +54,7 @@ class Arxiv(Provider):
 
     def dearxiv(self, input_file):
         """Remove the arXiv timestamp from a pdf"""
-        logging.info("Removing arXiv timestamp")
+        logger.info("Removing arXiv timestamp")
         basename = os.path.splitext(input_file)[0]
         uncompress_file = basename + "_uncompress.pdf"
 
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
index f28c742..dfc8646 100644
--- a/paper2remarkable/providers/pdf_url.py
+++ b/paper2remarkable/providers/pdf_url.py
@@ -12,7 +12,6 @@ import urllib
 
 from ._base import Provider
 from ._info import Informer
-
 from ..utils import exception
 
 class PdfUrlInformer(Informer):
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 2bed231..d80c954 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -9,7 +9,6 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 import PyPDF2
-import logging
 import requests
 import string
 import subprocess
@@ -18,6 +17,7 @@ import time
 import unidecode
 
 from . import GITHUB_URL
+from .log import Logger
 
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -26,6 +26,8 @@ HEADERS = {
 }
 
 
+logger = Logger()
+
 def exception(msg):
     print("ERROR: " + msg, file=sys.stderr)
     print("Error occurred. Exiting.", file=sys.stderr)
@@ -37,6 +39,7 @@ def exception(msg):
     raise SystemExit(1)
 
 
+
 def clean_string(s):
     """ Clean a string by replacing accented characters with equivalents and 
     keeping only the allowed characters (ascii letters, digits, underscore, 
@@ -64,7 +67,7 @@ def assert_file_is_pdf(filename):
 
 def download_url(url, filename):
     """Download the content of an url and save it to a filename """
-    logging.info("Downloading file at url: %s" % url)
+    logger.info("Downloading file at url: %s" % url)
     content = get_page_with_retry(url)
     with open(filename, "wb") as fid:
         fid.write(content)
@@ -80,18 +83,18 @@ def get_page_with_retry(url, tries=5):
         except requests.exceptions.ConnectionError:
             error = True
         if error or not res.ok:
-            logging.warning(
+            logger.warning(
                 "(%i/%i) Error getting url %s. Retrying in 5 seconds."
                 % (count, tries, url)
             )
             time.sleep(5)
             continue
-        logging.info("Downloading url: %s" % url)
+        logger.info("Downloading url: %s" % url)
         return res.content
 
 
 def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
-    logging.info("Starting upload to reMarkable")
+    logger.info("Starting upload to reMarkable")
 
     # Create the reMarkable dir if it doesn't exist
     remarkable_dir = remarkable_dir.rstrip("/")
@@ -112,4 +115,4 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
     )
     if not status == 0:
         exception("Uploading file %s to reMarkable failed" % filepath)
-    logging.info("Upload successful.")
+    logger.info("Upload successful.")
-- 
cgit v1.2.3


From 89f3fb37ab5aad7284ca1da29aa610ae196b6fcf Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 00:03:17 +0100
Subject: Improve string cleaning

---
 paper2remarkable/providers/_info.py | 1 +
 paper2remarkable/utils.py           | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py
index 9130e34..0b28658 100644
--- a/paper2remarkable/providers/_info.py
+++ b/paper2remarkable/providers/_info.py
@@ -56,6 +56,7 @@ class Informer:
         title = clean_string(self.title)
         title = titlecase.titlecase(title)
         title = title.replace(" ", "_")
+        title = clean_string(title)
 
         year = str(self.year)
 
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index d80c954..a313ffe 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -28,6 +28,7 @@ HEADERS = {
 
 logger = Logger()
 
+
 def exception(msg):
     print("ERROR: " + msg, file=sys.stderr)
     print("Error occurred. Exiting.", file=sys.stderr)
@@ -39,14 +40,15 @@ def exception(msg):
     raise SystemExit(1)
 
 
-
 def clean_string(s):
     """ Clean a string by replacing accented characters with equivalents and 
     keeping only the allowed characters (ascii letters, digits, underscore, 
-    space, and period)"""
+    space, dash, and period)"""
     normalized = unidecode.unidecode(s)
-    allowed = string.ascii_letters + string.digits + "_ ."
+    allowed = string.ascii_letters + string.digits + "_ .-"
     cleaned = "".join(c if c in allowed else "_" for c in normalized)
+    while "__" in cleaned:
+        cleaned = cleaned.replace("__", "_")
     return cleaned
 
 
-- 
cgit v1.2.3


From f8ccc47b17a19655860fa16149420eb422d71c26 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 00:03:39 +0100
Subject: bugfixes

---
 paper2remarkable/providers/local.py   |  2 +-
 paper2remarkable/providers/pdf_url.py | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
index b1201d3..68b88ea 100644
--- a/paper2remarkable/providers/local.py
+++ b/paper2remarkable/providers/local.py
@@ -25,7 +25,7 @@ class LocalFile(Provider):
         super().__init__(*args, **kwargs)
         self.informer = LocalFileInformer()
 
-    def get_abs_pdf_url(self, url):
+    def get_abs_pdf_urls(self, url):
         # The 'url' is the path to the local file. We use this as abs_url and
         # pdf_url.
         return url, url
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
index dfc8646..d80b1a9 100644
--- a/paper2remarkable/providers/pdf_url.py
+++ b/paper2remarkable/providers/pdf_url.py
@@ -14,14 +14,13 @@ from ._base import Provider
 from ._info import Informer
 from ..utils import exception
 
-class PdfUrlInformer(Informer):
 
+class PdfUrlInformer(Informer):
     def get_filename(self, abs_url):
         # if this is called, filename must not be provided
         exception(
-                "Filename must be provided with PDFUrlProvider (use --filename)"
-            )
-
+            "Filename must be provided with PDFUrlProvider (use --filename)"
+        )
 
 
 class PdfUrl(Provider):
@@ -29,6 +28,9 @@ class PdfUrl(Provider):
         super().__init__(*args, **kwargs)
         self.informer = PdfUrlInformer()
 
+    def get_abs_pdf_urls(self, url):
+        return (None, url)
+
     def validate(src):
         try:
             result = urllib.parse.urlparse(src)
-- 
cgit v1.2.3


From 395ab716bb5c8ed74a4f0b447ec8243f64515ea8 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 00:03:48 +0100
Subject: makefile changes

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index baccb92..2a656d4 100644
--- a/Makefile
+++ b/Makefile
@@ -26,7 +26,7 @@ install: ## Install for the current user using the default python command
 
 
 test: venv ## Run unit tests
-	source $(VENV_DIR)/bin/activate && green -v ./tests
+	source $(VENV_DIR)/bin/activate && green -f -vv -a ./tests
 
 
 clean: ## Clean build dist and egg directories left after install
-- 
cgit v1.2.3


From a405d661552b2e574725fcfb9e75f54d3f3d86ca Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 09:42:43 +0100
Subject: fix typo

---
 paper2remarkable/providers/local.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
index 68b88ea..3f581b2 100644
--- a/paper2remarkable/providers/local.py
+++ b/paper2remarkable/providers/local.py
@@ -16,7 +16,7 @@ from ._info import Informer
 
 
 class LocalFileInformer(Informer):
-    def get_filenames(self, abs_url):
+    def get_filename(self, abs_url):
         return os.path.basename(abs_url)
 
 
-- 
cgit v1.2.3


From a02c4c27d81df8aa012f923d2b150db37e064c80 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 09:42:55 +0100
Subject: update tempdir prefix

---
 paper2remarkable/providers/_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 5432d48..bdc9558 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -87,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta):
         tmp_filename = "paper.pdf"
 
         self.initial_dir = os.getcwd()
-        with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
+        with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir:
             os.chdir(working_dir)
             self.retrieve_pdf(pdf_url, tmp_filename)
             assert_file_is_pdf(tmp_filename)
-- 
cgit v1.2.3


From a37dd132ba815e8c10cf3e2f4e8a928dae96ae2d Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 09:43:50 +0100
Subject: Update gitignore

---
 .gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.gitignore b/.gitignore
index c18dd8d..558dbc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
 __pycache__/
+paper2remarkable.egg-info/
+dist/*
+build/*
+*.pyc
+*/__pycache__/
-- 
cgit v1.2.3


From 7e544bb68e05cb3c1705c1a50076bdee33d759b2 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 10:05:10 +0100
Subject: Add description string to help

---
 paper2remarkable/__init__.py | 2 ++
 paper2remarkable/ui.py       | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py
index 71c1105..113fc83 100644
--- a/paper2remarkable/__init__.py
+++ b/paper2remarkable/__init__.py
@@ -1,3 +1,5 @@
 # -*- coding: utf-8 -*-
 
+from .__version__ import __version__
+
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 71fc655..1466ef4 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -10,12 +10,15 @@ Copyright: 2019, G.J.J. van den Burg
 
 import argparse
 
+from . import __version__
+
 from .providers import providers
 from .utils import exception
 
 
 def parse_args():
     parser = argparse.ArgumentParser(
+            description='Paper2reMarkable version %s' % __version__,
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument(
-- 
cgit v1.2.3


From 24707255dad3f065fe484a34c8dc2de5e371e419 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 10:05:22 +0100
Subject: Reorder command line arguments

---
 paper2remarkable/ui.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 1466ef4..d51ae0a 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -28,12 +28,9 @@ def parse_args():
         action="store_true",
     )
     parser.add_argument(
-        "-v", "--verbose", help="be verbose", action="store_true"
-    )
-    parser.add_argument(
-        "-n",
-        "--no-upload",
-        help="don't upload to the reMarkable, save the output in current working dir",
+        "-c",
+        "--center",
+        help="Center the PDF on the page, instead of left align",
         action="store_true",
     )
     parser.add_argument(
@@ -43,16 +40,11 @@ def parse_args():
         action="store_true",
     )
     parser.add_argument(
-        "-c",
-        "--center",
-        help="Center the PDF on the page, instead of left align",
+        "-n",
+        "--no-upload",
+        help="don't upload to the reMarkable, save the output in current working dir",
         action="store_true",
     )
-    parser.add_argument(
-        "--filename",
-        help="Filename to use for the file on reMarkable",
-        default=None,
-    )
     parser.add_argument(
         "-p",
         "--remarkable-path",
@@ -61,15 +53,23 @@ def parse_args():
         default="/",
     )
     parser.add_argument(
-        "--rmapi", help="path to rmapi executable", default="rmapi"
+        "-v", "--verbose", help="be verbose", action="store_true"
+    )
+    parser.add_argument(
+        "--filename",
+        help="Filename to use for the file on reMarkable",
+        default=None,
     )
+    parser.add_argument("--gs", help="path to gs executable", default="gs")
     parser.add_argument(
         "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
     )
     parser.add_argument(
         "--pdftk", help="path to pdftk executable", default="pdftk"
     )
-    parser.add_argument("--gs", help="path to gs executable", default="gs")
+    parser.add_argument(
+        "--rmapi", help="path to rmapi executable", default="rmapi"
+    )
     parser.add_argument(
         "input", help="URL to a paper or the path of a local PDF file"
     )
-- 
cgit v1.2.3


From 6e544fc055c9fc68857c7321a4f117042fe92565 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 10:15:32 +0100
Subject: Clarify some help text

---
 paper2remarkable/ui.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index d51ae0a..5323996 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -18,8 +18,7 @@ from .utils import exception
 
 def parse_args():
     parser = argparse.ArgumentParser(
-            description='Paper2reMarkable version %s' % __version__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+        description="Paper2reMarkable version %s" % __version__
     )
     parser.add_argument(
         "-b",
@@ -48,7 +47,7 @@ def parse_args():
     parser.add_argument(
         "-p",
         "--remarkable-path",
-        help="directory on reMarkable to put the file (created if missing)",
+        help="directory on reMarkable to put the file (created if missing, default: /)",
         dest="remarkable_dir",
         default="/",
     )
@@ -60,18 +59,27 @@ def parse_args():
         help="Filename to use for the file on reMarkable",
         default=None,
     )
-    parser.add_argument("--gs", help="path to gs executable", default="gs")
     parser.add_argument(
-        "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
+        "--gs", help="path to gs executable (default: gs)", default="gs"
     )
     parser.add_argument(
-        "--pdftk", help="path to pdftk executable", default="pdftk"
+        "--pdfcrop",
+        help="path to pdfcrop executable (default: pdfcrop)",
+        default="pdfcrop",
     )
     parser.add_argument(
-        "--rmapi", help="path to rmapi executable", default="rmapi"
+        "--pdftk",
+        help="path to pdftk executable (default: pdftk)",
+        default="pdftk",
     )
     parser.add_argument(
-        "input", help="URL to a paper or the path of a local PDF file"
+        "--rmapi",
+        help="path to rmapi executable (default: rmapi)",
+        default="rmapi",
+    )
+    parser.add_argument(
+        "input",
+        help="URL to a paper or the path of a local PDF file",
     )
     return parser.parse_args()
 
-- 
cgit v1.2.3


From 51bce213c917644ff9e512a3f81dd266477c19fe Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 11:09:09 +0100
Subject: Update readme

---
 README.md | 100 ++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 48 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index a01665c..8295e37 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,21 @@
-# arxiv2remarkable.py
+# paper2remarkable
 
-``arxiv2remarkable`` is a command line program to quickly transfer a paper to 
-your reMarkable. The script can be run as a plain Python script or via Docker 
+*Note: ``paper2remarkable`` is the new name for the ``arxiv2remarkable`` 
+script. The name was changed because it better captures what the program 
+does.*
+
+``paper2remarkable`` is a command line program for quickly and easily 
+transferring an academic paper to your reMarkable:
+
+```
+$ p2r https://arxiv.org/abs/1811.11242
+```
+
+The script can be run through the ``p2r`` command line program or via Docker 
 (see below).
 
-This script makes it as easy as possible to get a PDF on your reMarkable from 
-any of the following sources:
+paper2remarkable makes it as easy as possible to get a PDF on your reMarkable 
+from any of the following sources:
 
 - an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``)
 - a PubMed Central url (either to the HTML or the PDF)
@@ -16,10 +26,10 @@ any of the following sources:
 - a url to a PDF file
 - a local file.
 
-The script takes the source and:
+When called, paper2remarkable takes the source and:
 
 1. Downloads the pdf if necessary
-2. Removes the arXiv timestamp
+2. Removes the arXiv timestamp (for arXiv sources)
 3. Crops the pdf to remove unnecessary borders
 4. Shrinks the pdf file to reduce the filesize
 5. Generates a nice filename based on author/title/year of the paper
@@ -37,41 +47,39 @@ Optionally, you can:
 Here's the full help of the script:
 
 ```text
-usage: arxiv2remarkable.py [-h] [-b] [-v] [-n] [-d] [-c] [--filename FILENAME]
-                           [-p REMARKABLE_DIR] [--rmapi RMAPI]
-                           [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS]
-                           input
+usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v]
+           [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK]
+           [--rmapi RMAPI]
+           input
+
+Paper2reMarkable version 0.4.0
 
 positional arguments:
   input                 URL to a paper or the path of a local PDF file
 
 optional arguments:
   -h, --help            show this help message and exit
-  -b, --blank           Add a blank page after every page of the PDF (default:
-                        False)
-  -v, --verbose         be verbose (default: False)
-  -n, --no-upload       don't upload to the reMarkable, save the output in
-                        current working dir (default: False)
-  -d, --debug           debug mode, doesn't upload to reMarkable (default:
-                        False)
+  -b, --blank           Add a blank page after every page of the PDF
   -c, --center          Center the PDF on the page, instead of left align
-                        (default: False)
-  --filename FILENAME   Filename to use for the file on reMarkable (default:
-                        None)
+  -d, --debug           debug mode, doesn't upload to reMarkable
+  -n, --no-upload       don't upload to the reMarkable, save the output in
+                        current working dir
   -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR
                         directory on reMarkable to put the file (created if
-                        missing) (default: /)
-  --rmapi RMAPI         path to rmapi executable (default: rmapi)
+                        missing, default: /)
+  -v, --verbose         be verbose
+  --filename FILENAME   Filename to use for the file on reMarkable
+  --gs GS               path to gs executable (default: gs)
   --pdfcrop PDFCROP     path to pdfcrop executable (default: pdfcrop)
   --pdftk PDFTK         path to pdftk executable (default: pdftk)
-  --gs GS               path to gs executable (default: gs)
+  --rmapi RMAPI         path to rmapi executable (default: rmapi)
 ```
 
 And here's an example with verbose mode enabled that shows everything the 
 script does by default:
 
-```bash
-$ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242
+```
+$ p2r -v https://arxiv.org/abs/1811.11242
 2019-05-30 00:38:27 - INFO - Starting ArxivProvider
 2019-05-30 00:38:27 - INFO - Getting paper info from arXiv
 2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242
@@ -86,7 +94,7 @@ $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242
 2019-05-30 00:38:42 - INFO - Upload successful.
 ```
 
-## Dependencies
+## Installation
 
 The script requires the following external programs to be available:
 
@@ -96,27 +104,15 @@ The script requires the following external programs to be available:
 - [GhostScript](https://www.ghostscript.com/)
 - [rMAPI](https://github.com/juruen/rmapi)
 
-If these scripts are not available on the ``PATH`` variable, you can supply them 
-with the relevant options to the script.
-
-The script also needs the following Python packages:
+If these programs are not available on the ``PATH`` variable, you can supply
+their paths with the relevant options to the script. Then, you can install
+paper2remarkable from PyPI:
 
-- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML
-- [requests](https://pypi.org/project/requests/): getting HTML
-- [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF
-- [titlecase](https://pypi.org/project/titlecase/): fancy titles
-- [pdfplumber](https://github.com/jsvine/pdfplumber): used for better page 
-  cropping
-- [unidecode](https://pypi.org/project/Unidecode/): clean accented characters 
-  from the filename
-
-If you use [Poetry](https://poetry.eustace.io/) you can install these 
-dependencies using ``poetry install`` in the project directory. Alternatively, 
-you can use ``pip`` with the following command:
-
-```bash
-pip install --user bs4 requests PyPDF2 titlecase pdfplumber unidecode
 ```
+pip install paper2remarkable
+```
+
+This installs the ``p2r`` command line program.
 
 ## Docker
 
@@ -127,7 +123,7 @@ First clone this repository with `git clone` and `cd` inside of it, then build
 the container:
 
 ```bash
-docker build -t arxiv2remarkable .
+docker build -t paper2remarkable .
 ```
 
 ### Authorization
@@ -137,7 +133,7 @@ we'll use `rmapi` to create it.
 
 ```bash
 touch ${HOME}/.rmapi
-docker run --rm --it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi arxiv2remarkable version
+docker run --rm --it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi paper2remarkable version
 ```
 
 which should end with output like
@@ -149,15 +145,15 @@ rmapi version: 0.0.5
 
 ### Usage
 
-Use the container by replacing `python arxiv2remarkable.py` with `docker run 
---rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable`, e.g.
+Use the container by replacing `p2r` with `docker run --rm -v 
+"${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable`, e.g.
 
 ```
 # print help and exit
-docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable --help
+docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable --help
 
 # equivalent to above usage via `python`
-docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable -v https://arxiv.org/abs/1811.11242
+docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable -v https://arxiv.org/abs/1811.11242
 ```
 
 # Notes
-- 
cgit v1.2.3


From 61fb99f59aa5456627ac4cb8ec14503862780462 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 11:09:17 +0100
Subject: setup.py formatting

---
 setup.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 3d1fbc5..f54170a 100644
--- a/setup.py
+++ b/setup.py
@@ -24,14 +24,12 @@ REQUIRED = [
     "pdfplumber>=0.5",
     "unidecode>=1.1",
     "titlecase>=0.12",
-    "PyPDF2>=1.26"
-
+    "PyPDF2>=1.26",
 ]
 
 docs_require = []
 test_require = []
-dev_require = [
-        'green']
+dev_require = ["green"]
 
 # What packages are optional?
 EXTRAS = {
-- 
cgit v1.2.3


From a38318b2e0df603b4e46a39781cf73cf6fa9a148 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 11:09:54 +0100
Subject: add changelog

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..ac4f357
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,6 @@
+# Changelog
+
+## Version 0.4.0
+
+* Refactor code to make it a real Python package
+* Rename to ``paper2remarkable``
-- 
cgit v1.2.3


From f5c5308083e80b3a717aa904131833fed12e98a8 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 11:10:19 +0100
Subject: add packaging code

---
 MANIFEST.in     |  10 +++
 make_release.py | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+)
 create mode 100644 MANIFEST.in
 create mode 100644 make_release.py

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..021523f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
+include setup.py
+include README.md
+include LICENSE
+recursive-include paper2remarkable *.py
+recursive-include tests *.py
+exclude Makefile
+exclude .gitignore
+exclude Dockerfile
+exclude make_release.py
+prune old
diff --git a/make_release.py b/make_release.py
new file mode 100644
index 0000000..932209a
--- /dev/null
+++ b/make_release.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Do-nothing script for making a release
+
+This idea comes from here:
+https://blog.danslimmon.com/2019/07/15/do-nothing-scripting-the-key-to-gradual-automation/
+
+Author: Gertjan van den Burg
+Date: 2019-07-23
+
+"""
+
+import colorama
+import os
+
+
+def colored(msg, color=None, style=None):
+    """Return *msg* wrapped in colorama color/style codes plus a reset."""
+    fore = colorama.Fore
+    palette = {
+        None: "",
+        "red": fore.RED,
+        "green": fore.GREEN,
+        "cyan": fore.CYAN,
+        "yellow": fore.YELLOW,
+        "magenta": fore.MAGENTA,
+    }
+    weights = {
+        None: "",
+        "bright": colorama.Style.BRIGHT,
+        "dim": colorama.Style.DIM,
+    }
+    return f"{palette[color]}{weights[style]}{msg}{colorama.Style.RESET_ALL}"
+
+
+def cprint(msg, color=None, style=None):  # print() counterpart of colored()
+    print(colored(msg, color=color, style=style))
+
+
+def wait_for_enter():  # pause until Enter is pressed, then print a blank line
+    input(colored("\nPress Enter to continue", style="dim"))
+    print()
+
+
+def get_package_name():  # value of the first 'NAME = "..."' line in ./setup.py
+    with open("./setup.py", "r") as fp:
+        for line in fp:
+            if line.startswith("NAME = "):
+                return line.split("=", 1)[1].strip().strip('"')
+    raise SystemExit("No 'NAME = ...' line found in ./setup.py")
+
+
+class Step:  # base class of a release step; subclasses define action(context)
+    def pre(self, context):
+        """Hook run before action(); a no-op unless overridden."""
+
+    def post(self, context):  # hook run after action(); waits for Enter
+        wait_for_enter()
+
+    def run(self, context):  # pre, action, post in order; Ctrl-C exits with 1
+        try:
+            for hook in (self.pre, self.action, self.post):
+                hook(context)
+        except KeyboardInterrupt:
+            cprint("\nInterrupted.", color="red")
+            raise SystemExit(1)
+
+    def instruct(self, msg):  # show an instruction to the user (green)
+        cprint(msg, color="green")
+
+    def print_run(self, msg):  # "Run:" header followed by the command
+        cprint("Run:", color="cyan", style="bright")
+        self.print_cmd(msg)
+
+    def print_cmd(self, msg):  # tab-indented command for the user to type
+        cprint("\t" + msg, color="cyan", style="bright")
+
+    def do_cmd(self, cmd):  # run cmd through the shell once the user confirms
+        cprint(f"Going to run: {cmd}", color="magenta", style="bright")
+        wait_for_enter()
+        if os.system(cmd) != 0:  # surface failures instead of ignoring them
+            cprint(f"Command failed: {cmd}", color="red", style="bright")
+
+
+class GitToMaster(Step):  # manual step: only prints the instruction and command
+    def action(self, context):
+        self.instruct("Make sure you're on master and changes are merged in")
+        self.print_run("git checkout master")
+
+
+class UpdateChangelog(Step):  # manual step; requires context["version"]
+    def action(self, context):
+        self.instruct(f"Update change log for version {context['version']}")
+        self.print_run("vi CHANGELOG.md")
+
+
+class RunTests(Step):  # runs "make test" through do_cmd
+    def action(self, context):
+        self.do_cmd("make test")
+
+
+class BumpVersionPackage(Step):  # asks for a version bump, then records it
+    def action(self, context):
+        self.instruct("Update __version__.py with new version")
+        self.print_run(f"vi {context['pkgname'].lower()}/__version__.py")
+
+    def post(self, context):  # read the version only after the user edited it
+        wait_for_enter()
+        context["version"] = self._get_version(context)
+
+    def _get_version(self, context):
+        # exec the project's own version file and read its __version__
+        about = {}
+        with open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp:
+            exec(fp.read(), about)
+        return about["__version__"]
+
+
+class MakeClean(Step):  # runs "make clean" through do_cmd
+    def action(self, context):
+        self.do_cmd("make clean")
+
+
+class MakeDocs(Step):  # runs "make docs"; not in main()'s procedure list
+    def action(self, context):
+        self.do_cmd("make docs")
+
+
+class MakeDist(Step):  # runs "make dist" through do_cmd
+    def action(self, context):
+        self.do_cmd("make dist")
+
+
+class PushToTestPyPI(Step):  # uploads dist/* to the TestPyPI repository
+    def action(self, context):
+        index = "https://test.pypi.org/legacy/"
+        upload = f"twine upload --repository-url {index} dist/*"
+        self.do_cmd(upload)
+
+
+class InstallFromTestPyPI(Step):  # prints the commands; runs nothing itself
+    def action(self, context):
+        self.print_run("cd /tmp/")
+        self.print_cmd("rm -rf ./venv")
+        self.print_cmd("virtualenv ./venv")
+        self.print_cmd("cd ./venv")
+        self.print_cmd("source bin/activate")
+        pin = f"{context['pkgname']}=={context['version']}"
+        pip = "pip install --index-url https://test.pypi.org/simple/"
+        extra = "--extra-index-url https://pypi.org/simple"
+        self.print_cmd(f"{pip} {extra} {pin}")
+
+
+class TestPackage(Step):  # asks the user to check the installed version
+    def action(self, context):
+        expected = context["version"]
+        message = f"Ensure that the following command gives version {expected}"
+        self.instruct(message)
+        self.print_run(f"{context['pkgname']} -h")
+
+
+class DeactivateVenv(Step):  # manual step: prints the command and a reminder
+    def action(self, context):
+        self.print_run("deactivate")
+        self.instruct("Go back to the project directory")
+
+
+class GitTagVersion(Step):  # tags v<context["version"]> through do_cmd
+    def action(self, context):
+        self.do_cmd(f"git tag v{context['version']}")
+
+
+class GitAdd(Step):  # manual step: the user stages and commits via "git gui"
+    def action(self, context):
+        self.instruct("Add everything to git and commit")
+        self.print_run("git gui")
+
+
+class PushToPyPI(Step):  # runs "twine upload dist/*" through do_cmd
+    def action(self, context):
+        self.do_cmd("twine upload dist/*")
+
+
+class PushToGitHub(Step):  # pushes master and tags to origin through do_cmd
+    def action(self, context):
+        self.do_cmd("git push -u --tags origin master")
+
+
+class WaitForTravis(Step):  # manual step; not in main()'s procedure list
+    def action(self, context):
+        self.instruct(
+            "Wait for Travis to complete and verify that it's successful"
+        )
+
+
+class WaitForAppVeyor(Step):  # manual step; not in main()'s procedure list
+    def action(self, context):
+        self.instruct(
+            "Wait for AppVeyor to complete and verify that it's successful"
+        )
+
+
+class WaitForRTD(Step):  # manual step; not in main()'s procedure list
+    def action(self, context):
+        self.instruct(
+            "Wait for ReadTheDocs to complete and verify that it's successful"
+        )
+
+
+def main():
+    """Walk through the release procedure, one interactive step at a time."""
+    colorama.init()
+    steps = [
+        GitToMaster(),
+        GitAdd(),
+        PushToGitHub(),
+        BumpVersionPackage(),
+        UpdateChangelog(),
+        MakeClean(),
+        RunTests(),
+        MakeDist(),
+        PushToTestPyPI(),
+        InstallFromTestPyPI(),
+        TestPackage(),
+        DeactivateVenv(),
+        GitAdd(),
+        PushToPyPI(),
+        GitTagVersion(),
+        PushToGitHub(),
+    ]
+    context = {"pkgname": get_package_name()}
+    for current in steps:
+        current.run(context)
+    cprint("\nDone!", color="yellow", style="bright")
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From 754c29c70fdea59b190cd2ff1f2b63e4a0efc9da Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Oct 2019 11:36:23 +0100
Subject: update dockerfile

---
 Dockerfile | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6578db3..cb7cb19 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,14 +19,6 @@ RUN apt-get update \
         pdftk \
         texlive-extra-utils  # contains pdfcrop
 
-RUN pip install \
-    bs4 \
-    requests \
-    PyPDF2 \
-    titlecase \
-    pdfplumber \
-    unidecode
+RUN pip install paper2remarkable
 
-COPY arxiv2remarkable.py ./
-
-ENTRYPOINT ["python", "arxiv2remarkable.py"]
+ENTRYPOINT ["p2r"]
-- 
cgit v1.2.3