| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | README.md | 27 |
| -rwxr-xr-x | arxiv2remarkable.py | 852 |
| -rw-r--r-- | poetry.lock | 60 |
| -rw-r--r-- | pyproject.toml | 1 |
| -rw-r--r-- | test.py | 97 |
6 files changed, 579 insertions, 459 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ @@ -57,16 +57,18 @@ And here's an example with verbose mode enabled that shows everything the script does: ```bash $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 -2019-02-03 18:11:41.816 | INFO | __main__:download_url:106 - Downloading file at url: https://arxiv.org/pdf/1811.11242v1.pdf -2019-02-03 18:11:46.833 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/pdf/1811.11242v1.pdf -2019-02-03 18:11:46.835 | INFO | __main__:get_paper_info:194 - Getting paper info from arXiv -2019-02-03 18:11:47.496 | INFO | __main__:get_page_with_retry:92 - Downloading url: https://arxiv.org/abs/1811.11242v1 -2019-02-03 18:11:47.508 | INFO | __main__:generate_filename:206 - Generating output filename -2019-02-03 18:11:47.508 | INFO | __main__:dearxiv:114 - Removing arXiv timestamp -2019-02-03 18:11:49.221 | INFO | __main__:crop_pdf:154 - Cropping pdf file -2019-02-03 18:11:53.247 | INFO | __main__:shrink_pdf:172 - Shrinking pdf file -2019-02-03 18:11:54.802 | INFO | __main__:upload_to_rm:218 - Starting upload to reMarkable -2019-02-03 18:11:57.767 | INFO | __main__:upload_to_rm:223 - Upload successful. +2019-05-30 00:38:27 - INFO - Starting ArxivProvider +2019-05-30 00:38:27 - INFO - Getting paper info from arXiv +2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242 +2019-05-30 00:38:27 - INFO - Generating output filename +2019-05-30 00:38:27 - INFO - Created filename: Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf +2019-05-30 00:38:27 - INFO - Downloading file at url: https://arxiv.org/pdf/1811.11242.pdf +2019-05-30 00:38:32 - INFO - Downloading url: https://arxiv.org/pdf/1811.11242.pdf +2019-05-30 00:38:32 - INFO - Removing arXiv timestamp +2019-05-30 00:38:34 - INFO - Cropping pdf file +2019-05-30 00:38:37 - INFO - Shrinking pdf file +2019-05-30 00:38:38 - INFO - Starting upload to reMarkable +2019-05-30 00:38:42 - INFO - Upload successful. ``` ## Dependencies @@ -79,21 +81,20 @@ The script requires the following external programs to be available: - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -If these scripts are not available on the PATH variable, you can supply them +If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. The script also needs the following Python packages: - [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML - [requests](https://pypi.org/project/requests/): getting HTML -- [loguru](https://pypi.org/project/loguru/): easy logging - [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF - [titlecase](https://pypi.org/project/titlecase/): fancy titles You can use this line: ```bash -pip install --user bs4 requests loguru PyPDF2 titlecase +pip install --user bs4 requests PyPDF2 titlecase ``` # Notes diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 61a3667..08beaca 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -1,16 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" -Given an arXiv paper url this script: +__version__ = "0.2.0" +__author__ = "G.J.J. van den Burg" -1. Downloads the paper -2. Strips the timestamp -3. Crops the pdf to remove unnecessary borders -4. Shrinks the pdf to reduce the filesize -5. 
Renames it using the format: - '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf -6. Uploads it to the reMarkable using rMapi. +""" +Download a paper from various sources and send it to the reMarkable. Author: G.J.J. van den Burg Date: 2019-02-02 @@ -19,8 +14,10 @@ License: MIT """ import PyPDF2 +import abc import argparse import bs4 +import datetime import os import re import requests @@ -32,8 +29,6 @@ import time import titlecase import urllib.parse -from loguru import logger - GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" HEADERS = { @@ -43,334 +38,468 @@ HEADERS = { } -def exception(msg): - print("ERROR: " + msg, file=sys.stderr) - print("Error occurred. Exiting.", file=sys.stderr) - raise SystemExit(1) - - -def arxiv_url(url): - """Check if the url is to an arXiv page. - - >>> validate_url("https://arxiv.org/abs/1811.11242") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("http://arxiv.org/abs/1811.11242") - True - >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("https://arxiv.org/abs/1811.11242v1") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf") - True - >>> validate_url("https://gertjanvandenburg.com") - False - """ - m = re.match( - "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url - ) - return not m is None - +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ -def pmc_url(url): - m = re.fullmatch( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url - ) - return not m is None - - -def acm_url(url): - m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url) - return not m is None - - -def valid_url(url): - try: - result = urllib.parse.urlparse(url) - return all([result.scheme, result.netloc, result.path]) - except: - return False + def __init__( + self, + verbose=False, + upload=True, + debug=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.verbose = verbose + self.upload = upload + self.debug = debug + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + + self.log("Starting %s" % type(self).__name__) + + def log(self, msg, mode="info"): + if not self.verbose: + return + if not mode in ["info", "warning"]: + raise ValueError("unknown logging mode.") + now = datetime.datetime.now() + print( + now.strftime("%Y-%m-%d %H:%M:%S") + + " - " + + mode.upper() + + " - " + + msg + ) + def warn(self, msg): + self.log(msg, mode="warning") + + @staticmethod + @abc.abstractmethod + def validate(src): + """ Validate whether ``src`` is appropriate for this provider """ + + @abc.abstractmethod + def retrieve_pdf(self, src, filename): + """ Download pdf from src and save to filename """ + + @abc.abstractmethod + def get_paper_info(self, src): + """ Retrieve the title/author (surnames)/year information """ + + def create_filename(self, info, filename=None): + """ Generate filename using the info dict or filename if provided """ + if not filename is None: + return filename + # we assume that the list of authors is surname only. 
+ self.log("Generating output filename") + if len(info["authors"]) > 3: + author_part = info["authors"][0] + "_et_al" + else: + author_part = "_".join(info["authors"]) + author_part = author_part.replace(" ", "_") + title = info["title"].replace(",", "").replace(":", "") + title_part = titlecase.titlecase(title).replace(" ", "_") + year_part = info["date"].split("/")[0] + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + self.log("Created filename: %s" % name) + return name + + def crop_pdf(self, filepath): + self.log("Cropping pdf file") + status = subprocess.call( + [self.pdfcrop_path, "--margins", "15 40 15 15", filepath], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + self.warn("Failed to crop the pdf file at: %s" % filepath) + return filepath + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + if not os.path.exists(cropped_file): + self.warn( + "Can't find cropped file '%s' where expected." % cropped_file + ) + return filepath + return cropped_file -def check_file_is_pdf(filename): - try: - PyPDF2.PdfFileReader(open(filename, "rb")) - return True - except PyPDF2.utils.PdfReadError: - return False + def shrink_pdf(self, filepath): + self.log("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + self.warn("Failed to shrink the pdf file") + return filepath + return output_file + def check_file_is_pdf(self, filename): + try: + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf + return True + except PyPDF2.utils.PdfReadError: + exception("Downloaded file isn't a valid pdf file.") + + def download_url(self, url, filename): + """Download the content of an url and save it to a filename """ + self.log("Downloading file at url: %s" % url) + content = self.get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + def get_page_with_retry(self, url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + time.sleep(5) + self.warn("Error getting url %s. 
Retrying in 5 seconds" % url) + continue + self.log("Downloading url: %s" % url) + return res.content -def get_arxiv_urls(url): - """Get the pdf and abs url from any given arXiv url """ - if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): - abs_url = url - pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url): - abs_url = url[:-4].replace("pdf", "abs") - pdf_url = url - else: - exception("Couldn't figure out arXiv urls.") - return pdf_url, abs_url + def upload_to_rm(self, filepath): + remarkable_dir = self.remarkable_dir.rstrip("/") + self.log("Starting upload to reMarkable") + if remarkable_dir: + status = subprocess.call( + [self.rmapi_path, "mkdir", remarkable_dir], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) + status = subprocess.call( + [self.rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + self.log("Upload successful.") + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" -def get_pmc_urls(url): - """Get the pdf and html url from a given PMC url """ - if re.match( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", - url, - ): - idx = url.index("pdf") - abs_url = url[: idx - 1] - pdf_url = url - elif re.match("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url): - abs_url = url - pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually - else: - exception("Couldn't figure out PMC urls.") - return pdf_url, abs_url - - -def get_acm_pdf_url(url): - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: - return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href - - -def get_acm_urls(url): - if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): - abs_url = url - pdf_url = get_acm_pdf_url(url) - if pdf_url is None: - exception("Couldn't extract PDF url from ACM citation page.") - else: - exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] ) - return pdf_url, abs_url + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) -def get_page_with_retry(url): - """Get the content of an url, retrying up to five times on failure. 
""" + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file + + def run(self, src, filename=None): + info = self.get_paper_info(src) + clean_filename = self.create_filename(info, filename) + tmp_filename = "paper.pdf" + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory() as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [self.dearxiv, self.crop_pdf, self.shrink_pdf] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return self.upload_to_rm(clean_filename) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path + + +class ArxivProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match( + "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url + ): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url - def retry(url, count): - if count < 5: - logger.info( - "Caught error for url %s. Retrying in 5 seconds." % url - ) - time.sleep(5) + def validate(src): + """Check if the url is to an arXiv page. 
""" + m = re.match( + "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + """ Download the file and save as filename """ + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + abs_url, _ = self.get_abs_pdf_urls(src) + self.log("Getting paper info from arXiv") + page = self.get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_author"}) + ] + authors = [x.split(",")[0].strip() for x in authors] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + return dict(title=title, date=date, authors=authors) + + +class PMCProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", + url, + ): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url + ): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually else: - exception("Failed to download url: %s" % url) + exception("Couldn't figure out PMC urls.") + return abs_url, pdf_url - count = 0 - while True: - count += 1 - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - retry(url, count) - continue - if res.ok: - logger.info("Downloading url: %s" % url) - return res.content + def validate(src): + m = re.fullmatch( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + self.log("Getting paper info from PMC") + page = self.get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess at best. I'm open to + # more advanced approaches. 
+ authors = [ + x.strip().split(" ")[-1].strip() for x in authors[0].split(",") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if re.match("\w+\ \d{4}", date): + date = date.split(" ")[-1] + else: + date = date.replace(" ", "_") + return dict(title=title, date=date, authors=authors) + + +class ACMProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_acm_pdf_url(self, url): + page = self.get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): + abs_url = url + pdf_url = self.get_acm_pdf_url(url) + if pdf_url is None: + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) else: - retry(url, count) + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return abs_url, pdf_url + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def validate(src): + m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src) + return not m is None + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + self.log("Getting paper info from ACM") + page = self.get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess. I'm open to more + # advanced approaches. 
+ authors = [ + x.strip().split(",")[0].strip() for x in authors[0].split(";") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so I can fix it: %s" % GITHUB_URL + ) + date = date.strip().split("/")[-1] + return dict(title=title, date=date, authors=authors) -def download_url(url, filename): - """Download the content of an url and save it to a filename """ - logger.info("Downloading file at url: %s" % url) - content = get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) +class LocalFileProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def validate(src): + return os.path.exists(src) -def dearxiv(input_file, pdftk_path="pdftk"): - """Remove the arXiv timestamp from a pdf""" - logger.info("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" + def retrieve_pdf(self, src, filename): + source = os.path.join(self.initial_dir, src) + shutil.copy(source, filename) - status = subprocess.call( - [pdftk_path, input_file, "output", uncompress_file, "uncompress"] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) + def get_paper_info(self, src): + return {"filename": src} - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) + def create_filename(self, info, filename=None): + if not filename is None: + return filename + return os.path.basename(info["filename"]) - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - return output_file +class PdfUrlProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): - logger.info("Cropping pdf file") - status = subprocess.call( - [pdfcrop_path, "--margins", "15 40 15 15", filepath], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) - return filepath - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." 
% cropped_file - ) - return filepath - return cropped_file - - -def shrink_pdf(filepath, gs_path="gs"): - logger.info("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - "gs", - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - logger.warning("Failed to shrink the pdf file") - return filepath - return output_file - - -def get_paper_info_arxiv(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from arXiv") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_author"}) - ] - authors = [x.split(",")[0].strip() for x in authors] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_pmc(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from PMC") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess at best. I'm open to - # more advanced approaches. - authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if re.match("\w+\ \d{4}", date): - date = date.split(" ")[-1] - else: - date = date.replace(" ", "_") - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_acm(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from ACM") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess. I'm open to more - # advanced approaches. - authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): - logger.warning( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so I can fix it: %s", - GITHUB_URL, - ) - date = date.strip().split("/")[-1] - return dict(title=title, date=date, authors=authors) - - -def generate_filename(info): - """ Generate a nice filename for a paper given the info dict """ - # we assume that the list of authors is lastname only. 
- logger.info("Generating output filename") - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = author_part.replace(" ", "_") - title = info["title"].replace(",", "").replace(":", "").replace(" ", "_") - title_part = titlecase.titlecase(title) - year_part = info["date"].split("/")[0] - return author_part + "_-_" + title_part + "_" + year_part + ".pdf" - - -def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"): - remarkable_dir = remarkable_dir.rstrip("/") - logger.info("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL - ) - if not status == 0: + def retrieve_pdf(self, url, filename): + self.download_url(url, filename) + + def get_paper_info(self, src): + return None + + def create_filename(self, info, filename=None): + if filename is None: exception( - "Creating directory %s on reMarkable failed" % remarkable_dir + "Filename must be provided with PDFUrlProvider (use --filename)" ) - status = subprocess.call( - [rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - logger.info("Upload successful.") + return filename + + +def exception(msg): + print("ERROR: " + msg, file=sys.stderr) + print("Error occurred. Exiting.", file=sys.stderr) + raise SystemExit(1) def parse_args(): @@ -420,82 +549,33 @@ def parse_args(): return parser.parse_args() -@logger.catch def main(): args = parse_args() - if os.path.exists(args.input): - mode = "local_file" - elif arxiv_url(args.input): - mode = "arxiv_url" - elif pmc_url(args.input): - mode = "pmc_url" - elif acm_url(args.input): - mode = "acm_url" - elif valid_url(args.input): - if args.filename is None: - exception( - "Filename must be provided with pdf url (use --filename)" - ) - mode = "pdf_url" - else: - exception("Input not a valid url, arxiv url, or existing file.") - - if not args.verbose: - logger.remove(0) - - start_wd = os.getcwd() - - with tempfile.TemporaryDirectory() as working_dir: - if mode == "local_file": - shutil.copy(args.input, working_dir) - filename = os.path.basename(args.input) - clean_filename = args.filename if args.filename else filename - - os.chdir(working_dir) - if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]: - filename = "paper.pdf" - if mode == "arxiv_url": - pdf_url, abs_url = get_arxiv_urls(args.input) - paper_info = get_paper_info_arxiv(abs_url) - elif mode == "pmc_url": - pdf_url, abs_url = get_pmc_urls(args.input) - paper_info = get_paper_info_pmc(abs_url) - elif mode == "acm_url": - pdf_url, abs_url = get_acm_urls(args.input) - paper_info = get_paper_info_acm(abs_url) - else: - pdf_url = args.input - download_url(pdf_url, filename) - if not check_file_is_pdf(filename): - exception("Downloaded file isn't a valid pdf file.") - if args.filename: - clean_filename = args.filename - else: - clean_filename = generate_filename(paper_info) - - dearxived = dearxiv(filename, pdftk_path=args.pdftk) - cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop) - shrinked = shrink_pdf(cropped) - shutil.move(shrinked, clean_filename) - - if args.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if args.no_upload: - if os.path.exists(os.path.join(start_wd, clean_filename)): - tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf" - 
shutil.move(clean_filename, os.path.join(start_wd, tmpfname)) - else: - shutil.move(clean_filename, start_wd) - else: - upload_to_rm( - clean_filename, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - ) + providers = [ + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, + ] + + provider = next((p for p in providers if p.validate(args.input)), None) + if provider is None: + exception("Input not valid, no provider can handle this source.") + + prov = provider( + args.verbose, + not args.no_upload, + args.debug, + args.remarkable_dir, + args.rmapi, + args.pdfcrop, + args.pdftk, + args.gs, + ) + + prov.run(args.input, filename=args.filename) if __name__ == "__main__": diff --git a/poetry.lock b/poetry.lock index d8a1205..893007f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,16 +1,5 @@ [[package]] category = "main" -description = "Produce colored terminal text with an xml-like markup" -name = "ansimarkup" -optional = false -python-versions = "*" -version = "1.4.0" - -[package.dependencies] -colorama = "*" - -[[package]] -category = "main" description = "Screen-scraping library" name = "beautifulsoup4" optional = false @@ -22,19 +11,6 @@ soupsieve = ">=1.2" [[package]] category = "main" -description = "Pretty and helpful exceptions, automatically" -name = "better-exceptions-fork" -optional = false -python-versions = "*" -version = "0.2.1.post6" - -[package.dependencies] -ansimarkup = ">=1.3.0" -colorama = "*" -pygments = ">=2.2.0" - -[[package]] -category = "main" description = "Dummy package for Beautiful Soup" name = "bs4" optional = false @@ -62,14 +38,6 @@ version = "3.0.4" [[package]] category = "main" -description = "Cross-platform colored terminal text." -name = "colorama" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.4.1" - -[[package]] -category = "main" description = "Internationalized Domain Names in Applications (IDNA)" name = "idna" optional = false @@ -78,27 +46,6 @@ version = "2.8" [[package]] category = "main" -description = "Python logging made (stupidly) simple" -name = "loguru" -optional = false -python-versions = ">=3.5" -version = "0.2.5" - -[package.dependencies] -ansimarkup = ">=1.4.0" -better-exceptions-fork = ">=0.2.1.post6" -colorama = ">=0.3.4" - -[[package]] -category = "main" -description = "Pygments is a syntax highlighting package written in Python." -name = "pygments" -optional = false -python-versions = "*" -version = "2.3.1" - -[[package]] -category = "main" description = "Python HTTP for Humans." 
name = "requests" optional = false @@ -128,20 +75,15 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" version = "1.24.1" [metadata] -content-hash = "b92b4b1d2c4f9d3181044c1ad99fd9bfa49e8618c6ff5de7bd64c557bcc27e39" +content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82" python-versions = "^3.5" [metadata.hashes] -ansimarkup = ["06365e3ef89a12734fc408b2449cb4642d5fe2e603e95e7296eff9e98a0fe0b4", "174d920481416cec8d5a707af542d6fba25a1df1c21d8996479c32ba453649a4"] beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"] -better-exceptions-fork = ["5f0983da51e956dbdaf8b9a3d10e2774b382ce6c6ff2e54685c33e2dbe8f1472"] bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] -colorama = ["05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", "f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48"] idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] -loguru = ["68297d9f23064c2f4764bb5d0c5c767f3ed7f9fc1218244841878f5fc7c94add", "ebac59630946721fd6207264679b267a8bdc290b086226067d6aad86830e3123"] -pygments = ["5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", "e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"] requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] diff --git a/pyproject.toml b/pyproject.toml index 6f67ecd..2c28224 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ license = "MIT" python = "^3.5" bs4 = "^0.0.1" requests = "^2.21" -loguru = "^0.2.5" [tool.poetry.dev-dependencies] @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__author__ = "G.J.J. 
van den Burg" + +"""Tests""" + +import unittest +import tempfile +import hashlib +import shutil +import os + +from arxiv2remarkable import ( + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, +) + + +def md5sum(filename): + blocksize = 65536 + hasher = hashlib.md5() + with open(filename, "rb") as fid: + buf = fid.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = fid.read(blocksize) + return hasher.hexdigest() + + +class Tests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_arxiv(self): + prov = ArxivProvider(upload=False) + url = "https://arxiv.org/abs/1811.11242v1" + exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1054082 < fsize <= 1056082) + + def test_pmc(self): + prov = PMCProvider(upload=False) + url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" + exp_filename = ( + "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(376640 < fsize <= 378640) + + def test_acm(self): + prov = ACMProvider(upload=False) + url = "https://dl.acm.org/citation.cfm?id=3300356" + exp_filename = "Muller_et_al_-_How_Data_Science_Workers_Work_With_Data_Discovery_Capture_Curation_Design_Creation_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1691444 < fsize <= 1693444) + + def test_local(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + prov = LocalFileProvider(upload=False) + filename = prov.run(local_filename) + self.assertEqual("test_.pdf", os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(5843 < fsize <= 7843) + + def test_pdfurl(self): + prov = PdfUrlProvider(upload=False) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url, filename="test.pdf") + self.assertEqual("test.pdf", os.path.basename(filename)) + fsize = os.path.getsize(filename) + self.assertTrue(1828169 < fsize <= 1830169) + +if __name__ == "__main__": + unittest.main() |
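
As a quick illustration of the provider interface introduced by this diff, the classes can also be driven directly from Python, much as the new test suite does. A minimal sketch, assuming `arxiv2remarkable.py` is importable from the working directory and that `pdftk`, `pdfcrop`, and GhostScript are available on the `PATH`:

```python
# Minimal sketch of using the new Provider classes programmatically
# (assumes arxiv2remarkable.py is importable and the external tools
# pdftk, pdfcrop, and gs are on the PATH).
from arxiv2remarkable import ArxivProvider

# upload=False keeps the processed file locally instead of calling rMAPI;
# verbose=True prints the timestamped log lines shown in the README.
prov = ArxivProvider(verbose=True, upload=False)
path = prov.run("https://arxiv.org/abs/1811.11242")
print("Saved to:", path)
```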