| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-05-29 23:11:02 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-05-29 23:11:02 +0100 |
| commit | 42fd83b66037bd7c714a67d4f22d38eda478ecb8 (patch) | |
| tree | b4bf7abbc5986b120be3b26aaa36e719cc95ffbf | |
| parent | more work in progress (diff) | |
| download | paper2remarkable-42fd83b66037bd7c714a67d4f22d38eda478ecb8.tar.gz paper2remarkable-42fd83b66037bd7c714a67d4f22d38eda478ecb8.zip | |
even more work in progress
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | arxiv2remarkable.py | 473 |
1 file changed, 220 insertions, 253 deletions
```diff
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 9686ee1..8a8d58b 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -21,10 +21,11 @@ License: MIT
 """

-import abc
 import PyPDF2
+import abc
 import argparse
 import bs4
+import datetime
 import os
 import re
 import requests
@@ -36,8 +37,6 @@ import time
 import titlecase
 import urllib.parse

-from loguru import logger
-
 GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"

 HEADERS = {
@@ -50,13 +49,45 @@ HEADERS = {
 class Provider(metaclass=abc.ABCMeta):
     """ ABC for providers of pdf sources """

-    def __init__(self, remarkable_dir="/", rmapi_path="rmapi"):
+    def __init__(
+        self,
+        verbose=False,
+        upload=True,
+        debug=False,
+        remarkable_dir="/",
+        rmapi_path="rmapi",
+        pdfcrop_path="pdfcrop",
+        pdftk_path="pdftk",
+        gs_path="gs",
+    ):
+        self.verbose = verbose
+        self.upload = upload
+        self.debug = debug
         self.remarkable_dir = remarkable_dir
         self.rmapi_path = rmapi_path
+        self.pdfcrop_path = pdfcrop_path
+        self.pdftk_path = pdftk_path
+        self.gs_path = gs_path
+
+        self.log("Starting %s" % type(self).__name__)
+
+    def log(self, msg, mode="info"):
+        if not self.verbose:
+            return
+        if not mode in ["info", "warning"]:
+            raise ValueError("unknown logging mode.")
+        now = datetime.datetime.now()
+        print(
+            now.strftime("%Y-%m-%d %H:%M:%S")
+            + " - "
+            + mode.upper()
+            + " - "
+            + msg
+        )

     @staticmethod
     @abc.abstractmethod
-    def validate(self, src):
+    def validate(src):
         """ Validate whether ``src`` is appropriate for this provider """

     @abc.abstractmethod
@@ -72,40 +103,41 @@ class Provider(metaclass=abc.ABCMeta):
         if not filename is None:
             return filename
         # we assume that the list of authors is surname only.
-        logger.info("Generating output filename")
+        self.log("Generating output filename")
         if len(info["authors"]) > 3:
             author_part = info["authors"][0] + "_et_al"
         else:
             author_part = "_".join(info["authors"])
         author_part = author_part.replace(" ", "_")
-        title = (
-            info["title"].replace(",", "").replace(":", "").replace(" ", "_")
-        )
-        title_part = titlecase.titlecase(title)
+        title = info["title"].replace(",", "").replace(":", "")
+        title_part = titlecase.titlecase(title).replace(" ", "_")
         year_part = info["date"].split("/")[0]

         name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
-        logger.info("Created filename: %s" % name)
+        self.log("Created filename: %s" % name)
         return name

     def crop_pdf(self, filepath):
-        logger.info("Cropping pdf file")
+        self.log("Cropping pdf file")
         status = subprocess.call(
             [self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
             stdout=subprocess.DEVNULL,
         )
         if not status == 0:
-            logger.warning("Failed to crop the pdf file at: %s" % filepath)
+            self.log(
+                "Failed to crop the pdf file at: %s" % filepath, mode="warning"
+            )
             return filepath
         cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
         if not os.path.exists(cropped_file):
-            logger.warning(
-                "Can't find cropped file '%s' where expected." % cropped_file
+            self.log(
+                "Can't find cropped file '%s' where expected." % cropped_file,
+                mode="warning",
             )
             return filepath
         return cropped_file

     def shrink_pdf(self, filepath):
-        logger.info("Shrinking pdf file")
+        self.log("Shrinking pdf file")
         output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
         status = subprocess.call(
             [
@@ -121,20 +153,58 @@ class Provider(metaclass=abc.ABCMeta):
             ]
         )
         if not status == 0:
-            logger.warning("Failed to shrink the pdf file")
+            self.log("Failed to shrink the pdf file", mode="warning")
             return filepath
         return output_file

     def check_file_is_pdf(self, filename):
         try:
-            PyPDF2.PdfFileReader(open(filename, "rb"))
+            fp = open(filename, "rb")
+            pdf = PyPDF2.PdfFileReader(fp, strict=False)
+            fp.close()
+            del pdf
             return True
         except PyPDF2.utils.PdfReadError:
             exception("Downloaded file isn't a valid pdf file.")

+    def download_url(self, url, filename):
+        """Download the content of an url and save it to a filename """
+        self.log("Downloading file at url: %s" % url)
+        content = self.get_page_with_retry(url)
+        with open(filename, "wb") as fid:
+            fid.write(content)
+
+    def get_page_with_retry(self, url, times=5):
+        """ Get the content of an url, retrying on failure.
+        """
+
+        def retry(url, count):
+            if count < times:
+                self.log(
+                    "Caught error for url %s. Retrying in 5 seconds." % url,
+                    mode="warning",
+                )
+                time.sleep(5)
+            else:
+                exception("Failed to download url: %s" % url)
+
+        count = 0
+        while True:
+            count += 1
+            try:
+                res = requests.get(url, headers=HEADERS)
+            except requests.exceptions.ConnectionError:
+                retry(url, count)
+                continue
+            if res.ok:
+                self.log("Downloading url: %s" % url)
+                return res.content
+            else:
+                retry(url, count)
+
     def upload_to_rm(self, filepath):
         remarkable_dir = self.remarkable_dir.rstrip("/")
-        logger.info("Starting upload to reMarkable")
+        self.log("Starting upload to reMarkable")
         if remarkable_dir:
             status = subprocess.call(
                 [self.rmapi_path, "mkdir", remarkable_dir],
@@ -151,34 +221,86 @@ class Provider(metaclass=abc.ABCMeta):
         )
         if not status == 0:
             exception("Uploading file %s to reMarkable failed" % filepath)
-        logger.info("Upload successful.")
+        self.log("Upload successful.")

-    def run(self, src, filename=None, debug=False, upload=True):
+    def dearxiv(self, input_file):
+        """Remove the arXiv timestamp from a pdf"""
+        self.log("Removing arXiv timestamp")
+        basename = os.path.splitext(input_file)[0]
+        uncompress_file = basename + "_uncompress.pdf"
+
+        status = subprocess.call(
+            [
+                self.pdftk_path,
+                input_file,
+                "output",
+                uncompress_file,
+                "uncompress",
+            ]
+        )
+        if not status == 0:
+            exception("pdftk failed to uncompress the pdf.")
+
+        with open(uncompress_file, "rb") as fid:
+            data = fid.read()
+            # Remove the text element
+            data = re.sub(
+                b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+                b"()Tj",
+                data,
+            )
+            # Remove the URL element
+            data = re.sub(
+                b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n",
+                b"",
+                data,
+            )
+
+        removed_file = basename + "_removed.pdf"
+        with open(removed_file, "wb") as oid:
+            oid.write(data)
+
+        output_file = basename + "_dearxiv.pdf"
+        status = subprocess.call(
+            [self.pdftk_path, removed_file, "output", output_file, "compress"]
+        )
+        if not status == 0:
+            exception("pdftk failed to compress the pdf.")
+
+        return output_file
+
+    def run(self, src, filename=None):
         info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
         tmp_filename = "paper.pdf"

-        self.retrieve_pdf(src, tmp_filename)
-        self.check_file_is_pdf(tmp_filename)
-
-        ops = [self.dearxiv, self.crop, self.shrink]
-        intermediate_fname = tmp_filename
-        for op in ops:
-            intermediate_fname = op(tmp_filename)
-        shutil.move(intermediate_fname, clean_filename)
-
-        if debug:
-            print("Paused in debug mode in dir: %s" % working_dir)
-            print("Press enter to exit.")
-            return input()
-
-        if upload:
-            return self.upload_to_rm(clean_filename)
-
-        if os.path.exists(os.path.join(start_wd, clean_filename)):
-            tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf"
-            shutil.move(clean_filename, os.path.join(start_wd, tmpfname))
-        else:
-            shutil.move(clean_filename, start_wd)
+
+        self.initial_dir = os.getcwd()
+        with tempfile.TemporaryDirectory() as working_dir:
+            os.chdir(working_dir)
+            self.retrieve_pdf(src, tmp_filename)
+            self.check_file_is_pdf(tmp_filename)
+
+            ops = [self.dearxiv, self.crop_pdf, self.shrink_pdf]
+            intermediate_fname = tmp_filename
+            for op in ops:
+                intermediate_fname = op(intermediate_fname)
+            shutil.move(intermediate_fname, clean_filename)
+
+            if self.debug:
+                print("Paused in debug mode in dir: %s" % working_dir)
+                print("Press enter to exit.")
+                return input()
+
+            if self.upload:
+                return self.upload_to_rm(clean_filename)
+
+            target_path = os.path.join(self.initial_dir, clean_filename)
+            while os.path.exists(target_path):
+                base = os.path.splitext(target_path)[0]
+                target_path = base + "_.pdf"
+            shutil.move(clean_filename, target_path)
+            return target_path
+

 class ArxivProvider(Provider):
     def __init__(self, *args, **kwargs):
@@ -198,7 +320,7 @@ class ArxivProvider(Provider):
             exception("Couldn't figure out arXiv urls.")
         return abs_url, pdf_url

-    def validate(self, src):
+    def validate(src):
         """Check if the url is to an arXiv page. """
         m = re.match(
             "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src
         )
@@ -208,13 +330,13 @@ class ArxivProvider(Provider):
     def retrieve_pdf(self, src, filename):
         """ Download the file and save as filename """
         _, pdf_url = self.get_abs_pdf_urls(src)
-        download_url(pdf_url, filename)
+        self.download_url(pdf_url, filename)

     def get_paper_info(self, src):
         """ Extract the paper's authors, title, and publication year """
         abs_url, _ = self.get_abs_pdf_urls(src)
-        logger.info("Getting paper info from arXiv")
-        page = get_page_with_retry(abs_url)
+        self.log("Getting paper info from arXiv")
+        page = self.get_page_with_retry(abs_url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = [
             x["content"]
@@ -225,6 +347,7 @@
         date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
         return dict(title=title, date=date, authors=authors)

+
 class PMCProvider(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -245,9 +368,9 @@
             pdf_url = url.rstrip("/") + "/pdf"  # it redirects, usually
         else:
             exception("Couldn't figure out PMC urls.")
-        return pdf_url, abs_url
+        return abs_url, pdf_url

-    def validate(self, src):
+    def validate(src):
         m = re.fullmatch(
             "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src
         )
@@ -255,12 +378,12 @@
     def retrieve_pdf(self, src, filename):
         _, pdf_url = self.get_abs_pdf_urls(src)
-        download_url(pdf_url, filename)
+        self.download_url(pdf_url, filename)

     def get_paper_info(self, src):
         """ Extract the paper's authors, title, and publication year """
-        logger.info("Getting paper info from PMC")
-        page = get_page_with_retry(src)
+        self.log("Getting paper info from PMC")
+        page = self.get_page_with_retry(src)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = [
             x["content"]
@@ -279,12 +402,13 @@ class PMCProvider(Provider):
         date = date.replace(" ", "_")
         return dict(title=title, date=date, authors=authors)

+
 class ACMProvider(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

     def get_acm_pdf_url(self, url):
-        page = get_page_with_retry(url)
+        page = self.get_page_with_retry(url)
         soup = bs4.BeautifulSoup(page, "html.parser")
         thea = None
         for a in soup.find_all("a"):
@@ -304,26 +428,28 @@ class ACMProvider(Provider):
             abs_url = url
             pdf_url = self.get_acm_pdf_url(url)
             if pdf_url is None:
-                exception("Couldn't extract PDF url from ACM citation page.")
+                exception(
+                    "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
+                )
         else:
             exception(
                 "Couldn't figure out ACM urls, please provide a URL of the "
                 "format: http(s)://dl.acm.org/citation.cfm?id=..."
             )
-        return pdf_url, abs_url
+        return abs_url, pdf_url

     def retrieve_pdf(self, src, filename):
         _, pdf_url = self.get_abs_pdf_urls(src)
-        download_url(pdf_url, filename)
+        self.download_url(pdf_url, filename)

-    def validate(self, src):
+    def validate(src):
         m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src)
         return not m is None

     def get_paper_info(self, src):
         """ Extract the paper's authors, title, and publication year """
-        logger.info("Getting paper info from ACM")
-        page = get_page_with_retry(src)
+        self.log("Getting paper info from ACM")
+        page = self.get_page_with_retry(src)
         soup = bs4.BeautifulSoup(page, "html.parser")
         authors = [
             x["content"]
@@ -337,32 +463,40 @@ class ACMProvider(Provider):
         title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
         date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
         if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
-            logger.warning(
+            self.log(
                 "Couldn't extract year from ACM page, please raise an "
-                "issue on GitHub so I can fix it: %s",
-                GITHUB_URL,
+                "issue on GitHub so I can fix it: %s" % GITHUB_URL,
+                mode="warning",
             )
         date = date.strip().split("/")[-1]
         return dict(title=title, date=date, authors=authors)

+
 class LocalFileProvider(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def validate(self, src):
+    def validate(src):
         return os.path.exists(src)

     def retrieve_pdf(self, src, filename):
-        shutil.copy(src, filename)
+        source = os.path.join(self.initial_dir, src)
+        shutil.copy(source, filename)

     def get_paper_info(self, src):
-        return None
+        return {"filename": src}
+
+    def create_filename(self, info, filename=None):
+        if not filename is None:
+            return filename
+        return os.path.basename(info["filename"])
+

 class PdfUrlProvider(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def validate(self, src):
+    def validate(src):
         try:
             result = urllib.parse.urlparse(src)
             return all([result.scheme, result.netloc, result.path])
@@ -370,15 +504,18 @@ class PdfUrlProvider(Provider):
             return False

     def retrieve_pdf(self, url, filename):
-        if filename is None:
-            exception(
-                "Filename must be provided with pdf url (use --filename)"
-            )
-        download_url(url, filename)
+        self.download_url(url, filename)

     def get_paper_info(self, src):
         return None

+    def create_filename(self, info, filename=None):
+        if filename is None:
+            exception(
+                "Filename must be provided with PDFUrlProvider (use --filename)"
+            )
+        return filename
+

 def exception(msg):
     print("ERROR: " + msg, file=sys.stderr)
@@ -386,102 +523,6 @@ def exception(msg):
     raise SystemExit(1)


-def get_page_with_retry(url):
-    """Get the content of an url, retrying up to five times on failure.
-    """
-
-    def retry(url, count):
-        if count < 5:
-            logger.info(
-                "Caught error for url %s. Retrying in 5 seconds." % url
-            )
-            time.sleep(5)
-        else:
-            exception("Failed to download url: %s" % url)
-
-    count = 0
-    while True:
-        count += 1
-        try:
-            res = requests.get(url, headers=HEADERS)
-        except requests.exceptions.ConnectionError:
-            retry(url, count)
-            continue
-        if res.ok:
-            logger.info("Downloading url: %s" % url)
-            return res.content
-        else:
-            retry(url, count)
-
-
-def download_url(url, filename):
-    """Download the content of an url and save it to a filename """
-    logger.info("Downloading file at url: %s" % url)
-    content = get_page_with_retry(url)
-    with open(filename, "wb") as fid:
-        fid.write(content)
-
-
-def dearxiv(input_file, pdftk_path="pdftk"):
-    """Remove the arXiv timestamp from a pdf"""
-    logger.info("Removing arXiv timestamp")
-    basename = os.path.splitext(input_file)[0]
-    uncompress_file = basename + "_uncompress.pdf"
-
-    status = subprocess.call(
-        [pdftk_path, input_file, "output", uncompress_file, "uncompress"]
-    )
-    if not status == 0:
-        exception("pdftk failed to uncompress the pdf.")
-
-    with open(uncompress_file, "rb") as fid:
-        data = fid.read()
-        # Remove the text element
-        data = re.sub(
-            b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
-            b"()Tj",
-            data,
-        )
-        # Remove the URL element
-        data = re.sub(
-            b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n",
-            b"",
-            data,
-        )
-
-    removed_file = basename + "_removed.pdf"
-    with open(removed_file, "wb") as oid:
-        oid.write(data)
-
-    output_file = basename + "_dearxiv.pdf"
-    status = subprocess.call(
-        [pdftk_path, removed_file, "output", output_file, "compress"]
-    )
-    if not status == 0:
-        exception("pdftk failed to compress the pdf.")
-
-    return output_file
-
-
-def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
-    remarkable_dir = remarkable_dir.rstrip("/")
-    logger.info("Starting upload to reMarkable")
-    if remarkable_dir:
-        status = subprocess.call(
-            [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL
-        )
-        if not status == 0:
-            exception(
-                "Creating directory %s on reMarkable failed" % remarkable_dir
-            )
-    status = subprocess.call(
-        [rmapi_path, "put", filepath, remarkable_dir + "/"],
-        stdout=subprocess.DEVNULL,
-    )
-    if not status == 0:
-        exception("Uploading file %s to reMarkable failed" % filepath)
-    logger.info("Upload successful.")
-
-
 def parse_args():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
@@ -529,8 +570,7 @@ def parse_args():
     return parser.parse_args()


-@logger.catch
-def newmain():
+def main():
     args = parse_args()

     providers = [
@@ -545,91 +585,18 @@ def newmain():
     if provider is None:
         exception("Input not valid, no provider can handle this source.")

-    if not args.verbose:
-        logger.remove(0)
-
-
-    start_wd = os.getcwd()
-    with tempfile.TemporaryDirector() as working_dir:
-        provider.run(args.input, debug=args.debug, upload=not args.no_upload)
-
-
-@logger.catch
-def main():
-    args = parse_args()
+    prov = provider(
+        args.verbose,
+        not args.no_upload,
+        args.debug,
+        args.remarkable_dir,
+        args.rmapi,
+        args.pdfcrop,
+        args.pdftk,
+        args.gs,
+    )

-    if os.path.exists(args.input):
-        mode = "local_file"
-    elif arxiv_url(args.input):
-        mode = "arxiv_url"
-    elif pmc_url(args.input):
-        mode = "pmc_url"
-    elif acm_url(args.input):
-        mode = "acm_url"
-    elif valid_url(args.input):
-        if args.filename is None:
-            exception(
-                "Filename must be provided with pdf url (use --filename)"
-            )
-        mode = "pdf_url"
-    else:
-        exception("Input not a valid url, arxiv url, or existing file.")
-
-    if not args.verbose:
-        logger.remove(0)
-
-    start_wd = os.getcwd()
-
-    with tempfile.TemporaryDirectory() as working_dir:
-        if mode == "local_file":
-            shutil.copy(args.input, working_dir)
-            filename = os.path.basename(args.input)
-            clean_filename = args.filename if args.filename else filename
-
-        os.chdir(working_dir)
-        if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]:
-            filename = "paper.pdf"
-            if mode == "arxiv_url":
-                pdf_url, abs_url = get_arxiv_urls(args.input)
-                paper_info = get_paper_info_arxiv(abs_url)
-            elif mode == "pmc_url":
-                pdf_url, abs_url = get_pmc_urls(args.input)
-                paper_info = get_paper_info_pmc(abs_url)
-            elif mode == "acm_url":
-                pdf_url, abs_url = get_acm_urls(args.input)
-                paper_info = get_paper_info_acm(abs_url)
-            else:
-                pdf_url = args.input
-            download_url(pdf_url, filename)
-            if not check_file_is_pdf(filename):
-                exception("Downloaded file isn't a valid pdf file.")
-            if args.filename:
-                clean_filename = args.filename
-            else:
-                clean_filename = generate_filename(paper_info)
-
-        dearxived = dearxiv(filename, pdftk_path=args.pdftk)
-        cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
-        shrinked = shrink_pdf(cropped)
-        shutil.move(shrinked, clean_filename)
-
-        if args.debug:
-            print("Paused in debug mode in dir: %s" % working_dir)
-            print("Press enter to exit.")
-            return input()
-
-        if args.no_upload:
-            if os.path.exists(os.path.join(start_wd, clean_filename)):
-                tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf"
-                shutil.move(clean_filename, os.path.join(start_wd, tmpfname))
-            else:
-                shutil.move(clean_filename, start_wd)
-        else:
-            upload_to_rm(
-                clean_filename,
-                remarkable_dir=args.remarkable_dir,
-                rmapi_path=args.rmapi,
-            )
+    prov.run(args.input, filename=args.filename)


 if __name__ == "__main__":
```
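For context, here is a minimal sketch of how the provider API introduced by this commit is meant to be used from Python. It is not part of the diff: the import path, the example arXiv URL, and the availability of `rmapi`, `pdfcrop`, `pdftk`, and `gs` on the PATH are assumptions made for illustration.

```python
# Hypothetical usage sketch of the refactored provider API after this commit.
# Assumes arxiv2remarkable.py is importable and the external tools are installed.
from arxiv2remarkable import ArxivProvider

url = "https://arxiv.org/abs/1706.03762"  # example input only

# validate() is now a staticmethod, so it can be checked before constructing a provider
if ArxivProvider.validate(url):
    prov = ArxivProvider(
        verbose=True,   # timestamped print() logging replaces the removed loguru dependency
        upload=False,   # keep the cleaned pdf locally instead of calling rmapi
        debug=False,
    )
    # run() downloads the pdf, strips the arXiv stamp, crops, shrinks, and then
    # either uploads it to the reMarkable or returns the path of the local copy
    out_path = prov.run(url)
    print(out_path)
```

The command-line entry point does the same thing: `main()` picks the first provider whose `validate()` accepts the input and then calls `prov.run(args.input, filename=args.filename)`, as shown at the end of the diff.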
