| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-05-21 10:40:38 -0400 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-05-21 10:40:38 -0400 |
| commit | 66a9a4d63068fb32e12f61d1fe7612586d6dc30e (patch) | |
| tree | 4a30dfe315a604fe6f91441335f4dd646c29506d | |
| parent | Merge branch 'master' into wip_gertjan (diff) | |
| download | paper2remarkable-66a9a4d63068fb32e12f61d1fe7612586d6dc30e.tar.gz, paper2remarkable-66a9a4d63068fb32e12f61d1fe7612586d6dc30e.zip | |
more work in progress
| -rwxr-xr-x | arxiv2remarkable.py | 479 |
1 file changed, 254 insertions(+), 225 deletions(-)
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 5152018..9686ee1 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+__version__ = "0.2.0"
+__author__ = "G.J.J. van den Burg"
+
 """
 Given an arXiv paper url this script:
@@ -47,8 +50,9 @@ HEADERS = {
 class Provider(metaclass=abc.ABCMeta):
     """ ABC for providers of pdf sources """
 
-    def __init__(self):
-        pass
+    def __init__(self, remarkable_dir="/", rmapi_path="rmapi"):
+        self.remarkable_dir = remarkable_dir
+        self.rmapi_path = rmapi_path
 
     @staticmethod
     @abc.abstractmethod
@@ -79,10 +83,78 @@ class Provider(metaclass=abc.ABCMeta):
         )
         title_part = titlecase.titlecase(title)
         year_part = info["date"].split("/")[0]
-        return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+        name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+        logger.info("Created filename: %s" % name)
+        return name
+
+    def crop_pdf(self, filepath):
+        logger.info("Cropping pdf file")
+        status = subprocess.call(
+            [self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
+            stdout=subprocess.DEVNULL,
+        )
+        if not status == 0:
+            logger.warning("Failed to crop the pdf file at: %s" % filepath)
+            return filepath
+        cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+        if not os.path.exists(cropped_file):
+            logger.warning(
+                "Can't find cropped file '%s' where expected." % cropped_file
+            )
+            return filepath
+        return cropped_file
+
+    def shrink_pdf(self, filepath):
+        logger.info("Shrinking pdf file")
+        output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+        status = subprocess.call(
+            [
+                self.gs_path,
+                "-sDEVICE=pdfwrite",
+                "-dCompatibilityLevel=1.4",
+                "-dPDFSETTINGS=/printer",
+                "-dNOPAUSE",
+                "-dBATCH",
+                "-dQUIET",
+                "-sOutputFile=%s" % output_file,
+                filepath,
+            ]
+        )
+        if not status == 0:
+            logger.warning("Failed to shrink the pdf file")
+            return filepath
+        return output_file
 
-    def run(self, src, filename=None):
-        info = get_paper_info(src)
+    def check_file_is_pdf(self, filename):
+        try:
+            PyPDF2.PdfFileReader(open(filename, "rb"))
+            return True
+        except PyPDF2.utils.PdfReadError:
+            exception("Downloaded file isn't a valid pdf file.")
+
+    def upload_to_rm(self, filepath):
+        remarkable_dir = self.remarkable_dir.rstrip("/")
+        logger.info("Starting upload to reMarkable")
+        if remarkable_dir:
+            status = subprocess.call(
+                [self.rmapi_path, "mkdir", remarkable_dir],
+                stdout=subprocess.DEVNULL,
+            )
+            if not status == 0:
+                exception(
+                    "Creating directory %s on reMarkable failed"
+                    % remarkable_dir
+                )
+        status = subprocess.call(
+            [self.rmapi_path, "put", filepath, remarkable_dir + "/"],
+            stdout=subprocess.DEVNULL,
+        )
+        if not status == 0:
+            exception("Uploading file %s to reMarkable failed" % filepath)
+        logger.info("Upload successful.")
+
+    def run(self, src, filename=None, debug=False, upload=True):
+        info = self.get_paper_info(src)
         clean_filename = self.create_filename(info, filename)
         tmp_filename = "paper.pdf"
         self.retrieve_pdf(src, tmp_filename)
@@ -93,21 +165,24 @@ class Provider(metaclass=abc.ABCMeta):
         for op in ops:
             intermediate_fname = op(tmp_filename)
         shutil.move(intermediate_fname, clean_filename)
-        # TODO: here
-
-
-
-
+        if debug:
+            print("Paused in debug mode in dir: %s" % working_dir)
+            print("Press enter to exit.")
+            return input()
+        if upload:
+            return self.upload_to_rm(clean_filename)
+        if os.path.exists(os.path.join(start_wd, clean_filename)):
+            tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf"
+            shutil.move(clean_filename, os.path.join(start_wd, tmpfname))
+        else:
+            shutil.move(clean_filename, start_wd)
 
 
 class ArxivProvider(Provider):
-    def __init__(self):
-        super().__init__()
-        self.abs_url = None
-        self.pdf_url = None
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
     def get_abs_pdf_urls(self, url):
         """Get the pdf and abs url from any given arXiv url """
@@ -124,23 +199,7 @@ class ArxivProvider(Provider):
         return abs_url, pdf_url
 
     def validate(self, src):
-        """Check if the url is to an arXiv page.
-
-        >>> validate_url("https://arxiv.org/abs/1811.11242")
-        True
-        >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
-        True
-        >>> validate_url("http://arxiv.org/abs/1811.11242")
-        True
-        >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
-        True
-        >>> validate_url("https://arxiv.org/abs/1811.11242v1")
-        True
-        >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
-        True
-        >>> validate_url("https://gertjanvandenburg.com")
-        False
-        """
+        """Check if the url is to an arXiv page. """
         m = re.match(
             "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src
         )
@@ -166,87 +225,165 @@ class ArxivProvider(Provider):
         date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
         return dict(title=title, date=date, authors=authors)
 
+class PMCProvider(Provider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
-def exception(msg):
-    print("ERROR: " + msg, file=sys.stderr)
-    print("Error occurred. Exiting.", file=sys.stderr)
-    raise SystemExit(1)
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and html url from a given PMC url """
+        if re.match(
+            "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf",
+            url,
+        ):
+            idx = url.index("pdf")
+            abs_url = url[: idx - 1]
+            pdf_url = url
+        elif re.match(
+            "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url
+        ):
+            abs_url = url
+            pdf_url = url.rstrip("/") + "/pdf"  # it redirects, usually
+        else:
+            exception("Couldn't figure out PMC urls.")
+        return pdf_url, abs_url
 
+    def validate(self, src):
+        m = re.fullmatch(
+            "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src
+        )
+        return not m is None
 
-def pmc_url(url):
-    m = re.fullmatch(
-        "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url
-    )
-    return not m is None
-
-
-def acm_url(url):
-    m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url)
-    return not m is None
-
-
-def valid_url(url):
-    try:
-        result = urllib.parse.urlparse(url)
-        return all([result.scheme, result.netloc, result.path])
-    except:
-        return False
-
-
-def check_file_is_pdf(filename):
-    try:
-        PyPDF2.PdfFileReader(open(filename, "rb"))
-        return True
-    except PyPDF2.utils.PdfReadError:
-        return False
-
-
-def get_pmc_urls(url):
-    """Get the pdf and html url from a given PMC url """
-    if re.match(
-        "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf",
-        url,
-    ):
-        idx = url.index("pdf")
-        abs_url = url[: idx - 1]
-        pdf_url = url
-    elif re.match("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url):
-        abs_url = url
-        pdf_url = url.rstrip("/") + "/pdf"  # it redirects, usually
-    else:
-        exception("Couldn't figure out PMC urls.")
-    return pdf_url, abs_url
-
-
-def get_acm_pdf_url(url):
-    page = get_page_with_retry(url)
-    soup = bs4.BeautifulSoup(page, "html.parser")
-    thea = None
-    for a in soup.find_all("a"):
-        if a.get("name") == "FullTextPDF":
-            thea = a
-            break
-    if thea is None:
+    def retrieve_pdf(self, src, filename):
+        _, pdf_url = self.get_abs_pdf_urls(src)
+        download_url(pdf_url, filename)
+
+    def get_paper_info(self, src):
+        """ Extract the paper's authors, title, and publication year """
+        logger.info("Getting paper info from PMC")
+        page = get_page_with_retry(src)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": "citation_authors"})
+        ]
+        # We only use last names, and this method is a guess at best. I'm open to
+        # more advanced approaches.
+        authors = [
+            x.strip().split(" ")[-1].strip() for x in authors[0].split(",")
+        ]
+        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+        if re.match("\w+\ \d{4}", date):
+            date = date.split(" ")[-1]
+        else:
+            date = date.replace(" ", "_")
+        return dict(title=title, date=date, authors=authors)
+
+
+class ACMProvider(Provider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_acm_pdf_url(self, url):
+        page = get_page_with_retry(url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        thea = None
+        for a in soup.find_all("a"):
+            if a.get("name") == "FullTextPDF":
+                thea = a
+                break
+        if thea is None:
+            return None
+        href = thea.get("href")
+        if href.startswith("http"):
+            return href
+        else:
+            return "https://dl.acm.org/" + href
+
+    def get_abs_pdf_urls(self, url):
+        if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url):
+            abs_url = url
+            pdf_url = self.get_acm_pdf_url(url)
+            if pdf_url is None:
+                exception("Couldn't extract PDF url from ACM citation page.")
+        else:
+            exception(
+                "Couldn't figure out ACM urls, please provide a URL of the "
+                "format: http(s)://dl.acm.org/citation.cfm?id=..."
+            )
+        return pdf_url, abs_url
+
+    def retrieve_pdf(self, src, filename):
+        _, pdf_url = self.get_abs_pdf_urls(src)
+        download_url(pdf_url, filename)
+
+    def validate(self, src):
+        m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src)
+        return not m is None
+
+    def get_paper_info(self, src):
+        """ Extract the paper's authors, title, and publication year """
+        logger.info("Getting paper info from ACM")
+        page = get_page_with_retry(src)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": "citation_authors"})
+        ]
+        # We only use last names, and this method is a guess. I'm open to more
+        # advanced approaches.
+        authors = [
+            x.strip().split(",")[0].strip() for x in authors[0].split(";")
+        ]
+        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+        if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
+            logger.warning(
+                "Couldn't extract year from ACM page, please raise an "
+                "issue on GitHub so I can fix it: %s",
+                GITHUB_URL,
+            )
+        date = date.strip().split("/")[-1]
+        return dict(title=title, date=date, authors=authors)
+
+
+class LocalFileProvider(Provider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def validate(self, src):
+        return os.path.exists(src)
+
+    def retrieve_pdf(self, src, filename):
+        shutil.copy(src, filename)
+
+    def get_paper_info(self, src):
         return None
-    href = thea.get("href")
-    if href.startswith("http"):
-        return href
-    else:
-        return "https://dl.acm.org/" + href
 
+
+class PdfUrlProvider(Provider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
-def get_acm_urls(url):
-    if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url):
-        abs_url = url
-        pdf_url = get_acm_pdf_url(url)
-        if pdf_url is None:
-            exception("Couldn't extract PDF url from ACM citation page.")
-    else:
-        exception(
-            "Couldn't figure out ACM urls, please provide a URL of the "
-            "format: http(s)://dl.acm.org/citation.cfm?id=..."
-        )
-    return pdf_url, abs_url
+    def validate(self, src):
+        try:
+            result = urllib.parse.urlparse(src)
+            return all([result.scheme, result.netloc, result.path])
+        except:
+            return False
+
+    def retrieve_pdf(self, url, filename):
+        if filename is None:
+            exception(
+                "Filename must be provided with pdf url (use --filename)"
+            )
+        download_url(url, filename)
+
+    def get_paper_info(self, src):
+        return None
+
+
+def exception(msg):
+    print("ERROR: " + msg, file=sys.stderr)
+    print("Error occurred. Exiting.", file=sys.stderr)
+    raise SystemExit(1)
 
 
 def get_page_with_retry(url):
@@ -325,123 +462,6 @@ def dearxiv(input_file, pdftk_path="pdftk"):
     return output_file
 
 
-def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
-    logger.info("Cropping pdf file")
-    status = subprocess.call(
-        [pdfcrop_path, "--margins", "15 40 15 15", filepath],
-        stdout=subprocess.DEVNULL,
-    )
-    if not status == 0:
-        logger.warning("Failed to crop the pdf file at: %s" % filepath)
-        return filepath
-    cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
-    if not os.path.exists(cropped_file):
-        logger.warning(
-            "Can't find cropped file '%s' where expected." % cropped_file
-        )
-        return filepath
-    return cropped_file
-
-
-def shrink_pdf(filepath, gs_path="gs"):
-    logger.info("Shrinking pdf file")
-    output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
-    status = subprocess.call(
-        [
-            "gs",
-            "-sDEVICE=pdfwrite",
-            "-dCompatibilityLevel=1.4",
-            "-dPDFSETTINGS=/printer",
-            "-dNOPAUSE",
-            "-dBATCH",
-            "-dQUIET",
-            "-sOutputFile=%s" % output_file,
-            filepath,
-        ]
-    )
-    if not status == 0:
-        logger.warning("Failed to shrink the pdf file")
-        return filepath
-    return output_file
-
-
-def get_paper_info_arxiv(url):
-    """ Extract the paper's authors, title, and publication year """
-    logger.info("Getting paper info from arXiv")
-    page = get_page_with_retry(url)
-    soup = bs4.BeautifulSoup(page, "html.parser")
-    authors = [
-        x["content"]
-        for x in soup.find_all("meta", {"name": "citation_author"})
-    ]
-    authors = [x.split(",")[0].strip() for x in authors]
-    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
-    return dict(title=title, date=date, authors=authors)
-
-
-def get_paper_info_pmc(url):
-    """ Extract the paper's authors, title, and publication year """
-    logger.info("Getting paper info from PMC")
-    page = get_page_with_retry(url)
-    soup = bs4.BeautifulSoup(page, "html.parser")
-    authors = [
-        x["content"]
-        for x in soup.find_all("meta", {"name": "citation_authors"})
-    ]
-    # We only use last names, and this method is a guess at best. I'm open to
-    # more advanced approaches.
-    authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")]
-    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
-    if re.match("\w+\ \d{4}", date):
-        date = date.split(" ")[-1]
-    else:
-        date = date.replace(" ", "_")
-    return dict(title=title, date=date, authors=authors)
-
-
-def get_paper_info_acm(url):
-    """ Extract the paper's authors, title, and publication year """
-    logger.info("Getting paper info from ACM")
-    page = get_page_with_retry(url)
-    soup = bs4.BeautifulSoup(page, "html.parser")
-    authors = [
-        x["content"]
-        for x in soup.find_all("meta", {"name": "citation_authors"})
-    ]
-    # We only use last names, and this method is a guess. I'm open to more
-    # advanced approaches.
-    authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")]
-    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
-    if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
-        logger.warning(
-            "Couldn't extract year from ACM page, please raise an "
-            "issue on GitHub so I can fix it: %s",
-            GITHUB_URL,
-        )
-    date = date.strip().split("/")[-1]
-    return dict(title=title, date=date, authors=authors)
-
-
-def generate_filename(info):
-    """ Generate a nice filename for a paper given the info dict """
-    # we assume that the list of authors is lastname only.
-    logger.info("Generating output filename")
-    if len(info["authors"]) > 3:
-        author_part = info["authors"][0] + "_et_al"
-    else:
-        author_part = "_".join(info["authors"])
-    author_part = author_part.replace(" ", "_")
-    title = info["title"].replace(",", "").replace(":", "").replace(" ", "_")
-    title_part = titlecase.titlecase(title)
-    year_part = info["date"].split("/")[0]
-    name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
-    logger.info("Created filename: %s" % name)
-    return name
-
-
 def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
     remarkable_dir = remarkable_dir.rstrip("/")
     logger.info("Starting upload to reMarkable")
@@ -513,6 +533,14 @@ def parse_args():
 def newmain():
     args = parse_args()
 
+    providers = [
+        ArxivProvider,
+        PMCProvider,
+        ACMProvider,
+        LocalFileProvider,
+        PdfUrlProvider,
+    ]
+    provider = next((p for p in providers if p.validate(args.input)), None)
     if provider is None:
         exception("Input not valid, no provider can handle this source.")
@@ -520,9 +548,10 @@
     if not args.verbose:
         logger.remove(0)
 
+    start_wd = os.getcwd()
     with tempfile.TemporaryDirectory() as working_dir:
-        provider.run(args.input)
+        provider.run(args.input, debug=args.debug, upload=not args.no_upload)
 
 
 @logger.catch
