[WIP] rewrite to use Providers

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-04-05 18:15:10 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-04-05 18:15:10 +0100
commit: 92786bc2f898575f80845f9b7111baa3d386734a (patch)
tree: caeb10dd6c989ca2a3dd76ebf47db88cb58f9988 /arxiv2remarkable.py
parent: Simplify the code (diff)
download: paper2remarkable-92786bc2f898575f80845f9b7111baa3d386734a.tar.gz
paper2remarkable-92786bc2f898575f80845f9b7111baa3d386734a.zip
1 files changed, 140 insertions, 37 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 61a3667..a0b4a94 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -18,6 +18,7 @@ License: MIT
 
 """
 
+import abc
 import PyPDF2
 import argparse
 import bs4
@@ -43,36 +44,135 @@ HEADERS = {
 }
 
 
+class Provider(metaclass=abc.ABCMeta):
+    """ ABC for providers of pdf sources """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def validate(self, src):
+        """ Validate whether ``src`` is appropriate for this provider """
+
+    @abc.abstractmethod
+    def retrieve_pdf(self, src, filename):
+        """ Download pdf from src and save to filename """
+
+    @abc.abstractmethod
+    def get_paper_info(self, src):
+        """ Retrieve the title/author (surnames)/year information """
+
+    def create_filename(self, info, filename=None):
+        """ Generate filename using the info dict or filename if provided """
+        if not filename is None:
+            return filename
+        # we assume that the list of authors is surname only.
+        logger.info("Generating output filename")
+        if len(info["authors"]) > 3:
+            author_part = info["authors"][0] + "_et_al"
+        else:
+            author_part = "_".join(info["authors"])
+        author_part = author_part.replace(" ", "_")
+        title = (
+            info["title"].replace(",", "").replace(":", "").replace(" ", "_")
+        )
+        title_part = titlecase.titlecase(title)
+        year_part = info["date"].split("/")[0]
+        return author_part + "_-_" + title_part + "_" + year_part + ".pdf"
+
+    def run(self, src, filename=None):
+        info = get_paper_info(src)
+        clean_filename = self.create_filename(info, filename)
+        tmp_filename = "paper.pdf"
+        self.retrieve_pdf(src, tmp_filename)
+        self.check_file_is_pdf(tmp_filename)
+
+        ops = [self.dearxiv, self.crop, self.shrink]
+        intermediate_fname = tmp_filename
+        for op in ops:
+            intermediate_fname = op(tmp_filename)
+        shutil.move(intermediate_fname, clean_filename)
+        # TODO: here
+
+
+
+
+
+
+
+
+
+class ArxivProvider(Provider):
+    def __init__(self):
+        super().__init__()
+        self.abs_url = None
+        self.pdf_url = None
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and abs url from any given arXiv url """
+        if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
+            abs_url = url
+            pdf_url = url.replace("abs", "pdf") + ".pdf"
+        elif re.match(
+            "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url
+        ):
+            abs_url = url[:-4].replace("pdf", "abs")
+            pdf_url = url
+        else:
+            exception("Couldn't figure out arXiv urls.")
+        return abs_url, pdf_url
+
+    def validate(self, src):
+        """Check if the url is to an arXiv page.
+
+        >>> validate_url("https://arxiv.org/abs/1811.11242")
+        True
+        >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
+        True
+        >>> validate_url("http://arxiv.org/abs/1811.11242")
+        True
+        >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
+        True
+        >>> validate_url("https://arxiv.org/abs/1811.11242v1")
+        True
+        >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
+        True
+        >>> validate_url("https://gertjanvandenburg.com")
+        False
+        """
+        m = re.match(
+            "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src
+        )
+        return not m is None
+
+    def retrieve_pdf(self, src, filename):
+        """ Download the file and save as filename """
+        _, pdf_url = self.get_abs_pdf_urls(src)
+        download_url(pdf_url, filename)
+
+    def get_paper_info(self, src):
+        """ Extract the paper's authors, title, and publication year """
+        abs_url, _ = self.get_abs_pdf_urls(src)
+        logger.info("Getting paper info from arXiv")
+        page = get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": "citation_author"})
+        ]
+        authors = [x.split(",")[0].strip() for x in authors]
+        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
+        return dict(title=title, date=date, authors=authors)
+
+
 def exception(msg):
     print("ERROR: " + msg, file=sys.stderr)
     print("Error occurred. Exiting.", file=sys.stderr)
     raise SystemExit(1)
 
 
-def arxiv_url(url):
-    """Check if the url is to an arXiv page.
-
-    >>> validate_url("https://arxiv.org/abs/1811.11242")
-    True
-    >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
-    True
-    >>> validate_url("http://arxiv.org/abs/1811.11242")
-    True
-    >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
-    True
-    >>> validate_url("https://arxiv.org/abs/1811.11242v1")
-    True
-    >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
-    True
-    >>> validate_url("https://gertjanvandenburg.com")
-    False
-    """
-    m = re.match(
-        "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url
-    )
-    return not m is None
-
-
 def pmc_url(url):
     m = re.fullmatch(
         "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url
@@ -101,19 +201,6 @@ def check_file_is_pdf(filename):
         return False
 
 
-def get_arxiv_urls(url):
-    """Get the pdf and abs url from any given arXiv url """
-    if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
-        abs_url = url
-        pdf_url = url.replace("abs", "pdf") + ".pdf"
-    elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url):
-        abs_url = url[:-4].replace("pdf", "abs")
-        pdf_url = url
-    else:
-        exception("Couldn't figure out arXiv urls.")
-    return pdf_url, abs_url
-
-
 def get_pmc_urls(url):
     """Get the pdf and html url from a given PMC url """
     if re.match(
@@ -421,6 +508,22 @@ def parse_args():
 
 
 @logger.catch
+def newmain():
+    args = parse_args()
+
+    provider = next((p for p in providers if p.validate(args.input)), None)
+    if provider is None:
+        exception("Input not valid, no provider can handle this source.")
+
+    if not args.verbose:
+        logger.remove(0)
+
+    start_wd = os.getcwd()
+    with tempfile.TemporaryDirector() as working_dir:
+        provider.run(args.input)
+
+
+@logger.catch
 def main():
     args = parse_args()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-04-05 18:15:10 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2019-04-05 18:15:10 +0100
commit	92786bc2f898575f80845f9b7111baa3d386734a (patch)
tree	caeb10dd6c989ca2a3dd76ebf47db88cb58f9988 /arxiv2remarkable.py
parent	Simplify the code (diff)
download	paper2remarkable-92786bc2f898575f80845f9b7111baa3d386734a.tar.gz paper2remarkable-92786bc2f898575f80845f9b7111baa3d386734a.zip