From 6f0ef966248a7e79e26388ff76eed900e87ffe44 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 19 Aug 2019 18:28:19 +0100
Subject: Unify ``get_paper_info``

---
 arxiv2remarkable.py | 164 ++++++++++++++++++++++++----------------------------
 1 file changed, 75 insertions(+), 89 deletions(-)

diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index bf999fb..01ab273 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -45,6 +45,10 @@ RM_HEIGHT = 1872
 class Provider(metaclass=abc.ABCMeta):
     """ ABC for providers of pdf sources """
 
+    meta_author_key = "citation_author"
+    meta_title_key = "citation_title"
+    meta_date_key = "citation_date"
+
     def __init__(
         self,
         verbose=False,
@@ -97,9 +101,47 @@ class Provider(metaclass=abc.ABCMeta):
     def retrieve_pdf(self, src, filename):
         """ Download pdf from src and save to filename """
 
-    @abc.abstractmethod
-    def get_paper_info(self, src):
+    def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
+        op = (lambda x: x) if op is None else op
+        # format the author list retrieved by bs4
+        return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
+
+    def get_authors(self, soup):
+        authors = [
+            x["content"]
+            for x in soup.find_all("meta", {"name": self.meta_author_key})
+        ]
+        return self._format_authors(authors)
+
+    def get_title(self, soup):
+        target = soup.find_all("meta", {"name": self.meta_title_key})
+        return target[0]["content"]
+
+    def _format_date(self, soup_date):
+        return soup_date
+
+    def get_date(self, soup):
+        date = soup.find_all("meta", {"name": self.meta_date_key})[0][
+            "content"
+        ]
+        return self._format_date(date)
+
+    def get_paper_info(
+        self,
+        src,
+        author_key="citation_author",
+        title_key="citation_title",
+        date_key="citation_date",
+    ):
         """ Retrieve the title/author (surnames)/year information """
+        abs_url, _ = self.get_abs_pdf_urls(src)
+        self.log("Getting paper info")
+        page = self.get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = self.get_authors(soup)
+        title = self.get_title(soup)
+        date = self.get_date(soup)
+        return dict(title=title, date=date, authors=authors)
 
     def create_filename(self, info, filename=None):
         """ Generate filename using the info dict or filename if provided """
@@ -363,23 +405,11 @@ class Arxiv(Provider):
         _, pdf_url = self.get_abs_pdf_urls(src)
         self.download_url(pdf_url, filename)
 
-    def get_paper_info(self, src):
-        """ Extract the paper's authors, title, and publication year """
-        abs_url, _ = self.get_abs_pdf_urls(src)
-        self.log("Getting paper info from arXiv")
-        page = self.get_page_with_retry(abs_url)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": "citation_author"})
-        ]
-        authors = [x.split(",")[0].strip() for x in authors]
-        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
-        return dict(title=title, date=date, authors=authors)
-
 
 class Pubmed(Provider):
+
+    meta_author_key = "citation_authors"
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -411,30 +441,20 @@ class Pubmed(Provider):
         _, pdf_url = self.get_abs_pdf_urls(src)
         self.download_url(pdf_url, filename)
 
-    def get_paper_info(self, src):
-        """ Extract the paper's authors, title, and publication year """
-        self.log("Getting paper info from PMC")
-        page = self.get_page_with_retry(src)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": "citation_authors"})
-        ]
-        # We only use last names, and this method is a guess at best. I'm open to
-        # more advanced approaches.
-        authors = [
-            x.strip().split(" ")[-1].strip() for x in authors[0].split(",")
-        ]
-        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
-        if re.match("\w+\ \d{4}", date):
-            date = date.split(" ")[-1]
-        else:
-            date = date.replace(" ", "_")
-        return dict(title=title, date=date, authors=authors)
+    def _format_authors(self, soup_authors):
+        op = lambda x: x[0].split(",")
+        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+    def _format_date(self, soup_date):
+        if re.match(r"\w+\ \d{4}", soup_date):  # e.g. "Aug 2019" -> "2019"
+            return soup_date.split(" ")[-1]
+        return soup_date.replace(" ", "_")
 
 
 class ACM(Provider):
+
+    meta_author_key = "citation_authors"
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -477,32 +497,23 @@ class ACM(Provider):
         m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src)
         return not m is None
 
-    def get_paper_info(self, src):
-        """ Extract the paper's authors, title, and publication year """
-        self.log("Getting paper info from ACM")
-        page = self.get_page_with_retry(src)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": "citation_authors"})
-        ]
-        # We only use last names, and this method is a guess. I'm open to more
-        # advanced approaches.
-        authors = [
-            x.strip().split(",")[0].strip() for x in authors[0].split(";")
-        ]
-        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-        date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
-        if not re.match("\d{2}/\d{2}/\d{4}", date.strip()):
+    def _format_authors(self, soup_authors):
+        op = lambda x: x[0].split(";")
+        return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+
+    def _format_date(self, soup_date):
+        if not re.match(r"\d{2}/\d{2}/\d{4}", soup_date.strip()):
             self.warn(
                 "Couldn't extract year from ACM page, please raise an "
-                "issue on GitHub so I can fix it: %s" % GITHUB_URL
+                "issue on GitHub so it can be fixed: %s" % GITHUB_URL
             )
-        date = date.strip().split("/")[-1]
-        return dict(title=title, date=date, authors=authors)
+        return soup_date.strip().split("/")[-1]  # text after the last "/"
 
 
 class OpenReview(Provider):
+
+    meta_date_key = "citation_publication_date"
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -530,26 +541,14 @@ class OpenReview(Provider):
         _, pdf_url = self.get_abs_pdf_urls(src)
         self.download_url(pdf_url, filename)
 
-    def get_paper_info(self, src):
-        """ Extract the paper's authors, title, and publication year """
-        abs_url, _ = self.get_abs_pdf_urls(src)
-        self.log("Getting paper info from OpenReview")
-        page = self.get_page_with_retry(abs_url)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": "citation_author"})
-        ]
-        authors = [x.split(" ")[-1].strip() for x in authors]
-        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-        date = soup.find_all("meta", {"name": "citation_publication_date"})[0][
-            "content"
-        ]
-        return dict(title=title, date=date, authors=authors)
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
 
 class Springer(Provider):
 
+    meta_date_key = "citation_online_date"
+
     re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
     re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
 
@@ -575,21 +574,8 @@ class Springer(Provider):
         _, pdf_url = self.get_abs_pdf_urls(src)
         self.download_url(pdf_url, filename)
 
-    def get_paper_info(self, src):
-        abs_url, _ = self.get_abs_pdf_urls(src)
-        self.log("Getting paper info from Springer")
-        page = self.get_page_with_retry(abs_url)
-        soup = bs4.BeautifulSoup(page, "html.parser")
-        authors = [
-            x["content"]
-            for x in soup.find_all("meta", {"name": "citation_author"})
-        ]
-        authors = [x.split(" ")[-1].strip() for x in authors]
-        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
-        date = soup.find_all("meta", {"name": "citation_online_date"})[0][
-            "content"
-        ]
-        return dict(title=title, date=date, authors=authors)
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
 
 class LocalFile(Provider):
-- 
cgit v1.2.3