diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-08-19 17:42:47 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-08-19 17:42:47 +0100 |
| commit | 5965e2d32538defe6c79edabe177747ca2c114bc (patch) | |
| tree | a4cdcf650e835b851fa550f36b644e828afa9ad6 | |
| parent | Simplify provider names (diff) | |
| download | paper2remarkable-5965e2d32538defe6c79edabe177747ca2c114bc.tar.gz paper2remarkable-5965e2d32538defe6c79edabe177747ca2c114bc.zip | |
Add provider for Springer papers
| -rwxr-xr-x | arxiv2remarkable.py | 53 |
1 files changed, 45 insertions, 8 deletions
class Springer(Provider):
    """Provider for papers hosted on link.springer.com.

    Recognizes both url forms for a Springer paper:

    * abstract page:  https://link.springer.com/article/10.XXXX/<slug>
    * direct pdf:     https://link.springer.com/content/pdf/10.XXXX/<slug>.pdf
      (the DOI slash may also appear percent-encoded as ``%2F``)
    """

    # Raw strings avoid the invalid escape sequences (\/, \d, \-) that a
    # plain string literal would warn about, and the dots in the hostname
    # are escaped so they match a literal '.' instead of any character.
    re_abs = r"https?://link\.springer\.com/article/10\.\d{4}/[a-z0-9\-]+"
    re_pdf = (
        r"https?://link\.springer\.com/content/pdf/10\.\d{4}"
        r"(%2F|/)[a-z0-9\-]+\.pdf"
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_abs_pdf_urls(self, url):
        """Get the pdf and abstract urls from a Springer url.

        Given either url form, derive the other by swapping the
        ``article`` and ``content/pdf`` path segments.  Calls the
        module-level ``exception`` helper when *url* matches neither
        pattern (which is expected to abort — TODO confirm it raises).
        """
        if re.match(self.re_abs, url):
            abs_url = url
            pdf_url = url.replace("article", "content/pdf")
        elif re.match(self.re_pdf, url):
            abs_url = url.replace("content/pdf", "article")
            pdf_url = url
        else:
            exception("Couldn't figure out Springer urls.")
        return abs_url, pdf_url

    @staticmethod
    def validate(src):
        """Return a match object (truthy) iff *src* is a Springer url.

        Decorated as a staticmethod because it is invoked on the class
        object itself during provider selection in ``main()``.
        """
        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)

    def retrieve_pdf(self, src, filename):
        """Download the pdf for *src* and save it as *filename*."""
        _, pdf_url = self.get_abs_pdf_urls(src)
        self.download_url(pdf_url, filename)

    def get_paper_info(self, src):
        """Scrape title, date, and author surnames from the abstract page."""
        abs_url, _ = self.get_abs_pdf_urls(src)
        self.log("Getting paper info from Springer")
        page = self.get_page_with_retry(abs_url)
        soup = bs4.BeautifulSoup(page, "html.parser")
        authors = [
            x["content"]
            for x in soup.find_all("meta", {"name": "citation_author"})
        ]
        # Keep only the surname: the last whitespace-separated token of
        # each citation_author meta value.
        authors = [x.split(" ")[-1].strip() for x in authors]
        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
        date = soup.find_all("meta", {"name": "citation_online_date"})[0][
            "content"
        ]
        return dict(title=title, date=date, authors=authors)
Springer, LocalFile, PdfUrl] provider = next((p for p in providers if p.validate(args.input)), None) if provider is None: |
