From 0a6a4ff3893474e33f71ef2d8a881cc360a29094 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Wed, 15 Jul 2020 16:23:12 +0100
Subject: Improve robustness of springer provider

Adds support for downloading chapters
---
 paper2remarkable/providers/springer.py | 37 +++++++++++++++++++++++++++++-----
 tests/test_providers.py                |  9 ++++++++-
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index 5ce2564..dea8bd5 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -10,10 +10,12 @@ Copyright: 2019, G.J.J. van den Burg
 
 import re
 import urllib
+import requests
 
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
+from ..utils import HEADERS
 
 
 class SpringerInformer(Informer):
@@ -26,24 +28,49 @@ class SpringerInformer(Informer):
 
 class Springer(Provider):
 
-    re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
-    re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
+    re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
+    re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+"
+    re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.informer = SpringerInformer()
 
+    def _get_abs_url(self, pdf_url):
+        article_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")]
+        req = requests.head(
+            article_url, headers=HEADERS, cookies=self.cookiejar
+        )
+        if req.status_code == 200:
+            return article_url
+
+        chapter_url = pdf_url.replace("content/pdf", "chapter")[: -len(".pdf")]
+        req = requests.head(
+            chapter_url, headers=HEADERS, cookies=self.cookiejar
+        )
+        if req.status_code == 200:
+            return chapter_url
+
+        raise URLResolutionError("Springer", pdf_url)
+
     def get_abs_pdf_urls(self, url):
         """ Get the pdf and abstract urls from a Springer url """
-        if re.match(self.re_abs, url):
+        if re.match(self.re_abs_1, url):
             abs_url = url
             pdf_url = url.replace("article", "content/pdf")
+        elif re.match(self.re_abs_2, url):
+            abs_url = url
+            pdf_url = url.replace("chapter", "content/pdf")
         elif re.match(self.re_pdf, url):
-            abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
+            abs_url = self._get_abs_url(url)
             pdf_url = urllib.parse.unquote(url)
         else:
             raise URLResolutionError("Springer", url)
         return abs_url, pdf_url
 
     def validate(src):
-        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
+        return (
+            re.match(Springer.re_abs_1, src)
+            or re.match(Springer.re_abs_2, src)
+            or re.match(Springer.re_pdf, src)
+        )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 1a6f84f..5c8a8e4 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -126,13 +126,20 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp_filename, os.path.basename(filename))
 
-    def test_springer(self):
+    def test_springer_1(self):
         prov = Springer(upload=False, verbose=VERBOSE)
         url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
         exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
         filename = prov.run(url)
         self.assertEqual(exp_filename, os.path.basename(filename))
 
+    def test_springer_2(self):
+        prov = Springer(upload=False, verbose=VERBOSE)
+        url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf"
+        exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
     def test_local(self):
         local_filename = "test.pdf"
         with open(local_filename, "w") as fp:
-- 
cgit v1.2.3


From d36bda173d5488e23ec918d4bd51c3e6fd76ae06 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Wed, 15 Jul 2020 16:37:16 +0100
Subject: Improve publication date extraction

---
 paper2remarkable/providers/springer.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index dea8bd5..f9dc952 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -20,11 +20,23 @@ from ..utils import HEADERS
 
 class SpringerInformer(Informer):
 
-    meta_date_key = "citation_online_date"
+    meta_date_key = None
 
     def _format_authors(self, soup_authors):
         return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
+    def get_year(self, soup):
+        meta = soup.find_all('meta', {'name': 'citation_online_date'})
+        if meta:
+            date = meta[0]['content']
+            return self._format_year(date)
+        meta = soup.find_all('meta', {'name': 'citation_publication_date'})
+        if meta:
+            date = meta[0]['content']
+            return self._format_year(date)
+        return ''
+
+
 
 class Springer(Provider):
 
-- 
cgit v1.2.3


From f1f6ec91ca263e2e47357f4ddfd7e0e746fd93e7 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Wed, 15 Jul 2020 16:42:25 +0100
Subject: simplify code

---
 paper2remarkable/providers/springer.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index f9dc952..31f0a67 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -26,16 +26,12 @@ class SpringerInformer(Informer):
         return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
     def get_year(self, soup):
-        meta = soup.find_all('meta', {'name': 'citation_online_date'})
-        if meta:
-            date = meta[0]['content']
-            return self._format_year(date)
-        meta = soup.find_all('meta', {'name': 'citation_publication_date'})
-        if meta:
-            date = meta[0]['content']
-            return self._format_year(date)
-        return ''
-
+        for key in ["citation_online_date", "citation_publication_date"]:
+            meta = soup.find_all("meta", {"name": key})
+            if not meta:
+                continue
+            return self._format_year(meta[0]["content"])
+        return ""
 
 
 class Springer(Provider):
-- 
cgit v1.2.3