about summary refs log tree commit diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-07-15 16:23:12 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-07-15 16:23:12 +0100
commit 0a6a4ff3893474e33f71ef2d8a881cc360a29094 (patch)
tree 7bc838830b30028f705d32b6fe9fefe884ffb608
parent Bump version and update changelog (diff)
download paper2remarkable-0a6a4ff3893474e33f71ef2d8a881cc360a29094.tar.gz
paper2remarkable-0a6a4ff3893474e33f71ef2d8a881cc360a29094.zip
Improve robustness of springer provider
Adds support for downloading chapters
-rw-r--r-- paper2remarkable/providers/springer.py 37
-rw-r--r-- tests/test_providers.py 9
2 files changed, 40 insertions, 6 deletions
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index 5ce2564..dea8bd5 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -10,10 +10,12 @@ Copyright: 2019, G.J.J. van den Burg
import re
import urllib
+import requests
from ._base import Provider
from ._info import Informer
from ..exceptions import URLResolutionError
+from ..utils import HEADERS
class SpringerInformer(Informer):
@@ -26,24 +28,49 @@ class SpringerInformer(Informer):
class Springer(Provider):
- re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
- re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
+ re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
+ re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+"
+ re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.informer = SpringerInformer()
+ def _get_abs_url(self, pdf_url):
+ article_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")]
+ req = requests.head(
+ article_url, headers=HEADERS, cookies=self.cookiejar
+ )
+ if req.status_code == 200:
+ return article_url
+
+ chapter_url = pdf_url.replace("content/pdf", "chapter")[: -len(".pdf")]
+ req = requests.head(
+ chapter_url, headers=HEADERS, cookies=self.cookiejar
+ )
+ if req.status_code == 200:
+ return chapter_url
+
+ raise URLResolutionError("Springer", pdf_url)
+
def get_abs_pdf_urls(self, url):
""" Get the pdf and abstract urls from a Springer url """
- if re.match(self.re_abs, url):
+ if re.match(self.re_abs_1, url):
abs_url = url
pdf_url = url.replace("article", "content/pdf")
+ elif re.match(self.re_abs_2, url):
+ abs_url = url
+ pdf_url = url.replace("chapter", "content/pdf")
elif re.match(self.re_pdf, url):
- abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
+ abs_url = self._get_abs_url(url)
pdf_url = urllib.parse.unquote(url)
else:
raise URLResolutionError("Springer", url)
return abs_url, pdf_url
def validate(src):
- return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
+ return (
+ re.match(Springer.re_abs_1, src)
+ or re.match(Springer.re_abs_2, src)
+ or re.match(Springer.re_pdf, src)
+ )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 1a6f84f..5c8a8e4 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -126,13 +126,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
- def test_springer(self):
+ def test_springer_1(self):
prov = Springer(upload=False, verbose=VERBOSE)
url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_springer_2(self):
+ prov = Springer(upload=False, verbose=VERBOSE)
+ url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf"
+ exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_local(self):
local_filename = "test.pdf"
with open(local_filename, "w") as fp: