diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-11-17 16:50:40 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-11-17 16:50:40 +0000 |
| commit | 7675b7ea4f07e0f5f0d9dcfb9a3846eaace07432 (patch) | |
| tree | bbdceb1bf460e4620531eb72cbce89ece7a1b02a | |
| parent | minor fixes to docs (diff) | |
| download | paper2remarkable-7675b7ea4f07e0f5f0d9dcfb9a3846eaace07432.tar.gz paper2remarkable-7675b7ea4f07e0f5f0d9dcfb9a3846eaace07432.zip | |
Add provider for ScienceDirect
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/science_direct.py | 106 | ||||
| -rw-r--r-- | tests/test_providers.py | 22 |
3 files changed, 130 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 371ab82..2be218f 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -15,6 +15,7 @@ from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed from .sagepub import SagePub +from .science_direct import ScienceDirect from .semantic_scholar import SemanticScholar from .springer import Springer from .tandfonline import TandFOnline @@ -33,6 +34,7 @@ providers = [ PMLR, PubMed, SagePub, + ScienceDirect, Springer, SemanticScholar, TandFOnline, diff --git a/paper2remarkable/providers/science_direct.py b/paper2remarkable/providers/science_direct.py new file mode 100644 index 0000000..704a3b0 --- /dev/null +++ b/paper2remarkable/providers/science_direct.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +"""Provider for ScienceDirect + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re +import bs4 +import urllib +import json + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError +from ..log import Logger +from ..utils import get_page_with_retry, follow_redirects + +logger = Logger() + + +class ScienceDirectInformer(Informer): + + meta_date_key = "citation_publication_date" + + def get_authors(self, soup): + surname_tags = soup.find_all("span", attrs={"class": "text surname"}) + if not surname_tags: + logger.warning( + "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + ) + return "" + authors = [x.text for x in surname_tags] + return authors + + +class ScienceDirect(Provider): + + re_abs = ( + "https?:\/\/www.sciencedirect.com/science/article/pii/[A-Za-z0-9]+" + ) + re_pdf = "https://pdf.sciencedirectassets.com/\d+/([0-9a-zA-Z\-\.]+)/(?P<data>[0-9a-zA-Z\-\.]+)/main.pdf\?.*" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = ScienceDirectInformer() + + def get_abs_pdf_urls(self, url): + m1 = re.match(self.re_abs, url) + m2 = re.match(self.re_pdf, url) + if m1: + abs_url = url + pdf_url = self._get_pdf_url(abs_url) + elif m2: + pdf_url = url + data = m2.group("data") + paper_id = data.split("-")[-1] + abs_url = ( + f"https://www.sciencedirect.com/science/article/pii/{paper_id}" + ) + else: + raise URLResolutionError("ScienceDirect", url) + return abs_url, pdf_url + + def _get_pdf_url(self, url): + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + + # For open access (and maybe behind institution?) the full text pdf url + # is currently in the json payload of a script tag. + scripts = soup.find_all("script", attrs={"data-iso-key": "_0"}) + if not scripts: + raise URLResolutionError("ScienceDirect", url) + json_data = scripts[0].string + data = json.loads(json_data) + if not "article" in data: + raise URLResolutionError("ScienceDirect", url) + data = data["article"] + if not "pdfDownload" in data: + raise URLResolutionError("ScienceDirect", url) + data = data["pdfDownload"] + if not "linkToPdf" in data: + raise URLResolutionError("ScienceDirect", url) + link = data["linkToPdf"] + tmp_url = urllib.parse.urljoin("https://sciencedirect.com/", link) + + # tmp_url gives a page with a ten second wait or a direct url, we need + # the direct url + page = get_page_with_retry(tmp_url) + soup = bs4.BeautifulSoup(page, "html.parser") + noscript = soup.find_all("noscript") + if not noscript: + raise URLResolutionError("ScienceDirect", url) + a = noscript[0].find_all("a") + if not a: + raise URLResolutionError("ScienceDirect", url) + pdf_url = a[0].get("href") + return pdf_url + + def validate(src): + return re.match(ScienceDirect.re_abs, src) or re.match( + ScienceDirect.re_pdf, src + ) diff --git a/tests/test_providers.py b/tests/test_providers.py index 9b2f24d..db616e9 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -28,6 +28,7 @@ from paper2remarkable.providers import ( PdfUrl, PubMed, SagePub, + ScienceDirect, SemanticScholar, Springer, TandFOnline, @@ -392,6 +393,27 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_sciencedirect_1(self): + prov = ScienceDirect(upload=False, verbose=VERBOSE) + url = "https://www.sciencedirect.com/science/article/pii/S0166354220302011" + exp = "Caly_et_al_-_The_FDA-approved_Drug_Ivermectin_Inhibits_the_Replication_of_SARS-CoV-2_in_Vitro_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sciencedirect_2(self): + prov = ScienceDirect(upload=False, verbose=VERBOSE) + url = "https://www.sciencedirect.com/science/article/pii/S0047235220302543" + exp = "Bolger_Lytle_Bolger_-_What_Matters_in_Citizen_Satisfaction_With_Police_a_Meta-Analysis_2021.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sciencedirect_3(self): + prov = ScienceDirect(upload=False, verbose=VERBOSE) + url = r"https://pdf.sciencedirectassets.com/272398/1-s2.0-S0022039616X00095/1-s2.0-S0022039616001029/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjELf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJIMEYCIQCRRFGFc7b02V86pkMeqytyBK%2BR8I%2BfdsIpYbjfXSpIBwIhAORxDxLYdr4EoSyn1P7wlhG%2F1RnX8tIG0IRGOidKKm69KrQDCDAQAxoMMDU5MDAzNTQ2ODY1IgwzsYwSRMjSfdr4cbUqkQOPUxG702LEv3POe5ESC9FBVVHGeUF%2BB46FTtWqkhHgjkRIpuoFiavu1cuBWHQ9FwCZjcocan56LfXiySYBfl259MC8ieSYor9FKZLBaAhDCEblkiTdW2%2Fk4nfogp6fwWVdckC8gGVbu3wQ9Mdh%2FE91ZEix%2FIftmJ6IpAZkm0l0AFFt%2BngI7geWoZDeku5iImEUw6JJPgFz5Yw9cKa%2FuGM3hi29JsuI30qzBqZC9nGRCIx%2FLYeiDfF1v0QjFLmT%2FE5xpaNxMt%2FoWLiazRcconSQCCax6%2Bw9SR4NvWg2illOrLMEPuRYacIFRNhV9zj7Y06Bf%2BfG%2FTQxXdnDLH0VMkUWx%2BgjwRAqSvIb0JRg9q5gErPB1cZLCuCd3ybFSmtj7aQmfl7uhMAjQwnCcN6fhtlVK6Xb3Us7YglDaHekzf8RDv9stbxBWFGMPVmDUXHWOsUo89LY%2F9IbtQTs5Uu3ieMGePUVMY4ox3FPYAb5jWjaOFqs54LqfQ5nqjkLMiAY%2F11zCVyOAoPiDnDs6Wjuj52iszCtuc%2F9BTrqATkmIC%2Bu2w6MEow0zbPVAaqNF%2BjUh8Tv%2BWTInq9G3Q4PXIqL3CNNiISPDvuUggRwWGJDgXtr0C%2B4Gtv1bfs3BGHHgWOD261c6O0LHQuP11BLN8GCr7bFO1hjVAqHhC06vyhGQRmRzN32CPwo8pUM2gWw9xXGUioUiSJ%2FgRpDaszsW4Yr8Wm7L9Q7jAOYxEf7WLxPwAWO69o8JbJoouxwL4qeTEGMJ5IpUk3x3xPQIlawOlqY%2FHi0s4E1DE4ZMjH21hc3PrQ%2FiwI%2BTqY9Rg5sjLCBJ4vRCiqb3dpOWLsR5LFOTySXWoqIdO7b9Q%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20201117T155020Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY7OS7PK7A%2F20201117%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=03abad117208b684a1a4ca2ffdcbe5b9a40a19e6c841c609e299315a2f2234ce&hash=24f71da9f05f6835c9797841d1462d11eea85c49e9655dde043ed9f748edf17e&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S0022039616001029&tid=spdf-6b78a4fa-826e-4267-8ce6-43c814fa51b2&sid=776192553463724f1a4b56613fcf5e514b72gxrqb&type=client" + exp = "Kristiansen_Wulff_-_Exponential_Estimates_of_Symplectic_Slow_Manifolds_2016.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() |
