author    Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-11-17 16:50:40 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-11-17 16:50:40 +0000
commit    7675b7ea4f07e0f5f0d9dcfb9a3846eaace07432
tree      bbdceb1bf460e4620531eb72cbce89ece7a1b02a
parent    minor fixes to docs
Add provider for ScienceDirect
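
The provider follows the same interface as the existing ones; a minimal usage sketch, mirroring the test cases added below (upload disabled, URL taken from the first test):

    from paper2remarkable.providers import ScienceDirect

    prov = ScienceDirect(upload=False)
    filename = prov.run(
        "https://www.sciencedirect.com/science/article/pii/S0166354220302011"
    )
    print(filename)  # local filename of the prepared PDF
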
-rw-r--r--  paper2remarkable/providers/__init__.py         2
-rw-r--r--  paper2remarkable/providers/science_direct.py   106
-rw-r--r--  tests/test_providers.py                        22
3 files changed, 130 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 371ab82..2be218f 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -15,6 +15,7 @@ from .pdf_url import PdfUrl
from .pmlr import PMLR
from .pubmed import PubMed
from .sagepub import SagePub
+from .science_direct import ScienceDirect
from .semantic_scholar import SemanticScholar
from .springer import Springer
from .tandfonline import TandFOnline
@@ -33,6 +34,7 @@ providers = [
PMLR,
PubMed,
SagePub,
+ ScienceDirect,
Springer,
SemanticScholar,
TandFOnline,
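
Adding the class to the providers list is what makes it discoverable; a sketch of how such a registry is typically consumed, with choose_provider as an illustrative helper name rather than the project's own:

    from paper2remarkable.providers import providers

    def choose_provider(url):
        # Return the first registered provider class whose validate()
        # accepts the given URL, or None if none match.
        for provider in providers:
            if provider.validate(url):
                return provider
        return None
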
diff --git a/paper2remarkable/providers/science_direct.py b/paper2remarkable/providers/science_direct.py
new file mode 100644
index 0000000..704a3b0
--- /dev/null
+++ b/paper2remarkable/providers/science_direct.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for ScienceDirect
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+import bs4
+import urllib.parse
+import json
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..log import Logger
+from ..utils import get_page_with_retry
+
+logger = Logger()
+
+
+class ScienceDirectInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+ def get_authors(self, soup):
+ surname_tags = soup.find_all("span", attrs={"class": "text surname"})
+ if not surname_tags:
+ logger.warning(
+ "Couldn't determine author information, maybe provide the desired filename using '--filename'?"
+ )
+ return ""
+ authors = [x.text for x in surname_tags]
+ return authors
+
+
+class ScienceDirect(Provider):
+
+    re_abs = (
+        r"https?://www.sciencedirect.com/science/article/pii/[A-Za-z0-9]+"
+    )
+    re_pdf = r"https://pdf.sciencedirectassets.com/\d+/([0-9a-zA-Z\-\.]+)/(?P<data>[0-9a-zA-Z\-\.]+)/main.pdf\?.*"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = ScienceDirectInformer()
+
+ def get_abs_pdf_urls(self, url):
+ m1 = re.match(self.re_abs, url)
+ m2 = re.match(self.re_pdf, url)
+ if m1:
+ abs_url = url
+ pdf_url = self._get_pdf_url(abs_url)
+ elif m2:
+ pdf_url = url
+ data = m2.group("data")
+ paper_id = data.split("-")[-1]
+ abs_url = (
+ f"https://www.sciencedirect.com/science/article/pii/{paper_id}"
+ )
+ else:
+ raise URLResolutionError("ScienceDirect", url)
+ return abs_url, pdf_url
+
+ def _get_pdf_url(self, url):
+ page = get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+
+        # For open access articles (and possibly institutional access) the full
+        # text pdf url is currently in the json payload of a script tag.
+ scripts = soup.find_all("script", attrs={"data-iso-key": "_0"})
+ if not scripts:
+ raise URLResolutionError("ScienceDirect", url)
+ json_data = scripts[0].string
+ data = json.loads(json_data)
+ if not "article" in data:
+ raise URLResolutionError("ScienceDirect", url)
+ data = data["article"]
+ if not "pdfDownload" in data:
+ raise URLResolutionError("ScienceDirect", url)
+ data = data["pdfDownload"]
+ if not "linkToPdf" in data:
+ raise URLResolutionError("ScienceDirect", url)
+ link = data["linkToPdf"]
+ tmp_url = urllib.parse.urljoin("https://sciencedirect.com/", link)
+
+        # tmp_url gives either a page with a ten second wait or the direct url;
+        # we need the direct url
+ page = get_page_with_retry(tmp_url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ noscript = soup.find_all("noscript")
+ if not noscript:
+ raise URLResolutionError("ScienceDirect", url)
+ a = noscript[0].find_all("a")
+ if not a:
+ raise URLResolutionError("ScienceDirect", url)
+ pdf_url = a[0].get("href")
+ return pdf_url
+
+ def validate(src):
+ return re.match(ScienceDirect.re_abs, src) or re.match(
+ ScienceDirect.re_pdf, src
+ )
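
For reference, the asset-URL to abstract-URL mapping done in get_abs_pdf_urls can be shown in isolation; a small sketch reusing re_pdf from above, with a truncated placeholder standing in for the signed AWS query parameters:

    import re

    re_pdf = r"https://pdf.sciencedirectassets.com/\d+/([0-9a-zA-Z\-\.]+)/(?P<data>[0-9a-zA-Z\-\.]+)/main.pdf\?.*"
    url = (
        "https://pdf.sciencedirectassets.com/272398/1-s2.0-S0022039616X00095"
        "/1-s2.0-S0022039616001029/main.pdf?X-Amz-Signature=placeholder"
    )
    m = re.match(re_pdf, url)
    pii = m.group("data").split("-")[-1]  # "S0022039616001029"
    abs_url = f"https://www.sciencedirect.com/science/article/pii/{pii}"
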
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 9b2f24d..db616e9 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -28,6 +28,7 @@ from paper2remarkable.providers import (
PdfUrl,
PubMed,
SagePub,
+ ScienceDirect,
SemanticScholar,
Springer,
TandFOnline,
@@ -392,6 +393,27 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_sciencedirect_1(self):
+ prov = ScienceDirect(upload=False, verbose=VERBOSE)
+ url = "https://www.sciencedirect.com/science/article/pii/S0166354220302011"
+ exp = "Caly_et_al_-_The_FDA-approved_Drug_Ivermectin_Inhibits_the_Replication_of_SARS-CoV-2_in_Vitro_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_sciencedirect_2(self):
+ prov = ScienceDirect(upload=False, verbose=VERBOSE)
+ url = "https://www.sciencedirect.com/science/article/pii/S0047235220302543"
+ exp = "Bolger_Lytle_Bolger_-_What_Matters_in_Citizen_Satisfaction_With_Police_a_Meta-Analysis_2021.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_sciencedirect_3(self):
+ prov = ScienceDirect(upload=False, verbose=VERBOSE)
+ url = r"https://pdf.sciencedirectassets.com/272398/1-s2.0-S0022039616X00095/1-s2.0-S0022039616001029/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjELf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJIMEYCIQCRRFGFc7b02V86pkMeqytyBK%2BR8I%2BfdsIpYbjfXSpIBwIhAORxDxLYdr4EoSyn1P7wlhG%2F1RnX8tIG0IRGOidKKm69KrQDCDAQAxoMMDU5MDAzNTQ2ODY1IgwzsYwSRMjSfdr4cbUqkQOPUxG702LEv3POe5ESC9FBVVHGeUF%2BB46FTtWqkhHgjkRIpuoFiavu1cuBWHQ9FwCZjcocan56LfXiySYBfl259MC8ieSYor9FKZLBaAhDCEblkiTdW2%2Fk4nfogp6fwWVdckC8gGVbu3wQ9Mdh%2FE91ZEix%2FIftmJ6IpAZkm0l0AFFt%2BngI7geWoZDeku5iImEUw6JJPgFz5Yw9cKa%2FuGM3hi29JsuI30qzBqZC9nGRCIx%2FLYeiDfF1v0QjFLmT%2FE5xpaNxMt%2FoWLiazRcconSQCCax6%2Bw9SR4NvWg2illOrLMEPuRYacIFRNhV9zj7Y06Bf%2BfG%2FTQxXdnDLH0VMkUWx%2BgjwRAqSvIb0JRg9q5gErPB1cZLCuCd3ybFSmtj7aQmfl7uhMAjQwnCcN6fhtlVK6Xb3Us7YglDaHekzf8RDv9stbxBWFGMPVmDUXHWOsUo89LY%2F9IbtQTs5Uu3ieMGePUVMY4ox3FPYAb5jWjaOFqs54LqfQ5nqjkLMiAY%2F11zCVyOAoPiDnDs6Wjuj52iszCtuc%2F9BTrqATkmIC%2Bu2w6MEow0zbPVAaqNF%2BjUh8Tv%2BWTInq9G3Q4PXIqL3CNNiISPDvuUggRwWGJDgXtr0C%2B4Gtv1bfs3BGHHgWOD261c6O0LHQuP11BLN8GCr7bFO1hjVAqHhC06vyhGQRmRzN32CPwo8pUM2gWw9xXGUioUiSJ%2FgRpDaszsW4Yr8Wm7L9Q7jAOYxEf7WLxPwAWO69o8JbJoouxwL4qeTEGMJ5IpUk3x3xPQIlawOlqY%2FHi0s4E1DE4ZMjH21hc3PrQ%2FiwI%2BTqY9Rg5sjLCBJ4vRCiqb3dpOWLsR5LFOTySXWoqIdO7b9Q%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20201117T155020Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY7OS7PK7A%2F20201117%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=03abad117208b684a1a4ca2ffdcbe5b9a40a19e6c841c609e299315a2f2234ce&hash=24f71da9f05f6835c9797841d1462d11eea85c49e9655dde043ed9f748edf17e&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S0022039616001029&tid=spdf-6b78a4fa-826e-4267-8ce6-43c814fa51b2&sid=776192553463724f1a4b56613fcf5e514b72gxrqb&type=client"
+ exp = "Kristiansen_Wulff_-_Exponential_Estimates_of_Symplectic_Slow_Manifolds_2016.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
if __name__ == "__main__":
unittest.main()