aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2021-03-14 23:09:24 +0000
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2021-03-14 23:09:24 +0000
commit6afbb628fa8b9b16b2182a7ee6e3b233b2df740b (patch)
treef843964eb51031c184e671e53709ebd53f181aed
parentUpdate notes (diff)
parentUpdate ScienceDirect provider to new site structure (diff)
downloadpaper2remarkable-6afbb628fa8b9b16b2182a7ee6e3b233b2df740b.tar.gz
paper2remarkable-6afbb628fa8b9b16b2182a7ee6e3b233b2df740b.zip
Merge branch 'bugfix/sciencedirect'
-rw-r--r--paper2remarkable/providers/science_direct.py28
1 files changed, 25 insertions, 3 deletions
diff --git a/paper2remarkable/providers/science_direct.py b/paper2remarkable/providers/science_direct.py
index 704a3b0..e8fff8a 100644
--- a/paper2remarkable/providers/science_direct.py
+++ b/paper2remarkable/providers/science_direct.py
@@ -70,7 +70,24 @@ class ScienceDirect(Provider):
soup = bs4.BeautifulSoup(page, "html.parser")
# For open access (and maybe behind institution?) the full text pdf url
- # is currently in the json payload of a script tag.
+ # is currently in the json payload of a script tag as:
+ #
+ # "pdfDownload": {
+ # 'isPdfFullText': False,
+ # 'linkType': 'DOWNLOAD',
+ # 'urlMetadata': {
+ # 'path': 'science/article/pii',
+ # 'pdfExtension': '/pdfft',
+ # 'pii': 'S0166354220302011',
+ # 'queryParams': {'md5': 'bd2a8d1cfbe3680f2d405b4a62642a15',
+ # 'pid': '1-s2.0-S0166354220302011-main.pdf'}
+ # }
+ # }
+ #
+ # We construct the url based on the urlMetaData. This leads to an
+ # intermediate page, which contains the actual url to the PDF in the
+ # noscript tag.
+
scripts = soup.find_all("script", attrs={"data-iso-key": "_0"})
if not scripts:
raise URLResolutionError("ScienceDirect", url)
@@ -82,9 +99,14 @@ class ScienceDirect(Provider):
if not "pdfDownload" in data:
raise URLResolutionError("ScienceDirect", url)
data = data["pdfDownload"]
- if not "linkToPdf" in data:
+
+ if not "urlMetadata" in data:
raise URLResolutionError("ScienceDirect", url)
- link = data["linkToPdf"]
+ meta = data["urlMetadata"]
+
+ link = "{path}/{pii}/{pdfExtension}?md5{queryParams[md5]}&pid={queryParams[pid]}".format(
+ **meta
+ )
tmp_url = urllib.parse.urljoin("https://sciencedirect.com/", link)
# tmp_url gives a page with a ten second wait or a direct url, we need