diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-03-14 22:51:11 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-03-14 22:51:11 +0000 |
| commit | f36d98a423ee5b195e20796ccfc24ee6fa422a84 (patch) | |
| tree | f843964eb51031c184e671e53709ebd53f181aed | |
| parent | Document previous json payload for posterity (diff) | |
| download | paper2remarkable-f36d98a423ee5b195e20796ccfc24ee6fa422a84.tar.gz paper2remarkable-f36d98a423ee5b195e20796ccfc24ee6fa422a84.zip | |
Update ScienceDirect provider to new site structure
| -rw-r--r-- | paper2remarkable/providers/science_direct.py | 30 |
1 files changed, 22 insertions, 8 deletions
diff --git a/paper2remarkable/providers/science_direct.py b/paper2remarkable/providers/science_direct.py index 9baa48d..e8fff8a 100644 --- a/paper2remarkable/providers/science_direct.py +++ b/paper2remarkable/providers/science_direct.py @@ -73,12 +73,21 @@ class ScienceDirect(Provider): # is currently in the json payload of a script tag as: # # "pdfDownload": { - # "linkType": "DOWNLOAD", - # "linkToPdf": - # "/science/article/pii/S0166354220302011/pdfft?md5=bd2a8d1cfbe3680f2d405b4a62642a15&pid=1-s2.0-S0166354220302011-main.pdf", - # "isPdfFullText": false, - # "fileName": "1-s2.0-S0166354220302011-main.pdf" - # }, + # 'isPdfFullText': False, + # 'linkType': 'DOWNLOAD', + # 'urlMetadata': { + # 'path': 'science/article/pii', + # 'pdfExtension': '/pdfft', + # 'pii': 'S0166354220302011', + # 'queryParams': {'md5': 'bd2a8d1cfbe3680f2d405b4a62642a15', + # 'pid': '1-s2.0-S0166354220302011-main.pdf'} + # } + # } + # + # We construct the url based on the urlMetadata. This leads to an + # intermediate page, which contains the actual url to the PDF in the + # noscript tag. + scripts = soup.find_all("script", attrs={"data-iso-key": "_0"}) if not scripts: raise URLResolutionError("ScienceDirect", url) @@ -90,9 +99,14 @@ class ScienceDirect(Provider): if not "pdfDownload" in data: raise URLResolutionError("ScienceDirect", url) data = data["pdfDownload"] - if not "linkToPdf" in data: + + if not "urlMetadata" in data: raise URLResolutionError("ScienceDirect", url) - link = data["linkToPdf"] + meta = data["urlMetadata"] + + link = "{path}/{pii}/{pdfExtension}?md5{queryParams[md5]}&pid={queryParams[pid]}".format( + **meta + ) tmp_url = urllib.parse.urljoin("https://sciencedirect.com/", link) # tmp_url gives a page with a ten second wait or a direct url, we need |
