diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-03-27 20:18:29 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-03-27 20:18:29 +0000 |
| commit | fc21fac9ddac5757aedf91636911f9f5e0798a73 (patch) | |
| tree | 1b50b8afe9801202b440f776e47dd9bf616c932b | |
| parent | Bump version and update changelog (diff) | |
| download | paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.tar.gz paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.zip | |
Bugfix for SemanticScholar provider
Direct URLs to PDF files on SemanticScholar seem to
be deprecated. Instead, we need to pull the PDF link
from the HTML.
| -rw-r--r-- | paper2remarkable/exceptions.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/semantic_scholar.py | 48 | ||||
| -rw-r--r-- | tests/test_providers.py | 20 |
3 files changed, 52 insertions, 18 deletions
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index b433ad4..4a4c572 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -32,7 +32,7 @@ class URLResolutionError(Error): def __init__(self, provider, url, reason=None): self.provider = provider self.url = url - self.reason = None + self.reason = reason def __str__(self): msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format( diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py index 0a1b414..8628156 100644 --- a/paper2remarkable/providers/semantic_scholar.py +++ b/paper2remarkable/providers/semantic_scholar.py @@ -14,7 +14,7 @@ import bs4 from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError -from ..utils import get_page_with_retry +from ..utils import get_page_with_retry, get_content_type_with_retry class SemanticScholarInformer(Informer): @@ -30,7 +30,6 @@ class SemanticScholar(Provider): re_abs = ( "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}" ) - re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -41,12 +40,6 @@ class SemanticScholar(Provider): if re.match(self.re_abs, url): abs_url = url pdf_url = self._get_pdf_url(abs_url) - elif re.match(self.re_pdf, url): - pdf_url = url - remainder = pdf_url.split("/")[-1][: -len(".pdf")] - first_four = pdf_url.split("/")[-2] - paper_id = first_four + remainder - abs_url = f"https://www.semanticscholar.org/paper/{paper_id}" else: raise URLResolutionError("SemanticScholar", url) return abs_url, pdf_url @@ -54,12 +47,41 @@ class SemanticScholar(Provider): def _get_pdf_url(self, url): page = get_page_with_retry(url) soup = bs4.BeautifulSoup(page, "html.parser") + + # First try to get the direct url to the PDF file from the HTML + a = soup.find( + "a", + { + 
"data-selenium-selector": "paper-link", + "data-heap-direct-pdf-link": "true", + }, + ) + if a: + return a["href"] + + # Next try to get the url from the metadata (not always a pdf) meta = soup.find_all("meta", {"name": "citation_pdf_url"}) if not meta: - raise URLResolutionError("SemanticScholar", url) - return meta[0]["content"] + raise URLResolutionError( + "SemanticScholar", url, reason="Page has no url to PDF file" + ) + pdf_url = meta[0]["content"] + + # Check the content type to check that the data will be a pdf + content_type = get_content_type_with_retry(pdf_url) + if content_type is None: + raise URLResolutionError( + "SemanticScholar", + url, + reason="Can't determine content type for pdf file", + ) + if not content_type == "application/pdf": + raise URLResolutionError( + "SemanticScholar", + url, + reason="PDF url on SemanticScholar doesn't point to a pdf file", + ) + return pdf_url def validate(src): - return re.match(SemanticScholar.re_abs, src) or re.match( - SemanticScholar.re_pdf, src - ) + return re.match(SemanticScholar.re_abs, src) diff --git a/tests/test_providers.py b/tests/test_providers.py index af69c64..e3d7f41 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -13,6 +13,7 @@ import tempfile import unittest from pikepdf import Pdf +from paper2remarkable.exceptions import URLResolutionError from paper2remarkable.providers import ( ACL, ACM, @@ -338,10 +339,14 @@ class TestProviders(unittest.TestCase): def test_semantic_scholar_1(self): prov = SemanticScholar(upload=False, verbose=VERBOSE) - url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" - exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" - filename = prov.run(url) - self.assertEqual(exp, os.path.basename(filename)) + url = "https://www.semanticscholar.org/paper/TableSense%3A-Spreadsheet-Table-Detection-with-Neural-Dong-Liu/1b01dea77e9cbf049b4ee8b68dc4d43529d06299?p2df" + with 
self.assertRaises(URLResolutionError) as cm: + prov.run(url) + err = cm.exception + self.assertEqual( + err.reason, + "PDF url on SemanticScholar doesn't point to a pdf file", + ) def test_semantic_scholar_2(self): prov = SemanticScholar(upload=False, verbose=VERBOSE) @@ -350,6 +355,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_semantic_scholar_3(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/A-historical-account-of-how-continental-drift-and-Meinhold-%C5%9Eeng%C3%B6r/e7be87319985445e3ef7addf1ebd10899b92441f" + exp = "Meinhold_Sengor_-_A_Historical_Account_of_How_Continental_Drift_and_Plate_Tectonics_Provided_the_Framework_for_Our_Current_Understanding_of_Palaeogeography_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_sagepub_1(self): prov = SagePub(upload=False, verbose=VERBOSE) url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" |
