about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-03-27 20:18:29 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-03-27 20:18:29 +0000
commit fc21fac9ddac5757aedf91636911f9f5e0798a73 (patch)
tree 1b50b8afe9801202b440f776e47dd9bf616c932b
parent Bump version and update changelog (diff)
download paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.tar.gz
download paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.zip
Bugfix for SemanticScholar provider
Direct URLs to PDF files on SemanticScholar seem to be deprecated. Instead, we need to pull the PDF link from the HTML.
-rw-r--r-- paper2remarkable/exceptions.py 2
-rw-r--r-- paper2remarkable/providers/semantic_scholar.py 48
-rw-r--r-- tests/test_providers.py 20
3 files changed, 52 insertions, 18 deletions
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index b433ad4..4a4c572 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -32,7 +32,7 @@ class URLResolutionError(Error):
def __init__(self, provider, url, reason=None):
self.provider = provider
self.url = url
- self.reason = None
+ self.reason = reason
def __str__(self):
msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format(
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
index 0a1b414..8628156 100644
--- a/paper2remarkable/providers/semantic_scholar.py
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -14,7 +14,7 @@ import bs4
from ._base import Provider
from ._info import Informer
from ..exceptions import URLResolutionError
-from ..utils import get_page_with_retry
+from ..utils import get_page_with_retry, get_content_type_with_retry
class SemanticScholarInformer(Informer):
@@ -30,7 +30,6 @@ class SemanticScholar(Provider):
re_abs = (
"https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}"
)
- re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -41,12 +40,6 @@ class SemanticScholar(Provider):
if re.match(self.re_abs, url):
abs_url = url
pdf_url = self._get_pdf_url(abs_url)
- elif re.match(self.re_pdf, url):
- pdf_url = url
- remainder = pdf_url.split("/")[-1][: -len(".pdf")]
- first_four = pdf_url.split("/")[-2]
- paper_id = first_four + remainder
- abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
else:
raise URLResolutionError("SemanticScholar", url)
return abs_url, pdf_url
@@ -54,12 +47,41 @@ class SemanticScholar(Provider):
def _get_pdf_url(self, url):
page = get_page_with_retry(url)
soup = bs4.BeautifulSoup(page, "html.parser")
+
+ # First try to get the direct url to the PDF file from the HTML
+ a = soup.find(
+ "a",
+ {
+ "data-selenium-selector": "paper-link",
+ "data-heap-direct-pdf-link": "true",
+ },
+ )
+ if a:
+ return a["href"]
+
+ # Next try to get the url from the metadata (not always a pdf)
meta = soup.find_all("meta", {"name": "citation_pdf_url"})
if not meta:
- raise URLResolutionError("SemanticScholar", url)
- return meta[0]["content"]
+ raise URLResolutionError(
+ "SemanticScholar", url, reason="Page has no url to PDF file"
+ )
+ pdf_url = meta[0]["content"]
+
+ # Check the content type to check that the data will be a pdf
+ content_type = get_content_type_with_retry(pdf_url)
+ if content_type is None:
+ raise URLResolutionError(
+ "SemanticScholar",
+ url,
+ reason="Can't determine content type for pdf file",
+ )
+ if not content_type == "application/pdf":
+ raise URLResolutionError(
+ "SemanticScholar",
+ url,
+ reason="PDF url on SemanticScholar doesn't point to a pdf file",
+ )
+ return pdf_url
def validate(src):
- return re.match(SemanticScholar.re_abs, src) or re.match(
- SemanticScholar.re_pdf, src
- )
+ return re.match(SemanticScholar.re_abs, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index af69c64..e3d7f41 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -13,6 +13,7 @@ import tempfile
import unittest
from pikepdf import Pdf
+from paper2remarkable.exceptions import URLResolutionError
from paper2remarkable.providers import (
ACL,
ACM,
@@ -338,10 +339,14 @@ class TestProviders(unittest.TestCase):
def test_semantic_scholar_1(self):
prov = SemanticScholar(upload=False, verbose=VERBOSE)
- url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
- exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
- filename = prov.run(url)
- self.assertEqual(exp, os.path.basename(filename))
+ url = "https://www.semanticscholar.org/paper/TableSense%3A-Spreadsheet-Table-Detection-with-Neural-Dong-Liu/1b01dea77e9cbf049b4ee8b68dc4d43529d06299?p2df"
+ with self.assertRaises(URLResolutionError) as cm:
+ prov.run(url)
+ err = cm.exception
+ self.assertEqual(
+ err.reason,
+ "PDF url on SemanticScholar doesn't point to a pdf file",
+ )
def test_semantic_scholar_2(self):
prov = SemanticScholar(upload=False, verbose=VERBOSE)
@@ -350,6 +355,13 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_semantic_scholar_3(self):
+ prov = SemanticScholar(upload=False, verbose=VERBOSE)
+ url = "https://www.semanticscholar.org/paper/A-historical-account-of-how-continental-drift-and-Meinhold-%C5%9Eeng%C3%B6r/e7be87319985445e3ef7addf1ebd10899b92441f"
+ exp = "Meinhold_Sengor_-_A_Historical_Account_of_How_Continental_Drift_and_Plate_Tectonics_Provided_the_Framework_for_Our_Current_Understanding_of_Palaeogeography_2018.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
def test_sagepub_1(self):
prov = SagePub(upload=False, verbose=VERBOSE)
url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679"