Bugfix for SemanticScholar provider

Direct URLs to PDF files on SemanticScholar seem to be deprecated. Instead, we need to pull the PDF link from the HTML of the paper's page.
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-03-27 20:18:29 +0000
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-03-27 20:18:29 +0000
commit: fc21fac9ddac5757aedf91636911f9f5e0798a73 (patch)
tree: 1b50b8afe9801202b440f776e47dd9bf616c932b
parent: Bump version and update changelog (diff)
download: paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.tar.gz paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.zip
3 files changed, 52 insertions, 18 deletions
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index b433ad4..4a4c572 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -32,7 +32,7 @@ class URLResolutionError(Error):
     def __init__(self, provider, url, reason=None):
         self.provider = provider
         self.url = url
-        self.reason = None
+        self.reason = reason
 
     def __str__(self):
         msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format(
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
index 0a1b414..8628156 100644
--- a/paper2remarkable/providers/semantic_scholar.py
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -14,7 +14,7 @@ import bs4
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
-from ..utils import get_page_with_retry
+from ..utils import get_page_with_retry, get_content_type_with_retry
 
 
 class SemanticScholarInformer(Informer):
@@ -30,7 +30,6 @@ class SemanticScholar(Provider):
     re_abs = (
         "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}"
     )
-    re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -41,12 +40,6 @@ class SemanticScholar(Provider):
         if re.match(self.re_abs, url):
             abs_url = url
             pdf_url = self._get_pdf_url(abs_url)
-        elif re.match(self.re_pdf, url):
-            pdf_url = url
-            remainder = pdf_url.split("/")[-1][: -len(".pdf")]
-            first_four = pdf_url.split("/")[-2]
-            paper_id = first_four + remainder
-            abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
         else:
             raise URLResolutionError("SemanticScholar", url)
         return abs_url, pdf_url
@@ -54,12 +47,41 @@ class SemanticScholar(Provider):
     def _get_pdf_url(self, url):
         page = get_page_with_retry(url)
         soup = bs4.BeautifulSoup(page, "html.parser")
+
+        # First try to get the direct url to the PDF file from the HTML
+        a = soup.find(
+            "a",
+            {
+                "data-selenium-selector": "paper-link",
+                "data-heap-direct-pdf-link": "true",
+            },
+        )
+        if a:
+            return a["href"]
+
+        # Next try to get the url from the metadata (not always a pdf)
         meta = soup.find_all("meta", {"name": "citation_pdf_url"})
         if not meta:
-            raise URLResolutionError("SemanticScholar", url)
-        return meta[0]["content"]
+            raise URLResolutionError(
+                "SemanticScholar", url, reason="Page has no url to PDF file"
+            )
+        pdf_url = meta[0]["content"]
+
+        # Check the content type to check that the data will be a pdf
+        content_type = get_content_type_with_retry(pdf_url)
+        if content_type is None:
+            raise URLResolutionError(
+                "SemanticScholar",
+                url,
+                reason="Can't determine content type for pdf file",
+            )
+        if not content_type == "application/pdf":
+            raise URLResolutionError(
+                "SemanticScholar",
+                url,
+                reason="PDF url on SemanticScholar doesn't point to a pdf file",
+            )
+        return pdf_url
 
     def validate(src):
-        return re.match(SemanticScholar.re_abs, src) or re.match(
-            SemanticScholar.re_pdf, src
-        )
+        return re.match(SemanticScholar.re_abs, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index af69c64..e3d7f41 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -13,6 +13,7 @@ import tempfile
 import unittest
 from pikepdf import Pdf
 
+from paper2remarkable.exceptions import URLResolutionError
 from paper2remarkable.providers import (
     ACL,
     ACM,
@@ -338,10 +339,14 @@ class TestProviders(unittest.TestCase):
 
     def test_semantic_scholar_1(self):
         prov = SemanticScholar(upload=False, verbose=VERBOSE)
-        url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
-        exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp, os.path.basename(filename))
+        url = "https://www.semanticscholar.org/paper/TableSense%3A-Spreadsheet-Table-Detection-with-Neural-Dong-Liu/1b01dea77e9cbf049b4ee8b68dc4d43529d06299?p2df"
+        with self.assertRaises(URLResolutionError) as cm:
+            prov.run(url)
+        err = cm.exception
+        self.assertEqual(
+            err.reason,
+            "PDF url on SemanticScholar doesn't point to a pdf file",
+        )
 
     def test_semantic_scholar_2(self):
         prov = SemanticScholar(upload=False, verbose=VERBOSE)
@@ -350,6 +355,13 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_semantic_scholar_3(self):
+        prov = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://www.semanticscholar.org/paper/A-historical-account-of-how-continental-drift-and-Meinhold-%C5%9Eeng%C3%B6r/e7be87319985445e3ef7addf1ebd10899b92441f"
+        exp = "Meinhold_Sengor_-_A_Historical_Account_of_How_Continental_Drift_and_Plate_Tectonics_Provided_the_Framework_for_Our_Current_Understanding_of_Palaeogeography_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
     def test_sagepub_1(self):
         prov = SagePub(upload=False, verbose=VERBOSE)
         url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679"
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2021-03-27 20:18:29 +0000
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2021-03-27 20:18:29 +0000
commit	fc21fac9ddac5757aedf91636911f9f5e0798a73 (patch)
tree	1b50b8afe9801202b440f776e47dd9bf616c932b
parent	Bump version and update changelog (diff)
download	paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.tar.gz paper2remarkable-fc21fac9ddac5757aedf91636911f9f5e0798a73.zip