From fc21fac9ddac5757aedf91636911f9f5e0798a73 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Sat, 27 Mar 2021 20:18:29 +0000
Subject: Bugfix for SemanticScholar provider

Direct URLs to PDF files on SemanticScholar seem to
be deprecated. Instead, we need to pull the PDF link
from the HTML.
---
 paper2remarkable/exceptions.py                 |  2 +-
 paper2remarkable/providers/semantic_scholar.py | 48 +++++++++++++++++++-------
 tests/test_providers.py                        | 20 ++++++++---
 3 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index b433ad4..4a4c572 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -32,7 +32,7 @@ class URLResolutionError(Error):
     def __init__(self, provider, url, reason=None):
         self.provider = provider
         self.url = url
-        self.reason = None
+        self.reason = reason
 
     def __str__(self):
         msg = "ERROR: Couldn't figure out {provider} URLs from provided url: {url}".format(
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
index 0a1b414..8628156 100644
--- a/paper2remarkable/providers/semantic_scholar.py
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -14,7 +14,7 @@ import bs4
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
-from ..utils import get_page_with_retry
+from ..utils import get_page_with_retry, get_content_type_with_retry
 
 
 class SemanticScholarInformer(Informer):
@@ -30,7 +30,6 @@ class SemanticScholar(Provider):
     re_abs = (
         "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}"
     )
-    re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -41,12 +40,6 @@ class SemanticScholar(Provider):
         if re.match(self.re_abs, url):
             abs_url = url
             pdf_url = self._get_pdf_url(abs_url)
-        elif re.match(self.re_pdf, url):
-            pdf_url = url
-            remainder = pdf_url.split("/")[-1][: -len(".pdf")]
-            first_four = pdf_url.split("/")[-2]
-            paper_id = first_four + remainder
-            abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
         else:
             raise URLResolutionError("SemanticScholar", url)
         return abs_url, pdf_url
@@ -54,12 +47,41 @@ class SemanticScholar(Provider):
     def _get_pdf_url(self, url):
         page = get_page_with_retry(url)
         soup = bs4.BeautifulSoup(page, "html.parser")
+
+        # First try to get the direct url to the PDF file from the HTML
+        a = soup.find(
+            "a",
+            {
+                "data-selenium-selector": "paper-link",
+                "data-heap-direct-pdf-link": "true",
+            },
+        )
+        if a:
+            return a["href"]
+
+        # Next try to get the url from the metadata (not always a pdf)
         meta = soup.find_all("meta", {"name": "citation_pdf_url"})
         if not meta:
-            raise URLResolutionError("SemanticScholar", url)
-        return meta[0]["content"]
+            raise URLResolutionError(
+                "SemanticScholar", url, reason="Page has no url to PDF file"
+            )
+        pdf_url = meta[0]["content"]
+
+        # Check the content type to check that the data will be a pdf
+        content_type = get_content_type_with_retry(pdf_url)
+        if content_type is None:
+            raise URLResolutionError(
+                "SemanticScholar",
+                url,
+                reason="Can't determine content type for pdf file",
+            )
+        if not content_type == "application/pdf":
+            raise URLResolutionError(
+                "SemanticScholar",
+                url,
+                reason="PDF url on SemanticScholar doesn't point to a pdf file",
+            )
+        return pdf_url
 
     def validate(src):
-        return re.match(SemanticScholar.re_abs, src) or re.match(
-            SemanticScholar.re_pdf, src
-        )
+        return re.match(SemanticScholar.re_abs, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index af69c64..e3d7f41 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -13,6 +13,7 @@ import tempfile
 import unittest
 from pikepdf import Pdf
 
+from paper2remarkable.exceptions import URLResolutionError
 from paper2remarkable.providers import (
     ACL,
     ACM,
@@ -338,10 +339,14 @@ class TestProviders(unittest.TestCase):
 
     def test_semantic_scholar_1(self):
         prov = SemanticScholar(upload=False, verbose=VERBOSE)
-        url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
-        exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
-        filename = prov.run(url)
-        self.assertEqual(exp, os.path.basename(filename))
+        url = "https://www.semanticscholar.org/paper/TableSense%3A-Spreadsheet-Table-Detection-with-Neural-Dong-Liu/1b01dea77e9cbf049b4ee8b68dc4d43529d06299?p2df"
+        with self.assertRaises(URLResolutionError) as cm:
+            prov.run(url)
+        err = cm.exception
+        self.assertEqual(
+            err.reason,
+            "PDF url on SemanticScholar doesn't point to a pdf file",
+        )
 
     def test_semantic_scholar_2(self):
         prov = SemanticScholar(upload=False, verbose=VERBOSE)
@@ -350,6 +355,13 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_semantic_scholar_3(self):
+        prov = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://www.semanticscholar.org/paper/A-historical-account-of-how-continental-drift-and-Meinhold-%C5%9Eeng%C3%B6r/e7be87319985445e3ef7addf1ebd10899b92441f"
+        exp = "Meinhold_Sengor_-_A_Historical_Account_of_How_Continental_Drift_and_Plate_Tectonics_Provided_the_Framework_for_Our_Current_Understanding_of_Palaeogeography_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
     def test_sagepub_1(self):
         prov = SagePub(upload=False, verbose=VERBOSE)
         url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679"
-- 
cgit v1.2.3


From 7bec207291c7974059366aed5529cc1c12c5ccd8 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Sat, 27 Mar 2021 20:40:29 +0000
Subject: Fix Makefile: make install depend on docs

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8b68381..4da44e1 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ release: ## Make a release
 	python make_release.py
 
 
-install: ## Install for the current user using the default python command
+install: docs ## Install for the current user using the default python command
 	python setup.py build_ext --inplace
 	python setup.py install --user
 
-- 
cgit v1.2.3


From bd46fd34f14b2636b6fee6cd2d338b24aee2945c Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Sat, 27 Mar 2021 22:51:17 +0000
Subject: Temporarily disable test

Not sure why this one is failing; it
works locally but not on GitHub Actions.
The difference seems to be whether
social links are included in the
readability.js output.
---
 tests/test_providers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_providers.py b/tests/test_providers.py
index af69c64..d47a9df 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -329,6 +329,7 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    @unittest.skip("Skipping html_5 test")
     def test_html_5(self):
         prov = HTML(upload=False, verbose=VERBOSE)
         url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
-- 
cgit v1.2.3