From fc21fac9ddac5757aedf91636911f9f5e0798a73 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 27 Mar 2021 20:18:29 +0000 Subject: Bugfix for SemanticScholar provider Direct urls to PDF files on SemanticScholar seem to be deprecated. Instead, we need to pull the pdf link from the html. --- tests/test_providers.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index af69c64..e3d7f41 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -13,6 +13,7 @@ import tempfile import unittest from pikepdf import Pdf +from paper2remarkable.exceptions import URLResolutionError from paper2remarkable.providers import ( ACL, ACM, @@ -338,10 +339,14 @@ class TestProviders(unittest.TestCase): def test_semantic_scholar_1(self): prov = SemanticScholar(upload=False, verbose=VERBOSE) - url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" - exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" - filename = prov.run(url) - self.assertEqual(exp, os.path.basename(filename)) + url = "https://www.semanticscholar.org/paper/TableSense%3A-Spreadsheet-Table-Detection-with-Neural-Dong-Liu/1b01dea77e9cbf049b4ee8b68dc4d43529d06299?p2df" + with self.assertRaises(URLResolutionError) as cm: + prov.run(url) + err = cm.exception + self.assertEqual( + err.reason, + "PDF url on SemanticScholar doesn't point to a pdf file", + ) def test_semantic_scholar_2(self): prov = SemanticScholar(upload=False, verbose=VERBOSE) @@ -350,6 +355,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_semantic_scholar_3(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/A-historical-account-of-how-continental-drift-and-Meinhold-%C5%9Eeng%C3%B6r/e7be87319985445e3ef7addf1ebd10899b92441f" + exp = "Meinhold_Sengor_-_A_Historical_Account_of_How_Continental_Drift_and_Plate_Tectonics_Provided_the_Framework_for_Our_Current_Understanding_of_Palaeogeography_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_sagepub_1(self): prov = SagePub(upload=False, verbose=VERBOSE) url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" -- cgit v1.2.3 From bd46fd34f14b2636b6fee6cd2d338b24aee2945c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 27 Mar 2021 22:51:17 +0000 Subject: Temporarily disable test Not sure why this one is failing, it works locally but not on GitHub actions. The difference seems to be whether social links are included in the readability.js output. --- tests/test_providers.py | 1 + 1 file changed, 1 insertion(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index af69c64..d47a9df 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -329,6 +329,7 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + @unittest.skip("Skipping html_5 test") def test_html_5(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#" -- cgit v1.2.3