diff options
Diffstat (limited to 'tests/test_providers.py')
| -rw-r--r-- | tests/test_providers.py | 185 |
1 files changed, 166 insertions, 19 deletions
diff --git a/tests/test_providers.py b/tests/test_providers.py index 3204768..4ee6773 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,7 +7,7 @@ __author__ = "G.J.J. van den Burg" import hashlib import os -import re +import pdfplumber import shutil import tempfile import unittest @@ -15,17 +15,22 @@ import unittest from paper2remarkable.providers import ( ACM, Arxiv, + CVF, CiteSeerX, + HTML, + JMLR, LocalFile, + NBER, NeurIPS, OpenReview, PMLR, PdfUrl, PubMed, + SagePub, + SemanticScholar, Springer, TandFOnline, ) -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX VERBOSE = False @@ -41,18 +46,6 @@ def md5sum(filename): return hasher.hexdigest() -class TestArxiv(unittest.TestCase): - def test_text_regex_1(self): - key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - def test_text_regex_2(self): - key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - class TestProviders(unittest.TestCase): @classmethod def setUpClass(cls): @@ -67,7 +60,8 @@ class TestProviders(unittest.TestCase): shutil.rmtree(self.test_dir) def test_arxiv_1(self): - prov = Arxiv(upload=False, verbose=VERBOSE) + # check with qpdf + prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) @@ -96,6 +90,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_5(self): + prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None) + url = "https://arxiv.org/abs/2002.11523" + exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_arxiv_6(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------" + exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_With_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" @@ -128,13 +136,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_springer(self): + def test_springer_1(self): prov = Springer(upload=False, verbose=VERBOSE) url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_springer_2(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf" + exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_local(self): local_filename = "test.pdf" with open(local_filename, "w") as fp: @@ -145,11 +160,31 @@ class TestProviders(unittest.TestCase): filename = prov.run(local_filename) self.assertEqual("test_.pdf", os.path.basename(filename)) - def test_pdfurl(self): + def test_pdfurl_1(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url) + self.assertEqual("14-526.pdf", os.path.basename(filename)) + + def test_pdfurl_2(self): prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "https://www.manuelrigger.at/preprints/NoREC.pdf" + filename = prov.run(url) + self.assertEqual("NoREC.pdf", os.path.basename(filename)) + + def test_jmlr_1(self): + prov = JMLR(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" - filename = prov.run(url, filename="test.pdf") - self.assertEqual("test.pdf", os.path.basename(filename)) + exp = "Burg_Groenen_-_GenSVM_a_Generalized_Multiclass_Support_Vector_Machine_2016.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_jmlr_2(self): + prov = JMLR(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/v10/xu09a.html" + exp = "Xu_Zhang_-_Refinement_of_Reproducing_Kernels_2009.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) def test_pmlr_1(self): prov = PMLR(upload=False, verbose=VERBOSE) @@ -179,6 +214,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_nber_1(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w26752" + exp = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_nber_2(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w19152.pdf" + exp = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_1(self): prov = NeurIPS(upload=False, verbose=VERBOSE) url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf" @@ -193,6 +242,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_3(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits" + exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_neurips_4(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf" + exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_citeseerx_1(self): prov = CiteSeerX(upload=False, verbose=VERBOSE) url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548" @@ -218,6 +281,90 @@ class TestProviders(unittest.TestCase): prov = TandFOnline(upload=False, verbose=VERBOSE) url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true" exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf" + + def test_html_1(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_html_2(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.nature.com/articles/d41586-020-00176-4" + exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_html_3(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://conclave-team.github.io/conclave-site/" + # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we + # assume that testing is done with Readability.JS + exp = "Conclave.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + # this is a proxy test to check that all images are included + self.assertEqual(32, len(pdfplumber.open(filename).pages)) + + def test_html_4(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://sirupsen.com/2019/" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + + def test_html_5(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + + def test_semantic_scholar_1(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" + exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_semantic_scholar_2(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f" + exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_1(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" + exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_2(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432" + exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_cvf_1(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_ICCV_2019/html/Muhammad_Goal-Driven_Sequential_Data_Abstraction_ICCV_2019_paper.html" + exp = ( + "Muhammad_et_al_-_Goal-Driven_Sequential_Data_Abstraction_2019.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_cvf_2(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_CVPR_2020/papers/Park_Seeing_the_World_in_a_Bag_of_Chips_CVPR_2020_paper.pdf" + exp = ( + "Park_Holynski_Seitz_-_Seeing_the_World_in_a_Bag_of_Chips_2020.pdf" + ) filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) |
