aboutsummaryrefslogtreecommitdiff
path: root/tests/test_providers.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_providers.py')
-rw-r--r--tests/test_providers.py185
1 files changed, 166 insertions, 19 deletions
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 3204768..4ee6773 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,7 +7,7 @@ __author__ = "G.J.J. van den Burg"
import hashlib
import os
-import re
+import pdfplumber
import shutil
import tempfile
import unittest
@@ -15,17 +15,22 @@ import unittest
from paper2remarkable.providers import (
ACM,
Arxiv,
+ CVF,
CiteSeerX,
+ HTML,
+ JMLR,
LocalFile,
+ NBER,
NeurIPS,
OpenReview,
PMLR,
PdfUrl,
PubMed,
+ SagePub,
+ SemanticScholar,
Springer,
TandFOnline,
)
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
VERBOSE = False
@@ -41,18 +46,6 @@ def md5sum(filename):
return hasher.hexdigest()
-class TestArxiv(unittest.TestCase):
- def test_text_regex_1(self):
- key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
- m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
- self.assertIsNotNone(m)
-
- def test_text_regex_2(self):
- key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019"
- m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
- self.assertIsNotNone(m)
-
-
class TestProviders(unittest.TestCase):
@classmethod
def setUpClass(cls):
@@ -67,7 +60,8 @@ class TestProviders(unittest.TestCase):
shutil.rmtree(self.test_dir)
def test_arxiv_1(self):
- prov = Arxiv(upload=False, verbose=VERBOSE)
+ # check with qpdf
+ prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None)
url = "https://arxiv.org/abs/1811.11242v1"
exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
filename = prov.run(url)
@@ -96,6 +90,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_arxiv_5(self):
+ prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None)
+ url = "https://arxiv.org/abs/2002.11523"
+ exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
+ def test_arxiv_6(self):
+ prov = Arxiv(upload=False, verbose=VERBOSE)
+ url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------"
+ exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_With_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_pmc(self):
prov = PubMed(upload=False, verbose=VERBOSE)
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
@@ -128,13 +136,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
- def test_springer(self):
+ def test_springer_1(self):
prov = Springer(upload=False, verbose=VERBOSE)
url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_springer_2(self):
+ prov = Springer(upload=False, verbose=VERBOSE)
+ url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf"
+ exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_local(self):
local_filename = "test.pdf"
with open(local_filename, "w") as fp:
@@ -145,11 +160,31 @@ class TestProviders(unittest.TestCase):
filename = prov.run(local_filename)
self.assertEqual("test_.pdf", os.path.basename(filename))
- def test_pdfurl(self):
+ def test_pdfurl_1(self):
+ prov = PdfUrl(upload=False, verbose=VERBOSE)
+ url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
+ filename = prov.run(url)
+ self.assertEqual("14-526.pdf", os.path.basename(filename))
+
+ def test_pdfurl_2(self):
prov = PdfUrl(upload=False, verbose=VERBOSE)
+ url = "https://www.manuelrigger.at/preprints/NoREC.pdf"
+ filename = prov.run(url)
+ self.assertEqual("NoREC.pdf", os.path.basename(filename))
+
+ def test_jmlr_1(self):
+ prov = JMLR(upload=False, verbose=VERBOSE)
url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
- filename = prov.run(url, filename="test.pdf")
- self.assertEqual("test.pdf", os.path.basename(filename))
+ exp = "Burg_Groenen_-_GenSVM_a_Generalized_Multiclass_Support_Vector_Machine_2016.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_jmlr_2(self):
+ prov = JMLR(upload=False, verbose=VERBOSE)
+ url = "http://www.jmlr.org/papers/v10/xu09a.html"
+ exp = "Xu_Zhang_-_Refinement_of_Reproducing_Kernels_2009.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
def test_pmlr_1(self):
prov = PMLR(upload=False, verbose=VERBOSE)
@@ -179,6 +214,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_nber_1(self):
+ prov = NBER(upload=False, verbose=VERBOSE)
+ url = "https://www.nber.org/papers/w26752"
+ exp = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_nber_2(self):
+ prov = NBER(upload=False, verbose=VERBOSE)
+ url = "https://www.nber.org/papers/w19152.pdf"
+ exp = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
def test_neurips_1(self):
prov = NeurIPS(upload=False, verbose=VERBOSE)
url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf"
@@ -193,6 +242,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_neurips_3(self):
+ prov = NeurIPS(upload=False, verbose=VERBOSE)
+ url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits"
+ exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_neurips_4(self):
+ prov = NeurIPS(upload=False, verbose=VERBOSE)
+ url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf"
+ exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
def test_citeseerx_1(self):
prov = CiteSeerX(upload=False, verbose=VERBOSE)
url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548"
@@ -218,6 +281,90 @@ class TestProviders(unittest.TestCase):
prov = TandFOnline(upload=False, verbose=VERBOSE)
url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true"
exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf"
+
+ def test_html_1(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines"
+ exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_html_2(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://www.nature.com/articles/d41586-020-00176-4"
+ exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_html_3(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://conclave-team.github.io/conclave-site/"
+ # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+ # NOTE: Title differs between Readability.JS and readability-lxml, we
+ # assume that testing is done with Readability.JS
+ exp = "Conclave.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+ # this is a proxy test to check that all images are included
+ self.assertEqual(32, len(pdfplumber.open(filename).pages))
+
+ def test_html_4(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://sirupsen.com/2019/"
+ filename = prov.run(url)
+ # this is a proxy test to check that all images are included
+ self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
+ def test_html_5(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+ filename = prov.run(url)
+ # this is a proxy test to check that all images are included
+ self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
+ def test_semantic_scholar_1(self):
+ prov = SemanticScholar(upload=False, verbose=VERBOSE)
+ url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
+ exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_semantic_scholar_2(self):
+ prov = SemanticScholar(upload=False, verbose=VERBOSE)
+ url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f"
+ exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_sagepub_1(self):
+ prov = SagePub(upload=False, verbose=VERBOSE)
+ url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679"
+ exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_sagepub_2(self):
+ prov = SagePub(upload=False, verbose=VERBOSE)
+ url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432"
+ exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_cvf_1(self):
+ prov = CVF(upload=False, verbose=VERBOSE)
+ url = "https://openaccess.thecvf.com/content_ICCV_2019/html/Muhammad_Goal-Driven_Sequential_Data_Abstraction_ICCV_2019_paper.html"
+ exp = (
+ "Muhammad_et_al_-_Goal-Driven_Sequential_Data_Abstraction_2019.pdf"
+ )
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_cvf_2(self):
+ prov = CVF(upload=False, verbose=VERBOSE)
+ url = "https://openaccess.thecvf.com/content_CVPR_2020/papers/Park_Seeing_the_World_in_a_Bag_of_Chips_CVPR_2020_paper.pdf"
+ exp = (
+ "Park_Holynski_Seitz_-_Seeing_the_World_in_a_Bag_of_Chips_2020.pdf"
+ )
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))