From 0456a377b3deef09a533b79224f4590e02372040 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 31 Jan 2020 13:17:58 +0000 Subject: [WIP] Initial commit of HTML provider --- tests/test_providers.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index e256eec..80f4662 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -16,6 +16,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + HTML, LocalFile, NeurIPS, OpenReview, @@ -206,6 +207,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_1(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From ce9c1333fcf761e322ad169df3969ca23d9938e7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 4 Feb 2020 10:50:50 +0000 Subject: Add another test for the html provider --- tests/test_providers.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 80f4662..d0e3d40 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -214,6 +214,14 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_2(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.nature.com/articles/d41586-020-00176-4" + exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 0128dce1c10be8db965584aa387bf00040a3f018 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:14:44 +0000 Subject: Add NBER provider --- tests/test_providers.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index d0e3d40..38f88b7 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -18,6 +18,7 @@ from paper2remarkable.providers import ( CiteSeerX, HTML, LocalFile, + NBER, NeurIPS, OpenReview, PMLR, @@ -179,6 +180,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_nber_1(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w26752" + exp = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_nber_2(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w19152.pdf" + exp = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_1(self): prov = NeurIPS(upload=False, verbose=VERBOSE) url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf" -- cgit v1.2.3 From d43e1fbe10f18fdbac08aae414e605c8387cb19b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:38:03 +0000 Subject: Extract filename from url with pdf_url provider Fixes #25 --- tests/test_providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index d0e3d40..82c8500 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -148,8 +148,8 @@ class TestProviders(unittest.TestCase): def test_pdfurl(self): prov = PdfUrl(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" - filename = prov.run(url, filename="test.pdf") - self.assertEqual("test.pdf", os.path.basename(filename)) + filename = prov.run(url) + self.assertEqual("14-526.pdf", os.path.basename(filename)) def test_pmlr_1(self): prov = PMLR(upload=False, verbose=VERBOSE) -- cgit v1.2.3 From de7fa6bf3b7c25eb2a2e07fce769b515bca92e7d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 21 Feb 2020 16:23:01 +0000 Subject: Add provider for JMLR --- tests/test_providers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 493a209..2bf7507 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -17,6 +17,7 @@ from paper2remarkable.providers import ( Arxiv, CiteSeerX, HTML, + JMLR, LocalFile, NBER, NeurIPS, @@ -152,6 +153,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual("14-526.pdf", os.path.basename(filename)) + def test_jmlr_1(self): + prov = JMLR(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + exp = "Burg_Groenen_-_GenSVM_a_Generalized_Multiclass_Support_Vector_Machine_2016.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_jmlr_2(self): + prov = JMLR(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/v10/xu09a.html" + exp = "Xu_Zhang_-_Refinement_of_Reproducing_Kernels_2009.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_pmlr_1(self): prov = PMLR(upload=False, verbose=VERBOSE) url = "http://proceedings.mlr.press/v97/behrmann19a.html" @@ -237,6 +252,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(exp, os.path.basename(filename)) - if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 01c294bccd10f8c430e1c959fbb5ebacea8f3c3a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:53:26 +0100 Subject: Add unit test --- tests/test_providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 2bf7507..e539949 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -69,7 +69,8 @@ class TestProviders(unittest.TestCase): shutil.rmtree(self.test_dir) def test_arxiv_1(self): - prov = Arxiv(upload=False, verbose=VERBOSE) + # check with qpdf + prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) -- cgit v1.2.3 From f24d1b3fdba482e69f7cfc7a6fb7ecabbcba069d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 16:20:15 +0100 Subject: Move arXiv tests to a separate file --- tests/test_arxiv.py | 29 +++++++++++++++++++++++++++++ tests/test_providers.py | 14 -------------- 2 files changed, 29 insertions(+), 14 deletions(-) create mode 100644 tests/test_arxiv.py (limited to 'tests') diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py new file mode 100644 index 0000000..beb9baa --- /dev/null +++ b/tests/test_arxiv.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for arXiv provider + +This file is part of paper2remarkable. + +""" + +import re +import unittest + +from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX + + +class TestArxiv(unittest.TestCase): + def test_text_regex_1(self): + key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_2(self): + key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py index e539949..e0239ed 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,7 +7,6 @@ __author__ = "G.J.J. van den Burg" import hashlib import os -import re import shutil import tempfile import unittest @@ -27,7 +26,6 @@ from paper2remarkable.providers import ( PubMed, Springer, ) -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX VERBOSE = False @@ -43,18 +41,6 @@ def md5sum(filename): return hasher.hexdigest() -class TestArxiv(unittest.TestCase): - def test_text_regex_1(self): - key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - def test_text_regex_2(self): - key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - class TestProviders(unittest.TestCase): @classmethod def setUpClass(cls): -- cgit v1.2.3 From ed9b8252a2361604331f7a275a7625b3de9017ff Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 17:20:47 +0100 Subject: Fix provider selection for redirected urls Some urls, such as the arXiv urls with the : in the identifier, didn't work when using the UI interface because the redirected url wasn't past to the provider, but the original url was. This commit fixes that issue and adds unit tests for the provider selection function, hopefully making this more robust in the future. --- tests/test_ui.py | 203 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 tests/test_ui.py (limited to 'tests') diff --git a/tests/test_ui.py b/tests/test_ui.py new file mode 100644 index 0000000..fc362a0 --- /dev/null +++ b/tests/test_ui.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for command line interface + +This file is part of paper2remarkable. + +""" + +import os +import shutil +import tempfile +import unittest + +from paper2remarkable.exceptions import ( + InvalidURLError, + UnidentifiedSourceError, +) +from paper2remarkable.providers import ( + ACM, + Arxiv, + CiteSeerX, + HTML, + JMLR, + LocalFile, + NBER, + NeurIPS, + OpenReview, + PMLR, + PdfUrl, + PubMed, + Springer, +) +from paper2remarkable.ui import choose_provider + + +class TestUI(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_choose_provider_1(self): + tests = [ + ( + Arxiv, + "https://arxiv.org/abs/1811.11242v1", + "https://arxiv.org/abs/1811.11242v1", + ), + ( + Arxiv, + "http://arxiv.org/abs/arXiv:1908.03213", + "https://arxiv.org/abs/1908.03213", + ), + ( + Arxiv, + "https://arxiv.org/abs/math/0309285", + "https://arxiv.org/abs/math/0309285", + ), + ( + Arxiv, + "https://arxiv.org/pdf/physics/0605197v1.pdf", + "https://arxiv.org/pdf/physics/0605197v1.pdf", + ), + ( + PubMed, + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + ), + ( + ACM, + "https://dl.acm.org/citation.cfm?id=3025626", + "https://dl.acm.org/doi/10.1145/3025453.3025626", + ), + ( + ACM, + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true", + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true&", + ), + ( + OpenReview, + "http://openreview.net/forum?id=S1x4ghC9tQ", + "https://openreview.net/forum?id=S1x4ghC9tQ", + ), + ( + Springer, + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + ), + ( + PdfUrl, + "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", + "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", + ), + ( + JMLR, + "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + ), + ( + JMLR, + "http://www.jmlr.org/papers/v10/xu09a.html", + "http://www.jmlr.org/papers/v10/xu09a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v97/behrmann19a.html", + "http://proceedings.mlr.press/v97/behrmann19a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/melnyk16.pdf", + "http://proceedings.mlr.press/v48/melnyk16.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/zhangf16.html", + "http://proceedings.mlr.press/v48/zhangf16.html", + ), + ( + NBER, + "https://www.nber.org/papers/w26752", + "https://www.nber.org/papers/w26752", + ), + ( + NBER, + "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/papers/w19152.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/7796-middle-out-decoding", + "https://papers.nips.cc/paper/7796-middle-out-decoding", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + ), + ( + HTML, + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + ), + ( + HTML, + "https://www.nature.com/articles/d41586-020-00176-4", + "https://www.nature.com/articles/d41586-020-00176-4" + ), + ] + for exp_prov, url, exp_url in tests: + prov, new_url, jar = choose_provider(url) + with self.subTest(url=url): + self.assertEqual(exp_url, new_url) + self.assertEqual(prov, exp_prov) + + def test_choose_provider_2(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + + prov, new_input, jar = choose_provider(local_filename) + self.assertEqual(prov, LocalFile) + self.assertEqual(new_input, local_filename) + self.assertIsNone(jar) + + def test_choose_provider_3(self): + local_filename = "/tmp/abcdef.pdf" + with self.assertRaises(UnidentifiedSourceError): + choose_provider(local_filename) + + def test_choose_provider_4(self): + url = "https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/README.md" + with self.assertRaises(InvalidURLError): + choose_provider(url) + + +if __name__ == "__main__": + unittest.main() -- cgit v1.2.3 From b8452034ed3a503e06e58f524ac322a4ab0203bb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 22:43:11 +0100 Subject: Code formatting --- tests/test_ui.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'tests') diff --git a/tests/test_ui.py b/tests/test_ui.py index fc362a0..11ed87a 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -160,15 +160,15 @@ class TestUI(unittest.TestCase): "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", ), ( - HTML, - "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", - "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" - ), - ( - HTML, - "https://www.nature.com/articles/d41586-020-00176-4", - "https://www.nature.com/articles/d41586-020-00176-4" - ), + HTML, + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + ), + ( + HTML, + "https://www.nature.com/articles/d41586-020-00176-4", + "https://www.nature.com/articles/d41586-020-00176-4", + ), ] for exp_prov, url, exp_url in tests: prov, new_url, jar = choose_provider(url) -- cgit v1.2.3 From 62d72c8c073376a036df66d872ffd6149374fd7b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 8 Apr 2020 21:15:48 +0100 Subject: Be more robust against spaces in pdf file This caused problems where the arxiv stamp was not removed for some files. This commit adds tests for this and fixes the issue. --- tests/test_arxiv.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py index beb9baa..08ea2c4 100644 --- a/tests/test_arxiv.py +++ b/tests/test_arxiv.py @@ -7,13 +7,28 @@ This file is part of paper2remarkable. """ +import os import re +import shutil +import tempfile import unittest -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX +from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv class TestArxiv(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + def test_text_regex_1(self): key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" m = re.fullmatch(DEARXIV_TEXT_REGEX, key) @@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase): m = re.fullmatch(DEARXIV_TEXT_REGEX, key) self.assertIsNotNone(m) + def test_stamp_removed_1(self): + url = "https://arxiv.org/pdf/1703.06103.pdf" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data) + + def test_stamp_removed_2(self): + url = "https://arxiv.org/abs/2003.06222" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From e0aba92623d9961602d37a5e3f6ce01403e3598a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Apr 2020 11:05:19 +0100 Subject: Properly check for the installed pdf tool This fixes #42. --- tests/test_utils.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/test_utils.py (limited to 'tests') diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..4c122e0 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest + +from paper2remarkable.exceptions import NoPDFToolError +from paper2remarkable.utils import check_pdftool + + +class TestUtils(unittest.TestCase): + def test_check_pdftool(self): + # Needs a system with both pdftk and qpdf available + self.assertEqual(check_pdftool("pdftk", "qpdf"), "pdftk") + self.assertEqual(check_pdftool("pdftk_xyz", "qpdf"), "qpdf") + self.assertEqual(check_pdftool("pdftk", "qpdf_xyz"), "pdftk") + with self.assertRaises(NoPDFToolError): + check_pdftool("pdftk_xyz", "qpdf_xyz") + + +if __name__ == "__main__": + unittest.main() -- cgit v1.2.3 From fb825cab2e4681a6e6cae8cf32adeeb880a4910c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 17:33:13 +0100 Subject: Add unit test for this bug --- tests/test_providers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index e0239ed..0787792 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,6 +7,7 @@ __author__ = "G.J.J. van den Burg" import hashlib import os +import pdfplumber import shutil import tempfile import unittest @@ -238,6 +239,15 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_3(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://conclave-team.github.io/conclave-site/" + exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + # this is a proxy test to check that all images are included + self.assertEqual(32, len(pdfplumber.open(filename).pages)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 58be7d78a105c0b0f871b339daa29cdf8f6557d4 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 28 Apr 2020 13:18:42 +0100 Subject: Add unit test for image urls with trailing slash --- tests/test_providers.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 0787792..a7f17ff 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -248,6 +248,14 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(32, len(pdfplumber.open(filename).pages)) + def test_html_4(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://sirupsen.com/2019/" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From f8c0e4b2c953d617ffea4a09a7373f697a5eb104 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 2 May 2020 19:34:14 +0100 Subject: Various improvements to dearxiv functionality --- tests/test_arxiv.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++- tests/test_providers.py | 8 ++++++- 2 files changed, 68 insertions(+), 2 deletions(-) (limited to 'tests') diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py index 08ea2c4..2cb84cf 100644 --- a/tests/test_arxiv.py +++ b/tests/test_arxiv.py @@ -13,7 +13,11 @@ import shutil import tempfile import unittest -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv +from paper2remarkable.providers.arxiv import ( + DEARXIV_TEXT_REGEX, + DEARXIV_URI_REGEX, + Arxiv, +) class TestArxiv(unittest.TestCase): @@ -39,6 +43,26 @@ class TestArxiv(unittest.TestCase): m = re.fullmatch(DEARXIV_TEXT_REGEX, key) self.assertIsNotNone(m) + def test_text_regex_3(self): + key = b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_4(self): + key = b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_1(self): + key = b"http://arxiv.org/abs/physics/0605197v1" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_2(self): + key = b"https://arxiv.org/abs/1101.0028v3" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + def test_stamp_removed_1(self): url = "https://arxiv.org/pdf/1703.06103.pdf" prov = Arxiv(upload=False) @@ -57,6 +81,42 @@ class TestArxiv(unittest.TestCase): data = fp.read() self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data) + def test_stamp_removed_3(self): + url = "https://arxiv.org/abs/physics/0605197v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006", data + ) + self.assertNotIn( + b"/URI (http://arxiv.org/abs/physics/0605197v1)", data + ) + + def test_stamp_removed_4(self): + url = "https://arxiv.org/abs/math/0309285v2" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004", data) + self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data) + + def test_stamp_removed_5(self): + url = "https://arxiv.org/abs/astro-ph/9207001v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data + ) + self.assertNotIn(b"arXiv:astro-ph/9207001v1 13 Jul 1992", data) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py index a7f17ff..d2fdb0d 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -86,6 +86,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_5(self): + prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None) + url = "https://arxiv.org/abs/2002.11523" + exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" @@ -256,6 +263,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(4, len(pdfplumber.open(filename).pages)) - if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 7cc0b6e320e45b9ce442425a04ac4708fb3df077 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 11 May 2020 17:32:21 +0100 Subject: Allow underscore in urls --- tests/test_ui.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'tests') diff --git a/tests/test_ui.py b/tests/test_ui.py index 11ed87a..7ae1e79 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -99,6 +99,11 @@ class TestUI(unittest.TestCase): "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", ), + ( + PdfUrl, + "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", + "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", + ), ( JMLR, "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", -- cgit v1.2.3 From 96de81d4158f7779132c9f7883c62bc3f15b6915 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 20:45:44 +0100 Subject: Add test for pdf reading issue --- tests/test_providers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index d2fdb0d..fb75fbd 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -142,12 +142,18 @@ class TestProviders(unittest.TestCase): filename = prov.run(local_filename) self.assertEqual("test_.pdf", os.path.basename(filename)) - def test_pdfurl(self): + def test_pdfurl_1(self): prov = PdfUrl(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" filename = prov.run(url) self.assertEqual("14-526.pdf", os.path.basename(filename)) + def test_pdfurl_2(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "https://www.manuelrigger.at/preprints/NoREC.pdf" + filename = prov.run(url) + self.assertEqual("NoREC.pdf", os.path.basename(filename)) + def test_jmlr_1(self): prov = JMLR(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" -- cgit v1.2.3 From a5522a9cc39b61d0d26705f99279381dcb9e7f9f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 21:07:46 +0100 Subject: Remove extra pdfurl test This test seems to fail repeatedly on Travis, for no clear reason (it works locally). Since we have another PdfUrl test I don't think it's necessary to have this one too, so I'll remove it. --- tests/test_ui.py | 5 ----- 1 file changed, 5 deletions(-) (limited to 'tests') diff --git a/tests/test_ui.py b/tests/test_ui.py index 7ae1e79..7ab5099 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -94,11 +94,6 @@ class TestUI(unittest.TestCase): "https://link.springer.com/article/10.1007/s10618-019-00631-5", "https://link.springer.com/article/10.1007/s10618-019-00631-5", ), - ( - PdfUrl, - "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", - "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", - ), ( PdfUrl, "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", -- cgit v1.2.3 From ec000de563a32b4e757c9afde5a1b1b5ac80a511 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 22:42:10 +0100 Subject: Add support for using ReadabiliPy --- tests/test_providers.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index fb75fbd..ca6c1ae 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + def test_html_5(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From d5230d43d58c992212c89f3c221f72784a3a309d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 5 Jul 2020 23:33:11 +0100 Subject: Add provider for Semantic Scholar --- tests/test_providers.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index fb75fbd..1a6f84f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -26,6 +26,7 @@ from paper2remarkable.providers import ( PdfUrl, PubMed, Springer, + SemanticScholar ) VERBOSE = False @@ -268,6 +269,20 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + def test_semantic_scholar_1(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" + exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_semantic_scholar_2(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f" + exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 0a6a4ff3893474e33f71ef2d8a881cc360a29094 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:23:12 +0100 Subject: Improve robustness of springer provider Adds support for downloading chapters --- tests/test_providers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 1a6f84f..5c8a8e4 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -126,13 +126,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_springer(self): + def test_springer_1(self): prov = Springer(upload=False, verbose=VERBOSE) url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_springer_2(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf" + exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_local(self): local_filename = "test.pdf" with open(local_filename, "w") as fp: -- cgit v1.2.3 From 071b5a0f2958c34f1a189259346a8732a1110de2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 22:44:33 +0100 Subject: Add provider for SagePub --- tests/test_providers.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 5c8a8e4..ba1cc3a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -25,8 +25,9 @@ from paper2remarkable.providers import ( PMLR, PdfUrl, PubMed, + SagePub, Springer, - SemanticScholar + SemanticScholar, ) VERBOSE = False @@ -290,6 +291,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_sagepub_1(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" + exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_2(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432" + exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From b56d376ff87cfc7fc599f40e13338a3c1a489877 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 11 Aug 2020 22:18:30 +0100 Subject: Fix failing unit test --- tests/test_ui.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tests') diff --git a/tests/test_ui.py b/tests/test_ui.py index 7ab5099..5747eb9 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -101,13 +101,13 @@ class TestUI(unittest.TestCase): ), ( JMLR, - "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", - "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", ), ( JMLR, - "http://www.jmlr.org/papers/v10/xu09a.html", - "http://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", ), ( PMLR, -- cgit v1.2.3 From 25f372c69dfc846faebb4763ecc60e9e0750021b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 20:54:38 +0200 Subject: Improve support for Neurips provider (fixes #59) --- tests/test_providers.py | 14 ++++++++++++++ tests/test_ui.py | 7 ++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index ba1cc3a..eeaef82 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -233,6 +233,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_3(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits" + exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_neurips_4(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf" + exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_citeseerx_1(self): prov = CiteSeerX(upload=False, verbose=VERBOSE) url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548" diff --git a/tests/test_ui.py b/tests/test_ui.py index 5747eb9..61b371d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -98,7 +98,7 @@ class TestUI(unittest.TestCase): PdfUrl, "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", - ), + ), ( JMLR, "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", @@ -149,6 +149,11 @@ class TestUI(unittest.TestCase): "https://papers.nips.cc/paper/7796-middle-out-decoding", "https://papers.nips.cc/paper/7796-middle-out-decoding", ), + ( + NeurIPS, + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + ), ( CiteSeerX, "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", -- cgit v1.2.3 From 6338388cea254ba4c6090eb17a8942a13b7a2b1c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:25:41 +0200 Subject: Clean up readability providers This reorganizes the code a bit to ensure we only pull the HTML page once, and use the same readability provider for both the informer and the converter. --- tests/test_providers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index ca6c1ae..479fb84 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -255,7 +255,10 @@ class TestProviders(unittest.TestCase): def test_html_3(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://conclave-team.github.io/conclave-site/" - exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we + # assume that testing is done with Readability.JS + exp = "Conclave.pdf" filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) # this is a proxy test to check that all images are included -- cgit v1.2.3 From fcd8d3cd1d94780315a82655ce6b9571534c0a7d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:00:58 +0200 Subject: Updates to NBER provider after site updates --- tests/test_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_ui.py b/tests/test_ui.py index 61b371d..97ec44d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -137,7 +137,7 @@ class TestUI(unittest.TestCase): ( NBER, "https://www.nber.org/papers/w19152.pdf", - "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/system/files/working_papers/w19152/w19152.pdf", ), ( NeurIPS, -- cgit v1.2.3 From b77c06ad3deb27b90a91f468b0123923d217d53d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:11:18 +0200 Subject: Increase robustness for arXiv urls --- tests/test_providers.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 70d012a..12f748e 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -95,6 +95,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_6(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------" + exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_with_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" -- cgit v1.2.3 From 1f07867ec7aebb1b1aa6b806e35a46e73c034605 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:15:43 +0200 Subject: fix typo --- tests/test_providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 12f748e..b6cce59 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -98,7 +98,7 @@ class TestProviders(unittest.TestCase): def test_arxiv_6(self): prov = Arxiv(upload=False, verbose=VERBOSE) url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------" - exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_with_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" + exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_With_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) -- cgit v1.2.3 From 1e83f9f6537fa108d7a157daaaeb3dc06e80fdce Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 23 Oct 2020 12:24:50 +0200 Subject: Code formatting --- tests/test_providers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index b6cce59..546794c 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -285,8 +285,8 @@ class TestProviders(unittest.TestCase): def test_html_3(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://conclave-team.github.io/conclave-site/" - #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" - # NOTE: Title differs between Readability.JS and readability-lxml, we + # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we # assume that testing is done with Readability.JS exp = "Conclave.pdf" filename = prov.run(url) @@ -337,6 +337,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(exp, os.path.basename(filename)) - if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 14cacacf3fd7b78b287ec7e6b127bd24f0ea4f56 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 23 Oct 2020 17:36:20 +0200 Subject: Add CVF provider --- tests/test_providers.py | 19 +++++++++++++++++++ tests/test_ui.py | 6 ++++++ 2 files changed, 25 insertions(+) (limited to 'tests') diff --git a/tests/test_providers.py b/tests/test_providers.py index 546794c..e701234 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -16,6 +16,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + CVF, HTML, JMLR, LocalFile, @@ -336,6 +337,24 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_cvf_1(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_ICCV_2019/html/Muhammad_Goal-Driven_Sequential_Data_Abstraction_ICCV_2019_paper.html" + exp = ( + "Muhammad_et_al_-_Goal-Driven_Sequential_Data_Abstraction_2019.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_cvf_2(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_CVPR_2020/papers/Park_Seeing_the_World_in_a_Bag_of_Chips_CVPR_2020_paper.pdf" + exp = ( + "Park_Holynski_Seitz_-_Seeing_the_World_in_a_Bag_of_Chips_2020.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_ui.py b/tests/test_ui.py index 97ec44d..835f594 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -20,6 +20,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + CVF, HTML, JMLR, LocalFile, @@ -174,6 +175,11 @@ class TestUI(unittest.TestCase): "https://www.nature.com/articles/d41586-020-00176-4", "https://www.nature.com/articles/d41586-020-00176-4", ), + ( + CVF, + "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html", + "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html", + ), ] for exp_prov, url, exp_url in tests: prov, new_url, jar = choose_provider(url) -- cgit v1.2.3