diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 17:29:23 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 17:29:23 +0100 |
| commit | 9e262c7e15f13920524deda1e9354069a8dbf5f1 (patch) | |
| tree | d0509e42636103a18eab7c790ac964c4548e9f07 | |
| parent | Merge branch 'bugfix/springer_redirect' (diff) | |
| parent | Fix provider selection for redirected urls (diff) | |
| download | paper2remarkable-9e262c7e15f13920524deda1e9354069a8dbf5f1.tar.gz paper2remarkable-9e262c7e15f13920524deda1e9354069a8dbf5f1.zip | |
Merge branch 'bugfix/choose_provider'
| -rw-r--r-- | paper2remarkable/exceptions.py | 22 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 71 | ||||
| -rw-r--r-- | tests/test_ui.py | 203 |
3 files changed, 278 insertions, 18 deletions
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 5ea9a78..94e9470 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -116,3 +116,25 @@ class NoPDFToolError(Error): ) msg += GH_MSG return msg + +class UnidentifiedSourceError(Error): + """Exception raised when the input is neither a local file nor a url """ + + def __str__(self): + msg = ( + "ERROR: Couldn't figure out what source you mean. If it's a " + "local file, please make sure it exists." + ) + msg += GH_MSG + return msg + +class InvalidURLError(Error): + """Exception raised when no provider can handle a url source """ + + def __str__(self): + msg = ( + "ERROR: Input URL is not valid, no provider can handle " + "this source." + ) + msg += GH_MSG + return msg diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index bfb3647..bf57552 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,6 +13,7 @@ import sys from . import __version__, GITHUB_URL +from .exceptions import UnidentifiedSourceError, InvalidURLError from .providers import providers, LocalFile from .utils import follow_redirects, is_url @@ -118,9 +119,58 @@ def exception(msg): raise SystemExit(1) +def choose_provider(cli_input): + """Choose the provider to use for the given source + + This function first tries to check if the input is a local file, by + checking if the path exists. Next, it checks if the input is a "valid" url + using a regex test. If it is, the registered provider classes are checked + to see which provider can handle this url. + + Returns + ------- + provider : class + The class of the provider than can handle the source. A subclass of the + Provider abc. + + new_input : str + The updated input to the provider. This only has an effect for the url + providers, where this will be the url after following all redirects. + + cookiejar : dict or requests.RequestsCookieJar + Cookies picked up when following redirects. These are needed for some + providers to ensure later requests have the right cookie settings. + + Raises + ------ + UnidentifiedSourceError + Raised when the input is neither an existing local file nor a valid url + + InvalidURLError + Raised when the input *is* a valid url, but no provider can handle it. + + """ + provider = cookiejar = None + if LocalFile.validate(cli_input): + # input is a local file + new_input = cli_input + provider = LocalFile + elif is_url(cli_input): + # input is a url + new_input, cookiejar = follow_redirects(cli_input) + provider = next((p for p in providers if p.validate(new_input)), None) + else: + # not a proper URL or non-existent file + raise UnidentifiedSourceError + + if provider is None: + raise InvalidURLError + + return provider, new_input, cookiejar + + def main(): args = parse_args() - cookiejar = None if args.center and args.right: exception("Can't center and right align at the same time!") @@ -131,22 +181,7 @@ def main(): if args.right and args.no_crop: exception("Can't right align and not crop at the same time!") - if LocalFile.validate(args.input): - # input is a local file - provider = LocalFile - elif is_url(args.input): - # input is a url - url, cookiejar = follow_redirects(args.input) - provider = next((p for p in providers if p.validate(url)), None) - else: - # not a proper URL or non-existent file - exception( - "Couldn't figure out what source you mean. If it's a " - "local file, make sure it exists." - ) - - if provider is None: - exception("Input not valid, no provider can handle this source.") + provider, new_input, cookiejar = choose_provider(args.input) prov = provider( verbose=args.verbose, @@ -165,4 +200,4 @@ def main(): cookiejar=cookiejar, ) - prov.run(args.input, filename=args.filename) + prov.run(new_input, filename=args.filename) diff --git a/tests/test_ui.py b/tests/test_ui.py new file mode 100644 index 0000000..fc362a0 --- /dev/null +++ b/tests/test_ui.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for command line interface + +This file is part of paper2remarkable. + +""" + +import os +import shutil +import tempfile +import unittest + +from paper2remarkable.exceptions import ( + InvalidURLError, + UnidentifiedSourceError, +) +from paper2remarkable.providers import ( + ACM, + Arxiv, + CiteSeerX, + HTML, + JMLR, + LocalFile, + NBER, + NeurIPS, + OpenReview, + PMLR, + PdfUrl, + PubMed, + Springer, +) +from paper2remarkable.ui import choose_provider + + +class TestUI(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_choose_provider_1(self): + tests = [ + ( + Arxiv, + "https://arxiv.org/abs/1811.11242v1", + "https://arxiv.org/abs/1811.11242v1", + ), + ( + Arxiv, + "http://arxiv.org/abs/arXiv:1908.03213", + "https://arxiv.org/abs/1908.03213", + ), + ( + Arxiv, + "https://arxiv.org/abs/math/0309285", + "https://arxiv.org/abs/math/0309285", + ), + ( + Arxiv, + "https://arxiv.org/pdf/physics/0605197v1.pdf", + "https://arxiv.org/pdf/physics/0605197v1.pdf", + ), + ( + PubMed, + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + ), + ( + ACM, + "https://dl.acm.org/citation.cfm?id=3025626", + "https://dl.acm.org/doi/10.1145/3025453.3025626", + ), + ( + ACM, + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true", + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true&", + ), + ( + OpenReview, + "http://openreview.net/forum?id=S1x4ghC9tQ", + "https://openreview.net/forum?id=S1x4ghC9tQ", + ), + ( + Springer, + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + ), + ( + PdfUrl, + "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", + "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", + ), + ( + JMLR, + "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + ), + ( + JMLR, + "http://www.jmlr.org/papers/v10/xu09a.html", + "http://www.jmlr.org/papers/v10/xu09a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v97/behrmann19a.html", + "http://proceedings.mlr.press/v97/behrmann19a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/melnyk16.pdf", + "http://proceedings.mlr.press/v48/melnyk16.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/zhangf16.html", + "http://proceedings.mlr.press/v48/zhangf16.html", + ), + ( + NBER, + "https://www.nber.org/papers/w26752", + "https://www.nber.org/papers/w26752", + ), + ( + NBER, + "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/papers/w19152.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/7796-middle-out-decoding", + "https://papers.nips.cc/paper/7796-middle-out-decoding", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + ), + ( + HTML, + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + ), + ( + HTML, + "https://www.nature.com/articles/d41586-020-00176-4", + "https://www.nature.com/articles/d41586-020-00176-4" + ), + ] + for exp_prov, url, exp_url in tests: + prov, new_url, jar = choose_provider(url) + with self.subTest(url=url): + self.assertEqual(exp_url, new_url) + self.assertEqual(prov, exp_prov) + + def test_choose_provider_2(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + + prov, new_input, jar = choose_provider(local_filename) + self.assertEqual(prov, LocalFile) + self.assertEqual(new_input, local_filename) + self.assertIsNone(jar) + + def test_choose_provider_3(self): + local_filename = "/tmp/abcdef.pdf" + with self.assertRaises(UnidentifiedSourceError): + choose_provider(local_filename) + + def test_choose_provider_4(self): + url = "https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/README.md" + with self.assertRaises(InvalidURLError): + choose_provider(url) + + +if __name__ == "__main__": + unittest.main() |
