aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2020-04-04 17:20:47 +0100
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2020-04-04 17:20:47 +0100
commited9b8252a2361604331f7a275a7625b3de9017ff (patch)
treed0509e42636103a18eab7c790ac964c4548e9f07
parentMerge branch 'bugfix/springer_redirect' (diff)
downloadpaper2remarkable-ed9b8252a2361604331f7a275a7625b3de9017ff.tar.gz
paper2remarkable-ed9b8252a2361604331f7a275a7625b3de9017ff.zip
Fix provider selection for redirected urls
Some urls, such as the arXiv urls with the : in the identifier, didn't work when using the UI interface because the redirected url wasn't past to the provider, but the original url was. This commit fixes that issue and adds unit tests for the provider selection function, hopefully making this more robust in the future.
-rw-r--r--paper2remarkable/exceptions.py22
-rw-r--r--paper2remarkable/ui.py71
-rw-r--r--tests/test_ui.py203
3 files changed, 278 insertions, 18 deletions
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index 5ea9a78..94e9470 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -116,3 +116,25 @@ class NoPDFToolError(Error):
)
msg += GH_MSG
return msg
+
+class UnidentifiedSourceError(Error):
+ """Exception raised when the input is neither a local file nor a url """
+
+ def __str__(self):
+ msg = (
+ "ERROR: Couldn't figure out what source you mean. If it's a "
+ "local file, please make sure it exists."
+ )
+ msg += GH_MSG
+ return msg
+
+class InvalidURLError(Error):
+ """Exception raised when no provider can handle a url source """
+
+ def __str__(self):
+ msg = (
+ "ERROR: Input URL is not valid, no provider can handle "
+ "this source."
+ )
+ msg += GH_MSG
+ return msg
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index bfb3647..bf57552 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -13,6 +13,7 @@ import sys
from . import __version__, GITHUB_URL
+from .exceptions import UnidentifiedSourceError, InvalidURLError
from .providers import providers, LocalFile
from .utils import follow_redirects, is_url
@@ -118,9 +119,58 @@ def exception(msg):
raise SystemExit(1)
+def choose_provider(cli_input):
+ """Choose the provider to use for the given source
+
+ This function first tries to check if the input is a local file, by
+ checking if the path exists. Next, it checks if the input is a "valid" url
+ using a regex test. If it is, the registered provider classes are checked
+ to see which provider can handle this url.
+
+ Returns
+ -------
+ provider : class
+ The class of the provider than can handle the source. A subclass of the
+ Provider abc.
+
+ new_input : str
+ The updated input to the provider. This only has an effect for the url
+ providers, where this will be the url after following all redirects.
+
+ cookiejar : dict or requests.RequestsCookieJar
+ Cookies picked up when following redirects. These are needed for some
+ providers to ensure later requests have the right cookie settings.
+
+ Raises
+ ------
+ UnidentifiedSourceError
+ Raised when the input is neither an existing local file nor a valid url
+
+ InvalidURLError
+ Raised when the input *is* a valid url, but no provider can handle it.
+
+ """
+ provider = cookiejar = None
+ if LocalFile.validate(cli_input):
+ # input is a local file
+ new_input = cli_input
+ provider = LocalFile
+ elif is_url(cli_input):
+ # input is a url
+ new_input, cookiejar = follow_redirects(cli_input)
+ provider = next((p for p in providers if p.validate(new_input)), None)
+ else:
+ # not a proper URL or non-existent file
+ raise UnidentifiedSourceError
+
+ if provider is None:
+ raise InvalidURLError
+
+ return provider, new_input, cookiejar
+
+
def main():
args = parse_args()
- cookiejar = None
if args.center and args.right:
exception("Can't center and right align at the same time!")
@@ -131,22 +181,7 @@ def main():
if args.right and args.no_crop:
exception("Can't right align and not crop at the same time!")
- if LocalFile.validate(args.input):
- # input is a local file
- provider = LocalFile
- elif is_url(args.input):
- # input is a url
- url, cookiejar = follow_redirects(args.input)
- provider = next((p for p in providers if p.validate(url)), None)
- else:
- # not a proper URL or non-existent file
- exception(
- "Couldn't figure out what source you mean. If it's a "
- "local file, make sure it exists."
- )
-
- if provider is None:
- exception("Input not valid, no provider can handle this source.")
+ provider, new_input, cookiejar = choose_provider(args.input)
prov = provider(
verbose=args.verbose,
@@ -165,4 +200,4 @@ def main():
cookiejar=cookiejar,
)
- prov.run(args.input, filename=args.filename)
+ prov.run(new_input, filename=args.filename)
diff --git a/tests/test_ui.py b/tests/test_ui.py
new file mode 100644
index 0000000..fc362a0
--- /dev/null
+++ b/tests/test_ui.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Unit tests for command line interface
+
+This file is part of paper2remarkable.
+
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+
+from paper2remarkable.exceptions import (
+ InvalidURLError,
+ UnidentifiedSourceError,
+)
+from paper2remarkable.providers import (
+ ACM,
+ Arxiv,
+ CiteSeerX,
+ HTML,
+ JMLR,
+ LocalFile,
+ NBER,
+ NeurIPS,
+ OpenReview,
+ PMLR,
+ PdfUrl,
+ PubMed,
+ Springer,
+)
+from paper2remarkable.ui import choose_provider
+
+
+class TestUI(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.original_dir = os.getcwd()
+
+ def setUp(self):
+ self.test_dir = tempfile.mkdtemp()
+ os.chdir(self.test_dir)
+
+ def tearDown(self):
+ os.chdir(self.original_dir)
+ shutil.rmtree(self.test_dir)
+
+ def test_choose_provider_1(self):
+ tests = [
+ (
+ Arxiv,
+ "https://arxiv.org/abs/1811.11242v1",
+ "https://arxiv.org/abs/1811.11242v1",
+ ),
+ (
+ Arxiv,
+ "http://arxiv.org/abs/arXiv:1908.03213",
+ "https://arxiv.org/abs/1908.03213",
+ ),
+ (
+ Arxiv,
+ "https://arxiv.org/abs/math/0309285",
+ "https://arxiv.org/abs/math/0309285",
+ ),
+ (
+ Arxiv,
+ "https://arxiv.org/pdf/physics/0605197v1.pdf",
+ "https://arxiv.org/pdf/physics/0605197v1.pdf",
+ ),
+ (
+ PubMed,
+ "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/",
+ "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/",
+ ),
+ (
+ ACM,
+ "https://dl.acm.org/citation.cfm?id=3025626",
+ "https://dl.acm.org/doi/10.1145/3025453.3025626",
+ ),
+ (
+ ACM,
+ "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true",
+ "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true&",
+ ),
+ (
+ OpenReview,
+ "http://openreview.net/forum?id=S1x4ghC9tQ",
+ "https://openreview.net/forum?id=S1x4ghC9tQ",
+ ),
+ (
+ Springer,
+ "https://link.springer.com/article/10.1007/s10618-019-00631-5",
+ "https://link.springer.com/article/10.1007/s10618-019-00631-5",
+ ),
+ (
+ PdfUrl,
+ "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf",
+ "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf",
+ ),
+ (
+ JMLR,
+ "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
+ "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
+ ),
+ (
+ JMLR,
+ "http://www.jmlr.org/papers/v10/xu09a.html",
+ "http://www.jmlr.org/papers/v10/xu09a.html",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v97/behrmann19a.html",
+ "http://proceedings.mlr.press/v97/behrmann19a.html",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf",
+ "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v48/melnyk16.pdf",
+ "http://proceedings.mlr.press/v48/melnyk16.pdf",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v48/zhangf16.html",
+ "http://proceedings.mlr.press/v48/zhangf16.html",
+ ),
+ (
+ NBER,
+ "https://www.nber.org/papers/w26752",
+ "https://www.nber.org/papers/w26752",
+ ),
+ (
+ NBER,
+ "https://www.nber.org/papers/w19152.pdf",
+ "https://www.nber.org/papers/w19152.pdf",
+ ),
+ (
+ NeurIPS,
+ "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf",
+ "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf",
+ ),
+ (
+ NeurIPS,
+ "https://papers.nips.cc/paper/7796-middle-out-decoding",
+ "https://papers.nips.cc/paper/7796-middle-out-decoding",
+ ),
+ (
+ CiteSeerX,
+ "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548",
+ "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548",
+ ),
+ (
+ CiteSeerX,
+ "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf",
+ "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf",
+ ),
+ (
+ HTML,
+ "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines",
+ "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines"
+ ),
+ (
+ HTML,
+ "https://www.nature.com/articles/d41586-020-00176-4",
+ "https://www.nature.com/articles/d41586-020-00176-4"
+ ),
+ ]
+ for exp_prov, url, exp_url in tests:
+ prov, new_url, jar = choose_provider(url)
+ with self.subTest(url=url):
+ self.assertEqual(exp_url, new_url)
+ self.assertEqual(prov, exp_prov)
+
+ def test_choose_provider_2(self):
+ local_filename = "test.pdf"
+ with open(local_filename, "w") as fp:
+ fp.write(
+ "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF"
+ )
+
+ prov, new_input, jar = choose_provider(local_filename)
+ self.assertEqual(prov, LocalFile)
+ self.assertEqual(new_input, local_filename)
+ self.assertIsNone(jar)
+
+ def test_choose_provider_3(self):
+ local_filename = "/tmp/abcdef.pdf"
+ with self.assertRaises(UnidentifiedSourceError):
+ choose_provider(local_filename)
+
+ def test_choose_provider_4(self):
+ url = "https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/README.md"
+ with self.assertRaises(InvalidURLError):
+ choose_provider(url)
+
+
+if __name__ == "__main__":
+ unittest.main()