diff options
| -rw-r--r-- | paper2remarkable/providers/_base.py | 8 | ||||
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 10 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 18 | ||||
| -rw-r--r-- | paper2remarkable/utils.py | 14 | ||||
| -rw-r--r-- | tests/test_providers.py | 33 |
5 files changed, 66 insertions, 17 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index bdc9558..52e3b0e 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -15,7 +15,7 @@ import tempfile from ._info import Informer from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf -from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable +from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable, follow_redirects from ..log import Logger logger = Logger() @@ -82,7 +82,13 @@ class Provider(metaclass=abc.ABCMeta): download_url(pdf_url, filename) def run(self, src, filename=None): + # needed with library use + src = follow_redirects(src) + + # extract page and pdf file urls abs_url, pdf_url = self.get_abs_pdf_urls(src) + + # generate nice filename if needed clean_filename = filename or self.informer.get_filename(abs_url) tmp_filename = "paper.pdf" diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index e022658..1fd1795 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -19,6 +19,10 @@ from ..log import Logger logger = Logger() +DEARXIV_TEXT_REGEX = ( + b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}" +) + class ArxivInformer(Informer): pass @@ -73,11 +77,7 @@ class Arxiv(Provider): with open(uncompress_file, "rb") as fid: data = fid.read() # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) + data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data) # Remove the URL element data = re.sub( b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 5323996..2a30e7f 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -12,8 +12,8 @@ import argparse from . import __version__ -from .providers import providers -from .utils import exception +from .providers import providers, LocalFile +from .utils import exception, follow_redirects def parse_args(): @@ -78,8 +78,7 @@ def parse_args(): default="rmapi", ) parser.add_argument( - "input", - help="URL to a paper or the path of a local PDF file", + "input", help="URL to a paper or the path of a local PDF file" ) return parser.parse_args() @@ -87,7 +86,16 @@ def parse_args(): def main(): args = parse_args() - provider = next((p for p in providers if p.validate(args.input)), None) + if LocalFile.validate(args.input): + # input is a local file + provider = LocalFile + else: + # input is a url + url = args.input + # follow all redirects of the url + url = follow_redirects(url) + provider = next((p for p in providers if p.validate(url)), None) + if provider is None: exception("Input not valid, no provider can handle this source.") diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index a313ffe..1b6718e 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -95,6 +95,20 @@ def get_page_with_retry(url, tries=5): return res.content +def follow_redirects(url): + """Follow redirects from the URL (at most 10)""" + it = 0 + while it < 10: + req = requests.head(url, allow_redirects=False) + if req.status_code == 200: + break + if not "Location" in req.headers: + break + url = req.headers["Location"] + it += 1 + return url + + def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): logger.info("Starting upload to reMarkable") diff --git a/tests/test_providers.py b/tests/test_providers.py index bb793b3..1479967 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -5,11 +5,12 @@ __author__ = "G.J.J. van den Burg" """Tests""" -import unittest -import tempfile import hashlib -import shutil import os +import re +import shutil +import tempfile +import unittest from paper2remarkable.providers import ( ACM, @@ -20,8 +21,9 @@ from paper2remarkable.providers import ( PubMed, Springer, ) +from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX -VERBOSE = True +VERBOSE = False def md5sum(filename): @@ -35,7 +37,19 @@ def md5sum(filename): return hasher.hexdigest() -class Tests(unittest.TestCase): +class TestArxiv(unittest.TestCase): + def test_text_regex_1(self): + key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_2(self): + key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + +class TestProviders(unittest.TestCase): @classmethod def setUpClass(cls): cls.original_dir = os.getcwd() @@ -48,13 +62,20 @@ class Tests(unittest.TestCase): os.chdir(self.original_dir) shutil.rmtree(self.test_dir) - def test_arxiv(self): + def test_arxiv_1(self): prov = Arxiv(upload=False, verbose=VERBOSE) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_2(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "http://arxiv.org/abs/arXiv:1908.03213" + exp_filename = "Ecker_et_al_-_Gravitational_Waves_From_Holographic_Neutron_Star_Mergers_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" |
