diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-10-25 16:26:23 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-10-25 16:26:23 +0100 |
| commit | a284f4035416590f875ba9996ec5673affab5da4 (patch) | |
| tree | 1f18660cea9bb6bc441a6600a1a77d509784cc8c | |
| parent | Fix for alternative arXiv urls (#7) (diff) | |
| download | paper2remarkable-a284f4035416590f875ba9996ec5673affab5da4.tar.gz paper2remarkable-a284f4035416590f875ba9996ec5673affab5da4.zip | |
Fix arxiv stamp removal regex and add tests
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 10 | ||||
| -rw-r--r-- | tests/test_providers.py | 22 |
2 files changed, 23 insertions, 9 deletions
```diff
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index e022658..1fd1795 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -19,6 +19,10 @@ from ..log import Logger
 
 logger = Logger()
 
+DEARXIV_TEXT_REGEX = (
+    b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}"
+)
+
 
 class ArxivInformer(Informer):
     pass
@@ -73,11 +77,7 @@ class Arxiv(Provider):
         with open(uncompress_file, "rb") as fid:
             data = fid.read()
             # Remove the text element
-            data = re.sub(
-                b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
-                b"()Tj",
-                data,
-            )
+            data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
             # Remove the URL element
             data = re.sub(
                 b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 143fc78..1479967 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -5,11 +5,12 @@ __author__ = "G.J.J. van den Burg"
 
 """Tests"""
 
-import unittest
-import tempfile
 import hashlib
-import shutil
 import os
+import re
+import shutil
+import tempfile
+import unittest
 
 from paper2remarkable.providers import (
     ACM,
@@ -20,8 +21,9 @@ from paper2remarkable.providers import (
     PubMed,
     Springer,
 )
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
 
-VERBOSE = True
+VERBOSE = False
 
 
 def md5sum(filename):
@@ -35,6 +37,18 @@ def md5sum(filename):
     return hasher.hexdigest()
 
 
+class TestArxiv(unittest.TestCase):
+    def test_text_regex_1(self):
+        key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
+        m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+        self.assertIsNotNone(m)
+
+    def test_text_regex_2(self):
+        key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019"
+        m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+        self.assertIsNotNone(m)
+
+
 class TestProviders(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
```
