about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-25 16:26:23 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-10-25 16:26:23 +0100
commit: a284f4035416590f875ba9996ec5673affab5da4 (patch)
tree: 1f18660cea9bb6bc441a6600a1a77d509784cc8c
parent: Fix for alternative arXiv urls (#7) (diff)
download: paper2remarkable-a284f4035416590f875ba9996ec5673affab5da4.tar.gz
download: paper2remarkable-a284f4035416590f875ba9996ec5673affab5da4.zip
Fix arxiv stamp removal regex and add tests
-rw-r--r-- paper2remarkable/providers/arxiv.py 10
-rw-r--r-- tests/test_providers.py 22
2 files changed, 23 insertions, 9 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index e022658..1fd1795 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -19,6 +19,10 @@ from ..log import Logger
logger = Logger()
+DEARXIV_TEXT_REGEX = (
+ b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}"
+)
+
class ArxivInformer(Informer):
pass
@@ -73,11 +77,7 @@ class Arxiv(Provider):
with open(uncompress_file, "rb") as fid:
data = fid.read()
# Remove the text element
- data = re.sub(
- b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
- b"()Tj",
- data,
- )
+ data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
# Remove the URL element
data = re.sub(
b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 143fc78..1479967 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -5,11 +5,12 @@ __author__ = "G.J.J. van den Burg"
"""Tests"""
-import unittest
-import tempfile
import hashlib
-import shutil
import os
+import re
+import shutil
+import tempfile
+import unittest
from paper2remarkable.providers import (
ACM,
@@ -20,8 +21,9 @@ from paper2remarkable.providers import (
PubMed,
Springer,
)
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
-VERBOSE = True
+VERBOSE = False
def md5sum(filename):
@@ -35,6 +37,18 @@ def md5sum(filename):
return hasher.hexdigest()
+class TestArxiv(unittest.TestCase):
+ def test_text_regex_1(self):
+ key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_text_regex_2(self):
+ key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+
class TestProviders(unittest.TestCase):
@classmethod
def setUpClass(cls):