diff options
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 8 | ||||
| -rw-r--r-- | tests/test_arxiv.py | 35 |
2 files changed, 38 insertions, 5 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 7f3d554..572c2bf 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -84,13 +84,13 @@ class Arxiv(Provider):
                     skip_n -= 1
                     continue
 
-                if line.endswith(b" obj\n"):
+                if line.endswith(b" obj\n") or line.endswith(b" obj \n"):
                     # Start a new object. Add it to the current object and
                     # record its position for the xref table.
                     current_obj.append(line)
                     objid = int(line.split(b" ")[0])
                     xref[objid] = char_count
-                elif current_obj and line == b"endobj\n":
+                elif current_obj and line.startswith(b'endobj'):
                     # End the current object. If needed, replace the arXiv
                     # stamp in the block (done only once). Reset current
                     # object.
@@ -119,7 +119,7 @@ class Arxiv(Provider):
                 elif current_obj:
                     # If we're recording an object, simply add the line to it
                     current_obj.append(line)
-                elif line == b"xref\n":
+                elif line in [b"xref\n", b"endobj xref\n"]:
                     # We found the xref table, record its position and write it
                     # out using our updated indices.
                     startxref = sum(map(len, new_data))
@@ -159,7 +159,7 @@ def fix_stream_length(block):
     do_count = False
 
     for line in block:
-        if line in [b"stream", b"endstream"]:
+        if line.strip(b" ") in [b"stream", b"endstream"]:
             do_count = not do_count
             continue
 
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index beb9baa..08ea2c4 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -7,13 +7,28 @@ This file is part of paper2remarkable.
 
 """
 
+import os
 import re
+import shutil
+import tempfile
 import unittest
 
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
 
 
 class TestArxiv(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.original_dir = os.getcwd()
+
+    def setUp(self):
+        self.test_dir = tempfile.mkdtemp()
+        os.chdir(self.test_dir)
+
+    def tearDown(self):
+        os.chdir(self.original_dir)
+        shutil.rmtree(self.test_dir)
+
     def test_text_regex_1(self):
         key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
@@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase):
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
         self.assertIsNotNone(m)
 
+    def test_stamp_removed_1(self):
+        url = "https://arxiv.org/pdf/1703.06103.pdf"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data)
+
+    def test_stamp_removed_2(self):
+        url = "https://arxiv.org/abs/2003.06222"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data)
+
 
 if __name__ == "__main__":
     unittest.main()
