about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- paper2remarkable/providers/arxiv.py 8
-rw-r--r-- tests/test_arxiv.py 35
2 files changed, 38 insertions, 5 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 7f3d554..572c2bf 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -84,13 +84,13 @@ class Arxiv(Provider):
skip_n -= 1
continue
- if line.endswith(b" obj\n"):
+ if line.endswith(b" obj\n") or line.endswith(b" obj \n"):
# Start a new object. Add it to the current object and
# record its position for the xref table.
current_obj.append(line)
objid = int(line.split(b" ")[0])
xref[objid] = char_count
- elif current_obj and line == b"endobj\n":
+ elif current_obj and line.startswith(b'endobj'):
# End the current object. If needed, replace the arXiv
# stamp in the block (done only once). Reset current
# object.
@@ -119,7 +119,7 @@ class Arxiv(Provider):
elif current_obj:
# If we're recording an object, simply add the line to it
current_obj.append(line)
- elif line == b"xref\n":
+ elif line in [b"xref\n", b"endobj xref\n"]:
# We found the xref table, record its position and write it
# out using our updated indices.
startxref = sum(map(len, new_data))
@@ -159,7 +159,7 @@ def fix_stream_length(block):
do_count = False
for line in block:
- if line in [b"stream", b"endstream"]:
+ if line.strip(b" ") in [b"stream", b"endstream"]:
do_count = not do_count
continue
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index beb9baa..08ea2c4 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -7,13 +7,28 @@ This file is part of paper2remarkable.
"""
+import os
import re
+import shutil
+import tempfile
import unittest
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
class TestArxiv(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.original_dir = os.getcwd()
+
+ def setUp(self):
+ self.test_dir = tempfile.mkdtemp()
+ os.chdir(self.test_dir)
+
+ def tearDown(self):
+ os.chdir(self.original_dir)
+ shutil.rmtree(self.test_dir)
+
def test_text_regex_1(self):
key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
@@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase):
m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
self.assertIsNotNone(m)
+ def test_stamp_removed_1(self):
+ url = "https://arxiv.org/pdf/1703.06103.pdf"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data)
+
+ def test_stamp_removed_2(self):
+ url = "https://arxiv.org/abs/2003.06222"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data)
+
if __name__ == "__main__":
unittest.main()