2 files changed, 38 insertions, 5 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 7f3d554..572c2bf 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -84,13 +84,13 @@ class Arxiv(Provider):
                     skip_n -= 1
                     continue
 
-                if line.endswith(b" obj\n"):
+                if line.endswith(b" obj\n") or line.endswith(b" obj \n"):
                     # Start a new object. Add it to the current object and
                     # record its position for the xref table.
                     current_obj.append(line)
                     objid = int(line.split(b" ")[0])
                     xref[objid] = char_count
-                elif current_obj and line == b"endobj\n":
+                elif current_obj and line.startswith(b'endobj'):
                     # End the current object. If needed, replace the arXiv
                     # stamp in the block (done only once). Reset current
                     # object.
@@ -119,7 +119,7 @@ class Arxiv(Provider):
                 elif current_obj:
                     # If we're recording an object, simply add the line to it
                     current_obj.append(line)
-                elif line == b"xref\n":
+                elif line in [b"xref\n", b"endobj xref\n"]:
                     # We found the xref table, record its position and write it
                     # out using our updated indices.
                     startxref = sum(map(len, new_data))
@@ -159,7 +159,7 @@ def fix_stream_length(block):
     do_count = False
 
     for line in block:
-        if line in [b"stream", b"endstream"]:
+        if line.strip(b" ") in [b"stream", b"endstream"]:
             do_count = not do_count
             continue
 
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index beb9baa..08ea2c4 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -7,13 +7,28 @@ This file is part of paper2remarkable.
 
 """
 
+import os
 import re
+import shutil
+import tempfile
 import unittest
 
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
 
 
 class TestArxiv(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.original_dir = os.getcwd()  # saved once; tearDown chdirs back here
+
+    def setUp(self):
+        self.test_dir = tempfile.mkdtemp()  # fresh scratch directory for each test
+        os.chdir(self.test_dir)  # relative paths in the tests resolve inside it
+
+    def tearDown(self):
+        os.chdir(self.original_dir)  # leave the scratch directory before deleting it
+        shutil.rmtree(self.test_dir)  # remove the directory created in setUp
+
     def test_text_regex_1(self):
         key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
@@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase):
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
         self.assertIsNotNone(m)
 
+    def test_stamp_removed_1(self):
+        url = "https://arxiv.org/pdf/1703.06103.pdf"  # direct-PDF form of the URL
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")  # NOTE(review): presumably needs network access - confirm
+        prov.uncompress_pdf(filename, "unc.pdf")  # presumably writes unc.pdf, which is read back below - confirm
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:1703.06103v4  [stat.ML]  26 Oct 2017", data)  # stamp text must be gone
+
+    def test_stamp_removed_2(self):
+        url = "https://arxiv.org/abs/2003.06222"  # abstract-page form of the URL
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")  # NOTE(review): presumably needs network access - confirm
+        prov.uncompress_pdf(filename, "unc.pdf")  # presumably writes unc.pdf, which is read back below - confirm
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:2003.06222v1  [stat.ML]  13 Mar 2020", data)  # stamp text must be gone
+
 
 if __name__ == "__main__":
     unittest.main()