Be more robust against spaces in PDF files

Unexpected spaces around PDF keywords (for example a trailing space after "obj") caused problems where the arXiv stamp was not removed for some files. This commit adds tests for these cases and fixes the issue.
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-08 21:15:48 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-08 21:15:48 +0100
commit: 62d72c8c073376a036df66d872ffd6149374fd7b (patch)
tree: c37ac1936af86c2b3c82af7fd96260ff0a09c51b
parent: Change wd back to initial directory (diff)
download: paper2remarkable-62d72c8c073376a036df66d872ffd6149374fd7b.tar.gz
paper2remarkable-62d72c8c073376a036df66d872ffd6149374fd7b.zip
2 files changed, 38 insertions, 5 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 7f3d554..572c2bf 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -84,13 +84,13 @@ class Arxiv(Provider):
                     skip_n -= 1
                     continue
 
-                if line.endswith(b" obj\n"):
+                if line.endswith(b" obj\n") or line.endswith(b" obj \n"):
                     # Start a new object. Add it to the current object and
                     # record its position for the xref table.
                     current_obj.append(line)
                     objid = int(line.split(b" ")[0])
                     xref[objid] = char_count
-                elif current_obj and line == b"endobj\n":
+                elif current_obj and line.startswith(b'endobj'):
                     # End the current object. If needed, replace the arXiv
                     # stamp in the block (done only once). Reset current
                     # object.
@@ -119,7 +119,7 @@ class Arxiv(Provider):
                 elif current_obj:
                     # If we're recording an object, simply add the line to it
                     current_obj.append(line)
-                elif line == b"xref\n":
+                elif line in [b"xref\n", b"endobj xref\n"]:
                     # We found the xref table, record its position and write it
                     # out using our updated indices.
                     startxref = sum(map(len, new_data))
@@ -159,7 +159,7 @@ def fix_stream_length(block):
     do_count = False
 
     for line in block:
-        if line in [b"stream", b"endstream"]:
+        if line.strip(b" ") in [b"stream", b"endstream"]:
             do_count = not do_count
             continue
 
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index beb9baa..08ea2c4 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -7,13 +7,28 @@ This file is part of paper2remarkable.
 
 """
 
+import os
 import re
+import shutil
+import tempfile
 import unittest
 
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
 
 
 class TestArxiv(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.original_dir = os.getcwd()
+
+    def setUp(self):
+        self.test_dir = tempfile.mkdtemp()
+        os.chdir(self.test_dir)
+
+    def tearDown(self):
+        os.chdir(self.original_dir)
+        shutil.rmtree(self.test_dir)
+
     def test_text_regex_1(self):
         key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
@@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase):
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
         self.assertIsNotNone(m)
 
+    def test_stamp_removed_1(self):
+        url = "https://arxiv.org/pdf/1703.06103.pdf"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:1703.06103v4  [stat.ML]  26 Oct 2017", data)
+
+    def test_stamp_removed_2(self):
+        url = "https://arxiv.org/abs/2003.06222"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:2003.06222v1  [stat.ML]  13 Mar 2020", data)
+
 
 if __name__ == "__main__":
     unittest.main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-04-08 21:15:48 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-04-08 21:15:48 +0100
commit	62d72c8c073376a036df66d872ffd6149374fd7b (patch)
tree	c37ac1936af86c2b3c82af7fd96260ff0a09c51b
parent	Change wd back to initial directory (diff)
download	paper2remarkable-62d72c8c073376a036df66d872ffd6149374fd7b.tar.gz paper2remarkable-62d72c8c073376a036df66d872ffd6149374fd7b.zip