aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-08 21:15:48 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-08 21:15:48 +0100
commit: 62d72c8c073376a036df66d872ffd6149374fd7b (patch)
tree: c37ac1936af86c2b3c82af7fd96260ff0a09c51b
parent: Change wd back to initial directory (diff)
download: paper2remarkable-62d72c8c073376a036df66d872ffd6149374fd7b.tar.gz
paper2remarkable-62d72c8c073376a036df66d872ffd6149374fd7b.zip
Be more robust against spaces in pdf file
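Some PDF files contain extra spaces around object markers (for example "obj " before the newline, "endobj xref" on a single line, or padded "stream"/"endstream" lines). This caused problems where the arXiv stamp was not removed for some files. This commit adds tests for this and fixes the issue.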
-rw-r--r-- paper2remarkable/providers/arxiv.py 8
-rw-r--r-- tests/test_arxiv.py 35
2 files changed, 38 insertions, 5 deletions
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 7f3d554..572c2bf 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -84,13 +84,13 @@ class Arxiv(Provider):
skip_n -= 1
continue
- if line.endswith(b" obj\n"):
+ if line.endswith(b" obj\n") or line.endswith(b" obj \n"):
# Start a new object. Add it to the current object and
# record its position for the xref table.
current_obj.append(line)
objid = int(line.split(b" ")[0])
xref[objid] = char_count
- elif current_obj and line == b"endobj\n":
+ elif current_obj and line.startswith(b'endobj'):
# End the current object. If needed, replace the arXiv
# stamp in the block (done only once). Reset current
# object.
@@ -119,7 +119,7 @@ class Arxiv(Provider):
elif current_obj:
# If we're recording an object, simply add the line to it
current_obj.append(line)
- elif line == b"xref\n":
+ elif line in [b"xref\n", b"endobj xref\n"]:
# We found the xref table, record its position and write it
# out using our updated indices.
startxref = sum(map(len, new_data))
@@ -159,7 +159,7 @@ def fix_stream_length(block):
do_count = False
for line in block:
- if line in [b"stream", b"endstream"]:
+ if line.strip(b" ") in [b"stream", b"endstream"]:
do_count = not do_count
continue
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index beb9baa..08ea2c4 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -7,13 +7,28 @@ This file is part of paper2remarkable.
"""
+import os
import re
+import shutil
+import tempfile
import unittest
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
+from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
class TestArxiv(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.original_dir = os.getcwd()
+
+ def setUp(self):
+ self.test_dir = tempfile.mkdtemp()
+ os.chdir(self.test_dir)
+
+ def tearDown(self):
+ os.chdir(self.original_dir)
+ shutil.rmtree(self.test_dir)
+
def test_text_regex_1(self):
key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
@@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase):
m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
self.assertIsNotNone(m)
+ def test_stamp_removed_1(self):
+ url = "https://arxiv.org/pdf/1703.06103.pdf"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data)
+
+ def test_stamp_removed_2(self):
+ url = "https://arxiv.org/abs/2003.06222"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data)
+
if __name__ == "__main__":
unittest.main()