| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 20:03:00 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 20:03:00 +0100 |
| commit | 62e2b5e8fd42ff7857c555cf45f6bd688c1e527f (patch) | |
| tree | cb5ecd748b3d6b124e8c45a54c07122ac789d31e | |
| parent | Code formatting with Black (diff) | |
| parent | Fix the pdf output of dearxiv (diff) | |
| download | paper2remarkable-62e2b5e8fd42ff7857c555cf45f6bd688c1e527f.tar.gz paper2remarkable-62e2b5e8fd42ff7857c555cf45f6bd688c1e527f.zip | |
Merge branch 'bugfix/arXiv_stamp'
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | paper2remarkable/providers/_base.py | 7 |
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 109 |
2 files changed, 98 insertions, 18 deletions
```diff
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 53ad78e..20349c2 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -122,16 +122,11 @@ class Provider(metaclass=abc.ABCMeta):
                 [self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
             )
         elif self.pdftool == "qpdf":
-            # TODO: the status == 3 is only needed because when we remove
-            # the arXiv stamp we don't fix the length of the pdf object. This
-            # causes qpdf to raise a warning and give a nonzero exit status.
-            # Fixing the pdf object is the right approach, but this does
-            # work as it is since qpdf fixes the file for us.
             status = subprocess.call(
                 [self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,],
                 stderr=subprocess.DEVNULL,
             )
-            if not (status == 0 or status == 3):
+            if not status == 0:
                 raise _CalledProcessError(
                     "%s failed to compress the PDF file." % self.pdftool
                 )
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 74043ed..7f3d554 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -71,22 +71,107 @@ class Arxiv(Provider):
         uncompress_file = basename + "_uncompress.pdf"
         self.uncompress_pdf(input_file, uncompress_file)
 
-        with open(uncompress_file, "rb") as fid:
-            data = fid.read()
-            # Remove the text element
-            data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
-            # Remove the URL element
-            data = re.sub(
-                b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
-                b"",
-                data,
-            )
+        new_data = []
+        current_obj = []
+        replaced_arXiv = False
+        char_count = skip_n = startxref = 0
+        xref = {}
+
+        with open(uncompress_file, "rb") as fp:
+            for line in fp:
+                if skip_n:
+                    # Skip a line
+                    skip_n -= 1
+                    continue
+
+                if line.endswith(b" obj\n"):
+                    # Start a new object. Add it to the current object and
+                    # record its position for the xref table.
+                    current_obj.append(line)
+                    objid = int(line.split(b" ")[0])
+                    xref[objid] = char_count
+                elif current_obj and line == b"endobj\n":
+                    # End the current object. If needed, replace the arXiv
+                    # stamp in the block (done only once). Reset current
+                    # object.
+                    current_obj.append(line)
+                    block = b"".join(current_obj)
+                    if not replaced_arXiv:
+                        # remove the text
+                        block, n_subs1 = re.subn(
+                            b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
+                            b"()Tj",
+                            block,
+                        )
+                        # remove the url
+                        block, n_subs2 = re.subn(
+                            b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
+                            b"",
+                            block,
+                        )
+                        if n_subs1 or n_subs2:
+                            # fix the length of the object stream
+                            block = fix_stream_length(block)
+                            replaced_arXiv = True
+                    new_data.append(block)
+                    char_count += len(block)
+                    current_obj = []
+                elif current_obj:
+                    # If we're recording an object, simply add the line to it
+                    current_obj.append(line)
+                elif line == b"xref\n":
+                    # We found the xref table, record its position and write it
+                    # out using our updated indices.
+                    startxref = sum(map(len, new_data))
+                    new_data.append(line)
+                    new_data.append(b"0 %i\n" % (len(xref) + 1))
+                    new_data.append(b"0000000000 65535 f \n")
+                    for objid in sorted(xref):
+                        new_data.append(b"%010d 00000 n \n" % xref[objid])
+
+                    # skip the appropriate number of lines
+                    skip_n = len(xref) + 2
+                elif line == b"startxref\n":
+                    # Write out our recorded startxref position, skip the old
+                    # position.
+                    new_data.append(b"startxref\n%i\n" % startxref)
+                    skip_n = 1
+                else:
+                    # Anything else passes through
+                    new_data.append(line)
+                    char_count += len(line)
 
         removed_file = basename + "_removed.pdf"
-        with open(removed_file, "wb") as oid:
-            oid.write(data)
+        with open(removed_file, "wb") as fp:
+            fp.write(b"".join(new_data))
 
         output_file = basename + "_dearxiv.pdf"
         self.compress_pdf(removed_file, output_file)
 
         return output_file
+
+
+def fix_stream_length(block):
+    # This fixes the stream length of a block, which is needed after we have
+    # removed the arXiv stamp.
+    count = 0
+    block = block.split(b"\n")
+    do_count = False
+
+    for line in block:
+        if line in [b"stream", b"endstream"]:
+            do_count = not do_count
+            continue
+
+        if do_count:
+            # +1 for the newline character
+            count += len(line) + 1
+
+    new_block = []
+    for line in block:
+        if b" /Length " in line:
+            new_block.append(b"<< /Length %i >>" % count)
+        else:
+            new_block.append(line)
+
+    return b"\n".join(new_block)
```
