author     Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-04-04 19:54:05 +0100
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-04-04 19:54:05 +0100
commit     fdbc77f3832b9071359b8e2e73ea1570ef691718 (patch)
tree       cb5ecd748b3d6b124e8c45a54c07122ac789d31e
parent     Code formatting with Black (diff)
download   paper2remarkable-fdbc77f3832b9071359b8e2e73ea1570ef691718.tar.gz
           paper2remarkable-fdbc77f3832b9071359b8e2e73ea1570ef691718.zip
Fix the pdf output of dearxiv
The dearxiv function removes the arXiv stamp from the pdf, but in the past it would return a somewhat broken pdf: after the stamp was deleted, the byte positions of objects recorded in the xref table and the length of the stream of the arXiv stamp object were no longer correct. This wasn't an issue for pdftk, but it makes qpdf emit a warning. Because we don't want to mask qpdf warnings in general (something might actually be wrong), it was desirable for dearxiv to return a valid pdf file. This commit does exactly that.
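Background, for readers of the diff below: qpdf reports problems through its exit status, which is what the old workaround keyed on. The following is a minimal sketch (not part of this commit; check_pdf is a hypothetical helper) of how those exit codes separate warnings from hard errors:

    import subprocess

    def check_pdf(path, qpdf_path="qpdf"):
        # qpdf exits with 0 on success, 2 on hard errors, and 3 when the
        # file could be processed but produced warnings (for example,
        # object offsets or a stream /Length that do not match the file).
        # The old code accepted status 3 to hide the warning caused by
        # dearxiv; after this commit only 0 counts as success.
        return subprocess.call(
            [qpdf_path, "--check", path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )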
-rw-r--r--  paper2remarkable/providers/_base.py  |   7
-rw-r--r--  paper2remarkable/providers/arxiv.py  | 109
2 files changed, 98 insertions, 18 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 53ad78e..20349c2 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -122,16 +122,11 @@ class Provider(metaclass=abc.ABCMeta):
                 [self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
             )
         elif self.pdftool == "qpdf":
-            # TODO: the status == 3 is only needed because when we remove
-            # the arXiv stamp we don't fix the length of the pdf object. This
-            # causes qpdf to raise a warning and give a nonzero exit status.
-            # Fixing the pdf object is the right approach, but this does
-            # work as it is since qpdf fixes the file for us.
             status = subprocess.call(
                 [self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,],
                 stderr=subprocess.DEVNULL,
             )
-        if not (status == 0 or status == 3):
+        if not status == 0:
             raise _CalledProcessError(
                 "%s failed to compress the PDF file." % self.pdftool
             )
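The arxiv.py rewrite below has to keep the PDF cross-reference table consistent: each xref entry stores the absolute byte offset of one object, and startxref stores the offset of the table itself, so deleting even a single byte invalidates everything after it. For orientation, a minimal sketch of the layout this code maintains (the offsets here are made up):

    3 0 obj
    << /Length 11 >>
    stream
    BT ()Tj ET
    endstream
    endobj
    xref
    0 4
    0000000000 65535 f 
    0000000015 00000 n 
    0000000102 00000 n 
    0000000203 00000 n 
    trailer << /Size 4 /Root 1 0 R >>
    startxref
    287
    %%EOF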
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 74043ed..7f3d554 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -71,22 +71,107 @@ class Arxiv(Provider):
 
         uncompress_file = basename + "_uncompress.pdf"
         self.uncompress_pdf(input_file, uncompress_file)
 
-        with open(uncompress_file, "rb") as fid:
-            data = fid.read()
-        # Remove the text element
-        data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
-        # Remove the URL element
-        data = re.sub(
-            b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
-            b"",
-            data,
-        )
+        new_data = []
+        current_obj = []
+        replaced_arXiv = False
+        char_count = skip_n = startxref = 0
+        xref = {}
+
+        with open(uncompress_file, "rb") as fp:
+            for line in fp:
+                if skip_n:
+                    # Skip a line
+                    skip_n -= 1
+                    continue
+
+                if line.endswith(b" obj\n"):
+                    # Start a new object. Add it to the current object and
+                    # record its position for the xref table.
+                    current_obj.append(line)
+                    objid = int(line.split(b" ")[0])
+                    xref[objid] = char_count
+                elif current_obj and line == b"endobj\n":
+                    # End the current object. If needed, replace the arXiv
+                    # stamp in the block (done only once). Reset current
+                    # object.
+                    current_obj.append(line)
+                    block = b"".join(current_obj)
+                    if not replaced_arXiv:
+                        # remove the text
+                        block, n_subs1 = re.subn(
+                            b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
+                            b"()Tj",
+                            block,
+                        )
+                        # remove the url
+                        block, n_subs2 = re.subn(
+                            b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
+                            b"",
+                            block,
+                        )
+                        if n_subs1 or n_subs2:
+                            # fix the length of the object stream
+                            block = fix_stream_length(block)
+                            replaced_arXiv = True
+                    new_data.append(block)
+                    char_count += len(block)
+                    current_obj = []
+                elif current_obj:
+                    # If we're recording an object, simply add the line to it
+                    current_obj.append(line)
+                elif line == b"xref\n":
+                    # We found the xref table, record its position and write it
+                    # out using our updated indices.
+                    startxref = sum(map(len, new_data))
+                    new_data.append(line)
+                    new_data.append(b"0 %i\n" % (len(xref) + 1))
+                    new_data.append(b"0000000000 65535 f \n")
+                    for objid in sorted(xref):
+                        new_data.append(b"%010d 00000 n \n" % xref[objid])
+
+                    # skip the appropriate number of lines
+                    skip_n = len(xref) + 2
+                elif line == b"startxref\n":
+                    # Write out our recorded startxref position, skip the old
+                    # position.
+                    new_data.append(b"startxref\n%i\n" % startxref)
+                    skip_n = 1
+                else:
+                    # Anything else passes through
+                    new_data.append(line)
+                    char_count += len(line)
 
         removed_file = basename + "_removed.pdf"
-        with open(removed_file, "wb") as oid:
-            oid.write(data)
+        with open(removed_file, "wb") as fp:
+            fp.write(b"".join(new_data))
 
         output_file = basename + "_dearxiv.pdf"
         self.compress_pdf(removed_file, output_file)
         return output_file
+
+
+def fix_stream_length(block):
+    # This fixes the stream length of a block, which is needed after we have
+    # removed the arXiv stamp.
+    count = 0
+    block = block.split(b"\n")
+    do_count = False
+
+    for line in block:
+        if line in [b"stream", b"endstream"]:
+            do_count = not do_count
+            continue
+
+        if do_count:
+            # +1 for the newline character
+            count += len(line) + 1
+
+    new_block = []
+    for line in block:
+        if b" /Length " in line:
+            new_block.append(b"<< /Length %i >>" % count)
+        else:
+            new_block.append(line)
+
+    return b"\n".join(new_block)
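As an illustration of the new helper (this example is not part of the commit), feeding fix_stream_length a small uncompressed object whose stamp text was already blanked out recomputes the stale /Length header from the actual stream content:

    block = b"\n".join([
        b"5 0 obj",
        b"<< /Length 999 >>",  # stale length left over from the edit
        b"stream",
        b"BT ()Tj ET",
        b"endstream",
        b"endobj",
    ])
    print(fix_stream_length(block))
    # b'5 0 obj\n<< /Length 11 >>\nstream\nBT ()Tj ET\nendstream\nendobj'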