author    Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-04-04 20:03:00 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-04-04 20:03:00 +0100
commit    62e2b5e8fd42ff7857c555cf45f6bd688c1e527f (patch)
tree      cb5ecd748b3d6b124e8c45a54c07122ac789d31e
parent    Code formatting with Black (diff)
parent    Fix the pdf output of dearxiv (diff)
download  paper2remarkable-62e2b5e8fd42ff7857c555cf45f6bd688c1e527f.tar.gz
          paper2remarkable-62e2b5e8fd42ff7857c555cf45f6bd688c1e527f.zip
Merge branch 'bugfix/arXiv_stamp'
-rw-r--r--  paper2remarkable/providers/_base.py    7
-rw-r--r--  paper2remarkable/providers/arxiv.py  109
2 files changed, 98 insertions, 18 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 53ad78e..20349c2 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -122,16 +122,11 @@ class Provider(metaclass=abc.ABCMeta):
[self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
)
elif self.pdftool == "qpdf":
- # TODO: the status == 3 is only needed because when we remove
- # the arXiv stamp we don't fix the length of the pdf object. This
- # causes qpdf to raise a warning and give a nonzero exit status.
- # Fixing the pdf object is the right approach, but this does
- # work as it is since qpdf fixes the file for us.
status = subprocess.call(
[self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,],
stderr=subprocess.DEVNULL,
)
- if not (status == 0 or status == 3):
+ if not status == 0:
raise _CalledProcessError(
"%s failed to compress the PDF file." % self.pdftool
)
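
The removed TODO explains the old workaround: after the arXiv stamp was stripped, the PDF contained a stream object with a stale length, so qpdf repaired the file itself but exited with status 3, and the code had to tolerate that. Now that the patch fixes the object length at the source (see arxiv.py below), only a clean exit is accepted. A minimal sketch of the resulting check, written as a standalone function rather than the project's Provider method:

```python
import subprocess

def compress_with_qpdf(qpdf_path, in_pdf, out_pdf):
    # qpdf recompresses all stream data; since the input PDF now has
    # correct /Length entries, any nonzero exit status is a genuine
    # failure rather than a warning about a repairable file.
    status = subprocess.call(
        [qpdf_path, "--stream-data=compress", in_pdf, out_pdf],
        stderr=subprocess.DEVNULL,
    )
    if status != 0:
        # the project raises its own _CalledProcessError here
        raise RuntimeError("qpdf failed to compress the PDF file.")
```
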
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 74043ed..7f3d554 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -71,22 +71,107 @@ class Arxiv(Provider):
uncompress_file = basename + "_uncompress.pdf"
self.uncompress_pdf(input_file, uncompress_file)
- with open(uncompress_file, "rb") as fid:
- data = fid.read()
- # Remove the text element
- data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
- # Remove the URL element
- data = re.sub(
- b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
- b"",
- data,
- )
+ new_data = []
+ current_obj = []
+ replaced_arXiv = False
+ char_count = skip_n = startxref = 0
+ xref = {}
+
+ with open(uncompress_file, "rb") as fp:
+ for line in fp:
+ if skip_n:
+ # Skip a line
+ skip_n -= 1
+ continue
+
+ if line.endswith(b" obj\n"):
+ # Start a new object. Add it to the current object and
+ # record its position for the xref table.
+ current_obj.append(line)
+ objid = int(line.split(b" ")[0])
+ xref[objid] = char_count
+ elif current_obj and line == b"endobj\n":
+ # End the current object. If needed, replace the arXiv
+ # stamp in the block (done only once). Reset current
+ # object.
+ current_obj.append(line)
+ block = b"".join(current_obj)
+ if not replaced_arXiv:
+ # remove the text
+ block, n_subs1 = re.subn(
+ b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
+ b"()Tj",
+ block,
+ )
+ # remove the url
+ block, n_subs2 = re.subn(
+ b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
+ b"",
+ block,
+ )
+ if n_subs1 or n_subs2:
+ # fix the length of the object stream
+ block = fix_stream_length(block)
+ replaced_arXiv = True
+ new_data.append(block)
+ char_count += len(block)
+ current_obj = []
+ elif current_obj:
+ # If we're recording an object, simply add the line to it
+ current_obj.append(line)
+ elif line == b"xref\n":
+ # We found the xref table, record its position and write it
+ # out using our updated indices.
+ startxref = sum(map(len, new_data))
+ new_data.append(line)
+ new_data.append(b"0 %i\n" % (len(xref) + 1))
+ new_data.append(b"0000000000 65535 f \n")
+ for objid in sorted(xref):
+ new_data.append(b"%010d 00000 n \n" % xref[objid])
+
+ # skip the appropriate number of lines
+ skip_n = len(xref) + 2
+ elif line == b"startxref\n":
+ # Write out our recorded startxref position, skip the old
+ # position.
+ new_data.append(b"startxref\n%i\n" % startxref)
+ skip_n = 1
+ else:
+ # Anything else passes through
+ new_data.append(line)
+ char_count += len(line)
removed_file = basename + "_removed.pdf"
- with open(removed_file, "wb") as oid:
- oid.write(data)
+ with open(removed_file, "wb") as fp:
+ fp.write(b"".join(new_data))
output_file = basename + "_dearxiv.pdf"
self.compress_pdf(removed_file, output_file)
return output_file
+
+
+def fix_stream_length(block):
+ # This fixes the stream length of a block, which is needed after we have
+ # removed the arXiv stamp.
+ count = 0
+ block = block.split(b"\n")
+ do_count = False
+
+ for line in block:
+ if line in [b"stream", b"endstream"]:
+ do_count = not do_count
+ continue
+
+ if do_count:
+ # +1 for the newline character
+ count += len(line) + 1
+
+ new_block = []
+ for line in block:
+ if b" /Length " in line:
+ new_block.append(b"<< /Length %i >>" % count)
+ else:
+ new_block.append(line)
+
+ return b"\n".join(new_block)
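
For context on why both halves of the new code are needed: in a PDF, every stream object declares its exact byte length in a /Length entry, and the xref table maps each object number to its byte offset in the file. Blanking the stamp text shortens one stream and shifts every object after it, so the patch rewrites that object's /Length via fix_stream_length and rebuilds the xref offsets from the running char_count. A toy check of fix_stream_length on a fabricated object; the single-line "<< /Length N >>" dictionary shape matches what the function targets, but the object content here is made up for illustration:

```python
# Fabricated single-stream object with a stale /Length, as if the stamp
# text had just been blanked out of its content stream.
obj = b"\n".join([
    b"5 0 obj",
    b"<< /Length 22 >>",  # stale: counts the stream before the edit
    b"stream",
    b"BT ()Tj ET",        # 10 bytes + newline = 11 bytes of stream data
    b"endstream",
    b"endobj",
    b"",                  # trailing newline after endobj
])

fixed = fix_stream_length(obj)
assert b"<< /Length 11 >>" in fixed
```
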