| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 20:03:00 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 20:03:00 +0100 |
| commit | 62e2b5e8fd42ff7857c555cf45f6bd688c1e527f (patch) | |
| tree | cb5ecd748b3d6b124e8c45a54c07122ac789d31e | |
| parent | Code formatting with Black (diff) | |
| parent | Fix the pdf output of dearxiv (diff) | |
| download | paper2remarkable-62e2b5e8fd42ff7857c555cf45f6bd688c1e527f.tar.gz paper2remarkable-62e2b5e8fd42ff7857c555cf45f6bd688c1e527f.zip | |
Merge branch 'bugfix/arXiv_stamp'
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | paper2remarkable/providers/_base.py | 7 |
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 109 |
2 files changed, 98 insertions, 18 deletions
```diff
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 53ad78e..20349c2 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -122,16 +122,11 @@ class Provider(metaclass=abc.ABCMeta):
                 [self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
             )
         elif self.pdftool == "qpdf":
-            # TODO: the status == 3 is only needed because when we remove
-            # the arXiv stamp we don't fix the length of the pdf object. This
-            # causes qpdf to raise a warning and give a nonzero exit status.
-            # Fixing the pdf object is the right approach, but this does
-            # work as it is since qpdf fixes the file for us.
             status = subprocess.call(
                 [self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,],
                 stderr=subprocess.DEVNULL,
             )
-            if not (status == 0 or status == 3):
+            if not status == 0:
                 raise _CalledProcessError(
                     "%s failed to compress the PDF file." % self.pdftool
                 )
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 74043ed..7f3d554 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -71,22 +71,107 @@ class Arxiv(Provider):
         uncompress_file = basename + "_uncompress.pdf"
         self.uncompress_pdf(input_file, uncompress_file)
 
-        with open(uncompress_file, "rb") as fid:
-            data = fid.read()
-            # Remove the text element
-            data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
-            # Remove the URL element
-            data = re.sub(
-                b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
-                b"",
-                data,
-            )
+        new_data = []
+        current_obj = []
+        replaced_arXiv = False
+        char_count = skip_n = startxref = 0
+        xref = {}
+
+        with open(uncompress_file, "rb") as fp:
+            for line in fp:
+                if skip_n:
+                    # Skip a line
+                    skip_n -= 1
+                    continue
+
+                if line.endswith(b" obj\n"):
+                    # Start a new object. Add it to the current object and
+                    # record its position for the xref table.
+                    current_obj.append(line)
+                    objid = int(line.split(b" ")[0])
+                    xref[objid] = char_count
+                elif current_obj and line == b"endobj\n":
+                    # End the current object. If needed, replace the arXiv
+                    # stamp in the block (done only once). Reset current
+                    # object.
+                    current_obj.append(line)
+                    block = b"".join(current_obj)
+                    if not replaced_arXiv:
+                        # remove the text
+                        block, n_subs1 = re.subn(
+                            b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
+                            b"()Tj",
+                            block,
+                        )
+                        # remove the url
+                        block, n_subs2 = re.subn(
+                            b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
+                            b"",
+                            block,
+                        )
+                        if n_subs1 or n_subs2:
+                            # fix the length of the object stream
+                            block = fix_stream_length(block)
+                            replaced_arXiv = True
+                    new_data.append(block)
+                    char_count += len(block)
+                    current_obj = []
+                elif current_obj:
+                    # If we're recording an object, simply add the line to it
+                    current_obj.append(line)
+                elif line == b"xref\n":
+                    # We found the xref table, record its position and write it
+                    # out using our updated indices.
+                    startxref = sum(map(len, new_data))
+                    new_data.append(line)
+                    new_data.append(b"0 %i\n" % (len(xref) + 1))
+                    new_data.append(b"0000000000 65535 f \n")
+                    for objid in sorted(xref):
+                        new_data.append(b"%010d 00000 n \n" % xref[objid])
+
+                    # skip the appropriate number of lines
+                    skip_n = len(xref) + 2
+                elif line == b"startxref\n":
+                    # Write out our recorded startxref position, skip the old
+                    # position.
+                    new_data.append(b"startxref\n%i\n" % startxref)
+                    skip_n = 1
+                else:
+                    # Anything else passes through
+                    new_data.append(line)
+                    char_count += len(line)
 
         removed_file = basename + "_removed.pdf"
-        with open(removed_file, "wb") as oid:
-            oid.write(data)
+        with open(removed_file, "wb") as fp:
+            fp.write(b"".join(new_data))
 
         output_file = basename + "_dearxiv.pdf"
         self.compress_pdf(removed_file, output_file)
 
         return output_file
+
+
+def fix_stream_length(block):
+    # This fixes the stream length of a block, which is needed after we have
+    # removed the arXiv stamp.
+    count = 0
+    block = block.split(b"\n")
+    do_count = False
+
+    for line in block:
+        if line in [b"stream", b"endstream"]:
+            do_count = not do_count
+            continue
+
+        if do_count:
+            # +1 for the newline character
+            count += len(line) + 1
+
+    new_block = []
+    for line in block:
+        if b" /Length " in line:
+            new_block.append(b"<< /Length %i >>" % count)
+        else:
+            new_block.append(line)
+
+    return b"\n".join(new_block)
```
