diff options
| author | Jocelyn Boullier <jocelyn@boullier.bzh> | 2021-03-01 21:08:17 +0100 |
|---|---|---|
| committer | Jocelyn Boullier <jocelyn@boullier.bzh> | 2021-03-02 21:35:03 +0100 |
| commit | 47cde8628b024902d8a1ad9e1cf57b56c8c92442 (patch) | |
| tree | 3e9cf2db3c1c55821c34e91cb0fd4425724ab128 | |
| parent | Remove unnecessary check in arXiv unit test (diff) | |
| download | paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.tar.gz paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.zip | |
feat: copy ToC over from original file
| -rw-r--r-- | paper2remarkable/pdf_ops.py | 43 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 11 |
2 files changed, 53 insertions, 1 deletions
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index c365920..aca055d 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -82,3 +82,46 @@ def shrink_pdf(filepath, gs_path="gs"): logger.info("Shrinking has no effect for this file, using original.") return filepath return output_file + + +def copy_toc(toc, filepath): + logger.info("Copying table of content ...") + reader = PyPDF2.PdfFileReader(filepath) + output_pdf = PyPDF2.PdfFileWriter() + output_pdf.cloneDocumentFromReader(reader) + + # It holds the corresponding bookmark for the last level seen, which will be retrieved to + # specify the parent when we add the bookmark, to generate nested bookmarks. + # It assumes the table of content is well constructed and doesn't jump from a level 1 to a + # level 3 title without going through a level 2 at first. If it does, the parent bookmark + # associated to the level 3 could be wrong if we saw a level 2 previously (but not the right + # one, obviously). 
+ level_last_bookmarks = {} + + for level, page, title in toc: + parent = None + if level > 0: + parent = level_last_bookmarks.get(level - 1) + + bookmark = output_pdf.addBookmark(title, page, parent=parent, fit="/Fit") + level_last_bookmarks[level] = bookmark + + output_file = os.path.splitext(filepath)[0] + "-with-toc.pdf" + with open(output_file, "wb") as f: + output_pdf.write(f) + + return output_file + + +def get_toc(filepath): + input_pdf = PyPDF2.PdfFileReader(filepath) + return list(yield_outlines(input_pdf, input_pdf.getOutlines())) + + +def yield_outlines(reader, outlines, level=0): + if isinstance(outlines, list): + for item in outlines: + yield from yield_outlines(reader, item, level=level + 1) + else: + page_number = reader.getDestinationPageNumber(outlines) + yield level, page_number, outlines["/Title"] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 0453c7a..cbdae25 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -17,7 +17,7 @@ import time from ..exceptions import _CalledProcessError from ..log import Logger -from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf +from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf, get_toc, copy_toc from ..utils import ( assert_file_is_pdf, check_pdftool, @@ -84,8 +84,10 @@ class Provider(metaclass=abc.ABCMeta): elif crop == "left": self.operations.append(("crop", self.crop_pdf)) + self.blank = blank if blank: self.operations.append(("blank", blank_pdf)) + self.operations.append(("shrink", self.shrink_pdf)) logger.info("Starting %s provider" % type(self).__name__) @@ -215,9 +217,16 @@ class Provider(metaclass=abc.ABCMeta): assert_file_is_pdf(tmp_filename) + toc = get_toc(tmp_filename) + intermediate_fname = tmp_filename for opname, op in self.operations: intermediate_fname = op(intermediate_fname) + + # TODO: handle ToC with blank pages. 
+ if not self.blank: + copy_toc(toc, intermediate_fname) + shutil.copy(intermediate_fname, clean_filename) if self.debug: |
