feat: copy ToC over from original file

author: Jocelyn Boullier <jocelyn@boullier.bzh> 2021-03-01 21:08:17 +0100
committer: Jocelyn Boullier <jocelyn@boullier.bzh> 2021-03-02 21:35:03 +0100
commit: 47cde8628b024902d8a1ad9e1cf57b56c8c92442 (patch)
tree: 3e9cf2db3c1c55821c34e91cb0fd4425724ab128
parent: Remove unnecessary check in arXiv unit test (diff)
download: paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.tar.gz
paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.zip
2 files changed, 53 insertions, 1 deletions
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index c365920..aca055d 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -82,3 +82,46 @@ def shrink_pdf(filepath, gs_path="gs"):
         logger.info("Shrinking has no effect for this file, using original.")
         return filepath
     return output_file
+
+
+def copy_toc(toc, filepath):
+    logger.info("Copying table of content ...")
+    reader = PyPDF2.PdfFileReader(filepath)
+    output_pdf = PyPDF2.PdfFileWriter()
+    output_pdf.cloneDocumentFromReader(reader)
+
+    # Maps each level to the last bookmark seen at that level. It is looked up to find the
+    # parent when we add a bookmark, which is how nested bookmarks are generated.
+    # It assumes the table of contents is well constructed and doesn't jump from a level 1 to a
+    # level 3 title without going through a level 2 first. If it does, the parent bookmark
+    # associated with the level 3 title could be wrong: it would be a stale level 2 bookmark
+    # seen previously (obviously not the right one).
+    level_last_bookmarks = {}
+
+    for level, page, title in toc:
+        parent = None
+        if level > 0:
+            parent = level_last_bookmarks.get(level - 1)
+
+        bookmark = output_pdf.addBookmark(title, page, parent=parent, fit="/Fit")
+        level_last_bookmarks[level] = bookmark
+
+    output_file = os.path.splitext(filepath)[0] + "-with-toc.pdf"
+    with open(output_file, "wb") as f:
+        output_pdf.write(f)
+
+    return output_file
+
+
+def get_toc(filepath):
+    input_pdf = PyPDF2.PdfFileReader(filepath)
+    return list(yield_outlines(input_pdf, input_pdf.getOutlines()))
+
+
+def yield_outlines(reader, outlines, level=0):
+    if isinstance(outlines, list):
+        for item in outlines:
+            yield from yield_outlines(reader, item, level=level + 1)
+    else:
+        page_number = reader.getDestinationPageNumber(outlines)
+        yield level, page_number, outlines["/Title"]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 0453c7a..cbdae25 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -17,7 +17,7 @@ import time
 
 from ..exceptions import _CalledProcessError
 from ..log import Logger
-from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf
+from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf, get_toc, copy_toc
 from ..utils import (
     assert_file_is_pdf,
     check_pdftool,
@@ -84,8 +84,10 @@ class Provider(metaclass=abc.ABCMeta):
         elif crop == "left":
             self.operations.append(("crop", self.crop_pdf))
 
+        self.blank = blank
         if blank:
             self.operations.append(("blank", blank_pdf))
+
         self.operations.append(("shrink", self.shrink_pdf))
 
         logger.info("Starting %s provider" % type(self).__name__)
@@ -215,9 +217,16 @@ class Provider(metaclass=abc.ABCMeta):
 
             assert_file_is_pdf(tmp_filename)
 
+            toc = get_toc(tmp_filename)
+
             intermediate_fname = tmp_filename
             for opname, op in self.operations:
                 intermediate_fname = op(intermediate_fname)
+
+            # TODO: handle ToC with blank pages. NOTE(review): copy_toc's returned path is unused.
+            if not self.blank:
+                copy_toc(toc, intermediate_fname)
+
             shutil.copy(intermediate_fname, clean_filename)
 
             if self.debug:
author	Jocelyn Boullier <jocelyn@boullier.bzh>	2021-03-01 21:08:17 +0100
committer	Jocelyn Boullier <jocelyn@boullier.bzh>	2021-03-02 21:35:03 +0100
commit	47cde8628b024902d8a1ad9e1cf57b56c8c92442 (patch)
tree	3e9cf2db3c1c55821c34e91cb0fd4425724ab128
parent	Remove unnecessary check in arXiv unit test (diff)
download	paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.tar.gz paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.zip