aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJocelyn Boullier <jocelyn@boullier.bzh>2021-03-01 21:08:17 +0100
committerJocelyn Boullier <jocelyn@boullier.bzh>2021-03-02 21:35:03 +0100
commit47cde8628b024902d8a1ad9e1cf57b56c8c92442 (patch)
tree3e9cf2db3c1c55821c34e91cb0fd4425724ab128
parentRemove unnecessary check in arXiv unit test (diff)
downloadpaper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.tar.gz
paper2remarkable-47cde8628b024902d8a1ad9e1cf57b56c8c92442.zip
feat: copy ToC over from original file
-rw-r--r--paper2remarkable/pdf_ops.py43
-rw-r--r--paper2remarkable/providers/_base.py11
2 files changed, 53 insertions, 1 deletions
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index c365920..aca055d 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -82,3 +82,46 @@ def shrink_pdf(filepath, gs_path="gs"):
logger.info("Shrinking has no effect for this file, using original.")
return filepath
return output_file
+
+
def copy_toc(toc, filepath):
    """Write a copy of the PDF at ``filepath`` with ``toc`` added as bookmarks.

    Args:
        toc: Iterable of ``(level, page, title)`` entries in document order,
            in the shape produced by ``get_toc``. ``level`` is the nesting
            depth of the entry, ``page`` the page number handed to
            ``PdfFileWriter.addBookmark`` and ``title`` the bookmark text.
        filepath: Path of the PDF whose content is cloned.

    Returns:
        Path of the newly written file, i.e. ``filepath`` with its extension
        replaced by ``-with-toc.pdf``.
    """
    logger.info("Copying table of content ...")
    reader = PyPDF2.PdfFileReader(filepath)
    output_pdf = PyPDF2.PdfFileWriter()
    output_pdf.cloneDocumentFromReader(reader)

    # Stack of (level, bookmark) pairs for the currently "open" ancestors,
    # shallowest first. Keeping only the open chain (instead of the last
    # bookmark ever seen per level) means a malformed table of content that
    # jumps from level 1 straight to level 3 is attached to the nearest
    # preceding shallower entry, never to a stale level 2 bookmark that
    # belongs to an earlier section.
    open_ancestors = []

    for level, page, title in toc:
        # Entries at the same or a deeper level are siblings or belong to a
        # finished subtree, so they cannot be the parent of this entry.
        while open_ancestors and open_ancestors[-1][0] >= level:
            open_ancestors.pop()
        parent = open_ancestors[-1][1] if open_ancestors else None

        bookmark = output_pdf.addBookmark(title, page, parent=parent, fit="/Fit")
        open_ancestors.append((level, bookmark))

    output_file = os.path.splitext(filepath)[0] + "-with-toc.pdf"
    with open(output_file, "wb") as f:
        output_pdf.write(f)

    return output_file
+
+
def get_toc(filepath):
    """Return the outline entries of the PDF at ``filepath`` as a flat list.

    Each element is a ``(level, page_number, title)`` tuple as yielded by
    ``yield_outlines``.
    """
    pdf = PyPDF2.PdfFileReader(filepath)
    outline_tree = pdf.getOutlines()
    return [*yield_outlines(pdf, outline_tree)]
+
+
def yield_outlines(reader, outlines, level=0):
    """Flatten a nested outline structure into ``(level, page, title)`` tuples.

    ``outlines`` is either a list (whose items are visited recursively, one
    level deeper than ``level``) or a single outline entry. For a single
    entry the page number is obtained from
    ``reader.getDestinationPageNumber`` and the title from its ``"/Title"``
    key. Entries are yielded in the order they are encountered.
    """
    if not isinstance(outlines, list):
        # Leaf: a single outline entry.
        yield level, reader.getDestinationPageNumber(outlines), outlines["/Title"]
        return

    child_level = level + 1
    for child in outlines:
        yield from yield_outlines(reader, child, level=child_level)
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 0453c7a..cbdae25 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -17,7 +17,7 @@ import time
from ..exceptions import _CalledProcessError
from ..log import Logger
-from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf
+from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf, get_toc, copy_toc
from ..utils import (
assert_file_is_pdf,
check_pdftool,
@@ -84,8 +84,10 @@ class Provider(metaclass=abc.ABCMeta):
elif crop == "left":
self.operations.append(("crop", self.crop_pdf))
+ self.blank = blank
if blank:
self.operations.append(("blank", blank_pdf))
+
self.operations.append(("shrink", self.shrink_pdf))
logger.info("Starting %s provider" % type(self).__name__)
@@ -215,9 +217,16 @@ class Provider(metaclass=abc.ABCMeta):
assert_file_is_pdf(tmp_filename)
+ toc = get_toc(tmp_filename)
+
intermediate_fname = tmp_filename
for opname, op in self.operations:
intermediate_fname = op(intermediate_fname)
+
+ # TODO: handle ToC with blank pages.
+ if not self.blank:
+ copy_toc(toc, intermediate_fname)
+
shutil.copy(intermediate_fname, clean_filename)
if self.debug: