From 47cde8628b024902d8a1ad9e1cf57b56c8c92442 Mon Sep 17 00:00:00 2001 From: Jocelyn Boullier Date: Mon, 1 Mar 2021 21:08:17 +0100 Subject: feat: copy ToC over from original file --- paper2remarkable/pdf_ops.py | 43 +++++++++++++++++++++++++++++++++++++ paper2remarkable/providers/_base.py | 11 +++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index c365920..aca055d 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -82,3 +82,46 @@ def shrink_pdf(filepath, gs_path="gs"): logger.info("Shrinking has no effect for this file, using original.") return filepath return output_file + + +def copy_toc(toc, filepath): + logger.info("Copying table of content ...") + reader = PyPDF2.PdfFileReader(filepath) + output_pdf = PyPDF2.PdfFileWriter() + output_pdf.cloneDocumentFromReader(reader) + + # It holds the corresponding bookmark for the last level seen, which will be retrieved to + # specify the parent when we add the bookmark, to generate nested bookmarks. + # It assumes the table of content is well constructed and doesn't jump from a level 1 to a + # level 3 title without going through a level 2 at first. If it does, the parent bookmark + # associated to the level 3 could be wrong if we saw a level 2 previously (but not the right + # now obviously). 
+ level_last_bookmarks = {} + + for level, page, title in toc: + parent = None + if level > 0: + parent = level_last_bookmarks.get(level - 1) + + bookmark = output_pdf.addBookmark(title, page, parent=parent, fit="/Fit") + level_last_bookmarks[level] = bookmark + + output_file = os.path.splitext(filepath)[0] + "-with-toc.pdf" + with open(output_file, "wb") as f: + output_pdf.write(f) + + return output_file + + +def get_toc(filepath): + input_pdf = PyPDF2.PdfFileReader(filepath) + return list(yield_outlines(input_pdf, input_pdf.getOutlines())) + + +def yield_outlines(reader, outlines, level=0): + if isinstance(outlines, list): + for item in outlines: + yield from yield_outlines(reader, item, level=level + 1) + else: + page_number = reader.getDestinationPageNumber(outlines) + yield level, page_number, outlines["/Title"] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 0453c7a..cbdae25 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -17,7 +17,7 @@ import time from ..exceptions import _CalledProcessError from ..log import Logger -from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf +from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf, get_toc, copy_toc from ..utils import ( assert_file_is_pdf, check_pdftool, @@ -84,8 +84,10 @@ class Provider(metaclass=abc.ABCMeta): elif crop == "left": self.operations.append(("crop", self.crop_pdf)) + self.blank = blank if blank: self.operations.append(("blank", blank_pdf)) + self.operations.append(("shrink", self.shrink_pdf)) logger.info("Starting %s provider" % type(self).__name__) @@ -215,9 +217,16 @@ class Provider(metaclass=abc.ABCMeta): assert_file_is_pdf(tmp_filename) + toc = get_toc(tmp_filename) + intermediate_fname = tmp_filename for opname, op in self.operations: intermediate_fname = op(intermediate_fname) + + # TODO: handle ToC with blank pages. 
+ if not self.blank: + copy_toc(toc, intermediate_fname) + shutil.copy(intermediate_fname, clean_filename) if self.debug: -- cgit v1.2.3 From 9c268d299f9dc605f9d768e557f2887a8f7e80c8 Mon Sep 17 00:00:00 2001 From: Jocelyn Boullier Date: Mon, 1 Mar 2021 22:40:09 +0100 Subject: feat: use pikepdf instead of PyPDF2, bumps Python min. version to 3.6 Several reasons: 1. PyPDF2 isn't maintained anymore. 2. On PDF files with lots of pages, you hit a recursion limit because of the way PyPDF2 is written. The `_sweepIndirectReferences` function is recursive instead of being iterative. 3. Performance. PyPDF2 is a pure Python library, while pikepdf uses QPDF, a C++ library under the hood. It is much faster. This is quite noticeable when processing PDFs such as books. 4. pikepdf fixes PDFs. While implementing the ToC feature, I noticed that for some PDFs it didn't show up on the Remarkable, even before processing it. For some reason, simply opening a PDF with pikepdf and saving it again fixes the issue. So we get it fixed for free. --- paper2remarkable/crop.py | 33 +++++++++++--------- paper2remarkable/pdf_ops.py | 62 +++++++------------------------------ paper2remarkable/providers/_base.py | 9 +----- paper2remarkable/utils.py | 12 +++---- setup.py | 4 +-- 5 files changed, 38 insertions(+), 82 deletions(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 623d29f..16d050e 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -8,13 +8,12 @@ Copyright: 2019, G.J.J. 
van den Burg """ -import PyPDF2 import io import os import pdfplumber import subprocess -from PyPDF2.generic import RectangleObject +from pikepdf import Pdf from .log import Logger @@ -64,7 +63,7 @@ class Cropper(object): ): if not input_file is None: self.input_file = os.path.abspath(input_file) - self.reader = PyPDF2.PdfFileReader(self.input_file) + self.reader = Pdf.open(self.input_file) if not output_file is None: self.output_file = os.path.abspath(output_file) @@ -72,7 +71,6 @@ class Cropper(object): pdftoppm_path = None self.pdftoppm_path = pdftoppm_path - self.writer = PyPDF2.PdfFileWriter() def crop(self, margins=1): return self.process_file(self.crop_page, margins=margins) @@ -84,15 +82,16 @@ class Cropper(object): return self.process_file(self.right_page, padding=padding) def process_file(self, page_func, *args, **kwargs): - n = self.reader.getNumPages() + n = len(self.reader.pages) for page_idx in range(n): status = page_func(page_idx, *args, **kwargs) if not status == 0: return status if (page_idx + 1) % 10 == 0: logger.info("Processing pages ... (%i/%i)" % (page_idx + 1, n)) - with open(self.output_file, "wb") as fp: - self.writer.write(fp) + + self.reader.save(self.output_file) + self.reader.close() if n % 10 > 0: logger.info("Processing pages ... (%i/%i)" % (n, n)) return 0 @@ -112,21 +111,25 @@ class Cropper(object): def export_page(self, page_idx): """Helper function that exports a single page given by index """ - page = self.reader.getPage(page_idx) - writer = PyPDF2.PdfFileWriter() - writer.addPage(page) + page = self.reader.pages[page_idx] + + writer = Pdf.new() + writer.pages.append(page) + # Remove the annotations to avoid warning about `Bad annotation destination` when + # processing the page with pdftoppm. Since we've appended the page to the writer, pikepdf + # does a copy of the copy and we're not modifying the original page's annotations. 
+ writer.pages[0].Annots = [] + tmpfname = "./page.pdf" - with open(tmpfname, "wb") as fp: - writer.write(fp) + writer.save(tmpfname) + writer.close() return tmpfname def process_page(self, page_idx, bbox_func, *args, **kwargs): """Process a single page and add it to the writer """ tmpfname = self.export_page(page_idx) bbox = bbox_func(tmpfname, *args, **kwargs) - thepage = self.reader.getPage(page_idx) - thepage.cropBox = RectangleObject(bbox) - self.writer.addPage(thepage) + self.reader.pages[page_idx].CropBox = bbox os.unlink(tmpfname) return 0 diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index aca055d..93f1200 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -9,10 +9,11 @@ Copyright: 2019, G.J.J. van den Burg """ -import PyPDF2 import os import subprocess +from pikepdf import Pdf + from .crop import Cropper from .log import Logger @@ -42,15 +43,17 @@ def prepare_pdf(filepath, operation, pdftoppm_path="pdftoppm"): def blank_pdf(filepath): """Add blank pages to PDF""" logger.info("Adding blank pages") - input_pdf = PyPDF2.PdfFileReader(filepath) - output_pdf = PyPDF2.PdfFileWriter() - for page in input_pdf.pages: - output_pdf.addPage(page) - output_pdf.addBlankPage() + pdf = Pdf.open(filepath) + + previous_pages = pdf.pages + pdf.pages = [] + + for page in previous_pages: + pdf.pages.append(page) + pdf.add_blank_page() output_file = os.path.splitext(filepath)[0] + "-blank.pdf" - with open(output_file, "wb") as fp: - output_pdf.write(fp) + pdf.save(output_file) return output_file @@ -82,46 +85,3 @@ def shrink_pdf(filepath, gs_path="gs"): logger.info("Shrinking has no effect for this file, using original.") return filepath return output_file - - -def copy_toc(toc, filepath): - logger.info("Copying table of content ...") - reader = PyPDF2.PdfFileReader(filepath) - output_pdf = PyPDF2.PdfFileWriter() - output_pdf.cloneDocumentFromReader(reader) - - # It holds the corresponding bookmark for the last level seen, 
which will be retrieved to - # specify the parent when we add the bookmark, to generate nested bookmarks. - # It assumes the table of content is well constructed and doesn't jump from a level 1 to a - # level 3 title without going through a level 2 at first. If it does, the parent bookmark - # associated to the level 3 could be wrong if we saw a level 2 previously (but not the right - # now obviously). - level_last_bookmarks = {} - - for level, page, title in toc: - parent = None - if level > 0: - parent = level_last_bookmarks.get(level - 1) - - bookmark = output_pdf.addBookmark(title, page, parent=parent, fit="/Fit") - level_last_bookmarks[level] = bookmark - - output_file = os.path.splitext(filepath)[0] + "-with-toc.pdf" - with open(output_file, "wb") as f: - output_pdf.write(f) - - return output_file - - -def get_toc(filepath): - input_pdf = PyPDF2.PdfFileReader(filepath) - return list(yield_outlines(input_pdf, input_pdf.getOutlines())) - - -def yield_outlines(reader, outlines, level=0): - if isinstance(outlines, list): - for item in outlines: - yield from yield_outlines(reader, item, level=level + 1) - else: - page_number = reader.getDestinationPageNumber(outlines) - yield level, page_number, outlines["/Title"] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index cbdae25..369d566 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -17,7 +17,7 @@ import time from ..exceptions import _CalledProcessError from ..log import Logger -from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf, get_toc, copy_toc +from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, check_pdftool, @@ -84,7 +84,6 @@ class Provider(metaclass=abc.ABCMeta): elif crop == "left": self.operations.append(("crop", self.crop_pdf)) - self.blank = blank if blank: self.operations.append(("blank", blank_pdf)) @@ -217,16 +216,10 @@ class Provider(metaclass=abc.ABCMeta): 
assert_file_is_pdf(tmp_filename) - toc = get_toc(tmp_filename) - intermediate_fname = tmp_filename for opname, op in self.operations: intermediate_fname = op(intermediate_fname) - # TODO: handle ToC with blank pages. - if not self.blank: - copy_toc(toc, intermediate_fname) - shutil.copy(intermediate_fname, clean_filename) if self.debug: diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 2432916..0003103 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -8,7 +8,6 @@ Copyright: 2019, G.J.J. van den Burg """ -import PyPDF2 import regex import requests import string @@ -16,6 +15,8 @@ import subprocess import time import unidecode +from pikepdf import Pdf, PdfError + from .log import Logger from .exceptions import FileTypeError, RemarkableError, NoPDFToolError @@ -45,15 +46,14 @@ def clean_string(s): def assert_file_is_pdf(filename): """Assert that a given file is a PDF file. - This is done by trying to open it using PyPDF2. + This is done by trying to open it using pikepdf. """ try: - fp = open(filename, "rb") - pdf = PyPDF2.PdfFileReader(fp, strict=False) - fp.close() + pdf = Pdf.open(filename) + pdf.close() del pdf return True - except PyPDF2.utils.PdfReadError: + except PdfError: raise FileTypeError(filename, "pdf") diff --git a/setup.py b/setup.py index e529cc2..fb7d21c 100644 --- a/setup.py +++ b/setup.py @@ -13,13 +13,13 @@ EMAIL = "gertjanvandenburg@gmail.com" LICENSE = "MIT" LICENSE_TROVE = "License :: OSI Approved :: MIT License" NAME = "paper2remarkable" -REQUIRES_PYTHON = ">=3.5.0" +REQUIRES_PYTHON = ">=3.6.0" URL = "https://github.com/GjjvdBurg/paper2remarkable" VERSION = None # What packages are required for this module to be executed? 
REQUIRED = [ - "PyPDF2>=1.26", + "pikepdf>=2.8.0", "beautifulsoup4>=4.8", "html2text>=2020.1.16", "markdown>=3.1.1", -- cgit v1.2.3 From ee2d05d88d1972d56e93f8c179690fe3a6e308a3 Mon Sep 17 00:00:00 2001 From: Jocelyn Boullier Date: Tue, 2 Mar 2021 10:14:52 +0100 Subject: tests: add test for keeping the ToC after processing --- tests/test_providers.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index eaeb8aa..9eec83b 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -11,6 +11,7 @@ import pdfplumber import shutil import tempfile import unittest +from pikepdf import Pdf from paper2remarkable.providers import ( ACL, @@ -34,6 +35,7 @@ from paper2remarkable.providers import ( Springer, TandFOnline, ) +from paper2remarkable.utils import download_url VERBOSE = False @@ -438,6 +440,16 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_local_file_copy_toc(self): + """Make sure the table of content is kept after processing.""" + local_filename = "test.pdf" + download_url("https://arxiv.org/pdf/1711.03512.pdf", local_filename) + prov = LocalFile(upload=False, verbose=VERBOSE) + filename = prov.run(local_filename) + with Pdf.open(filename) as pdf: + with pdf.open_outline() as outline: + assert len(outline.root) > 0 + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From db8ca7a5eb461a3b1c165407a731fb4a71a199fe Mon Sep 17 00:00:00 2001 From: Jocelyn Boullier Date: Tue, 2 Mar 2021 21:37:57 +0100 Subject: tests: add additional test about ToC, this time with arXiv provider --- tests/test_providers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index 9eec83b..af69c64 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -450,6 +450,14 @@ class TestProviders(unittest.TestCase): with pdf.open_outline() as outline: assert 
len(outline.root) > 0 + def test_arxiv_copy_toc(self): + """Make sure the table of content is kept after processing when using the arXiv provider.""" + prov = Arxiv(upload=False, verbose=VERBOSE) + filename = prov.run("https://arxiv.org/abs/1711.03512") + with Pdf.open(filename) as pdf: + with pdf.open_outline() as outline: + assert len(outline.root) > 0 + if __name__ == "__main__": unittest.main() -- cgit v1.2.3