aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJocelyn Boullier <jocelyn@boullier.bzh>2021-03-01 22:40:09 +0100
committerJocelyn Boullier <jocelyn@boullier.bzh>2021-03-02 21:35:03 +0100
commit9c268d299f9dc605f9d768e557f2887a8f7e80c8 (patch)
treef42e8fc28b7d165ed56863c24f108f07f365c375
parentfeat: copy ToC over from original file (diff)
downloadpaper2remarkable-9c268d299f9dc605f9d768e557f2887a8f7e80c8.tar.gz
paper2remarkable-9c268d299f9dc605f9d768e557f2887a8f7e80c8.zip
feat: use pikepdf instead of PyPDF2, bumps Python min. version to 3.6
Several reasons: 1. PyPDF2 isn't maintained anymore. 2. On PDF files with lots of pages, you hit a recursion limit because of the way PyPDF2 is written: the `_sweepIndirectReferences` function is recursive instead of iterative. 3. Performance. PyPDF2 is a pure Python library, while pikepdf uses QPDF, a C++ library, under the hood. It is much faster, which is quite noticeable when processing large PDFs such as books. 4. pikepdf fixes PDFs. While implementing the ToC feature, I noticed that for some PDFs the ToC didn't show up on the Remarkable, even before processing the file. For some reason, simply opening a PDF with pikepdf and saving it again fixes the issue, so we get that fix for free.
-rw-r--r--paper2remarkable/crop.py33
-rw-r--r--paper2remarkable/pdf_ops.py62
-rw-r--r--paper2remarkable/providers/_base.py9
-rw-r--r--paper2remarkable/utils.py12
-rw-r--r--setup.py4
5 files changed, 38 insertions, 82 deletions
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
index 623d29f..16d050e 100644
--- a/paper2remarkable/crop.py
+++ b/paper2remarkable/crop.py
@@ -8,13 +8,12 @@ Copyright: 2019, G.J.J. van den Burg
"""
-import PyPDF2
import io
import os
import pdfplumber
import subprocess
-from PyPDF2.generic import RectangleObject
+from pikepdf import Pdf
from .log import Logger
@@ -64,7 +63,7 @@ class Cropper(object):
):
if not input_file is None:
self.input_file = os.path.abspath(input_file)
- self.reader = PyPDF2.PdfFileReader(self.input_file)
+ self.reader = Pdf.open(self.input_file)
if not output_file is None:
self.output_file = os.path.abspath(output_file)
@@ -72,7 +71,6 @@ class Cropper(object):
pdftoppm_path = None
self.pdftoppm_path = pdftoppm_path
- self.writer = PyPDF2.PdfFileWriter()
def crop(self, margins=1):
return self.process_file(self.crop_page, margins=margins)
@@ -84,15 +82,16 @@ class Cropper(object):
return self.process_file(self.right_page, padding=padding)
def process_file(self, page_func, *args, **kwargs):
- n = self.reader.getNumPages()
+ n = len(self.reader.pages)
for page_idx in range(n):
status = page_func(page_idx, *args, **kwargs)
if not status == 0:
return status
if (page_idx + 1) % 10 == 0:
logger.info("Processing pages ... (%i/%i)" % (page_idx + 1, n))
- with open(self.output_file, "wb") as fp:
- self.writer.write(fp)
+
+ self.reader.save(self.output_file)
+ self.reader.close()
if n % 10 > 0:
logger.info("Processing pages ... (%i/%i)" % (n, n))
return 0
@@ -112,21 +111,25 @@ class Cropper(object):
def export_page(self, page_idx):
"""Helper function that exports a single page given by index """
- page = self.reader.getPage(page_idx)
- writer = PyPDF2.PdfFileWriter()
- writer.addPage(page)
+ page = self.reader.pages[page_idx]
+
+ writer = Pdf.new()
+ writer.pages.append(page)
+ # Remove the annotations to avoid warning about `Bad annotation destination` when
+ # processing the page with pdftoppm. Since we've appended the page to the writer, pikepdf
+ # does a copy of the copy and we're not modifying the original page's annotations.
+ writer.pages[0].Annots = []
+
tmpfname = "./page.pdf"
- with open(tmpfname, "wb") as fp:
- writer.write(fp)
+ writer.save(tmpfname)
+ writer.close()
return tmpfname
def process_page(self, page_idx, bbox_func, *args, **kwargs):
"""Process a single page and add it to the writer """
tmpfname = self.export_page(page_idx)
bbox = bbox_func(tmpfname, *args, **kwargs)
- thepage = self.reader.getPage(page_idx)
- thepage.cropBox = RectangleObject(bbox)
- self.writer.addPage(thepage)
+ self.reader.pages[page_idx].CropBox = bbox
os.unlink(tmpfname)
return 0
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index aca055d..93f1200 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -9,10 +9,11 @@ Copyright: 2019, G.J.J. van den Burg
"""
-import PyPDF2
import os
import subprocess
+from pikepdf import Pdf
+
from .crop import Cropper
from .log import Logger
@@ -42,15 +43,17 @@ def prepare_pdf(filepath, operation, pdftoppm_path="pdftoppm"):
def blank_pdf(filepath):
"""Add blank pages to PDF"""
logger.info("Adding blank pages")
- input_pdf = PyPDF2.PdfFileReader(filepath)
- output_pdf = PyPDF2.PdfFileWriter()
- for page in input_pdf.pages:
- output_pdf.addPage(page)
- output_pdf.addBlankPage()
+ pdf = Pdf.open(filepath)
+
+ previous_pages = pdf.pages
+ pdf.pages = []
+
+ for page in previous_pages:
+ pdf.pages.append(page)
+ pdf.add_blank_page()
output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
- with open(output_file, "wb") as fp:
- output_pdf.write(fp)
+ pdf.save(output_file)
return output_file
@@ -82,46 +85,3 @@ def shrink_pdf(filepath, gs_path="gs"):
logger.info("Shrinking has no effect for this file, using original.")
return filepath
return output_file
-
-
-def copy_toc(toc, filepath):
- logger.info("Copying table of content ...")
- reader = PyPDF2.PdfFileReader(filepath)
- output_pdf = PyPDF2.PdfFileWriter()
- output_pdf.cloneDocumentFromReader(reader)
-
- # It holds the corresponding bookmark for the last level seen, which will be retrieved to
- # specify the parent when we add the bookmark, to generate nested bookmarks.
- # It assumes the table of content is well constructed and doesn't jump from a level 1 to a
- # level 3 title without going through a level 2 at first. If it does, the parent bookmark
- # associated to the level 3 could be wrong if we saw a level 2 previously (but not the right
- # now obviously).
- level_last_bookmarks = {}
-
- for level, page, title in toc:
- parent = None
- if level > 0:
- parent = level_last_bookmarks.get(level - 1)
-
- bookmark = output_pdf.addBookmark(title, page, parent=parent, fit="/Fit")
- level_last_bookmarks[level] = bookmark
-
- output_file = os.path.splitext(filepath)[0] + "-with-toc.pdf"
- with open(output_file, "wb") as f:
- output_pdf.write(f)
-
- return output_file
-
-
-def get_toc(filepath):
- input_pdf = PyPDF2.PdfFileReader(filepath)
- return list(yield_outlines(input_pdf, input_pdf.getOutlines()))
-
-
-def yield_outlines(reader, outlines, level=0):
- if isinstance(outlines, list):
- for item in outlines:
- yield from yield_outlines(reader, item, level=level + 1)
- else:
- page_number = reader.getDestinationPageNumber(outlines)
- yield level, page_number, outlines["/Title"]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index cbdae25..369d566 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -17,7 +17,7 @@ import time
from ..exceptions import _CalledProcessError
from ..log import Logger
-from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf, get_toc, copy_toc
+from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf
from ..utils import (
assert_file_is_pdf,
check_pdftool,
@@ -84,7 +84,6 @@ class Provider(metaclass=abc.ABCMeta):
elif crop == "left":
self.operations.append(("crop", self.crop_pdf))
- self.blank = blank
if blank:
self.operations.append(("blank", blank_pdf))
@@ -217,16 +216,10 @@ class Provider(metaclass=abc.ABCMeta):
assert_file_is_pdf(tmp_filename)
- toc = get_toc(tmp_filename)
-
intermediate_fname = tmp_filename
for opname, op in self.operations:
intermediate_fname = op(intermediate_fname)
- # TODO: handle ToC with blank pages.
- if not self.blank:
- copy_toc(toc, intermediate_fname)
-
shutil.copy(intermediate_fname, clean_filename)
if self.debug:
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 2432916..0003103 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -8,7 +8,6 @@ Copyright: 2019, G.J.J. van den Burg
"""
-import PyPDF2
import regex
import requests
import string
@@ -16,6 +15,8 @@ import subprocess
import time
import unidecode
+from pikepdf import Pdf, PdfError
+
from .log import Logger
from .exceptions import FileTypeError, RemarkableError, NoPDFToolError
@@ -45,15 +46,14 @@ def clean_string(s):
def assert_file_is_pdf(filename):
"""Assert that a given file is a PDF file.
- This is done by trying to open it using PyPDF2.
+ This is done by trying to open it using pikepdf.
"""
try:
- fp = open(filename, "rb")
- pdf = PyPDF2.PdfFileReader(fp, strict=False)
- fp.close()
+ pdf = Pdf.open(filename)
+ pdf.close()
del pdf
return True
- except PyPDF2.utils.PdfReadError:
+ except PdfError:
raise FileTypeError(filename, "pdf")
diff --git a/setup.py b/setup.py
index e529cc2..fb7d21c 100644
--- a/setup.py
+++ b/setup.py
@@ -13,13 +13,13 @@ EMAIL = "gertjanvandenburg@gmail.com"
LICENSE = "MIT"
LICENSE_TROVE = "License :: OSI Approved :: MIT License"
NAME = "paper2remarkable"
-REQUIRES_PYTHON = ">=3.5.0"
+REQUIRES_PYTHON = ">=3.6.0"
URL = "https://github.com/GjjvdBurg/paper2remarkable"
VERSION = None
# What packages are required for this module to be executed?
REQUIRED = [
- "PyPDF2>=1.26",
+ "pikepdf>=2.8.0",
"beautifulsoup4>=4.8",
"html2text>=2020.1.16",
"markdown>=3.1.1",