Merge branch 'feature/speedup'

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-02-22 14:15:21 +0000
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-02-22 14:15:21 +0000
commit: d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9 (patch)
tree: e510ca170bfd7fdd5ac6d9f1f8bf1d0d17451e03
parent: Merge branch 'feature/provider_jmlr' (diff)
parent: Merge branch 'master' into feature/speedup (diff)
download: paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.tar.gz
paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.zip
7 files changed, 141 insertions, 49 deletions
diff --git a/.travis.yml b/.travis.yml
index 5551597..f412f9b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,10 +6,9 @@ python:
 
 before_install:
   - sudo apt-get update
-  - sudo apt-get install ghostscript pdftk texlive-extra-utils poppler-utils
+  - sudo apt-get install ghostscript pdftk poppler-utils
 
 install:
-  - pip install six
   - pip install -e .[dev]
 
 script:
diff --git a/Dockerfile b/Dockerfile
index 38db46b..86743a2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,7 @@ RUN apt-get update \
         libmagickwand-dev \
         pdftk \
         ghostscript \
-        texlive-extra-utils  # contains pdfcrop
+	poppler-utils
 
 RUN pip install --no-cache-dir paper2remarkable
 
diff --git a/README.md b/README.md
index 446682e..dfaae7d 100644
--- a/README.md
+++ b/README.md
@@ -116,10 +116,11 @@ $ p2r -v https://arxiv.org/abs/1811.11242
 The script requires the following external programs to be available:
 
 - [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
-- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a 
-  LaTeX installation.
 - [GhostScript](https://www.ghostscript.com/)
 - [rMAPI](https://github.com/juruen/rmapi)
+- [pdftoppm](https://linux.die.net/man/1/pdftoppm): optional, but recommended
+  for speed. Usually part of a [Poppler](https://poppler.freedesktop.org/)
+  installation.
 
 If these scripts are not available on the ``PATH`` variable, you can supply 
 them with the relevant options to the script. Then, you can install 
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
index 5f3b4e3..2b6e086 100644
--- a/paper2remarkable/crop.py
+++ b/paper2remarkable/crop.py
@@ -9,9 +9,12 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 import PyPDF2
+import io
 import os
-import subprocess
 import pdfplumber
+import subprocess
+
+from PyPDF2.generic import RectangleObject
 
 from .log import Logger
 
@@ -21,17 +24,41 @@ RM_HEIGHT = 1872
 logger = Logger()
 
 
+def find_offset_byte_line(line):
+    """Find index of first nonzero bit in a line of bytes
+
+    The given line is a string of bytes, each representing 8 pixels. Every
+    leading zero byte adds 8 to the offset; the first nonzero byte adds its
+    bit length and ends the scan. Used when finding the cropbox with pdftoppm.
+    """
+    off = 0
+    for c in line:
+        if c == 0:
+            off += 8
+        else:
+            k = 0  # counts the shifts needed to clear c, i.e. its bit length
+            while c > 0:
+                k += 1
+                c >>= 1
+            off += k
+            break
+    return off  # 8 * (number of bytes) when every byte is zero
+
+
 class Cropper(object):
     def __init__(
-        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+        self,
+        input_file=None,
+        output_file=None,
+        pdftoppm_path="pdftoppm",
     ):
         if not input_file is None:
             self.input_file = os.path.abspath(input_file)
             self.reader = PyPDF2.PdfFileReader(self.input_file)
         if not output_file is None:
             self.output_file = os.path.abspath(output_file)
-        self.pdfcrop_path = pdfcrop_path
 
+        self.pdftoppm_path = pdftoppm_path
         self.writer = PyPDF2.PdfFileWriter()
 
     def crop(self, margins=1):
@@ -75,38 +102,27 @@ class Cropper(object):
     def process_page(self, page_idx, bbox_func, *args, **kwargs):
         """Process a single page and add it to the writer """
         tmpfname = self.export_page(page_idx)
-        tmpfout = "./output.pdf"
         bbox = bbox_func(tmpfname, *args, **kwargs)
-        status = subprocess.call(
-            [
-                self.pdfcrop_path,
-                "--bbox",
-                " ".join(map(str, bbox)),
-                tmpfname,
-                tmpfout,
-            ],
-            stdout=subprocess.DEVNULL,
-        )
-        if not status == 0:
-            return status
-        reader = PyPDF2.PdfFileReader(tmpfout)
-        page = reader.getPage(0)
-        self.writer.addPage(page)
+        thepage = self.reader.getPage(page_idx)
+        thepage.cropBox = RectangleObject(bbox)
+        self.writer.addPage(thepage)
         os.unlink(tmpfname)
-        os.unlink(tmpfout)
         return 0
 
-    def get_bbox(self, filename, margins=1, resolution=72):
-        """Get the bounding box, with optional margins
-
-        if margins is integer, used for all margins, else
-        margins = [left, top, right, bottom]
-
-        We get the bounding box by finding the smallest rectangle that is 
-        completely surrounded by white pixels.
-        """
-        if isinstance(margins, int):
-            margins = [margins for _ in range(4)]
+    def get_raw_bbox(self, filename, resolution=72):
+        """Get the basic bounding box of a pdf file"""
+        # We try to use pdftoppm, but if it's not available (OSError when the
+        # executable can't be started) or fails, we default to pdfplumber.
+        try:
+            bbox = self.get_raw_bbox_pdftoppm(filename, resolution=resolution)
+        except (subprocess.CalledProcessError, OSError):
+            bbox = self.get_raw_bbox_pdfplumber(
+                filename, resolution=resolution
+            )
+        return bbox
+
+    def get_raw_bbox_pdfplumber(self, filename, resolution=72):
+        """Get the basic bounding box with pdfplumber"""
         pdf = pdfplumber.open(filename)
         im = pdf.pages[0].to_image(resolution=resolution)
         pdf.close()
@@ -131,6 +147,74 @@ class Cropper(object):
         while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
             right += 1
 
+        return left, right, top, bottom, W, H
+
+    def get_raw_bbox_pdftoppm(self, filename, resolution=72):
+        """Get the basic bounding box using pdftoppm
+
+        Runs pdftoppm in -mono mode and scans the resulting P4 bitmap for the
+        rows and columns that contain content. Returns a (left, right, top,
+        bottom, width, height) tuple, the layout get_raw_bbox_pdfplumber uses,
+        and raises ValueError if the output does not start with the P4 magic.
+        """
+        cmd = [
+            self.pdftoppm_path,
+            "-r",
+            str(resolution),
+            "-singlefile",
+            "-mono",
+            filename,
+        ]
+        im = io.BytesIO(subprocess.check_output(cmd))
+
+        id_ = im.readline().rstrip(b"\n")
+        if not id_ == b"P4":
+            raise ValueError("Not in P4 format")
+        wh = im.readline().rstrip(b"\n").split(b" ")
+        width, height = int(wh[0]), int(wh[1])
+        imdata = im.read()
+
+        # P4 rows are padded to whole bytes, so the row stride is width / 8
+        # rounded up. (width + width % 8) // 8 would be one byte short when
+        # width % 8 is 1, 2 or 3 (e.g. 595 pixels), misaligning every row.
+        pad = width % 8
+        padwidth = width + pad
+        stepsize = (width + 7) // 8
+
+        for top in range(height):
+            if sum(imdata[top * stepsize : (top + 1) * stepsize]) > 0:
+                break
+        for bottom in reversed(range(height)):
+            if sum(imdata[bottom * stepsize : (bottom + 1) * stepsize]) > 0:
+                break
+
+        left, right = width, 0
+        for i in range(top, bottom + 1):  # bottom holds content: scan it too
+            lline = imdata[i * stepsize : (i + 1) * stepsize]
+            left = min(left, find_offset_byte_line(lline))
+            r = padwidth + pad - find_offset_byte_line(reversed(lline))
+            right = max(right, r)
+
+        # The fixed pixel offsets below are heuristic and deliberately kept.
+        right = width - right + 2
+        return left + 1, right, top + 1, height - bottom - 2, width, height
+
+    def get_bbox(self, filename, margins=1, resolution=72):
+        """Get the bounding box, with optional margins
+
+        if margins is integer, used for all margins, else
+        margins = [left, top, right, bottom]
+
+        We get the bounding box by finding the smallest rectangle that is 
+        completely surrounded by white pixels.
+        """
+        if isinstance(margins, int):
+            margins = [margins for _ in range(4)]
+
+        left, right, top, bottom, W, H = self.get_raw_bbox(
+            filename, resolution=resolution
+        )
+
         left -= margins[0]
         top -= margins[1]
         right -= margins[2]
@@ -141,7 +225,7 @@ class Cropper(object):
 
         # The remarkable changes the orientation of a portrait page if the
         # width is greater than the height. To prevent this, we pad the height
-        # with extra whitespace. This should only occur if the original 
+        # with extra whitespace. This should only occur if the original
         # orientation of the page would be changed by cropping.
         w, h = x1 - x0, y1 - y0
         if H > W and w > h:
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index c660452..4c695c6 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -19,13 +19,17 @@ from .log import Logger
 logger = Logger()
 
 
-def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
+def crop_pdf(filepath, pdftoppm_path="pdftoppm"):
     """Crop the pdf file using Cropper
     """
     logger.info("Cropping pdf file")
     cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
 
-    cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path)
+    cropper = Cropper(
+        filepath,
+        cropped_file,
+        pdftoppm_path=pdftoppm_path,
+    )
     status = cropper.crop(margins=15)
 
     if not status == 0:
@@ -39,13 +43,17 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
     return cropped_file
 
 
-def center_pdf(filepath, pdfcrop_path="pdfcrop"):
+def center_pdf(filepath, pdftoppm_path="pdftoppm"):
     """Center the pdf file on the reMarkable
     """
     logger.info("Centering pdf file")
     centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
 
-    cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path)
+    cropper = Cropper(
+        filepath,
+        centered_file,
+        pdftoppm_path=pdftoppm_path,
+    )
     status = cropper.center()
 
     if not status == 0:
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 221d0ba..bf8cdf5 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -39,7 +39,7 @@ class Provider(metaclass=abc.ABCMeta):
         blank=False,
         remarkable_dir="/",
         rmapi_path="rmapi",
-        pdfcrop_path="pdfcrop",
+        pdftoppm_path="pdftoppm",
         pdftk_path="pdftk",
         gs_path="gs",
         cookiejar=None,
@@ -48,7 +48,7 @@ class Provider(metaclass=abc.ABCMeta):
         self.debug = debug
         self.remarkable_dir = remarkable_dir
         self.rmapi_path = rmapi_path
-        self.pdfcrop_path = pdfcrop_path
+        self.pdftoppm_path = pdftoppm_path
         self.pdftk_path = pdftk_path
         self.gs_path = gs_path
         self.informer = Informer()
@@ -83,10 +83,10 @@ class Provider(metaclass=abc.ABCMeta):
 
     # Wrappers for pdf operations that have additional arguments
     def crop_pdf(self, filepath):
-        return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+        return crop_pdf(filepath, pdftoppm_path=self.pdftoppm_path)
 
     def center_pdf(self, filepath):
-        return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+        return center_pdf(filepath, pdftoppm_path=self.pdftoppm_path)
 
     def shrink_pdf(self, filepath):
         return shrink_pdf(filepath, gs_path=self.gs_path)
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 69af4e6..2303603 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -71,9 +71,9 @@ def parse_args():
         "--gs", help="path to gs executable (default: gs)", default="gs"
     )
     parser.add_argument(
-        "--pdfcrop",
-        help="path to pdfcrop executable (default: pdfcrop)",
-        default="pdfcrop",
+        "--pdftoppm",
+        help="path to pdftoppm executable (default: pdftoppm)",
+        default="pdftoppm",
     )
     parser.add_argument(
         "--pdftk",
@@ -133,7 +133,7 @@ def main():
         blank=args.blank,
         remarkable_dir=args.remarkable_dir,
         rmapi_path=args.rmapi,
-        pdfcrop_path=args.pdfcrop,
+        pdftoppm_path=args.pdftoppm,
         pdftk_path=args.pdftk,
         gs_path=args.gs,
         cookiejar=cookiejar,
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-02-22 14:15:21 +0000
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-02-22 14:15:21 +0000
commit	d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9 (patch)
tree	e510ca170bfd7fdd5ac6d9f1f8bf1d0d17451e03
parent	Merge branch 'feature/provider_jmlr' (diff)
parent	Merge branch 'master' into feature/speedup (diff)
download	paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.tar.gz paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.zip