about summary refs log tree commit diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-02-22 14:15:21 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-02-22 14:15:21 +0000
commit d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9 (patch)
tree e510ca170bfd7fdd5ac6d9f1f8bf1d0d17451e03
parent Merge branch 'feature/provider_jmlr' (diff)
parent Merge branch 'master' into feature/speedup (diff)
download paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.tar.gz
paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.zip
Merge branch 'feature/speedup'
-rw-r--r-- .travis.yml 3
-rw-r--r-- Dockerfile 2
-rw-r--r-- README.md 5
-rw-r--r-- paper2remarkable/crop.py 148
-rw-r--r-- paper2remarkable/pdf_ops.py 16
-rw-r--r-- paper2remarkable/providers/_base.py 8
-rw-r--r-- paper2remarkable/ui.py 8
7 files changed, 141 insertions, 49 deletions
diff --git a/.travis.yml b/.travis.yml
index 5551597..f412f9b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,10 +6,9 @@ python:
before_install:
- sudo apt-get update
- - sudo apt-get install ghostscript pdftk texlive-extra-utils poppler-utils
+ - sudo apt-get install ghostscript pdftk poppler-utils
install:
- - pip install six
- pip install -e .[dev]
script:
diff --git a/Dockerfile b/Dockerfile
index 38db46b..86743a2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,7 @@ RUN apt-get update \
libmagickwand-dev \
pdftk \
ghostscript \
- texlive-extra-utils # contains pdfcrop
+ poppler-utils
RUN pip install --no-cache-dir paper2remarkable
diff --git a/README.md b/README.md
index 446682e..dfaae7d 100644
--- a/README.md
+++ b/README.md
@@ -116,10 +116,11 @@ $ p2r -v https://arxiv.org/abs/1811.11242
The script requires the following external programs to be available:
- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
-- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a
- LaTeX installation.
- [GhostScript](https://www.ghostscript.com/)
- [rMAPI](https://github.com/juruen/rmapi)
+- [pdftoppm](https://linux.die.net/man/1/pdftoppm): optional, but recommended
+ for speed. Usually part of a [Poppler](https://poppler.freedesktop.org/)
+ installation.
If these scripts are not available on the ``PATH`` variable, you can supply
them with the relevant options to the script. Then, you can install
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
index 5f3b4e3..2b6e086 100644
--- a/paper2remarkable/crop.py
+++ b/paper2remarkable/crop.py
@@ -9,9 +9,12 @@ Copyright: 2019, G.J.J. van den Burg
"""
import PyPDF2
+import io
import os
-import subprocess
import pdfplumber
+import subprocess
+
+from PyPDF2.generic import RectangleObject
from .log import Logger
@@ -21,17 +24,41 @@ RM_HEIGHT = 1872
logger = Logger()
+def find_offset_byte_line(line):
+ """Find index of first nonzero bit in a line of bytes
+
+ The given line is a string of bytes, each representing 8 pixels. This code
+ finds the index of the first bit that is not zero. Used when finding the
+ cropbox with pdftoppm.
+ """
+ off = 0
+ for c in line:
+ if c == 0:
+ off += 8
+ else:
+ k = 0
+ while c > 0:
+ k += 1
+ c >>= 1
+ off += k
+ break
+ return off
+
+
class Cropper(object):
def __init__(
- self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+ self,
+ input_file=None,
+ output_file=None,
+ pdftoppm_path="pdftoppm",
):
if not input_file is None:
self.input_file = os.path.abspath(input_file)
self.reader = PyPDF2.PdfFileReader(self.input_file)
if not output_file is None:
self.output_file = os.path.abspath(output_file)
- self.pdfcrop_path = pdfcrop_path
+ self.pdftoppm_path = pdftoppm_path
self.writer = PyPDF2.PdfFileWriter()
def crop(self, margins=1):
@@ -75,38 +102,27 @@ class Cropper(object):
def process_page(self, page_idx, bbox_func, *args, **kwargs):
"""Process a single page and add it to the writer """
tmpfname = self.export_page(page_idx)
- tmpfout = "./output.pdf"
bbox = bbox_func(tmpfname, *args, **kwargs)
- status = subprocess.call(
- [
- self.pdfcrop_path,
- "--bbox",
- " ".join(map(str, bbox)),
- tmpfname,
- tmpfout,
- ],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- return status
- reader = PyPDF2.PdfFileReader(tmpfout)
- page = reader.getPage(0)
- self.writer.addPage(page)
+ thepage = self.reader.getPage(page_idx)
+ thepage.cropBox = RectangleObject(bbox)
+ self.writer.addPage(thepage)
os.unlink(tmpfname)
- os.unlink(tmpfout)
return 0
- def get_bbox(self, filename, margins=1, resolution=72):
- """Get the bounding box, with optional margins
-
- if margins is integer, used for all margins, else
- margins = [left, top, right, bottom]
-
- We get the bounding box by finding the smallest rectangle that is
- completely surrounded by white pixels.
- """
- if isinstance(margins, int):
- margins = [margins for _ in range(4)]
+ def get_raw_bbox(self, filename, resolution=72):
+ """Get the basic bounding box of a pdf file"""
+ # We try to use pdftoppm, but if it's not available or fails, we
+ # default to pdfplumber.
+ try:
+ bbox = self.get_raw_bbox_pdftoppm(filename, resolution=resolution)
+ except subprocess.CalledProcessError:
+ bbox = self.get_raw_bbox_pdfplumber(
+ filename, resolution=resolution
+ )
+ return bbox
+
+ def get_raw_bbox_pdfplumber(self, filename, resolution=72):
+ """Get the basic bounding box with pdfplumber"""
pdf = pdfplumber.open(filename)
im = pdf.pages[0].to_image(resolution=resolution)
pdf.close()
@@ -131,6 +147,74 @@ class Cropper(object):
while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
right += 1
+ return left, right, top, bottom, W, H
+
+ def get_raw_bbox_pdftoppm(self, filename, resolution=72):
+ """Get the basic bounding box using pdftoppm """
+ cmd = [
+ self.pdftoppm_path,
+ "-r",
+ str(resolution),
+ "-singlefile",
+ "-mono",
+ filename,
+ ]
+
+ im = subprocess.check_output(cmd)
+ im = io.BytesIO(im)
+
+ id_ = im.readline().rstrip(b"\n")
+ if not id_ == b"P4":
+ raise ValueError("Not in P4 format")
+ wh = im.readline().rstrip(b"\n").split(b" ")
+ width, height = int(wh[0]), int(wh[1])
+ imdata = im.read()
+
+ pad = width % 8
+ padwidth = width + pad
+ stepsize = padwidth // 8
+
+ for top in range(height):
+ if sum(imdata[top * stepsize : (top + 1) * stepsize]) > 0:
+ break
+
+ for bottom in reversed(range(height)):
+ if sum(imdata[bottom * stepsize : (bottom + 1) * stepsize]) > 0:
+ break
+
+ left = width
+ right = 0
+ for i in range(top, bottom):
+ lline = imdata[i * stepsize : (i + 1) * stepsize]
+ rline = reversed(imdata[i * stepsize : (i + 1) * stepsize])
+ l = find_offset_byte_line(lline)
+ left = min(left, l)
+ r = padwidth + pad - find_offset_byte_line(rline)
+ right = max(right, r)
+
+ top += 1
+ left += 1
+ right = width - right + 2
+ bottom = height - bottom - 2
+
+ return left, right, top, bottom, width, height
+
+ def get_bbox(self, filename, margins=1, resolution=72):
+ """Get the bounding box, with optional margins
+
+ if margins is integer, used for all margins, else
+ margins = [left, top, right, bottom]
+
+ We get the bounding box by finding the smallest rectangle that is
+ completely surrounded by white pixels.
+ """
+ if isinstance(margins, int):
+ margins = [margins for _ in range(4)]
+
+ left, right, top, bottom, W, H = self.get_raw_bbox(
+ filename, resolution=resolution
+ )
+
left -= margins[0]
top -= margins[1]
right -= margins[2]
@@ -141,7 +225,7 @@ class Cropper(object):
# The remarkable changes the orientation of a portrait page if the
# width is greater than the height. To prevent this, we pad the height
- # with extra whitespace. This should only occur if the original
+ # with extra whitespace. This should only occur if the original
# orientation of the page would be changed by cropping.
w, h = x1 - x0, y1 - y0
if H > W and w > h:
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index c660452..4c695c6 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -19,13 +19,17 @@ from .log import Logger
logger = Logger()
-def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
+def crop_pdf(filepath, pdftoppm_path="pdftoppm"):
"""Crop the pdf file using Cropper
"""
logger.info("Cropping pdf file")
cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
- cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path)
+ cropper = Cropper(
+ filepath,
+ cropped_file,
+ pdftoppm_path=pdftoppm_path,
+ )
status = cropper.crop(margins=15)
if not status == 0:
@@ -39,13 +43,17 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
return cropped_file
-def center_pdf(filepath, pdfcrop_path="pdfcrop"):
+def center_pdf(filepath, pdftoppm_path="pdftoppm"):
"""Center the pdf file on the reMarkable
"""
logger.info("Centering pdf file")
centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
- cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path)
+ cropper = Cropper(
+ filepath,
+ centered_file,
+ pdftoppm_path=pdftoppm_path,
+ )
status = cropper.center()
if not status == 0:
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 221d0ba..bf8cdf5 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -39,7 +39,7 @@ class Provider(metaclass=abc.ABCMeta):
blank=False,
remarkable_dir="/",
rmapi_path="rmapi",
- pdfcrop_path="pdfcrop",
+ pdftoppm_path="pdftoppm",
pdftk_path="pdftk",
gs_path="gs",
cookiejar=None,
@@ -48,7 +48,7 @@ class Provider(metaclass=abc.ABCMeta):
self.debug = debug
self.remarkable_dir = remarkable_dir
self.rmapi_path = rmapi_path
- self.pdfcrop_path = pdfcrop_path
+ self.pdftoppm_path = pdftoppm_path
self.pdftk_path = pdftk_path
self.gs_path = gs_path
self.informer = Informer()
@@ -83,10 +83,10 @@ class Provider(metaclass=abc.ABCMeta):
# Wrappers for pdf operations that have additional arguments
def crop_pdf(self, filepath):
- return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+ return crop_pdf(filepath, pdftoppm_path=self.pdftoppm_path)
def center_pdf(self, filepath):
- return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+ return center_pdf(filepath, pdftoppm_path=self.pdftoppm_path)
def shrink_pdf(self, filepath):
return shrink_pdf(filepath, gs_path=self.gs_path)
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 69af4e6..2303603 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -71,9 +71,9 @@ def parse_args():
"--gs", help="path to gs executable (default: gs)", default="gs"
)
parser.add_argument(
- "--pdfcrop",
- help="path to pdfcrop executable (default: pdfcrop)",
- default="pdfcrop",
+ "--pdftoppm",
+ help="path to pdftoppm executable (default: pdftoppm)",
+ default="pdftoppm",
)
parser.add_argument(
"--pdftk",
@@ -133,7 +133,7 @@ def main():
blank=args.blank,
remarkable_dir=args.remarkable_dir,
rmapi_path=args.rmapi,
- pdfcrop_path=args.pdfcrop,
+ pdftoppm_path=args.pdftoppm,
pdftk_path=args.pdftk,
gs_path=args.gs,
cookiejar=cookiejar,