diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-02-22 14:15:21 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-02-22 14:15:21 +0000 |
| commit | d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9 (patch) | |
| tree | e510ca170bfd7fdd5ac6d9f1f8bf1d0d17451e03 | |
| parent | Merge branch 'feature/provider_jmlr' (diff) | |
| parent | Merge branch 'master' into feature/speedup (diff) | |
| download | paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.tar.gz paper2remarkable-d0a74ccd986fa7a595c1de279f4a11cbeaaa2eb9.zip | |
Merge branch 'feature/speedup'
| -rw-r--r-- | .travis.yml | 3 | ||||
| -rw-r--r-- | Dockerfile | 2 | ||||
| -rw-r--r-- | README.md | 5 | ||||
| -rw-r--r-- | paper2remarkable/crop.py | 148 | ||||
| -rw-r--r-- | paper2remarkable/pdf_ops.py | 16 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 8 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 8 |
7 files changed, 141 insertions, 49 deletions
diff --git a/.travis.yml b/.travis.yml index 5551597..f412f9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,10 +6,9 @@ python: before_install: - sudo apt-get update - - sudo apt-get install ghostscript pdftk texlive-extra-utils poppler-utils + - sudo apt-get install ghostscript pdftk poppler-utils install: - - pip install six - pip install -e .[dev] script: @@ -18,7 +18,7 @@ RUN apt-get update \ libmagickwand-dev \ pdftk \ ghostscript \ - texlive-extra-utils # contains pdfcrop + poppler-utils RUN pip install --no-cache-dir paper2remarkable @@ -116,10 +116,11 @@ $ p2r -v https://arxiv.org/abs/1811.11242 The script requires the following external programs to be available: - [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) -- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a - LaTeX installation. - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) +- [pdftoppm](https://linux.die.net/man/1/pdftoppm) Optional, but recommended + for speed. Usually part of a [Poppler](https://poppler.freedesktop.org/) + installation. If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. Then, you can install diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 5f3b4e3..2b6e086 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -9,9 +9,12 @@ Copyright: 2019, G.J.J. van den Burg """ import PyPDF2 +import io import os -import subprocess import pdfplumber +import subprocess + +from PyPDF2.generic import RectangleObject from .log import Logger @@ -21,17 +24,41 @@ RM_HEIGHT = 1872 logger = Logger() +def find_offset_byte_line(line): + """Find index of first nonzero bit in a line of bytes + + The given line is a string of bytes, each representing 8 pixels. This code + finds the index of the first bit that is not zero. Used when find the + cropbox with pdftoppm. + """ + off = 0 + for c in line: + if c == 0: + off += 8 + else: + k = 0 + while c > 0: + k += 1 + c >>= 1 + off += k + break + return off + + class Cropper(object): def __init__( - self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + self, + input_file=None, + output_file=None, + pdftoppm_path="pdftoppm", ): if not input_file is None: self.input_file = os.path.abspath(input_file) self.reader = PyPDF2.PdfFileReader(self.input_file) if not output_file is None: self.output_file = os.path.abspath(output_file) - self.pdfcrop_path = pdfcrop_path + self.pdftoppm_path = pdftoppm_path self.writer = PyPDF2.PdfFileWriter() def crop(self, margins=1): @@ -75,38 +102,27 @@ class Cropper(object): def process_page(self, page_idx, bbox_func, *args, **kwargs): """Process a single page and add it to the writer """ tmpfname = self.export_page(page_idx) - tmpfout = "./output.pdf" bbox = bbox_func(tmpfname, *args, **kwargs) - status = subprocess.call( - [ - self.pdfcrop_path, - "--bbox", - " ".join(map(str, bbox)), - tmpfname, - tmpfout, - ], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - return status - reader = PyPDF2.PdfFileReader(tmpfout) - page = reader.getPage(0) - self.writer.addPage(page) + thepage = self.reader.getPage(page_idx) + thepage.cropBox = RectangleObject(bbox) + self.writer.addPage(thepage) os.unlink(tmpfname) - os.unlink(tmpfout) return 0 - def get_bbox(self, filename, margins=1, resolution=72): - """Get the bounding box, with optional margins - - if margins is integer, used for all margins, else - margins = [left, top, right, bottom] - - We get the bounding box by finding the smallest rectangle that is - completely surrounded by white pixels. - """ - if isinstance(margins, int): - margins = [margins for _ in range(4)] + def get_raw_bbox(self, filename, resolution=72): + """Get the basic bounding box of a pdf file""" + # We try to use pdftoppm, but if it's not available or fails, we + # default to pdfplumber. + try: + bbox = self.get_raw_bbox_pdftoppm(filename, resolution=resolution) + except subprocess.CalledProcessError: + bbox = self.get_raw_bbox_pdfplumber( + filename, resolution=resolution + ) + return bbox + + def get_raw_bbox_pdfplumber(self, filename, resolution=72): + """Get the basic bounding box with pdfplumber""" pdf = pdfplumber.open(filename) im = pdf.pages[0].to_image(resolution=resolution) pdf.close() @@ -131,6 +147,74 @@ class Cropper(object): while right < W and sum(M[W - 1 - right]) == H * 255 * 3: right += 1 + return left, right, top, bottom, W, H + + def get_raw_bbox_pdftoppm(self, filename, resolution=72): + """Get the basic bounding box using pdftoppm """ + cmd = [ + self.pdftoppm_path, + "-r", + str(resolution), + "-singlefile", + "-mono", + filename, + ] + + im = subprocess.check_output(cmd) + im = io.BytesIO(im) + + id_ = im.readline().rstrip(b"\n") + if not id_ == b"P4": + raise ValueError("Not in P4 format") + wh = im.readline().rstrip(b"\n").split(b" ") + width, height = int(wh[0]), int(wh[1]) + imdata = im.read() + + pad = width % 8 + padwidth = width + pad + stepsize = padwidth // 8 + + for top in range(height): + if sum(imdata[top * stepsize : (top + 1) * stepsize]) > 0: + break + + for bottom in reversed(range(height)): + if sum(imdata[bottom * stepsize : (bottom + 1) * stepsize]) > 0: + break + + left = width + right = 0 + for i in range(top, bottom): + lline = imdata[i * stepsize : (i + 1) * stepsize] + rline = reversed(imdata[i * stepsize : (i + 1) * stepsize]) + l = find_offset_byte_line(lline) + left = min(left, l) + r = padwidth + pad - find_offset_byte_line(rline) + right = max(right, r) + + top += 1 + left += 1 + right = width - right + 2 + bottom = height - bottom - 2 + + return left, right, top, bottom, width, height + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. + """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + + left, right, top, bottom, W, H = self.get_raw_bbox( + filename, resolution=resolution + ) + left -= margins[0] top -= margins[1] right -= margins[2] @@ -141,7 +225,7 @@ class Cropper(object): # The remarkable changes the orientation of a portrait page if the # width is greater than the height. To prevent this, we pad the height - # with extra whitespace. This should only occur if the original + # with extra whitespace. This should only occur if the original # orientation of the page would be changed by cropping. w, h = x1 - x0, y1 - y0 if H > W and w > h: diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index c660452..4c695c6 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -19,13 +19,17 @@ from .log import Logger logger = Logger() -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): +def crop_pdf(filepath, pdftoppm_path="pdftoppm"): """Crop the pdf file using Cropper """ logger.info("Cropping pdf file") cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path) + cropper = Cropper( + filepath, + cropped_file, + pdftoppm_path=pdftoppm_path, + ) status = cropper.crop(margins=15) if not status == 0: @@ -39,13 +43,17 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop"): return cropped_file -def center_pdf(filepath, pdfcrop_path="pdfcrop"): +def center_pdf(filepath, pdftoppm_path="pdftoppm"): """Center the pdf file on the reMarkable """ logger.info("Centering pdf file") centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path) + cropper = Cropper( + filepath, + centered_file, + pdftoppm_path=pdftoppm_path, + ) status = cropper.center() if not status == 0: diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 221d0ba..bf8cdf5 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -39,7 +39,7 @@ class Provider(metaclass=abc.ABCMeta): blank=False, remarkable_dir="/", rmapi_path="rmapi", - pdfcrop_path="pdfcrop", + pdftoppm_path="pdftoppm", pdftk_path="pdftk", gs_path="gs", cookiejar=None, @@ -48,7 +48,7 @@ class Provider(metaclass=abc.ABCMeta): self.debug = debug self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path - self.pdfcrop_path = pdfcrop_path + self.pdftoppm_path = pdftoppm_path self.pdftk_path = pdftk_path self.gs_path = gs_path self.informer = Informer() @@ -83,10 +83,10 @@ class Provider(metaclass=abc.ABCMeta): # Wrappers for pdf operations that have additional arguments def crop_pdf(self, filepath): - return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + return crop_pdf(filepath, pdftoppm_path=self.pdftoppm_path) def center_pdf(self, filepath): - return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + return center_pdf(filepath, pdftoppm_path=self.pdftoppm_path) def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 69af4e6..2303603 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -71,9 +71,9 @@ def parse_args(): "--gs", help="path to gs executable (default: gs)", default="gs" ) parser.add_argument( - "--pdfcrop", - help="path to pdfcrop executable (default: pdfcrop)", - default="pdfcrop", + "--pdftoppm", + help="path to pdftoppm executable (default: pdftoppm)", + default="pdftoppm", ) parser.add_argument( "--pdftk", @@ -133,7 +133,7 @@ def main(): blank=args.blank, remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, - pdfcrop_path=args.pdfcrop, + pdftoppm_path=args.pdftoppm, pdftk_path=args.pdftk, gs_path=args.gs, cookiejar=cookiejar, |
