diff options
Diffstat (limited to 'arxiv2remarkable.py')
| -rwxr-xr-x | arxiv2remarkable.py | 204 |
1 files changed, 169 insertions, 35 deletions
class Cropper:
    """Crop or center every page of a PDF by driving the ``pdfcrop`` tool.

    Each page of the input is exported to its own single-page PDF, rendered
    to an image to measure the white border around its content, cropped with
    ``pdfcrop --bbox``, and appended to an in-memory writer that is written
    to ``output_file`` once all pages succeeded.

    Parameters
    ----------
    input_file : str or None
        Path of the PDF to read.  When ``None`` no reader is created.
    output_file : str or None
        Path the processed PDF is written to.
    pdfcrop_path : str
        Name or path of the ``pdfcrop`` executable.
    """

    def __init__(
        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
    ):
        # Always define the attributes so that a partially configured
        # instance has a predictable shape.
        self.input_file = None
        self.reader = None
        self.output_file = None
        if input_file is not None:
            self.input_file = os.path.abspath(input_file)
            self.reader = PyPDF2.PdfFileReader(self.input_file)
        if output_file is not None:
            self.output_file = os.path.abspath(output_file)
        self.pdfcrop_path = pdfcrop_path

        self.writer = PyPDF2.PdfFileWriter()

    def crop(self, margins=1):
        """Crop all pages to their content plus ``margins``.

        Returns 0 on success, otherwise the exit status of ``pdfcrop``.
        """
        return self.process_file(self.crop_page, margins=margins)

    def center(self, padding=15):
        """Re-box all pages so their content is centered on the reMarkable.

        Returns 0 on success, otherwise the exit status of ``pdfcrop``.
        """
        return self.process_file(self.center_page, padding=padding)

    def process_file(self, page_func, *args, **kwargs):
        """Apply ``page_func`` to every page, then write ``output_file``.

        Processing stops at the first page for which ``page_func`` returns a
        non-zero status; in that case nothing is written and that status is
        returned.  Returns 0 when all pages were processed.
        """
        for page_idx in range(self.reader.getNumPages()):
            status = page_func(page_idx, *args, **kwargs)
            if status != 0:
                return status
        with open(self.output_file, "wb") as fp:
            self.writer.write(fp)
        return 0

    def center_page(self, page_idx, padding):
        """Center the page with index ``page_idx``; returns a status code."""
        return self.process_page(
            page_idx, self.get_center_bbox, padding=padding
        )

    def crop_page(self, page_idx, margins):
        """Crop the page with index ``page_idx``; returns a status code."""
        return self.process_page(page_idx, self.get_bbox, margins=margins)

    def export_page(self, page_idx, tmpfname="./page.pdf"):
        """Write the page with index ``page_idx`` to a single-page PDF.

        ``tmpfname`` is the destination path; the default keeps the historic
        location in the current directory.  Returns the path written.
        """
        page = self.reader.getPage(page_idx)
        writer = PyPDF2.PdfFileWriter()
        writer.addPage(page)
        with open(tmpfname, "wb") as fp:
            writer.write(fp)
        return tmpfname

    def process_page(self, page_idx, bbox_func, *args, **kwargs):
        """Crop a single page to ``bbox_func``'s box and add it to the writer.

        ``bbox_func`` is called as ``bbox_func(page_pdf, *args, **kwargs)``
        and must return ``[left, bottom, right, top]``.  Returns 0 on
        success, otherwise the non-zero exit status of ``pdfcrop``.
        """
        import tempfile

        # A private scratch directory avoids name clashes with other files
        # or concurrent runs, and guarantees the intermediate PDFs are
        # removed even when pdfcrop fails or an exception is raised.
        with tempfile.TemporaryDirectory() as workdir:
            tmpfname = self.export_page(
                page_idx, os.path.join(workdir, "page.pdf")
            )
            tmpfout = os.path.join(workdir, "output.pdf")
            bbox = bbox_func(tmpfname, *args, **kwargs)
            status = subprocess.call(
                [
                    self.pdfcrop_path,
                    "--bbox",
                    " ".join(map(str, bbox)),
                    tmpfname,
                    tmpfout,
                ],
                stdout=subprocess.DEVNULL,
            )
            if status != 0:
                return status
            # NOTE(review): as in the original code, this relies on the
            # reader not needing the file on disk after construction,
            # because the scratch directory is removed on leaving the block.
            reader = PyPDF2.PdfFileReader(tmpfout)
            self.writer.addPage(reader.getPage(0))
        return 0

    @staticmethod
    def _expand_margins(margins):
        """Return ``margins`` as a ``[left, top, right, bottom]`` list.

        A single number (int or float) is used for all four sides.
        """
        if isinstance(margins, (int, float)):
            return [margins for _ in range(4)]
        return list(margins)

    @staticmethod
    def _content_insets(rows, width, height):
        """Measure the all-white border around the page content.

        ``rows`` holds ``height`` sequences of ``width`` integers, each the
        sum of a pixel's three 8-bit channels (so pure white is 3 * 255).
        Returns ``(left, top, right, bottom)``: the number of entirely white
        columns/rows at each edge.  A page without any non-white pixel yields
        ``(0, 0, 0, 0)`` so that the whole page is kept; scanning such a page
        from every side would otherwise produce an inverted bounding box.
        """
        white_row = width * 255 * 3
        white_col = height * 255 * 3

        top = 0
        while top < height and sum(rows[top]) == white_row:
            top += 1
        if top == height:
            # Blank page (or empty image): nothing to crop to.
            return (0, 0, 0, 0)

        bottom = 0
        while bottom < height and sum(rows[height - 1 - bottom]) == white_row:
            bottom += 1

        # Transpose so that each entry is one pixel column.
        cols = list(zip(*rows))
        left = 0
        while left < width and sum(cols[left]) == white_col:
            left += 1
        right = 0
        while right < width and sum(cols[width - 1 - right]) == white_col:
            right += 1

        return (left, top, right, bottom)

    @staticmethod
    def _bbox_from_insets(insets, width, height, margins):
        """Turn border insets and margins into ``[left, bottom, right, top]``.

        ``insets`` and ``margins`` are both ``(left, top, right, bottom)``.
        A margin larger than the inset moves that edge outside the page,
        which is how extra padding is added.
        """
        left, top, right, bottom = (
            inset - margin for inset, margin in zip(insets, margins)
        )

        # This is the bounding box in PIL format: (0, 0) top left
        x0, y0, x1, y1 = left, top, width - right, height - bottom

        # Get the bbox in Ghostscript format: (0, 0) bottom left
        return [x0, height - y1, x1, height - y0]

    def _page_insets(self, filename, resolution=72):
        """Render the first page of ``filename`` and measure its border.

        Returns ``(insets, width, height)`` in pixels at ``resolution``.
        """
        pdf = pdfplumber.open(filename)
        try:
            im = pdf.pages[0].to_image(resolution=resolution)
        finally:
            # Close the document even if rendering raises.
            pdf.close()

        # The white test below assumes three 8-bit channels per pixel.
        image = im.original.convert("RGB")
        width, height = image.size
        pixels = list(image.getdata())

        # One list per image row, holding the channel sum of every pixel.
        rows = [
            [sum(px) for px in pixels[i * width : (i + 1) * width]]
            for i in range(height)
        ]
        return self._content_insets(rows, width, height), width, height

    def get_bbox(self, filename, margins=1, resolution=72):
        """Get the bounding box, with optional margins

        if margins is a single number, it is used for all margins, else
        margins = [left, top, right, bottom]

        We get the bounding box by finding the smallest rectangle that is
        completely surrounded by white pixels.  The result is
        ``[left, bottom, right, top]`` with the origin at the bottom left,
        measured in pixels of the image rendered at ``resolution``.

        NOTE(review): presumably pdfcrop expects PDF points, which coincide
        with pixels only at the default resolution of 72 -- confirm before
        passing another resolution.
        """
        insets, width, height = self._page_insets(filename, resolution)
        return self._bbox_from_insets(
            insets, width, height, self._expand_margins(margins)
        )

    def get_center_bbox(self, filename, padding=15):
        """Compute a bounding box that will center the page file on the
        reMarkable
        """
        # Render once and reuse the measurement for both computations.
        insets, width, height = self._page_insets(filename)
        bbox = self._bbox_from_insets(insets, width, height, [0, 0, 0, 0])

        h = bbox[3] - bbox[1]
        w = bbox[2] - bbox[0]

        # we want some minimal padding all around, because it is visually more
        # pleasing.
        h_prime = h + 2 * padding
        w_prime = w + 2 * padding

        # if the document is wider than the remarkable, we add top-padding to
        # center it, otherwise we add left-padding
        x, y = 0, 0
        if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
            y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
        else:
            x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2

        margins = [padding + x, padding + y, padding, padding]
        return self._bbox_from_insets(insets, width, height, margins)
