aboutsummaryrefslogtreecommitdiff
path: root/arxiv2remarkable.py
diff options
context:
space:
mode:
Diffstat (limited to 'arxiv2remarkable.py')
-rwxr-xr-xarxiv2remarkable.py204
1 files changed, 169 insertions, 35 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index f237d5a..e009298 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -19,6 +19,7 @@ import argparse
import bs4
import datetime
import os
+import pdfplumber
import re
import requests
import shutil
@@ -118,37 +119,6 @@ class Provider(metaclass=abc.ABCMeta):
self.log("Created filename: %s" % name)
return name
- def center_pdf(self, filepath):
- if not self.center:
- return filepath
- pdf_file = PyPDF2.PdfFileReader(filepath)
- mediaBox = pdf_file.getPage(0).mediaBox
- width = mediaBox[2] - mediaBox[0]
- height = mediaBox[3] - mediaBox[1]
- padding = (height * RM_WIDTH - width * RM_HEIGHT) / RM_HEIGHT
- left_margin = padding / 2 + 15
-
- self.log("Centering PDF file")
- status = subprocess.call(
- [
- self.pdfcrop_path,
- "--margins",
- "%i 40 15 15" % left_margin,
- filepath,
- ],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- self.warn("Failed to crop the pdf file at: %s" % filepath)
- return filepath
- centered_file = os.path.splitext(filepath)[0] + "-crop.pdf"
- if not os.path.exists(centered_file):
- self.warn(
- "Can't find centered file '%s' where expected." % centered_file
- )
- return filepath
- return centered_file
-
def blank_pdf(self, filepath):
if not self.blank:
return filepath
@@ -167,14 +137,15 @@ class Provider(metaclass=abc.ABCMeta):
def crop_pdf(self, filepath):
self.log("Cropping pdf file")
- status = subprocess.call(
- [self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
- stdout=subprocess.DEVNULL,
+ cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+ cropper = Cropper(
+ filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
)
+ status = cropper.crop(margins=15)
+
if not status == 0:
self.warn("Failed to crop the pdf file at: %s" % filepath)
return filepath
- cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
if not os.path.exists(cropped_file):
self.warn(
"Can't find cropped file '%s' where expected." % cropped_file
@@ -182,6 +153,26 @@ class Provider(metaclass=abc.ABCMeta):
return filepath
return cropped_file
+ def center_pdf(self, filepath):
+ if not self.center:
+ return filepath
+
+ self.log("Centering PDF file")
+ centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
+ cropper = Cropper(
+ filepath, centered_file, pdfcrop_path=self.pdfcrop_path
+ )
+ status = cropper.center()
+ if not status == 0:
+ self.warn("Failed to center the pdf file at: %s" % filepath)
+ return filepath
+ if not os.path.exists(centered_file):
+ self.warn(
+ "Can't find centered file '%s' where expected." % centered_file
+ )
+ return filepath
+ return centered_file
+
def shrink_pdf(self, filepath):
self.log("Shrinking pdf file")
output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
@@ -602,6 +593,149 @@ class PdfUrlProvider(Provider):
return filename
+class Cropper(object):
+ def __init__(
+ self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+ ):
+ if not input_file is None:
+ self.input_file = os.path.abspath(input_file)
+ self.reader = PyPDF2.PdfFileReader(self.input_file)
+ if not output_file is None:
+ self.output_file = os.path.abspath(output_file)
+ self.pdfcrop_path = pdfcrop_path
+
+ self.writer = PyPDF2.PdfFileWriter()
+
+ def crop(self, margins=1):
+ return self.process_file(self.crop_page, margins=margins)
+
+ def center(self, padding=15):
+ return self.process_file(self.center_page, padding=padding)
+
+ def process_file(self, page_func, *args, **kwargs):
+ for page_idx in range(self.reader.getNumPages()):
+ status = page_func(page_idx, *args, **kwargs)
+ if not status == 0:
+ return status
+ with open(self.output_file, "wb") as fp:
+ self.writer.write(fp)
+ return 0
+
+ def center_page(self, page_idx, padding):
+ return self.process_page(
+ page_idx, self.get_center_bbox, padding=padding
+ )
+
+ def crop_page(self, page_idx, margins):
+ return self.process_page(page_idx, self.get_bbox, margins=margins)
+
+ def export_page(self, page_idx):
+ """Helper function that exports a single page given by index """
+ page = self.reader.getPage(page_idx)
+ writer = PyPDF2.PdfFileWriter()
+ writer.addPage(page)
+ tmpfname = "./page.pdf"
+ with open(tmpfname, "wb") as fp:
+ writer.write(fp)
+ return tmpfname
+
+ def process_page(self, page_idx, bbox_func, *args, **kwargs):
+ """Process a single page and add it to the writer """
+ tmpfname = self.export_page(page_idx)
+ tmpfout = "./output.pdf"
+ bbox = bbox_func(tmpfname, *args, **kwargs)
+ status = subprocess.call(
+ [
+ self.pdfcrop_path,
+ "--bbox",
+ " ".join(map(str, bbox)),
+ tmpfname,
+ tmpfout,
+ ],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ return status
+ reader = PyPDF2.PdfFileReader(tmpfout)
+ page = reader.getPage(0)
+ self.writer.addPage(page)
+ os.unlink(tmpfname)
+ os.unlink(tmpfout)
+ return 0
+
+ def get_bbox(self, filename, margins=1, resolution=72):
+ """Get the bounding box, with optional margins
+
+ if margins is integer, used for all margins, else
+ margins = [left, top, right, bottom]
+
+ We get the bounding box by finding the smallest rectangle that is
+ completely surrounded by white pixels.
+ """
+ if isinstance(margins, int):
+ margins = [margins for _ in range(4)]
+ pdf = pdfplumber.open(filename)
+ im = pdf.pages[0].to_image(resolution=resolution)
+ pdf.close()
+
+ pixels = list(im.original.getdata())
+ W, H = im.original.size
+
+ # M is a list of H lists with each W integers that equal the sum of the
+ # pixel values
+ M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
+
+ left, top, bottom, right = 0, 0, 0, 0
+ while top < H and sum(M[top]) == W * 255 * 3:
+ top += 1
+ while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3:
+ bottom += 1
+
+ # Transpose M
+ M = list(zip(*M))
+ while left < W and sum(M[left]) == H * 255 * 3:
+ left += 1
+ while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
+ right += 1
+
+ left -= margins[0]
+ top -= margins[1]
+ right -= margins[2]
+ bottom -= margins[3]
+
+ # This is the bounding box in PIL format: (0, 0) top left
+ x0, y0, x1, y1 = left, top, W - right, H - bottom
+
+ # Get the bbox in Ghostscript format: (0, 0) bottom left
+ a0, b0, a1, b1 = x0, H - y1, x1, H - y0
+ return [a0, b0, a1, b1]
+
+ def get_center_bbox(self, filename, padding=15):
+ """Compute a bounding box that will center the page file on the
+ reMarkable
+ """
+ bbox = self.get_bbox(filename, margins=0)
+
+ h = bbox[3] - bbox[1]
+ w = bbox[2] - bbox[0]
+
+ # we want some minimal padding all around, because it is visually more
+ # pleasing.
+ h_prime = h + 2 * padding
+ w_prime = w + 2 * padding
+
+ # if the document is wider than the remarkable, we add top-padding to
+ # center it, otherwise we add left-padding
+ x, y = 0, 0
+ if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
+ y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
+ else:
+ x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
+
+ margins = [padding + x, padding + y, padding, padding]
+ return self.get_bbox(filename, margins=margins)
+
+
def exception(msg):
print("ERROR: " + msg, file=sys.stderr)
print("Error occurred. Exiting.", file=sys.stderr)