aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2019-07-05 17:20:12 +0100
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2019-07-05 17:20:12 +0100
commit4cb3af830a9c9e0612cd48feaec44490c345d5a3 (patch)
tree88543f97d2af6f94ba003000881cc0b6fcdf3395
parentGive the temporary directory a prefix (diff)
downloadpaper2remarkable-4cb3af830a9c9e0612cd48feaec44490c345d5a3.tar.gz
paper2remarkable-4cb3af830a9c9e0612cd48feaec44490c345d5a3.zip
Add our own method for determining the bounding box
I found some PDFs where pdfcrop/Ghostscript determined the bounding box incorrectly. With this commit we introduce a simple way to determine the bounding box ourselves: each page is rendered to an image, and rows and columns that consist entirely of white pixels are trimmed from every edge, which yields the smallest rectangle that is completely surrounded by white pixels. This will work well for most PDFs. The centering functionality is also improved, and now correctly centers the PDF on the reMarkable both vertically and horizontally.
-rwxr-xr-xarxiv2remarkable.py204
-rw-r--r--poetry.lock87
-rw-r--r--pyproject.toml1
3 files changed, 256 insertions, 36 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index f237d5a..e009298 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -19,6 +19,7 @@ import argparse
import bs4
import datetime
import os
+import pdfplumber
import re
import requests
import shutil
@@ -118,37 +119,6 @@ class Provider(metaclass=abc.ABCMeta):
self.log("Created filename: %s" % name)
return name
- def center_pdf(self, filepath):
- if not self.center:
- return filepath
- pdf_file = PyPDF2.PdfFileReader(filepath)
- mediaBox = pdf_file.getPage(0).mediaBox
- width = mediaBox[2] - mediaBox[0]
- height = mediaBox[3] - mediaBox[1]
- padding = (height * RM_WIDTH - width * RM_HEIGHT) / RM_HEIGHT
- left_margin = padding / 2 + 15
-
- self.log("Centering PDF file")
- status = subprocess.call(
- [
- self.pdfcrop_path,
- "--margins",
- "%i 40 15 15" % left_margin,
- filepath,
- ],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- self.warn("Failed to crop the pdf file at: %s" % filepath)
- return filepath
- centered_file = os.path.splitext(filepath)[0] + "-crop.pdf"
- if not os.path.exists(centered_file):
- self.warn(
- "Can't find centered file '%s' where expected." % centered_file
- )
- return filepath
- return centered_file
-
def blank_pdf(self, filepath):
if not self.blank:
return filepath
@@ -167,14 +137,15 @@ class Provider(metaclass=abc.ABCMeta):
def crop_pdf(self, filepath):
self.log("Cropping pdf file")
- status = subprocess.call(
- [self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
- stdout=subprocess.DEVNULL,
+ cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+ cropper = Cropper(
+ filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
)
+ status = cropper.crop(margins=15)
+
if not status == 0:
self.warn("Failed to crop the pdf file at: %s" % filepath)
return filepath
- cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
if not os.path.exists(cropped_file):
self.warn(
"Can't find cropped file '%s' where expected." % cropped_file
@@ -182,6 +153,26 @@ class Provider(metaclass=abc.ABCMeta):
return filepath
return cropped_file
+ def center_pdf(self, filepath):
+ if not self.center:
+ return filepath
+
+ self.log("Centering PDF file")
+ centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
+ cropper = Cropper(
+ filepath, centered_file, pdfcrop_path=self.pdfcrop_path
+ )
+ status = cropper.center()
+ if not status == 0:
+ self.warn("Failed to center the pdf file at: %s" % filepath)
+ return filepath
+ if not os.path.exists(centered_file):
+ self.warn(
+ "Can't find centered file '%s' where expected." % centered_file
+ )
+ return filepath
+ return centered_file
+
def shrink_pdf(self, filepath):
self.log("Shrinking pdf file")
output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
@@ -602,6 +593,149 @@ class PdfUrlProvider(Provider):
return filename
+class Cropper(object):
+ def __init__(
+ self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+ ):
+ if not input_file is None:
+ self.input_file = os.path.abspath(input_file)
+ self.reader = PyPDF2.PdfFileReader(self.input_file)
+ if not output_file is None:
+ self.output_file = os.path.abspath(output_file)
+ self.pdfcrop_path = pdfcrop_path
+
+ self.writer = PyPDF2.PdfFileWriter()
+
+ def crop(self, margins=1):
+ return self.process_file(self.crop_page, margins=margins)
+
+ def center(self, padding=15):
+ return self.process_file(self.center_page, padding=padding)
+
+ def process_file(self, page_func, *args, **kwargs):
+ for page_idx in range(self.reader.getNumPages()):
+ status = page_func(page_idx, *args, **kwargs)
+ if not status == 0:
+ return status
+ with open(self.output_file, "wb") as fp:
+ self.writer.write(fp)
+ return 0
+
+ def center_page(self, page_idx, padding):
+ return self.process_page(
+ page_idx, self.get_center_bbox, padding=padding
+ )
+
+ def crop_page(self, page_idx, margins):
+ return self.process_page(page_idx, self.get_bbox, margins=margins)
+
+ def export_page(self, page_idx):
+ """Helper function that exports a single page given by index """
+ page = self.reader.getPage(page_idx)
+ writer = PyPDF2.PdfFileWriter()
+ writer.addPage(page)
+ tmpfname = "./page.pdf"
+ with open(tmpfname, "wb") as fp:
+ writer.write(fp)
+ return tmpfname
+
+ def process_page(self, page_idx, bbox_func, *args, **kwargs):
+ """Process a single page and add it to the writer """
+ tmpfname = self.export_page(page_idx)
+ tmpfout = "./output.pdf"
+ bbox = bbox_func(tmpfname, *args, **kwargs)
+ status = subprocess.call(
+ [
+ self.pdfcrop_path,
+ "--bbox",
+ " ".join(map(str, bbox)),
+ tmpfname,
+ tmpfout,
+ ],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ return status
+ reader = PyPDF2.PdfFileReader(tmpfout)
+ page = reader.getPage(0)
+ self.writer.addPage(page)
+ os.unlink(tmpfname)
+ os.unlink(tmpfout)
+ return 0
+
+ def get_bbox(self, filename, margins=1, resolution=72):
+ """Get the bounding box, with optional margins
+
+ if margins is integer, used for all margins, else
+ margins = [left, top, right, bottom]
+
+ We get the bounding box by finding the smallest rectangle that is
+ completely surrounded by white pixels.
+ """
+ if isinstance(margins, int):
+ margins = [margins for _ in range(4)]
+ pdf = pdfplumber.open(filename)
+ im = pdf.pages[0].to_image(resolution=resolution)
+ pdf.close()
+
+ pixels = list(im.original.getdata())
+ W, H = im.original.size
+
+# M is a list of H rows, each a list of W integers; every integer is the
+# sum of the channel values of the corresponding pixel
+ M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
+
+ left, top, bottom, right = 0, 0, 0, 0
+ while top < H and sum(M[top]) == W * 255 * 3:
+ top += 1
+ while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3:
+ bottom += 1
+
+ # Transpose M
+ M = list(zip(*M))
+ while left < W and sum(M[left]) == H * 255 * 3:
+ left += 1
+ while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
+ right += 1
+
+ left -= margins[0]
+ top -= margins[1]
+ right -= margins[2]
+ bottom -= margins[3]
+
+ # This is the bounding box in PIL format: (0, 0) top left
+ x0, y0, x1, y1 = left, top, W - right, H - bottom
+
+ # Get the bbox in Ghostscript format: (0, 0) bottom left
+ a0, b0, a1, b1 = x0, H - y1, x1, H - y0
+ return [a0, b0, a1, b1]
+
+ def get_center_bbox(self, filename, padding=15):
+ """Compute a bounding box that will center the page file on the
+ reMarkable
+ """
+ bbox = self.get_bbox(filename, margins=0)
+
+ h = bbox[3] - bbox[1]
+ w = bbox[2] - bbox[0]
+
+ # we want some minimal padding all around, because it is visually more
+ # pleasing.
+ h_prime = h + 2 * padding
+ w_prime = w + 2 * padding
+
+ # if the document is wider than the remarkable, we add top-padding to
+ # center it, otherwise we add left-padding
+ x, y = 0, 0
+ if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
+ y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
+ else:
+ x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
+
+ margins = [padding + x, padding + y, padding, padding]
+ return self.get_bbox(filename, margins=margins)
+
+
def exception(msg):
print("ERROR: " + msg, file=sys.stderr)
print("Error occurred. Exiting.", file=sys.stderr)
diff --git a/poetry.lock b/poetry.lock
index 893007f..322114d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -46,6 +46,51 @@ version = "2.8"
[[package]]
category = "main"
+description = "PDF parser and analyzer"
+name = "pdfminer.six"
+optional = false
+python-versions = "*"
+version = "20181108"
+
+[package.dependencies]
+pycryptodome = "*"
+six = "*"
+sortedcontainers = "*"
+
+[[package]]
+category = "main"
+description = "Plumb a PDF for detailed information about each char, rectangle, and line."
+name = "pdfplumber"
+optional = false
+python-versions = "*"
+version = "0.5.12"
+
+[package.dependencies]
+chardet = "*"
+"pdfminer.six" = "20181108"
+pillow = ">=3.0.0"
+pycryptodome = "*"
+unicodecsv = ">=0.14.1"
+wand = "*"
+
+[[package]]
+category = "main"
+description = "Python Imaging Library (Fork)"
+name = "pillow"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "6.0.0"
+
+[[package]]
+category = "main"
+description = "Cryptographic library for Python"
+name = "pycryptodome"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+version = "3.8.2"
+
+[[package]]
+category = "main"
description = "Python HTTP for Humans."
name = "requests"
optional = false
@@ -60,6 +105,22 @@ urllib3 = ">=1.21.1,<1.25"
[[package]]
category = "main"
+description = "Python 2 and 3 compatibility utilities"
+name = "six"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*"
+version = "1.12.0"
+
+[[package]]
+category = "main"
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
+name = "sortedcontainers"
+optional = false
+python-versions = "*"
+version = "2.1.0"
+
+[[package]]
+category = "main"
description = "A CSS4 selector implementation for Beautiful Soup."
name = "soupsieve"
optional = false
@@ -68,14 +129,30 @@ version = "1.7.3"
[[package]]
category = "main"
+description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*."
+name = "unicodecsv"
+optional = false
+python-versions = "*"
+version = "0.14.1"
+
+[[package]]
+category = "main"
description = "HTTP library with thread-safe connection pooling, file post, and more."
name = "urllib3"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
version = "1.24.1"
+[[package]]
+category = "main"
+description = "Ctypes-based simple MagickWand API binding for Python"
+name = "wand"
+optional = false
+python-versions = "*"
+version = "0.5.4"
+
[metadata]
-content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82"
+content-hash = "21a857f686e73e377feae7bf7c09ed5933d51a3f90ca77315408a3d7fc362c42"
python-versions = "^3.5"
[metadata.hashes]
@@ -84,6 +161,14 @@ bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"]
certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"]
chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"]
idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"]
+"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"]
+pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"]
+pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", "10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", "258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", 
"9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", "c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", "d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"]
+pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", "a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", "c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"]
requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"]
+six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"]
+sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"]
soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"]
+unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"]
urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"]
+wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"]
diff --git a/pyproject.toml b/pyproject.toml
index 2c28224..a211300 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,7 @@ license = "MIT"
python = "^3.5"
bs4 = "^0.0.1"
requests = "^2.21"
+pdfplumber = "^0.5.12"
[tool.poetry.dev-dependencies]