about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x arxiv2remarkable.py 204
-rw-r--r-- poetry.lock 87
-rw-r--r-- pyproject.toml 1
3 files changed, 256 insertions, 36 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index f237d5a..e009298 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -19,6 +19,7 @@ import argparse
import bs4
import datetime
import os
+import pdfplumber
import re
import requests
import shutil
@@ -118,37 +119,6 @@ class Provider(metaclass=abc.ABCMeta):
self.log("Created filename: %s" % name)
return name
- def center_pdf(self, filepath):
- if not self.center:
- return filepath
- pdf_file = PyPDF2.PdfFileReader(filepath)
- mediaBox = pdf_file.getPage(0).mediaBox
- width = mediaBox[2] - mediaBox[0]
- height = mediaBox[3] - mediaBox[1]
- padding = (height * RM_WIDTH - width * RM_HEIGHT) / RM_HEIGHT
- left_margin = padding / 2 + 15
-
- self.log("Centering PDF file")
- status = subprocess.call(
- [
- self.pdfcrop_path,
- "--margins",
- "%i 40 15 15" % left_margin,
- filepath,
- ],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- self.warn("Failed to crop the pdf file at: %s" % filepath)
- return filepath
- centered_file = os.path.splitext(filepath)[0] + "-crop.pdf"
- if not os.path.exists(centered_file):
- self.warn(
- "Can't find centered file '%s' where expected." % centered_file
- )
- return filepath
- return centered_file
-
def blank_pdf(self, filepath):
if not self.blank:
return filepath
@@ -167,14 +137,15 @@ class Provider(metaclass=abc.ABCMeta):
def crop_pdf(self, filepath):
self.log("Cropping pdf file")
- status = subprocess.call(
- [self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
- stdout=subprocess.DEVNULL,
+ cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+ cropper = Cropper(
+ filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
)
+ status = cropper.crop(margins=15)
+
if not status == 0:
self.warn("Failed to crop the pdf file at: %s" % filepath)
return filepath
- cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
if not os.path.exists(cropped_file):
self.warn(
"Can't find cropped file '%s' where expected." % cropped_file
@@ -182,6 +153,26 @@ class Provider(metaclass=abc.ABCMeta):
return filepath
return cropped_file
+    def center_pdf(self, filepath):
+        """Return the path of a centered copy of filepath.
+
+        The input path is returned unchanged when centering is disabled,
+        when the Cropper reports a non-zero status, or when the expected
+        "-center.pdf" file does not exist afterwards.
+        """
+        if not self.center:
+            return filepath
+
+        self.log("Centering PDF file")
+        base, _ext = os.path.splitext(filepath)
+        centered_file = base + "-center.pdf"
+        status = Cropper(
+            filepath, centered_file, pdfcrop_path=self.pdfcrop_path
+        ).center()
+        if status != 0:
+            self.warn("Failed to center the pdf file at: %s" % filepath)
+            return filepath
+        if os.path.exists(centered_file):
+            return centered_file
+        self.warn(
+            "Can't find centered file '%s' where expected." % centered_file
+        )
+        return filepath
+
def shrink_pdf(self, filepath):
self.log("Shrinking pdf file")
output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
@@ -602,6 +593,149 @@ class PdfUrlProvider(Provider):
return filename
+class Cropper(object):
+    """Crop or center a PDF one page at a time with ``pdfcrop``.
+
+    Every page of the input file is exported to its own scratch PDF, a
+    bounding box is computed from a rendering of that page, ``pdfcrop`` is
+    run with that box, and the result is appended to a writer that is saved
+    to ``output_file`` once all pages have succeeded.
+    """
+
+    def __init__(
+        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+    ):
+        """Store the file locations and open the input for reading.
+
+        input_file and output_file are stored as absolute paths. Either may
+        be None, in which case the matching attribute (and, for the input,
+        ``reader``) is None as well.
+        """
+        # Always define the attributes so later code can test for None
+        # instead of failing with an AttributeError.
+        self.input_file = None
+        self.output_file = None
+        self.reader = None
+        if input_file is not None:
+            self.input_file = os.path.abspath(input_file)
+            self.reader = PyPDF2.PdfFileReader(self.input_file)
+        if output_file is not None:
+            self.output_file = os.path.abspath(output_file)
+        self.pdfcrop_path = pdfcrop_path
+
+        self.writer = PyPDF2.PdfFileWriter()
+
+    def crop(self, margins=1):
+        """Crop every page to its content plus margins; return a status."""
+        return self.process_file(self.crop_page, margins=margins)
+
+    def center(self, padding=15):
+        """Center every page for the reMarkable; return a status."""
+        return self.process_file(self.center_page, padding=padding)
+
+    def process_file(self, page_func, *args, **kwargs):
+        """Run page_func on every page and save the combined result.
+
+        Returns 0 on success, or the first non-zero status reported by
+        page_func, in which case nothing is written to output_file.
+        Raises ValueError when the instance has no input or output file.
+        """
+        if self.reader is None or self.output_file is None:
+            raise ValueError(
+                "Cropper needs both an input file and an output file"
+            )
+        # Start from an empty writer so that calling crop() or center() more
+        # than once on the same instance does not duplicate pages.
+        self.writer = PyPDF2.PdfFileWriter()
+        for page_idx in range(self.reader.getNumPages()):
+            status = page_func(page_idx, *args, **kwargs)
+            if status != 0:
+                return status
+        with open(self.output_file, "wb") as fp:
+            self.writer.write(fp)
+        return 0
+
+    def center_page(self, page_idx, padding):
+        """Center the page at page_idx and queue it for writing."""
+        return self.process_page(
+            page_idx, self.get_center_bbox, padding=padding
+        )
+
+    def crop_page(self, page_idx, margins):
+        """Crop the page at page_idx and queue it for writing."""
+        return self.process_page(page_idx, self.get_bbox, margins=margins)
+
+    def export_page(self, page_idx, directory="."):
+        """Write the page at page_idx to ``page.pdf`` inside directory.
+
+        Returns the path of the file that was written.
+        """
+        page_writer = PyPDF2.PdfFileWriter()
+        page_writer.addPage(self.reader.getPage(page_idx))
+        tmpfname = os.path.join(directory, "page.pdf")
+        with open(tmpfname, "wb") as fp:
+            page_writer.write(fp)
+        return tmpfname
+
+    def process_page(self, page_idx, bbox_func, *args, **kwargs):
+        """Crop one page to the box from bbox_func and queue the result.
+
+        bbox_func is called with the path of the exported page followed by
+        the extra arguments; its result is joined with spaces and handed to
+        pdfcrop's --bbox option. Returns the exit status of pdfcrop, which
+        is 0 on success.
+        """
+        # Imported locally so this method does not depend on a module-level
+        # import being present.
+        import tempfile
+
+        # A private scratch directory replaces the fixed ./page.pdf and
+        # ./output.pdf names: nothing in the working directory can be
+        # overwritten, and the scratch files are removed even when pdfcrop
+        # fails or an exception is raised. As before, the scratch files are
+        # gone by the time the writer is saved.
+        with tempfile.TemporaryDirectory(prefix="cropper-") as workdir:
+            tmpfname = self.export_page(page_idx, workdir)
+            tmpfout = os.path.join(workdir, "output.pdf")
+            bbox = bbox_func(tmpfname, *args, **kwargs)
+            status = subprocess.call(
+                [
+                    self.pdfcrop_path,
+                    "--bbox",
+                    " ".join(map(str, bbox)),
+                    tmpfname,
+                    tmpfout,
+                ],
+                stdout=subprocess.DEVNULL,
+            )
+            if status != 0:
+                return status
+            cropped = PyPDF2.PdfFileReader(tmpfout)
+            self.writer.addPage(cropped.getPage(0))
+        return 0
+
+    @staticmethod
+    def _leading_blank(flags):
+        """Return how many entries at the start of the list are true."""
+        return next(
+            (idx for idx, blank in enumerate(flags) if not blank), len(flags)
+        )
+
+    @staticmethod
+    def _measure_whitespace(filename, resolution=72):
+        """Render the first page and count the white rows/columns per edge.
+
+        Returns ((left, top, right, bottom), (width, height)), all measured
+        in pixels of the rendered image. A page without any non-white pixel
+        reports zero whitespace on every side.
+        """
+        pdf = pdfplumber.open(filename)
+        try:
+            im = pdf.pages[0].to_image(resolution=resolution)
+        finally:
+            # Release the file even when rendering raises.
+            pdf.close()
+
+        image = im.original
+        W, H = image.size
+        pixels = list(image.getdata())
+
+        # One list per image row, holding the channel sum of every pixel.
+        # NOTE(review): the comparisons against 255 * 3 assume three 8-bit
+        # channels per pixel -- confirm for images in other modes.
+        rows = [
+            [sum(px) for px in pixels[i * W : (i + 1) * W]] for i in range(H)
+        ]
+        row_blank = [sum(row) == W * 255 * 3 for row in rows]
+        if all(row_blank):
+            # Without this guard each edge scan would cross the whole page
+            # and the resulting box would be inverted (right < left).
+            return (0, 0, 0, 0), (W, H)
+        col_blank = [sum(col) == H * 255 * 3 for col in zip(*rows)]
+
+        insets = (
+            Cropper._leading_blank(col_blank),
+            Cropper._leading_blank(row_blank),
+            Cropper._leading_blank(col_blank[::-1]),
+            Cropper._leading_blank(row_blank[::-1]),
+        )
+        return insets, (W, H)
+
+    @staticmethod
+    def _bbox_from_insets(insets, size, margins):
+        """Turn edge insets plus margins into [left, bottom, right, top].
+
+        insets and margins are both ordered left, top, right, bottom; a
+        single number for margins applies to all four sides.
+        """
+        if isinstance(margins, (int, float)):
+            margins = [margins] * 4
+        left, top, right, bottom = (
+            inset - margin for inset, margin in zip(insets, margins)
+        )
+        W, H = size
+
+        # This is the bounding box in PIL format: (0, 0) top left
+        x0, y0, x1, y1 = left, top, W - right, H - bottom
+
+        # Get the bbox in Ghostscript format: (0, 0) bottom left
+        return [x0, H - y1, x1, H - y0]
+
+    def get_bbox(self, filename, margins=1, resolution=72):
+        """Get the bounding box of the first page, with optional margins.
+
+        If margins is a single number it is used for all margins, else
+        margins = [left, top, right, bottom].
+
+        The box is the smallest rectangle that is completely surrounded by
+        white pixels, grown by the margins, returned as
+        [left, bottom, right, top] with (0, 0) at the bottom left.
+
+        NOTE(review): the values are pixels of the rendering; presumably
+        they only coincide with PDF points at the default resolution of 72
+        -- confirm before passing another resolution.
+        """
+        insets, size = self._measure_whitespace(filename, resolution)
+        return self._bbox_from_insets(insets, size, margins)
+
+    def get_center_bbox(self, filename, padding=15):
+        """Compute a bounding box that will center the page file on the
+        reMarkable.
+
+        The page is rendered once; the same measurements are used for the
+        tight content box and for the final padded box.
+        """
+        insets, size = self._measure_whitespace(filename)
+        bbox = self._bbox_from_insets(insets, size, 0)
+
+        h = bbox[3] - bbox[1]
+        w = bbox[2] - bbox[0]
+
+        # we want some minimal padding all around, because it is visually
+        # more pleasing.
+        h_prime = h + 2 * padding
+        w_prime = w + 2 * padding
+
+        # if the document is wider than the remarkable, we add top-padding to
+        # center it, otherwise we add left-padding
+        x, y = 0, 0
+        if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
+            y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
+        else:
+            x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
+
+        margins = [padding + x, padding + y, padding, padding]
+        return self._bbox_from_insets(insets, size, margins)
+
+
def exception(msg):
print("ERROR: " + msg, file=sys.stderr)
print("Error occurred. Exiting.", file=sys.stderr)
diff --git a/poetry.lock b/poetry.lock
index 893007f..322114d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -46,6 +46,51 @@ version = "2.8"
[[package]]
category = "main"
+description = "PDF parser and analyzer"
+name = "pdfminer.six"
+optional = false
+python-versions = "*"
+version = "20181108"
+
+[package.dependencies]
+pycryptodome = "*"
+six = "*"
+sortedcontainers = "*"
+
+[[package]]
+category = "main"
+description = "Plumb a PDF for detailed information about each char, rectangle, and line."
+name = "pdfplumber"
+optional = false
+python-versions = "*"
+version = "0.5.12"
+
+[package.dependencies]
+chardet = "*"
+"pdfminer.six" = "20181108"
+pillow = ">=3.0.0"
+pycryptodome = "*"
+unicodecsv = ">=0.14.1"
+wand = "*"
+
+[[package]]
+category = "main"
+description = "Python Imaging Library (Fork)"
+name = "pillow"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "6.0.0"
+
+[[package]]
+category = "main"
+description = "Cryptographic library for Python"
+name = "pycryptodome"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+version = "3.8.2"
+
+[[package]]
+category = "main"
description = "Python HTTP for Humans."
name = "requests"
optional = false
@@ -60,6 +105,22 @@ urllib3 = ">=1.21.1,<1.25"
[[package]]
category = "main"
+description = "Python 2 and 3 compatibility utilities"
+name = "six"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*"
+version = "1.12.0"
+
+[[package]]
+category = "main"
+description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
+name = "sortedcontainers"
+optional = false
+python-versions = "*"
+version = "2.1.0"
+
+[[package]]
+category = "main"
description = "A CSS4 selector implementation for Beautiful Soup."
name = "soupsieve"
optional = false
@@ -68,14 +129,30 @@ version = "1.7.3"
[[package]]
category = "main"
+description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*."
+name = "unicodecsv"
+optional = false
+python-versions = "*"
+version = "0.14.1"
+
+[[package]]
+category = "main"
description = "HTTP library with thread-safe connection pooling, file post, and more."
name = "urllib3"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
version = "1.24.1"
+[[package]]
+category = "main"
+description = "Ctypes-based simple MagickWand API binding for Python"
+name = "wand"
+optional = false
+python-versions = "*"
+version = "0.5.4"
+
[metadata]
-content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82"
+content-hash = "21a857f686e73e377feae7bf7c09ed5933d51a3f90ca77315408a3d7fc362c42"
python-versions = "^3.5"
[metadata.hashes]
@@ -84,6 +161,14 @@ bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"]
certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"]
chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"]
idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"]
+"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"]
+pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"]
+pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", "10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", "258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", 
"9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", "c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", "d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"]
+pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", "a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", "c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"]
requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"]
+six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"]
+sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"]
soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"]
+unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"]
urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"]
+wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"]
diff --git a/pyproject.toml b/pyproject.toml
index 2c28224..a211300 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,7 @@ license = "MIT"
python = "^3.5"
bs4 = "^0.0.1"
requests = "^2.21"
+pdfplumber = "^0.5.12"
[tool.poetry.dev-dependencies]