diff options
| -rwxr-xr-x | arxiv2remarkable.py | 204 | ||||
| -rw-r--r-- | poetry.lock | 87 | ||||
| -rw-r--r-- | pyproject.toml | 1 |
3 files changed, 256 insertions, 36 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index f237d5a..e009298 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -19,6 +19,7 @@ import argparse import bs4 import datetime import os +import pdfplumber import re import requests import shutil @@ -118,37 +119,6 @@ class Provider(metaclass=abc.ABCMeta): self.log("Created filename: %s" % name) return name - def center_pdf(self, filepath): - if not self.center: - return filepath - pdf_file = PyPDF2.PdfFileReader(filepath) - mediaBox = pdf_file.getPage(0).mediaBox - width = mediaBox[2] - mediaBox[0] - height = mediaBox[3] - mediaBox[1] - padding = (height * RM_WIDTH - width * RM_HEIGHT) / RM_HEIGHT - left_margin = padding / 2 + 15 - - self.log("Centering PDF file") - status = subprocess.call( - [ - self.pdfcrop_path, - "--margins", - "%i 40 15 15" % left_margin, - filepath, - ], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - self.warn("Failed to crop the pdf file at: %s" % filepath) - return filepath - centered_file = os.path.splitext(filepath)[0] + "-crop.pdf" - if not os.path.exists(centered_file): - self.warn( - "Can't find centered file '%s' where expected." % centered_file - ) - return filepath - return centered_file - def blank_pdf(self, filepath): if not self.blank: return filepath @@ -167,14 +137,15 @@ class Provider(metaclass=abc.ABCMeta): def crop_pdf(self, filepath): self.log("Cropping pdf file") - status = subprocess.call( - [self.pdfcrop_path, "--margins", "15 40 15 15", filepath], - stdout=subprocess.DEVNULL, + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + cropper = Cropper( + filepath, cropped_file, pdfcrop_path=self.pdfcrop_path ) + status = cropper.crop(margins=15) + if not status == 0: self.warn("Failed to crop the pdf file at: %s" % filepath) return filepath - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" if not os.path.exists(cropped_file): self.warn( "Can't find cropped file '%s' where expected." % cropped_file @@ -182,6 +153,26 @@ class Provider(metaclass=abc.ABCMeta): return filepath return cropped_file + def center_pdf(self, filepath): + if not self.center: + return filepath + + self.log("Centering PDF file") + centered_file = os.path.splitext(filepath)[0] + "-center.pdf" + cropper = Cropper( + filepath, centered_file, pdfcrop_path=self.pdfcrop_path + ) + status = cropper.center() + if not status == 0: + self.warn("Failed to center the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(centered_file): + self.warn( + "Can't find centered file '%s' where expected." % centered_file + ) + return filepath + return centered_file + def shrink_pdf(self, filepath): self.log("Shrinking pdf file") output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" @@ -602,6 +593,149 @@ class PdfUrlProvider(Provider): return filename +class Cropper(object): + def __init__( + self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + ): + if not input_file is None: + self.input_file = os.path.abspath(input_file) + self.reader = PyPDF2.PdfFileReader(self.input_file) + if not output_file is None: + self.output_file = os.path.abspath(output_file) + self.pdfcrop_path = pdfcrop_path + + self.writer = PyPDF2.PdfFileWriter() + + def crop(self, margins=1): + return self.process_file(self.crop_page, margins=margins) + + def center(self, padding=15): + return self.process_file(self.center_page, padding=padding) + + def process_file(self, page_func, *args, **kwargs): + for page_idx in range(self.reader.getNumPages()): + status = page_func(page_idx, *args, **kwargs) + if not status == 0: + return status + with open(self.output_file, "wb") as fp: + self.writer.write(fp) + return 0 + + def center_page(self, page_idx, padding): + return self.process_page( + page_idx, self.get_center_bbox, padding=padding + ) + + def crop_page(self, page_idx, margins): + return self.process_page(page_idx, self.get_bbox, margins=margins) + + def export_page(self, page_idx): + """Helper function that exports a single page given by index """ + page = self.reader.getPage(page_idx) + writer = PyPDF2.PdfFileWriter() + writer.addPage(page) + tmpfname = "./page.pdf" + with open(tmpfname, "wb") as fp: + writer.write(fp) + return tmpfname + + def process_page(self, page_idx, bbox_func, *args, **kwargs): + """Process a single page and add it to the writer """ + tmpfname = self.export_page(page_idx) + tmpfout = "./output.pdf" + bbox = bbox_func(tmpfname, *args, **kwargs) + status = subprocess.call( + [ + self.pdfcrop_path, + "--bbox", + " ".join(map(str, bbox)), + tmpfname, + tmpfout, + ], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + return status + reader = PyPDF2.PdfFileReader(tmpfout) + page = reader.getPage(0) + self.writer.addPage(page) + os.unlink(tmpfname) + os.unlink(tmpfout) + return 0 + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. + """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + pdf = pdfplumber.open(filename) + im = pdf.pages[0].to_image(resolution=resolution) + pdf.close() + + pixels = list(im.original.getdata()) + W, H = im.original.size + + # M is a list of H lists with each W integers that equal the sum of the + # pixel values + M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)] + + left, top, bottom, right = 0, 0, 0, 0 + while top < H and sum(M[top]) == W * 255 * 3: + top += 1 + while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3: + bottom += 1 + + # Transpose M + M = list(zip(*M)) + while left < W and sum(M[left]) == H * 255 * 3: + left += 1 + while right < W and sum(M[W - 1 - right]) == H * 255 * 3: + right += 1 + + left -= margins[0] + top -= margins[1] + right -= margins[2] + bottom -= margins[3] + + # This is the bounding box in PIL format: (0, 0) top left + x0, y0, x1, y1 = left, top, W - right, H - bottom + + # Get the bbox in Ghostscript format: (0, 0) bottom left + a0, b0, a1, b1 = x0, H - y1, x1, H - y0 + return [a0, b0, a1, b1] + + def get_center_bbox(self, filename, padding=15): + """Compute a bounding box that will center the page file on the + reMarkable + """ + bbox = self.get_bbox(filename, margins=0) + + h = bbox[3] - bbox[1] + w = bbox[2] - bbox[0] + + # we want some minimal padding all around, because it is visually more + # pleasing. + h_prime = h + 2 * padding + w_prime = w + 2 * padding + + # if the document is wider than the remarkable, we add top-padding to + # center it, otherwise we add left-padding + x, y = 0, 0 + if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: + y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 + else: + x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2 + + margins = [padding + x, padding + y, padding, padding] + return self.get_bbox(filename, margins=margins) + + def exception(msg): print("ERROR: " + msg, file=sys.stderr) print("Error occurred. Exiting.", file=sys.stderr) diff --git a/poetry.lock b/poetry.lock index 893007f..322114d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -46,6 +46,51 @@ version = "2.8" [[package]] category = "main" +description = "PDF parser and analyzer" +name = "pdfminer.six" +optional = false +python-versions = "*" +version = "20181108" + +[package.dependencies] +pycryptodome = "*" +six = "*" +sortedcontainers = "*" + +[[package]] +category = "main" +description = "Plumb a PDF for detailed information about each char, rectangle, and line." +name = "pdfplumber" +optional = false +python-versions = "*" +version = "0.5.12" + +[package.dependencies] +chardet = "*" +"pdfminer.six" = "20181108" +pillow = ">=3.0.0" +pycryptodome = "*" +unicodecsv = ">=0.14.1" +wand = "*" + +[[package]] +category = "main" +description = "Python Imaging Library (Fork)" +name = "pillow" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "6.0.0" + +[[package]] +category = "main" +description = "Cryptographic library for Python" +name = "pycryptodome" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +version = "3.8.2" + +[[package]] +category = "main" description = "Python HTTP for Humans." name = "requests" optional = false @@ -60,6 +105,22 @@ urllib3 = ">=1.21.1,<1.25" [[package]] category = "main" +description = "Python 2 and 3 compatibility utilities" +name = "six" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*" +version = "1.12.0" + +[[package]] +category = "main" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +name = "sortedcontainers" +optional = false +python-versions = "*" +version = "2.1.0" + +[[package]] +category = "main" description = "A CSS4 selector implementation for Beautiful Soup." name = "soupsieve" optional = false @@ -68,14 +129,30 @@ version = "1.7.3" [[package]] category = "main" +description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*." +name = "unicodecsv" +optional = false +python-versions = "*" +version = "0.14.1" + +[[package]] +category = "main" description = "HTTP library with thread-safe connection pooling, file post, and more." name = "urllib3" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" version = "1.24.1" +[[package]] +category = "main" +description = "Ctypes-based simple MagickWand API binding for Python" +name = "wand" +optional = false +python-versions = "*" +version = "0.5.4" + [metadata] -content-hash = "bf39364b4d9cc98c89d858338ce8e7609e35fdd4a7e5cfe256f768c12ed7cb82" +content-hash = "21a857f686e73e377feae7bf7c09ed5933d51a3f90ca77315408a3d7fc362c42" python-versions = "^3.5" [metadata.hashes] @@ -84,6 +161,14 @@ bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] +"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"] +pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"] +pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", "10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", "258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", "9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", "c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", "d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"] +pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", "a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", "c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"] requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] +six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"] +sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"] soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] +unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"] urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] +wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"] diff --git a/pyproject.toml b/pyproject.toml index 2c28224..a211300 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ license = "MIT" python = "^3.5" bs4 = "^0.0.1" requests = "^2.21" +pdfplumber = "^0.5.12" [tool.poetry.dev-dependencies] |
