From 721cf5ffb2ebac345f405dd2b2e0d38ba8a3e1ae Mon Sep 17 00:00:00 2001 From: Christophe Delaere Date: Wed, 1 Apr 2020 09:13:48 +0200 Subject: replaced pdftk by qpdf --- paper2remarkable/providers/_base.py | 4 ++-- paper2remarkable/providers/arxiv.py | 11 +++++------ paper2remarkable/ui.py | 8 ++++---- paper2remarkable/utils.py | 1 + 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 96fb151..fbe5308 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -42,7 +42,7 @@ class Provider(metaclass=abc.ABCMeta): remarkable_dir="/", rmapi_path="rmapi", pdftoppm_path="pdftoppm", - pdftk_path="pdftk", + qpdf_path="qpdf", gs_path="gs", cookiejar=None, ): @@ -51,7 +51,7 @@ class Provider(metaclass=abc.ABCMeta): self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdftoppm_path = pdftoppm_path - self.pdftk_path = pdftk_path + self.qpdf_path = qpdf_path self.gs_path = gs_path self.informer = Informer() self.cookiejar = cookiejar diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 913e015..06bfdec 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -75,16 +75,15 @@ class Arxiv(Provider): status = subprocess.call( [ - self.pdftk_path, + self.qpdf_path, + "--stream-data=uncompress", input_file, - "output", uncompress_file, - "uncompress", ] ) if not status == 0: raise CalledProcessError( - "pdftk failed to uncompress the PDF file." + "qpdf failed to uncompress the PDF file." ) with open(uncompress_file, "rb") as fid: @@ -104,9 +103,9 @@ class Arxiv(Provider): output_file = basename + "_dearxiv.pdf" status = subprocess.call( - [self.pdftk_path, removed_file, "output", output_file, "compress"] + [self.qpdf_path, "--stream-data=compress", removed_file, output_file] ) if not status == 0: - raise CalledProcessError("pdftk failed to compress the PDF file.") + raise CalledProcessError("qpdf failed to compress the PDF file.") return output_file diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 12443d4..e17bffb 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -89,9 +89,9 @@ def parse_args(): default="pdftoppm", ) parser.add_argument( - "--pdftk", - help="path to pdftk executable (default: pdftk)", - default="pdftk", + "--qpdf", + help="path to qpdf executable (default: qpdf)", + default="qpdf", ) parser.add_argument( "--rmapi", @@ -158,7 +158,7 @@ def main(): remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, pdftoppm_path=args.pdftoppm, - pdftk_path=args.pdftk, + qpdf_path=args.qpdf, gs_path=args.gs, cookiejar=cookiejar, ) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 592dcd3..f1447d9 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -148,6 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) # Upload the file + logger.info("%s put %s %s/"%(rmapi_path,filepath,remarkable_dir)) status = subprocess.call( [rmapi_path, "put", filepath, remarkable_dir + "/"], stdout=subprocess.DEVNULL, -- cgit v1.2.3 From 726d4c42dde92c67131ee0311e7f965dd2ea13ad Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:48:48 +0100 Subject: Fix the calledprocesserror by not inheriting Turns out this never actually worked as intended. --- paper2remarkable/exceptions.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index a608bcc..66a329f 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -90,16 +90,13 @@ class RemarkableError(Error): return msg -class _CalledProcessError(CalledProcessError): - """Exception raised when subprocesses fail. +class _CalledProcessError(Error): + """Exception raised when subprocesses fail. """ - We subclass the CalledProcessError so we can add our custom error message. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, message): + self.message = message def __str__(self): - parent = super().__str__() - msg = parent + GH_MSG + msg = "ERROR: {message}".format(message=self.message) + msg += GH_MSG return msg -- cgit v1.2.3 From 2d5060549eccf173498b1db85788032bb0730e10 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:49:58 +0100 Subject: Add the pdftk path back to the ui --- paper2remarkable/providers/_base.py | 2 ++ paper2remarkable/ui.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index fbe5308..1337201 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -42,6 +42,7 @@ class Provider(metaclass=abc.ABCMeta): remarkable_dir="/", rmapi_path="rmapi", pdftoppm_path="pdftoppm", + pdftk_path="pdftk", qpdf_path="qpdf", gs_path="gs", cookiejar=None, @@ -51,6 +52,7 @@ class Provider(metaclass=abc.ABCMeta): self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdftoppm_path = pdftoppm_path + self.pdftk_path = pdftk_path self.qpdf_path = qpdf_path self.gs_path = gs_path self.informer = Informer() diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index e17bffb..a3bf3c2 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -88,6 +88,11 @@ def parse_args(): help="path to pdftoppm executable (default: pdftoppm)", default="pdftoppm", ) + parser.add_argument( + "--pdftk", + help="path to pdftk executable (default: pdftk)", + default="pdftk", + ) parser.add_argument( "--qpdf", help="path to qpdf executable (default: qpdf)", @@ -158,6 +163,7 @@ def main(): remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, pdftoppm_path=args.pdftoppm, + pdftk_path=args.pdftk, qpdf_path=args.qpdf, gs_path=args.gs, cookiejar=cookiejar, -- cgit v1.2.3 From 8d5ce28ed6a4cf52ae10bf4bed197cd00c529218 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:53:15 +0100 Subject: Enable both pdftk and qpdf This adds a function that checks which pdf tool is available and moves the compress/uncompress code to the base class of the providers for cleaner code. A new exception is added in case neither pdf tool can be found. --- paper2remarkable/exceptions.py | 16 +++++++++++++ paper2remarkable/providers/_base.py | 46 +++++++++++++++++++++++++++++++++++++ paper2remarkable/providers/arxiv.py | 27 ++++------------------ paper2remarkable/utils.py | 22 +++++++++++++++++- 4 files changed, 87 insertions(+), 24 deletions(-) diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 66a329f..5ea9a78 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -100,3 +100,19 @@ class _CalledProcessError(Error): msg = "ERROR: {message}".format(message=self.message) msg += GH_MSG return msg + + +class NoPDFToolError(Error): + """Exception raised when neither pdftk or qpdf is found.""" + + def __init__(self): + pass + + def __str__(self): + msg = ( + "ERROR: Neither pdftk or qpdf could be found. Install " + "either of these or ensure that they can be found using " + "the --pdftk or --qpdf options." + ) + msg += GH_MSG + return msg diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 1337201..0cab6b7 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -11,13 +11,16 @@ Copyright: 2019, G.J.J. van den Burg import abc import os import shutil +import subprocess import tempfile import time +from ..exceptions import _CalledProcessError from ._info import Informer from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, + check_pdftool, download_url, upload_to_remarkable, follow_redirects, @@ -58,6 +61,8 @@ class Provider(metaclass=abc.ABCMeta): self.informer = Informer() self.cookiejar = cookiejar + self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path) + # wait time to not hit the server too frequently self.server_delay = 0 @@ -110,6 +115,47 @@ class Provider(metaclass=abc.ABCMeta): # This must exist so that the LocalFile provider can overwrite it download_url(pdf_url, filename, cookiejar=self.cookiejar) + def compress_pdf(self, in_pdf, out_pdf): + """ Compress a pdf file, returns subprocess status """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "compress"] + ) + elif self.pdftool == "qpdf": + # TODO: the --no-warn option is only needed because when we remove + # the arXiv stamp we don't fix the length of the pdf object. This + # causes qpdf to raise a warning and give a nonzero exit status + # (3). Fixing the pdf object is the right approach, but this does + # work as qpdf fixes the file. + status = subprocess.call( + [ + self.qpdf_path, + "--no-warn", + "--stream-data=compress", + in_pdf, + out_pdf, + ] + ) + if not (status == 0 or status == 3): + raise _CalledProcessError( + "%s failed to compress the PDF file." % self.pdftool + ) + + def uncompress_pdf(self, in_pdf, out_pdf): + """ Uncompress a pdf file """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",] + ) + elif self.pdftool == "qpdf": + status = subprocess.call( + [self.qpdf_path, "--stream-data=uncompress", in_pdf, out_pdf,] + ) + if not status == 0: + raise _CalledProcessError( + "%s failed to uncompress the PDF file." % self.pdftool + ) + def run(self, src, filename=None): # follow_redirects here is needed with library use if os.path.exists(src): diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 06bfdec..74043ed 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -10,14 +10,10 @@ Copyright: 2019, G.J.J. van den Burg import os import re -import subprocess from ._info import Informer from ._base import Provider -from ..exceptions import ( - URLResolutionError, - _CalledProcessError as CalledProcessError, -) +from ..exceptions import URLResolutionError from ..log import Logger logger = Logger() @@ -71,20 +67,9 @@ class Arxiv(Provider): """Remove the arXiv timestamp from a pdf""" logger.info("Removing arXiv timestamp") basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - status = subprocess.call( - [ - self.qpdf_path, - "--stream-data=uncompress", - input_file, - uncompress_file, - ] - ) - if not status == 0: - raise CalledProcessError( - "qpdf failed to uncompress the PDF file." - ) + uncompress_file = basename + "_uncompress.pdf" + self.uncompress_pdf(input_file, uncompress_file) with open(uncompress_file, "rb") as fid: data = fid.read() @@ -102,10 +87,6 @@ class Arxiv(Provider): oid.write(data) output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.qpdf_path, "--stream-data=compress", removed_file, output_file] - ) - if not status == 0: - raise CalledProcessError("qpdf failed to compress the PDF file.") + self.compress_pdf(removed_file, output_file) return output_file diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index f1447d9..9bfeec6 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -17,7 +17,7 @@ import time import unidecode from .log import Logger -from .exceptions import FileTypeError, RemarkableError +from .exceptions import FileTypeError, RemarkableError, NoPDFToolError HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -166,3 +166,23 @@ def is_url(string): string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None + + +def check_pdftool(pdftk_path, qpdf_path): + """Check whether we have pdftk or qpdf available""" + # set defaults in case either is set to None or something + pdftk_path = pdftk_path or 'false' + qpdf_path = qpdf_path or 'false' + + status = subprocess.call( + [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + if status == 0: + return "pdftk" + status = subprocess.call( + [qpdf_path, '--help'], stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ) + if status == 0: + return "qpdf" + raise NoPDFToolError -- cgit v1.2.3 From 01c294bccd10f8c430e1c959fbb5ebacea8f3c3a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:53:26 +0100 Subject: Add unit test --- .travis.yml | 2 +- tests/test_providers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f412f9b..e2edaaa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: before_install: - sudo apt-get update - - sudo apt-get install ghostscript pdftk poppler-utils + - sudo apt-get install ghostscript pdftk poppler-utils qpdf install: - pip install -e .[dev] diff --git a/tests/test_providers.py b/tests/test_providers.py index 2bf7507..e539949 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -69,7 +69,8 @@ class TestProviders(unittest.TestCase): shutil.rmtree(self.test_dir) def test_arxiv_1(self): - prov = Arxiv(upload=False, verbose=VERBOSE) + # check with qpdf + prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) -- cgit v1.2.3 From d9a41be6b304b39730839096e1e2ddaff1f379b6 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:54:17 +0100 Subject: Code formatting --- paper2remarkable/providers/_base.py | 6 +++--- paper2remarkable/ui.py | 8 ++------ paper2remarkable/utils.py | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 0cab6b7..0374213 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -16,16 +16,16 @@ import tempfile import time from ..exceptions import _CalledProcessError -from ._info import Informer +from ..log import Logger from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, check_pdftool, download_url, - upload_to_remarkable, follow_redirects, + upload_to_remarkable, ) -from ..log import Logger +from ._info import Informer logger = Logger() diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index a3bf3c2..bfb3647 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -59,12 +59,8 @@ def parse_args(): action="store_true", ) parser.add_argument( - '-k', - '--no-crop', - help="Don't crop the pdf file", - action="store_true" - ) - + "-k", "--no-crop", help="Don't crop the pdf file", action="store_true" + ) parser.add_argument( "-v", "--verbose", help="be verbose", action="store_true" ) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 9bfeec6..39cf547 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -148,7 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) # Upload the file - logger.info("%s put %s %s/"%(rmapi_path,filepath,remarkable_dir)) + logger.info("%s put %s %s/" % (rmapi_path, filepath, remarkable_dir)) status = subprocess.call( [rmapi_path, "put", filepath, remarkable_dir + "/"], stdout=subprocess.DEVNULL, -- cgit v1.2.3