diff options
| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 14:58:45 +0100 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-04 14:58:45 +0100 |
| commit | c0a96f196b5a5c0931cd8d9aed1edecdc16c16af (patch) | |
| tree | b9b139609e1bddef0eea42550b5ca9afbd50f2fb | |
| parent | Merge branch 'ClaytonJY-upgrade-cairo' (diff) | |
| parent | Code formatting (diff) | |
| download | paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.tar.gz paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.zip | |
Merge branch 'qpdfSwitch' of https://github.com/delaere/paper2remarkable into delaere-qpdfSwitch
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | .travis.yml | 2 |
| -rw-r--r-- | paper2remarkable/exceptions.py | 29 |
| -rw-r--r-- | paper2remarkable/providers/_base.py | 54 |
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 28 |
| -rw-r--r-- | paper2remarkable/ui.py | 14 |
| -rw-r--r-- | paper2remarkable/utils.py | 23 |
| -rw-r--r-- | tests/test_providers.py | 3 |
7 files changed, 109 insertions, 44 deletions
diff --git a/.travis.yml b/.travis.yml index f412f9b..e2edaaa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: before_install: - sudo apt-get update - - sudo apt-get install ghostscript pdftk poppler-utils + - sudo apt-get install ghostscript pdftk poppler-utils qpdf install: - pip install -e .[dev] diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index a608bcc..5ea9a78 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -90,16 +90,29 @@ class RemarkableError(Error): return msg -class _CalledProcessError(CalledProcessError): - """Exception raised when subprocesses fail. +class _CalledProcessError(Error): + """Exception raised when subprocesses fail. """ + + def __init__(self, message): + self.message = message + + def __str__(self): + msg = "ERROR: {message}".format(message=self.message) + msg += GH_MSG + return msg - We subclass the CalledProcessError so we can add our custom error message. - """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) +class NoPDFToolError(Error): + """Exception raised when neither pdftk or qpdf is found.""" + + def __init__(self): + pass def __str__(self): - parent = super().__str__() - msg = parent + GH_MSG + msg = ( + "ERROR: Neither pdftk or qpdf could be found. Install " + "either of these or ensure that they can be found using " + "the --pdftk or --qpdf options." + ) + msg += GH_MSG return msg diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 96fb151..0374213 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -11,18 +11,21 @@ Copyright: 2019, G.J.J. 
van den Burg import abc import os import shutil +import subprocess import tempfile import time -from ._info import Informer +from ..exceptions import _CalledProcessError +from ..log import Logger from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, + check_pdftool, download_url, - upload_to_remarkable, follow_redirects, + upload_to_remarkable, ) -from ..log import Logger +from ._info import Informer logger = Logger() @@ -43,6 +46,7 @@ class Provider(metaclass=abc.ABCMeta): rmapi_path="rmapi", pdftoppm_path="pdftoppm", pdftk_path="pdftk", + qpdf_path="qpdf", gs_path="gs", cookiejar=None, ): @@ -52,10 +56,13 @@ class Provider(metaclass=abc.ABCMeta): self.rmapi_path = rmapi_path self.pdftoppm_path = pdftoppm_path self.pdftk_path = pdftk_path + self.qpdf_path = qpdf_path self.gs_path = gs_path self.informer = Informer() self.cookiejar = cookiejar + self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path) + # wait time to not hit the server too frequently self.server_delay = 0 @@ -108,6 +115,47 @@ class Provider(metaclass=abc.ABCMeta): # This must exist so that the LocalFile provider can overwrite it download_url(pdf_url, filename, cookiejar=self.cookiejar) + def compress_pdf(self, in_pdf, out_pdf): + """ Compress a pdf file, returns subprocess status """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "compress"] + ) + elif self.pdftool == "qpdf": + # TODO: the --no-warn option is only needed because when we remove + # the arXiv stamp we don't fix the length of the pdf object. This + # causes qpdf to raise a warning and give a nonzero exit status + # (3). Fixing the pdf object is the right approach, but this does + # work as qpdf fixes the file. 
+ status = subprocess.call( + [ + self.qpdf_path, + "--no-warn", + "--stream-data=compress", + in_pdf, + out_pdf, + ] + ) + if not (status == 0 or status == 3): + raise _CalledProcessError( + "%s failed to compress the PDF file." % self.pdftool + ) + + def uncompress_pdf(self, in_pdf, out_pdf): + """ Uncompress a pdf file """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",] + ) + elif self.pdftool == "qpdf": + status = subprocess.call( + [self.qpdf_path, "--stream-data=uncompress", in_pdf, out_pdf,] + ) + if not status == 0: + raise _CalledProcessError( + "%s failed to uncompress the PDF file." % self.pdftool + ) + def run(self, src, filename=None): # follow_redirects here is needed with library use if os.path.exists(src): diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 913e015..74043ed 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -10,14 +10,10 @@ Copyright: 2019, G.J.J. van den Burg import os import re -import subprocess from ._info import Informer from ._base import Provider -from ..exceptions import ( - URLResolutionError, - _CalledProcessError as CalledProcessError, -) +from ..exceptions import URLResolutionError from ..log import Logger logger = Logger() @@ -71,21 +67,9 @@ class Arxiv(Provider): """Remove the arXiv timestamp from a pdf""" logger.info("Removing arXiv timestamp") basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - status = subprocess.call( - [ - self.pdftk_path, - input_file, - "output", - uncompress_file, - "uncompress", - ] - ) - if not status == 0: - raise CalledProcessError( - "pdftk failed to uncompress the PDF file." 
- ) + uncompress_file = basename + "_uncompress.pdf" + self.uncompress_pdf(input_file, uncompress_file) with open(uncompress_file, "rb") as fid: data = fid.read() @@ -103,10 +87,6 @@ class Arxiv(Provider): oid.write(data) output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - raise CalledProcessError("pdftk failed to compress the PDF file.") + self.compress_pdf(removed_file, output_file) return output_file diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 12443d4..bfb3647 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -59,12 +59,8 @@ def parse_args(): action="store_true", ) parser.add_argument( - '-k', - '--no-crop', - help="Don't crop the pdf file", - action="store_true" - ) - + "-k", "--no-crop", help="Don't crop the pdf file", action="store_true" + ) parser.add_argument( "-v", "--verbose", help="be verbose", action="store_true" ) @@ -94,6 +90,11 @@ def parse_args(): default="pdftk", ) parser.add_argument( + "--qpdf", + help="path to qpdf executable (default: qpdf)", + default="qpdf", + ) + parser.add_argument( "--rmapi", help="path to rmapi executable (default: rmapi)", default="rmapi", @@ -159,6 +160,7 @@ def main(): rmapi_path=args.rmapi, pdftoppm_path=args.pdftoppm, pdftk_path=args.pdftk, + qpdf_path=args.qpdf, gs_path=args.gs, cookiejar=cookiejar, ) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 592dcd3..39cf547 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -17,7 +17,7 @@ import time import unidecode from .log import Logger -from .exceptions import FileTypeError, RemarkableError +from .exceptions import FileTypeError, RemarkableError, NoPDFToolError HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -148,6 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) # Upload the file + logger.info("%s put 
%s %s/" % (rmapi_path, filepath, remarkable_dir)) status = subprocess.call( [rmapi_path, "put", filepath, remarkable_dir + "/"], stdout=subprocess.DEVNULL, @@ -165,3 +166,23 @@ def is_url(string): string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None + + +def check_pdftool(pdftk_path, qpdf_path): + """Check whether we have pdftk or qpdf available""" + # set defaults in case either is set to None or something + pdftk_path = pdftk_path or 'false' + qpdf_path = qpdf_path or 'false' + + status = subprocess.call( + [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + if status == 0: + return "pdftk" + status = subprocess.call( + [qpdf_path, '--help'], stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ) + if status == 0: + return "qpdf" + raise NoPDFToolError diff --git a/tests/test_providers.py b/tests/test_providers.py index 2bf7507..e539949 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -69,7 +69,8 @@ class TestProviders(unittest.TestCase): shutil.rmtree(self.test_dir) def test_arxiv_1(self): - prov = Arxiv(upload=False, verbose=VERBOSE) + # check with qpdf + prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) |
