about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-04 14:58:45 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-04 14:58:45 +0100
commit: c0a96f196b5a5c0931cd8d9aed1edecdc16c16af (patch)
tree: b9b139609e1bddef0eea42550b5ca9afbd50f2fb
parent: Merge branch 'ClaytonJY-upgrade-cairo' (diff)
parent: Code formatting (diff)
download: paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.tar.gz
download: paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.zip
Merge branch 'qpdfSwitch' of https://github.com/delaere/paper2remarkable into delaere-qpdfSwitch
-rw-r--r-- .travis.yml 2
-rw-r--r-- paper2remarkable/exceptions.py 29
-rw-r--r-- paper2remarkable/providers/_base.py 54
-rw-r--r-- paper2remarkable/providers/arxiv.py 28
-rw-r--r-- paper2remarkable/ui.py 14
-rw-r--r-- paper2remarkable/utils.py 23
-rw-r--r-- tests/test_providers.py 3
7 files changed, 109 insertions, 44 deletions
diff --git a/.travis.yml b/.travis.yml
index f412f9b..e2edaaa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,7 +6,7 @@ python:
before_install:
- sudo apt-get update
- - sudo apt-get install ghostscript pdftk poppler-utils
+ - sudo apt-get install ghostscript pdftk poppler-utils qpdf
install:
- pip install -e .[dev]
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index a608bcc..5ea9a78 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -90,16 +90,29 @@ class RemarkableError(Error):
return msg
-class _CalledProcessError(CalledProcessError):
- """Exception raised when subprocesses fail.
+class _CalledProcessError(Error):
+ """Exception raised when subprocesses fail. """
+
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ msg = "ERROR: {message}".format(message=self.message)
+ msg += GH_MSG
+ return msg
- We subclass the CalledProcessError so we can add our custom error message.
- """
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
+class NoPDFToolError(Error):
+ """Exception raised when neither pdftk or qpdf is found."""
+
+ def __init__(self):
+ pass
def __str__(self):
- parent = super().__str__()
- msg = parent + GH_MSG
+ msg = (
+ "ERROR: Neither pdftk or qpdf could be found. Install "
+ "either of these or ensure that they can be found using "
+ "the --pdftk or --qpdf options."
+ )
+ msg += GH_MSG
return msg
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 96fb151..0374213 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -11,18 +11,21 @@ Copyright: 2019, G.J.J. van den Burg
import abc
import os
import shutil
+import subprocess
import tempfile
import time
-from ._info import Informer
+from ..exceptions import _CalledProcessError
+from ..log import Logger
from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf
from ..utils import (
assert_file_is_pdf,
+ check_pdftool,
download_url,
- upload_to_remarkable,
follow_redirects,
+ upload_to_remarkable,
)
-from ..log import Logger
+from ._info import Informer
logger = Logger()
@@ -43,6 +46,7 @@ class Provider(metaclass=abc.ABCMeta):
rmapi_path="rmapi",
pdftoppm_path="pdftoppm",
pdftk_path="pdftk",
+ qpdf_path="qpdf",
gs_path="gs",
cookiejar=None,
):
@@ -52,10 +56,13 @@ class Provider(metaclass=abc.ABCMeta):
self.rmapi_path = rmapi_path
self.pdftoppm_path = pdftoppm_path
self.pdftk_path = pdftk_path
+ self.qpdf_path = qpdf_path
self.gs_path = gs_path
self.informer = Informer()
self.cookiejar = cookiejar
+ self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path)
+
# wait time to not hit the server too frequently
self.server_delay = 0
@@ -108,6 +115,47 @@ class Provider(metaclass=abc.ABCMeta):
# This must exist so that the LocalFile provider can overwrite it
download_url(pdf_url, filename, cookiejar=self.cookiejar)
+ def compress_pdf(self, in_pdf, out_pdf):
+ """ Compress a pdf file, returns subprocess status """
+ if self.pdftool == "pdftk":
+ status = subprocess.call(
+ [self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
+ )
+ elif self.pdftool == "qpdf":
+ # TODO: the --no-warn option is only needed because when we remove
+ # the arXiv stamp we don't fix the length of the pdf object. This
+ # causes qpdf to raise a warning and give a nonzero exit status
+ # (3). Fixing the pdf object is the right approach, but this does
+ # work as qpdf fixes the file.
+ status = subprocess.call(
+ [
+ self.qpdf_path,
+ "--no-warn",
+ "--stream-data=compress",
+ in_pdf,
+ out_pdf,
+ ]
+ )
+ if not (status == 0 or status == 3):
+ raise _CalledProcessError(
+ "%s failed to compress the PDF file." % self.pdftool
+ )
+
+ def uncompress_pdf(self, in_pdf, out_pdf):
+ """ Uncompress a pdf file """
+ if self.pdftool == "pdftk":
+ status = subprocess.call(
+ [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",]
+ )
+ elif self.pdftool == "qpdf":
+ status = subprocess.call(
+ [self.qpdf_path, "--stream-data=uncompress", in_pdf, out_pdf,]
+ )
+ if not status == 0:
+ raise _CalledProcessError(
+ "%s failed to uncompress the PDF file." % self.pdftool
+ )
+
def run(self, src, filename=None):
# follow_redirects here is needed with library use
if os.path.exists(src):
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 913e015..74043ed 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -10,14 +10,10 @@ Copyright: 2019, G.J.J. van den Burg
import os
import re
-import subprocess
from ._info import Informer
from ._base import Provider
-from ..exceptions import (
- URLResolutionError,
- _CalledProcessError as CalledProcessError,
-)
+from ..exceptions import URLResolutionError
from ..log import Logger
logger = Logger()
@@ -71,21 +67,9 @@ class Arxiv(Provider):
"""Remove the arXiv timestamp from a pdf"""
logger.info("Removing arXiv timestamp")
basename = os.path.splitext(input_file)[0]
- uncompress_file = basename + "_uncompress.pdf"
- status = subprocess.call(
- [
- self.pdftk_path,
- input_file,
- "output",
- uncompress_file,
- "uncompress",
- ]
- )
- if not status == 0:
- raise CalledProcessError(
- "pdftk failed to uncompress the PDF file."
- )
+ uncompress_file = basename + "_uncompress.pdf"
+ self.uncompress_pdf(input_file, uncompress_file)
with open(uncompress_file, "rb") as fid:
data = fid.read()
@@ -103,10 +87,6 @@ class Arxiv(Provider):
oid.write(data)
output_file = basename + "_dearxiv.pdf"
- status = subprocess.call(
- [self.pdftk_path, removed_file, "output", output_file, "compress"]
- )
- if not status == 0:
- raise CalledProcessError("pdftk failed to compress the PDF file.")
+ self.compress_pdf(removed_file, output_file)
return output_file
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 12443d4..bfb3647 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -59,12 +59,8 @@ def parse_args():
action="store_true",
)
parser.add_argument(
- '-k',
- '--no-crop',
- help="Don't crop the pdf file",
- action="store_true"
- )
-
+ "-k", "--no-crop", help="Don't crop the pdf file", action="store_true"
+ )
parser.add_argument(
"-v", "--verbose", help="be verbose", action="store_true"
)
@@ -94,6 +90,11 @@ def parse_args():
default="pdftk",
)
parser.add_argument(
+ "--qpdf",
+ help="path to qpdf executable (default: qpdf)",
+ default="qpdf",
+ )
+ parser.add_argument(
"--rmapi",
help="path to rmapi executable (default: rmapi)",
default="rmapi",
@@ -159,6 +160,7 @@ def main():
rmapi_path=args.rmapi,
pdftoppm_path=args.pdftoppm,
pdftk_path=args.pdftk,
+ qpdf_path=args.qpdf,
gs_path=args.gs,
cookiejar=cookiejar,
)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 592dcd3..39cf547 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -17,7 +17,7 @@ import time
import unidecode
from .log import Logger
-from .exceptions import FileTypeError, RemarkableError
+from .exceptions import FileTypeError, RemarkableError, NoPDFToolError
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -148,6 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
)
# Upload the file
+ logger.info("%s put %s %s/" % (rmapi_path, filepath, remarkable_dir))
status = subprocess.call(
[rmapi_path, "put", filepath, remarkable_dir + "/"],
stdout=subprocess.DEVNULL,
@@ -165,3 +166,23 @@ def is_url(string):
string = string.strip(" ")
match = regex.fullmatch(pattern, string)
return match is not None
+
+
+def check_pdftool(pdftk_path, qpdf_path):
+ """Check whether we have pdftk or qpdf available"""
+ # set defaults in case either is set to None or something
+ pdftk_path = pdftk_path or 'false'
+ qpdf_path = qpdf_path or 'false'
+
+ status = subprocess.call(
+ [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+ )
+ if status == 0:
+ return "pdftk"
+ status = subprocess.call(
+ [qpdf_path, '--help'], stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL
+ )
+ if status == 0:
+ return "qpdf"
+ raise NoPDFToolError
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 2bf7507..e539949 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -69,7 +69,8 @@ class TestProviders(unittest.TestCase):
shutil.rmtree(self.test_dir)
def test_arxiv_1(self):
- prov = Arxiv(upload=False, verbose=VERBOSE)
+ # check with qpdf
+ prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None)
url = "https://arxiv.org/abs/1811.11242v1"
exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
filename = prov.run(url)