Merge branch 'qpdfSwitch' of https://github.com/delaere/paper2remarkable into delaere-qpdfSwitch

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-04 14:58:45 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-04 14:58:45 +0100
commit: c0a96f196b5a5c0931cd8d9aed1edecdc16c16af (patch)
tree: b9b139609e1bddef0eea42550b5ca9afbd50f2fb
parent: Merge branch 'ClaytonJY-upgrade-cairo' (diff)
parent: Code formatting (diff)
download: paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.tar.gz
paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.zip
7 files changed, 109 insertions, 44 deletions
diff --git a/.travis.yml b/.travis.yml
index f412f9b..e2edaaa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,7 +6,7 @@ python:
 
 before_install:
   - sudo apt-get update
-  - sudo apt-get install ghostscript pdftk poppler-utils
+  - sudo apt-get install ghostscript pdftk poppler-utils qpdf
 
 install:
   - pip install -e .[dev]
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index a608bcc..5ea9a78 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -90,16 +90,29 @@ class RemarkableError(Error):
         return msg
 
 
-class _CalledProcessError(CalledProcessError):
-    """Exception raised when subprocesses fail.
+class _CalledProcessError(Error):
+    """Exception raised when subprocesses fail.  """
+
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        msg = "ERROR: {message}".format(message=self.message)
+        msg += GH_MSG
+        return msg
 
-    We subclass the CalledProcessError so we can add our custom error message.
-    """
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+class NoPDFToolError(Error):
+    """Exception raised when neither pdftk nor qpdf is found."""
+
+    def __init__(self):
+        pass
 
     def __str__(self):
-        parent = super().__str__()
-        msg = parent + GH_MSG
+        msg = (
+            "ERROR: Neither pdftk or qpdf could be found. Install "
+            "either of these or ensure that they can be found using "
+            "the --pdftk or --qpdf options."
+        )
+        msg += GH_MSG
         return msg
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 96fb151..0374213 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -11,18 +11,21 @@ Copyright: 2019, G.J.J. van den Burg
 import abc
 import os
 import shutil
+import subprocess
 import tempfile
 import time
 
-from ._info import Informer
+from ..exceptions import _CalledProcessError
+from ..log import Logger
 from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf
 from ..utils import (
     assert_file_is_pdf,
+    check_pdftool,
     download_url,
-    upload_to_remarkable,
     follow_redirects,
+    upload_to_remarkable,
 )
-from ..log import Logger
+from ._info import Informer
 
 logger = Logger()
 
@@ -43,6 +46,7 @@ class Provider(metaclass=abc.ABCMeta):
         rmapi_path="rmapi",
         pdftoppm_path="pdftoppm",
         pdftk_path="pdftk",
+        qpdf_path="qpdf",
         gs_path="gs",
         cookiejar=None,
     ):
@@ -52,10 +56,13 @@ class Provider(metaclass=abc.ABCMeta):
         self.rmapi_path = rmapi_path
         self.pdftoppm_path = pdftoppm_path
         self.pdftk_path = pdftk_path
+        self.qpdf_path = qpdf_path
         self.gs_path = gs_path
         self.informer = Informer()
         self.cookiejar = cookiejar
 
+        self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path)
+
         # wait time to not hit the server too frequently
         self.server_delay = 0
 
@@ -108,6 +115,47 @@ class Provider(metaclass=abc.ABCMeta):
         # This must exist so that the LocalFile provider can overwrite it
         download_url(pdf_url, filename, cookiejar=self.cookiejar)
 
+    def compress_pdf(self, in_pdf, out_pdf):
+        """Compress a pdf file; raise _CalledProcessError on failure."""
+        if self.pdftool == "pdftk":
+            status = subprocess.call(
+                [self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
+            )
+        elif self.pdftool == "qpdf":
+            # TODO: the --no-warn option is only needed because when we remove
+            # the arXiv stamp we don't fix the length of the pdf object. This
+            # causes qpdf to raise a warning and give a nonzero exit status
+            # (3). Fixing the pdf object is the right approach, but this
+            # workaround is sufficient because qpdf repairs the file itself.
+            status = subprocess.call(
+                [
+                    self.qpdf_path,
+                    "--no-warn",
+                    "--stream-data=compress",
+                    in_pdf,
+                    out_pdf,
+                ]
+            )
+        if not (status == 0 or status == 3):
+            raise _CalledProcessError(
+                "%s failed to compress the PDF file." % self.pdftool
+            )
+
+    def uncompress_pdf(self, in_pdf, out_pdf):
+        """ Uncompress a pdf file """
+        if self.pdftool == "pdftk":
+            status = subprocess.call(
+                [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",]
+            )
+        elif self.pdftool == "qpdf":
+            status = subprocess.call(
+                [self.qpdf_path, "--stream-data=uncompress", in_pdf, out_pdf,]
+            )
+        if not status == 0:
+            raise _CalledProcessError(
+                "%s failed to uncompress the PDF file." % self.pdftool
+            )
+
     def run(self, src, filename=None):
         # follow_redirects here is needed with library use
         if os.path.exists(src):
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 913e015..74043ed 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -10,14 +10,10 @@ Copyright: 2019, G.J.J. van den Burg
 
 import os
 import re
-import subprocess
 
 from ._info import Informer
 from ._base import Provider
-from ..exceptions import (
-    URLResolutionError,
-    _CalledProcessError as CalledProcessError,
-)
+from ..exceptions import URLResolutionError
 from ..log import Logger
 
 logger = Logger()
@@ -71,21 +67,9 @@ class Arxiv(Provider):
         """Remove the arXiv timestamp from a pdf"""
         logger.info("Removing arXiv timestamp")
         basename = os.path.splitext(input_file)[0]
-        uncompress_file = basename + "_uncompress.pdf"
 
-        status = subprocess.call(
-            [
-                self.pdftk_path,
-                input_file,
-                "output",
-                uncompress_file,
-                "uncompress",
-            ]
-        )
-        if not status == 0:
-            raise CalledProcessError(
-                "pdftk failed to uncompress the PDF file."
-            )
+        uncompress_file = basename + "_uncompress.pdf"
+        self.uncompress_pdf(input_file, uncompress_file)
 
         with open(uncompress_file, "rb") as fid:
             data = fid.read()
@@ -103,10 +87,6 @@ class Arxiv(Provider):
             oid.write(data)
 
         output_file = basename + "_dearxiv.pdf"
-        status = subprocess.call(
-            [self.pdftk_path, removed_file, "output", output_file, "compress"]
-        )
-        if not status == 0:
-            raise CalledProcessError("pdftk failed to compress the PDF file.")
+        self.compress_pdf(removed_file, output_file)
 
         return output_file
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 12443d4..bfb3647 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -59,12 +59,8 @@ def parse_args():
         action="store_true",
     )
     parser.add_argument(
-            '-k',
-            '--no-crop',
-            help="Don't crop the pdf file",
-            action="store_true"
-            )
-
+        "-k", "--no-crop", help="Don't crop the pdf file", action="store_true"
+    )
     parser.add_argument(
         "-v", "--verbose", help="be verbose", action="store_true"
     )
@@ -94,6 +90,11 @@ def parse_args():
         default="pdftk",
     )
     parser.add_argument(
+        "--qpdf",
+        help="path to qpdf executable (default: qpdf)",
+        default="qpdf",
+    )
+    parser.add_argument(
         "--rmapi",
         help="path to rmapi executable (default: rmapi)",
         default="rmapi",
@@ -159,6 +160,7 @@ def main():
         rmapi_path=args.rmapi,
         pdftoppm_path=args.pdftoppm,
         pdftk_path=args.pdftk,
+        qpdf_path=args.qpdf,
         gs_path=args.gs,
         cookiejar=cookiejar,
     )
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 592dcd3..39cf547 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -17,7 +17,7 @@ import time
 import unidecode
 
 from .log import Logger
-from .exceptions import FileTypeError, RemarkableError
+from .exceptions import FileTypeError, RemarkableError, NoPDFToolError
 
 HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -148,6 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
                 )
 
     # Upload the file
+    logger.info("%s put %s %s/" % (rmapi_path, filepath, remarkable_dir))
     status = subprocess.call(
         [rmapi_path, "put", filepath, remarkable_dir + "/"],
         stdout=subprocess.DEVNULL,
@@ -165,3 +166,23 @@ def is_url(string):
     string = string.strip(" ")
     match = regex.fullmatch(pattern, string)
     return match is not None
+
+
+def check_pdftool(pdftk_path, qpdf_path):
+    """Check whether we have pdftk or qpdf available"""
+    # fall back to the "false" command when a path is None or empty
+    pdftk_path = pdftk_path or 'false'
+    qpdf_path = qpdf_path or 'false'
+
+    status = subprocess.call(
+        [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+    )
+    if status == 0:
+        return "pdftk"
+    status = subprocess.call(
+        [qpdf_path, '--help'], stdout=subprocess.DEVNULL, 
+        stderr=subprocess.DEVNULL
+    )
+    if status == 0:
+        return "qpdf"
+    raise NoPDFToolError
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 2bf7507..e539949 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -69,7 +69,8 @@ class TestProviders(unittest.TestCase):
         shutil.rmtree(self.test_dir)
 
     def test_arxiv_1(self):
-        prov = Arxiv(upload=False, verbose=VERBOSE)
+        # check with qpdf
+        prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None)
         url = "https://arxiv.org/abs/1811.11242v1"
         exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
         filename = prov.run(url)
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-04-04 14:58:45 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-04-04 14:58:45 +0100
commit	c0a96f196b5a5c0931cd8d9aed1edecdc16c16af (patch)
tree	b9b139609e1bddef0eea42550b5ca9afbd50f2fb
parent	Merge branch 'ClaytonJY-upgrade-cairo' (diff)
parent	Code formatting (diff)
download	paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.tar.gz paper2remarkable-c0a96f196b5a5c0931cd8d9aed1edecdc16c16af.zip