Merge branch 'bugfix/dearxiv'

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-05-03 21:16:22 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-05-03 21:16:22 +0100
commit: 64a352ff6654744b9738fe32c01679032d42d6c1 (patch)
tree: 5d9b7d17894e51021d905171f00f4b3e450b4dba
parent: Bump version and update changelog (diff)
parent: Merge branch 'master' into bugfix/dearxiv (diff)
download: paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.tar.gz paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.zip
4 files changed, 129 insertions, 29 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index c3abe19..5ca3588 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -131,8 +131,29 @@ class Provider(metaclass=abc.ABCMeta):
                 "%s failed to compress the PDF file." % self.pdftool
             )
 
+    def rewrite_pdf(self, in_pdf, out_pdf):
+        """ Re-write the pdf using Ghostscript
+
+        This helps avoid issues in dearxiv due to nested pdfs.
+        """
+        status = subprocess.call(
+            [
+                self.gs_path,
+                "-sDEVICE=pdfwrite",
+                "-dQUIET",
+                "-o",
+                out_pdf,
+                in_pdf,
+            ]
+        )
+        if not status == 0:
+            raise _CalledProcessError(
+                "Failed to rewrite the pdf with GhostScript"
+            )
+
     def uncompress_pdf(self, in_pdf, out_pdf):
         """ Uncompress a pdf file """
+
         if self.pdftool == "pdftk":
             status = subprocess.call(
                 [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",]
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 47da448..4d0bc19 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -18,8 +18,9 @@ from ..log import Logger
 
 logger = Logger()
 
-DEARXIV_TEXT_REGEX = (
-    b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}"
+DEARXIV_TEXT_REGEX = b"ar(x|X)iv:(\d{4}\.|[\w\-]+\/)\d+v\d+(\s+\[[\w\-]+\.[\w\-]+\])?\s+\d{1,2}\s\w{3}\s\d{4}"
+DEARXIV_URI_REGEX = (
+    b"https?://ar(x|X)iv\.org\/abs\/([\w\-]+\/\d+|\d{4}\.\d{4,5})v\d+"
 )
 
 
@@ -32,8 +33,8 @@ class Arxiv(Provider):
     re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
     re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
 
-    re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?"
-    re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf"
+    re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
+    re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -68,8 +69,11 @@ class Arxiv(Provider):
         logger.info("Removing arXiv timestamp ... ", end="")
         basename = os.path.splitext(input_file)[0]
 
+        recoded_file = basename + "_rewrite.pdf"
+        self.rewrite_pdf(input_file, recoded_file)
+
         uncompress_file = basename + "_uncompress.pdf"
-        self.uncompress_pdf(input_file, uncompress_file)
+        self.uncompress_pdf(recoded_file, uncompress_file)
 
         new_data = []
         current_obj = []
@@ -90,36 +94,42 @@ class Arxiv(Provider):
                     current_obj.append(line)
                     objid = int(line.split(b" ")[0])
                     xref[objid] = char_count
-                elif current_obj and line.startswith(b"endobj"):
+                elif current_obj and (
+                    line.startswith(b"endobj")
+                    and not line.startswith(b"endobj xref")
+                ):
                     # End the current object. If needed, replace the arXiv
                     # stamp in the block (done only once). Reset current
                     # object.
                     current_obj.append(line)
                     block = b"".join(current_obj)
-                    if not replaced_arXiv and b"arXivStAmP" in block:
-                        # remove the text
-                        block, n_subs1 = re.subn(
-                            b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
-                            b"()Tj",
-                            block,
-                        )
-                        # remove the url
-                        block, n_subs2 = re.subn(
-                            b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
-                            b"",
-                            block,
-                        )
-                        if n_subs1 or n_subs2:
-                            # fix the length of the object stream
-                            block = fix_stream_length(block)
-                            replaced_arXiv = True
+                    # remove the text
+                    block, n_subs1 = re.subn(
+                        b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", block,
+                    )
+                    # remove the url
+                    block, n_subs2 = re.subn(
+                        b"<<\n\/URI \("
+                        + DEARXIV_URI_REGEX
+                        + b"\)\n\/S /URI\n>>\n",
+                        b"",
+                        block,
+                    )
+                    if n_subs1 or n_subs2:
+                        # fix the length of the object stream
+                        block = fix_stream_length(block)
+                        replaced_arXiv = True
                     new_data.append(block)
                     char_count += len(block)
                     current_obj = []
-                elif current_obj:
-                    # If we're recording an object, simply add the line to it
-                    current_obj.append(line)
                 elif line in [b"xref\n", b"endobj xref\n"]:
+                    if b"endobj" in line and current_obj:
+                        current_obj.append(b"endobj\n")
+                        block = b"".join(current_obj)
+                        new_data.append(block)
+                        char_count += len(block)
+                        current_obj = []
+                        line = b"xref\n"
                     # We found the xref table, record its position and write it
                     # out using our updated indices.
                     startxref = sum(map(len, new_data))
@@ -131,6 +141,9 @@ class Arxiv(Provider):
 
                     # skip the appropriate number of lines
                     skip_n = len(xref) + 2
+                elif current_obj:
+                    # If we're recording an object, simply add the line to it
+                    current_obj.append(line)
                 elif line == b"startxref\n":
                     # Write out our recorded startxref position, skip the old
                     # position.
@@ -148,7 +161,7 @@ class Arxiv(Provider):
         output_file = basename + "_dearxiv.pdf"
         self.compress_pdf(removed_file, output_file)
 
-        logger.append("success" if replaced_arXiv else "failed", "info")
+        logger.append("success" if replaced_arXiv else "none found", "info")
 
         return output_file
 
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index 08ea2c4..2cb84cf 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -13,7 +13,11 @@ import shutil
 import tempfile
 import unittest
 
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
+from paper2remarkable.providers.arxiv import (
+    DEARXIV_TEXT_REGEX,
+    DEARXIV_URI_REGEX,
+    Arxiv,
+)
 
 
 class TestArxiv(unittest.TestCase):
@@ -39,6 +43,26 @@ class TestArxiv(unittest.TestCase):
         m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
         self.assertIsNotNone(m)
 
+    def test_text_regex_3(self):
+        key = b"arXiv:physics/0605197v1  [physics.data-an]  23 May 2006"
+        m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+        self.assertIsNotNone(m)
+
+    def test_text_regex_4(self):
+        key = b"arXiv:math/0309285v2  [math.NA]  9 Apr 2004"
+        m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+        self.assertIsNotNone(m)
+
+    def test_uri_regex_1(self):
+        key = b"http://arxiv.org/abs/physics/0605197v1"
+        m = re.fullmatch(DEARXIV_URI_REGEX, key)
+        self.assertIsNotNone(m)
+
+    def test_uri_regex_2(self):
+        key = b"https://arxiv.org/abs/1101.0028v3"
+        m = re.fullmatch(DEARXIV_URI_REGEX, key)
+        self.assertIsNotNone(m)
+
     def test_stamp_removed_1(self):
         url = "https://arxiv.org/pdf/1703.06103.pdf"
         prov = Arxiv(upload=False)
@@ -57,6 +81,42 @@ class TestArxiv(unittest.TestCase):
             data = fp.read()
         self.assertNotIn(b"arXiv:2003.06222v1  [stat.ML]  13 Mar 2020", data)
 
+    def test_stamp_removed_3(self):
+        url = "https://arxiv.org/abs/physics/0605197v1"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(
+            b"arXiv:physics/0605197v1  [physics.data-an]  23 May 2006", data
+        )
+        self.assertNotIn(
+            b"/URI (http://arxiv.org/abs/physics/0605197v1)", data
+        )
+
+    def test_stamp_removed_4(self):
+        url = "https://arxiv.org/abs/math/0309285v2"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(b"arXiv:math/0309285v2  [math.NA]  9 Apr 2004", data)
+        self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data)
+
+    def test_stamp_removed_5(self):
+        url = "https://arxiv.org/abs/astro-ph/9207001v1"
+        prov = Arxiv(upload=False)
+        filename = prov.run(url, filename="./target.pdf")
+        prov.uncompress_pdf(filename, "unc.pdf")
+        with open("unc.pdf", "rb") as fp:
+            data = fp.read()
+        self.assertNotIn(
+            b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data
+        )
+        self.assertNotIn(b"arXiv:astro-ph/9207001v1  13 Jul 1992", data)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_providers.py b/tests/test_providers.py
index a7f17ff..d2fdb0d 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -86,6 +86,13 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp_filename, os.path.basename(filename))
 
+    def test_arxiv_5(self):
+        prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None)
+        url = "https://arxiv.org/abs/2002.11523"
+        exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
     def test_pmc(self):
         prov = PubMed(upload=False, verbose=VERBOSE)
         url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
@@ -256,6 +263,5 @@ class TestProviders(unittest.TestCase):
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
 
-
 if __name__ == "__main__":
     unittest.main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-05-03 21:16:22 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-05-03 21:16:22 +0100
commit	64a352ff6654744b9738fe32c01679032d42d6c1 (patch)
tree	5d9b7d17894e51021d905171f00f4b3e450b4dba
parent	Bump version and update changelog (diff)
parent	Merge branch 'master' into bugfix/dearxiv (diff)
download	paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.tar.gz paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.zip