about summary refs log tree commit diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-05-03 21:16:22 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-05-03 21:16:22 +0100
commit 64a352ff6654744b9738fe32c01679032d42d6c1 (patch)
tree 5d9b7d17894e51021d905171f00f4b3e450b4dba
parent Bump version and update changelog (diff)
parent Merge branch 'master' into bugfix/dearxiv (diff)
download paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.tar.gz
paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.zip
Merge branch 'bugfix/dearxiv'
-rw-r--r-- paper2remarkable/providers/_base.py 21
-rw-r--r-- paper2remarkable/providers/arxiv.py 67
-rw-r--r-- tests/test_arxiv.py 62
-rw-r--r-- tests/test_providers.py 8
4 files changed, 129 insertions, 29 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index c3abe19..5ca3588 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -131,8 +131,29 @@ class Provider(metaclass=abc.ABCMeta):
"%s failed to compress the PDF file." % self.pdftool
)
+ def rewrite_pdf(self, in_pdf, out_pdf):
+ """ Re-write the pdf using Ghostscript
+
+ This helps avoid issues in dearxiv due to nested pdfs.
+ """
+ status = subprocess.call(
+ [
+ self.gs_path,
+ "-sDEVICE=pdfwrite",
+ "-dQUIET",
+ "-o",
+ out_pdf,
+ in_pdf,
+ ]
+ )
+ if not status == 0:
+ raise _CalledProcessError(
+ "Failed to rewrite the pdf with GhostScript"
+ )
+
def uncompress_pdf(self, in_pdf, out_pdf):
""" Uncompress a pdf file """
+
if self.pdftool == "pdftk":
status = subprocess.call(
[self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",]
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 47da448..4d0bc19 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -18,8 +18,9 @@ from ..log import Logger
logger = Logger()
-DEARXIV_TEXT_REGEX = (
- b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}"
+DEARXIV_TEXT_REGEX = b"ar(x|X)iv:(\d{4}\.|[\w\-]+\/)\d+v\d+(\s+\[[\w\-]+\.[\w\-]+\])?\s+\d{1,2}\s\w{3}\s\d{4}"
+DEARXIV_URI_REGEX = (
+ b"https?://ar(x|X)iv\.org\/abs\/([\w\-]+\/\d+|\d{4}\.\d{4,5})v\d+"
)
@@ -32,8 +33,8 @@ class Arxiv(Provider):
re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
- re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?"
- re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf"
+ re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
+ re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -68,8 +69,11 @@ class Arxiv(Provider):
logger.info("Removing arXiv timestamp ... ", end="")
basename = os.path.splitext(input_file)[0]
+ recoded_file = basename + "_rewrite.pdf"
+ self.rewrite_pdf(input_file, recoded_file)
+
uncompress_file = basename + "_uncompress.pdf"
- self.uncompress_pdf(input_file, uncompress_file)
+ self.uncompress_pdf(recoded_file, uncompress_file)
new_data = []
current_obj = []
@@ -90,36 +94,42 @@ class Arxiv(Provider):
current_obj.append(line)
objid = int(line.split(b" ")[0])
xref[objid] = char_count
- elif current_obj and line.startswith(b"endobj"):
+ elif current_obj and (
+ line.startswith(b"endobj")
+ and not line.startswith(b"endobj xref")
+ ):
# End the current object. If needed, replace the arXiv
# stamp in the block (done only once). Reset current
# object.
current_obj.append(line)
block = b"".join(current_obj)
- if not replaced_arXiv and b"arXivStAmP" in block:
- # remove the text
- block, n_subs1 = re.subn(
- b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
- b"()Tj",
- block,
- )
- # remove the url
- block, n_subs2 = re.subn(
- b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
- b"",
- block,
- )
- if n_subs1 or n_subs2:
- # fix the length of the object stream
- block = fix_stream_length(block)
- replaced_arXiv = True
+ # remove the text
+ block, n_subs1 = re.subn(
+ b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", block,
+ )
+ # remove the url
+ block, n_subs2 = re.subn(
+ b"<<\n\/URI \("
+ + DEARXIV_URI_REGEX
+ + b"\)\n\/S /URI\n>>\n",
+ b"",
+ block,
+ )
+ if n_subs1 or n_subs2:
+ # fix the length of the object stream
+ block = fix_stream_length(block)
+ replaced_arXiv = True
new_data.append(block)
char_count += len(block)
current_obj = []
- elif current_obj:
- # If we're recording an object, simply add the line to it
- current_obj.append(line)
elif line in [b"xref\n", b"endobj xref\n"]:
+ if b"endobj" in line and current_obj:
+ current_obj.append(b"endobj\n")
+ block = b"".join(current_obj)
+ new_data.append(block)
+ char_count += len(block)
+ current_obj = []
+ line = b"xref\n"
# We found the xref table, record its position and write it
# out using our updated indices.
startxref = sum(map(len, new_data))
@@ -131,6 +141,9 @@ class Arxiv(Provider):
# skip the appropriate number of lines
skip_n = len(xref) + 2
+ elif current_obj:
+ # If we're recording an object, simply add the line to it
+ current_obj.append(line)
elif line == b"startxref\n":
# Write out our recorded startxref position, skip the old
# position.
@@ -148,7 +161,7 @@ class Arxiv(Provider):
output_file = basename + "_dearxiv.pdf"
self.compress_pdf(removed_file, output_file)
- logger.append("success" if replaced_arXiv else "failed", "info")
+ logger.append("success" if replaced_arXiv else "none found", "info")
return output_file
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
index 08ea2c4..2cb84cf 100644
--- a/tests/test_arxiv.py
+++ b/tests/test_arxiv.py
@@ -13,7 +13,11 @@ import shutil
import tempfile
import unittest
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv
+from paper2remarkable.providers.arxiv import (
+ DEARXIV_TEXT_REGEX,
+ DEARXIV_URI_REGEX,
+ Arxiv,
+)
class TestArxiv(unittest.TestCase):
@@ -39,6 +43,26 @@ class TestArxiv(unittest.TestCase):
m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
self.assertIsNotNone(m)
+ def test_text_regex_3(self):
+ key = b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_text_regex_4(self):
+ key = b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_uri_regex_1(self):
+ key = b"http://arxiv.org/abs/physics/0605197v1"
+ m = re.fullmatch(DEARXIV_URI_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_uri_regex_2(self):
+ key = b"https://arxiv.org/abs/1101.0028v3"
+ m = re.fullmatch(DEARXIV_URI_REGEX, key)
+ self.assertIsNotNone(m)
+
def test_stamp_removed_1(self):
url = "https://arxiv.org/pdf/1703.06103.pdf"
prov = Arxiv(upload=False)
@@ -57,6 +81,42 @@ class TestArxiv(unittest.TestCase):
data = fp.read()
self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data)
+ def test_stamp_removed_3(self):
+ url = "https://arxiv.org/abs/physics/0605197v1"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(
+ b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006", data
+ )
+ self.assertNotIn(
+ b"/URI (http://arxiv.org/abs/physics/0605197v1)", data
+ )
+
+ def test_stamp_removed_4(self):
+ url = "https://arxiv.org/abs/math/0309285v2"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004", data)
+ self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data)
+
+ def test_stamp_removed_5(self):
+ url = "https://arxiv.org/abs/astro-ph/9207001v1"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(
+ b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data
+ )
+ self.assertNotIn(b"arXiv:astro-ph/9207001v1 13 Jul 1992", data)
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_providers.py b/tests/test_providers.py
index a7f17ff..d2fdb0d 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -86,6 +86,13 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_arxiv_5(self):
+ prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None)
+ url = "https://arxiv.org/abs/2002.11523"
+ exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_pmc(self):
prov = PubMed(upload=False, verbose=VERBOSE)
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
@@ -256,6 +263,5 @@ class TestProviders(unittest.TestCase):
self.assertEqual(4, len(pdfplumber.open(filename).pages))
-
if __name__ == "__main__":
unittest.main()