diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-05-03 21:16:22 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-05-03 21:16:22 +0100 |
| commit | 64a352ff6654744b9738fe32c01679032d42d6c1 (patch) | |
| tree | 5d9b7d17894e51021d905171f00f4b3e450b4dba | |
| parent | Bump version and update changelog (diff) | |
| parent | Merge branch 'master' into bugfix/dearxiv (diff) | |
| download | paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.tar.gz paper2remarkable-64a352ff6654744b9738fe32c01679032d42d6c1.zip | |
Merge branch 'bugfix/dearxiv'
| -rw-r--r-- | paper2remarkable/providers/_base.py | 21 | ||||
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 67 | ||||
| -rw-r--r-- | tests/test_arxiv.py | 62 | ||||
| -rw-r--r-- | tests/test_providers.py | 8 |
4 files changed, 129 insertions, 29 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index c3abe19..5ca3588 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -131,8 +131,29 @@ class Provider(metaclass=abc.ABCMeta): "%s failed to compress the PDF file." % self.pdftool ) + def rewrite_pdf(self, in_pdf, out_pdf): + """ Re-write the pdf using Ghostscript + + This helps avoid issues in dearxiv due to nested pdfs. + """ + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dQUIET", + "-o", + out_pdf, + in_pdf, + ] + ) + if not status == 0: + raise _CalledProcessError( + "Failed to rewrite the pdf with GhostScript" + ) + def uncompress_pdf(self, in_pdf, out_pdf): """ Uncompress a pdf file """ + if self.pdftool == "pdftk": status = subprocess.call( [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",] diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 47da448..4d0bc19 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -18,8 +18,9 @@ from ..log import Logger logger = Logger() -DEARXIV_TEXT_REGEX = ( - b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}" +DEARXIV_TEXT_REGEX = b"ar(x|X)iv:(\d{4}\.|[\w\-]+\/)\d+v\d+(\s+\[[\w\-]+\.[\w\-]+\])?\s+\d{1,2}\s\w{3}\s\d{4}" +DEARXIV_URI_REGEX = ( + b"https?://ar(x|X)iv\.org\/abs\/([\w\-]+\/\d+|\d{4}\.\d{4,5})v\d+" ) @@ -32,8 +33,8 @@ class Arxiv(Provider): re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" - re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?" - re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf" + re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?" + re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -68,8 +69,11 @@ class Arxiv(Provider): logger.info("Removing arXiv timestamp ... ", end="") basename = os.path.splitext(input_file)[0] + recoded_file = basename + "_rewrite.pdf" + self.rewrite_pdf(input_file, recoded_file) + uncompress_file = basename + "_uncompress.pdf" - self.uncompress_pdf(input_file, uncompress_file) + self.uncompress_pdf(recoded_file, uncompress_file) new_data = [] current_obj = [] @@ -90,36 +94,42 @@ class Arxiv(Provider): current_obj.append(line) objid = int(line.split(b" ")[0]) xref[objid] = char_count - elif current_obj and line.startswith(b"endobj"): + elif current_obj and ( + line.startswith(b"endobj") + and not line.startswith(b"endobj xref") + ): # End the current object. If needed, replace the arXiv # stamp in the block (done only once). Reset current # object. current_obj.append(line) block = b"".join(current_obj) - if not replaced_arXiv and b"arXivStAmP" in block: - # remove the text - block, n_subs1 = re.subn( - b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", - b"()Tj", - block, - ) - # remove the url - block, n_subs2 = re.subn( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - block, - ) - if n_subs1 or n_subs2: - # fix the length of the object stream - block = fix_stream_length(block) - replaced_arXiv = True + # remove the text + block, n_subs1 = re.subn( + b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", block, + ) + # remove the url + block, n_subs2 = re.subn( + b"<<\n\/URI \(" + + DEARXIV_URI_REGEX + + b"\)\n\/S /URI\n>>\n", + b"", + block, + ) + if n_subs1 or n_subs2: + # fix the length of the object stream + block = fix_stream_length(block) + replaced_arXiv = True new_data.append(block) char_count += len(block) current_obj = [] - elif current_obj: - # If we're recording an object, simply add the line to it - current_obj.append(line) elif line in [b"xref\n", b"endobj xref\n"]: + if b"endobj" in line and current_obj: + current_obj.append(b"endobj\n") + block = b"".join(current_obj) + new_data.append(block) + char_count += len(block) + current_obj = [] + line = b"xref\n" # We found the xref table, record its position and write it # out using our updated indices. startxref = sum(map(len, new_data)) @@ -131,6 +141,9 @@ class Arxiv(Provider): # skip the appropriate number of lines skip_n = len(xref) + 2 + elif current_obj: + # If we're recording an object, simply add the line to it + current_obj.append(line) elif line == b"startxref\n": # Write out our recorded startxref position, skip the old # position. @@ -148,7 +161,7 @@ class Arxiv(Provider): output_file = basename + "_dearxiv.pdf" self.compress_pdf(removed_file, output_file) - logger.append("success" if replaced_arXiv else "failed", "info") + logger.append("success" if replaced_arXiv else "none found", "info") return output_file diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py index 08ea2c4..2cb84cf 100644 --- a/tests/test_arxiv.py +++ b/tests/test_arxiv.py @@ -13,7 +13,11 @@ import shutil import tempfile import unittest -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv +from paper2remarkable.providers.arxiv import ( + DEARXIV_TEXT_REGEX, + DEARXIV_URI_REGEX, + Arxiv, +) class TestArxiv(unittest.TestCase): @@ -39,6 +43,26 @@ class TestArxiv(unittest.TestCase): m = re.fullmatch(DEARXIV_TEXT_REGEX, key) self.assertIsNotNone(m) + def test_text_regex_3(self): + key = b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_4(self): + key = b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_1(self): + key = b"http://arxiv.org/abs/physics/0605197v1" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_2(self): + key = b"https://arxiv.org/abs/1101.0028v3" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + def test_stamp_removed_1(self): url = "https://arxiv.org/pdf/1703.06103.pdf" prov = Arxiv(upload=False) @@ -57,6 +81,42 @@ class TestArxiv(unittest.TestCase): data = fp.read() self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data) + def test_stamp_removed_3(self): + url = "https://arxiv.org/abs/physics/0605197v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006", data + ) + self.assertNotIn( + b"/URI (http://arxiv.org/abs/physics/0605197v1)", data + ) + + def test_stamp_removed_4(self): + url = "https://arxiv.org/abs/math/0309285v2" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004", data) + self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data) + + def test_stamp_removed_5(self): + url = "https://arxiv.org/abs/astro-ph/9207001v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data + ) + self.assertNotIn(b"arXiv:astro-ph/9207001v1 13 Jul 1992", data) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py index a7f17ff..d2fdb0d 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -86,6 +86,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_5(self): + prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None) + url = "https://arxiv.org/abs/2002.11523" + exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" @@ -256,6 +263,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(4, len(pdfplumber.open(filename).pages)) - if __name__ == "__main__": unittest.main() |
