diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-05-30 16:44:17 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-05-30 16:44:17 +0100 |
| commit | 62914dc850b94fdcce90811dab568eb3a6d37729 (patch) | |
| tree | 27fbe2d8564151a9b545302904d2028ee6a0da11 | |
| parent | Add provider for ECCC (partially addresses #104) (diff) | |
| parent | Merge branch 'feature/provider_iacr' (diff) | |
| download | paper2remarkable-62914dc850b94fdcce90811dab568eb3a6d37729.tar.gz paper2remarkable-62914dc850b94fdcce90811dab568eb3a6d37729.zip | |
Merge branch 'master' into feature/provider_eccc
| -rw-r--r-- | paper2remarkable/crop.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 12 | ||||
| -rw-r--r-- | paper2remarkable/providers/iacr.py | 111 | ||||
| -rw-r--r-- | tests/test_providers.py | 22 | ||||
| -rw-r--r-- | tests/test_ui.py | 11 |
6 files changed, 154 insertions, 6 deletions
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 16d050e..6e4a177 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -180,7 +180,7 @@ class Cropper(object): filename, ] - im = subprocess.check_output(cmd) + im = subprocess.check_output(cmd, stderr=subprocess.DEVNULL) im = io.BytesIO(im) id_ = im.readline().rstrip(b"\n") diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index e574b80..4addacb 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -7,6 +7,7 @@ from .citeseerx import CiteSeerX from .cvf import CVF from .eccc import ECCC from .html import HTML +from .iacr import IACR from .jmlr import JMLR from .local import LocalFile from .nature import Nature @@ -30,6 +31,7 @@ providers = [ CiteSeerX, CVF, ECCC, + IACR, JMLR, Nature, NBER, diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 56d61e5..9357b91 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -142,13 +142,14 @@ class Provider(metaclass=abc.ABCMeta): "%s failed to compress the PDF file." % self.pdftool ) - def rewrite_pdf(self, in_pdf, out_pdf=None): - """Re-write the pdf using Ghostscript + def rewrite_pdf(self, in_file, out_pdf=None): + """Re-write the ps or pdf using Ghostscript - This helps avoid issues in dearxiv due to nested pdfs. + This helps avoid issues in dearxiv due to nested pdfs and enables + support for postscript files. """ if out_pdf is None: - out_pdf = os.path.splitext(in_pdf)[0] + "-rewrite.pdf" + out_pdf = os.path.splitext(in_file)[0] + "-rewrite.pdf" status = subprocess.call( [ @@ -157,7 +158,7 @@ class Provider(metaclass=abc.ABCMeta): "-dQUIET", "-o", out_pdf, - in_pdf, + in_file, ] ) if not status == 0: @@ -169,6 +170,7 @@ class Provider(metaclass=abc.ABCMeta): def uncompress_pdf(self, in_pdf, out_pdf): """ Uncompress a pdf file """ + logger.info("Uncompressing with {self.pdftool} ...") if self.pdftool == "pdftk": status = subprocess.call( [ diff --git a/paper2remarkable/providers/iacr.py b/paper2remarkable/providers/iacr.py new file mode 100644 index 0000000..f91d2e5 --- /dev/null +++ b/paper2remarkable/providers/iacr.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +"""Provider for IACR's eprints + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import bs4 +import os +import re +import urllib.parse + +from ._info import Informer +from ._base import Provider +from ..exceptions import URLResolutionError +from ..log import Logger +from ..utils import get_page_with_retry + +logger = Logger() + + +class IACRInformer(Informer): + def get_title(self, soup): + title = soup.find_all("title") + if not title: + logger.warning( + "Couldn't determine title information, maybe provide the desired filename using '--filename'?" + ) + return "" + return title[0].get_text().split("-", maxsplit=1)[-1] + + def get_authors(self, soup): + i = soup.find_all("i") + if not i: + logger.warning( + "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + ) + return "" + authors = i[0].get_text() + authors = authors.replace(" ", " ") + authors = authors.split(" and ") + return self._format_authors(authors, sep=" ", idx=-1) + + def get_year(self, soup): + h2 = soup.find_all("h2") + if not h2: + logger.warning( + "Couldn't determine year information, maybe provide the desired filename using '--filename'?" + ) + return "" + text = h2[0].get_text() + report = text.split(":", maxsplit=1)[-1] + year_num = report.strip().split(" ")[1] + year = year_num.split("/")[0] + return year + + +class IACR(Provider): + + re_abs = "https?://eprint.iacr.org/\d{4}/\d+$" + re_pdf = "https?://eprint.iacr.org/\d{4}/\d+\.pdf$" + re_ps = "https?://eprint.iacr.org/\d{4}/\d+\.ps$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = IACRInformer() + + def _get_doc_url(self, abs_url): + page = get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + + bb = soup.find_all("b") + b = next((b for b in bb if "Available format" in b.get_text()), None) + if b is None: + # Fallback + return abs_url + ".pdf" + aa = b.find_next_siblings("a") + a = next((a for a in aa if "PDF" in a.get_text()), None) + if not a is None: + return urllib.parse.urljoin(abs_url, a.get("href")) + a = next((a for a in aa if "Postscript (PS)" in a.get_text()), None) + if not a is None: + return urllib.parse.urljoin(abs_url, a.get("href")) + # Fallback + return abs_url + ".pdf" + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = self._get_doc_url(url) + elif re.match(self.re_pdf, url): + abs_url = url[: -len(".pdf")] + pdf_url = url + elif re.match(self.re_ps, url): + abs_url = url[: -len(".ps")] + pdf_url = url + else: + raise URLResolutionError("IACR", url) + return abs_url, pdf_url + + def retrieve_pdf(self, pdf_url, filename): + # Bit hacky, can consider adding first-class PS support + tmpfilename = os.path.splitext(filename)[0] + "-tmp.pdf" + super().retrieve_pdf(pdf_url, tmpfilename) + self.rewrite_pdf(tmpfilename, out_pdf=filename) + + def validate(src): + return re.match(IACR.re_abs, src) or re.match(IACR.re_pdf, src) diff --git a/tests/test_providers.py b/tests/test_providers.py index efc4be1..33e0286 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -23,6 +23,7 @@ from paper2remarkable.providers import ( CiteSeerX, ECCC, HTML, + IACR, JMLR, LocalFile, Nature, @@ -502,6 +503,27 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_iacr_1(self): + prov = IACR(upload=False, verbose=VERBOSE) + url = "https://eprint.iacr.org/2021/489" + exp = "Xu_et_al_-_ROSE_Robust_Searchable_Encryption_With_Forward_and_Backward_Security_and_Practical_Performance_2021.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_iacr_2(self): + prov = IACR(upload=False, verbose=VERBOSE) + url = "https://eprint.iacr.org/2007/474.pdf" + exp = "Cochran_-_Notes_on_the_Wang_Et_Al._2_63_SHA-1_Differential_Path_2007.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_iacr_3(self): + prov = IACR(upload=False, verbose=VERBOSE) + url = "http://eprint.iacr.org/1996/008" + exp = "Naor_Wool_-_Access_Control_and_Signatures_via_Quorum_Secret_Sharing_1996.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_ui.py b/tests/test_ui.py index 87b66d8..5ecb3e1 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -24,6 +24,7 @@ from paper2remarkable.providers import ( CVF, ECCC, HTML, + IACR, JMLR, LocalFile, Nature, @@ -115,6 +116,16 @@ class TestUI(unittest.TestCase): "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf", ), ( + IACR, + "https://eprint.iacr.org/1999/011", + "https://eprint.iacr.org/1999/011", + ), + ( + IACR, + "https://eprint.iacr.org/2021/685.pdf", + "https://eprint.iacr.org/2021/685.pdf", + ), + ( JMLR, "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", |
