diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-05-30 16:44:31 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-05-30 16:44:31 +0100 |
| commit | bd8968b653a81dfdbd1468e24d2f02f9e70b7fdf (patch) | |
| tree | 27fbe2d8564151a9b545302904d2028ee6a0da11 | |
| parent | Merge branch 'feature/provider_iacr' (diff) | |
| parent | Merge branch 'master' into feature/provider_eccc (diff) | |
| download | paper2remarkable-bd8968b653a81dfdbd1468e24d2f02f9e70b7fdf.tar.gz paper2remarkable-bd8968b653a81dfdbd1468e24d2f02f9e70b7fdf.zip | |
Merge branch 'feature/provider_eccc'
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/eccc.py | 86 | ||||
| -rw-r--r-- | tests/test_providers.py | 24 | ||||
| -rw-r--r-- | tests/test_ui.py | 6 |
4 files changed, 118 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index fb12a21..4addacb 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -5,6 +5,7 @@ from .acm import ACM from .arxiv import Arxiv from .citeseerx import CiteSeerX from .cvf import CVF +from .eccc import ECCC from .html import HTML from .iacr import IACR from .jmlr import JMLR @@ -29,6 +30,7 @@ providers = [ Arxiv, CiteSeerX, CVF, + ECCC, IACR, JMLR, Nature, diff --git a/paper2remarkable/providers/eccc.py b/paper2remarkable/providers/eccc.py new file mode 100644 index 0000000..f6a6bae --- /dev/null +++ b/paper2remarkable/providers/eccc.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +"""Provider for Electronic Colloquium on Computational Complexity + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2021, G.J.J. van den Burg + +""" + +import bs4 +import re + +from ._info import Informer +from ._base import Provider +from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() + + +class ECCCInformer(Informer): + def _get_paper_div(self, soup): + h3 = soup.find(lambda t: t.name == "h3" and t.get_text() == "Paper:") + div = h3.find_next_sibling("div") + return bs4.BeautifulSoup(div.prettify(), "html.parser") + + def get_title(self, soup): + divsoup = self._get_paper_div(soup) + h4 = divsoup.find("h4") + if not h4: + logger.warning( + "Couldn't determine title information, maybe provide the desired filename using '--filename'?" + ) + return "" + return h4.get_text().strip() + + def get_authors(self, soup): + divsoup = self._get_paper_div(soup) + aa = divsoup.find_all( + lambda t: t.name == "a" and t.get("href").startswith("/author/") + ) + if not aa: + logger.warning( + "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + ) + return "" + authors = [a.get_text() for a in aa] + return self._format_authors(authors, sep=" ", idx=-1) + + def get_year(self, soup): + divsoup = self._get_paper_div(soup) + line = next( + (l for l in divsoup.text.split("\n") if "Publication: " in l), None + ) + if line is None: + logger.warning( + "Couldn't determine year information, maybe provide the desired filename using '--filename'?" + ) + return "" + year = line.strip().split(" ")[3] # bit lazy + return year + + +class ECCC(Provider): + + re_abs = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$" + re_pdf = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = ECCCInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.rstrip("/") + "/download" + elif re.match(self.re_pdf, url): + abs_url = url.rstrip("/")[: -len("/download")] + pdf_url = url + else: + raise URLResolutionError("ECCC", url) + return abs_url, pdf_url + + def validate(src): + return re.match(ECCC.re_abs, src) or re.match(ECCC.re_pdf, src) diff --git a/tests/test_providers.py b/tests/test_providers.py index 2e112e0..33e0286 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -21,6 +21,7 @@ from paper2remarkable.providers import ( Arxiv, CVF, CiteSeerX, + ECCC, HTML, IACR, JMLR, @@ -479,6 +480,29 @@ class TestProviders(unittest.TestCase): with pdf.open_outline() as outline: assert len(outline.root) > 0 + def test_eccc_1(self): + prov = ECCC(upload=False, verbose=VERBOSE) + url = "https://eccc.weizmann.ac.il/report/2021/063/" + exp = "Chou_et_al_-_Approximability_of_All_Finite_CSPs_in_the_Dynamic_Streaming_Setting_2021.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_eccc_2(self): + prov = ECCC(upload=False, verbose=VERBOSE) + url = "https://eccc.weizmann.ac.il/report/2007/003/" + exp = "Cai_Lu_-_Bases_Collapse_in_Holographic_Algorithms_2007.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_eccc_3(self): + prov = ECCC(upload=False, verbose=VERBOSE) + url = "https://eccc.weizmann.ac.il/report/1998/052/download" + exp = ( + "Hemkemeier_Vallentin_-_On_the_Decomposition_of_Lattices_1998.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_iacr_1(self): prov = IACR(upload=False, verbose=VERBOSE) url = "https://eprint.iacr.org/2021/489" diff --git a/tests/test_ui.py b/tests/test_ui.py index 6dfdaa6..5ecb3e1 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -22,6 +22,7 @@ from paper2remarkable.providers import ( Arxiv, CiteSeerX, CVF, + ECCC, HTML, IACR, JMLR, @@ -214,6 +215,11 @@ class TestUI(unittest.TestCase): "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf", "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf", ), + ( + ECCC, + "https://eccc.weizmann.ac.il/report/2019/006/", + "https://eccc.weizmann.ac.il/report/2019/006/", + ), ] for exp_prov, url, exp_url in tests: prov, new_url, jar = choose_provider(url) |
