aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-05-30 16:44:31 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-05-30 16:44:31 +0100
commit bd8968b653a81dfdbd1468e24d2f02f9e70b7fdf (patch)
tree 27fbe2d8564151a9b545302904d2028ee6a0da11
parent Merge branch 'feature/provider_iacr' (diff)
parent Merge branch 'master' into feature/provider_eccc (diff)
download paper2remarkable-bd8968b653a81dfdbd1468e24d2f02f9e70b7fdf.tar.gz
download paper2remarkable-bd8968b653a81dfdbd1468e24d2f02f9e70b7fdf.zip
Merge branch 'feature/provider_eccc'
-rw-r--r-- paper2remarkable/providers/__init__.py 2
-rw-r--r-- paper2remarkable/providers/eccc.py 86
-rw-r--r-- tests/test_providers.py 24
-rw-r--r-- tests/test_ui.py 6
4 files changed, 118 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index fb12a21..4addacb 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -5,6 +5,7 @@ from .acm import ACM
from .arxiv import Arxiv
from .citeseerx import CiteSeerX
from .cvf import CVF
+from .eccc import ECCC
from .html import HTML
from .iacr import IACR
from .jmlr import JMLR
@@ -29,6 +30,7 @@ providers = [
Arxiv,
CiteSeerX,
CVF,
+ ECCC,
IACR,
JMLR,
Nature,
diff --git a/paper2remarkable/providers/eccc.py b/paper2remarkable/providers/eccc.py
new file mode 100644
index 0000000..f6a6bae
--- /dev/null
+++ b/paper2remarkable/providers/eccc.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Electronic Colloquium on Computational Complexity
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2021, G.J.J. van den Burg
+
+"""
+
+import bs4
+import re
+
+from ._info import Informer
+from ._base import Provider
+from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
+
+
+class ECCCInformer(Informer):
+    """Extract title, authors and year from an ECCC report page.
+
+    Every lookup is restricted to the ``<div>`` that follows the ``<h3>``
+    heading whose text is exactly "Paper:".
+    """
+
+    def _get_paper_div(self, soup):
+        """Return a new soup containing only the paper description block.
+
+        When the "Paper:" heading or its sibling ``<div>`` is absent an
+        empty soup is returned, so the ``get_*`` methods fall through to
+        their warning path instead of raising ``AttributeError``.
+        """
+        h3 = soup.find(lambda t: t.name == "h3" and t.get_text() == "Paper:")
+        div = h3.find_next_sibling("div") if h3 is not None else None
+        if div is None:
+            return bs4.BeautifulSoup("", "html.parser")
+        # NOTE(review): presumably re-parsed from prettify() output so that
+        # each text node sits on its own line for get_year() -- confirm.
+        return bs4.BeautifulSoup(div.prettify(), "html.parser")
+
+    def get_title(self, soup):
+        """Return the stripped text of the first ``<h4>`` in the paper block.
+
+        Logs a warning and returns ``""`` when no ``<h4>`` is found.
+        """
+        divsoup = self._get_paper_div(soup)
+        h4 = divsoup.find("h4")
+        if not h4:
+            logger.warning(
+                "Couldn't determine title information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        return h4.get_text().strip()
+
+    def get_authors(self, soup):
+        """Return the formatted authors taken from the ``/author/`` links.
+
+        The link texts are handed to ``_format_authors`` with ``sep=" "``
+        and ``idx=-1`` (presumably selecting each last name -- verify
+        against ``Informer``).  Logs a warning and returns ``""`` when no
+        author link is found.
+        """
+        divsoup = self._get_paper_div(soup)
+        # Anchors without an href yield None from .get(); fall back to ""
+        # so the prefix test cannot raise AttributeError.
+        aa = divsoup.find_all(
+            lambda t: t.name == "a"
+            and (t.get("href") or "").startswith("/author/")
+        )
+        if not aa:
+            logger.warning(
+                "Couldn't determine author information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        authors = [a.get_text() for a in aa]
+        return self._format_authors(authors, sep=" ", idx=-1)
+
+    def get_year(self, soup):
+        """Return the four-digit year from the "Publication: " line.
+
+        The year is the first standalone four-digit number after the
+        "Publication: " label, which does not depend on exact spacing or
+        token position.  Logs a warning and returns ``""`` when the line
+        or the year cannot be found.
+        """
+        divsoup = self._get_paper_div(soup)
+        line = next(
+            (ln for ln in divsoup.text.split("\n") if "Publication: " in ln),
+            None,
+        )
+        match = None
+        if line is not None:
+            match = re.search(r"Publication: .*?\b(\d{4})\b", line)
+        if match is None:
+            logger.warning(
+                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
+            )
+            return ""
+        return match.group(1)
+
+
+class ECCC(Provider):
+    """Provider for reports hosted on eccc.weizmann.ac.il.
+
+    Accepts either the report page (``/report/<year>/<number>``) or its
+    ``/download`` link; both forms may carry a trailing slash.
+    """
+
+    # Raw strings keep ``\d`` a regex escape rather than an invalid string
+    # escape; the dots of the host name are escaped so they match only ".".
+    re_abs = r"https?://eccc\.weizmann\.ac\.il/report/\d{4}/\d+/?$"
+    re_pdf = r"https?://eccc\.weizmann\.ac\.il/report/\d{4}/\d+/download/?$"
+
+    def __init__(self, *args, **kwargs):
+        """Forward all arguments to ``Provider`` and attach an ECCCInformer."""
+        super().__init__(*args, **kwargs)
+        self.informer = ECCCInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """Return ``(abs_url, pdf_url)`` for a report or download URL.
+
+        Raises:
+            URLResolutionError: if ``url`` matches neither ``re_abs`` nor
+                ``re_pdf``.
+        """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.rstrip("/") + "/download"
+        elif re.match(self.re_pdf, url):
+            # Drop an optional trailing slash, then the "/download" suffix.
+            abs_url = url.rstrip("/")[: -len("/download")]
+            pdf_url = url
+        else:
+            raise URLResolutionError("ECCC", url)
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """Return a truthy match if ``src`` is an ECCC report or download URL.
+
+        Declared static so it behaves the same whether it is called on the
+        class or on an instance.
+        """
+        return re.match(ECCC.re_abs, src) or re.match(ECCC.re_pdf, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 2e112e0..33e0286 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -21,6 +21,7 @@ from paper2remarkable.providers import (
Arxiv,
CVF,
CiteSeerX,
+ ECCC,
HTML,
IACR,
JMLR,
@@ -479,6 +480,29 @@ class TestProviders(unittest.TestCase):
with pdf.open_outline() as outline:
assert len(outline.root) > 0
+    def test_eccc_1(self):
+        """Run ECCC on a report-page URL and check the produced file name."""
+        provider = ECCC(upload=False, verbose=VERBOSE)
+        expected = "Chou_et_al_-_Approximability_of_All_Finite_CSPs_in_the_Dynamic_Streaming_Setting_2021.pdf"
+        result = provider.run("https://eccc.weizmann.ac.il/report/2021/063/")
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_eccc_2(self):
+        """Run ECCC on a second report-page URL and check the file name."""
+        provider = ECCC(upload=False, verbose=VERBOSE)
+        expected = "Cai_Lu_-_Bases_Collapse_in_Holographic_Algorithms_2007.pdf"
+        result = provider.run("https://eccc.weizmann.ac.il/report/2007/003/")
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_eccc_3(self):
+        """Run ECCC on a ``/download`` URL and check the produced file name."""
+        provider = ECCC(upload=False, verbose=VERBOSE)
+        expected = "Hemkemeier_Vallentin_-_On_the_Decomposition_of_Lattices_1998.pdf"
+        result = provider.run(
+            "https://eccc.weizmann.ac.il/report/1998/052/download"
+        )
+        self.assertEqual(expected, os.path.basename(result))
+
def test_iacr_1(self):
prov = IACR(upload=False, verbose=VERBOSE)
url = "https://eprint.iacr.org/2021/489"
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 6dfdaa6..5ecb3e1 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -22,6 +22,7 @@ from paper2remarkable.providers import (
Arxiv,
CiteSeerX,
CVF,
+ ECCC,
HTML,
IACR,
JMLR,
@@ -214,6 +215,11 @@ class TestUI(unittest.TestCase):
"https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
"https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
),
+ (
+ ECCC,
+ "https://eccc.weizmann.ac.il/report/2019/006/",
+ "https://eccc.weizmann.ac.il/report/2019/006/",
+ ),
]
for exp_prov, url, exp_url in tests:
prov, new_url, jar = choose_provider(url)