about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-05-30 16:33:49 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-05-30 16:33:49 +0100
commit: f62ee2acf923d04a6b288a034ac1be930a64f326 (patch)
tree: 0208d0b7f20a02c7cd9158027e040d5f330e798d
parent: Merge branch 'bugfix/upload_multi' (diff)
download: paper2remarkable-f62ee2acf923d04a6b288a034ac1be930a64f326.tar.gz
paper2remarkable-f62ee2acf923d04a6b288a034ac1be930a64f326.zip
Add provider for ECCC (partially addresses #104)
-rw-r--r-- paper2remarkable/providers/__init__.py 2
-rw-r--r-- paper2remarkable/providers/eccc.py 86
-rw-r--r-- tests/test_providers.py 24
-rw-r--r-- tests/test_ui.py 6
4 files changed, 118 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 5130147..e574b80 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -5,6 +5,7 @@ from .acm import ACM
from .arxiv import Arxiv
from .citeseerx import CiteSeerX
from .cvf import CVF
+from .eccc import ECCC
from .html import HTML
from .jmlr import JMLR
from .local import LocalFile
@@ -28,6 +29,7 @@ providers = [
Arxiv,
CiteSeerX,
CVF,
+ ECCC,
JMLR,
Nature,
NBER,
diff --git a/paper2remarkable/providers/eccc.py b/paper2remarkable/providers/eccc.py
new file mode 100644
index 0000000..f6a6bae
--- /dev/null
+++ b/paper2remarkable/providers/eccc.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Electronic Colloquium on Computational Complexity
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2021, G.J.J. van den Burg
+
+"""
+
+import bs4
+import re
+
+from ._info import Informer
+from ._base import Provider
+from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
+
+
+class ECCCInformer(Informer):
+    """Extract title, authors and year from the HTML of an ECCC report page."""
+
+    def _get_paper_div(self, soup):
+        """Return the <div> following the "Paper:" <h3>, re-parsed as its own soup.
+
+        An empty soup is returned when the heading or the <div> is absent, so
+        the getters below warn and return "" instead of raising AttributeError.
+        """
+        h3 = soup.find(lambda t: t.name == "h3" and t.get_text() == "Paper:")
+        div = h3.find_next_sibling("div") if h3 else None
+        return bs4.BeautifulSoup(div.prettify() if div else "", "html.parser")
+
+    def _warn_missing(self, what):
+        """Log that ``what`` could not be determined and return ""."""
+        logger.warning(
+            "Couldn't determine %s information, maybe provide the desired filename using '--filename'?" % what
+        )
+        return ""
+
+    def get_title(self, soup):
+        """Return the stripped text of the paper's first <h4>, or "" if none."""
+        h4 = self._get_paper_div(soup).find("h4")
+        return h4.get_text().strip() if h4 else self._warn_missing("title")
+
+    def get_authors(self, soup):
+        """Return the formatted names of all "/author/" links, or "" if none."""
+        aa = self._get_paper_div(soup).find_all(
+            # href defaults to "" so that anchors without one are skipped
+            lambda t: t.name == "a" and t.get("href", "").startswith("/author/")
+        )
+        if not aa:
+            return self._warn_missing("author")
+        return self._format_authors([a.get_text() for a in aa], sep=" ", idx=-1)
+
+    def get_year(self, soup):
+        """Return the 4th space-separated field of the "Publication: " line, or ""."""
+        text = self._get_paper_div(soup).text
+        line = next((l for l in text.split("\n") if "Publication: " in l), "")
+        fields = line.strip().split(" ")  # year is assumed to be the 4th field
+        return fields[3] if len(fields) > 3 else self._warn_missing("year")
+
+
+class ECCC(Provider):
+    """Provider for ECCC report pages and their PDF download links."""
+    # Raw strings with escaped dots: "\d" is a digit class, "." matches literally
+    re_abs = r"https?://eccc\.weizmann\.ac\.il/report/\d{4}/\d+/?$"
+    re_pdf = r"https?://eccc\.weizmann\.ac\.il/report/\d{4}/\d+/download/?$"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = ECCCInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """Return (abs_url, pdf_url) for ``url``; raise URLResolutionError if neither pattern matches."""
+        if re.match(self.re_abs, url):
+            return url, url.rstrip("/") + "/download"
+        if re.match(self.re_pdf, url):
+            return url.rstrip("/")[: -len("/download")], url  # strip "/download"
+        raise URLResolutionError("ECCC", url)
+
+    @staticmethod
+    def validate(src):
+        """Return a match object if ``src`` is an ECCC report or download URL, else None."""
+        return re.match(ECCC.re_abs, src) or re.match(ECCC.re_pdf, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 1c1e1e6..efc4be1 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -21,6 +21,7 @@ from paper2remarkable.providers import (
Arxiv,
CVF,
CiteSeerX,
+ ECCC,
HTML,
JMLR,
LocalFile,
@@ -478,6 +479,29 @@ class TestProviders(unittest.TestCase):
with pdf.open_outline() as outline:
assert len(outline.root) > 0
+    def test_eccc_1(self):
+        # Report-page URL; the expected name uses the "et al" author form
+        provider = ECCC(upload=False, verbose=VERBOSE)
+        expected = "Chou_et_al_-_Approximability_of_All_Finite_CSPs_in_the_Dynamic_Streaming_Setting_2021.pdf"
+        result = provider.run("https://eccc.weizmann.ac.il/report/2021/063/")
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_eccc_2(self):
+        # Report-page URL; the expected name spells out two author names
+        provider = ECCC(upload=False, verbose=VERBOSE)
+        expected = "Cai_Lu_-_Bases_Collapse_in_Holographic_Algorithms_2007.pdf"
+        result = provider.run("https://eccc.weizmann.ac.il/report/2007/003/")
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_eccc_3(self):
+        # Here the provider is given the ".../download" URL directly
+        provider = ECCC(upload=False, verbose=VERBOSE)
+        result = provider.run(
+            "https://eccc.weizmann.ac.il/report/1998/052/download"
+        )
+        expected = "Hemkemeier_Vallentin_-_On_the_Decomposition_of_Lattices_1998.pdf"
+        self.assertEqual(expected, os.path.basename(result))
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 86a3c8e..87b66d8 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -22,6 +22,7 @@ from paper2remarkable.providers import (
Arxiv,
CiteSeerX,
CVF,
+ ECCC,
HTML,
JMLR,
LocalFile,
@@ -203,6 +204,11 @@ class TestUI(unittest.TestCase):
"https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
"https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
),
+ (
+ ECCC,
+ "https://eccc.weizmann.ac.il/report/2019/006/",
+ "https://eccc.weizmann.ac.il/report/2019/006/",
+ ),
]
for exp_prov, url, exp_url in tests:
prov, new_url, jar = choose_provider(url)