diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-03-01 12:36:07 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2021-03-01 12:36:07 +0000 |
| commit | 3ec03cc9622278cabed93eba3d0ed14f21c8257c (patch) | |
| tree | a18b2c000cb5a368533f1c5c64d9b528c1aa2b00 | |
| parent | Simplify example pdf file (diff) | |
| download | paper2remarkable-3ec03cc9622278cabed93eba3d0ed14f21c8257c.tar.gz paper2remarkable-3ec03cc9622278cabed93eba3d0ed14f21c8257c.zip | |
Add provider for ACLWeb
| -rw-r--r-- | README.md | 1 | ||||
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/acl.py | 52 | ||||
| -rw-r--r-- | tests/test_providers.py | 24 | ||||
| -rw-r--r-- | tests/test_ui.py | 6 |
5 files changed, 85 insertions, 0 deletions
```diff
@@ -30,6 +30,7 @@ papers easier (just use the `-n` flag).
 reMarkable from any of the following sources:
 
 * [arXiv](https://arxiv.org/)
+* [ACL Web](https://www.aclweb.org/anthology/)
 * [ACM Digital Library](https://dl.acm.org/dl.cfm)
 * [CiteSeerX](http://citeseerx.ist.psu.edu/index)
 * [CVF](https://openaccess.thecvf.com/menu)
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 2be218f..5130147 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+from .acl import ACL
 from .acm import ACM
 from .arxiv import Arxiv
 from .citeseerx import CiteSeerX
@@ -22,6 +23,7 @@ from .tandfonline import TandFOnline
 
 # NOTE: Order matters here, PdfUrl and HTML should be last
 providers = [
+    ACL,
     ACM,
     Arxiv,
     CiteSeerX,
diff --git a/paper2remarkable/providers/acl.py b/paper2remarkable/providers/acl.py
new file mode 100644
index 0000000..3220ca0
--- /dev/null
+++ b/paper2remarkable/providers/acl.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for ACL
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2021, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class ACLInformer(Informer):
+
+    meta_date_key = "citation_publication_date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class ACL(Provider):
+
+    re_abs = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
+    re_pdf = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = ACLInformer()
+
+    def get_abs_pdf_urls(self, url):
+        m = re.match(self.re_pdf, url)
+        if m:
+            pdf_url = url
+            abs_url = f"https://www.aclweb.org/anthology/{m['key']}"
+            return abs_url, pdf_url
+
+        m = re.match(self.re_abs, url)
+        if m:
+            abs_url = url
+            pdf_url = f"https://www.aclweb.org/anthology/{m['key']}.pdf"
+            return abs_url, pdf_url
+
+        raise URLResolutionError("ACL", url)
+
+    def validate(src):
+        m = re.match(ACL.re_abs, src) or re.match(ACL.re_pdf, src)
+        return not m is None
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 8239662..eaeb8aa 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -13,6 +13,7 @@ import tempfile
 import unittest
 
 from paper2remarkable.providers import (
+    ACL,
     ACM,
     Arxiv,
     CVF,
@@ -414,6 +415,29 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_acl_1(self):
+        prov = ACL(upload=False, verbose=VERBOSE)
+        url = "https://www.aclweb.org/anthology/A88-1033/"
+        exp = "Newman_-_Combinatorial_Disambiguation_1988.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_acl_2(self):
+        prov = ACL(upload=False, verbose=VERBOSE)
+        url = "https://www.aclweb.org/anthology/2020.acl-main.79.pdf"
+        exp = "Zhong_et_al_-_Interpreting_Twitter_User_Geolocation_2020.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_acl_3(self):
+        prov = ACL(upload=False, verbose=VERBOSE)
+        url = "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf"
+        exp = (
+            "Burness_McMullin_-_Multi-Tiered_Strictly_Local_Functions_2020.pdf"
+        )
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 5ccfbab..317352f 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -17,6 +17,7 @@ from paper2remarkable.exceptions import (
     UnidentifiedSourceError,
 )
 from paper2remarkable.providers import (
+    ACL,
     ACM,
     Arxiv,
     CiteSeerX,
@@ -195,6 +196,11 @@ class TestUI(unittest.TestCase):
                 "https://www.nature.com/articles/s41599-019-0349-z",
                 "https://www.nature.com/articles/s41599-019-0349-z",
             ),
+            (
+                ACL,
+                "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
+                "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
+            ),
         ]
         for exp_prov, url, exp_url in tests:
             prov, new_url, jar = choose_provider(url)
```
