aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-03-01 12:36:07 +0000
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2021-03-01 12:36:07 +0000
commit: 3ec03cc9622278cabed93eba3d0ed14f21c8257c (patch)
tree: a18b2c000cb5a368533f1c5c64d9b528c1aa2b00
parent: Simplify example pdf file (diff)
download: paper2remarkable-3ec03cc9622278cabed93eba3d0ed14f21c8257c.tar.gz
paper2remarkable-3ec03cc9622278cabed93eba3d0ed14f21c8257c.zip
Add provider for ACLWeb
-rw-r--r-- README.md | 1
-rw-r--r-- paper2remarkable/providers/__init__.py | 2
-rw-r--r-- paper2remarkable/providers/acl.py | 52
-rw-r--r-- tests/test_providers.py | 24
-rw-r--r-- tests/test_ui.py | 6
5 files changed, 85 insertions, 0 deletions
diff --git a/README.md b/README.md
index d212549..007aace 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ papers easier (just use the `-n` flag).
reMarkable from any of the following sources:
* [arXiv](https://arxiv.org/)
+* [ACL Web](https://www.aclweb.org/anthology/)
* [ACM Digital Library](https://dl.acm.org/dl.cfm)
* [CiteSeerX](http://citeseerx.ist.psu.edu/index)
* [CVF](https://openaccess.thecvf.com/menu)
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 2be218f..5130147 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
+from .acl import ACL
from .acm import ACM
from .arxiv import Arxiv
from .citeseerx import CiteSeerX
@@ -22,6 +23,7 @@ from .tandfonline import TandFOnline
# NOTE: Order matters here, PdfUrl and HTML should be last
providers = [
+ ACL,
ACM,
Arxiv,
CiteSeerX,
diff --git a/paper2remarkable/providers/acl.py b/paper2remarkable/providers/acl.py
new file mode 100644
index 0000000..3220ca0
--- /dev/null
+++ b/paper2remarkable/providers/acl.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for ACL
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2021, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class ACLInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class ACL(Provider):
+
+ re_abs = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
+ re_pdf = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = ACLInformer()
+
+ def get_abs_pdf_urls(self, url):
+ m = re.match(self.re_pdf, url)
+ if m:
+ pdf_url = url
+ abs_url = f"https://www.aclweb.org/anthology/{m['key']}"
+ return abs_url, pdf_url
+
+ m = re.match(self.re_abs, url)
+ if m:
+ abs_url = url
+ pdf_url = f"https://www.aclweb.org/anthology/{m['key']}.pdf"
+ return abs_url, pdf_url
+
+ raise URLResolutionError("ACL", url)
+
+ def validate(src):
+ m = re.match(ACL.re_abs, src) or re.match(ACL.re_pdf, src)
+ return not m is None
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 8239662..eaeb8aa 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -13,6 +13,7 @@ import tempfile
import unittest
from paper2remarkable.providers import (
+ ACL,
ACM,
Arxiv,
CVF,
@@ -414,6 +415,29 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_acl_1(self):
+ prov = ACL(upload=False, verbose=VERBOSE)
+ url = "https://www.aclweb.org/anthology/A88-1033/"
+ exp = "Newman_-_Combinatorial_Disambiguation_1988.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_acl_2(self):
+ prov = ACL(upload=False, verbose=VERBOSE)
+ url = "https://www.aclweb.org/anthology/2020.acl-main.79.pdf"
+ exp = "Zhong_et_al_-_Interpreting_Twitter_User_Geolocation_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_acl_3(self):
+ prov = ACL(upload=False, verbose=VERBOSE)
+ url = "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf"
+ exp = (
+ "Burness_McMullin_-_Multi-Tiered_Strictly_Local_Functions_2020.pdf"
+ )
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 5ccfbab..317352f 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -17,6 +17,7 @@ from paper2remarkable.exceptions import (
UnidentifiedSourceError,
)
from paper2remarkable.providers import (
+ ACL,
ACM,
Arxiv,
CiteSeerX,
@@ -195,6 +196,11 @@ class TestUI(unittest.TestCase):
"https://www.nature.com/articles/s41599-019-0349-z",
"https://www.nature.com/articles/s41599-019-0349-z",
),
+ (
+ ACL,
+ "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
+ "https://www.aclweb.org/anthology/2020.sigmorphon-1.29v2.pdf",
+ ),
]
for exp_prov, url, exp_url in tests:
prov, new_url, jar = choose_provider(url)