diff options
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/citeseerx.py | 64 | ||||
| -rw-r--r-- | tests/test_providers.py | 24 |
3 files changed, 90 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 45148fd..fabdcfe 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -2,6 +2,7 @@
 
 from .acm import ACM
 from .arxiv import Arxiv
+from .citeseerx import CiteSeerX
 from .local import LocalFile
 from .neurips import NeurIPS
 from .openreview import OpenReview
@@ -14,6 +15,7 @@ from .springer import Springer
 providers = [
     ACM,
     Arxiv,
+    CiteSeerX,
     NeurIPS,
     OpenReview,
     PMLR,
diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py
new file mode 100644
index 0000000..fdc0e8a
--- /dev/null
+++ b/paper2remarkable/providers/citeseerx.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for CiteSeerX
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class CiteSeerXInformer(Informer):
+    """Informer using the CiteSeerX author and year meta keys."""
+
+    meta_author_key = "citation_authors"
+    meta_date_key = "citation_year"
+
+    def _format_authors(self, soup_authors):
+        op = lambda x: x[0].split(",")  # split the first element on commas
+        return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+
+class CiteSeerX(Provider):
+    """Provider for CiteSeerX summary and pdf download urls."""
+
+    re_abs = r"^https?:\/\/citeseerx\.ist\.psu\.edu\/viewdoc\/summary\?doi=(?P<doi>[0-9\.]+)"
+    re_pdf = r"^https?:\/\/citeseerx\.ist\.psu\.edu\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9\.]+)&rep=rep1&type=pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = CiteSeerXInformer()
+
+    def _get_doi(self, url):
+        """Return the doi captured from a CiteSeerX summary or pdf url."""
+        m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
+        if m:
+            return m["doi"]
+        exception("Couldn't retrieve CiteSeerX publication doi.")
+
+    def get_abs_pdf_urls(self, url):
+        """Get the abstract and pdf urls for a CiteSeerX url"""
+        if re.match(self.re_abs, url):
+            abs_url = url
+            doi = self._get_doi(abs_url)
+            pdf_url = f"http://citeseerx.ist.psu.edu/viewdoc/download?doi={doi}&rep=rep1&type=pdf"
+        elif re.match(self.re_pdf, url):
+            pdf_url = url
+            doi = self._get_doi(pdf_url)
+            abs_url = f"http://citeseerx.ist.psu.edu/viewdoc/summary?doi={doi}"
+        else:
+            exception("Couldn't figure out CiteSeerX urls.")
+        return abs_url, pdf_url
+
+    def validate(src):
+        """Return the match if ``src`` is a CiteSeerX url, otherwise None."""
+        return re.match(CiteSeerX.re_abs, src) or re.match(
+            CiteSeerX.re_pdf, src
+        )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e0c98a2..75703ff 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -15,6 +15,7 @@ import unittest
 from paper2remarkable.providers import (
     ACM,
     Arxiv,
+    CiteSeerX,
     LocalFile,
     NeurIPS,
     OpenReview,
@@ -166,6 +167,29 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_citeseerx_1(self):
+        prov = CiteSeerX(upload=False, verbose=VERBOSE)
+        url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548"
+        exp = "Aaronson_-_Is_P_Versus_NP_Formally_Independent_2003.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_citeseerx_2(self):
+        prov = CiteSeerX(upload=False, verbose=VERBOSE)
+        url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.101.6521&rep=rep1&type=pdf"
+        exp = "Everingham_et_al_-_The_2005_Pascal_Visual_Object_Classes_Challenge_2006.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_citeseerx_3(self):
+        prov = CiteSeerX(upload=False, verbose=VERBOSE)
+        url = (
+            "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.109.4049"
+        )
+        exp = "Brin_Page_-_The_Anatomy_of_a_Large-Scale_Hypertextual_Web_Search_Engine_1998.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
 
 if __name__ == "__main__":
     unittest.main()
