about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-12-27 20:56:15 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2019-12-27 20:56:15 +0000
commit e4e7f95d890502c0350bb66a17a81ab66cf20069 (patch)
tree 6229ffffc426793384898e409c2c6403581fd9a8
parent Add note on creating an alias for the docker command (diff)
download paper2remarkable-e4e7f95d890502c0350bb66a17a81ab66cf20069.tar.gz
paper2remarkable-e4e7f95d890502c0350bb66a17a81ab66cf20069.zip
Add citeseerx provider
-rw-r--r-- paper2remarkable/providers/__init__.py | 2
-rw-r--r-- paper2remarkable/providers/citeseerx.py | 64
-rw-r--r-- tests/test_providers.py | 24
3 files changed, 90 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 45148fd..fabdcfe 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -2,6 +2,7 @@
from .acm import ACM
from .arxiv import Arxiv
+from .citeseerx import CiteSeerX
from .local import LocalFile
from .neurips import NeurIPS
from .openreview import OpenReview
@@ -14,6 +15,7 @@ from .springer import Springer
providers = [
ACM,
Arxiv,
+ CiteSeerX,
NeurIPS,
OpenReview,
PMLR,
diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py
new file mode 100644
index 0000000..fdc0e8a
--- /dev/null
+++ b/paper2remarkable/providers/citeseerx.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for CiteSeerX
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class CiteSeerXInformer(Informer):
+    """Informer subclass carrying the CiteSeerX-specific settings.
+
+    It overrides the author and date metadata keys and adjusts how the
+    author list is passed to the base class formatter.
+    """
+
+    # Keys consumed by the base Informer; presumably the names of <meta>
+    # tags on the CiteSeerX summary page -- TODO confirm against Informer.
+    meta_author_key = "citation_authors"
+    meta_date_key = "citation_year"
+
+    def _format_authors(self, soup_authors):
+        """Format authors by delegating to the base implementation.
+
+        The base formatter is called with ``sep=" "``, ``idx=-1`` and an
+        ``op`` callable that takes the first element of its argument and
+        splits it on commas.
+        """
+
+        def split_first_on_commas(entries):
+            # Only the first entry is looked at; it is split on ",".
+            return entries[0].split(",")
+
+        return super()._format_authors(
+            soup_authors, sep=" ", idx=-1, op=split_first_on_commas
+        )
+
+
+class CiteSeerX(Provider):
+    """Provider for documents hosted on citeseerx.ist.psu.edu.
+
+    Two URL shapes are recognised, both identified by a ``doi`` query
+    parameter: the ``viewdoc/summary`` page (abstract) and the
+    ``viewdoc/download`` link (PDF, optionally carrying a ``;jsessionid=``
+    path parameter).
+    """
+
+    # Raw strings keep the regex escapes away from Python's string-literal
+    # escape handling, and the dots in the host name are escaped so that
+    # they only match a literal ".".
+    re_abs = r"^https?://citeseerx\.ist\.psu\.edu/viewdoc/summary\?doi=(?P<doi>[0-9.]+)"
+    re_pdf = r"^https?://citeseerx\.ist\.psu\.edu/viewdoc/download(;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9.]+)&rep=rep1&type=pdf"
+
+    def __init__(self, *args, **kwargs):
+        """Initialise the base provider and install the CiteSeerX informer."""
+        super().__init__(*args, **kwargs)
+        self.informer = CiteSeerXInformer()
+
+    def _get_doi(self, url):
+        """Return the ``doi`` query value of a summary or download url.
+
+        If the url matches neither pattern, ``exception`` is called with an
+        error message (presumably it raises or aborts -- see ``..utils``).
+        """
+        m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
+        if m:
+            return m.group("doi")
+        exception("Couldn't retrieve CiteSeerX publication doi.")
+
+    def get_abs_pdf_urls(self, url):
+        """Get the pdf and abstract url from a CiteSeerX url.
+
+        The url that was given is returned unchanged in its own slot; the
+        other one is built from the doi. Returns ``(abs_url, pdf_url)``.
+        """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            doi = self._get_doi(abs_url)
+            pdf_url = "http://citeseerx.ist.psu.edu/viewdoc/download?doi={doi}&rep=rep1&type=pdf".format(
+                doi=doi
+            )
+        elif re.match(self.re_pdf, url):
+            pdf_url = url
+            doi = self._get_doi(pdf_url)
+            abs_url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi={doi}".format(
+                doi=doi
+            )
+        else:
+            exception("Couldn't figure out CiteSeerX urls.")
+        return abs_url, pdf_url
+
+    # Declared static because the function takes no ``self``; calling it on
+    # the class works exactly as before, and calling it on an instance no
+    # longer passes the instance as ``src``.
+    @staticmethod
+    def validate(src):
+        """Return a match object if ``src`` is a CiteSeerX url, else None."""
+        return re.match(CiteSeerX.re_abs, src) or re.match(
+            CiteSeerX.re_pdf, src
+        )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e0c98a2..75703ff 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -15,6 +15,7 @@ import unittest
from paper2remarkable.providers import (
ACM,
Arxiv,
+ CiteSeerX,
LocalFile,
NeurIPS,
OpenReview,
@@ -166,6 +167,29 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+    def test_citeseerx_1(self):
+        # Summary-page url: the produced file name must match the expected
+        # author/title/year pattern.
+        expected = "Aaronson_-_Is_P_Versus_NP_Formally_Independent_2003.pdf"
+        provider = CiteSeerX(upload=False, verbose=VERBOSE)
+        result = provider.run(
+            "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548"
+        )
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_citeseerx_2(self):
+        # Direct download (pdf) url: the file name is still derived from the
+        # paper's metadata.
+        expected = "Everingham_et_al_-_The_2005_Pascal_Visual_Object_Classes_Challenge_2006.pdf"
+        provider = CiteSeerX(upload=False, verbose=VERBOSE)
+        result = provider.run(
+            "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.101.6521&rep=rep1&type=pdf"
+        )
+        self.assertEqual(expected, os.path.basename(result))
+
+    def test_citeseerx_3(self):
+        # Summary-page url for a two-author paper: both surnames are expected
+        # in the file name.
+        expected = "Brin_Page_-_The_Anatomy_of_a_Large-Scale_Hypertextual_Web_Search_Engine_1998.pdf"
+        provider = CiteSeerX(upload=False, verbose=VERBOSE)
+        result = provider.run(
+            "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.109.4049"
+        )
+        self.assertEqual(expected, os.path.basename(result))
+
if __name__ == "__main__":
unittest.main()