aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2020-02-19 12:14:44 +0000
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2020-02-19 12:14:44 +0000
commit0128dce1c10be8db965584aa387bf00040a3f018 (patch)
tree76a45c3d06eaa4c647a50a7830c5005f0b5c0643
parentReplace spaces in author names (diff)
downloadpaper2remarkable-0128dce1c10be8db965584aa387bf00040a3f018.tar.gz
paper2remarkable-0128dce1c10be8db965584aa387bf00040a3f018.zip
Add NBER provider
-rw-r--r--paper2remarkable/providers/__init__.py4
-rw-r--r--paper2remarkable/providers/nber.py46
-rw-r--r--tests/test_providers.py15
3 files changed, 64 insertions, 1 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index f87a044..c868bc4 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -5,6 +5,7 @@ from .arxiv import Arxiv
from .citeseerx import CiteSeerX
from .html import HTML
from .local import LocalFile
+from .nber import NBER
from .neurips import NeurIPS
from .openreview import OpenReview
from .pdf_url import PdfUrl
@@ -12,11 +13,12 @@ from .pmlr import PMLR
from .pubmed import PubMed
from .springer import Springer
-# NOTE: Order matters here, PdfUrl should be last
+# NOTE: Order matters here, PdfUrl and HTML should be last
providers = [
ACM,
Arxiv,
CiteSeerX,
+ NBER,
NeurIPS,
OpenReview,
PMLR,
diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py
new file mode 100644
index 0000000..76bc85f
--- /dev/null
+++ b/paper2remarkable/providers/nber.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for NBER
+
+(US) National Bureau of Economic Research
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class NBERInformer(Informer):
+    """Informer subclass that overrides how the year is formatted."""
+
+    def _format_year(self, soup_date):
+        """Return the text of ``soup_date`` that precedes its first "-".
+
+        If ``soup_date`` contains no "-", the whole string is returned.
+        """
+        # presumably a YYYY-MM-DD style date -- confirm against Informer
+        leading, _dash, _remainder = soup_date.partition("-")
+        return leading
+
+
+class NBER(Provider):
+    """Provider for papers hosted under ``www.nber.org/papers/``.
+
+    Two URL shapes are recognised: the paper page itself (``re_abs``) and
+    the same URL with a ``.pdf`` suffix (``re_pdf``).
+    """
+
+    # Raw strings: ``\.`` must reach the regex engine as an escaped dot, and
+    # in a non-raw literal Python flags it as an invalid escape sequence.
+    re_abs = r"https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)$"
+    re_pdf = r"https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)\.pdf$"
+
+    def __init__(self, *args, **kwargs):
+        """Pass all arguments on to ``Provider`` and attach an NBERInformer."""
+        super().__init__(*args, **kwargs)
+        self.informer = NBERInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """Derive the paper-page URL and the PDF URL from ``url``.
+
+        Returns:
+            A tuple ``(abs_url, pdf_url)``.
+
+        Raises:
+            URLResolutionError: if ``url`` matches neither ``re_abs`` nor
+                ``re_pdf``.
+        """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url + ".pdf"
+        elif re.match(self.re_pdf, url):
+            pdf_url = url
+            # Strip the trailing ".pdf" to get back to the paper page.
+            abs_url = url[: -len(".pdf")]
+        else:
+            raise URLResolutionError("NBER", url)
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """Tell whether ``src`` looks like an NBER paper or PDF URL.
+
+        Returns the ``re.Match`` object of whichever pattern matched (truthy)
+        or ``None`` when neither does.  Declared static because it takes no
+        ``self``; this keeps ``NBER.validate(src)`` working and also makes
+        the call valid on an instance.
+        """
+        return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index d0e3d40..38f88b7 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -18,6 +18,7 @@ from paper2remarkable.providers import (
CiteSeerX,
HTML,
LocalFile,
+ NBER,
NeurIPS,
OpenReview,
PMLR,
@@ -179,6 +180,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+    def test_nber_1(self):
+        """Run NBER on a paper URL without ``.pdf`` and check the filename."""
+        provider = NBER(upload=False, verbose=VERBOSE)
+        produced = provider.run("https://www.nber.org/papers/w26752")
+        expected = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf"
+        # Only the basename is compared; the directory part is not asserted.
+        self.assertEqual(expected, os.path.basename(produced))
+
+    def test_nber_2(self):
+        """Run NBER on a URL ending in ``.pdf`` and check the filename."""
+        provider = NBER(upload=False, verbose=VERBOSE)
+        produced = provider.run("https://www.nber.org/papers/w19152.pdf")
+        expected = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf"
+        # Only the basename is compared; the directory part is not asserted.
+        self.assertEqual(expected, os.path.basename(produced))
+
def test_neurips_1(self):
prov = NeurIPS(upload=False, verbose=VERBOSE)
url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf"