diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-10-20 22:12:02 +0200 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-10-20 22:12:02 +0200 |
| commit | d232f5fd4ba4319b02b05efbe2e88df0e8d5c1c4 (patch) | |
| tree | bbb16b19e96e7b2f524257bbbc9e487fd52a8898 | |
| parent | Remove use of virtualenv in release script (diff) | |
| parent | Updates to NBER provider after site updates (diff) | |
| download | paper2remarkable-d232f5fd4ba4319b02b05efbe2e88df0e8d5c1c4.tar.gz paper2remarkable-d232f5fd4ba4319b02b05efbe2e88df0e8d5c1c4.zip | |
Merge branch 'bugfix/nber' into master
| -rw-r--r-- | paper2remarkable/providers/nber.py | 23 | ||||
| -rw-r--r-- | tests/test_ui.py | 2 |
2 files changed, 21 insertions, 4 deletions
diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py index 76bc85f..fa51e8a 100644 --- a/paper2remarkable/providers/nber.py +++ b/paper2remarkable/providers/nber.py @@ -18,8 +18,11 @@ from ..exceptions import URLResolutionError class NBERInformer(Informer): - def _format_year(self, soup_date): - return soup_date.split("-")[0] + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors, sep=" ", idx=0, op=None): + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=None) class NBER(Provider): @@ -27,10 +30,20 @@ class NBER(Provider): re_abs = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)$" re_pdf = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)\.pdf$" + re_pdf_2 = "https://www.nber.org/system/files/working_papers/(?P<ref>[a-z0-9]+)/(?P=ref).pdf" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.informer = NBERInformer() + def get_report_no(self, url): + m = re.match(self.re_pdf_2, url) + if m: + return m["ref"] + raise URLResolutionError( + "NBER", url, reason="Failed to retrieve report number." + ) + def get_abs_pdf_urls(self, url): if re.match(self.re_abs, url): abs_url = url @@ -38,9 +51,13 @@ class NBER(Provider): elif re.match(self.re_pdf, url): pdf_url = url abs_url = url[: -len(".pdf")] + elif re.match(self.re_pdf_2, url): + ref = self.get_report_no(url) + abs_url = f"https://www.nber.org/papers/{ref}" + pdf_url = url else: raise URLResolutionError("NBER", url) return abs_url, pdf_url def validate(src): - return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src) + return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src) or re.match(NBER.re_pdf_2, src) diff --git a/tests/test_ui.py b/tests/test_ui.py index 61b371d..97ec44d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -137,7 +137,7 @@ class TestUI(unittest.TestCase): ( NBER, "https://www.nber.org/papers/w19152.pdf", - "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/system/files/working_papers/w19152/w19152.pdf", ), ( NeurIPS, |
