diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-10-25 21:26:04 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-10-25 21:26:04 +0100 |
| commit | 2caa1d9b55f7a16197349f31f8de2b954e27e0cb (patch) | |
| tree | 278d75fc1ea97d8a2dde7b59c2b41e7ac2203273 | |
| parent | Minor code formatting (diff) | |
| parent | Add PMLR provider (closes #8 and #13) (diff) | |
| download | paper2remarkable-2caa1d9b55f7a16197349f31f8de2b954e27e0cb.tar.gz paper2remarkable-2caa1d9b55f7a16197349f31f8de2b954e27e0cb.zip | |
Merge branch 'feature/provider_pmlr'
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 4 | ||||
| -rw-r--r-- | paper2remarkable/providers/pmlr.py | 68 | ||||
| -rw-r--r-- | tests/test_providers.py | 29 |
3 files changed, 100 insertions, 1 deletion
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index f6f93f9..fcb2d22 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -7,5 +7,7 @@ from .openreview import OpenReview
 from .springer import Springer
 from .local import LocalFile
 from .pdf_url import PdfUrl
+from .pmlr import PMLR
 
-providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
+# NOTE: Order matters here, PdfUrl should be last
+providers = [Arxiv, PubMed, ACM, OpenReview, Springer, PMLR, LocalFile, PdfUrl]
diff --git a/paper2remarkable/providers/pmlr.py b/paper2remarkable/providers/pmlr.py
new file mode 100644
index 0000000..82b8b4d
--- /dev/null
+++ b/paper2remarkable/providers/pmlr.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for PMLR
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class PMLRInformer(Informer):
+
+    meta_date_key = "citation_publication_date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=' ', idx=-1)
+
+
+class PMLR(Provider):
+
+    re_abs_1 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.html"
+    re_pdf_1 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.pdf"
+
+    re_abs_2 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+\w?.html"
+    re_pdf_2 = "https?://proceedings.mlr.press/v\d+/(?P<ref>[\w\-\w]+\d+\w?)/(?P=ref).pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = PMLRInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract url from a OpenReview url """
+        if re.match(self.re_abs_1, url):
+            abs_url = url
+            pdf_url = url.replace(".html", ".pdf")
+        elif re.match(self.re_pdf_1, url):
+            abs_url = url.replace(".pdf", ".html")
+            pdf_url = url
+        elif re.match(self.re_abs_2, url):
+            abs_url = url
+            parts = url.split("/")
+            authoridx = parts[-1].split(".")[0]
+            pdf_url = "/".join(parts[:-1]) + "/%s/%s.pdf" % (
+                authoridx,
+                authoridx,
+            )
+        elif re.match(self.re_pdf_2, url):
+            parts = url.split("/")
+            abs_url = "/".join(parts[:-1]) + ".html"
+            pdf_url = url
+        else:
+            exception("Couldn't figure out OpenReview urls.")
+        return abs_url, pdf_url
+
+    def validate(src):
+        return (
+            re.fullmatch(PMLR.re_abs_1, src)
+            or re.fullmatch(PMLR.re_pdf_1, src)
+            or re.fullmatch(PMLR.re_abs_2, src)
+            or re.fullmatch(PMLR.re_pdf_2, src)
+        )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 1479967..ba5e598 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -17,6 +17,7 @@ from paper2remarkable.providers import (
     Arxiv,
     LocalFile,
     OpenReview,
+    PMLR,
     PdfUrl,
     PubMed,
     Springer,
@@ -122,6 +123,34 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url, filename="test.pdf")
         self.assertEqual("test.pdf", os.path.basename(filename))
 
+    def test_pmlr_1(self):
+        prov = PMLR(upload=False, verbose=VERBOSE)
+        url = "http://proceedings.mlr.press/v97/behrmann19a.html"
+        exp = "Behrmann_et_al_-_Invertible_Residual_Networks_2019.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_pmlr_2(self):
+        prov = PMLR(upload=False, verbose=VERBOSE)
+        url = "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf"
+        exp = "Maaten_Welling_Saul_-_Hidden-Unit_Conditional_Random_Fields_2011.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_pmlr_3(self):
+        prov = PMLR(upload=False, verbose=VERBOSE)
+        url = "http://proceedings.mlr.press/v48/melnyk16.pdf"
+        exp = "Melnyk_Banerjee_-_Estimating_Structured_Vector_Autoregressive_Models_2016.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_pmlr_4(self):
+        prov = PMLR(upload=False, verbose=VERBOSE)
+        url = "http://proceedings.mlr.press/v48/zhangf16.html"
+        exp = "Zhang_Paisley_-_Markov_Latent_Feature_Models_2016.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
 
 if __name__ == "__main__":
     unittest.main()
