diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-06-13 16:57:52 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-06-13 16:57:52 +0100 |
| commit | 7e85930bf0e6dd74856d438dcb632b633af1cb83 (patch) | |
| tree | f50bbed5ea76b38add706e5cf7b47441d5b6f656 | |
| parent | bump version and update readme (diff) | |
| download | paper2remarkable-7e85930bf0e6dd74856d438dcb632b633af1cb83.tar.gz paper2remarkable-7e85930bf0e6dd74856d438dcb632b633af1cb83.zip | |
Add support for OpenReview papers
| -rwxr-xr-x | arxiv2remarkable.py | 45 | ||||
| -rw-r--r-- | test.py | 10 |
2 files changed, 55 insertions, 0 deletions
```diff
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
index 6eb81ff..8ee377b 100755
--- a/arxiv2remarkable.py
+++ b/arxiv2remarkable.py
@@ -511,6 +511,51 @@ class ACMProvider(Provider):
         return dict(title=title, date=date, authors=authors)
 
 
+class OpenReviewProvider(Provider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract url from a OpenReview url """
+        if re.match(
+                "https?://openreview.net/forum\?id=[A-Za-z0-9]+", url):
+            abs_url = url
+            pdf_url = url.replace('forum', 'pdf')
+        elif re.match(
+                "https?://openreview.net/pdf\?id=[A-Za-z0-9]+", url):
+            abs_url = url.replace('pdf', 'forum')
+            pdf_url = url
+        else:
+            exception("Couldn't figure out OpenReview urls.")
+        return abs_url, pdf_url
+
+    def validate(src):
+        """ Check if the url is a valid OpenReview url. """
+        m = re.match(
+            "https?://openreview.net/(forum|pdf)\?id=[A-Za-z0-9]+", src
+        )
+        return not m is None
+
+    def retrieve_pdf(self, src, filename):
+        """ Download the file and save as filename """
+        _, pdf_url = self.get_abs_pdf_urls(src)
+        self.download_url(pdf_url, filename)
+
+    def get_paper_info(self, src):
+        """ Extract the paper's authors, title, and publication year """
+        abs_url, _ = self.get_abs_pdf_urls(src)
+        self.log("Getting paper info from OpenReview")
+        page = self.get_page_with_retry(abs_url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        authors = [
+            x["content"] for x in soup.find_all("meta", {"name":
+                "citation_author"})]
+        authors = [x.split(' ')[-1].strip() for x in authors]
+        title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
+        date = soup.find_all("meta", {"name":
+            "citation_publication_date"})[0]["content"]
+        return dict(title=title, date=date, authors=authors)
+
 class LocalFileProvider(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/test.py b/test.py
--- a/test.py
+++ b/test.py
@@ -15,6 +15,7 @@ from arxiv2remarkable import (
     ArxivProvider,
     PMCProvider,
     ACMProvider,
+    OpenReviewProvider,
     LocalFileProvider,
     PdfUrlProvider,
 )
@@ -73,6 +74,15 @@ class Tests(unittest.TestCase):
         fsize = os.path.getsize(filename)
         self.assertTrue(1691444 < fsize <= 1693444)
 
+    def test_openreview(self):
+        prov = OpenReviewProvider(upload=False)
+        url = "https://openreview.net/forum?id=S1x4ghC9tQ"
+        exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+        fsize = os.path.getsize(filename)
+        self.assertTrue(1110316 < fsize <= 1112316)
+
     def test_local(self):
         local_filename = "test.pdf"
         with open(local_filename, "w") as fp:
```
