From d5230d43d58c992212c89f3c221f72784a3a309d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 5 Jul 2020 23:33:11 +0100 Subject: Add provider for Semantic Scholar --- paper2remarkable/providers/__init__.py | 2 + paper2remarkable/providers/semantic_scholar.py | 65 ++++++++++++++++++++++++++ tests/test_providers.py | 15 ++++++ 3 files changed, 82 insertions(+) create mode 100644 paper2remarkable/providers/semantic_scholar.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index e4fa1bd..c4e3eb5 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -13,6 +13,7 @@ from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed from .springer import Springer +from .semantic_scholar import SemanticScholar # NOTE: Order matters here, PdfUrl and HTML should be last providers = [ @@ -26,6 +27,7 @@ providers = [ PMLR, PubMed, Springer, + SemanticScholar, LocalFile, PdfUrl, HTML, diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py new file mode 100644 index 0000000..0a1b414 --- /dev/null +++ b/paper2remarkable/providers/semantic_scholar.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +"""Provider for SemanticScholar + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re +import bs4 + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError +from ..utils import get_page_with_retry + + +class SemanticScholarInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class SemanticScholar(Provider): + + re_abs = ( + "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}" + ) + re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SemanticScholarInformer() + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract urls from a SemanticScholar url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = self._get_pdf_url(abs_url) + elif re.match(self.re_pdf, url): + pdf_url = url + remainder = pdf_url.split("/")[-1][: -len(".pdf")] + first_four = pdf_url.split("/")[-2] + paper_id = first_four + remainder + abs_url = f"https://www.semanticscholar.org/paper/{paper_id}" + else: + raise URLResolutionError("SemanticScholar", url) + return abs_url, pdf_url + + def _get_pdf_url(self, url): + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + meta = soup.find_all("meta", {"name": "citation_pdf_url"}) + if not meta: + raise URLResolutionError("SemanticScholar", url) + return meta[0]["content"] + + def validate(src): + return re.match(SemanticScholar.re_abs, src) or re.match( + SemanticScholar.re_pdf, src + ) diff --git a/tests/test_providers.py b/tests/test_providers.py index fb75fbd..1a6f84f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -26,6 +26,7 @@ from paper2remarkable.providers import ( PdfUrl, PubMed, Springer, + SemanticScholar ) VERBOSE = False @@ -268,6 +269,20 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + def test_semantic_scholar_1(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" + exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_semantic_scholar_2(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f" + exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From e298f1cfd64253347ec81cadf5324a32d81ec2e5 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 11 Jul 2020 22:21:55 +0100 Subject: Add semantic scholar to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2aa56d7..7de9c40 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ reMarkable from any of the following sources: * [OpenReview](https://openreview.net/) * [PMLR](http://proceedings.mlr.press/) * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) +* [SemanticScholar](https://www.semanticscholar.org/) * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file * A local PDF file -- cgit v1.2.3 From 76bd4412abed0108b4589c84783602447f824d5d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 11 Jul 2020 23:43:17 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 5 +++++ paper2remarkable/__version__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d13d43..bc58bb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.6.8 + +* Add provider for SemanticScholar papers +* Fix bug that made ``no_crop`` option no longer work + ## Version 0.6.7 * Increase robustness to PDF issues by passing through GhostScript (fixes diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 69cf861..d1ac661 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 7) +VERSION = (0, 6, 8) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 0a6a4ff3893474e33f71ef2d8a881cc360a29094 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:23:12 +0100 Subject: Improve robustness of springer provider Adds support for downloading chapters --- paper2remarkable/providers/springer.py | 37 +++++++++++++++++++++++++++++----- tests/test_providers.py | 9 ++++++++- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index 5ce2564..dea8bd5 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -10,10 +10,12 @@ Copyright: 2019, G.J.J. van den Burg import re import urllib +import requests from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError +from ..utils import HEADERS class SpringerInformer(Informer): @@ -26,24 +28,49 @@ class SpringerInformer(Informer): class Springer(Provider): - re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" - re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" + re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" + re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+" + re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.informer = SpringerInformer() + def _get_abs_url(self, pdf_url): + article_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")] + req = requests.head( + article_url, headers=HEADERS, cookies=self.cookiejar + ) + if req.status_code == 200: + return article_url + + chapter_url = pdf_url.replace("content/pdf", "chapter")[: -len(".pdf")] + req = requests.head( + chapter_url, headers=HEADERS, cookies=self.cookiejar + ) + if req.status_code == 200: + return chapter_url + + raise URLResolutionError("Springer", pdf_url) + def get_abs_pdf_urls(self, url): """ Get the pdf and abstract urls from a Springer url """ - if re.match(self.re_abs, url): + if re.match(self.re_abs_1, url): abs_url = url pdf_url = url.replace("article", "content/pdf") + elif re.match(self.re_abs_2, url): + abs_url = url + pdf_url = url.replace("chapter", "content/pdf") elif re.match(self.re_pdf, url): - abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] + abs_url = self._get_abs_url(url) pdf_url = urllib.parse.unquote(url) else: raise URLResolutionError("Springer", url) return abs_url, pdf_url def validate(src): - return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) + return ( + re.match(Springer.re_abs_1, src) + or re.match(Springer.re_abs_2, src) + or re.match(Springer.re_pdf, src) + ) diff --git a/tests/test_providers.py b/tests/test_providers.py index 1a6f84f..5c8a8e4 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -126,13 +126,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_springer(self): + def test_springer_1(self): prov = Springer(upload=False, verbose=VERBOSE) url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_springer_2(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf" + exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_local(self): local_filename = "test.pdf" with open(local_filename, "w") as fp: -- cgit v1.2.3 From d36bda173d5488e23ec918d4bd51c3e6fd76ae06 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:37:16 +0100 Subject: Improve publication date extraction --- paper2remarkable/providers/springer.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index dea8bd5..f9dc952 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -20,11 +20,23 @@ from ..utils import HEADERS class SpringerInformer(Informer): - meta_date_key = "citation_online_date" + meta_date_key = None def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) + def get_year(self, soup): + meta = soup.find_all('meta', {'name': 'citation_online_date'}) + if meta: + date = meta[0]['content'] + return self._format_year(date) + meta = soup.find_all('meta', {'name': 'citation_publication_date'}) + if meta: + date = meta[0]['content'] + return self._format_year(date) + return '' + + class Springer(Provider): -- cgit v1.2.3 From f1f6ec91ca263e2e47357f4ddfd7e0e746fd93e7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:42:25 +0100 Subject: simplify code --- paper2remarkable/providers/springer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index f9dc952..31f0a67 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -26,16 +26,12 @@ class SpringerInformer(Informer): return super()._format_authors(soup_authors, sep=" ", idx=-1) def get_year(self, soup): - meta = soup.find_all('meta', {'name': 'citation_online_date'}) - if meta: - date = meta[0]['content'] - return self._format_year(date) - meta = soup.find_all('meta', {'name': 'citation_publication_date'}) - if meta: - date = meta[0]['content'] - return self._format_year(date) - return '' - + for key in ["citation_online_date", "citation_publication_date"]: + meta = soup.find_all("meta", {"name": key}) + if not meta: + continue + return self._format_year(meta[0]["content"]) + return "" class Springer(Provider): -- cgit v1.2.3 From 8f6f3c433ce37c0205144b56cd48ea1ecc661e67 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:56:05 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc58bb8..3fbf726 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.6.9 + +* Improve robustness of Springer provider + ## Version 0.6.8 * Add provider for SemanticScholar papers diff --git a/README.md b/README.md index 7de9c40..7108c3e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.7 +Paper2reMarkable version 0.6.9 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index d1ac661..214d6b7 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 8) +VERSION = (0, 6, 9) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 071b5a0f2958c34f1a189259346a8732a1110de2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 22:44:33 +0100 Subject: Add provider for SagePub --- README.md | 1 + paper2remarkable/providers/__init__.py | 2 ++ paper2remarkable/providers/sagepub.py | 52 ++++++++++++++++++++++++++++++++++ paper2remarkable/utils.py | 1 + tests/test_providers.py | 17 ++++++++++- 5 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 paper2remarkable/providers/sagepub.py diff --git a/README.md b/README.md index 7108c3e..62c2b0b 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ reMarkable from any of the following sources: * [OpenReview](https://openreview.net/) * [PMLR](http://proceedings.mlr.press/) * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) +* [SagePub](https://journals.sagepub.com/) * [SemanticScholar](https://www.semanticscholar.org/) * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index c4e3eb5..e3075f0 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -12,6 +12,7 @@ from .openreview import OpenReview from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed +from .sagepub import SagePub from .springer import Springer from .semantic_scholar import SemanticScholar @@ -26,6 +27,7 @@ providers = [ OpenReview, PMLR, PubMed, + SagePub, Springer, SemanticScholar, LocalFile, diff --git a/paper2remarkable/providers/sagepub.py b/paper2remarkable/providers/sagepub.py new file mode 100644 index 0000000..7e76df8 --- /dev/null +++ b/paper2remarkable/providers/sagepub.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +"""Provider for SagePub + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class SagePubInformer(Informer): + + meta_author_key = "dc.Creator" + meta_title_key = "dc.Title" + meta_date_key = "dc.Date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + def _format_year(self, soup_date): + return soup_date.split("-")[0] + + +class SagePub(Provider): + + re_abs = "https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+" + re_pdf = "https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SagePubInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("full", "pdf") + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url.replace("pdf", "full") + else: + raise URLResolutionError("SagePub", url) + return abs_url, pdf_url + + def validate(src): + return re.match(SagePub.re_abs, src) or re.match(SagePub.re_pdf, src) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index c2917d5..07b1524 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -38,6 +38,7 @@ def clean_string(s): cleaned = "".join(c if c in allowed else "_" for c in normalized) while "__" in cleaned: cleaned = cleaned.replace("__", "_") + cleaned = cleaned.strip('_') return cleaned diff --git a/tests/test_providers.py b/tests/test_providers.py index 5c8a8e4..ba1cc3a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -25,8 +25,9 @@ from paper2remarkable.providers import ( PMLR, PdfUrl, PubMed, + SagePub, Springer, - SemanticScholar + SemanticScholar, ) VERBOSE = False @@ -290,6 +291,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_sagepub_1(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" + exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_2(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432" + exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 848cbf8bfb82c568c94ff3842ee538dc5c990120 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 22:55:28 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fbf726..723b38f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.7.0 + +* Add provider for SagePub + ## Version 0.6.9 * Improve robustness of Springer provider diff --git a/README.md b/README.md index 62c2b0b..eaec8c4 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.9 +Paper2reMarkable version 0.7.0 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 214d6b7..1020fb7 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 9) +VERSION = (0, 7, 0) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From b56d376ff87cfc7fc599f40e13338a3c1a489877 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 11 Aug 2020 22:18:30 +0100 Subject: Fix failing unit test --- tests/test_ui.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_ui.py b/tests/test_ui.py index 7ab5099..5747eb9 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -101,13 +101,13 @@ class TestUI(unittest.TestCase): ), ( JMLR, - "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", - "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", ), ( JMLR, - "http://www.jmlr.org/papers/v10/xu09a.html", - "http://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", ), ( PMLR, -- cgit v1.2.3 From 0e7b27b4e34e52744f6037f78024e1df2ee26a0c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 11 Aug 2020 22:18:40 +0100 Subject: Readme formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eaec8c4..0f23c4f 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Optionally, you can: Here's the full help of the script: -```text +``` usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] -- cgit v1.2.3 From 7ae25e6f86dcd1da60cdb40d2d12ca45c4b68201 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 27 Aug 2020 13:17:03 +0100 Subject: Rewrite author info function for OpenReview --- paper2remarkable/providers/openreview.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py index 47c0555..8c44f45 100644 --- a/paper2remarkable/providers/openreview.py +++ b/paper2remarkable/providers/openreview.py @@ -8,17 +8,49 @@ Copyright: 2019, G.J.J. van den Burg """ +import json import re from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() class OpenReviewInformer(Informer): meta_date_key = "citation_publication_date" + def get_authors(self, soup): + # Get the authors for OpenReview by parsing the JSON payload + # + # This may not be super robust long term, but works for now. + warning = ( + "Couldn't determine author information, maybe provide " + "the desired filename using '--filename'?" + ) + + script = soup.find("script", {"id": "__NEXT_DATA__"}) + if not script: + logger.warning(warning) + return "" + + try: + paper_data = json.loads(script.contents[0]) + except json.JSONDecodeError: + logger.warning(warning) + return "" + + try: + content = paper_data["props"]["pageProps"]["forumNote"]["content"] + authors = content["authors"] + except KeyError: + logger.warning(warning) + return "" + return self._format_authors(authors) + def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) -- cgit v1.2.3 From 3a5260a3a1311bf589fe7a4ef221939f8c9727d1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 27 Aug 2020 13:37:33 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 723b38f..5dfd414 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.7.1 + +* Fix OpenReview provider after site change + ## Version 0.7.0 * Add provider for SagePub diff --git a/README.md b/README.md index 0f23c4f..2cfe192 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.7.0 +Paper2reMarkable version 0.7.1 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 1020fb7..e501a41 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 7, 0) +VERSION = (0, 7, 1) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From f242e29147ac8ec1450a0bdb90b1dc0da3aa4e85 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 30 Aug 2020 13:20:45 +0100 Subject: Add tentative windows instructions --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2cfe192..8b1db06 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,14 @@ Specifically: - **Arch Linux:** ``pacman -S pdftk ghostscript poppler`` - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``. - - **MacOs:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). + - **MacOS:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). + - **Windows:** Installers or executables are available for + [qpdf](https://github.com/qpdf/qpdf/releases) (for instance the mingw + binary executables) and + [GhostScript](https://www.ghostscript.com/download/gsdnld.html). + Importantly, Windows support is untested and these are generic + instructions, so we welcome clarifications where needed. The Docker + instructions below may be more convenient on Windows. 3. Finally, install ``paper2remarkable``: ``` -- cgit v1.2.3 From 25f372c69dfc846faebb4763ecc60e9e0750021b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 20:54:38 +0200 Subject: Improve support for Neurips provider (fixes #59) --- paper2remarkable/providers/neurips.py | 4 ++-- tests/test_providers.py | 14 ++++++++++++++ tests/test_ui.py | 7 ++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/paper2remarkable/providers/neurips.py b/paper2remarkable/providers/neurips.py index 87cf2c1..d76202c 100644 --- a/paper2remarkable/providers/neurips.py +++ b/paper2remarkable/providers/neurips.py @@ -25,8 +25,8 @@ class NeurIPSInformer(Informer): class NeurIPS(Provider): - re_abs = "^https?://papers.nips.cc/paper/[\d\w\-]+$" - re_pdf = "^https?://papers.nips.cc/paper/[\d\w\-]+.pdf$" + re_abs = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+$" + re_pdf = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+.pdf$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/tests/test_providers.py b/tests/test_providers.py index ba1cc3a..eeaef82 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -233,6 +233,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_3(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits" + exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_neurips_4(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf" + exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_citeseerx_1(self): prov = CiteSeerX(upload=False, verbose=VERBOSE) url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548" diff --git a/tests/test_ui.py b/tests/test_ui.py index 5747eb9..61b371d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -98,7 +98,7 @@ class TestUI(unittest.TestCase): PdfUrl, "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", - ), + ), ( JMLR, "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", @@ -149,6 +149,11 @@ class TestUI(unittest.TestCase): "https://papers.nips.cc/paper/7796-middle-out-decoding", "https://papers.nips.cc/paper/7796-middle-out-decoding", ), + ( + NeurIPS, + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + ), ( CiteSeerX, "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", -- cgit v1.2.3 From 6bf72b6f8c08c7949b5efe4ef244cb0671bf5bf8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 20:57:49 +0200 Subject: Use python builtin venv instead of virtualenv in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 769fc87..bcbc420 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ doc: install ## Build documentation with Sphinx venv: $(VENV_DIR)/bin/activate $(VENV_DIR)/bin/activate: - test -d $(VENV_DIR) || virtualenv $(VENV_DIR) + test -d $(VENV_DIR) || python -m venv $(VENV_DIR) source $(VENV_DIR)/bin/activate && pip install -e .[dev] touch $(VENV_DIR)/bin/activate -- cgit v1.2.3