diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-07-05 23:33:11 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-07-05 23:33:11 +0100 |
| commit | d5230d43d58c992212c89f3c221f72784a3a309d (patch) | |
| tree | 91cc690a03b88823b0afd0552e4c748d54ff104d | |
| parent | Fix no_crop bug (diff) | |
| download | paper2remarkable-d5230d43d58c992212c89f3c221f72784a3a309d.tar.gz paper2remarkable-d5230d43d58c992212c89f3c221f72784a3a309d.zip | |
Add provider for Semantic Scholar
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/semantic_scholar.py | 65 | ||||
| -rw-r--r-- | tests/test_providers.py | 15 |
3 files changed, 82 insertions, 0 deletions
```diff
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index e4fa1bd..c4e3eb5 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -13,6 +13,7 @@ from .pdf_url import PdfUrl
 from .pmlr import PMLR
 from .pubmed import PubMed
 from .springer import Springer
+from .semantic_scholar import SemanticScholar
 
 # NOTE: Order matters here, PdfUrl and HTML should be last
 providers = [
@@ -26,6 +27,7 @@ providers = [
     PMLR,
     PubMed,
     Springer,
+    SemanticScholar,
     LocalFile,
     PdfUrl,
     HTML,
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
new file mode 100644
index 0000000..0a1b414
--- /dev/null
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for SemanticScholar
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+import bs4
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..utils import get_page_with_retry
+
+
+class SemanticScholarInformer(Informer):
+
+    meta_date_key = "citation_publication_date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class SemanticScholar(Provider):
+
+    re_abs = (
+        "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}"
+    )
+    re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = SemanticScholarInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract urls from a SemanticScholar url """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = self._get_pdf_url(abs_url)
+        elif re.match(self.re_pdf, url):
+            pdf_url = url
+            remainder = pdf_url.split("/")[-1][: -len(".pdf")]
+            first_four = pdf_url.split("/")[-2]
+            paper_id = first_four + remainder
+            abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
+        else:
+            raise URLResolutionError("SemanticScholar", url)
+        return abs_url, pdf_url
+
+    def _get_pdf_url(self, url):
+        page = get_page_with_retry(url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        meta = soup.find_all("meta", {"name": "citation_pdf_url"})
+        if not meta:
+            raise URLResolutionError("SemanticScholar", url)
+        return meta[0]["content"]
+
+    def validate(src):
+        return re.match(SemanticScholar.re_abs, src) or re.match(
+            SemanticScholar.re_pdf, src
+        )
diff --git a/tests/test_providers.py b/tests/test_providers.py
index fb75fbd..1a6f84f 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -26,6 +26,7 @@ from paper2remarkable.providers import (
     PdfUrl,
     PubMed,
     Springer,
+    SemanticScholar
 )
 
 VERBOSE = False
@@ -268,6 +269,20 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_semantic_scholar_1(self):
+        prov = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
+        exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_semantic_scholar_2(self):
+        prov = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f"
+        exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
 
 if __name__ == "__main__":
     unittest.main()
```
