Merge branch 'feature/semantic_scholar'

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-07-11 23:28:33 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-07-11 23:28:33 +0100
commit: 3315dd8a6a7b61f21c3bc8163ce9ca334cab8007 (patch)
tree: 0f5457de61ee13a31aca558eb2dc5aa1b9713039
parent: Fix no_crop bug (diff)
parent: Add semantic scholar to readme (diff)
download: paper2remarkable-3315dd8a6a7b61f21c3bc8163ce9ca334cab8007.tar.gz paper2remarkable-3315dd8a6a7b61f21c3bc8163ce9ca334cab8007.zip
4 files changed, 83 insertions, 0 deletions
diff --git a/README.md b/README.md
index 2aa56d7..7de9c40 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ reMarkable from any of the following sources:
 * [OpenReview](https://openreview.net/)
 * [PMLR](http://proceedings.mlr.press/)
 * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/)
+* [SemanticScholar](https://www.semanticscholar.org/)
 * [SpringerLink](https://link.springer.com/)
 * A generic URL to a PDF file
 * A local PDF file
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index e4fa1bd..c4e3eb5 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -13,6 +13,7 @@ from .pdf_url import PdfUrl
 from .pmlr import PMLR
 from .pubmed import PubMed
 from .springer import Springer
+from .semantic_scholar import SemanticScholar
 
 # NOTE: Order matters here, PdfUrl and HTML should be last
 providers = [
@@ -26,6 +27,7 @@ providers = [
     PMLR,
     PubMed,
     Springer,
+    SemanticScholar,
     LocalFile,
     PdfUrl,
     HTML,
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
new file mode 100644
index 0000000..0a1b414
--- /dev/null
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for SemanticScholar
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+import bs4
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..utils import get_page_with_retry
+
+
+class SemanticScholarInformer(Informer):
+    """ Informer that overrides the date meta key and author formatting """
+    meta_date_key = "citation_publication_date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class SemanticScholar(Provider):
+    """ Provider for SemanticScholar abstract pages and hosted pdf files """
+
+    re_abs = (
+        r"https?://www\.semanticscholar\.org/paper/[A-Za-z0-9%-]+/[0-9a-f]{40}"
+    )
+    re_pdf = (
+        r"https?://pdfs\.semanticscholar\.org/[0-9a-f]{4}/[0-9a-f]{36}\.pdf"
+    )
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = SemanticScholarInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract urls from a SemanticScholar url """
+        if re.match(self.re_abs, url):
+            return url, self._get_pdf_url(url)
+        if not re.match(self.re_pdf, url):
+            raise URLResolutionError("SemanticScholar", url)
+        # paper id = 4-char directory name + 36-char file stem of the pdf url
+        first_four, filename = url.split("/")[-2:]
+        paper_id = first_four + filename[: -len(".pdf")]
+        abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
+        return abs_url, url
+
+    def _get_pdf_url(self, url):
+        """ Get the pdf url from the citation_pdf_url meta tag of the page """
+        soup = bs4.BeautifulSoup(get_page_with_retry(url), "html.parser")
+        meta = soup.find_all("meta", {"name": "citation_pdf_url"})
+        if not meta:
+            raise URLResolutionError("SemanticScholar", url)
+        return meta[0]["content"]
+
+    def validate(src):
+        """ Return a match if src is a SemanticScholar abstract or pdf url """
+        prov = SemanticScholar
+        return re.match(prov.re_abs, src) or re.match(prov.re_pdf, src)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index fb75fbd..1a6f84f 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -26,6 +26,7 @@ from paper2remarkable.providers import (
     PdfUrl,
     PubMed,
     Springer,
+    SemanticScholar
 )
 
 VERBOSE = False
@@ -268,6 +269,20 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_semantic_scholar_1(self):
+        # a direct pdfs.semanticscholar.org url should give the named file
+        provider = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
+        expected = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
+        self.assertEqual(expected, os.path.basename(provider.run(url)))
+
+    def test_semantic_scholar_2(self):
+        # an abstract page url should resolve to the pdf and give the named file
+        provider = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f"
+        expected = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf"
+        self.assertEqual(expected, os.path.basename(provider.run(url)))
+
 
 if __name__ == "__main__":
     unittest.main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-07-11 23:28:33 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-07-11 23:28:33 +0100
commit	3315dd8a6a7b61f21c3bc8163ce9ca334cab8007 (patch)
tree	0f5457de61ee13a31aca558eb2dc5aa1b9713039
parent	Fix no_crop bug (diff)
parent	Add semantic scholar to readme (diff)
download	paper2remarkable-3315dd8a6a7b61f21c3bc8163ce9ca334cab8007.tar.gz paper2remarkable-3315dd8a6a7b61f21c3bc8163ce9ca334cab8007.zip