13 files changed, 291 insertions, 19 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6d13d43..5dfd414 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,22 @@
 # Changelog
 
+## Version 0.7.1
+
+* Fix OpenReview provider after site change
+
+## Version 0.7.0
+
+* Add provider for SagePub
+
+## Version 0.6.9
+
+* Improve robustness of Springer provider
+
+## Version 0.6.8
+
+* Add provider for SemanticScholar papers
+* Fix bug that caused the ``no_crop`` option to stop working
+
 ## Version 0.6.7
 
 * Increase robustness to PDF issues by passing through GhostScript (fixes 
diff --git a/Makefile b/Makefile
index 769fc87..bcbc420 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ doc: install ## Build documentation with Sphinx
 venv: $(VENV_DIR)/bin/activate
 
 $(VENV_DIR)/bin/activate:
-	test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
+	test -d $(VENV_DIR) || python -m venv $(VENV_DIR)
 	source $(VENV_DIR)/bin/activate && pip install -e .[dev]
 	touch $(VENV_DIR)/bin/activate
 
diff --git a/README.md b/README.md
index 2aa56d7..8b1db06 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,8 @@ reMarkable from any of the following sources:
 * [OpenReview](https://openreview.net/)
 * [PMLR](http://proceedings.mlr.press/)
 * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/)
+* [SagePub](https://journals.sagepub.com/)
+* [SemanticScholar](https://www.semanticscholar.org/)
 * [SpringerLink](https://link.springer.com/)
 * A generic URL to a PDF file
 * A local PDF file
@@ -68,13 +70,13 @@ Optionally, you can:
 
 Here's the full help of the script:
 
-```text
+```
 usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V]
            [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM]
            [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI]
            input
 
-Paper2reMarkable version 0.6.7
+Paper2reMarkable version 0.7.1
 
 positional arguments:
   input                 URL to a paper or the path of a local PDF file
@@ -145,7 +147,14 @@ Specifically:
    - **Arch Linux:** ``pacman -S pdftk ghostscript poppler``
    - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace 
      ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``.
-   - **MacOs:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)).
+   - **macOS:** ``brew install pdftk-java ghostscript poppler`` (using [Homebrew](https://brew.sh/)).
+   - **Windows:** Installers or executables are available for 
+     [qpdf](https://github.com/qpdf/qpdf/releases) (for instance the mingw 
+     binary executables) and 
+     [GhostScript](https://www.ghostscript.com/download/gsdnld.html). 
+     Importantly, Windows support is untested and these are generic 
+     instructions, so we welcome clarifications where needed. The Docker 
+     instructions below may be more convenient on Windows.
 
 3. Finally, install ``paper2remarkable``:
    ```
diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py
index 69cf861..e501a41 100644
--- a/paper2remarkable/__version__.py
+++ b/paper2remarkable/__version__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 
-VERSION = (0, 6, 7)
+VERSION = (0, 7, 1)
 
 __version__ = ".".join(map(str, VERSION))
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index e4fa1bd..e3075f0 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -12,7 +12,9 @@ from .openreview import OpenReview
 from .pdf_url import PdfUrl
 from .pmlr import PMLR
 from .pubmed import PubMed
+from .sagepub import SagePub
 from .springer import Springer
+from .semantic_scholar import SemanticScholar
 
 # NOTE: Order matters here, PdfUrl and HTML should be last
 providers = [
@@ -25,7 +27,9 @@ providers = [
     OpenReview,
     PMLR,
     PubMed,
+    SagePub,
     Springer,
+    SemanticScholar,
     LocalFile,
     PdfUrl,
     HTML,
diff --git a/paper2remarkable/providers/neurips.py b/paper2remarkable/providers/neurips.py
index 87cf2c1..d76202c 100644
--- a/paper2remarkable/providers/neurips.py
+++ b/paper2remarkable/providers/neurips.py
@@ -25,8 +25,8 @@ class NeurIPSInformer(Informer):
 
 class NeurIPS(Provider):
 
-    re_abs = "^https?://papers.nips.cc/paper/[\d\w\-]+$"
-    re_pdf = "^https?://papers.nips.cc/paper/[\d\w\-]+.pdf$"
+    re_abs = r"^https?://papers\.n(eur)?ips\.cc/paper/[\d\w\-]+$"
+    re_pdf = r"^https?://papers\.n(eur)?ips\.cc/paper/[\d\w\-]+\.pdf$"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
index 47c0555..8c44f45 100644
--- a/paper2remarkable/providers/openreview.py
+++ b/paper2remarkable/providers/openreview.py
@@ -8,17 +8,49 @@ Copyright: 2019, G.J.J. van den Burg
 
 """
 
+import json
 import re
 
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
 
 
 class OpenReviewInformer(Informer):
 
     meta_date_key = "citation_publication_date"
 
+    def get_authors(self, soup):
+        # Read the authors from the JSON payload in the "__NEXT_DATA__" script
+        # tag. If the tag is missing, empty, not valid JSON, or doesn't have
+        # the expected structure, log a warning and return "" (never raise).
+        warning = (
+            "Couldn't determine author information, maybe provide "
+            "the desired filename using '--filename'?"
+        )
+
+        script = soup.find("script", {"id": "__NEXT_DATA__"})
+        if not script:
+            logger.warning(warning)
+            return ""
+
+        try:
+            paper_data = json.loads(script.contents[0])
+        except (IndexError, json.JSONDecodeError):
+            logger.warning(warning)
+            return ""
+
+        try:
+            content = paper_data["props"]["pageProps"]["forumNote"]["content"]
+            authors = content["authors"]
+        except (KeyError, TypeError):
+            logger.warning(warning)
+            return ""
+        return self._format_authors(authors)
+
     def _format_authors(self, soup_authors):
         return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
diff --git a/paper2remarkable/providers/sagepub.py b/paper2remarkable/providers/sagepub.py
new file mode 100644
index 0000000..7e76df8
--- /dev/null
+++ b/paper2remarkable/providers/sagepub.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for SagePub
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class SagePubInformer(Informer):
+
+    meta_author_key = "dc.Creator"
+    meta_title_key = "dc.Title"
+    meta_date_key = "dc.Date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+    def _format_year(self, soup_date):
+        return soup_date.split("-")[0]
+
+
+class SagePub(Provider):
+
+    re_abs = r"https?://journals\.sagepub\.com/doi/full/\d{2}\.\d{4}/\d+"
+    re_pdf = r"https?://journals\.sagepub\.com/doi/pdf/\d{2}\.\d{4}/\d+"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = SagePubInformer()
+
+    def get_abs_pdf_urls(self, url):
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = url.replace("/doi/full/", "/doi/pdf/", 1)
+        elif re.match(self.re_pdf, url):
+            pdf_url = url
+            abs_url = url.replace("/doi/pdf/", "/doi/full/", 1)
+        else:
+            raise URLResolutionError("SagePub", url)
+        return abs_url, pdf_url
+
+    def validate(src):
+        return re.match(SagePub.re_abs, src) or re.match(SagePub.re_pdf, src)
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
new file mode 100644
index 0000000..0a1b414
--- /dev/null
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for SemanticScholar
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+import bs4
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..utils import get_page_with_retry
+
+
+class SemanticScholarInformer(Informer):
+
+    meta_date_key = "citation_publication_date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class SemanticScholar(Provider):
+
+    re_abs = (
+        r"https?://www\.semanticscholar\.org/paper/[A-Za-z0-9%-]+/[0-9a-f]{40}"
+    )
+    re_pdf = r"https?://pdfs\.semanticscholar\.org/[0-9a-f]{4}/[0-9a-f]{36}\.pdf"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = SemanticScholarInformer()
+
+    def get_abs_pdf_urls(self, url):
+        """ Get the pdf and abstract urls from a SemanticScholar url """
+        if re.match(self.re_abs, url):
+            abs_url = url
+            pdf_url = self._get_pdf_url(abs_url)
+        elif re.match(self.re_pdf, url):
+            pdf_url = url
+            remainder = pdf_url.split("/")[-1][: -len(".pdf")]
+            first_four = pdf_url.split("/")[-2]
+            paper_id = first_four + remainder
+            abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
+        else:
+            raise URLResolutionError("SemanticScholar", url)
+        return abs_url, pdf_url
+
+    def _get_pdf_url(self, url):
+        page = get_page_with_retry(url)
+        soup = bs4.BeautifulSoup(page, "html.parser")
+        meta = soup.find_all("meta", {"name": "citation_pdf_url"})
+        if not meta:
+            raise URLResolutionError("SemanticScholar", url)
+        return meta[0]["content"]
+
+    def validate(src):
+        return re.match(SemanticScholar.re_abs, src) or re.match(
+            SemanticScholar.re_pdf, src
+        )
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index 5ce2564..31f0a67 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -10,40 +10,75 @@ Copyright: 2019, G.J.J. van den Burg
 
 import re
 import urllib
+import requests
 
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
+from ..utils import HEADERS
 
 
 class SpringerInformer(Informer):
 
-    meta_date_key = "citation_online_date"
+    meta_date_key = None
 
     def _format_authors(self, soup_authors):
         return super()._format_authors(soup_authors, sep=" ", idx=-1)
 
+    def get_year(self, soup):
+        """Format the year from the first date meta tag present, else ''"""
+        for name in ("citation_online_date", "citation_publication_date"):
+            tags = soup.find_all("meta", {"name": name})
+            if tags:
+                return self._format_year(tags[0]["content"])
+        return ""
+
 
 class Springer(Provider):
 
-    re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
-    re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
+    re_abs_1 = r"https?:\/\/link\.springer\.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
+    re_abs_2 = r"https?:\/\/link\.springer\.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+"
+    re_pdf = r"https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.informer = SpringerInformer()
 
+    def _get_abs_url(self, pdf_url):
+        """Find the abstract url that belongs to a Springer PDF url
+
+        The PDF url doesn't reveal whether it is an article or a chapter,
+        so both candidates are probed with a HEAD request and the first one
+        answering 200 is returned. Raises URLResolutionError otherwise.
+        """
+        for section in ("article", "chapter"):
+            candidate = pdf_url.replace("content/pdf", section)
+            candidate = candidate[: -len(".pdf")]
+            response = requests.head(
+                candidate, headers=HEADERS, cookies=self.cookiejar
+            )
+            if response.status_code == 200:
+                return candidate
+        raise URLResolutionError("Springer", pdf_url)
+
     def get_abs_pdf_urls(self, url):
         """ Get the pdf and abstract urls from a Springer url """
-        if re.match(self.re_abs, url):
+        if re.match(self.re_abs_1, url):
             abs_url = url
             pdf_url = url.replace("article", "content/pdf")
+        elif re.match(self.re_abs_2, url):
+            abs_url = url
+            pdf_url = url.replace("chapter", "content/pdf")
         elif re.match(self.re_pdf, url):
-            abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
+            abs_url = self._get_abs_url(url)
             pdf_url = urllib.parse.unquote(url)
         else:
             raise URLResolutionError("Springer", url)
         return abs_url, pdf_url
 
     def validate(src):
-        return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
+        return (
+            re.match(Springer.re_abs_1, src)
+            or re.match(Springer.re_abs_2, src)
+            or re.match(Springer.re_pdf, src)
+        )
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index c2917d5..07b1524 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -38,6 +38,7 @@ def clean_string(s):
     cleaned = "".join(c if c in allowed else "_" for c in normalized)
     while "__" in cleaned:
         cleaned = cleaned.replace("__", "_")
+    cleaned = cleaned.strip('_')
     return cleaned
 
 
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 479fb84..70d012a 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -25,7 +25,9 @@ from paper2remarkable.providers import (
     PMLR,
     PdfUrl,
     PubMed,
+    SagePub,
     Springer,
+    SemanticScholar,
 )
 
 VERBOSE = False
@@ -125,13 +127,20 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp_filename, os.path.basename(filename))
 
-    def test_springer(self):
+    def test_springer_1(self):
         prov = Springer(upload=False, verbose=VERBOSE)
         url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
         exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
         filename = prov.run(url)
         self.assertEqual(exp_filename, os.path.basename(filename))
 
+    def test_springer_2(self):
+        prov = Springer(upload=False, verbose=VERBOSE)
+        url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf"
+        exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp_filename, os.path.basename(filename))
+
     def test_local(self):
         local_filename = "test.pdf"
         with open(local_filename, "w") as fp:
@@ -224,6 +233,20 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_neurips_3(self):
+        prov = NeurIPS(upload=False, verbose=VERBOSE)
+        url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits"
+        exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_neurips_4(self):
+        prov = NeurIPS(upload=False, verbose=VERBOSE)
+        url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf"
+        exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
     def test_citeseerx_1(self):
         prov = CiteSeerX(upload=False, verbose=VERBOSE)
         url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548"
@@ -278,6 +301,35 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_semantic_scholar_1(self):
+        prov = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
+        exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_semantic_scholar_2(self):
+        prov = SemanticScholar(upload=False, verbose=VERBOSE)
+        url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f"
+        exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_sagepub_1(self):
+        prov = SagePub(upload=False, verbose=VERBOSE)
+        url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679"
+        exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_sagepub_2(self):
+        prov = SagePub(upload=False, verbose=VERBOSE)
+        url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432"
+        exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 7ab5099..61b371d 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -98,16 +98,16 @@ class TestUI(unittest.TestCase):
                 PdfUrl,
                 "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf",
                 "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf",
-                ),
+            ),
             (
                 JMLR,
-                "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
-                "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
+                "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
+                "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
             ),
             (
                 JMLR,
-                "http://www.jmlr.org/papers/v10/xu09a.html",
-                "http://www.jmlr.org/papers/v10/xu09a.html",
+                "https://www.jmlr.org/papers/v10/xu09a.html",
+                "https://www.jmlr.org/papers/v10/xu09a.html",
             ),
             (
                 PMLR,
@@ -150,6 +150,11 @@ class TestUI(unittest.TestCase):
                 "https://papers.nips.cc/paper/7796-middle-out-decoding",
             ),
             (
+                NeurIPS,
+                "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf",
+                "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf",
+            ),
+            (
                 CiteSeerX,
                 "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548",
                 "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548",