aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- paper2remarkable/providers/__init__.py | 4
-rw-r--r-- paper2remarkable/providers/_base.py | 2
-rw-r--r-- paper2remarkable/providers/tandfonline.py | 74
-rw-r--r-- paper2remarkable/utils.py | 22
-rw-r--r-- tests/test_providers.py | 17
-rw-r--r-- tests/test_ui.py | 5
6 files changed, 121 insertions, 3 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 3eeda5c..371ab82 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -15,8 +15,9 @@ from .pdf_url import PdfUrl
from .pmlr import PMLR
from .pubmed import PubMed
from .sagepub import SagePub
-from .springer import Springer
from .semantic_scholar import SemanticScholar
+from .springer import Springer
+from .tandfonline import TandFOnline
# NOTE: Order matters here, PdfUrl and HTML should be last
providers = [
@@ -34,6 +35,7 @@ providers = [
SagePub,
Springer,
SemanticScholar,
+ TandFOnline,
LocalFile,
PdfUrl,
HTML,
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 8f82f1d..74ab9e6 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -206,7 +206,9 @@ class Provider(metaclass=abc.ABCMeta):
self.initial_dir = os.getcwd()
with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir:
os.chdir(working_dir)
+ print("working_dir:", working_dir)
self.retrieve_pdf(pdf_url, tmp_filename)
+
assert_file_is_pdf(tmp_filename)
intermediate_fname = tmp_filename
diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py
new file mode 100644
index 0000000..d077211
--- /dev/null
+++ b/paper2remarkable/providers/tandfonline.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Taylor and Francis Online
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
+
+
+class TandFOnlineInformer(Informer):
+    """Informer used by the Taylor and Francis Online provider.
+
+    Selects the ``dc.*`` metadata keys and overrides how the author and
+    date values are formatted.
+    """
+
+    meta_title_key = "dc.Title"
+    meta_author_key = "dc.Creator"
+    meta_date_key = "dc.Date"
+
+    def _format_authors(self, soup_authors):
+        """Delegate to the base formatter with ``sep=" "`` and ``idx=-1``."""
+        # NOTE(review): presumably this keeps the last space-separated part
+        # of each name (the surname) -- confirm against Informer.
+        formatted = super()._format_authors(soup_authors, sep=" ", idx=-1)
+        return formatted
+
+    def _format_year(self, soup_date):
+        """Return the last space-separated token of ``soup_date``, stripped."""
+        tokens = soup_date.strip().split(" ")
+        return tokens[-1].strip()
+
+
+class TandFOnline(Provider):
+    """Provider for articles on Taylor and Francis Online (tandfonline.com).
+
+    ``re_abs`` matches ``/doi/full/`` and ``/doi/abs/`` URLs, ``re_pdf``
+    matches ``/doi/full/`` and ``/doi/pdf/`` URLs. Both capture the DOI in
+    the named group ``doi``.
+    """
+
+    # Raw strings so that "\d" and "\." reach the regex engine unchanged
+    # instead of being treated as (invalid) string escapes. The dots of the
+    # host name are escaped so they only match a literal ".".
+    re_abs = r"^https?://www\.tandfonline\.com/doi/(full|abs)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)"
+    re_pdf = r"^https?://www\.tandfonline\.com/doi/(full|pdf)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = TandFOnlineInformer()
+
+    def _get_doi(self, url):
+        """Return the DOI captured from ``url``.
+
+        Raises URLResolutionError when ``url`` matches neither ``re_abs``
+        nor ``re_pdf``.
+        """
+        m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
+        if m is None:
+            raise URLResolutionError(
+                "TandFOnline", url, reason="Failed to retrieve DOI."
+            )
+        return m["doi"]
+
+    def get_abs_pdf_urls(self, url):
+        """Return the ``(abs_url, pdf_url)`` pair for ``url``.
+
+        A URL matching ``re_abs`` is returned unchanged as ``abs_url``; for
+        a URL matching only ``re_pdf`` the ``/doi/full/`` URL is built from
+        the DOI. ``pdf_url`` is always rebuilt from the DOI.
+
+        Raises URLResolutionError when ``url`` matches neither pattern.
+        """
+        is_abs = re.match(self.re_abs, url) is not None
+        if not is_abs and re.match(self.re_pdf, url) is None:
+            raise URLResolutionError("TandFOnline", url)
+
+        doi = self._get_doi(url)
+        pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format(
+            doi=doi
+        )
+        if is_abs:
+            abs_url = url
+        else:
+            # full redirects to abs if we don't have access
+            abs_url = "https://www.tandfonline.com/doi/full/{doi}".format(
+                doi=doi
+            )
+        return abs_url, pdf_url
+
+    @staticmethod
+    def validate(src):
+        """Return True when ``src`` matches ``re_abs`` or ``re_pdf``."""
+        m = re.match(TandFOnline.re_abs, src) or re.match(
+            TandFOnline.re_pdf, src
+        )
+        return m is not None
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 5b7ba2c..09082a5 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -112,6 +112,28 @@ def get_content_type_with_retry(url, tries=5, cookiejar=None):
continue
return res.headers.get("Content-Type", None)
+ # In rare cases, a HEAD request fails but a GET request does work. So here
+ # we try to get the content type from a GET request.
+ count = 0
+ jar = {} if cookiejar is None else cookiejar
+ while count < tries:
+ count += 1
+ error = False
+ try:
+ res = requests.get(
+ url, headers=HEADERS, cookies=jar, allow_redirects=True
+ )
+ except requests.exceptions.ConnectionError:
+ error = True
+ if error or not res.ok:
+ logger.warning(
+ "(%i/%i) Error getting headers for %s. Retrying in 5 seconds."
+ % (count, tries, url)
+ )
+ time.sleep(5)
+ continue
+ return res.headers.get("Content-Type", None)
+
def follow_redirects(url):
"""Follow redirects from the URL (at most 100)"""
diff --git a/tests/test_providers.py b/tests/test_providers.py
index def77d0..b8582fe 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -15,8 +15,8 @@ import unittest
from paper2remarkable.providers import (
ACM,
Arxiv,
- CiteSeerX,
CVF,
+ CiteSeerX,
HTML,
JMLR,
LocalFile,
@@ -28,8 +28,9 @@ from paper2remarkable.providers import (
PdfUrl,
PubMed,
SagePub,
- Springer,
SemanticScholar,
+ Springer,
+ TandFOnline,
)
VERBOSE = False
@@ -270,6 +271,18 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+    def test_tandfonline_1(self):
+        """A ``/doi/full/`` URL produces the expected PDF file name."""
+        provider = TandFOnline(upload=False, verbose=VERBOSE)
+        url = "https://www.tandfonline.com/doi/full/10.1080/01621459.2017.1385466"
+        exp = "Fearnhead_Rigaill_-_Changepoint_Detection_in_the_Presence_of_Outliers_2018.pdf"
+        # Only the base name is compared; the directory part is ignored.
+        produced_name = os.path.basename(provider.run(url))
+        self.assertEqual(exp, produced_name)
+
+    def test_tandfonline_2(self):
+        """A ``/doi/pdf/`` URL with a query string produces the expected name."""
+        prov = TandFOnline(upload=False, verbose=VERBOSE)
+        url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true"
+        exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf"
+        # The original set up the inputs but never ran the provider or
+        # asserted anything, so the test could not fail.
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
def test_html_1(self):
prov = HTML(upload=False, verbose=VERBOSE)
url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines"
diff --git a/tests/test_ui.py b/tests/test_ui.py
index a1eb372..e485bfe 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -102,6 +102,11 @@ class TestUI(unittest.TestCase):
"https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf",
),
(
+ PdfUrl,
+ "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf",
+ "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf",
+ ),
+ (
JMLR,
"https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
"https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",