diff options
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 4 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/tandfonline.py | 74 | ||||
| -rw-r--r-- | paper2remarkable/utils.py | 22 | ||||
| -rw-r--r-- | tests/test_providers.py | 17 | ||||
| -rw-r--r-- | tests/test_ui.py | 5 |
6 files changed, 121 insertions, 3 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 3eeda5c..371ab82 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -15,8 +15,9 @@ from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed from .sagepub import SagePub -from .springer import Springer from .semantic_scholar import SemanticScholar +from .springer import Springer +from .tandfonline import TandFOnline # NOTE: Order matters here, PdfUrl and HTML should be last providers = [ @@ -34,6 +35,7 @@ providers = [ SagePub, Springer, SemanticScholar, + TandFOnline, LocalFile, PdfUrl, HTML, diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 8f82f1d..74ab9e6 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -206,7 +206,9 @@ class Provider(metaclass=abc.ABCMeta): self.initial_dir = os.getcwd() with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: os.chdir(working_dir) + print("working_dir:", working_dir) self.retrieve_pdf(pdf_url, tmp_filename) + assert_file_is_pdf(tmp_filename) intermediate_fname = tmp_filename diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py new file mode 100644 index 0000000..d077211 --- /dev/null +++ b/paper2remarkable/providers/tandfonline.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +"""Provider for Taylor and Francis Online + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() + + +class TandFOnlineInformer(Informer): + meta_title_key = "dc.Title" + meta_author_key = "dc.Creator" + meta_date_key = "dc.Date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + def _format_year(self, soup_date): + return soup_date.strip().split(" ")[-1].strip() + + +class TandFOnline(Provider): + + re_abs = "^https?://www.tandfonline.com/doi/(full|abs)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)" + re_pdf = "^https?://www.tandfonline.com/doi/(full|pdf)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = TandFOnlineInformer() + + def _get_doi(self, url): + m = re.match(self.re_abs, url) or re.match(self.re_pdf, url) + if m: + return m["doi"] + raise URLResolutionError( + "TandFOnline", url, reason="Failed to retrieve DOI." + ) + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + doi = self._get_doi(url) + pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format( + doi=doi + ) + elif re.match(self.re_pdf, url): + doi = self._get_doi(url) + pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format( + doi=doi + ) + # full redirects to abs if we don't have access + abs_url = "https://www.tandfonline.com/doi/full/{doi}".format( + doi=doi + ) + else: + raise URLResolutionError("TandFOnline", url) + return abs_url, pdf_url + + def validate(src): + m = re.match(TandFOnline.re_abs, src) or re.match( + TandFOnline.re_pdf, src + ) + return not m is None diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 5b7ba2c..09082a5 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -112,6 +112,28 @@ def get_content_type_with_retry(url, tries=5, cookiejar=None): continue return res.headers.get("Content-Type", None) + # In rare cases, a HEAD request fails but a GET request does work. So here + # we try to get the content type from a GET request. + count = 0 + jar = {} if cookiejar is None else cookiejar + while count < tries: + count += 1 + error = False + try: + res = requests.get( + url, headers=HEADERS, cookies=jar, allow_redirects=True + ) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting headers for %s. Retrying in 5 seconds." + % (count, tries, url) + ) + time.sleep(5) + continue + return res.headers.get("Content-Type", None) + def follow_redirects(url): """Follow redirects from the URL (at most 100)""" diff --git a/tests/test_providers.py b/tests/test_providers.py index def77d0..b8582fe 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -15,8 +15,8 @@ import unittest from paper2remarkable.providers import ( ACM, Arxiv, - CiteSeerX, CVF, + CiteSeerX, HTML, JMLR, LocalFile, @@ -28,8 +28,9 @@ from paper2remarkable.providers import ( PdfUrl, PubMed, SagePub, - Springer, SemanticScholar, + Springer, + TandFOnline, ) VERBOSE = False @@ -270,6 +271,18 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_tandfonline_1(self): + prov = TandFOnline(upload=False, verbose=VERBOSE) + url = "https://www.tandfonline.com/doi/full/10.1080/01621459.2017.1385466" + exp = "Fearnhead_Rigaill_-_Changepoint_Detection_in_the_Presence_of_Outliers_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_tandfonline_2(self): + prov = TandFOnline(upload=False, verbose=VERBOSE) + url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true" + exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf" + def test_html_1(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" diff --git a/tests/test_ui.py b/tests/test_ui.py index a1eb372..e485bfe 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -102,6 +102,11 @@ class TestUI(unittest.TestCase): "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", ), ( + PdfUrl, + "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf", + "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf", + ), + ( JMLR, "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", |
