diff options
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 4 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/tandfonline.py | 74 | ||||
| -rw-r--r-- | tests/test_providers.py | 17 |
4 files changed, 94 insertions, 3 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 78fa370..935b889 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -14,8 +14,9 @@ from .pdf_url import PdfUrl
 from .pmlr import PMLR
 from .pubmed import PubMed
 from .sagepub import SagePub
-from .springer import Springer
 from .semantic_scholar import SemanticScholar
+from .springer import Springer
+from .tandfonline import TandFOnline
 
 # NOTE: Order matters here, PdfUrl and HTML should be last
 providers = [
@@ -32,6 +33,7 @@ providers = [
     SagePub,
     Springer,
     SemanticScholar,
+    TandFOnline,
     LocalFile,
     PdfUrl,
     HTML,
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 8f82f1d..74ab9e6 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -206,7 +206,9 @@ class Provider(metaclass=abc.ABCMeta):
         self.initial_dir = os.getcwd()
         with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir:
             os.chdir(working_dir)
+            print("working_dir:", working_dir)
             self.retrieve_pdf(pdf_url, tmp_filename)
+
             assert_file_is_pdf(tmp_filename)
 
             intermediate_fname = tmp_filename
diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py
new file mode 100644
index 0000000..d077211
--- /dev/null
+++ b/paper2remarkable/providers/tandfonline.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Taylor and Francis Online
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
+
+
+class TandFOnlineInformer(Informer):
+    meta_title_key = "dc.Title"
+    meta_author_key = "dc.Creator"
+    meta_date_key = "dc.Date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+    def _format_year(self, soup_date):
+        return soup_date.strip().split(" ")[-1].strip()
+
+
+class TandFOnline(Provider):
+
+    re_abs = "^https?://www.tandfonline.com/doi/(full|abs)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)"
+    re_pdf = "^https?://www.tandfonline.com/doi/(full|pdf)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = TandFOnlineInformer()
+
+    def _get_doi(self, url):
+        m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
+        if m:
+            return m["doi"]
+        raise URLResolutionError(
+            "TandFOnline", url, reason="Failed to retrieve DOI."
+        )
+
+    def get_abs_pdf_urls(self, url):
+        if re.match(self.re_abs, url):
+            abs_url = url
+            doi = self._get_doi(url)
+            pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format(
+                doi=doi
+            )
+        elif re.match(self.re_pdf, url):
+            doi = self._get_doi(url)
+            pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format(
+                doi=doi
+            )
+            # full redirects to abs if we don't have access
+            abs_url = "https://www.tandfonline.com/doi/full/{doi}".format(
+                doi=doi
+            )
+        else:
+            raise URLResolutionError("TandFOnline", url)
+        return abs_url, pdf_url
+
+    def validate(src):
+        m = re.match(TandFOnline.re_abs, src) or re.match(
+            TandFOnline.re_pdf, src
+        )
+        return not m is None
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e701234..4ee6773 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -15,8 +15,8 @@ import unittest
 from paper2remarkable.providers import (
     ACM,
     Arxiv,
-    CiteSeerX,
     CVF,
+    CiteSeerX,
     HTML,
     JMLR,
     LocalFile,
@@ -27,8 +27,9 @@ from paper2remarkable.providers import (
     PdfUrl,
     PubMed,
     SagePub,
-    Springer,
     SemanticScholar,
+    Springer,
+    TandFOnline,
 )
 
 VERBOSE = False
@@ -269,6 +270,18 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_tandfonline_1(self):
+        prov = TandFOnline(upload=False, verbose=VERBOSE)
+        url = "https://www.tandfonline.com/doi/full/10.1080/01621459.2017.1385466"
+        exp = "Fearnhead_Rigaill_-_Changepoint_Detection_in_the_Presence_of_Outliers_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_tandfonline_2(self):
+        prov = TandFOnline(upload=False, verbose=VERBOSE)
+        url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true"
+        exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf"
+
     def test_html_1(self):
         prov = HTML(upload=False, verbose=VERBOSE)
         url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" |
