diff options
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/tandfonline.py | 75 | ||||
| -rw-r--r-- | tests/test_providers.py | 15 |
4 files changed, 94 insertions, 0 deletions
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index fabdcfe..53fda1f 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -10,6 +10,7 @@ from .pdf_url import PdfUrl
 from .pmlr import PMLR
 from .pubmed import PubMed
 from .springer import Springer
+from .tandfonline import TandFOnline
 
 # NOTE: Order matters here, PdfUrl should be last
 providers = [
@@ -21,6 +22,7 @@ providers = [
     PMLR,
     PubMed,
     Springer,
+    TandFOnline,
     LocalFile,
     PdfUrl,
 ]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index b2f584c..596af98 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -116,7 +116,9 @@ class Provider(metaclass=abc.ABCMeta):
         self.initial_dir = os.getcwd()
         with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir:
             os.chdir(working_dir)
+            print("working_dir:", working_dir)
             self.retrieve_pdf(pdf_url, tmp_filename)
+
             assert_file_is_pdf(tmp_filename)
 
             intermediate_fname = tmp_filename
diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py
new file mode 100644
index 0000000..9f9c59e
--- /dev/null
+++ b/paper2remarkable/providers/tandfonline.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Taylor and Francis Online
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
+
+
+class TandFOnlineInformer(Informer):
+    meta_title_key = "dc.Title"
+    meta_author_key = "dc.Creator"
+    meta_date_key = "dc.Date"
+
+    def _format_authors(self, soup_authors):
+        return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+    def _format_year(self, soup_date):
+        return soup_date.strip().split(" ")[-1].strip()
+
+
+class TandFOnline(Provider):
+
+    re_abs = "^https?://www.tandfonline.com/doi/(full|abs)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)"
+    re_pdf = "^https?://www.tandfonline.com/doi/(full|pdf)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = TandFOnlineInformer()
+
+    def _get_doi(self, url):
+        m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
+        if m:
+            return m["doi"]
+        raise URLResolutionError(
+            "TandFOnline", url, reason="Failed to retrieve DOI."
+        )
+
+    def get_abs_pdf_urls(self, url):
+        if re.match(self.re_abs, url):
+            abs_url = url
+            doi = self._get_doi(url)
+            pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format(
+                doi=doi
+            )
+        elif re.match(self.re_pdf, url):
+            doi = self._get_doi(url)
+            pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format(
+                doi=doi
+            )
+            # full redirects to abs if we don't have access
+            abs_url = "https://www.tandfonline.com/doi/full/{doi}".format(
+                doi=doi
+            )
+        else:
+            raise URLResolutionError("TandFOnline", url)
+        print("abs:", abs_url, "pdf:", pdf_url)
+        return abs_url, pdf_url
+
+    def validate(src):
+        m = re.match(TandFOnline.re_abs, src) or re.match(
+            TandFOnline.re_pdf, src
+        )
+        return not m is None
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e256eec..3204768 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -23,6 +23,7 @@ from paper2remarkable.providers import (
     PdfUrl,
     PubMed,
     Springer,
+    TandFOnline,
 )
 
 from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
@@ -206,6 +207,20 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_tandfonline_1(self):
+        prov = TandFOnline(upload=False, verbose=VERBOSE)
+        url = "https://www.tandfonline.com/doi/full/10.1080/01621459.2017.1385466"
+        exp = "Fearnhead_Rigaill_-_Changepoint_Detection_in_the_Presence_of_Outliers_2018.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
+    def test_tandfonline_2(self):
+        prov = TandFOnline(upload=False, verbose=VERBOSE)
+        url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true"
+        exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
 
 if __name__ == "__main__":
     unittest.main()
|
