From 120e7be74216a22306f6177c043d50b05ce89dfb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 Jan 2020 21:17:06 +0000 Subject: [WIP] Provider for Taylor and Francis Online --- paper2remarkable/providers/__init__.py | 2 + paper2remarkable/providers/_base.py | 2 + paper2remarkable/providers/tandfonline.py | 75 +++++++++++++++++++++++++++++++ tests/test_providers.py | 15 +++++++ 4 files changed, 94 insertions(+) create mode 100644 paper2remarkable/providers/tandfonline.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index fabdcfe..53fda1f 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -10,6 +10,7 @@ from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed from .springer import Springer +from .tandfonline import TandFOnline # NOTE: Order matters here, PdfUrl should be last providers = [ @@ -21,6 +22,7 @@ providers = [ PMLR, PubMed, Springer, + TandFOnline, LocalFile, PdfUrl, ] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index b2f584c..596af98 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -116,7 +116,9 @@ class Provider(metaclass=abc.ABCMeta): self.initial_dir = os.getcwd() with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: os.chdir(working_dir) + print("working_dir:", working_dir) self.retrieve_pdf(pdf_url, tmp_filename) + assert_file_is_pdf(tmp_filename) intermediate_fname = tmp_filename diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py new file mode 100644 index 0000000..9f9c59e --- /dev/null +++ b/paper2remarkable/providers/tandfonline.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- + +"""Provider for Taylor and Francis Online + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() + + +class TandFOnlineInformer(Informer): + meta_title_key = "dc.Title" + meta_author_key = "dc.Creator" + meta_date_key = "dc.Date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + def _format_year(self, soup_date): + return soup_date.strip().split(" ")[-1].strip() + + +class TandFOnline(Provider): + + re_abs = "^https?://www.tandfonline.com/doi/(full|abs)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)" + re_pdf = "^https?://www.tandfonline.com/doi/(full|pdf)/(?P<doi>\d+\.\d+/\d+\.\d+\.\d+)" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = TandFOnlineInformer() + + def _get_doi(self, url): + m = re.match(self.re_abs, url) or re.match(self.re_pdf, url) + if m: + return m["doi"] + raise URLResolutionError( + "TandFOnline", url, reason="Failed to retrieve DOI." 
+ ) + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + doi = self._get_doi(url) + pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format( + doi=doi + ) + elif re.match(self.re_pdf, url): + doi = self._get_doi(url) + pdf_url = "https://www.tandfonline.com/doi/pdf/{doi}?needAccess=true".format( + doi=doi + ) + # full redirects to abs if we don't have access + abs_url = "https://www.tandfonline.com/doi/full/{doi}".format( + doi=doi + ) + else: + raise URLResolutionError("TandFOnline", url) + print("abs:", abs_url, "pdf:", pdf_url) + return abs_url, pdf_url + + def validate(src): + m = re.match(TandFOnline.re_abs, src) or re.match( + TandFOnline.re_pdf, src + ) + return not m is None diff --git a/tests/test_providers.py b/tests/test_providers.py index e256eec..3204768 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -23,6 +23,7 @@ from paper2remarkable.providers import ( PdfUrl, PubMed, Springer, + TandFOnline, ) from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX @@ -206,6 +207,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_tandfonline_1(self): + prov = TandFOnline(upload=False, verbose=VERBOSE) + url = "https://www.tandfonline.com/doi/full/10.1080/01621459.2017.1385466" + exp = "Fearnhead_Rigaill_-_Changepoint_Detection_in_the_Presence_of_Outliers_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_tandfonline_2(self): + prov = TandFOnline(upload=False, verbose=VERBOSE) + url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true" + exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 
f3fdc28417a892b9d42dc411a85b40d237355157 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 27 Oct 2020 20:53:58 +0100 Subject: Bugfix for content type detection --- paper2remarkable/utils.py | 22 ++++++++++++++++++++++ tests/test_ui.py | 5 +++++ 2 files changed, 27 insertions(+) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 0b4be07..573e010 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -109,6 +109,28 @@ def get_content_type_with_retry(url, tries=5, cookiejar=None): continue return res.headers.get("Content-Type", None) + # In rare cases, a HEAD request fails but a GET request does work. So here + # we try to get the content type from a GET request. + count = 0 + jar = {} if cookiejar is None else cookiejar + while count < tries: + count += 1 + error = False + try: + res = requests.get( + url, headers=HEADERS, cookies=jar, allow_redirects=True + ) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting headers for %s. Retrying in 5 seconds." 
+ % (count, tries, url) + ) + time.sleep(5) + continue + return res.headers.get("Content-Type", None) + def follow_redirects(url): """Follow redirects from the URL (at most 100)""" diff --git a/tests/test_ui.py b/tests/test_ui.py index 835f594..1cca0cd 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -100,6 +100,11 @@ class TestUI(unittest.TestCase): "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", ), + ( + PdfUrl, + "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf", + "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf", + ), ( JMLR, "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", -- cgit v1.2.3 From b99b6cc3ab5a5f78833bf6e88e58b9dfab43d5ee Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 27 Oct 2020 21:27:28 +0100 Subject: Remove print statement --- paper2remarkable/providers/tandfonline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paper2remarkable/providers/tandfonline.py b/paper2remarkable/providers/tandfonline.py index 9f9c59e..d077211 100644 --- a/paper2remarkable/providers/tandfonline.py +++ b/paper2remarkable/providers/tandfonline.py @@ -65,7 +65,6 @@ class TandFOnline(Provider): ) else: raise URLResolutionError("TandFOnline", url) - print("abs:", abs_url, "pdf:", pdf_url) return abs_url, pdf_url def validate(src): -- cgit v1.2.3