diff options
-rw-r--r--  paper2remarkable/providers/_base.py     | 16
-rw-r--r--  paper2remarkable/providers/citeseerx.py | 19
-rw-r--r--  paper2remarkable/utils.py               | 16
3 files changed, 43 insertions(+), 8 deletions(-)
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index b455dd6..b2f584c 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -12,6 +12,7 @@ import abc
 import os
 import shutil
 import tempfile
+import time
 
 from ._info import Informer
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
@@ -41,6 +42,7 @@ class Provider(metaclass=abc.ABCMeta):
         pdfcrop_path="pdfcrop",
         pdftk_path="pdftk",
         gs_path="gs",
+        cookiejar=None,
     ):
         self.upload = upload
         self.debug = debug
@@ -50,6 +52,10 @@ class Provider(metaclass=abc.ABCMeta):
         self.pdftk_path = pdftk_path
         self.gs_path = gs_path
         self.informer = Informer()
+        self.cookiejar = cookiejar
+
+        # wait time to not hit the server too frequently
+        self.server_delay = 0
 
         # disable logging if requested
         if not verbose:
@@ -88,11 +94,17 @@ class Provider(metaclass=abc.ABCMeta):
     def retrieve_pdf(self, pdf_url, filename):
         """ Download pdf from src and save to filename """
         # This must exist so that the LocalFile provider can overwrite it
-        download_url(pdf_url, filename)
+        download_url(pdf_url, filename, cookiejar=self.cookiejar)
 
     def run(self, src, filename=None):
         # follow_redirects here is needed with library use
-        src = src if os.path.exists(src) else follow_redirects(src)
+        if os.path.exists(src):
+            src = src
+        elif self.cookiejar is None:
+            # NOTE: We assume that if the cookiejar is not None, we are
+            # properly redirected.
+            src, self.cookiejar = follow_redirects(src)
+            time.sleep(self.server_delay)
 
         # extract page and pdf file urls
         abs_url, pdf_url = self.get_abs_pdf_urls(src)
diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py
index 82adca7..e483f28 100644
--- a/paper2remarkable/providers/citeseerx.py
+++ b/paper2remarkable/providers/citeseerx.py
@@ -9,10 +9,14 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 import re
+import time
 
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
 
 
 class CiteSeerXInformer(Informer):
@@ -33,6 +37,21 @@ class CiteSeerX(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.informer = CiteSeerXInformer()
+        self.server_delay = 30
+
+        # NOTE: This is here because of this:
+        # https://github.com/SeerLabs/CiteSeerX/blob/8a62545ffc904f2b41b4ecd30ce91900dc7790f4/src/java/edu/psu/citeseerx/webutils/SimpleDownloadLimitFilter.java#L136
+        # The server does not allow hits to the same URL twice within a 30
+        # second window. We need to hit the URL more than once to ensure it
+        # redirects properly. Waiting is therefore needed.
+        logger.info(
+            "Waiting 30 seconds so we don't overload the CiteSeerX server."
+        )
+        time.sleep(30)
+
+        # NOTE: The delay should only be hit twice when p2r is used as a
+        # library (e.g. during testing). Otherwise the ``server_delay`` is
+        # never reached in run().
 
     def _get_doi(self, url):
         m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index c1ef394..1bf261e 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -55,21 +55,22 @@ def assert_file_is_pdf(filename):
         raise FileTypeError(filename, "pdf")
 
 
-def download_url(url, filename):
+def download_url(url, filename, cookiejar=None):
     """Download the content of an url and save it to a filename """
     logger.info("Downloading file at url: %s" % url)
-    content = get_page_with_retry(url)
+    content = get_page_with_retry(url, cookiejar=cookiejar)
     with open(filename, "wb") as fid:
         fid.write(content)
 
 
-def get_page_with_retry(url, tries=5):
+def get_page_with_retry(url, tries=5, cookiejar=None):
     count = 0
+    jar = {} if cookiejar is None else cookiejar
     while count < tries:
         count += 1
         error = False
         try:
-            res = requests.get(url, headers=HEADERS)
+            res = requests.get(url, headers=HEADERS, cookies=jar)
         except requests.exceptions.ConnectionError:
             error = True
         if error or not res.ok:
@@ -88,7 +89,9 @@ def follow_redirects(url):
     it = 0
     jar = {}
     while it < 100:
-        req = requests.head(url, allow_redirects=False, cookies=jar)
+        req = requests.head(
+            url, headers=HEADERS, allow_redirects=False, cookies=jar
+        )
         if req.status_code == 200:
             break
         if not "Location" in req.headers:
@@ -96,7 +99,8 @@ def follow_redirects(url):
         url = req.headers["Location"]
         jar = req.cookies
         it += 1
-    return url
+    jar = jar or req.cookies
+    return url, jar
 
 
 def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
