-rw-r--r--  paper2remarkable/providers/_base.py     | 16
-rw-r--r--  paper2remarkable/providers/citeseerx.py | 19
-rw-r--r--  paper2remarkable/utils.py               | 16
3 files changed, 43 insertions, 8 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index b455dd6..b2f584c 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -12,6 +12,7 @@ import abc
 import os
 import shutil
 import tempfile
+import time
 
 from ._info import Informer
 from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
@@ -41,6 +42,7 @@ class Provider(metaclass=abc.ABCMeta):
         pdfcrop_path="pdfcrop",
         pdftk_path="pdftk",
         gs_path="gs",
+        cookiejar=None,
     ):
         self.upload = upload
         self.debug = debug
@@ -50,6 +52,10 @@ class Provider(metaclass=abc.ABCMeta):
         self.pdftk_path = pdftk_path
         self.gs_path = gs_path
         self.informer = Informer()
+        self.cookiejar = cookiejar
+
+        # wait time to not hit the server too frequently
+        self.server_delay = 0
 
         # disable logging if requested
         if not verbose:
@@ -88,11 +94,17 @@ class Provider(metaclass=abc.ABCMeta):
     def retrieve_pdf(self, pdf_url, filename):
         """ Download pdf from src and save to filename """
         # This must exist so that the LocalFile provider can overwrite it
-        download_url(pdf_url, filename)
+        download_url(pdf_url, filename, cookiejar=self.cookiejar)
 
     def run(self, src, filename=None):
         # follow_redirects here is needed with library use
-        src = src if os.path.exists(src) else follow_redirects(src)
+        if os.path.exists(src):
+            src = src
+        elif self.cookiejar is None:
+            # NOTE: We assume that if the cookiejar is not None, we are
+            # properly redirected.
+            src, self.cookiejar = follow_redirects(src)
+            time.sleep(self.server_delay)
 
         # extract page and pdf file urls
         abs_url, pdf_url = self.get_abs_pdf_urls(src)
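Because run() now resolves redirects only when self.cookiejar is None, a library caller that has already resolved the final URL can hand the provider its own cookie jar and skip both the extra request and the server_delay sleep. A minimal sketch of that usage; the Arxiv import path, the default values of the other constructor arguments, and the URL are assumptions, not part of this commit:

    from paper2remarkable.providers.arxiv import Arxiv  # example subclass, path assumed
    from paper2remarkable.utils import follow_redirects

    # Resolve redirects once and keep the cookies set along the way.
    url, jar = follow_redirects("https://arxiv.org/abs/2101.00001")  # placeholder URL
    prov = Arxiv(upload=False, cookiejar=jar)
    # run() trusts the pre-resolved URL and reuses the jar in retrieve_pdf().
    prov.run(url, filename="paper.pdf")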
diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py
index 82adca7..e483f28 100644
--- a/paper2remarkable/providers/citeseerx.py
+++ b/paper2remarkable/providers/citeseerx.py
@@ -9,10 +9,14 @@ Copyright: 2019, G.J.J. van den Burg
 """
 
 import re
+import time
 
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
 
 
 class CiteSeerXInformer(Informer):
@@ -33,6 +37,21 @@ class CiteSeerX(Provider):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.informer = CiteSeerXInformer()
+        self.server_delay = 30
+
+        # NOTE: This is here because of this:
+        # https://github.com/SeerLabs/CiteSeerX/blob/8a62545ffc904f2b41b4ecd30ce91900dc7790f4/src/java/edu/psu/citeseerx/webutils/SimpleDownloadLimitFilter.java#L136
+        # The server does not allow hits to the same URL twice within a 30
+        # second window. We need to hit the URL more than once to ensure it
+        # redirects properly. Waiting is therefore needed.
+        logger.info(
+            "Waiting 30 seconds so we don't overload the CiteSeerX server."
+        )
+        time.sleep(30)
+
+        # NOTE: The delay should only be hit twice when p2r is used as a
+        # library (e.g. during testing). Otherwise the ``server_delay`` is
+        # never reached in run().
 
     def _get_doi(self, url):
         m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
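The fixed time.sleep(30) is the simplest way to stay inside the window enforced by CiteSeerX's SimpleDownloadLimitFilter. An alternative, not what this commit does, would be to record when each URL was last requested and sleep only for the remainder of the window; a self-contained sketch:

    import time

    WINDOW = 30  # seconds; CiteSeerX rejects a second hit on the same URL within this window
    _last_hit = {}  # url -> time of the most recent request

    def wait_for_window(url):
        """Sleep just long enough that url is not requested twice within WINDOW seconds."""
        elapsed = time.monotonic() - _last_hit.get(url, float("-inf"))
        if elapsed < WINDOW:
            time.sleep(WINDOW - elapsed)
        _last_hit[url] = time.monotonic()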
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index c1ef394..1bf261e 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -55,21 +55,22 @@ def assert_file_is_pdf(filename):
         raise FileTypeError(filename, "pdf")
 
 
-def download_url(url, filename):
+def download_url(url, filename, cookiejar=None):
     """Download the content of an url and save it to a filename """
     logger.info("Downloading file at url: %s" % url)
-    content = get_page_with_retry(url)
+    content = get_page_with_retry(url, cookiejar=cookiejar)
     with open(filename, "wb") as fid:
         fid.write(content)
 
 
-def get_page_with_retry(url, tries=5):
+def get_page_with_retry(url, tries=5, cookiejar=None):
     count = 0
+    jar = {} if cookiejar is None else cookiejar
     while count < tries:
         count += 1
         error = False
         try:
-            res = requests.get(url, headers=HEADERS)
+            res = requests.get(url, headers=HEADERS, cookies=jar)
         except requests.exceptions.ConnectionError:
             error = True
         if error or not res.ok:
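get_page_with_retry() falls back to an empty dict when no jar is supplied because requests accepts either a plain dict or a cookie jar for its cookies argument, so whatever follow_redirects() collected can be passed straight through. A small illustration with a placeholder URL:

    import requests

    url = "https://example.org"       # placeholder
    jar = requests.get(url).cookies   # a RequestsCookieJar, like the one follow_redirects() returns
    requests.get(url, cookies={})     # a plain dict works ...
    requests.get(url, cookies=jar)    # ... and so does a jar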
@@ -88,7 +89,9 @@ def follow_redirects(url):
     it = 0
     jar = {}
     while it < 100:
-        req = requests.head(url, allow_redirects=False, cookies=jar)
+        req = requests.head(
+            url, headers=HEADERS, allow_redirects=False, cookies=jar
+        )
         if req.status_code == 200:
             break
         if not "Location" in req.headers:
@@ -96,7 +99,8 @@ def follow_redirects(url):
         url = req.headers["Location"]
         jar = req.cookies
         it += 1
-    return url
+    jar = jar or req.cookies
+    return url, jar
 
 
 def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
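Since follow_redirects() now returns a (url, jar) pair instead of a bare URL, any external caller has to unpack the result; a minimal before/after sketch with a placeholder address:

    from paper2remarkable.utils import follow_redirects, get_page_with_retry

    src = "https://doi.org/10.0000/example"         # placeholder
    # before this commit: src = follow_redirects(src)
    src, jar = follow_redirects(src)                # final URL plus the cookies from the redirect chain
    page = get_page_with_retry(src, cookiejar=jar)  # reuse those cookies for the real request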