diff options
| -rw-r--r-- | paper2remarkable/utils.py | 22 |
| -rw-r--r-- | tests/test_ui.py | 5 |
2 files changed, 27 insertions, 0 deletions
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 5b7ba2c..09082a5 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -112,6 +112,28 @@ def get_content_type_with_retry(url, tries=5, cookiejar=None):
             continue
         return res.headers.get("Content-Type", None)
 
+    # In rare cases, a HEAD request fails but a GET request does work. So here
+    # we try to get the content type from a GET request.
+    count = 0
+    jar = {} if cookiejar is None else cookiejar
+    while count < tries:
+        count += 1
+        error = False
+        try:
+            res = requests.get(
+                url, headers=HEADERS, cookies=jar, allow_redirects=True
+            )
+        except requests.exceptions.ConnectionError:
+            error = True
+        if error or not res.ok:
+            logger.warning(
+                "(%i/%i) Error getting headers for %s. Retrying in 5 seconds."
+                % (count, tries, url)
+            )
+            time.sleep(5)
+            continue
+        return res.headers.get("Content-Type", None)
+
 
 def follow_redirects(url):
     """Follow redirects from the URL (at most 100)"""
diff --git a/tests/test_ui.py b/tests/test_ui.py
index 835f594..1cca0cd 100644
--- a/tests/test_ui.py
+++ b/tests/test_ui.py
@@ -101,6 +101,11 @@ class TestUI(unittest.TestCase):
                 "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf",
             ),
             (
+                PdfUrl,
+                "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf",
+                "https://publications.aston.ac.uk/id/eprint/38334/1/5th_Artificial_Neural_Networks.pdf",
+            ),
+            (
                 JMLR,
                 "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
                 "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
