diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-02-04 11:03:21 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-02-04 11:03:21 +0000 |
| commit | 44ea81ca8eb74a043851156c270776a6ccdd4e6f (patch) | |
| tree | d1915146e1d57d94328ceb5e437293600d27dbf0 | |
| parent | Minor readme typo (diff) | |
| parent | Add another test for the html provider (diff) | |
| download | paper2remarkable-44ea81ca8eb74a043851156c270776a6ccdd4e6f.tar.gz paper2remarkable-44ea81ca8eb74a043851156c270776a6ccdd4e6f.zip | |
Merge branch 'feature/html-document'
| -rw-r--r-- | paper2remarkable/providers/html.py | 18 | ||||
| -rw-r--r-- | paper2remarkable/providers/pdf_url.py | 13 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 13 | ||||
| -rw-r--r-- | paper2remarkable/utils.py | 22 | ||||
| -rw-r--r-- | tests/test_providers.py | 8 |
5 files changed, 54 insertions, 20 deletions
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 20185fd..d0d55f4 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -23,7 +23,11 @@ import weasyprint.fonts from ._base import Provider from ._info import Informer -from ..utils import clean_string, get_page_with_retry +from ..utils import ( + clean_string, + get_page_with_retry, + get_content_type_with_retry, +) from ..log import Logger logger = Logger() @@ -122,8 +126,12 @@ class HTML(Provider): html.write_pdf(filename, stylesheets=[css], font_config=font_config) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("text/html") diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 5314ec7..77accc9 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -13,6 +13,7 @@ import urllib from ._base import Provider from ._info import Informer from ..exceptions import FilenameMissingError +from ..utils import get_content_type_with_retry class PdfUrlInformer(Informer): @@ -30,8 +31,12 @@ class PdfUrl(Provider): return (None, url) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("application/pdf") diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 50ccad9..9b5dd42 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,7 +13,7 @@ import sys from . import __version__, GITHUB_URL -from .providers import providers, LocalFile, HTML +from .providers import providers, LocalFile from .utils import follow_redirects, is_url @@ -22,11 +22,6 @@ def parse_args(): description="Paper2reMarkable version %s" % __version__ ) parser.add_argument( - '-t', "--html", - help="URL is to a HTML article instead of a PDF", - action="store_true", - ) - parser.add_argument( "-b", "--blank", help="Add a blank page after every page of the PDF", @@ -111,11 +106,7 @@ def main(): args = parse_args() cookiejar = None - if args.html and is_url(args.input): - # input is a url - url, cookiejar = follow_redirects(args.input) - provider = HTML - elif LocalFile.validate(args.input): + if LocalFile.validate(args.input): # input is a local file provider = LocalFile elif is_url(args.input): diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index d4e5075..52c2a38 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -87,6 +87,28 @@ def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): return res.content +def get_content_type_with_retry(url, tries=5, cookiejar=None): + count = 0 + jar = {} if cookiejar is None else cookiejar + while count < tries: + count += 1 + error = False + try: + res = requests.head(url, headers=HEADERS, cookies=jar, + allow_redirects=True) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting headers for %s. Retrying in 5 seconds." + % (count, tries, url) + ) + time.sleep(5) + continue + print("res.headers = %r" % res.headers) + return res.headers.get("Content-Type", None) + + def follow_redirects(url): """Follow redirects from the URL (at most 100)""" it = 0 diff --git a/tests/test_providers.py b/tests/test_providers.py index 80f4662..d0e3d40 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -214,6 +214,14 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_2(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.nature.com/articles/d41586-020-00176-4" + exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + if __name__ == "__main__": unittest.main() |
