diff options
| -rw-r--r-- | paper2remarkable/providers/html.py | 18 | ||||
| -rw-r--r-- | paper2remarkable/providers/pdf_url.py | 13 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 13 | ||||
| -rw-r--r-- | paper2remarkable/utils.py | 22 |
4 files changed, 46 insertions, 20 deletions
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 20185fd..d0d55f4 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -23,7 +23,11 @@ import weasyprint.fonts from ._base import Provider from ._info import Informer -from ..utils import clean_string, get_page_with_retry +from ..utils import ( + clean_string, + get_page_with_retry, + get_content_type_with_retry, +) from ..log import Logger logger = Logger() @@ -122,8 +126,12 @@ class HTML(Provider): html.write_pdf(filename, stylesheets=[css], font_config=font_config) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("text/html") diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 5314ec7..77accc9 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -13,6 +13,7 @@ import urllib from ._base import Provider from ._info import Informer from ..exceptions import FilenameMissingError +from ..utils import get_content_type_with_retry class PdfUrlInformer(Informer): @@ -30,8 +31,12 @@ class PdfUrl(Provider): return (None, url) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("application/pdf") diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 50ccad9..9b5dd42 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,7 +13,7 @@ import sys from . import __version__, GITHUB_URL -from .providers import providers, LocalFile, HTML +from .providers import providers, LocalFile from .utils import follow_redirects, is_url @@ -22,11 +22,6 @@ def parse_args(): description="Paper2reMarkable version %s" % __version__ ) parser.add_argument( - '-t', "--html", - help="URL is to a HTML article instead of a PDF", - action="store_true", - ) - parser.add_argument( "-b", "--blank", help="Add a blank page after every page of the PDF", @@ -111,11 +106,7 @@ def main(): args = parse_args() cookiejar = None - if args.html and is_url(args.input): - # input is a url - url, cookiejar = follow_redirects(args.input) - provider = HTML - elif LocalFile.validate(args.input): + if LocalFile.validate(args.input): # input is a local file provider = LocalFile elif is_url(args.input): diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index d4e5075..52c2a38 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -87,6 +87,28 @@ def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): return res.content +def get_content_type_with_retry(url, tries=5, cookiejar=None): + count = 0 + jar = {} if cookiejar is None else cookiejar + while count < tries: + count += 1 + error = False + try: + res = requests.head(url, headers=HEADERS, cookies=jar, + allow_redirects=True) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting headers for %s. Retrying in 5 seconds." + % (count, tries, url) + ) + time.sleep(5) + continue + print("res.headers = %r" % res.headers) + return res.headers.get("Content-Type", None) + + def follow_redirects(url): """Follow redirects from the URL (at most 100)""" it = 0 |
