author     Gertjan van den Burg <gertjanvandenburg@gmail.com>   2020-02-04 10:50:35 +0000
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>   2020-02-04 10:50:35 +0000
commit     4ee288f1963f94b50e57c4f1d7b2680fd689d64f
tree       6170405d6a3dda05faf2e17159edb0f1b0418bcf
parent     Minor readme typo
Automatically detect HTML source using content type

This removes the need to provide the --html flag!
Diffstat:
-rw-r--r--  paper2remarkable/providers/html.py     18
-rw-r--r--  paper2remarkable/providers/pdf_url.py  13
-rw-r--r--  paper2remarkable/ui.py                 13
-rw-r--r--  paper2remarkable/utils.py              22
4 files changed, 46 insertions(+), 20 deletions(-)
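
In practice this means an HTML article can now be sent with a plain invocation, e.g. (assuming the p2r console script from the README; the URL is a placeholder):

    p2r https://example.com/some-article

where previously the --html flag had to be given explicitly. Both URL providers now recognize their input from the Content-Type header of the response.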
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 20185fd..d0d55f4 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -23,7 +23,11 @@ import weasyprint.fonts
 from ._base import Provider
 from ._info import Informer
-from ..utils import clean_string, get_page_with_retry
+from ..utils import (
+    clean_string,
+    get_page_with_retry,
+    get_content_type_with_retry,
+)
 from ..log import Logger

 logger = Logger()
@@ -122,8 +126,12 @@ class HTML(Provider):
         html.write_pdf(filename, stylesheets=[css], font_config=font_config)

     def validate(src):
-        try:
-            result = urllib.parse.urlparse(src)
-            return all([result.scheme, result.netloc, result.path])
-        except:
+        # first check if it is a valid url
+        parsed = urllib.parse.urlparse(src)
+        if not all([parsed.scheme, parsed.netloc, parsed.path]):
             return False
+        # next, get the header and check the content type
+        ct = get_content_type_with_retry(src)
+        if ct is None:
+            return False
+        return ct.startswith("text/html")
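
For context, validation now boils down to a URL sanity check followed by a HEAD request. A minimal standalone sketch of the same idea, leaving out the project's retry loop, HEADERS, and cookie handling (looks_like_html is an illustrative name, not the project's):

    import urllib.parse

    import requests

    def looks_like_html(src):
        # reject anything that is not a well-formed absolute URL
        parsed = urllib.parse.urlparse(src)
        if not all([parsed.scheme, parsed.netloc, parsed.path]):
            return False
        # a HEAD request fetches only the headers, not the body
        res = requests.head(src, allow_redirects=True)
        return res.headers.get("Content-Type", "").startswith("text/html")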
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
index 5314ec7..77accc9 100644
--- a/paper2remarkable/providers/pdf_url.py
+++ b/paper2remarkable/providers/pdf_url.py
@@ -13,6 +13,7 @@ import urllib
 from ._base import Provider
 from ._info import Informer
 from ..exceptions import FilenameMissingError
+from ..utils import get_content_type_with_retry


 class PdfUrlInformer(Informer):
@@ -30,8 +31,12 @@ class PdfUrl(Provider):
         return (None, url)

     def validate(src):
-        try:
-            result = urllib.parse.urlparse(src)
-            return all([result.scheme, result.netloc, result.path])
-        except:
+        # first check if it is a valid url
+        parsed = urllib.parse.urlparse(src)
+        if not all([parsed.scheme, parsed.netloc, parsed.path]):
             return False
+        # next, get the header and check the content type
+        ct = get_content_type_with_retry(src)
+        if ct is None:
+            return False
+        return ct.startswith("application/pdf")
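
The startswith comparison, rather than an exact match, matters because servers often append parameters to the media type. A quick illustration with hypothetical header values:

    # values like these are common in the wild
    assert "text/html; charset=utf-8".startswith("text/html")
    assert "application/pdf".startswith("application/pdf")
    # an exact comparison would wrongly reject the parameterized form
    assert "text/html; charset=utf-8" != "text/html"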
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 50ccad9..9b5dd42 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -13,7 +13,7 @@ import sys
 from . import __version__, GITHUB_URL

-from .providers import providers, LocalFile, HTML
+from .providers import providers, LocalFile
 from .utils import follow_redirects, is_url
@@ -22,11 +22,6 @@ def parse_args():
         description="Paper2reMarkable version %s" % __version__
     )
     parser.add_argument(
-        '-t', "--html",
-        help="URL is to a HTML article instead of a PDF",
-        action="store_true",
-    )
-    parser.add_argument(
         "-b",
         "--blank",
         help="Add a blank page after every page of the PDF",
@@ -111,11 +106,7 @@ def main():
     args = parse_args()

     cookiejar = None
-    if args.html and is_url(args.input):
-        # input is a url
-        url, cookiejar = follow_redirects(args.input)
-        provider = HTML
-    elif LocalFile.validate(args.input):
+    if LocalFile.validate(args.input):
         # input is a local file
         provider = LocalFile
     elif is_url(args.input):
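
With the --html branch gone, main() falls through to the generic URL path, where the matching provider is presumably found by probing each provider's validate(). A sketch of that dispatch, assuming providers is the list of provider classes imported above (choose_provider is an illustrative helper name, not the project's):

    def choose_provider(url, providers):
        # the first provider whose validate() accepts the url wins;
        # HTML and PdfUrl now tell themselves apart via Content-Type
        for provider in providers:
            if provider.validate(url):
                return provider
        return None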
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index d4e5075..52c2a38 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -87,6 +87,28 @@ def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False):
         return res.content


+def get_content_type_with_retry(url, tries=5, cookiejar=None):
+    """Get the Content-Type header for a url, retrying on failure"""
+    count = 0
+    jar = {} if cookiejar is None else cookiejar
+    while count < tries:
+        count += 1
+        error = False
+        try:
+            res = requests.head(url, headers=HEADERS, cookies=jar,
+                                allow_redirects=True)
+        except requests.exceptions.ConnectionError:
+            error = True
+        if error or not res.ok:
+            logger.warning(
+                "(%i/%i) Error getting headers for %s. Retrying in 5 seconds."
+                % (count, tries, url)
+            )
+            time.sleep(5)
+            continue
+        return res.headers.get("Content-Type", None)
+
+
 def follow_redirects(url):
     """Follow redirects from the URL (at most 100)"""
     it = 0
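
A usage sketch for the new helper (the URL is a placeholder). Note that the function returns None, by falling off the end of the while loop, when the headers cannot be retrieved within the given number of tries; both validate() implementations treat that as a failed match:

    from paper2remarkable.utils import get_content_type_with_retry

    ct = get_content_type_with_retry("https://example.com/paper.pdf")
    if ct is None:
        print("could not retrieve headers")
    elif ct.startswith("application/pdf"):
        print("content type says PDF")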