aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-27 14:14:25 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-27 14:14:25 +0100
commit: 71fb1f0e8433705e891aa4b6e176e26b62e5a9bb (patch)
tree: db8622e68642bbc186561a429693bfc88e36122c
parent: Update release script (diff)
download: paper2remarkable-71fb1f0e8433705e891aa4b6e176e26b62e5a9bb.tar.gz
download: paper2remarkable-71fb1f0e8433705e891aa4b6e176e26b62e5a9bb.zip
Properly resolve image urls (fixes #45)
-rw-r--r-- paper2remarkable/providers/html.py 33
1 files changed, 24 insertions, 9 deletions
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d0d55f4..bbafe10 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,26 @@ def my_fetcher(url):
return weasyprint.default_url_fetcher(url)
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+    """Tree processor that resolves image sources against a base URL.
+
+    The ``src`` attribute of every ``img`` element in the tree is joined
+    with the base URL using ``urllib.parse.urljoin``. When the base URL
+    is absolute, relative sources therefore become absolute URLs.
+    """
+
+    def __init__(self, base_url, *args, **kwargs):
+        """Store the base URL and forward the rest to the parent class.
+
+        Parameters
+        ----------
+        base_url
+            URL that image sources are resolved against.
+        *args, **kwargs
+            Passed unchanged to the parent ``Treeprocessor`` constructor.
+        """
+        self._base_url = base_url
+        super().__init__(*args, **kwargs)
+
+    def _find_img(self, node):
+        """Yield all img elements below ``node`` (not ``node`` itself)."""
+        for child in node:
+            # Element.iter walks the child and all of its descendants.
+            yield from child.iter("img")
+
+    def run(self, root):
+        """Rewrite the src of every img element under ``root`` in place.
+
+        Images with a missing or empty src are left untouched: a missing
+        src has nothing to resolve, and joining an empty src would turn
+        it into the base URL itself.
+        """
+        for img in self._find_img(root):
+            src = img.get("src")
+            if not src:
+                continue
+            img.set("src", urllib.parse.urljoin(self._base_url, src))
+
+
class HTMLInformer(Informer):
def __init__(self):
super().__init__()
@@ -105,15 +125,10 @@ class HTML(Provider):
# Add the title back to the document
article = "# {title}\n\n{text}".format(title=title, text=text)
- # fix relative urls
- base_url = "{0.scheme}://{0.netloc}".format(
- urllib.parse.urlsplit(pdf_url)
- )
- html_article = markdown.markdown(article)
- html_article = html_article.replace(' src="//', ' src="https://')
- html_article = html_article.replace(
- ' src="/', ' src="{base}/'.format(base=base_url)
- )
+ # Convert to html, fixing relative image urls.
+ md = markdown.Markdown()
+ md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+ html_article = md.convert(article)
if self.debug:
with open("./paper.html", "w") as fp: