diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-27 14:14:25 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-27 14:14:25 +0100 |
| commit | 71fb1f0e8433705e891aa4b6e176e26b62e5a9bb (patch) | |
| tree | db8622e68642bbc186561a429693bfc88e36122c | |
| parent | Update release script (diff) | |
| download | paper2remarkable-71fb1f0e8433705e891aa4b6e176e26b62e5a9bb.tar.gz paper2remarkable-71fb1f0e8433705e891aa4b6e176e26b62e5a9bb.zip | |
Properly resolve image urls (fixes #45)
| -rw-r--r-- | paper2remarkable/providers/html.py | 33 |
1 file changed, 24 insertions, 9 deletions
```diff
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d0d55f4..bbafe10 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,26 @@ def my_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+    def __init__(self, base_url, *args, **kwargs):
+        self._base_url = base_url
+        super().__init__(*args, **kwargs)
+
+    def _find_img(self, node):
+        """ Find img nodes recursively """
+        for img in node.findall("img"):
+            yield img
+        for child in node:
+            yield from self._find_img(child)
+
+    def run(self, root):
+        """ Ensure all img src urls are absolute """
+        for img in self._find_img(root):
+            img.attrib["src"] = urllib.parse.urljoin(
+                self._base_url, img.attrib["src"]
+            )
+
+
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
@@ -105,15 +125,10 @@ class HTML(Provider):
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
 
-        # fix relative urls
-        base_url = "{0.scheme}://{0.netloc}".format(
-            urllib.parse.urlsplit(pdf_url)
-        )
-        html_article = markdown.markdown(article)
-        html_article = html_article.replace(' src="//', ' src="https://')
-        html_article = html_article.replace(
-            ' src="/', ' src="{base}/'.format(base=base_url)
-        )
+        # Convert to html, fixing relative image urls.
+        md = markdown.Markdown()
+        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+        html_article = md.convert(article)
 
         if self.debug:
             with open("./paper.html", "w") as fp:
```
