From 71fb1f0e8433705e891aa4b6e176e26b62e5a9bb Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Mon, 27 Apr 2020 14:14:25 +0100
Subject: Properly resolve image urls (fixes #45)

---
 paper2remarkable/providers/html.py | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d0d55f4..bbafe10 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,26 @@ def my_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+    def __init__(self, base_url, *args, **kwargs):
+        self._base_url = base_url
+        super().__init__(*args, **kwargs)
+
+    def _find_img(self, node):
+        """ Find img nodes recursively """
+        for img in node.findall("img"):
+            yield img
+        for child in node:
+            yield from self._find_img(child)
+
+    def run(self, root):
+        """ Ensure all img src urls are absolute """
+        for img in self._find_img(root):
+            img.attrib["src"] = urllib.parse.urljoin(
+                self._base_url, img.attrib["src"]
+            )
+
+
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
@@ -105,15 +125,10 @@ class HTML(Provider):
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
 
-        # fix relative urls
-        base_url = "{0.scheme}://{0.netloc}".format(
-            urllib.parse.urlsplit(pdf_url)
-        )
-        html_article = markdown.markdown(article)
-        html_article = html_article.replace(' src="//', ' src="https://')
-        html_article = html_article.replace(
-            ' src="/', ' src="{base}/'.format(base=base_url)
-        )
+        # Convert to html, fixing relative image urls.
+        md = markdown.Markdown()
+        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+        html_article = md.convert(article)
 
         if self.debug:
             with open("./paper.html", "w") as fp:
-- 
cgit v1.2.3

From 3224b3857cc2f11226043ced1da586756403cbb1 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Mon, 27 Apr 2020 17:26:42 +0100
Subject: Use builtin iter() function to find img elements

---
 paper2remarkable/providers/html.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index bbafe10..ba250e7 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -61,16 +61,9 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
         self._base_url = base_url
         super().__init__(*args, **kwargs)
 
-    def _find_img(self, node):
-        """ Find img nodes recursively """
-        for img in node.findall("img"):
-            yield img
-        for child in node:
-            yield from self._find_img(child)
-
     def run(self, root):
         """ Ensure all img src urls are absolute """
-        for img in self._find_img(root):
+        for img in root.iter("img"):
             img.attrib["src"] = urllib.parse.urljoin(
                 self._base_url, img.attrib["src"]
             )
-- 
cgit v1.2.3

From fb825cab2e4681a6e6cae8cf32adeeb880a4910c Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Mon, 27 Apr 2020 17:33:13 +0100
Subject: Add unit test for this bug

---
 tests/test_providers.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_providers.py b/tests/test_providers.py
index e0239ed..0787792 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,6 +7,7 @@ __author__ = "G.J.J. van den Burg"
 
 import hashlib
 import os
+import pdfplumber
 import shutil
 import tempfile
 import unittest
@@ -238,6 +239,15 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_html_3(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://conclave-team.github.io/conclave-site/"
+        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+        # this is a proxy test to check that all images are included
+        self.assertEqual(32, len(pdfplumber.open(filename).pages))
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
cgit v1.2.3

From 26ea8d0691b9574561a1afb519956c2b0c6513da Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Mon, 27 Apr 2020 17:48:53 +0100
Subject: upgrade travis distribution

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index e2edaaa..8399160 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: trusty
+dist: xenial
 language: python
 
 python:
-- 
cgit v1.2.3