Merge branch 'bugfix/html-images'

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-27 18:03:16 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-27 18:03:16 +0100
commit: f64bb1d918097bf46759807232ab985595968c7b (patch)
tree: fb6edfc1158cdcc6cd3731e4ea27b6388b6ee343
parent: code formatting (diff)
parent: Merge branch 'master' into bugfix/html-images (diff)
download: paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.tar.gz
paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.zip
3 files changed, 28 insertions, 10 deletions
diff --git a/.travis.yml b/.travis.yml
index e2edaaa..8399160 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: trusty
+dist: xenial
 
 language: python
 python:
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 6e08f1c..9f8394c 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,19 @@ def url_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+    def __init__(self, base_url, *args, **kwargs):
+        self._base_url = base_url
+        super().__init__(*args, **kwargs)
+
+    def run(self, root):
+        """ Ensure all img src urls are absolute """
+        for img in root.iter("img"):
+            img.attrib["src"] = urllib.parse.urljoin(
+                self._base_url, img.attrib["src"]
+            )
+
+
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
@@ -105,15 +118,10 @@ class HTML(Provider):
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
 
-        # fix relative urls
-        base_url = "{0.scheme}://{0.netloc}".format(
-            urllib.parse.urlsplit(pdf_url)
-        )
-        html_article = markdown.markdown(article)
-        html_article = html_article.replace(' src="//', ' src="https://')
-        html_article = html_article.replace(
-            ' src="/', ' src="{base}/'.format(base=base_url)
-        )
+        # Convert to html, fixing relative image urls.
+        md = markdown.Markdown()
+        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+        html_article = md.convert(article)
 
         if self.debug:
             with open("./paper.html", "w") as fp:
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e0239ed..0787792 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,6 +7,7 @@ __author__ = "G.J.J. van den Burg"
 
 import hashlib
 import os
+import pdfplumber
 import shutil
 import tempfile
 import unittest
@@ -238,6 +239,15 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_html_3(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://conclave-team.github.io/conclave-site/"
+        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+        # this is a proxy test to check that all images are included
+        self.assertEqual(32, len(pdfplumber.open(filename).pages))
+
 
 if __name__ == "__main__":
     unittest.main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-04-27 18:03:16 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-04-27 18:03:16 +0100
commit	f64bb1d918097bf46759807232ab985595968c7b (patch)
tree	fb6edfc1158cdcc6cd3731e4ea27b6388b6ee343
parent	code formatting (diff)
parent	Merge branch 'master' into bugfix/html-images (diff)
download	paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.tar.gz paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.zip