about summary refs log tree commit diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-27 18:03:16 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-04-27 18:03:16 +0100
commit f64bb1d918097bf46759807232ab985595968c7b (patch)
tree fb6edfc1158cdcc6cd3731e4ea27b6388b6ee343
parent code formatting (diff)
parent Merge branch 'master' into bugfix/html-images (diff)
download paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.tar.gz
paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.zip
Merge branch 'bugfix/html-images'
-rw-r--r-- .travis.yml 2
-rw-r--r-- paper2remarkable/providers/html.py 26
-rw-r--r-- tests/test_providers.py 10
3 files changed, 28 insertions, 10 deletions
diff --git a/.travis.yml b/.travis.yml
index e2edaaa..8399160 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: trusty
+dist: xenial
language: python
python:
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 6e08f1c..9f8394c 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,19 @@ def url_fetcher(url):
return weasyprint.default_url_fetcher(url)
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+ def __init__(self, base_url, *args, **kwargs):
+ self._base_url = base_url
+ super().__init__(*args, **kwargs)
+
+ def run(self, root):
+ """ Ensure all img src urls are absolute """
+ for img in root.iter("img"):
+ img.attrib["src"] = urllib.parse.urljoin(
+ self._base_url, img.attrib["src"]
+ )
+
+
class HTMLInformer(Informer):
def __init__(self):
super().__init__()
@@ -105,15 +118,10 @@ class HTML(Provider):
# Add the title back to the document
article = "# {title}\n\n{text}".format(title=title, text=text)
- # fix relative urls
- base_url = "{0.scheme}://{0.netloc}".format(
- urllib.parse.urlsplit(pdf_url)
- )
- html_article = markdown.markdown(article)
- html_article = html_article.replace(' src="//', ' src="https://')
- html_article = html_article.replace(
- ' src="/', ' src="{base}/'.format(base=base_url)
- )
+ # Convert to html, fixing relative image urls.
+ md = markdown.Markdown()
+ md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+ html_article = md.convert(article)
if self.debug:
with open("./paper.html", "w") as fp:
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e0239ed..0787792 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,6 +7,7 @@ __author__ = "G.J.J. van den Burg"
import hashlib
import os
+import pdfplumber
import shutil
import tempfile
import unittest
@@ -238,6 +239,15 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_html_3(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://conclave-team.github.io/conclave-site/"
+ exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+ # this is a proxy test to check that all images are included
+ self.assertEqual(32, len(pdfplumber.open(filename).pages))
+
if __name__ == "__main__":
unittest.main()