diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-27 18:03:16 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-04-27 18:03:16 +0100 |
| commit | f64bb1d918097bf46759807232ab985595968c7b (patch) | |
| tree | fb6edfc1158cdcc6cd3731e4ea27b6388b6ee343 | |
| parent | code formatting (diff) | |
| parent | Merge branch 'master' into bugfix/html-images (diff) | |
| download | paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.tar.gz paper2remarkable-f64bb1d918097bf46759807232ab985595968c7b.zip | |
Merge branch 'bugfix/html-images'
| -rw-r--r-- | .travis.yml | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/html.py | 26 | ||||
| -rw-r--r-- | tests/test_providers.py | 10 |
3 files changed, 28 insertions, 10 deletions
```diff
diff --git a/.travis.yml b/.travis.yml
index e2edaaa..8399160 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: trusty
+dist: xenial
 language: python
 
 python:
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 6e08f1c..9f8394c 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,19 @@ def url_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+    def __init__(self, base_url, *args, **kwargs):
+        self._base_url = base_url
+        super().__init__(*args, **kwargs)
+
+    def run(self, root):
+        """ Ensure all img src urls are absolute """
+        for img in root.iter("img"):
+            img.attrib["src"] = urllib.parse.urljoin(
+                self._base_url, img.attrib["src"]
+            )
+
+
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
@@ -105,15 +118,10 @@ class HTML(Provider):
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
 
-        # fix relative urls
-        base_url = "{0.scheme}://{0.netloc}".format(
-            urllib.parse.urlsplit(pdf_url)
-        )
-        html_article = markdown.markdown(article)
-        html_article = html_article.replace(' src="//', ' src="https://')
-        html_article = html_article.replace(
-            ' src="/', ' src="{base}/'.format(base=base_url)
-        )
+        # Convert to html, fixing relative image urls.
+        md = markdown.Markdown()
+        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+        html_article = md.convert(article)
 
         if self.debug:
             with open("./paper.html", "w") as fp:
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e0239ed..0787792 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,6 +7,7 @@ __author__ = "G.J.J. van den Burg"
 
 import hashlib
 import os
+import pdfplumber
 import shutil
 import tempfile
 import unittest
@@ -238,6 +239,15 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_html_3(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://conclave-team.github.io/conclave-site/"
+        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+        # this is a proxy test to check that all images are included
+        self.assertEqual(32, len(pdfplumber.open(filename).pages))
+
 
 if __name__ == "__main__":
     unittest.main()
```
