diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-11-11 19:36:07 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-11-11 19:36:07 +0000 |
| commit | 882805565241bf2765b632e7b89a1f733a935a45 (patch) | |
| tree | 0202289c2afe912232c92fd76ce7420a753584fe | |
| parent | Add note on activating alias (diff) | |
| download | paper2remarkable-882805565241bf2765b632e7b89a1f733a935a45.tar.gz paper2remarkable-882805565241bf2765b632e7b89a1f733a935a45.zip | |
Add experimental fix for lazy loaded images in html
| -rw-r--r-- | paper2remarkable/providers/_base.py | 2 | ||||
| -rw-r--r-- | paper2remarkable/providers/html.py | 47 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 7 | ||||
| -rw-r--r-- | tests/test_html.py | 29 |
4 files changed, 74 insertions, 11 deletions
```diff
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 57774d6..a664f23 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -38,6 +38,7 @@ class Provider(metaclass=abc.ABCMeta):
         verbose=False,
         upload=True,
         debug=False,
+        experimental=False,
         center=False,
         right=False,
         blank=False,
@@ -52,6 +53,7 @@ class Provider(metaclass=abc.ABCMeta):
     ):
         self.upload = upload
         self.debug = debug
+        self.experimental = experimental
         self.remarkable_dir = remarkable_dir
         self.rmapi_path = rmapi_path
         self.pdftoppm_path = pdftoppm_path
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index e050ea3..3e32539 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -13,6 +13,7 @@ Copyright: 2020, G.J.J. van den Burg
 
 import html2text
 import markdown
+import re
 import readability
 import titlecase
 import unidecode
@@ -133,6 +134,40 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
+    def fix_lazy_loading(self, article):
+        if not self.experimental:
+            return article
+
+        # This attempts to fix sites where the image src element points to a
+        # placeholder and the data-src attribute contains the url to the actual
+        # image.
+        regex = '<img src="(?P<src>.*?)" (?P<rest1>.*) data-src="(?P<datasrc>.*?)" (?P<rest2>.*?)>'
+        sub = '<img src="\g<datasrc>" \g<rest1> \g<rest2>>'
+
+        article, nsub = re.subn(regex, sub, article, flags=re.MULTILINE)
+        if nsub:
+            logger.info(
+                f"[experimental] Attempted to fix lazy image loading ({nsub} times). "
+                "Please report bad results."
+            )
+        return article
+
+    def preprocess_html(self, pdf_url, title, article):
+        article = self.fix_lazy_loading(article)
+
+        h2t = html2text.HTML2Text()
+        h2t.wrap_links = False
+        text = h2t.handle(article)
+
+        # Add the title back to the document
+        article = "# {title}\n\n{text}".format(title=title, text=text)
+
+        # Convert to html, fixing relative image urls.
+        md = markdown.Markdown()
+        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+        html_article = md.convert(article)
+        return html_article
+
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file
 
@@ -152,17 +187,7 @@ class HTML(Provider):
         request_html = get_page_with_retry(pdf_url, return_text=True)
         title, article = make_readable(request_html)
 
-        h2t = html2text.HTML2Text()
-        h2t.wrap_links = False
-        text = h2t.handle(article)
-
-        # Add the title back to the document
-        article = "# {title}\n\n{text}".format(title=title, text=text)
-
-        # Convert to html, fixing relative image urls.
-        md = markdown.Markdown()
-        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
-        html_article = md.convert(article)
+        html_article = self.preprocess_html(pdf_url, title, article)
 
         if self.debug:
             with open("./paper.html", "w") as fp:
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index ea24403..a2b71cb 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -41,6 +41,12 @@ def parse_args():
         action="store_true",
     )
     parser.add_argument(
+        "-e",
+        "--experimental",
+        help="enable experimental features",
+        action="store_true",
+    )
+    parser.add_argument(
         "-n",
         "--no-upload",
         help="don't upload to the reMarkable, save the output in current working dir",
@@ -211,6 +217,7 @@ def main():
         verbose=args.verbose,
         upload=not args.no_upload,
         debug=args.debug,
+        experimental=args.experimental,
         center=args.center,
         right=args.right,
         blank=args.blank,
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..d271bb5
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Additional tests for the HTML provider
+
+This file is part of paper2remarkable.
+
+"""
+
+import unittest
+
+from paper2remarkable.providers.html import HTML
+from paper2remarkable.providers.html import make_readable
+from paper2remarkable.utils import get_page_with_retry
+
+
+class TestHTML(unittest.TestCase):
+    def test_experimental_fix_lazy_loading(self):
+        url = "https://www.seriouseats.com/2015/01/tea-for-everyone.html"
+        prov = HTML(upload=False, experimental=True)
+        page = get_page_with_retry(url, return_text=True)
+        title, article = make_readable(page)
+        html_article = prov.preprocess_html(url, title, article)
+        expected_image = "https://www.seriouseats.com/images/2015/01/20150118-tea-max-falkowitz-3.jpg"
+        self.assertIn(expected_image, html_article)
+
+
+if __name__ == "__main__":
+    unittest.main()
```
