Merge branch 'feature/experimental'

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-11-11 20:05:01 +0000
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-11-11 20:05:01 +0000
commit: 5919ed338176121b1c05d6250c30e4467ac61a4b (patch)
tree: 0202289c2afe912232c92fd76ce7420a753584fe
parent: Add note on activating alias (diff)
parent: Add experimental fix for lazy loaded images in html (diff)
download: paper2remarkable-5919ed338176121b1c05d6250c30e4467ac61a4b.tar.gz paper2remarkable-5919ed338176121b1c05d6250c30e4467ac61a4b.zip
4 files changed, 74 insertions, 11 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 57774d6..a664f23 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -38,6 +38,7 @@ class Provider(metaclass=abc.ABCMeta):
         verbose=False,
         upload=True,
         debug=False,
+        experimental=False,
         center=False,
         right=False,
         blank=False,
@@ -52,6 +53,7 @@ class Provider(metaclass=abc.ABCMeta):
     ):
         self.upload = upload
         self.debug = debug
+        self.experimental = experimental
         self.remarkable_dir = remarkable_dir
         self.rmapi_path = rmapi_path
         self.pdftoppm_path = pdftoppm_path
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index e050ea3..3e32539 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -13,6 +13,7 @@ Copyright: 2020, G.J.J. van den Burg
 
 import html2text
 import markdown
+import re
 import readability
 import titlecase
 import unidecode
@@ -133,6 +134,40 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
+    def fix_lazy_loading(self, article):
+        if not self.experimental:
+            return article
+
+        # This attempts to fix sites where the image src element points to a
+        # placeholder and the data-src attribute contains the url to the actual
+        # image.
+        regex = '<img src="(?P<src>.*?)" (?P<rest1>.*) data-src="(?P<datasrc>.*?)" (?P<rest2>.*?)>'
+        sub = '<img src="\g<datasrc>" \g<rest1> \g<rest2>>'
+
+        article, nsub = re.subn(regex, sub, article, flags=re.MULTILINE)
+        if nsub:
+            logger.info(
+                f"[experimental] Attempted to fix lazy image loading ({nsub} times). "
+                "Please report bad results."
+            )
+        return article
+
+    def preprocess_html(self, pdf_url, title, article):
+        article = self.fix_lazy_loading(article)
+
+        h2t = html2text.HTML2Text()
+        h2t.wrap_links = False
+        text = h2t.handle(article)
+
+        # Add the title back to the document
+        article = "# {title}\n\n{text}".format(title=title, text=text)
+
+        # Convert to html, fixing relative image urls.
+        md = markdown.Markdown()
+        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+        html_article = md.convert(article)
+        return html_article
+
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file
 
@@ -152,17 +187,7 @@ class HTML(Provider):
             request_html = get_page_with_retry(pdf_url, return_text=True)
             title, article = make_readable(request_html)
 
-        h2t = html2text.HTML2Text()
-        h2t.wrap_links = False
-        text = h2t.handle(article)
-
-        # Add the title back to the document
-        article = "# {title}\n\n{text}".format(title=title, text=text)
-
-        # Convert to html, fixing relative image urls.
-        md = markdown.Markdown()
-        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
-        html_article = md.convert(article)
+        html_article = self.preprocess_html(pdf_url, title, article)
 
         if self.debug:
             with open("./paper.html", "w") as fp:
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index ea24403..a2b71cb 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -41,6 +41,12 @@ def parse_args():
         action="store_true",
     )
     parser.add_argument(
+        "-e",
+        "--experimental",
+        help="enable experimental features",
+        action="store_true",
+    )
+    parser.add_argument(
         "-n",
         "--no-upload",
         help="don't upload to the reMarkable, save the output in current working dir",
@@ -211,6 +217,7 @@ def main():
             verbose=args.verbose,
             upload=not args.no_upload,
             debug=args.debug,
+            experimental=args.experimental,
             center=args.center,
             right=args.right,
             blank=args.blank,
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..d271bb5
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Additional tests for the HTML provider
+
+This file is part of paper2remarkable.
+
+"""
+
+import unittest
+
+from paper2remarkable.providers.html import HTML
+from paper2remarkable.providers.html import make_readable
+from paper2remarkable.utils import get_page_with_retry
+
+
+class TestHTML(unittest.TestCase):
+    def test_experimental_fix_lazy_loading(self):
+        url = "https://www.seriouseats.com/2015/01/tea-for-everyone.html"
+        prov = HTML(upload=False, experimental=True)
+        page = get_page_with_retry(url, return_text=True)
+        title, article = make_readable(page)
+        html_article = prov.preprocess_html(url, title, article)
+        expected_image = "https://www.seriouseats.com/images/2015/01/20150118-tea-max-falkowitz-3.jpg"
+        self.assertIn(expected_image, html_article)
+
+
+if __name__ == "__main__":
+    unittest.main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-11-11 20:05:01 +0000
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-11-11 20:05:01 +0000
commit	5919ed338176121b1c05d6250c30e4467ac61a4b (patch)
tree	0202289c2afe912232c92fd76ce7420a753584fe
parent	Add note on activating alias (diff)
parent	Add experimental fix for lazy loaded images in html (diff)
download	paper2remarkable-5919ed338176121b1c05d6250c30e4467ac61a4b.tar.gz paper2remarkable-5919ed338176121b1c05d6250c30e4467ac61a4b.zip