about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-11-11 20:05:01 +0000
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-11-11 20:05:01 +0000
commit: 5919ed338176121b1c05d6250c30e4467ac61a4b (patch)
tree: 0202289c2afe912232c92fd76ce7420a753584fe
parent: Add note on activating alias (diff)
parent: Add experimental fix for lazy loaded images in html (diff)
download: paper2remarkable-5919ed338176121b1c05d6250c30e4467ac61a4b.tar.gz
download: paper2remarkable-5919ed338176121b1c05d6250c30e4467ac61a4b.zip
Merge branch 'feature/experimental'
-rw-r--r-- paper2remarkable/providers/_base.py 2
-rw-r--r-- paper2remarkable/providers/html.py 47
-rw-r--r-- paper2remarkable/ui.py 7
-rw-r--r-- tests/test_html.py 29
4 files changed, 74 insertions, 11 deletions
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 57774d6..a664f23 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -38,6 +38,7 @@ class Provider(metaclass=abc.ABCMeta):
verbose=False,
upload=True,
debug=False,
+ experimental=False,
center=False,
right=False,
blank=False,
@@ -52,6 +53,7 @@ class Provider(metaclass=abc.ABCMeta):
):
self.upload = upload
self.debug = debug
+ self.experimental = experimental
self.remarkable_dir = remarkable_dir
self.rmapi_path = rmapi_path
self.pdftoppm_path = pdftoppm_path
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index e050ea3..3e32539 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -13,6 +13,7 @@ Copyright: 2020, G.J.J. van den Burg
import html2text
import markdown
+import re
import readability
import titlecase
import unidecode
@@ -133,6 +134,40 @@ class HTML(Provider):
def get_abs_pdf_urls(self, url):
return url, url
+ def fix_lazy_loading(self, article):
+ if not self.experimental:
+ return article
+
+ # This attempts to fix sites where the image src element points to a
+ # placeholder and the data-src attribute contains the url to the actual
+ # image.
+ regex = '<img src="(?P<src>.*?)" (?P<rest1>.*) data-src="(?P<datasrc>.*?)" (?P<rest2>.*?)>'
+ sub = '<img src="\g<datasrc>" \g<rest1> \g<rest2>>'
+
+ article, nsub = re.subn(regex, sub, article, flags=re.MULTILINE)
+ if nsub:
+ logger.info(
+ f"[experimental] Attempted to fix lazy image loading ({nsub} times). "
+ "Please report bad results."
+ )
+ return article
+
+ def preprocess_html(self, pdf_url, title, article):
+ article = self.fix_lazy_loading(article)
+
+ h2t = html2text.HTML2Text()
+ h2t.wrap_links = False
+ text = h2t.handle(article)
+
+ # Add the title back to the document
+ article = "# {title}\n\n{text}".format(title=title, text=text)
+
+ # Convert to html, fixing relative image urls.
+ md = markdown.Markdown()
+ md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+ html_article = md.convert(article)
+ return html_article
+
def retrieve_pdf(self, pdf_url, filename):
"""Turn the HTML article in a clean pdf file
@@ -152,17 +187,7 @@ class HTML(Provider):
request_html = get_page_with_retry(pdf_url, return_text=True)
title, article = make_readable(request_html)
- h2t = html2text.HTML2Text()
- h2t.wrap_links = False
- text = h2t.handle(article)
-
- # Add the title back to the document
- article = "# {title}\n\n{text}".format(title=title, text=text)
-
- # Convert to html, fixing relative image urls.
- md = markdown.Markdown()
- md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
- html_article = md.convert(article)
+ html_article = self.preprocess_html(pdf_url, title, article)
if self.debug:
with open("./paper.html", "w") as fp:
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index ea24403..a2b71cb 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -41,6 +41,12 @@ def parse_args():
action="store_true",
)
parser.add_argument(
+ "-e",
+ "--experimental",
+ help="enable experimental features",
+ action="store_true",
+ )
+ parser.add_argument(
"-n",
"--no-upload",
help="don't upload to the reMarkable, save the output in current working dir",
@@ -211,6 +217,7 @@ def main():
verbose=args.verbose,
upload=not args.no_upload,
debug=args.debug,
+ experimental=args.experimental,
center=args.center,
right=args.right,
blank=args.blank,
diff --git a/tests/test_html.py b/tests/test_html.py
new file mode 100644
index 0000000..d271bb5
--- /dev/null
+++ b/tests/test_html.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Additional tests for the HTML provider
+
+This file is part of paper2remarkable.
+
+"""
+
+import unittest
+
+from paper2remarkable.providers.html import HTML
+from paper2remarkable.providers.html import make_readable
+from paper2remarkable.utils import get_page_with_retry
+
+
+class TestHTML(unittest.TestCase):
+ def test_experimental_fix_lazy_loading(self):
+ url = "https://www.seriouseats.com/2015/01/tea-for-everyone.html"
+ prov = HTML(upload=False, experimental=True)
+ page = get_page_with_retry(url, return_text=True)
+ title, article = make_readable(page)
+ html_article = prov.preprocess_html(url, title, article)
+ expected_image = "https://www.seriouseats.com/images/2015/01/20150118-tea-max-falkowitz-3.jpg"
+ self.assertIn(expected_image, html_article)
+
+
+if __name__ == "__main__":
+ unittest.main()