Add support for using ReadabiliPy

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-06-20 22:42:10 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-06-20 22:42:10 +0100
commit: ec000de563a32b4e757c9afde5a1b1b5ac80a511 (patch)
tree: c578df0983cecc98a3b22321b2d061f4630aaf78
parent: Fix no_crop bug (diff)
download: paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.tar.gz paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.zip
2 files changed, 37 insertions, 5 deletions
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..abe30ba 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -67,7 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
             img.attrib["src"] = urllib.parse.urljoin(
                 self._base_url, img.attrib["src"]
             )
-            img.attrib["src"] = img.attrib['src'].rstrip('/')
+            img.attrib["src"] = img.attrib["src"].rstrip("/")
 
 
 class HTMLInformer(Informer):
@@ -98,6 +98,32 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
+    def make_readable(self, request_html):
+        have_readabilipy = False
+        try:
+            from readabilipy import simple_json_from_html_string
+
+            have_readabilipy = True
+        except ImportError:
+            pass
+
+        logger.info(
+            "Converting HTML using %s"
+            % ("ReadabiliPy" if have_readabilipy else "readability")
+        )
+
+        if have_readabilipy:
+            article = simple_json_from_html_string(
+                request_html, use_readability=True
+            )
+            title = article["title"]
+            raw_html = article["content"]
+        else:
+            doc = readability.Document(request_html)
+            title = doc.title()
+            raw_html = doc.summary(html_partial=True)
+        return title, raw_html
+
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file"""
         # Steps
@@ -107,10 +133,9 @@ class HTML(Provider):
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-        request_text = get_page_with_retry(pdf_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
-        raw_html = doc.summary(html_partial=True)
+
+        request_html = get_page_with_retry(pdf_url, return_text=True)
+        title, raw_html = self.make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
diff --git a/tests/test_providers.py b/tests/test_providers.py
index fb75fbd..ca6c1ae 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_html_5(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
 
 if __name__ == "__main__":
     unittest.main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-06-20 22:42:10 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-06-20 22:42:10 +0100
commit	ec000de563a32b4e757c9afde5a1b1b5ac80a511 (patch)
tree	c578df0983cecc98a3b22321b2d061f4630aaf78
parent	Fix no_crop bug (diff)
download	paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.tar.gz paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.zip