aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-06-20 22:42:10 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-06-20 22:42:10 +0100
commit ec000de563a32b4e757c9afde5a1b1b5ac80a511 (patch)
tree c578df0983cecc98a3b22321b2d061f4630aaf78
parent Fix no_crop bug (diff)
download paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.tar.gz
paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.zip
Add support for using ReadabiliPy
-rw-r--r-- paper2remarkable/providers/html.py 35
-rw-r--r-- tests/test_providers.py 7
2 files changed, 37 insertions, 5 deletions
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..abe30ba 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -67,7 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
img.attrib["src"] = urllib.parse.urljoin(
self._base_url, img.attrib["src"]
)
- img.attrib["src"] = img.attrib['src'].rstrip('/')
+ img.attrib["src"] = img.attrib["src"].rstrip("/")
class HTMLInformer(Informer):
@@ -98,6 +98,32 @@ class HTML(Provider):
def get_abs_pdf_urls(self, url):
return url, url
+    def make_readable(self, request_html):
+        """Extract the article title and main content from raw HTML.
+
+        ReadabiliPy is an optional dependency: it is used when it can be
+        imported. When it is missing, or when it yields no content for the
+        page, extraction is done with ``readability`` instead, so the
+        caller does not receive ``None`` in place of the article HTML.
+
+        Returns a ``(title, raw_html)`` tuple.
+        """
+        try:
+            from readabilipy import simple_json_from_html_string
+        except ImportError:
+            # Optional dependency not installed; use readability below.
+            simple_json_from_html_string = None
+
+        # NOTE: the logger is called with a single pre-formatted message, as
+        # in the rest of this method's history; its definition is not visible
+        # here, so no extra positional arguments are passed to it.
+        if simple_json_from_html_string is not None:
+            logger.info("Converting HTML using %s" % "ReadabiliPy")
+            article = simple_json_from_html_string(
+                request_html, use_readability=True
+            )
+            if article["content"]:
+                return article["title"], article["content"]
+            # ReadabiliPy could not extract an article from this page.
+            logger.info("ReadabiliPy returned no content, trying readability")
+
+        logger.info("Converting HTML using %s" % "readability")
+        doc = readability.Document(request_html)
+        return doc.title(), doc.summary(html_partial=True)
+
def retrieve_pdf(self, pdf_url, filename):
"""Turn the HTML article in a clean pdf file"""
# Steps
@@ -107,10 +133,9 @@ class HTML(Provider):
# 4. Convert the markdown back to HTML (this is done to sanitize HTML)
# 4. Convert the HTML to PDF, pulling in images where needed
# 5. Save the PDF to the specified filename.
- request_text = get_page_with_retry(pdf_url, return_text=True)
- doc = readability.Document(request_text)
- title = doc.title()
- raw_html = doc.summary(html_partial=True)
+
+ request_html = get_page_with_retry(pdf_url, return_text=True)
+ title, raw_html = self.make_readable(request_html)
h2t = html2text.HTML2Text()
h2t.wrap_links = False
diff --git a/tests/test_providers.py b/tests/test_providers.py
index fb75fbd..ca6c1ae 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase):
# this is a proxy test to check that all images are included
self.assertEqual(4, len(pdfplumber.open(filename).pages))
+    def test_html_5(self):
+        """Convert a Spiegel article with the HTML provider and check the PDF."""
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        # Open the PDF in a context manager so its file handle is closed
+        # once the page count has been read.
+        with pdfplumber.open(filename) as pdf:
+            self.assertEqual(4, len(pdf.pages))
+
if __name__ == "__main__":
unittest.main()