diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-06-20 22:42:10 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-06-20 22:42:10 +0100 |
| commit | ec000de563a32b4e757c9afde5a1b1b5ac80a511 (patch) | |
| tree | c578df0983cecc98a3b22321b2d061f4630aaf78 | |
| parent | Fix no_crop bug (diff) | |
| download | paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.tar.gz paper2remarkable-ec000de563a32b4e757c9afde5a1b1b5ac80a511.zip | |
Add support for using ReadabiliPy
| -rw-r--r-- | paper2remarkable/providers/html.py | 35 | ||||
| -rw-r--r-- | tests/test_providers.py | 7 |
2 files changed, 37 insertions, 5 deletions
```diff
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..abe30ba 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -67,7 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
             img.attrib["src"] = urllib.parse.urljoin(
                 self._base_url, img.attrib["src"]
             )
-            img.attrib["src"] = img.attrib['src'].rstrip('/')
+            img.attrib["src"] = img.attrib["src"].rstrip("/")
 
 
 class HTMLInformer(Informer):
@@ -98,6 +98,32 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
+    def make_readable(self, request_html):
+        have_readabilipy = False
+        try:
+            from readabilipy import simple_json_from_html_string
+
+            have_readabilipy = True
+        except ImportError:
+            pass
+
+        logger.info(
+            "Converting HTML using %s"
+            % ("ReadabiliPy" if have_readabilipy else "readability")
+        )
+
+        if have_readabilipy:
+            article = simple_json_from_html_string(
+                request_html, use_readability=True
+            )
+            title = article["title"]
+            raw_html = article["content"]
+        else:
+            doc = readability.Document(request_html)
+            title = doc.title()
+            raw_html = doc.summary(html_partial=True)
+        return title, raw_html
+
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file"""
         # Steps
@@ -107,10 +133,9 @@ class HTML(Provider):
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-        request_text = get_page_with_retry(pdf_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
-        raw_html = doc.summary(html_partial=True)
+
+        request_html = get_page_with_retry(pdf_url, return_text=True)
+        title, raw_html = self.make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
diff --git a/tests/test_providers.py b/tests/test_providers.py
index fb75fbd..ca6c1ae 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_html_5(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
 
 if __name__ == "__main__":
     unittest.main()
```
