From 6338388cea254ba4c6090eb17a8942a13b7a2b1c Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Fri, 25 Sep 2020 22:25:41 +0200
Subject: Clean up readability providers

This reorganizes the code a bit to ensure we only pull the HTML page
once, and use the same readability provider for both the informer and
the converter.
---
 paper2remarkable/providers/html.py | 78 ++++++++++++++++++++++----------------
 tests/test_providers.py            |  5 ++-
 2 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index abe30ba..b734bd1 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,37 @@ def url_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+def make_readable(request_html):
+    """Use an extraction method to get the main article html
+
+    This function checks if ReadabiliPy is installed with NodeJS support, as
+    that generally yields better results. If that is not available, it falls
+    back on readability.
+    """
+
+    have_readabilipy_js = False
+    try:
+        import readabilipy
+
+        have_readabilipy_js = readabilipy.simple_json.have_node()
+    except ImportError:
+        pass
+
+    if have_readabilipy_js:
+        logger.info("Converting HTML using Readability.js")
+        article = readabilipy.simple_json_from_html_string(
+            request_html, use_readability=True
+        )
+        title = article["title"]
+        raw_html = article["content"]
+    else:
+        logger.info("Converting HTML using readability")
+        doc = readability.Document(request_html)
+        title = doc.title()
+        raw_html = doc.summary(html_partial=True)
+    return title, raw_html
+
+
 class ImgProcessor(markdown.treeprocessors.Treeprocessor):
     def __init__(self, base_url, *args, **kwargs):
         self._base_url = base_url
@@ -73,11 +104,15 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
+        self._cached_title = None
+        self._cached_article = None
 
     def get_filename(self, abs_url):
-        request_text = get_page_with_retry(abs_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
+        request_html = get_page_with_retry(abs_url, return_text=True)
+        title, article = make_readable(request_html)
+
+        self._cached_title = title
+        self._cached_article = article
 
         # Clean the title and make it titlecase
         title = clean_string(title)
@@ -98,32 +133,6 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
-    def make_readable(self, request_html):
-        have_readabilipy = False
-        try:
-            from readabilipy import simple_json_from_html_string
-
-            have_readabilipy = True
-        except ImportError:
-            pass
-
-        logger.info(
-            "Converting HTML using %s"
-            % ("ReadabiliPy" if have_readabilipy else "readability")
-        )
-
-        if have_readabilipy:
-            article = simple_json_from_html_string(
-                request_html, use_readability=True
-            )
-            title = article["title"]
-            raw_html = article["content"]
-        else:
-            doc = readability.Document(request_html)
-            title = doc.title()
-            raw_html = doc.summary(html_partial=True)
-        return title, raw_html
-
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file"""
         # Steps
@@ -133,13 +142,16 @@ class HTML(Provider):
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-
-        request_html = get_page_with_retry(pdf_url, return_text=True)
-        title, raw_html = self.make_readable(request_html)
+        if self.informer._cached_title and self.informer._cached_article:
+            title = self.informer._cached_title
+            article = self.informer._cached_article
+        else:
+            request_html = get_page_with_retry(pdf_url, return_text=True)
+            title, article = make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
-        text = h2t.handle(raw_html)
+        text = h2t.handle(article)
 
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index ca6c1ae..479fb84 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -255,7 +255,10 @@ class TestProviders(unittest.TestCase):
     def test_html_3(self):
         prov = HTML(upload=False, verbose=VERBOSE)
         url = "https://conclave-team.github.io/conclave-site/"
-        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        # NOTE: Title differs between Readability.JS and readability-lxml, we
+        # assume that testing is done with Readability.JS
+        exp = "Conclave.pdf"
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
         # this is a proxy test to check that all images are included
-- 
cgit v1.2.3