about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- paper2remarkable/providers/html.py | 78
-rw-r--r-- tests/test_providers.py | 5
2 files changed, 49 insertions, 34 deletions
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index abe30ba..b734bd1 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,37 @@ def url_fetcher(url):
return weasyprint.default_url_fetcher(url)
+def make_readable(request_html):
+ """Use an extraction method to get the main article HTML
+
+ This function checks if ReadabiliPy is installed with NodeJS support, as
+ that generally yields better results. If that is not available, it falls
+ back on readability.
+ """
+
+ have_readabilipy_js = False
+ try:
+ import readabilipy
+
+ have_readabilipy_js = readabilipy.simple_json.have_node()
+ except ImportError:
+ pass
+
+ if have_readabilipy_js:
+ logger.info("Converting HTML using Readability.js")
+ article = readabilipy.simple_json_from_html_string(
+ request_html, use_readability=True
+ )
+ title = article["title"]
+ raw_html = article["content"]
+ else:
+ logger.info("Converting HTML using readability")
+ doc = readability.Document(request_html)
+ title = doc.title()
+ raw_html = doc.summary(html_partial=True)
+ return title, raw_html
+
+
class ImgProcessor(markdown.treeprocessors.Treeprocessor):
def __init__(self, base_url, *args, **kwargs):
self._base_url = base_url
@@ -73,11 +104,15 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
class HTMLInformer(Informer):
def __init__(self):
super().__init__()
+ self._cached_title = None
+ self._cached_article = None
def get_filename(self, abs_url):
- request_text = get_page_with_retry(abs_url, return_text=True)
- doc = readability.Document(request_text)
- title = doc.title()
+ request_html = get_page_with_retry(abs_url, return_text=True)
+ title, article = make_readable(request_html)
+
+ self._cached_title = title
+ self._cached_article = article
# Clean the title and make it titlecase
title = clean_string(title)
@@ -98,32 +133,6 @@ class HTML(Provider):
def get_abs_pdf_urls(self, url):
return url, url
- def make_readable(self, request_html):
- have_readabilipy = False
- try:
- from readabilipy import simple_json_from_html_string
-
- have_readabilipy = True
- except ImportError:
- pass
-
- logger.info(
- "Converting HTML using %s"
- % ("ReadabiliPy" if have_readabilipy else "readability")
- )
-
- if have_readabilipy:
- article = simple_json_from_html_string(
- request_html, use_readability=True
- )
- title = article["title"]
- raw_html = article["content"]
- else:
- doc = readability.Document(request_html)
- title = doc.title()
- raw_html = doc.summary(html_partial=True)
- return title, raw_html
-
def retrieve_pdf(self, pdf_url, filename):
"""Turn the HTML article into a clean PDF file"""
# Steps
@@ -133,13 +142,16 @@ class HTML(Provider):
# 4. Convert the markdown back to HTML (this is done to sanitize HTML)
# 5. Convert the HTML to PDF, pulling in images where needed
# 6. Save the PDF to the specified filename.
-
- request_html = get_page_with_retry(pdf_url, return_text=True)
- title, raw_html = self.make_readable(request_html)
+ if self.informer._cached_title and self.informer._cached_article:
+ title = self.informer._cached_title
+ article = self.informer._cached_article
+ else:
+ request_html = get_page_with_retry(pdf_url, return_text=True)
+ title, article = make_readable(request_html)
h2t = html2text.HTML2Text()
h2t.wrap_links = False
- text = h2t.handle(raw_html)
+ text = h2t.handle(article)
# Add the title back to the document
article = "# {title}\n\n{text}".format(title=title, text=text)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index ca6c1ae..479fb84 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -255,7 +255,10 @@ class TestProviders(unittest.TestCase):
def test_html_3(self):
prov = HTML(upload=False, verbose=VERBOSE)
url = "https://conclave-team.github.io/conclave-site/"
- exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+ #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+ # NOTE: The extracted title differs between Readability.js and
+ # readability-lxml; this test assumes Readability.js is being used.
+ exp = "Conclave.pdf"
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
# this is a proxy test to check that all images are included