-rw-r--r--  .travis.yml                          4
-rw-r--r--  paper2remarkable/providers/html.py  55
-rw-r--r--  setup.py                            12
-rw-r--r--  tests/test_providers.py             13
4 files changed, 68 insertions, 16 deletions
diff --git a/.travis.yml b/.travis.yml
index 8399160..6a57cd3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,9 +7,11 @@ python:
 before_install:
   - sudo apt-get update
   - sudo apt-get install ghostscript pdftk poppler-utils qpdf
+  - nvm install v12.18.1
+  - nvm use v12.18.1
 
 install:
-  - pip install -e .[dev]
+  - pip install -e .[test]
 
 script:
   - green -vv -a ./tests
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..b734bd1 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,37 @@ def url_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+def make_readable(request_html):
+    """Use an extraction method to get the main article html
+
+    This function checks if ReadabiliPy is installed with NodeJS support, as
+    that generally yields better results. If that is not available, it falls
+    back on readability.
+    """
+
+    have_readabilipy_js = False
+    try:
+        import readabilipy
+
+        have_readabilipy_js = readabilipy.simple_json.have_node()
+    except ImportError:
+        pass
+
+    if have_readabilipy_js:
+        logger.info("Converting HTML using Readability.js")
+        article = readabilipy.simple_json_from_html_string(
+            request_html, use_readability=True
+        )
+        title = article["title"]
+        raw_html = article["content"]
+    else:
+        logger.info("Converting HTML using readability")
+        doc = readability.Document(request_html)
+        title = doc.title()
+        raw_html = doc.summary(html_partial=True)
+    return title, raw_html
+
+
 class ImgProcessor(markdown.treeprocessors.Treeprocessor):
     def __init__(self, base_url, *args, **kwargs):
         self._base_url = base_url
@@ -67,17 +98,21 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
             img.attrib["src"] = urllib.parse.urljoin(
                 self._base_url, img.attrib["src"]
             )
-            img.attrib["src"] = img.attrib['src'].rstrip('/')
+            img.attrib["src"] = img.attrib["src"].rstrip("/")
 
 
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
+        self._cached_title = None
+        self._cached_article = None
 
     def get_filename(self, abs_url):
-        request_text = get_page_with_retry(abs_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
+        request_html = get_page_with_retry(abs_url, return_text=True)
+        title, article = make_readable(request_html)
+
+        self._cached_title = title
+        self._cached_article = article
 
         # Clean the title and make it titlecase
         title = clean_string(title)
@@ -107,14 +142,16 @@
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-        request_text = get_page_with_retry(pdf_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
-        raw_html = doc.summary(html_partial=True)
+        if self.informer._cached_title and self.informer._cached_article:
+            title = self.informer._cached_title
+            article = self.informer._cached_article
+        else:
+            request_html = get_page_with_retry(pdf_url, return_text=True)
+            title, article = make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
-        text = h2t.handle(raw_html)
+        text = h2t.handle(article)
 
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -29,18 +29,20 @@ REQUIRED = [
     "readability-lxml>=0.7.1",
     "html2text>=2020.1.16",
     "weasyprint>=51",
-    "markdown>=3.1.1"
+    "markdown>=3.1.1",
 ]
 
+full_require = ["readabilipy"]
 docs_require = []
-test_require = []
-dev_require = ["green"]
+test_require = ["green"]
+dev_require = []
 
 # What packages are optional?
 EXTRAS = {
+    "full": full_require,
     "docs": docs_require,
-    "tests": test_require,
-    "dev": docs_require + test_require + dev_require,
+    "test": test_require + full_require,
+    "dev": docs_require + test_require + dev_require + full_require,
 }
 
 # The rest you shouldn't have to touch too much :)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index eeaef82..70d012a 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -278,7 +278,10 @@ class TestProviders(unittest.TestCase):
     def test_html_3(self):
         prov = HTML(upload=False, verbose=VERBOSE)
         url = "https://conclave-team.github.io/conclave-site/"
-        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        # NOTE: Title differs between Readability.JS and readability-lxml, we
+        # assume that testing is done with Readability.JS
+        exp = "Conclave.pdf"
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
         # this is a proxy test to check that all images are included
@@ -291,6 +294,13 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_html_5(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
     def test_semantic_scholar_1(self):
         prov = SemanticScholar(upload=False, verbose=VERBOSE)
         url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
@@ -320,5 +330,6 @@ class TestProviders(unittest.TestCase):
         self.assertEqual(exp, os.path.basename(filename))
 
 
+
 if __name__ == "__main__":
     unittest.main()
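For reference, the extraction fallback that the new make_readable() helper introduces can be exercised on its own. The sketch below is a minimal adaptation of that logic, assuming readabilipy and readability-lxml are installed; the sample_html string and the extract_article name are illustrative and not part of this patch. It prefers ReadabiliPy with Node.js support (which runs Mozilla's Readability.js) and falls back on readability-lxml otherwise. In this commit the readabilipy dependency is pulled in via the new "full" extra (for example, pip install -e .[full] or .[test] from a checkout).

# Standalone sketch of the fallback implemented by make_readable() above.
# Assumes readabilipy (optionally with Node.js) and readability-lxml are installed.
# sample_html and extract_article are illustrative, not part of the patch.
sample_html = """
<html>
  <head><title>Example article</title></head>
  <body><article><h1>Example article</h1><p>Some body text.</p></article></body>
</html>
"""


def extract_article(html):
    """Return (title, article_html), preferring Readability.js when available."""
    try:
        import readabilipy

        if readabilipy.simple_json.have_node():
            # ReadabiliPy with Node.js available: use Mozilla's Readability.js
            article = readabilipy.simple_json_from_html_string(
                html, use_readability=True
            )
            return article["title"], article["content"]
    except ImportError:
        pass

    # Fall back on the pure-Python readability-lxml package
    import readability

    doc = readability.Document(html)
    return doc.title(), doc.summary(html_partial=True)


if __name__ == "__main__":
    title, content = extract_article(sample_html)
    print(title)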
