From ec000de563a32b4e757c9afde5a1b1b5ac80a511 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Sat, 20 Jun 2020 22:42:10 +0100
Subject: Add support for using ReadabiliPy

---
 paper2remarkable/providers/html.py | 35 ++++++++++++++++++++++++++++++-----
 tests/test_providers.py            |  7 +++++++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..abe30ba 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -67,7 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
             img.attrib["src"] = urllib.parse.urljoin(
                 self._base_url, img.attrib["src"]
             )
-            img.attrib["src"] = img.attrib['src'].rstrip('/')
+            img.attrib["src"] = img.attrib["src"].rstrip("/")
 
 
 class HTMLInformer(Informer):
@@ -98,6 +98,32 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
+    def make_readable(self, request_html):
+        have_readabilipy = False
+        try:
+            from readabilipy import simple_json_from_html_string
+
+            have_readabilipy = True
+        except ImportError:
+            pass
+
+        logger.info(
+            "Converting HTML using %s"
+            % ("ReadabiliPy" if have_readabilipy else "readability")
+        )
+
+        if have_readabilipy:
+            article = simple_json_from_html_string(
+                request_html, use_readability=True
+            )
+            title = article["title"]
+            raw_html = article["content"]
+        else:
+            doc = readability.Document(request_html)
+            title = doc.title()
+            raw_html = doc.summary(html_partial=True)
+        return title, raw_html
+
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file"""
         # Steps
@@ -107,10 +133,9 @@ class HTML(Provider):
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-        request_text = get_page_with_retry(pdf_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
-        raw_html = doc.summary(html_partial=True)
+
+        request_html = get_page_with_retry(pdf_url, return_text=True)
+        title, raw_html = self.make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
diff --git a/tests/test_providers.py b/tests/test_providers.py
index fb75fbd..ca6c1ae 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_html_5(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
cgit v1.2.3


From 603353cd2cf16f99cc5eb823918105146fea6bcb Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Sat, 20 Jun 2020 22:59:40 +0100
Subject: Make readabilipy an optional dependency

---
 setup.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 82a693a..d1de5bd 100644
--- a/setup.py
+++ b/setup.py
@@ -29,18 +29,23 @@ REQUIRED = [
     "readability-lxml>=0.7.1",
     "html2text>=2020.1.16",
     "weasyprint>=51",
-    "markdown>=3.1.1"
+    "markdown>=3.1.1",
 ]
 
+full_require = [
+    # TEMPORARY: Until ReadabiliPy is available on PyPI
+    "readabilipy @ git+https://git@github.com/GjjvdBurg/ReadabiliPy@packaging#egg=readabilipy",
+]
 docs_require = []
 test_require = []
 dev_require = ["green"]
 
 # What packages are optional?
 EXTRAS = {
+    "full": full_require,
     "docs": docs_require,
     "tests": test_require,
-    "dev": docs_require + test_require + dev_require,
+    "dev": docs_require + test_require + dev_require + full_require,
 }
 
 # The rest you shouldn't have to touch too much :)
-- 
cgit v1.2.3


From 65c2ad9c4be36fc10ba06579baf1fdc549dae99d Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Sat, 20 Jun 2020 23:15:00 +0100
Subject: Upgrade nvm on travis

---
 .travis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 8399160..7d220e0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,6 +7,8 @@ python:
 before_install:
   - sudo apt-get update
   - sudo apt-get install ghostscript pdftk poppler-utils qpdf
+  - nvm install v12.18.1
+  - nvm use v12.18.1
 
 install:
   - pip install -e .[dev]
-- 
cgit v1.2.3


From 6338388cea254ba4c6090eb17a8942a13b7a2b1c Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Sep 2020 22:25:41 +0200
Subject: Clean up readability providers

This reorganizes the code a bit to ensure we only pull
the HTML page once, and use the same readability provider
for both the informer and the converter.
---
 paper2remarkable/providers/html.py | 78 ++++++++++++++++++++++----------------
 tests/test_providers.py            |  5 ++-
 2 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index abe30ba..b734bd1 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,37 @@ def url_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+def make_readable(request_html):
+    """Use an extraction method to get the main article html
+
+    This function checks if ReadabiliPy is installed with NodeJS support, as
+    that generally yields better results. If that is not available, it falls
+    back on readability.
+    """
+
+    have_readabilipy_js = False
+    try:
+        import readabilipy
+
+        have_readabilipy_js = readabilipy.simple_json.have_node()
+    except ImportError:
+        pass
+
+    if have_readabilipy_js:
+        logger.info("Converting HTML using Readability.js")
+        article = readabilipy.simple_json_from_html_string(
+            request_html, use_readability=True
+        )
+        title = article["title"]
+        raw_html = article["content"]
+    else:
+        logger.info("Converting HTML using readability")
+        doc = readability.Document(request_html)
+        title = doc.title()
+        raw_html = doc.summary(html_partial=True)
+    return title, raw_html
+
+
 class ImgProcessor(markdown.treeprocessors.Treeprocessor):
     def __init__(self, base_url, *args, **kwargs):
         self._base_url = base_url
@@ -73,11 +104,15 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
+        self._cached_title = None
+        self._cached_article = None
 
     def get_filename(self, abs_url):
-        request_text = get_page_with_retry(abs_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
+        request_html = get_page_with_retry(abs_url, return_text=True)
+        title, article = make_readable(request_html)
+
+        self._cached_title = title
+        self._cached_article = article
 
         # Clean the title and make it titlecase
         title = clean_string(title)
@@ -98,32 +133,6 @@ class HTML(Provider):
     def get_abs_pdf_urls(self, url):
         return url, url
 
-    def make_readable(self, request_html):
-        have_readabilipy = False
-        try:
-            from readabilipy import simple_json_from_html_string
-
-            have_readabilipy = True
-        except ImportError:
-            pass
-
-        logger.info(
-            "Converting HTML using %s"
-            % ("ReadabiliPy" if have_readabilipy else "readability")
-        )
-
-        if have_readabilipy:
-            article = simple_json_from_html_string(
-                request_html, use_readability=True
-            )
-            title = article["title"]
-            raw_html = article["content"]
-        else:
-            doc = readability.Document(request_html)
-            title = doc.title()
-            raw_html = doc.summary(html_partial=True)
-        return title, raw_html
-
     def retrieve_pdf(self, pdf_url, filename):
         """Turn the HTML article in a clean pdf file"""
         # Steps
@@ -133,13 +142,16 @@ class HTML(Provider):
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-
-        request_html = get_page_with_retry(pdf_url, return_text=True)
-        title, raw_html = self.make_readable(request_html)
+        if self.informer._cached_title and self.informer._cached_article:
+            title = self.informer._cached_title
+            article = self.informer._cached_article
+        else:
+            request_html = get_page_with_retry(pdf_url, return_text=True)
+            title, article = make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
-        text = h2t.handle(raw_html)
+        text = h2t.handle(article)
 
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index ca6c1ae..479fb84 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -255,7 +255,10 @@ class TestProviders(unittest.TestCase):
     def test_html_3(self):
         prov = HTML(upload=False, verbose=VERBOSE)
         url = "https://conclave-team.github.io/conclave-site/"
-        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        # NOTE: The title differs between Readability.js and readability-lxml
+        # (commented out above); we assume testing is done with Readability.js.
+        exp = "Conclave.pdf"
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
         # this is a proxy test to check that all images are included
-- 
cgit v1.2.3


From 0bf303a5607f42658252ef27e9f3fee3e6b84d19 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Sep 2020 22:26:14 +0200
Subject: Clean up "full" installation mode

---
 .travis.yml | 2 +-
 setup.py    | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 7d220e0..a1cb636 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,7 @@ before_install:
   - nvm use v12.18.1
 
 install:
-  - pip install -e .[dev]
+  - pip install -e .[full]
 
 script:
   - green -vv -a ./tests
diff --git a/setup.py b/setup.py
index d1de5bd..0635253 100644
--- a/setup.py
+++ b/setup.py
@@ -32,10 +32,7 @@ REQUIRED = [
     "markdown>=3.1.1",
 ]
 
-full_require = [
-    # TEMPORARY: Until ReadabiliPy is available on PyPI
-    "readabilipy @ git+https://git@github.com/GjjvdBurg/ReadabiliPy@packaging#egg=readabilipy",
-]
+full_require = ["readabilipy"]
 docs_require = []
 test_require = []
 dev_require = ["green"]
-- 
cgit v1.2.3


From 8e0804ad491f2179135a138f9656088213ae8431 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Sep 2020 22:29:44 +0200
Subject: Ensure we test the test version on travis

---
 .travis.yml | 2 +-
 setup.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a1cb636..6a57cd3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,7 @@ before_install:
   - nvm use v12.18.1
 
 install:
-  - pip install -e .[full]
+  - pip install -e .[test]
 
 script:
   - green -vv -a ./tests
diff --git a/setup.py b/setup.py
index 0635253..25b6895 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ dev_require = ["green"]
 EXTRAS = {
     "full": full_require,
     "docs": docs_require,
-    "tests": test_require,
+    "test": test_require + full_require,
     "dev": docs_require + test_require + dev_require + full_require,
 }
 
-- 
cgit v1.2.3


From 7e1c84db7d11541062709eb5208c2f804fac4da8 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Sep 2020 22:34:21 +0200
Subject: Move green to test dependencies

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 25b6895..54a8cb1 100644
--- a/setup.py
+++ b/setup.py
@@ -34,8 +34,8 @@ REQUIRED = [
 
 full_require = ["readabilipy"]
 docs_require = []
-test_require = []
-dev_require = ["green"]
+test_require = ["green"]
+dev_require = []
 
 # What packages are optional?
 EXTRAS = {
-- 
cgit v1.2.3


From 3b5e7eb5f34f92496aa96ee088db2925eadafd65 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 25 Sep 2020 23:36:47 +0200
Subject: Improve docs

---
 paper2remarkable/providers/html.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index b734bd1..e050ea3 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -134,14 +134,17 @@ class HTML(Provider):
         return url, url
 
     def retrieve_pdf(self, pdf_url, filename):
-        """Turn the HTML article in a clean pdf file"""
-        # Steps
-        # 1. Pull the HTML page using requests
-        # 2. Extract the article part of the page using readability
-        # 3. Convert the article HTML to markdown using html2text
-        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
-        # 4. Convert the HTML to PDF, pulling in images where needed
-        # 5. Save the PDF to the specified filename.
+        """Turn the HTML article into a clean PDF file
+
+        This function takes the following steps:
+
+        1. Pull the HTML page using requests, if not done in Informer
+        2. Extract the article part of the page using readability/ReadabiliPy
+        3. Convert the article HTML to markdown using html2text
+        4. Convert the markdown back to HTML (done to sanitize the HTML)
+        5. Convert the HTML to PDF, pulling in images where needed
+        6. Save the PDF to the specified filename.
+        """
         if self.informer._cached_title and self.informer._cached_article:
             title = self.informer._cached_title
             article = self.informer._cached_article
-- 
cgit v1.2.3