From ec000de563a32b4e757c9afde5a1b1b5ac80a511 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 22:42:10 +0100 Subject: Add support for using ReadabiliPy --- paper2remarkable/providers/html.py | 35 ++++++++++++++++++++++++++++++----- tests/test_providers.py | 7 +++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index d71f210..abe30ba 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -67,7 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor): img.attrib["src"] = urllib.parse.urljoin( self._base_url, img.attrib["src"] ) - img.attrib["src"] = img.attrib['src'].rstrip('/') + img.attrib["src"] = img.attrib["src"].rstrip("/") class HTMLInformer(Informer): @@ -98,6 +98,32 @@ class HTML(Provider): def get_abs_pdf_urls(self, url): return url, url + def make_readable(self, request_html): + have_readabilipy = False + try: + from readabilipy import simple_json_from_html_string + + have_readabilipy = True + except ImportError: + pass + + logger.info( + "Converting HTML using %s" + % ("ReadabiliPy" if have_readabilipy else "readability") + ) + + if have_readabilipy: + article = simple_json_from_html_string( + request_html, use_readability=True + ) + title = article["title"] + raw_html = article["content"] + else: + doc = readability.Document(request_html) + title = doc.title() + raw_html = doc.summary(html_partial=True) + return title, raw_html + def retrieve_pdf(self, pdf_url, filename): """Turn the HTML article in a clean pdf file""" # Steps @@ -107,10 +133,9 @@ class HTML(Provider): # 4. Convert the markdown back to HTML (this is done to sanitize HTML) # 4. Convert the HTML to PDF, pulling in images where needed # 5. Save the PDF to the specified filename. 
- request_text = get_page_with_retry(pdf_url, return_text=True) - doc = readability.Document(request_text) - title = doc.title() - raw_html = doc.summary(html_partial=True) + + request_html = get_page_with_retry(pdf_url, return_text=True) + title, raw_html = self.make_readable(request_html) h2t = html2text.HTML2Text() h2t.wrap_links = False diff --git a/tests/test_providers.py b/tests/test_providers.py index fb75fbd..ca6c1ae 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + def test_html_5(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 603353cd2cf16f99cc5eb823918105146fea6bcb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 22:59:40 +0100 Subject: Make readabilipy an optional dependency --- setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 82a693a..d1de5bd 100644 --- a/setup.py +++ b/setup.py @@ -29,18 +29,23 @@ REQUIRED = [ "readability-lxml>=0.7.1", "html2text>=2020.1.16", "weasyprint>=51", - "markdown>=3.1.1" + "markdown>=3.1.1", ] +full_require = [ + # TEMPORARY: Until ReadabiliPy is available on PyPI + "readabilipy @ git+https://git@github.com/GjjvdBurg/ReadabiliPy@packaging#egg=readabilipy", +] docs_require = [] test_require = [] dev_require = ["green"] # What packages are optional? 
EXTRAS = { + "full": full_require, "docs": docs_require, "tests": test_require, - "dev": docs_require + test_require + dev_require, + "dev": docs_require + test_require + dev_require + full_require, } # The rest you shouldn't have to touch too much :) -- cgit v1.2.3 From 65c2ad9c4be36fc10ba06579baf1fdc549dae99d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 23:15:00 +0100 Subject: Upgrade nvm on travis --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8399160..7d220e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,8 @@ python: before_install: - sudo apt-get update - sudo apt-get install ghostscript pdftk poppler-utils qpdf + - nvm install v12.18.1 + - nvm use v12.18.1 install: - pip install -e .[dev] -- cgit v1.2.3 From 6338388cea254ba4c6090eb17a8942a13b7a2b1c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:25:41 +0200 Subject: Clean up readability providers This reorganizes the code a bit to ensure we only pull the HTML page once, and use the same readability provider for both the informer and the converter. --- paper2remarkable/providers/html.py | 78 ++++++++++++++++++++++---------------- tests/test_providers.py | 5 ++- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index abe30ba..b734bd1 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -56,6 +56,37 @@ def url_fetcher(url): return weasyprint.default_url_fetcher(url) +def make_readable(request_html): + """Use an extraction method to get the main article html + + This function checks if ReadabiliPy is installed with NodeJS support, as + that generally yields better results. If that is not available, it falls + back on readability. 
+ """ + + have_readabilipy_js = False + try: + import readabilipy + + have_readabilipy_js = readabilipy.simple_json.have_node() + except ImportError: + pass + + if have_readabilipy_js: + logger.info("Converting HTML using Readability.js") + article = readabilipy.simple_json_from_html_string( + request_html, use_readability=True + ) + title = article["title"] + raw_html = article["content"] + else: + logger.info("Converting HTML using readability") + doc = readability.Document(request_html) + title = doc.title() + raw_html = doc.summary(html_partial=True) + return title, raw_html + + class ImgProcessor(markdown.treeprocessors.Treeprocessor): def __init__(self, base_url, *args, **kwargs): self._base_url = base_url @@ -73,11 +104,15 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor): class HTMLInformer(Informer): def __init__(self): super().__init__() + self._cached_title = None + self._cached_article = None def get_filename(self, abs_url): - request_text = get_page_with_retry(abs_url, return_text=True) - doc = readability.Document(request_text) - title = doc.title() + request_html = get_page_with_retry(abs_url, return_text=True) + title, article = make_readable(request_html) + + self._cached_title = title + self._cached_article = article # Clean the title and make it titlecase title = clean_string(title) @@ -98,32 +133,6 @@ class HTML(Provider): def get_abs_pdf_urls(self, url): return url, url - def make_readable(self, request_html): - have_readabilipy = False - try: - from readabilipy import simple_json_from_html_string - - have_readabilipy = True - except ImportError: - pass - - logger.info( - "Converting HTML using %s" - % ("ReadabiliPy" if have_readabilipy else "readability") - ) - - if have_readabilipy: - article = simple_json_from_html_string( - request_html, use_readability=True - ) - title = article["title"] - raw_html = article["content"] - else: - doc = readability.Document(request_html) - title = doc.title() - raw_html = 
doc.summary(html_partial=True) - return title, raw_html - def retrieve_pdf(self, pdf_url, filename): """Turn the HTML article in a clean pdf file""" # Steps @@ -133,13 +142,16 @@ class HTML(Provider): # 4. Convert the markdown back to HTML (this is done to sanitize HTML) # 4. Convert the HTML to PDF, pulling in images where needed # 5. Save the PDF to the specified filename. - - request_html = get_page_with_retry(pdf_url, return_text=True) - title, raw_html = self.make_readable(request_html) + if self.informer._cached_title and self.informer._cached_article: + title = self.informer._cached_title + article = self.informer._cached_article + else: + request_html = get_page_with_retry(pdf_url, return_text=True) + title, article = make_readable(request_html) h2t = html2text.HTML2Text() h2t.wrap_links = False - text = h2t.handle(raw_html) + text = h2t.handle(article) # Add the title back to the document article = "# {title}\n\n{text}".format(title=title, text=text) diff --git a/tests/test_providers.py b/tests/test_providers.py index ca6c1ae..479fb84 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -255,7 +255,10 @@ class TestProviders(unittest.TestCase): def test_html_3(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://conclave-team.github.io/conclave-site/" - exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we + # assume that testing is done with Readability.JS + exp = "Conclave.pdf" filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) # this is a proxy test to check that all images are included -- cgit v1.2.3 From 0bf303a5607f42658252ef27e9f3fee3e6b84d19 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:26:14 +0200 Subject: Clean up "full" installation mode --- .travis.yml | 2 +- 
setup.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7d220e0..a1cb636 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: - nvm use v12.18.1 install: - - pip install -e .[dev] + - pip install -e .[full] script: - green -vv -a ./tests diff --git a/setup.py b/setup.py index d1de5bd..0635253 100644 --- a/setup.py +++ b/setup.py @@ -32,10 +32,7 @@ REQUIRED = [ "markdown>=3.1.1", ] -full_require = [ - # TEMPORARY: Until ReadabiliPy is available on PyPI - "readabilipy @ git+https://git@github.com/GjjvdBurg/ReadabiliPy@packaging#egg=readabilipy", -] +full_require = ["readabilipy"] docs_require = [] test_require = [] dev_require = ["green"] -- cgit v1.2.3 From 8e0804ad491f2179135a138f9656088213ae8431 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:29:44 +0200 Subject: Ensure we test the test version on travis --- .travis.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index a1cb636..6a57cd3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: - nvm use v12.18.1 install: - - pip install -e .[full] + - pip install -e .[test] script: - green -vv -a ./tests diff --git a/setup.py b/setup.py index 0635253..25b6895 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ dev_require = ["green"] EXTRAS = { "full": full_require, "docs": docs_require, - "tests": test_require, + "test": test_require + full_require, "dev": docs_require + test_require + dev_require + full_require, } -- cgit v1.2.3 From 7e1c84db7d11541062709eb5208c2f804fac4da8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:34:21 +0200 Subject: Move green to test dependencies --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 25b6895..54a8cb1 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ REQUIRED = [ full_require = 
["readabilipy"] docs_require = [] -test_require = [] -dev_require = ["green"] +test_require = ["green"] +dev_require = [] # What packages are optional? EXTRAS = { -- cgit v1.2.3 From 3b5e7eb5f34f92496aa96ee088db2925eadafd65 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 23:36:47 +0200 Subject: Improve docs --- paper2remarkable/providers/html.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index b734bd1..e050ea3 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -134,14 +134,17 @@ class HTML(Provider): return url, url def retrieve_pdf(self, pdf_url, filename): - """Turn the HTML article in a clean pdf file""" - # Steps - # 1. Pull the HTML page using requests - # 2. Extract the article part of the page using readability - # 3. Convert the article HTML to markdown using html2text - # 4. Convert the markdown back to HTML (this is done to sanitize HTML) - # 4. Convert the HTML to PDF, pulling in images where needed - # 5. Save the PDF to the specified filename. + """Turn the HTML article into a clean PDF file + + This function takes the following steps: + + 1. Pull the HTML page using requests, if not done in Informer + 2. Extract the article part of the page using readability/ReadabiliPy + 3. Convert the article HTML to markdown using html2text + 4. Convert the markdown back to HTML (done to sanitize the HTML) + 5. Convert the HTML to PDF, pulling in images where needed + 6. Save the PDF to the specified filename. + """ if self.informer._cached_title and self.informer._cached_article: title = self.informer._cached_title article = self.informer._cached_article -- cgit v1.2.3