-rw-r--r--  .travis.yml                          4
-rw-r--r--  paper2remarkable/providers/html.py  55
-rw-r--r--  setup.py                            12
-rw-r--r--  tests/test_providers.py             13
4 files changed, 68 insertions, 16 deletions
diff --git a/.travis.yml b/.travis.yml
index 8399160..6a57cd3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,9 +7,11 @@ python:
 before_install:
   - sudo apt-get update
   - sudo apt-get install ghostscript pdftk poppler-utils qpdf
+  - nvm install v12.18.1
+  - nvm use v12.18.1
 
 install:
-  - pip install -e .[dev]
+  - pip install -e .[test]
 
 script:
   - green -vv -a ./tests
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..b734bd1 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,37 @@ def url_fetcher(url):
     return weasyprint.default_url_fetcher(url)
 
 
+def make_readable(request_html):
+    """Use an extraction method to get the main article html
+
+    This function checks if ReadabiliPy is installed with NodeJS support, as
+    that generally yields better results. If that is not available, it falls
+    back on readability.
+    """
+
+    have_readabilipy_js = False
+    try:
+        import readabilipy
+
+        have_readabilipy_js = readabilipy.simple_json.have_node()
+    except ImportError:
+        pass
+
+    if have_readabilipy_js:
+        logger.info("Converting HTML using Readability.js")
+        article = readabilipy.simple_json_from_html_string(
+            request_html, use_readability=True
+        )
+        title = article["title"]
+        raw_html = article["content"]
+    else:
+        logger.info("Converting HTML using readability")
+        doc = readability.Document(request_html)
+        title = doc.title()
+        raw_html = doc.summary(html_partial=True)
+    return title, raw_html
+
+
 class ImgProcessor(markdown.treeprocessors.Treeprocessor):
     def __init__(self, base_url, *args, **kwargs):
         self._base_url = base_url
@@ -67,17 +98,21 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
             img.attrib["src"] = urllib.parse.urljoin(
                 self._base_url, img.attrib["src"]
             )
-            img.attrib["src"] = img.attrib['src'].rstrip('/')
+            img.attrib["src"] = img.attrib["src"].rstrip("/")
 
 
 class HTMLInformer(Informer):
     def __init__(self):
         super().__init__()
+        self._cached_title = None
+        self._cached_article = None
 
     def get_filename(self, abs_url):
-        request_text = get_page_with_retry(abs_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
+        request_html = get_page_with_retry(abs_url, return_text=True)
+        title, article = make_readable(request_html)
+
+        self._cached_title = title
+        self._cached_article = article
 
         # Clean the title and make it titlecase
         title = clean_string(title)
@@ -107,14 +142,16 @@
         # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
         # 4. Convert the HTML to PDF, pulling in images where needed
         # 5. Save the PDF to the specified filename.
-        request_text = get_page_with_retry(pdf_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
-        raw_html = doc.summary(html_partial=True)
+        if self.informer._cached_title and self.informer._cached_article:
+            title = self.informer._cached_title
+            article = self.informer._cached_article
+        else:
+            request_html = get_page_with_retry(pdf_url, return_text=True)
+            title, article = make_readable(request_html)
 
         h2t = html2text.HTML2Text()
         h2t.wrap_links = False
-        text = h2t.handle(raw_html)
+        text = h2t.handle(article)
 
         # Add the title back to the document
         article = "# {title}\n\n{text}".format(title=title, text=text)
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -29,18 +29,20 @@ REQUIRED = [
     "readability-lxml>=0.7.1",
     "html2text>=2020.1.16",
     "weasyprint>=51",
-    "markdown>=3.1.1"
+    "markdown>=3.1.1",
 ]
 
+full_require = ["readabilipy"]
 docs_require = []
-test_require = []
-dev_require = ["green"]
+test_require = ["green"]
+dev_require = []
 
 # What packages are optional?
 EXTRAS = {
+    "full": full_require,
     "docs": docs_require,
-    "tests": test_require,
-    "dev": docs_require + test_require + dev_require,
+    "test": test_require + full_require,
+    "dev": docs_require + test_require + dev_require + full_require,
 }
 
 # The rest you shouldn't have to touch too much :)
diff --git a/tests/test_providers.py b/tests/test_providers.py
index eeaef82..70d012a 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -278,7 +278,10 @@ class TestProviders(unittest.TestCase):
     def test_html_3(self):
         prov = HTML(upload=False, verbose=VERBOSE)
         url = "https://conclave-team.github.io/conclave-site/"
-        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        # NOTE: Title differs between Readability.JS and readability-lxml, we
+        # assume that testing is done with Readability.JS
+        exp = "Conclave.pdf"
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
         # this is a proxy test to check that all images are included
@@ -291,6 +294,13 @@ class TestProviders(unittest.TestCase):
         # this is a proxy test to check that all images are included
         self.assertEqual(4, len(pdfplumber.open(filename).pages))
 
+    def test_html_5(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
     def test_semantic_scholar_1(self):
         prov = SemanticScholar(upload=False, verbose=VERBOSE)
         url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
@@ -320,5 +330,6 @@ class TestProviders(unittest.TestCase):
         self.assertEqual(exp, os.path.basename(filename))
 
 
+
 if __name__ == "__main__":
     unittest.main()
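For reference, the extraction fallback that the new make_readable() helper introduces can be exercised on its own. The sketch below is a minimal adaptation of that logic, assuming readabilipy and readability-lxml are installed; the sample_html string and the extract_article name are illustrative and not part of this patch. It prefers ReadabiliPy with Node.js support (which runs Mozilla's Readability.js) and falls back on readability-lxml otherwise. In this commit the readabilipy dependency is pulled in via the new "full" extra (for example, pip install -e .[full] or .[test] from a checkout).

# Standalone sketch of the fallback implemented by make_readable() above.
# Assumes readabilipy (optionally with Node.js) and readability-lxml are installed.
# sample_html and extract_article are illustrative, not part of the patch.
sample_html = """
<html>
  <head><title>Example article</title></head>
  <body><article><h1>Example article</h1><p>Some body text.</p></article></body>
</html>
"""


def extract_article(html):
    """Return (title, article_html), preferring Readability.js when available."""
    try:
        import readabilipy

        if readabilipy.simple_json.have_node():
            # ReadabiliPy with Node.js available: use Mozilla's Readability.js
            article = readabilipy.simple_json_from_html_string(
                html, use_readability=True
            )
            return article["title"], article["content"]
    except ImportError:
        pass

    # Fall back on the pure-Python readability-lxml package
    import readability

    doc = readability.Document(html)
    return doc.title(), doc.summary(html_partial=True)


if __name__ == "__main__":
    title, content = extract_article(sample_html)
    print(title)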
