-rw-r--r--  .travis.yml                          4
-rw-r--r--  paper2remarkable/providers/html.py  55
-rw-r--r--  setup.py                            12
-rw-r--r--  tests/test_providers.py             13
4 files changed, 68 insertions, 16 deletions
diff --git a/.travis.yml b/.travis.yml
index 8399160..6a57cd3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,9 +7,11 @@ python:
before_install:
- sudo apt-get update
- sudo apt-get install ghostscript pdftk poppler-utils qpdf
+ - nvm install v12.18.1
+ - nvm use v12.18.1
install:
- - pip install -e .[dev]
+ - pip install -e .[test]
script:
- green -vv -a ./tests
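ReadabiliPy only uses Readability.js when a NodeJS runtime is available, which is what the nvm lines above provide on CI; without Node the test run exercises the readability-lxml fallback instead. A minimal sketch (not part of this change) for checking which path will be taken, assuming the [test] extra has been installed:

    import readabilipy

    # True  -> ReadabiliPy can shell out to NodeJS and use Readability.js
    # False -> the pure-Python readability-lxml fallback is used instead
    print(readabilipy.simple_json.have_node())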
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index d71f210..b734bd1 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -56,6 +56,37 @@ def url_fetcher(url):
    return weasyprint.default_url_fetcher(url)
+def make_readable(request_html):
+    """Use an extraction method to get the main article html
+
+    This function checks if ReadabiliPy is installed with NodeJS support, as
+    that generally yields better results. If that is not available, it falls
+    back on readability.
+    """
+
+    have_readabilipy_js = False
+    try:
+        import readabilipy
+
+        have_readabilipy_js = readabilipy.simple_json.have_node()
+    except ImportError:
+        pass
+
+    if have_readabilipy_js:
+        logger.info("Converting HTML using Readability.js")
+        article = readabilipy.simple_json_from_html_string(
+            request_html, use_readability=True
+        )
+        title = article["title"]
+        raw_html = article["content"]
+    else:
+        logger.info("Converting HTML using readability")
+        doc = readability.Document(request_html)
+        title = doc.title()
+        raw_html = doc.summary(html_partial=True)
+    return title, raw_html
+
+
class ImgProcessor(markdown.treeprocessors.Treeprocessor):
    def __init__(self, base_url, *args, **kwargs):
        self._base_url = base_url
@@ -67,17 +98,21 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor):
                img.attrib["src"] = urllib.parse.urljoin(
                    self._base_url, img.attrib["src"]
                )
-                img.attrib["src"] = img.attrib['src'].rstrip('/')
+                img.attrib["src"] = img.attrib["src"].rstrip("/")
class HTMLInformer(Informer):
    def __init__(self):
        super().__init__()
+        self._cached_title = None
+        self._cached_article = None
    def get_filename(self, abs_url):
-        request_text = get_page_with_retry(abs_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
+        request_html = get_page_with_retry(abs_url, return_text=True)
+        title, article = make_readable(request_html)
+
+        self._cached_title = title
+        self._cached_article = article
        # Clean the title and make it titlecase
        title = clean_string(title)
@@ -107,14 +142,16 @@ class HTML(Provider):
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
-        request_text = get_page_with_retry(pdf_url, return_text=True)
-        doc = readability.Document(request_text)
-        title = doc.title()
-        raw_html = doc.summary(html_partial=True)
+        if self.informer._cached_title and self.informer._cached_article:
+            title = self.informer._cached_title
+            article = self.informer._cached_article
+        else:
+            request_html = get_page_with_retry(pdf_url, return_text=True)
+            title, article = make_readable(request_html)
        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
-        text = h2t.handle(raw_html)
+        text = h2t.handle(article)
        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)
diff --git a/setup.py b/setup.py
index 82a693a..54a8cb1 100644
--- a/setup.py
+++ b/setup.py
@@ -29,18 +29,20 @@ REQUIRED = [
    "readability-lxml>=0.7.1",
    "html2text>=2020.1.16",
    "weasyprint>=51",
-    "markdown>=3.1.1"
+    "markdown>=3.1.1",
]
+full_require = ["readabilipy"]
docs_require = []
-test_require = []
-dev_require = ["green"]
+test_require = ["green"]
+dev_require = []
# What packages are optional?
EXTRAS = {
+    "full": full_require,
    "docs": docs_require,
-    "tests": test_require,
-    "dev": docs_require + test_require + dev_require,
+    "test": test_require + full_require,
+    "dev": docs_require + test_require + dev_require + full_require,
}
# The rest you shouldn't have to touch too much :)
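With the reshuffled extras, ReadabiliPy lives in the new "full" extra and is also pulled into "test" and "dev", so a plain pip install paper2remarkable stays lean, while pip install "paper2remarkable[full]" (or, on CI, pip install -e .[test] as in the .travis.yml change above) opts into the Readability.js-backed extraction.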
diff --git a/tests/test_providers.py b/tests/test_providers.py
index eeaef82..70d012a 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -278,7 +278,10 @@ class TestProviders(unittest.TestCase):
    def test_html_3(self):
        prov = HTML(upload=False, verbose=VERBOSE)
        url = "https://conclave-team.github.io/conclave-site/"
-        exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+        # NOTE: the title differs between Readability.js and readability-lxml;
+        # we assume that testing is done with Readability.js
+        exp = "Conclave.pdf"
        filename = prov.run(url)
        self.assertEqual(exp, os.path.basename(filename))
        # this is a proxy test to check that all images are included
@@ -291,6 +294,13 @@ class TestProviders(unittest.TestCase):
        # this is a proxy test to check that all images are included
        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+    def test_html_5(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+        filename = prov.run(url)
+        # this is a proxy test to check that all images are included
+        self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
    def test_semantic_scholar_1(self):
        prov = SemanticScholar(upload=False, verbose=VERBOSE)
        url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
@@ -320,5 +330,6 @@ class TestProviders(unittest.TestCase):
        self.assertEqual(exp, os.path.basename(filename))
+
if __name__ == "__main__":
    unittest.main()