From 0456a377b3deef09a533b79224f4590e02372040 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 31 Jan 2020 13:17:58 +0000
Subject: [WIP] Initial commit of HTML provider

---
 paper2remarkable/providers/__init__.py |   2 +
 paper2remarkable/providers/html.py     | 122 +++++++++++++++++++++++++++++++++
 paper2remarkable/ui.py                 |  13 +++-
 paper2remarkable/utils.py              |   4 +-
 setup.py                               |   4 ++
 tests/test_providers.py                |   8 +++
 6 files changed, 150 insertions(+), 3 deletions(-)
 create mode 100644 paper2remarkable/providers/html.py

diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index fabdcfe..f87a044 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -3,6 +3,7 @@
 from .acm import ACM
 from .arxiv import Arxiv
 from .citeseerx import CiteSeerX
+from .html import HTML
 from .local import LocalFile
 from .neurips import NeurIPS
 from .openreview import OpenReview
@@ -23,4 +24,5 @@ providers = [
     Springer,
     LocalFile,
     PdfUrl,
+    HTML,
 ]
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
new file mode 100644
index 0000000..6136fc9
--- /dev/null
+++ b/paper2remarkable/providers/html.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for HTML documents
+
+This provider is a little bit special, in that it isn't simply pulling an 
+academic paper from a site, but instead aims to pull an HTML article.
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import html2text
+import markdown
+import readability
+import titlecase
+import unidecode
+import urllib
+import weasyprint
+import weasyprint.fonts
+
+from ._base import Provider
+from ._info import Informer
+
+from ..utils import clean_string, get_page_with_retry
+from ..log import Logger
+
+logger = Logger()
+
+CSS = """
+@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap');
+@page { size: A4; margin: 1in; }
+a { color: black; }
+img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
+p { font-size: 11pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
+h1,h2,h3 { font-family: 'Noto Serif'; }
+h1 { font-size: 26px; }
+h2 { font-size: 18px; }
+h3 { font-size: 14px; }
+"""
+
+
+def my_fetcher(url):
+    if url.startswith("//"):
+        url = "https:" + url
+    elif url.startswith("file:///"):
+        url = "https:" + url[len("file:/") :]
+    return weasyprint.default_url_fetcher(url)
+
+
+class HTMLInformer(Informer):
+    def __init__(self):
+        super().__init__()
+
+    def get_filename(self, abs_url):
+        request_text = get_page_with_retry(abs_url, return_text=True)
+        doc = readability.Document(request_text)
+        title = doc.title()
+
+        # Clean the title and make it titlecase
+        title = clean_string(title)
+        title = titlecase.titlecase(title)
+        title = title.replace(" ", "_")
+        title = clean_string(title)
+        name = title + ".pdf"
+        name = unidecode.unidecode(name)
+        logger.info("Created filename: %s" % name)
+        return name
+
+
+class HTML(Provider):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.informer = HTMLInformer()
+
+    def get_abs_pdf_urls(self, url):
+        return url, url
+
+    def retrieve_pdf(self, pdf_url, filename):
+        """Turn the HTML article into a clean PDF file"""
+        # Steps
+        # 1. Pull the HTML page using requests
+        # 2. Extract the article part of the page using readability
+        # 3. Convert the article HTML to markdown using html2text
+        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
+        # 5. Convert the HTML to PDF, pulling in images where needed
+        # 6. Save the PDF to the specified filename.
+        request_text = get_page_with_retry(pdf_url, return_text=True)
+        doc = readability.Document(request_text)
+        title = doc.title()
+        raw_html = doc.summary(html_partial=True)
+
+        h2t = html2text.HTML2Text()
+        h2t.wrap_links = False
+        text = h2t.handle(raw_html)
+
+        # Add the title back to the document
+        article = "# {title}\n\n{text}".format(title=title, text=text)
+
+        # fix relative urls
+        base_url = "{0.scheme}://{0.netloc}".format(
+            urllib.parse.urlsplit(pdf_url)
+        )
+        html_article = markdown.markdown(article)
+        html_article = html_article.replace(' src="//', ' src="https://')
+        html_article = html_article.replace(
+            ' src="/', ' src="{base}/'.format(base=base_url)
+        )
+
+        font_config = weasyprint.fonts.FontConfiguration()
+        html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher)
+        css = weasyprint.CSS(string=CSS, font_config=font_config)
+
+        html.write_pdf(filename, stylesheets=[css], font_config=font_config)
+
+    def validate(src):
+        try:
+            result = urllib.parse.urlparse(src)
+            return all([result.scheme, result.netloc, result.path])
+        except:
+            return False
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 032bf99..05116ee 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -13,7 +13,7 @@ import sys
 
 from . import __version__, GITHUB_URL
 
-from .providers import providers, LocalFile
+from .providers import providers, LocalFile, HTML
 from .utils import follow_redirects, is_url
 
 
@@ -21,6 +21,11 @@ def parse_args():
     parser = argparse.ArgumentParser(
         description="Paper2reMarkable version %s" % __version__
     )
+    parser.add_argument(
+        "--html",
+        help="URL is to a HTML article instead of a PDF",
+        action="store_true",
+    )
     parser.add_argument(
         "-b",
         "--blank",
@@ -99,7 +104,11 @@ def main():
     args = parse_args()
     cookiejar = None
 
-    if is_url(args.input):
+    if args.html and is_url(args.input):
+        # input is a url
+        url, cookiejar = follow_redirects(args.input)
+        provider = HTML
+    elif is_url(args.input):
         # input is a url
         url, cookiejar = follow_redirects(args.input)
         provider = next((p for p in providers if p.validate(url)), None)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 79421df..d4e5075 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -64,7 +64,7 @@ def download_url(url, filename, cookiejar=None):
         fid.write(content)
 
 
-def get_page_with_retry(url, tries=5, cookiejar=None):
+def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False):
     count = 0
     jar = {} if cookiejar is None else cookiejar
     while count < tries:
@@ -82,6 +82,8 @@ def get_page_with_retry(url, tries=5, cookiejar=None):
             time.sleep(5)
             continue
         logger.info("Downloaded url: %s" % url)
+        if return_text:
+            return res.text
         return res.content
 
 
diff --git a/setup.py b/setup.py
index bddbd24..b8e3a86 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,10 @@ REQUIRED = [
     "titlecase>=0.12",
     "PyPDF2>=1.26",
     "regex>=2018.11"
+    "readability-lxml>=0.7.1",
+    "html2text>=2020.1.16",
+    "weasyprint>=51",
+    "markdown>=3.1.1"
 ]
 
 docs_require = []
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e256eec..80f4662 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -16,6 +16,7 @@ from paper2remarkable.providers import (
     ACM,
     Arxiv,
     CiteSeerX,
+    HTML,
     LocalFile,
     NeurIPS,
     OpenReview,
@@ -206,6 +207,13 @@ class TestProviders(unittest.TestCase):
         filename = prov.run(url)
         self.assertEqual(exp, os.path.basename(filename))
 
+    def test_html_1(self):
+        prov = HTML(upload=False, verbose=VERBOSE)
+        url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines"
+        exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf"
+        filename = prov.run(url)
+        self.assertEqual(exp, os.path.basename(filename))
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
cgit v1.2.3


From 0ef64a9fd41a8edbfc35935d2b7f3f90c84200f1 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Fri, 31 Jan 2020 14:29:17 +0000
Subject: Use the actual page size of the remarkable

---
 paper2remarkable/providers/html.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 6136fc9..03121a5 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -30,10 +30,10 @@ logger = Logger()
 
 CSS = """
 @import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap');
-@page { size: A4; margin: 1in; }
+@page { size: 702px 936px; margin: 1in; }
 a { color: black; }
 img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
-p { font-size: 11pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
+p { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
 h1,h2,h3 { font-family: 'Noto Serif'; }
 h1 { font-size: 26px; }
 h2 { font-size: 18px; }
-- 
cgit v1.2.3


From 8b8f517a1bedf3a9536d1d8bba3ba9ce301d6e13 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 3 Feb 2020 12:33:49 +0000
Subject: Minor changes to css and adding debugging code

---
 paper2remarkable/providers/html.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 03121a5..5a35b07 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -33,11 +33,12 @@ CSS = """
 @page { size: 702px 936px; margin: 1in; }
 a { color: black; }
 img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
-p { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
+p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
 h1,h2,h3 { font-family: 'Noto Serif'; }
 h1 { font-size: 26px; }
 h2 { font-size: 18px; }
 h3 { font-size: 14px; }
+blockquote { font-style: italic; }
 """
 
 
@@ -108,6 +109,10 @@ class HTML(Provider):
             ' src="/', ' src="{base}/'.format(base=base_url)
         )
 
+        if self.debug:
+            with open("./paper.html", "w") as fp:
+                fp.write(html_article)
+
         font_config = weasyprint.fonts.FontConfiguration()
         html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher)
         css = weasyprint.CSS(string=CSS, font_config=font_config)
-- 
cgit v1.2.3


From 1a957d97c9f3ea865820030e55d3c029c801fce3 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 3 Feb 2020 12:37:31 +0000
Subject: Fix typo in setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b8e3a86..82a693a 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ REQUIRED = [
     "unidecode>=1.1",
     "titlecase>=0.12",
     "PyPDF2>=1.26",
-    "regex>=2018.11"
+    "regex>=2018.11",
     "readability-lxml>=0.7.1",
     "html2text>=2020.1.16",
     "weasyprint>=51",
-- 
cgit v1.2.3


From 865fa3526ab637bc777e620649c7e7987cd54428 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 3 Feb 2020 21:57:42 +0000
Subject: Minor improvements to html provider

---
 paper2remarkable/providers/html.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index 5a35b07..20185fd 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -29,7 +29,7 @@ from ..log import Logger
 logger = Logger()
 
 CSS = """
-@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap');
+@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap');
 @page { size: 702px 936px; margin: 1in; }
 a { color: black; }
 img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
@@ -39,6 +39,8 @@ h1 { font-size: 26px; }
 h2 { font-size: 18px; }
 h3 { font-size: 14px; }
 blockquote { font-style: italic; }
+pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; }
+code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; }
 """
 
 
@@ -64,7 +66,7 @@ class HTMLInformer(Informer):
         title = titlecase.titlecase(title)
         title = title.replace(" ", "_")
         title = clean_string(title)
-        name = title + ".pdf"
+        name = title.strip("_") + ".pdf"
         name = unidecode.unidecode(name)
         logger.info("Created filename: %s" % name)
         return name
-- 
cgit v1.2.3


From 7726955a69093f969f5c282593c2565a3210fa5b Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 3 Feb 2020 22:08:01 +0000
Subject: Add short cli flag for html

---
 paper2remarkable/ui.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 38eb4fb..11f1c02 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -22,7 +22,7 @@ def parse_args():
         description="Paper2reMarkable version %s" % __version__
     )
     parser.add_argument(
-        "--html",
+        '-t', "--html",
         help="URL is to a HTML article instead of a PDF",
         action="store_true",
     )
-- 
cgit v1.2.3


From 21e334a1c1e8d70974f6e0cca2fe6a05c25abc48 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 3 Feb 2020 22:09:55 +0000
Subject: Update readme with HTML source support

---
 README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index dc05a23..1429483 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,12 @@ transferring an academic paper to your [reMarkable](https://remarkable.com/):
 $ p2r https://arxiv.org/abs/1811.11242
 ```
 
+There is also support for transferring an article from a website:
+
+```
+$ p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines
+```
+
 The script can be run through the ``p2r`` command line program or via Docker 
 (see below).
 
@@ -26,11 +32,12 @@ reMarkable from any of the following sources:
 * [SpringerLink](https://link.springer.com/)
 * A generic URL to a PDF file
 * A local PDF file
+* Any article on a website (using ``--html``)
 
 The program aims to be flexible to the exact source URL, so for many of the 
-sources you can either provide a URL to the abstract page or to the PDF file. 
-If you have an source that you would like to see added to the list, let me 
-know!
+academic sources you can either provide a URL to the abstract page or to the 
+PDF file.  If you have a source that you would like to see added to the list, 
+let me know!
 
 ``paper2remarkable`` takes the source URL and:
 
-- 
cgit v1.2.3