From 0456a377b3deef09a533b79224f4590e02372040 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 31 Jan 2020 13:17:58 +0000 Subject: [WIP] Initial commit of HTML provider --- paper2remarkable/providers/__init__.py | 2 + paper2remarkable/providers/html.py | 122 +++++++++++++++++++++++++++++++++ paper2remarkable/ui.py | 13 +++- paper2remarkable/utils.py | 4 +- setup.py | 4 ++ tests/test_providers.py | 8 +++ 6 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 paper2remarkable/providers/html.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index fabdcfe..f87a044 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -3,6 +3,7 @@ from .acm import ACM from .arxiv import Arxiv from .citeseerx import CiteSeerX +from .html import HTML from .local import LocalFile from .neurips import NeurIPS from .openreview import OpenReview @@ -23,4 +24,5 @@ providers = [ Springer, LocalFile, PdfUrl, + HTML, ] diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py new file mode 100644 index 0000000..6136fc9 --- /dev/null +++ b/paper2remarkable/providers/html.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- + +"""Provider for HTML documents + +This provider is a little bit special, in that it isn't simply pulling an +academic paper from a site, but instead aims to pull a HTML article. + +Author: G.J.J. van den Burg +License: See LICENSE file. +Copyright: 2020, G.J.J. 
van den Burg + +""" + +import html2text +import markdown +import readability +import titlecase +import unidecode +import urllib +import weasyprint +import weasyprint.fonts + +from ._base import Provider +from ._info import Informer + +from ..utils import clean_string, get_page_with_retry +from ..log import Logger + +logger = Logger() + +CSS = """ +@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap'); +@page { size: A4; margin: 1in; } +a { color: black; } +img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } +p { font-size: 11pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +h1,h2,h3 { font-family: 'Noto Serif'; } +h1 { font-size: 26px; } +h2 { font-size: 18px; } +h3 { font-size: 14px; } +""" + + +def my_fetcher(url): + if url.startswith("//"): + url = "https:" + url + elif url.startswith("file:///"): + url = "https:" + url[len("file:/") :] + return weasyprint.default_url_fetcher(url) + + +class HTMLInformer(Informer): + def __init__(self): + super().__init__() + + def get_filename(self, abs_url): + request_text = get_page_with_retry(abs_url, return_text=True) + doc = readability.Document(request_text) + title = doc.title() + + # Clean the title and make it titlecase + title = clean_string(title) + title = titlecase.titlecase(title) + title = title.replace(" ", "_") + title = clean_string(title) + name = title + ".pdf" + name = unidecode.unidecode(name) + logger.info("Created filename: %s" % name) + return name + + +class HTML(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = HTMLInformer() + + def get_abs_pdf_urls(self, url): + return url, url + + def retrieve_pdf(self, pdf_url, filename): + """Turn the HTML article in a clean pdf file""" + # Steps + # 1. Pull the HTML page using requests + # 2. Extract the article part of the page using readability + # 3. 
Convert the article HTML to markdown using html2text + # 4. Convert the markdown back to HTML (this is done to sanitize HTML) + # 4. Convert the HTML to PDF, pulling in images where needed + # 5. Save the PDF to the specified filename. + request_text = get_page_with_retry(pdf_url, return_text=True) + doc = readability.Document(request_text) + title = doc.title() + raw_html = doc.summary(html_partial=True) + + h2t = html2text.HTML2Text() + h2t.wrap_links = False + text = h2t.handle(raw_html) + + # Add the title back to the document + article = "# {title}\n\n{text}".format(title=title, text=text) + + # fix relative urls + base_url = "{0.scheme}://{0.netloc}".format( + urllib.parse.urlsplit(pdf_url) + ) + html_article = markdown.markdown(article) + html_article = html_article.replace(' src="//', ' src="https://') + html_article = html_article.replace( + ' src="/', ' src="{base}/'.format(base=base_url) + ) + + font_config = weasyprint.fonts.FontConfiguration() + html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher) + css = weasyprint.CSS(string=CSS, font_config=font_config) + + html.write_pdf(filename, stylesheets=[css], font_config=font_config) + + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 032bf99..05116ee 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,7 +13,7 @@ import sys from . 
import __version__, GITHUB_URL -from .providers import providers, LocalFile +from .providers import providers, LocalFile, HTML from .utils import follow_redirects, is_url @@ -21,6 +21,11 @@ def parse_args(): parser = argparse.ArgumentParser( description="Paper2reMarkable version %s" % __version__ ) + parser.add_argument( + "--html", + help="URL is to a HTML article instead of a PDF", + action="store_true", + ) parser.add_argument( "-b", "--blank", @@ -99,7 +104,11 @@ def main(): args = parse_args() cookiejar = None - if is_url(args.input): + if args.html and is_url(args.input): + # input is a url + url, cookiejar = follow_redirects(args.input) + provider = HTML + elif is_url(args.input): # input is a url url, cookiejar = follow_redirects(args.input) provider = next((p for p in providers if p.validate(url)), None) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 79421df..d4e5075 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -64,7 +64,7 @@ def download_url(url, filename, cookiejar=None): fid.write(content) -def get_page_with_retry(url, tries=5, cookiejar=None): +def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): count = 0 jar = {} if cookiejar is None else cookiejar while count < tries: @@ -82,6 +82,8 @@ def get_page_with_retry(url, tries=5, cookiejar=None): time.sleep(5) continue logger.info("Downloaded url: %s" % url) + if return_text: + return res.text return res.content diff --git a/setup.py b/setup.py index bddbd24..b8e3a86 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,10 @@ REQUIRED = [ "titlecase>=0.12", "PyPDF2>=1.26", "regex>=2018.11" + "readability-lxml>=0.7.1", + "html2text>=2020.1.16", + "weasyprint>=51", + "markdown>=3.1.1" ] docs_require = [] diff --git a/tests/test_providers.py b/tests/test_providers.py index e256eec..80f4662 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -16,6 +16,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + 
HTML, LocalFile, NeurIPS, OpenReview, @@ -206,6 +207,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_1(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 0ef64a9fd41a8edbfc35935d2b7f3f90c84200f1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 31 Jan 2020 14:29:17 +0000 Subject: Use the actual page size of the remarkable --- paper2remarkable/providers/html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 6136fc9..03121a5 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -30,10 +30,10 @@ logger = Logger() CSS = """ @import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap'); -@page { size: A4; margin: 1in; } +@page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } -p { font-size: 11pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +p { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } h1,h2,h3 { font-family: 'Noto Serif'; } h1 { font-size: 26px; } h2 { font-size: 18px; } -- cgit v1.2.3 From 8b8f517a1bedf3a9536d1d8bba3ba9ce301d6e13 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 12:33:49 +0000 Subject: Minor changes to css and adding debugging code --- paper2remarkable/providers/html.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py 
index 03121a5..5a35b07 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -33,11 +33,12 @@ CSS = """ @page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } -p { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } h1,h2,h3 { font-family: 'Noto Serif'; } h1 { font-size: 26px; } h2 { font-size: 18px; } h3 { font-size: 14px; } +blockquote { font-style: italic; } """ @@ -108,6 +109,10 @@ class HTML(Provider): ' src="/', ' src="{base}/'.format(base=base_url) ) + if self.debug: + with open("./paper.html", "w") as fp: + fp.write(html_article) + font_config = weasyprint.fonts.FontConfiguration() html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher) css = weasyprint.CSS(string=CSS, font_config=font_config) -- cgit v1.2.3 From 1a957d97c9f3ea865820030e55d3c029c801fce3 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 12:37:31 +0000 Subject: Fix typo in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8e3a86..82a693a 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ REQUIRED = [ "unidecode>=1.1", "titlecase>=0.12", "PyPDF2>=1.26", - "regex>=2018.11" + "regex>=2018.11", "readability-lxml>=0.7.1", "html2text>=2020.1.16", "weasyprint>=51", -- cgit v1.2.3 From 865fa3526ab637bc777e620649c7e7987cd54428 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 21:57:42 +0000 Subject: Minor improvements to html provider --- paper2remarkable/providers/html.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 5a35b07..20185fd 100644 --- a/paper2remarkable/providers/html.py +++ 
b/paper2remarkable/providers/html.py @@ -29,7 +29,7 @@ from ..log import Logger logger = Logger() CSS = """ -@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap'); +@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap'); @page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } @@ -39,6 +39,8 @@ h1 { font-size: 26px; } h2 { font-size: 18px; } h3 { font-size: 14px; } blockquote { font-style: italic; } +pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; } +code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; } """ @@ -64,7 +66,7 @@ class HTMLInformer(Informer): title = titlecase.titlecase(title) title = title.replace(" ", "_") title = clean_string(title) - name = title + ".pdf" + name = title.strip("_") + ".pdf" name = unidecode.unidecode(name) logger.info("Created filename: %s" % name) return name -- cgit v1.2.3 From 7726955a69093f969f5c282593c2565a3210fa5b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:08:01 +0000 Subject: Add short cli flag for html --- paper2remarkable/ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 38eb4fb..11f1c02 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -22,7 +22,7 @@ def parse_args(): description="Paper2reMarkable version %s" % __version__ ) parser.add_argument( - "--html", + '-t', "--html", help="URL is to a HTML article instead of a PDF", action="store_true", ) -- cgit v1.2.3 From 21e334a1c1e8d70974f6e0cca2fe6a05c25abc48 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:09:55 +0000 Subject: Update readme with HTML source support --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 
dc05a23..1429483 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,12 @@ transferring an academic paper to your [reMarkable](https://remarkable.com/): $ p2r https://arxiv.org/abs/1811.11242 ``` +There is also support for transferring an article from a website: + +``` +p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines +``` + The script can be run through the ``p2r`` command line program or via Docker (see below). @@ -26,11 +32,12 @@ reMarkable from any of the following sources: * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file * A local PDF file +* Any article on a website (using ``--html``) The program aims to be flexible to the exact source URL, so for many of the -sources you can either provide a URL to the abstract page or to the PDF file. -If you have an source that you would like to see added to the list, let me -know! +academic sources you can either provide a URL to the abstract page or to the +PDF file. If you have a source that you would like to see added to the list, +let me know! ``paper2remarkable`` takes the source URL and: -- cgit v1.2.3