aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2020-02-03 22:13:57 +0000
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2020-02-03 22:13:57 +0000
commit88cfce1531dba39e009f3890bc147209c175fe5c (patch)
tree529caf42237b82497de67eba37c9009889483b44
parentAdd --version command to cli (diff)
parentMerge branch 'master' into feature/html-document (diff)
downloadpaper2remarkable-88cfce1531dba39e009f3890bc147209c175fe5c.tar.gz
paper2remarkable-88cfce1531dba39e009f3890bc147209c175fe5c.zip
Merge branch 'feature/html-document'
-rw-r--r--README.md13
-rw-r--r--paper2remarkable/providers/__init__.py2
-rw-r--r--paper2remarkable/providers/html.py129
-rw-r--r--paper2remarkable/ui.py13
-rw-r--r--paper2remarkable/utils.py4
-rw-r--r--setup.py6
-rw-r--r--tests/test_providers.py8
7 files changed, 168 insertions, 7 deletions
diff --git a/README.md b/README.md
index dc05a23..1429483 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,12 @@ transferring an academic paper to your [reMarkable](https://remarkable.com/):
$ p2r https://arxiv.org/abs/1811.11242
```
+There is also support for transferring an article from a website:
+
+```
+p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines
+```
+
The script can be run through the ``p2r`` command line program or via Docker
(see below).
@@ -26,11 +32,12 @@ reMarkable from any of the following sources:
* [SpringerLink](https://link.springer.com/)
* A generic URL to a PDF file
* A local PDF file
+* Any article on a website (using ``--html``)
The program aims to be flexible to the exact source URL, so for many of the
-sources you can either provide a URL to the abstract page or to the PDF file.
-If you have an source that you would like to see added to the list, let me
-know!
+academic sources you can either provide a URL to the abstract page or to the
+PDF file. If you have a source that you would like to see added to the list,
+let me know!
``paper2remarkable`` takes the source URL and:
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index fabdcfe..f87a044 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -3,6 +3,7 @@
from .acm import ACM
from .arxiv import Arxiv
from .citeseerx import CiteSeerX
+from .html import HTML
from .local import LocalFile
from .neurips import NeurIPS
from .openreview import OpenReview
@@ -23,4 +24,5 @@ providers = [
Springer,
LocalFile,
PdfUrl,
+ HTML,
]
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
new file mode 100644
index 0000000..20185fd
--- /dev/null
+++ b/paper2remarkable/providers/html.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for HTML documents
+
+This provider is a little bit special, in that it isn't simply pulling an
+academic paper from a site, but instead aims to pull an HTML article.
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import html2text
+import markdown
+import readability
+import titlecase
+import unidecode
+import urllib
+import weasyprint
+import weasyprint.fonts
+
+from ._base import Provider
+from ._info import Informer
+
+from ..utils import clean_string, get_page_with_retry
+from ..log import Logger
+
+logger = Logger()
+
+CSS = """
+@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap');
+@page { size: 702px 936px; margin: 1in; }
+a { color: black; }
+img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
+p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
+h1,h2,h3 { font-family: 'Noto Serif'; }
+h1 { font-size: 26px; }
+h2 { font-size: 18px; }
+h3 { font-size: 14px; }
+blockquote { font-style: italic; }
+pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; }
+code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; }
+"""
+
+
+def my_fetcher(url):
+ if url.startswith("//"):
+ url = "https:" + url
+ elif url.startswith("file:///"):
+ url = "https:" + url[len("file:/") :]
+ return weasyprint.default_url_fetcher(url)
+
+
+class HTMLInformer(Informer):
+ def __init__(self):
+ super().__init__()
+
+ def get_filename(self, abs_url):
+ request_text = get_page_with_retry(abs_url, return_text=True)
+ doc = readability.Document(request_text)
+ title = doc.title()
+
+ # Clean the title and make it titlecase
+ title = clean_string(title)
+ title = titlecase.titlecase(title)
+ title = title.replace(" ", "_")
+ title = clean_string(title)
+ name = title.strip("_") + ".pdf"
+ name = unidecode.unidecode(name)
+ logger.info("Created filename: %s" % name)
+ return name
+
+
+class HTML(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = HTMLInformer()
+
+ def get_abs_pdf_urls(self, url):
+ return url, url
+
+ def retrieve_pdf(self, pdf_url, filename):
+ """Turn the HTML article in a clean pdf file"""
+ # Steps
+ # 1. Pull the HTML page using requests
+ # 2. Extract the article part of the page using readability
+ # 3. Convert the article HTML to markdown using html2text
+ # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
+ # 5. Convert the HTML to PDF, pulling in images where needed
+ # 6. Save the PDF to the specified filename.
+ request_text = get_page_with_retry(pdf_url, return_text=True)
+ doc = readability.Document(request_text)
+ title = doc.title()
+ raw_html = doc.summary(html_partial=True)
+
+ h2t = html2text.HTML2Text()
+ h2t.wrap_links = False
+ text = h2t.handle(raw_html)
+
+ # Add the title back to the document
+ article = "# {title}\n\n{text}".format(title=title, text=text)
+
+ # fix relative urls
+ base_url = "{0.scheme}://{0.netloc}".format(
+ urllib.parse.urlsplit(pdf_url)
+ )
+ html_article = markdown.markdown(article)
+ html_article = html_article.replace(' src="//', ' src="https://')
+ html_article = html_article.replace(
+ ' src="/', ' src="{base}/'.format(base=base_url)
+ )
+
+ if self.debug:
+ with open("./paper.html", "w") as fp:
+ fp.write(html_article)
+
+ font_config = weasyprint.fonts.FontConfiguration()
+ html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher)
+ css = weasyprint.CSS(string=CSS, font_config=font_config)
+
+ html.write_pdf(filename, stylesheets=[css], font_config=font_config)
+
+ def validate(src):
+ try:
+ result = urllib.parse.urlparse(src)
+ return all([result.scheme, result.netloc, result.path])
+ except:
+ return False
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 9b5dd42..50ccad9 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -13,7 +13,7 @@ import sys
from . import __version__, GITHUB_URL
-from .providers import providers, LocalFile
+from .providers import providers, LocalFile, HTML
from .utils import follow_redirects, is_url
@@ -22,6 +22,11 @@ def parse_args():
description="Paper2reMarkable version %s" % __version__
)
parser.add_argument(
+ '-t', "--html",
+ help="URL is to a HTML article instead of a PDF",
+ action="store_true",
+ )
+ parser.add_argument(
"-b",
"--blank",
help="Add a blank page after every page of the PDF",
@@ -106,7 +111,11 @@ def main():
args = parse_args()
cookiejar = None
- if LocalFile.validate(args.input):
+ if args.html and is_url(args.input):
+ # input is a url
+ url, cookiejar = follow_redirects(args.input)
+ provider = HTML
+ elif LocalFile.validate(args.input):
# input is a local file
provider = LocalFile
elif is_url(args.input):
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 79421df..d4e5075 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -64,7 +64,7 @@ def download_url(url, filename, cookiejar=None):
fid.write(content)
-def get_page_with_retry(url, tries=5, cookiejar=None):
+def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False):
count = 0
jar = {} if cookiejar is None else cookiejar
while count < tries:
@@ -82,6 +82,8 @@ def get_page_with_retry(url, tries=5, cookiejar=None):
time.sleep(5)
continue
logger.info("Downloaded url: %s" % url)
+ if return_text:
+ return res.text
return res.content
diff --git a/setup.py b/setup.py
index bddbd24..82a693a 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,11 @@ REQUIRED = [
"unidecode>=1.1",
"titlecase>=0.12",
"PyPDF2>=1.26",
- "regex>=2018.11"
+ "regex>=2018.11",
+ "readability-lxml>=0.7.1",
+ "html2text>=2020.1.16",
+ "weasyprint>=51",
+ "markdown>=3.1.1"
]
docs_require = []
diff --git a/tests/test_providers.py b/tests/test_providers.py
index e256eec..80f4662 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -16,6 +16,7 @@ from paper2remarkable.providers import (
ACM,
Arxiv,
CiteSeerX,
+ HTML,
LocalFile,
NeurIPS,
OpenReview,
@@ -206,6 +207,13 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_html_1(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines"
+ exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
if __name__ == "__main__":
unittest.main()