diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-09-25 23:36:47 +0200 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-09-25 23:36:47 +0200 |
| commit | 3b5e7eb5f34f92496aa96ee088db2925eadafd65 (patch) | |
| tree | a67f43a9178401fbdc0d6e6b7cb7047250ab841c | |
| parent | Merge branch 'master' into bugfix/html-figure (diff) | |
| download | paper2remarkable-3b5e7eb5f34f92496aa96ee088db2925eadafd65.tar.gz paper2remarkable-3b5e7eb5f34f92496aa96ee088db2925eadafd65.zip | |
Improve docs
| -rw-r--r-- | paper2remarkable/providers/html.py | 19 |
1 files changed, 11 insertions, 8 deletions
```diff
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
index b734bd1..e050ea3 100644
--- a/paper2remarkable/providers/html.py
+++ b/paper2remarkable/providers/html.py
@@ -134,14 +134,17 @@ class HTML(Provider):
         return url, url
 
     def retrieve_pdf(self, pdf_url, filename):
-        """Turn the HTML article in a clean pdf file"""
-        # Steps
-        # 1. Pull the HTML page using requests
-        # 2. Extract the article part of the page using readability
-        # 3. Convert the article HTML to markdown using html2text
-        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
-        # 4. Convert the HTML to PDF, pulling in images where needed
-        # 5. Save the PDF to the specified filename.
+        """Turn the HTML article in a clean pdf file
+
+        This function takes the following steps:
+
+        1. Pull the HTML page using requests, if not done in Informer
+        2. Extract the article part of the page using readability/readabiliPy
+        3. Convert the article HTML to markdown using html2text
+        4. Convert the markdown back to HTML (done to sanitize the HTML)
+        4. Convert the HTML to PDF, pulling in images where needed
+        5. Save the PDF to the specified filename.
+        """
         if self.informer._cached_title and self.informer._cached_article:
             title = self.informer._cached_title
             article = self.informer._cached_article
```
