From 48677feb7d08636fbbf2f4296c7f7133c6cbe6a6 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 30 Jan 2020 21:17:33 +0000 Subject: Remove explicit six dependency --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eb3ce93..ffee6c3 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ venv: $(VENV_DIR)/bin/activate $(VENV_DIR)/bin/activate: test -d $(VENV_DIR) || virtualenv $(VENV_DIR) - source $(VENV_DIR)/bin/activate && pip install -e .[dev] && pip install six + source $(VENV_DIR)/bin/activate && pip install -e .[dev] touch $(VENV_DIR)/bin/activate clean_venv: -- cgit v1.2.3 From 0456a377b3deef09a533b79224f4590e02372040 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 31 Jan 2020 13:17:58 +0000 Subject: [WIP] Initial commit of HTML provider --- paper2remarkable/providers/__init__.py | 2 + paper2remarkable/providers/html.py | 122 +++++++++++++++++++++++++++++++++ paper2remarkable/ui.py | 13 +++- paper2remarkable/utils.py | 4 +- setup.py | 4 ++ tests/test_providers.py | 8 +++ 6 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 paper2remarkable/providers/html.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index fabdcfe..f87a044 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -3,6 +3,7 @@ from .acm import ACM from .arxiv import Arxiv from .citeseerx import CiteSeerX +from .html import HTML from .local import LocalFile from .neurips import NeurIPS from .openreview import OpenReview @@ -23,4 +24,5 @@ providers = [ Springer, LocalFile, PdfUrl, + HTML, ] diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py new file mode 100644 index 0000000..6136fc9 --- /dev/null +++ b/paper2remarkable/providers/html.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- + +"""Provider for HTML documents + +This provider is a little bit special, in that it isn't simply 
pulling an +academic paper from a site, but instead aims to pull a HTML article. + +Author: G.J.J. van den Burg +License: See LICENSE file. +Copyright: 2020, G.J.J. van den Burg + +""" + +import html2text +import markdown +import readability +import titlecase +import unidecode +import urllib +import weasyprint +import weasyprint.fonts + +from ._base import Provider +from ._info import Informer + +from ..utils import clean_string, get_page_with_retry +from ..log import Logger + +logger = Logger() + +CSS = """ +@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap'); +@page { size: A4; margin: 1in; } +a { color: black; } +img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } +p { font-size: 11pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +h1,h2,h3 { font-family: 'Noto Serif'; } +h1 { font-size: 26px; } +h2 { font-size: 18px; } +h3 { font-size: 14px; } +""" + + +def my_fetcher(url): + if url.startswith("//"): + url = "https:" + url + elif url.startswith("file:///"): + url = "https:" + url[len("file:/") :] + return weasyprint.default_url_fetcher(url) + + +class HTMLInformer(Informer): + def __init__(self): + super().__init__() + + def get_filename(self, abs_url): + request_text = get_page_with_retry(abs_url, return_text=True) + doc = readability.Document(request_text) + title = doc.title() + + # Clean the title and make it titlecase + title = clean_string(title) + title = titlecase.titlecase(title) + title = title.replace(" ", "_") + title = clean_string(title) + name = title + ".pdf" + name = unidecode.unidecode(name) + logger.info("Created filename: %s" % name) + return name + + +class HTML(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = HTMLInformer() + + def get_abs_pdf_urls(self, url): + return url, url + + def retrieve_pdf(self, pdf_url, filename): + """Turn the HTML article in a clean pdf file""" + # Steps + 
# 1. Pull the HTML page using requests + # 2. Extract the article part of the page using readability + # 3. Convert the article HTML to markdown using html2text + # 4. Convert the markdown back to HTML (this is done to sanitize HTML) + # 4. Convert the HTML to PDF, pulling in images where needed + # 5. Save the PDF to the specified filename. + request_text = get_page_with_retry(pdf_url, return_text=True) + doc = readability.Document(request_text) + title = doc.title() + raw_html = doc.summary(html_partial=True) + + h2t = html2text.HTML2Text() + h2t.wrap_links = False + text = h2t.handle(raw_html) + + # Add the title back to the document + article = "# {title}\n\n{text}".format(title=title, text=text) + + # fix relative urls + base_url = "{0.scheme}://{0.netloc}".format( + urllib.parse.urlsplit(pdf_url) + ) + html_article = markdown.markdown(article) + html_article = html_article.replace(' src="//', ' src="https://') + html_article = html_article.replace( + ' src="/', ' src="{base}/'.format(base=base_url) + ) + + font_config = weasyprint.fonts.FontConfiguration() + html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher) + css = weasyprint.CSS(string=CSS, font_config=font_config) + + html.write_pdf(filename, stylesheets=[css], font_config=font_config) + + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 032bf99..05116ee 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,7 +13,7 @@ import sys from . 
import __version__, GITHUB_URL -from .providers import providers, LocalFile +from .providers import providers, LocalFile, HTML from .utils import follow_redirects, is_url @@ -21,6 +21,11 @@ def parse_args(): parser = argparse.ArgumentParser( description="Paper2reMarkable version %s" % __version__ ) + parser.add_argument( + "--html", + help="URL is to a HTML article instead of a PDF", + action="store_true", + ) parser.add_argument( "-b", "--blank", @@ -99,7 +104,11 @@ def main(): args = parse_args() cookiejar = None - if is_url(args.input): + if args.html and is_url(args.input): + # input is a url + url, cookiejar = follow_redirects(args.input) + provider = HTML + elif is_url(args.input): # input is a url url, cookiejar = follow_redirects(args.input) provider = next((p for p in providers if p.validate(url)), None) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 79421df..d4e5075 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -64,7 +64,7 @@ def download_url(url, filename, cookiejar=None): fid.write(content) -def get_page_with_retry(url, tries=5, cookiejar=None): +def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): count = 0 jar = {} if cookiejar is None else cookiejar while count < tries: @@ -82,6 +82,8 @@ def get_page_with_retry(url, tries=5, cookiejar=None): time.sleep(5) continue logger.info("Downloaded url: %s" % url) + if return_text: + return res.text return res.content diff --git a/setup.py b/setup.py index bddbd24..b8e3a86 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,10 @@ REQUIRED = [ "titlecase>=0.12", "PyPDF2>=1.26", "regex>=2018.11" + "readability-lxml>=0.7.1", + "html2text>=2020.1.16", + "weasyprint>=51", + "markdown>=3.1.1" ] docs_require = [] diff --git a/tests/test_providers.py b/tests/test_providers.py index e256eec..80f4662 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -16,6 +16,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + 
HTML, LocalFile, NeurIPS, OpenReview, @@ -206,6 +207,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_1(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 0ef64a9fd41a8edbfc35935d2b7f3f90c84200f1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 31 Jan 2020 14:29:17 +0000 Subject: Use the actual page size of the remarkable --- paper2remarkable/providers/html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 6136fc9..03121a5 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -30,10 +30,10 @@ logger = Logger() CSS = """ @import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap'); -@page { size: A4; margin: 1in; } +@page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } -p { font-size: 11pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +p { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } h1,h2,h3 { font-family: 'Noto Serif'; } h1 { font-size: 26px; } h2 { font-size: 18px; } -- cgit v1.2.3 From 840de7bbb12f41964388d1488ec5ecdff9b37efd Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 31 Jan 2020 14:29:59 +0000 Subject: Fix cropping bug that results in rotated pages --- paper2remarkable/crop.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index d1a94d8..5f3b4e3 100644 --- 
a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -139,6 +139,15 @@ class Cropper(object): # This is the bounding box in PIL format: (0, 0) top left x0, y0, x1, y1 = left, top, W - right, H - bottom + # The remarkable changes the orientation of a portrait page if the + # width is greater than the height. To prevent this, we pad the height + # with extra whitespace. This should only occur if the original + # orientation of the page would be changed by cropping. + w, h = x1 - x0, y1 - y0 + if H > W and w > h: + y1 = y0 + w + 10 + h = y1 - y0 + # Get the bbox in Ghostscript format: (0, 0) bottom left a0, b0, a1, b1 = x0, H - y1, x1, H - y0 return [a0, b0, a1, b1] -- cgit v1.2.3 From 8b8f517a1bedf3a9536d1d8bba3ba9ce301d6e13 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 12:33:49 +0000 Subject: Minor changes to css and adding debugging code --- paper2remarkable/providers/html.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 03121a5..5a35b07 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -33,11 +33,12 @@ CSS = """ @page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } -p { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } h1,h2,h3 { font-family: 'Noto Serif'; } h1 { font-size: 26px; } h2 { font-size: 18px; } h3 { font-size: 14px; } +blockquote { font-style: italic; } """ @@ -108,6 +109,10 @@ class HTML(Provider): ' src="/', ' src="{base}/'.format(base=base_url) ) + if self.debug: + with open("./paper.html", "w") as fp: + fp.write(html_article) + font_config = weasyprint.fonts.FontConfiguration() html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher) 
css = weasyprint.CSS(string=CSS, font_config=font_config) -- cgit v1.2.3 From 1a957d97c9f3ea865820030e55d3c029c801fce3 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 12:37:31 +0000 Subject: Fix typo in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8e3a86..82a693a 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ REQUIRED = [ "unidecode>=1.1", "titlecase>=0.12", "PyPDF2>=1.26", - "regex>=2018.11" + "regex>=2018.11", "readability-lxml>=0.7.1", "html2text>=2020.1.16", "weasyprint>=51", -- cgit v1.2.3 From 865fa3526ab637bc777e620649c7e7987cd54428 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 21:57:42 +0000 Subject: Minor improvements to html provider --- paper2remarkable/providers/html.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 5a35b07..20185fd 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -29,7 +29,7 @@ from ..log import Logger logger = Logger() CSS = """ -@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif&display=swap'); +@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap'); @page { size: 702px 936px; margin: 1in; } a { color: black; } img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } @@ -39,6 +39,8 @@ h1 { font-size: 26px; } h2 { font-size: 18px; } h3 { font-size: 14px; } blockquote { font-style: italic; } +pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; } +code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; } """ @@ -64,7 +66,7 @@ class HTMLInformer(Informer): title = titlecase.titlecase(title) title = title.replace(" ", "_") title = clean_string(title) - name = title + ".pdf" + name = title.strip("_") + ".pdf" name = 
unidecode.unidecode(name) logger.info("Created filename: %s" % name) return name -- cgit v1.2.3 From 4c7c01f0ab441ab881d94fca25005561debf6773 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:00:17 +0000 Subject: Reorder provider check for local file The is_url function can in some cases consider a path to a file as a valid url, which defeats the purpose. So I'm setting it back to first checking for a local file, then checking for a url --- paper2remarkable/ui.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 032bf99..4160f12 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -99,13 +99,13 @@ def main(): args = parse_args() cookiejar = None - if is_url(args.input): + if LocalFile.validate(args.input): + # input is a local file + provider = LocalFile + elif is_url(args.input): # input is a url url, cookiejar = follow_redirects(args.input) provider = next((p for p in providers if p.validate(url)), None) - elif LocalFile.validate(args.input): - # input is a local file - provider = LocalFile else: # not a proper URL or non-existent file exception( -- cgit v1.2.3 From 7726955a69093f969f5c282593c2565a3210fa5b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:08:01 +0000 Subject: Add short cli flag for html --- paper2remarkable/ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 38eb4fb..11f1c02 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -22,7 +22,7 @@ def parse_args(): description="Paper2reMarkable version %s" % __version__ ) parser.add_argument( - "--html", + '-t', "--html", help="URL is to a HTML article instead of a PDF", action="store_true", ) -- cgit v1.2.3 From 21e334a1c1e8d70974f6e0cca2fe6a05c25abc48 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:09:55 +0000 Subject: Update readme with 
HTML source support --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dc05a23..1429483 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,12 @@ transferring an academic paper to your [reMarkable](https://remarkable.com/): $ p2r https://arxiv.org/abs/1811.11242 ``` +There is also support for transferring an article from a website: + +``` +p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines +``` + The script can be run through the ``p2r`` command line program or via Docker (see below). @@ -26,11 +32,12 @@ reMarkable from any of the following sources: * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file * A local PDF file +* Any article on a website (using ``--html``) The program aims to be flexible to the exact source URL, so for many of the -sources you can either provide a URL to the abstract page or to the PDF file. -If you have an source that you would like to see added to the list, let me -know! +academic sources you can either provide a URL to the abstract page or to the +PDF file. If you have an source that you would like to see added to the list, +let me know! 
``paper2remarkable`` takes the source URL and: -- cgit v1.2.3 From 0d8db829ec6a7ed0507cebbfabf0454149226bfa Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:11:35 +0000 Subject: Add --version command to cli --- paper2remarkable/ui.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 4160f12..9b5dd42 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -55,6 +55,13 @@ def parse_args(): parser.add_argument( "-v", "--verbose", help="be verbose", action="store_true" ) + parser.add_argument( + "-V", + "--version", + help="Show version and exit", + action="version", + version=__version__, + ) parser.add_argument( "--filename", help="Filename to use for the file on reMarkable", -- cgit v1.2.3 From 941e893497ae69f9e4db653c83afff8df3c9c436 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:20:32 +0000 Subject: Bump version and update changelog --- CHANGELOG.md | 7 +++++++ README.md | 6 ++++-- paper2remarkable/__version__.py | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79ea620..a888340 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## Version 0.5.0 + +* Add support for articles from the web using the ``--html`` flag + ([#23](https://github.com/GjjvdBurg/paper2remarkable/pull/23)) +* Add ``--version`` command to command line interface +* Fix cropping bug that resulted in occassional rotated pages + ## Version 0.4.6 * Add support for older arXiv URL scheme diff --git a/README.md b/README.md index 1429483..b12dbf6 100644 --- a/README.md +++ b/README.md @@ -61,18 +61,19 @@ Optionally, you can: Here's the full help of the script: ```text -usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] +usage: p2r [-h] [-t] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] [-V] [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK] [--rmapi RMAPI] input 
-Paper2reMarkable version 0.4.0 +Paper2reMarkable version 0.5.0 positional arguments: input URL to a paper or the path of a local PDF file optional arguments: -h, --help show this help message and exit + -t, --html URL is to a HTML article instead of a PDF -b, --blank Add a blank page after every page of the PDF -c, --center Center the PDF on the page, instead of left align -d, --debug debug mode, doesn't upload to reMarkable @@ -82,6 +83,7 @@ optional arguments: directory on reMarkable to put the file (created if missing, default: /) -v, --verbose be verbose + -V, --version Show version and exit --filename FILENAME Filename to use for the file on reMarkable --gs GS path to gs executable (default: gs) --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 6540db2..7f3b8c6 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 4, 6) +VERSION = (0, 5, 0) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 5383d413837f153fb5f4d519f9d624ad0a0f5cef Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 3 Feb 2020 22:21:44 +0000 Subject: Minor readme typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b12dbf6..027cf78 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ $ p2r https://arxiv.org/abs/1811.11242 There is also support for transferring an article from a website: ``` -p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines +$ p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines ``` The script can be run through the ``p2r`` command line program or via Docker -- cgit v1.2.3 From 4ee288f1963f94b50e57c4f1d7b2680fd689d64f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 4 Feb 2020 10:50:35 +0000 Subject: Automatically detect html source 
using content type This removes the need to provide a --html flag! --- paper2remarkable/providers/html.py | 18 +++++++++++++----- paper2remarkable/providers/pdf_url.py | 13 +++++++++---- paper2remarkable/ui.py | 13 ++----------- paper2remarkable/utils.py | 22 ++++++++++++++++++++++ 4 files changed, 46 insertions(+), 20 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 20185fd..d0d55f4 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -23,7 +23,11 @@ import weasyprint.fonts from ._base import Provider from ._info import Informer -from ..utils import clean_string, get_page_with_retry +from ..utils import ( + clean_string, + get_page_with_retry, + get_content_type_with_retry, +) from ..log import Logger logger = Logger() @@ -122,8 +126,12 @@ class HTML(Provider): html.write_pdf(filename, stylesheets=[css], font_config=font_config) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("text/html") diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 5314ec7..77accc9 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -13,6 +13,7 @@ import urllib from ._base import Provider from ._info import Informer from ..exceptions import FilenameMissingError +from ..utils import get_content_type_with_retry class PdfUrlInformer(Informer): @@ -30,8 +31,12 @@ class PdfUrl(Provider): return (None, url) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # 
first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("application/pdf") diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 50ccad9..9b5dd42 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,7 +13,7 @@ import sys from . import __version__, GITHUB_URL -from .providers import providers, LocalFile, HTML +from .providers import providers, LocalFile from .utils import follow_redirects, is_url @@ -21,11 +21,6 @@ def parse_args(): parser = argparse.ArgumentParser( description="Paper2reMarkable version %s" % __version__ ) - parser.add_argument( - '-t', "--html", - help="URL is to a HTML article instead of a PDF", - action="store_true", - ) parser.add_argument( "-b", "--blank", @@ -111,11 +106,7 @@ def main(): args = parse_args() cookiejar = None - if args.html and is_url(args.input): - # input is a url - url, cookiejar = follow_redirects(args.input) - provider = HTML - elif LocalFile.validate(args.input): + if LocalFile.validate(args.input): # input is a local file provider = LocalFile elif is_url(args.input): diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index d4e5075..52c2a38 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -87,6 +87,28 @@ def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): return res.content +def get_content_type_with_retry(url, tries=5, cookiejar=None): + count = 0 + jar = {} if cookiejar is None else cookiejar + while count < tries: + count += 1 + error = False + try: + res = requests.head(url, headers=HEADERS, cookies=jar, + allow_redirects=True) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting headers for %s. Retrying in 5 seconds." 
+ % (count, tries, url) + ) + time.sleep(5) + continue + print("res.headers = %r" % res.headers) + return res.headers.get("Content-Type", None) + + def follow_redirects(url): """Follow redirects from the URL (at most 100)""" it = 0 -- cgit v1.2.3 From ce9c1333fcf761e322ad169df3969ca23d9938e7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 4 Feb 2020 10:50:50 +0000 Subject: Add another test for the html provider --- tests/test_providers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index 80f4662..d0e3d40 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -214,6 +214,14 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_2(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.nature.com/articles/d41586-020-00176-4" + exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 985c3d1369118831c41aa16803a9e046a0bda9f1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 4 Feb 2020 11:09:58 +0000 Subject: Remove unnecessary print statement --- paper2remarkable/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 52c2a38..22d6d38 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -105,7 +105,6 @@ def get_content_type_with_retry(url, tries=5, cookiejar=None): ) time.sleep(5) continue - print("res.headers = %r" % res.headers) return res.headers.get("Content-Type", None) -- cgit v1.2.3 From 219b0d756faefb737cd6c65c27af32fe7661535e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 4 Feb 2020 11:10:07 +0000 Subject: Bump version and update changelog --- CHANGELOG.md | 5 +++++ README.md | 9 ++++----- 
paper2remarkable/__version__.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a888340..898f7da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.5.1 + +* Automatically detect when a HTML source is provided + ([#24](https://github.com/GjjvdBurg/paper2remarkable/pull/24)) + ## Version 0.5.0 * Add support for articles from the web using the ``--html`` flag diff --git a/README.md b/README.md index 027cf78..e639864 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ $ p2r https://arxiv.org/abs/1811.11242 There is also support for transferring an article from a website: ``` -$ p2r --html https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines +$ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines ``` The script can be run through the ``p2r`` command line program or via Docker @@ -32,11 +32,11 @@ reMarkable from any of the following sources: * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file * A local PDF file -* Any article on a website (using ``--html``) +* Any article on a website The program aims to be flexible to the exact source URL, so for many of the academic sources you can either provide a URL to the abstract page or to the -PDF file. If you have an source that you would like to see added to the list, +PDF file. If you have an source that you would like to see added to the list, let me know! 
``paper2remarkable`` takes the source URL and: @@ -66,14 +66,13 @@ usage: p2r [-h] [-t] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] [-V] [--rmapi RMAPI] input -Paper2reMarkable version 0.5.0 +Paper2reMarkable version 0.5.1 positional arguments: input URL to a paper or the path of a local PDF file optional arguments: -h, --help show this help message and exit - -t, --html URL is to a HTML article instead of a PDF -b, --blank Add a blank page after every page of the PDF -c, --center Center the PDF on the page, instead of left align -d, --debug debug mode, doesn't upload to reMarkable diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 7f3b8c6..6e42bfc 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 0) +VERSION = (0, 5, 1) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From d4c682b869bb2bf391d5cf686baec026c5956875 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 14 Feb 2020 18:56:30 +0000 Subject: Be more robust against missing meta data --- paper2remarkable/providers/_info.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py index 746c436..47c2e25 100644 --- a/paper2remarkable/providers/_info.py +++ b/paper2remarkable/providers/_info.py @@ -76,8 +76,13 @@ class Informer: ## Title def get_title(self, soup): - target = soup.find_all("meta", {"name": self.meta_title_key}) - return target[0]["content"] + meta = soup.find_all("meta", {"name": self.meta_title_key}) + if not meta: + logger.warning( + "Couldn't determine title information, maybe provide the desired filename using '--filename'?" 
+ ) + return "" + return meta[0]["content"] ## Authors @@ -87,10 +92,13 @@ class Informer: return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] def get_authors(self, soup): - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": self.meta_author_key}) - ] + meta = soup.find_all("meta", {"name": self.meta_author_key}) + if not meta: + logger.warning( + "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + ) + return "" + authors = [x["content"] for x in meta] return self._format_authors(authors) ## Year @@ -100,7 +108,8 @@ class Informer: def get_year(self, soup): """ Retrieve the contents of the meta_date_key field and format it """ - date = soup.find_all("meta", {"name": self.meta_date_key})[0][ - "content" - ] + meta = soup.find_all("meta", {"name": self.meta_date_key}) + if not meta: + return "" + date = meta[0]["content"] return self._format_year(date) -- cgit v1.2.3 From 73c6515b4fa9060666dd3e27c42be376446fcc73 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 14 Feb 2020 18:57:07 +0000 Subject: Ensure all text from exception goes to stderr --- paper2remarkable/ui.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 9b5dd42..69af4e6 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -97,8 +97,10 @@ def exception(msg): print("", file=sys.stderr) print( "If you think this might be a bug, please raise an issue on GitHub: %s" - % GITHUB_URL + % GITHUB_URL, + file=sys.stderr, ) + print("", file=sys.stderr) raise SystemExit(1) -- cgit v1.2.3 From a2e833add7f8f3e3dee5335cf213cb61d9fbbd1d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 11:09:20 +0000 Subject: Running both center and crop is unnecessary center_pdf uses the bbox of crop_pdf and adds padding, so running both does nothing more than center_pdf alone does. 
--- paper2remarkable/providers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index b2f584c..221d0ba 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -64,7 +64,7 @@ class Provider(metaclass=abc.ABCMeta): # Define the operations to run on the pdf. Providers can add others. self.operations = [("crop", self.crop_pdf)] if center: - self.operations.append(("center", self.center_pdf)) + self.operations = [("center", self.center_pdf)] if blank: self.operations.append(("blank", blank_pdf)) -- cgit v1.2.3 From 5b96d4ebe0d08e2b5c0fc4a99be1cf787543137d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:11:09 +0000 Subject: Replace spaces in author names --- paper2remarkable/providers/_info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py index 47c2e25..10b6959 100644 --- a/paper2remarkable/providers/_info.py +++ b/paper2remarkable/providers/_info.py @@ -50,6 +50,7 @@ class Informer: authors = self.authors[0] + "_et_al" else: authors = "_".join(self.authors) + authors = authors.replace(" ", "_") authors = clean_string(authors) # Clean the title and make it titlecase -- cgit v1.2.3 From 0128dce1c10be8db965584aa387bf00040a3f018 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:14:44 +0000 Subject: Add NBER provider --- paper2remarkable/providers/__init__.py | 4 ++- paper2remarkable/providers/nber.py | 46 ++++++++++++++++++++++++++++++++++ tests/test_providers.py | 15 +++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 paper2remarkable/providers/nber.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index f87a044..c868bc4 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -5,6 +5,7 @@ 
from .arxiv import Arxiv from .citeseerx import CiteSeerX from .html import HTML from .local import LocalFile +from .nber import NBER from .neurips import NeurIPS from .openreview import OpenReview from .pdf_url import PdfUrl @@ -12,11 +13,12 @@ from .pmlr import PMLR from .pubmed import PubMed from .springer import Springer -# NOTE: Order matters here, PdfUrl should be last +# NOTE: Order matters here, PdfUrl and HTML should be last providers = [ ACM, Arxiv, CiteSeerX, + NBER, NeurIPS, OpenReview, PMLR, diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py new file mode 100644 index 0000000..76bc85f --- /dev/null +++ b/paper2remarkable/providers/nber.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- + +"""Provider for NBER + +(US) National Bureau of Economic Research + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class NBERInformer(Informer): + def _format_year(self, soup_date): + return soup_date.split("-")[0] + + +class NBER(Provider): + + re_abs = "https?://www\.nber\.org/papers/(?P[a-z0-9]+)$" + re_pdf = "https?://www\.nber\.org/papers/(?P[a-z0-9]+)\.pdf$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = NBERInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url + ".pdf" + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url[: -len(".pdf")] + else: + raise URLResolutionError("NBER", url) + return abs_url, pdf_url + + def validate(src): + return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src) diff --git a/tests/test_providers.py b/tests/test_providers.py index d0e3d40..38f88b7 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -18,6 +18,7 @@ from paper2remarkable.providers import ( CiteSeerX, HTML, LocalFile, + NBER, 
NeurIPS, OpenReview, PMLR, @@ -179,6 +180,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_nber_1(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w26752" + exp = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_nber_2(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w19152.pdf" + exp = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_1(self): prov = NeurIPS(upload=False, verbose=VERBOSE) url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf" -- cgit v1.2.3 From 31d375fcc5feff87329f2230aa799b78834e2ebd Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:21:05 +0000 Subject: remove old comment --- paper2remarkable/providers/citeseerx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py index e483f28..e819c30 100644 --- a/paper2remarkable/providers/citeseerx.py +++ b/paper2remarkable/providers/citeseerx.py @@ -49,10 +49,6 @@ class CiteSeerX(Provider): ) time.sleep(30) - # NOTE: The delay should only be hit twice when p2r is used as a - # library (e.g. during testing). Otherwise the ``server_delay`` is - # never reached in run(). 
- def _get_doi(self, url): m = re.match(self.re_abs, url) or re.match(self.re_pdf, url) if m: -- cgit v1.2.3 From d43e1fbe10f18fdbac08aae414e605c8387cb19b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:38:03 +0000 Subject: Extract filename from url with pdf_url provider Fixes #25 --- paper2remarkable/exceptions.py | 10 +++++++--- paper2remarkable/providers/pdf_url.py | 31 ++++++++++++++++++++++++++++--- tests/test_providers.py | 4 ++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 86f39b4..a608bcc 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -48,13 +48,17 @@ class URLResolutionError(Error): class FilenameMissingError(Error): """Exception raised for providers that need a filename to be provided""" - def __init__(self, provider): + def __init__(self, provider, url, reason=None): self.provider = provider + self.url = url + self.reason = reason def __str__(self): - msg = "ERROR: Filename must be given with the {provider} provider (hint: use --filename)".format( - provider=self.provider + msg = "ERROR: Couldn't determine a filename from {url} for provider {provider}".format( + provider=self.provider, url=self.url ) + if self.reason: + msg += "\nReason: {reason}".format(reason=self.reason) msg += GH_MSG return msg diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 77accc9..b86c7c3 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -12,14 +12,39 @@ import urllib from ._base import Provider from ._info import Informer + +from .. 
import GITHUB_URL from ..exceptions import FilenameMissingError +from ..log import Logger from ..utils import get_content_type_with_retry +logger = Logger() + class PdfUrlInformer(Informer): def get_filename(self, abs_url): - # if this is called, filename must not have been provided - raise FilenameMissingError(provider="PDFUrl") + # try to get a nice filename by parsing the url + parsed = urllib.parse.urlparse(abs_url) + path_parts = parsed.path.split("/") + if not path_parts: + raise FilenameMissingError( + provider="PdfUrl", url=abs_url, reason="No URL parts", + ) + + filename = path_parts[-1] + if not filename.endswith(".pdf"): + raise FilenameMissingError( + provider="PdfUrl", + url=abs_url, + reason="URL path didn't end in .pdf", + ) + logger.warning( + "Using filename {filename} extracted from url. " + "You might want to provide a nicer one using --filename " + "or request this paper source to be added " + "(see: {github}).".format(filename=filename, github=GITHUB_URL) + ) + return filename class PdfUrl(Provider): @@ -28,7 +53,7 @@ class PdfUrl(Provider): self.informer = PdfUrlInformer() def get_abs_pdf_urls(self, url): - return (None, url) + return (url, url) def validate(src): # first check if it is a valid url diff --git a/tests/test_providers.py b/tests/test_providers.py index d0e3d40..82c8500 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -148,8 +148,8 @@ class TestProviders(unittest.TestCase): def test_pdfurl(self): prov = PdfUrl(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" - filename = prov.run(url, filename="test.pdf") - self.assertEqual("test.pdf", os.path.basename(filename)) + filename = prov.run(url) + self.assertEqual("14-526.pdf", os.path.basename(filename)) def test_pmlr_1(self): prov = PMLR(upload=False, verbose=VERBOSE) -- cgit v1.2.3 From 963555e6972d865be8abc8082df37e1f598bce29 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:51:59 +0000 
Subject: Paper2ReMarkable Release version 0.5.2 Changes in this release: * Add provider for US National Bureau of Economic Research ([#27](https://github.com/GjjvdBurg/paper2remarkable/pull/27)). * Automatically extract the filename from a pdf url where possible ([#25](https://github.com/GjjvdBurg/paper2remarkable/issues/25)). * Speed up centering of pdfs by removing unnecessary cropping operation. * Improve robustness against missing metadata, remove spaces in author names, and other minor improvements. --- CHANGELOG.md | 10 ++++++++++ paper2remarkable/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 898f7da..b273ac4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## Version 0.5.2 + +* Add provider for US National Bureau of Economic Research + ([#27](https://github.com/GjjvdBurg/paper2remarkable/pull/27)). +* Automatically extract the filename from a pdf url where possible + ([#25](https://github.com/GjjvdBurg/paper2remarkable/issues/25)). +* Speed up centering of pdfs by removing unnecessary cropping operation. +* Improve robustness against missing metadata, remove spaces in author names, + and other minor improvements. 
+ ## Version 0.5.1 * Automatically detect when a HTML source is provided diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 6e42bfc..3fec55b 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 1) +VERSION = (0, 5, 2) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From b895efee7c8a8ddbd13e19a9bc777ed4450c5865 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 19 Feb 2020 12:54:53 +0000 Subject: Readme update --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e639864..7b5c9ad 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ reMarkable from any of the following sources: * [arXiv](https://arxiv.org/) * [ACM Digital Library](https://dl.acm.org/dl.cfm) * [CiteSeerX](http://citeseerx.ist.psu.edu/index) +* [NBER](https://www.nber.org) * [NeurIPS](https://papers.nips.cc/) * [OpenReview](https://openreview.net/) * [PMLR](http://proceedings.mlr.press/) @@ -131,7 +132,7 @@ This installs the ``p2r`` command line program. ## Docker -You can also use our Dockerfile to avoid installing dependencies on your +You can also use the Dockerfile to avoid installing dependencies on your machine. You will need `git` and `docker` installed. 
First clone this repository with `git clone` and `cd` inside of it, then build -- cgit v1.2.3 From de7fa6bf3b7c25eb2a2e07fce769b515bca92e7d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 21 Feb 2020 16:23:01 +0000 Subject: Add provider for JMLR --- README.md | 1 + paper2remarkable/providers/__init__.py | 2 + paper2remarkable/providers/jmlr.py | 70 ++++++++++++++++++++++++++++++++++ tests/test_providers.py | 16 +++++++- 4 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 paper2remarkable/providers/jmlr.py diff --git a/README.md b/README.md index 7b5c9ad..446682e 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ reMarkable from any of the following sources: * [arXiv](https://arxiv.org/) * [ACM Digital Library](https://dl.acm.org/dl.cfm) * [CiteSeerX](http://citeseerx.ist.psu.edu/index) +* [JMLR](http://jmlr.org) * [NBER](https://www.nber.org) * [NeurIPS](https://papers.nips.cc/) * [OpenReview](https://openreview.net/) diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index c868bc4..e4fa1bd 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -4,6 +4,7 @@ from .acm import ACM from .arxiv import Arxiv from .citeseerx import CiteSeerX from .html import HTML +from .jmlr import JMLR from .local import LocalFile from .nber import NBER from .neurips import NeurIPS @@ -18,6 +19,7 @@ providers = [ ACM, Arxiv, CiteSeerX, + JMLR, NBER, NeurIPS, OpenReview, diff --git a/paper2remarkable/providers/jmlr.py b/paper2remarkable/providers/jmlr.py new file mode 100644 index 0000000..3634b4f --- /dev/null +++ b/paper2remarkable/providers/jmlr.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- + +"""Provider for JMLR + +Journal of Machine Learning Research + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class JMLRInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors): + have_comma = any(("," in auth for auth in soup_authors)) + if have_comma: + return super()._format_authors(soup_authors, sep=",", idx=0) + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class JMLR(Provider): + + re_abs_1 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3}).html$" + re_pdf_1 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3})/(?P=pid).pdf$" + + re_abs_2 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\w+\d{2}\w).html$" + re_pdf_2 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\w+\d{2}\w)/(?P=pid).pdf$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = JMLRInformer() + + def get_abs_pdf_urls(self, url): + abs_url = pdf_url = None + abs_fmt = "http://jmlr.org/papers/v{vol}/{pid}.html" + pdf_fmt = "http://jmlr.org/papers/volume{vol}/{pid}/{pid}.pdf" + formats = [ + (self.re_abs_1, self.re_pdf_1), + (self.re_abs_2, self.re_pdf_2), + ] + + for re_abs, re_pdf in formats: + ma = re.match(re_abs, url) + mp = re.match(re_pdf, url) + if ma: + abs_url = url + pdf_url = pdf_fmt.format( + vol=ma.group("vol"), pid=ma.group("pid") + ) + elif mp: + abs_url = abs_fmt.format( + vol=mp.group("vol"), pid=mp.group("pid") + ) + pdf_url = url + if abs_url is None or pdf_url is None: + raise URLResolutionError("JMLR", url) + return abs_url, pdf_url + + def validate(src): + return re.match(JMLR.re_abs, src) or re.match(JMLR.re_pdf, src) diff --git a/tests/test_providers.py b/tests/test_providers.py index 493a209..2bf7507 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -17,6 +17,7 @@ from paper2remarkable.providers import ( Arxiv, CiteSeerX, HTML, + JMLR, LocalFile, NBER, NeurIPS, @@ -152,6 
+153,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual("14-526.pdf", os.path.basename(filename)) + def test_jmlr_1(self): + prov = JMLR(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + exp = "Burg_Groenen_-_GenSVM_a_Generalized_Multiclass_Support_Vector_Machine_2016.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_jmlr_2(self): + prov = JMLR(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/v10/xu09a.html" + exp = "Xu_Zhang_-_Refinement_of_Reproducing_Kernels_2009.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_pmlr_1(self): prov = PMLR(upload=False, verbose=VERBOSE) url = "http://proceedings.mlr.press/v97/behrmann19a.html" @@ -237,6 +252,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(exp, os.path.basename(filename)) - if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From aa21dbbaeefa29183a6e0e5933fb06ab41450d8d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 21 Feb 2020 16:24:13 +0000 Subject: Bugfix for creating nested directories Turns out rMapi doesn't support the equivalent of mkdir -p, so we do it ourselves. 
--- paper2remarkable/utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 22d6d38..859ce6c 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -133,13 +133,18 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): # Create the reMarkable dir if it doesn't exist remarkable_dir = remarkable_dir.rstrip("/") if remarkable_dir: - status = subprocess.call( - [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL, - ) - if not status == 0: - raise RemarkableError( - "Creating directory %s on reMarkable failed" % remarkable_dir + parts = remarkable_dir.split("/") + rmdir = "" + while parts: + rmdir += "/" + parts.pop(0) + status = subprocess.call( + [rmapi_path, "mkdir", rmdir], stdout=subprocess.DEVNULL, ) + if not status == 0: + raise RemarkableError( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) # Upload the file status = subprocess.call( -- cgit v1.2.3 From 806f22a6245c6379df0fb72255b08b8d1850eb71 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 21 Feb 2020 16:24:19 +0000 Subject: Code formatting --- paper2remarkable/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 859ce6c..592dcd3 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -94,8 +94,9 @@ def get_content_type_with_retry(url, tries=5, cookiejar=None): count += 1 error = False try: - res = requests.head(url, headers=HEADERS, cookies=jar, - allow_redirects=True) + res = requests.head( + url, headers=HEADERS, cookies=jar, allow_redirects=True + ) except requests.exceptions.ConnectionError: error = True if error or not res.ok: -- cgit v1.2.3 From e557ed7b42f2b12193e0a9eaf686e7b1524b070b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 12:39:55 +0000 Subject: Try to get the bounding box 
with pdftoppm We use pdftoppm when it's available, but default to pdfplumber when it's not. Using pdftoppm gives a 15x speed up in finding the bounding box. --- paper2remarkable/crop.py | 128 +++++++++++++++++++++++++++++++----- paper2remarkable/pdf_ops.py | 18 +++-- paper2remarkable/providers/_base.py | 8 ++- paper2remarkable/ui.py | 6 ++ 4 files changed, 139 insertions(+), 21 deletions(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 5f3b4e3..895fe58 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -9,9 +9,10 @@ Copyright: 2019, G.J.J. van den Burg """ import PyPDF2 +import io import os -import subprocess import pdfplumber +import subprocess from .log import Logger @@ -21,17 +22,43 @@ RM_HEIGHT = 1872 logger = Logger() +def find_offset_byte_line(line): + """Find index of first nonzero bit in a line of bytes + + The given line is a string of bytes, each representing 8 pixels. This code + finds the index of the first bit that is not zero. Used when find the + cropbox with pdftoppm. 
+ """ + off = 0 + for c in line: + if c == 0: + off += 8 + else: + k = 0 + while c > 0: + k += 1 + c >>= 1 + off += k + break + return off + + class Cropper(object): def __init__( - self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + self, + input_file=None, + output_file=None, + pdfcrop_path="pdfcrop", + pdftoppm_path="pdftoppm", ): if not input_file is None: self.input_file = os.path.abspath(input_file) self.reader = PyPDF2.PdfFileReader(self.input_file) if not output_file is None: self.output_file = os.path.abspath(output_file) - self.pdfcrop_path = pdfcrop_path + self.pdfcrop_path = pdfcrop_path + self.pdftoppm_path = pdftoppm_path self.writer = PyPDF2.PdfFileWriter() def crop(self, margins=1): @@ -96,17 +123,20 @@ class Cropper(object): os.unlink(tmpfout) return 0 - def get_bbox(self, filename, margins=1, resolution=72): - """Get the bounding box, with optional margins - - if margins is integer, used for all margins, else - margins = [left, top, right, bottom] - - We get the bounding box by finding the smallest rectangle that is - completely surrounded by white pixels. - """ - if isinstance(margins, int): - margins = [margins for _ in range(4)] + def get_raw_bbox(self, filename, resolution=72): + """Get the basic bounding box of a pdf file""" + # We try to use pdftoppm, but if it's not available or fails, we + # default to pdfplumber. 
+ try: + bbox = self.get_raw_bbox_pdftoppm(filename, resolution=resolution) + except subprocess.CalledProcessError: + bbox = self.get_raw_bbox_pdfplumber( + filename, resolution=resolution + ) + return bbox + + def get_raw_bbox_pdfplumber(self, filename, resolution=72): + """Get the basic bounding box with pdfplumber""" pdf = pdfplumber.open(filename) im = pdf.pages[0].to_image(resolution=resolution) pdf.close() @@ -131,6 +161,74 @@ class Cropper(object): while right < W and sum(M[W - 1 - right]) == H * 255 * 3: right += 1 + return left, right, top, bottom, W, H + + def get_raw_bbox_pdftoppm(self, filename, resolution=72): + """Get the basic bounding box using pdftoppm """ + cmd = [ + self.pdftoppm_path, + "-r", + str(resolution), + "-singlefile", + "-mono", + filename, + ] + + im = subprocess.check_output(cmd) + im = io.BytesIO(im) + + id_ = im.readline().rstrip(b"\n") + if not id_ == b"P4": + raise ValueError("Not in P4 format") + wh = im.readline().rstrip(b"\n").split(b" ") + width, height = int(wh[0]), int(wh[1]) + imdata = im.read() + + pad = width % 8 + padwidth = width + pad + stepsize = padwidth // 8 + + for top in range(height): + if sum(imdata[top * stepsize : (top + 1) * stepsize]) > 0: + break + + for bottom in reversed(range(height)): + if sum(imdata[bottom * stepsize : (bottom + 1) * stepsize]) > 0: + break + + left = width + right = 0 + for i in range(top, bottom): + lline = imdata[i * stepsize : (i + 1) * stepsize] + rline = reversed(imdata[i * stepsize : (i + 1) * stepsize]) + l = find_offset_byte_line(lline) + left = min(left, l) + r = padwidth + pad - find_offset_byte_line(rline) + right = max(right, r) + + top += 1 + left += 1 + right = width - right + 2 + bottom = height - bottom - 2 + + return left, right, top, bottom, width, height + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get 
the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. + """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + + left, right, top, bottom, W, H = self.get_raw_bbox( + filename, resolution=resolution + ) + left -= margins[0] top -= margins[1] right -= margins[2] @@ -141,7 +239,7 @@ class Cropper(object): # The remarkable changes the orientation of a portrait page if the # width is greater than the height. To prevent this, we pad the height - # with extra whitespace. This should only occur if the original + # with extra whitespace. This should only occur if the original # orientation of the page would be changed by cropping. w, h = x1 - x0, y1 - y0 if H > W and w > h: diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index c660452..fae9581 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -19,13 +19,18 @@ from .log import Logger logger = Logger() -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): +def crop_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): """Crop the pdf file using Cropper """ logger.info("Cropping pdf file") cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path) + cropper = Cropper( + filepath, + cropped_file, + pdfcrop_path=pdfcrop_path, + pdftoppm_path=pdftoppm_path, + ) status = cropper.crop(margins=15) if not status == 0: @@ -39,13 +44,18 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop"): return cropped_file -def center_pdf(filepath, pdfcrop_path="pdfcrop"): +def center_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): """Center the pdf file on the reMarkable """ logger.info("Centering pdf file") centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path) + cropper = Cropper( + filepath, + centered_file, + pdfcrop_path=pdfcrop_path, + 
pdftoppm_path=pdftoppm_path, + ) status = cropper.center() if not status == 0: diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 221d0ba..4191d35 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -40,6 +40,7 @@ class Provider(metaclass=abc.ABCMeta): remarkable_dir="/", rmapi_path="rmapi", pdfcrop_path="pdfcrop", + pdftoppm_path="pdftoppm", pdftk_path="pdftk", gs_path="gs", cookiejar=None, @@ -49,6 +50,7 @@ class Provider(metaclass=abc.ABCMeta): self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdfcrop_path = pdfcrop_path + self.pdftoppm_path = pdftoppm_path self.pdftk_path = pdftk_path self.gs_path = gs_path self.informer = Informer() @@ -83,10 +85,12 @@ class Provider(metaclass=abc.ABCMeta): # Wrappers for pdf operations that have additional arguments def crop_pdf(self, filepath): - return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path, + pdftoppm_path=self.pdftoppm_path) def center_pdf(self, filepath): - return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path, + pdftoppm_path=self.pdftoppm_path) def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 69af4e6..6b25aaf 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -75,6 +75,11 @@ def parse_args(): help="path to pdfcrop executable (default: pdfcrop)", default="pdfcrop", ) + parser.add_argument( + "--pdftoppm", + help="path to pdftoppm executable (default: pdftoppm)", + default="pdftoppm", + ) parser.add_argument( "--pdftk", help="path to pdftk executable (default: pdftk)", @@ -134,6 +139,7 @@ def main(): remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, pdfcrop_path=args.pdfcrop, + pdftoppm_path=args.pdftoppm, pdftk_path=args.pdftk, gs_path=args.gs, 
cookiejar=cookiejar, -- cgit v1.2.3 From 3862345c6bbeffc9694cdd881a13428cbfbbe294 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 13:16:52 +0000 Subject: Simply set the cropbox of the page --- paper2remarkable/crop.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 895fe58..b8b31c6 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -14,6 +14,8 @@ import os import pdfplumber import subprocess +from PyPDF2.generic import RectangleObject + from .log import Logger RM_WIDTH = 1404 @@ -102,25 +104,11 @@ class Cropper(object): def process_page(self, page_idx, bbox_func, *args, **kwargs): """Process a single page and add it to the writer """ tmpfname = self.export_page(page_idx) - tmpfout = "./output.pdf" bbox = bbox_func(tmpfname, *args, **kwargs) - status = subprocess.call( - [ - self.pdfcrop_path, - "--bbox", - " ".join(map(str, bbox)), - tmpfname, - tmpfout, - ], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - return status - reader = PyPDF2.PdfFileReader(tmpfout) - page = reader.getPage(0) - self.writer.addPage(page) + thepage = self.reader.getPage(page_idx) + thepage.cropBox = RectangleObject(bbox) + self.writer.addPage(thepage) os.unlink(tmpfname) - os.unlink(tmpfout) return 0 def get_raw_bbox(self, filename, resolution=72): -- cgit v1.2.3 From 4501d3fb80199a4c0220372795b571e3e018ffca Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 13:26:25 +0000 Subject: Remove pdfcrop dependency --- README.md | 2 -- paper2remarkable/crop.py | 4 +--- paper2remarkable/pdf_ops.py | 6 ++---- paper2remarkable/providers/_base.py | 8 ++------ paper2remarkable/ui.py | 6 ------ 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 7b5c9ad..4f3a93e 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,6 @@ $ p2r -v https://arxiv.org/abs/1811.11242 The script requires the 
following external programs to be available: - [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) -- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a - LaTeX installation. - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index b8b31c6..2b6e086 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -50,7 +50,6 @@ class Cropper(object): self, input_file=None, output_file=None, - pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm", ): if not input_file is None: @@ -59,7 +58,6 @@ class Cropper(object): if not output_file is None: self.output_file = os.path.abspath(output_file) - self.pdfcrop_path = pdfcrop_path self.pdftoppm_path = pdftoppm_path self.writer = PyPDF2.PdfFileWriter() @@ -113,7 +111,7 @@ class Cropper(object): def get_raw_bbox(self, filename, resolution=72): """Get the basic bounding box of a pdf file""" - # We try to use pdftoppm, but if it's not available or fails, we + # We try to use pdftoppm, but if it's not available or fails, we # default to pdfplumber. 
try: bbox = self.get_raw_bbox_pdftoppm(filename, resolution=resolution) diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index fae9581..4c695c6 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -19,7 +19,7 @@ from .log import Logger logger = Logger() -def crop_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): +def crop_pdf(filepath, pdftoppm_path="pdftoppm"): """Crop the pdf file using Cropper """ logger.info("Cropping pdf file") @@ -28,7 +28,6 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): cropper = Cropper( filepath, cropped_file, - pdfcrop_path=pdfcrop_path, pdftoppm_path=pdftoppm_path, ) status = cropper.crop(margins=15) @@ -44,7 +43,7 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): return cropped_file -def center_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): +def center_pdf(filepath, pdftoppm_path="pdftoppm"): """Center the pdf file on the reMarkable """ logger.info("Centering pdf file") @@ -53,7 +52,6 @@ def center_pdf(filepath, pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm"): cropper = Cropper( filepath, centered_file, - pdfcrop_path=pdfcrop_path, pdftoppm_path=pdftoppm_path, ) status = cropper.center() diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 4191d35..bf8cdf5 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -39,7 +39,6 @@ class Provider(metaclass=abc.ABCMeta): blank=False, remarkable_dir="/", rmapi_path="rmapi", - pdfcrop_path="pdfcrop", pdftoppm_path="pdftoppm", pdftk_path="pdftk", gs_path="gs", @@ -49,7 +48,6 @@ class Provider(metaclass=abc.ABCMeta): self.debug = debug self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path - self.pdfcrop_path = pdfcrop_path self.pdftoppm_path = pdftoppm_path self.pdftk_path = pdftk_path self.gs_path = gs_path @@ -85,12 +83,10 @@ class Provider(metaclass=abc.ABCMeta): # 
Wrappers for pdf operations that have additional arguments def crop_pdf(self, filepath): - return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path, - pdftoppm_path=self.pdftoppm_path) + return crop_pdf(filepath, pdftoppm_path=self.pdftoppm_path) def center_pdf(self, filepath): - return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path, - pdftoppm_path=self.pdftoppm_path) + return center_pdf(filepath, pdftoppm_path=self.pdftoppm_path) def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 6b25aaf..2303603 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -70,11 +70,6 @@ def parse_args(): parser.add_argument( "--gs", help="path to gs executable (default: gs)", default="gs" ) - parser.add_argument( - "--pdfcrop", - help="path to pdfcrop executable (default: pdfcrop)", - default="pdfcrop", - ) parser.add_argument( "--pdftoppm", help="path to pdftoppm executable (default: pdftoppm)", @@ -138,7 +133,6 @@ def main(): blank=args.blank, remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, - pdfcrop_path=args.pdfcrop, pdftoppm_path=args.pdftoppm, pdftk_path=args.pdftk, gs_path=args.gs, -- cgit v1.2.3 From 6f12b10d064d7a938197ce0c4942c4f7136840e2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 13:26:54 +0000 Subject: Add pdftoppm to readme --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 4f3a93e..dd11653 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,9 @@ The script requires the following external programs to be available: - [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) +- [pdftoppm](https://linux.die.net/man/1/pdftoppm) Optional, but recommended + for speed. Usually part of a [Poppler](https://poppler.freedesktop.org/) + installation. 
If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. Then, you can install -- cgit v1.2.3 From 8f8e7f9cb74e78f9cfd328ef6888e811faebbed8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 13:50:25 +0000 Subject: Add poppler-utils to Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 38db46b..86743a2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ RUN apt-get update \ libmagickwand-dev \ pdftk \ ghostscript \ - texlive-extra-utils # contains pdfcrop + poppler-utils RUN pip install --no-cache-dir paper2remarkable -- cgit v1.2.3 From 213c2f3f001a923927b92a13726f97824dba7cfb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 13:53:55 +0000 Subject: Remove texlive from travis --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5551597..f412f9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,10 +6,9 @@ python: before_install: - sudo apt-get update - - sudo apt-get install ghostscript pdftk texlive-extra-utils poppler-utils + - sudo apt-get install ghostscript pdftk poppler-utils install: - - pip install six - pip install -e .[dev] script: -- cgit v1.2.3 From fe54ffe6eb2aece346f6727aedd8edecf6392d77 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 14:26:41 +0000 Subject: update readme --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index dfaae7d..aa884eb 100644 --- a/README.md +++ b/README.md @@ -63,12 +63,12 @@ Optionally, you can: Here's the full help of the script: ```text -usage: p2r [-h] [-t] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] [-V] - [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK] - [--rmapi RMAPI] +usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] [-V] + [--filename FILENAME] 
[--gs GS] [--pdftoppm PDFTOPPM] + [--pdftk PDFTK] [--rmapi RMAPI] input -Paper2reMarkable version 0.5.1 +Paper2reMarkable version 0.5.3 positional arguments: input URL to a paper or the path of a local PDF file @@ -87,7 +87,7 @@ optional arguments: -V, --version Show version and exit --filename FILENAME Filename to use for the file on reMarkable --gs GS path to gs executable (default: gs) - --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) + --pdftoppm PDFTOPPM path to pdftoppm executable (default: pdftoppm) --pdftk PDFTK path to pdftk executable (default: pdftk) --rmapi RMAPI path to rmapi executable (default: rmapi) ``` -- cgit v1.2.3 From 72eabc0df472c3b886ec0a4a5987ccdddf7dd4cc Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 22 Feb 2020 14:32:57 +0000 Subject: Bump version and update changelog --- CHANGELOG.md | 8 ++++++++ paper2remarkable/__version__.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b273ac4..8147f83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## Version 0.5.3 + +* Significantly speed up the program + ([#26](https://github.com/GjjvdBurg/paper2remarkable/issues/26)) +* Add provider for JMLR + ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/28)). +* Bugfix for creating nested directories with ``-p`` option. 
+ ## Version 0.5.2 * Add provider for US National Bureau of Economic Research diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 3fec55b..da6d107 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 2) +VERSION = (0, 5, 3) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 1b2d8df3a04e4b2caa46c46d98bcc96af4ae4449 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 23 Feb 2020 10:43:12 +0000 Subject: Add a no-crop option --- paper2remarkable/providers/_base.py | 8 ++++++-- paper2remarkable/ui.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index bf8cdf5..0eda537 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -37,6 +37,7 @@ class Provider(metaclass=abc.ABCMeta): debug=False, center=False, blank=False, + no_crop=False, remarkable_dir="/", rmapi_path="rmapi", pdftoppm_path="pdftoppm", @@ -62,9 +63,12 @@ class Provider(metaclass=abc.ABCMeta): logger.disable() # Define the operations to run on the pdf. Providers can add others. 
- self.operations = [("crop", self.crop_pdf)] - if center: + if no_crop: + self.operations = [] + elif center: self.operations = [("center", self.center_pdf)] + else: + self.operations = [("crop", self.crop_pdf)] if blank: self.operations.append(("blank", blank_pdf)) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 2303603..0c8ea91 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -52,6 +52,12 @@ def parse_args(): dest="remarkable_dir", default="/", ) + parser.add_argument( + '-k', + '--no-crop', + help="Don't crop the pdf file", + action="store_true") + parser.add_argument( "-v", "--verbose", help="be verbose", action="store_true" ) @@ -108,6 +114,9 @@ def main(): args = parse_args() cookiejar = None + if args.center and args.no_crop: + exception("Can't center and not crop at the same time!") + if LocalFile.validate(args.input): # input is a local file provider = LocalFile @@ -131,6 +140,7 @@ def main(): debug=args.debug, center=args.center, blank=args.blank, + no_crop=args.no_crop, remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, pdftoppm_path=args.pdftoppm, -- cgit v1.2.3 From 64353e7d6cb2f97dcc10b0a73307924ecbe92b50 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 23 Feb 2020 15:50:58 +0000 Subject: Add option to right-align a file --- paper2remarkable/crop.py | 58 ++++++++++++++++++++++++++++++++----- paper2remarkable/pdf_ops.py | 32 +++++++++++++------- paper2remarkable/providers/_base.py | 8 ++++- paper2remarkable/ui.py | 10 +++++++ 4 files changed, 90 insertions(+), 18 deletions(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 2b6e086..02c6757 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -47,10 +47,7 @@ def find_offset_byte_line(line): class Cropper(object): def __init__( - self, - input_file=None, - output_file=None, - pdftoppm_path="pdftoppm", + self, input_file=None, output_file=None, pdftoppm_path="pdftoppm", ): if not input_file is None: 
self.input_file = os.path.abspath(input_file) @@ -67,6 +64,9 @@ class Cropper(object): def center(self, padding=15): return self.process_file(self.center_page, padding=padding) + def right(self, padding=15): + return self.process_file(self.right_page, padding=padding) + def process_file(self, page_func, *args, **kwargs): n = self.reader.getNumPages() for page_idx in range(n): @@ -81,13 +81,18 @@ class Cropper(object): logger.info("Processing pages ... (%i/%i)" % (n, n)) return 0 + def crop_page(self, page_idx, margins): + return self.process_page(page_idx, self.get_bbox, margins=margins) + def center_page(self, page_idx, padding): return self.process_page( page_idx, self.get_center_bbox, padding=padding ) - def crop_page(self, page_idx, margins): - return self.process_page(page_idx, self.get_bbox, margins=margins) + def right_page(self, page_idx, padding): + return self.process_page( + page_idx, self.get_right_bbox, padding=padding + ) def export_page(self, page_idx): """Helper function that exports a single page given by index """ @@ -216,7 +221,9 @@ class Cropper(object): ) left -= margins[0] + left = max(left, 0) top -= margins[1] + top = max(top, 0) right -= margins[2] bottom -= margins[3] @@ -252,7 +259,7 @@ class Cropper(object): # if the document is wider than the remarkable, we add top-padding to # center it, otherwise we add left-padding - x, y = 0, 0 + x = y = 0 if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 else: @@ -260,3 +267,40 @@ class Cropper(object): margins = [padding + x, padding + y, padding, padding] return self.get_bbox(filename, margins=margins) + + def get_right_bbox(self, filename, padding=15): + """Get the bounding box that ensures the menu doesn't hide the text + """ + + bbox = self.get_bbox(filename, margins=0) + + h = bbox[3] - bbox[1] + w = bbox[2] - bbox[0] + + # Note, the menu width is about 12mm and the entire screen is about + # 156mm. 
This informs the width of the left padding we'll add. + menu_width = 12 / 156 * RM_WIDTH + + H = RM_HEIGHT + W = RM_WIDTH + + # TODO: This math is approximate. The goal is to get the page centered + # in the remaining space after taking the menu width into account, + # while also providing equal padding at the top and bottom. This seems + # to give too much padding on the left for some pages, but I'm not sure + # why. Pull requests welcome! + rho_rm = H / (W - menu_width) + rho_page = (h + 2 * padding) / (w + 2 * padding) + x = y = 0 + if rho_rm < rho_page: + x = -w - 2 * padding + (h + 2 * padding) * (W - menu_width) / H + elif rho_rm > rho_page: + y = -h - 2 * padding + H * (w + 2 * padding) / (W - menu_width) + + margins = [ + menu_width + x + padding, + padding + y, + padding, + padding, + ] + return self.get_bbox(filename, margins=margins) diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index 4c695c6..c7561e3 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -25,11 +25,7 @@ def crop_pdf(filepath, pdftoppm_path="pdftoppm"): logger.info("Cropping pdf file") cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - cropper = Cropper( - filepath, - cropped_file, - pdftoppm_path=pdftoppm_path, - ) + cropper = Cropper(filepath, cropped_file, pdftoppm_path=pdftoppm_path,) status = cropper.crop(margins=15) if not status == 0: @@ -49,11 +45,7 @@ def center_pdf(filepath, pdftoppm_path="pdftoppm"): logger.info("Centering pdf file") centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - cropper = Cropper( - filepath, - centered_file, - pdftoppm_path=pdftoppm_path, - ) + cropper = Cropper(filepath, centered_file, pdftoppm_path=pdftoppm_path,) status = cropper.center() if not status == 0: @@ -67,6 +59,26 @@ def center_pdf(filepath, pdftoppm_path="pdftoppm"): return centered_file +def right_pdf(filepath, pdftoppm_path="pdftoppm"): + """Right-align the pdf file on the reMarkable + """ + logger.info("Right-aligning 
pdf file") + righted_file = os.path.splitext(filepath)[0] + "-right.pdf" + + cropper = Cropper(filepath, righted_file, pdftoppm_path=pdftoppm_path) + status = cropper.right() + + if not status == 0: + logger.warning("Failed to right-align the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(righted_file): + logger.warning( + "Can't find right-aligned file '%s' where expected" % righted_file + ) + return filepath + return righted_file + + def blank_pdf(filepath): """Add blank pages to PDF """ diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index bf8cdf5..b47fbae 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -15,7 +15,7 @@ import tempfile import time from ._info import Informer -from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf +from ..pdf_ops import crop_pdf, center_pdf, right_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, download_url, @@ -36,6 +36,7 @@ class Provider(metaclass=abc.ABCMeta): upload=True, debug=False, center=False, + right=False, blank=False, remarkable_dir="/", rmapi_path="rmapi", @@ -65,6 +66,8 @@ class Provider(metaclass=abc.ABCMeta): self.operations = [("crop", self.crop_pdf)] if center: self.operations = [("center", self.center_pdf)] + if right: + self.operations = [("right", self.right_pdf)] if blank: self.operations.append(("blank", blank_pdf)) @@ -88,6 +91,9 @@ class Provider(metaclass=abc.ABCMeta): def center_pdf(self, filepath): return center_pdf(filepath, pdftoppm_path=self.pdftoppm_path) + def right_pdf(self, filepath): + return right_pdf(filepath, pdftoppm_path=self.pdftoppm_path) + def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 2303603..835f044 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -52,6 +52,12 @@ def parse_args(): dest="remarkable_dir", default="/", ) + 
parser.add_argument( + "-r", + "--right", + help="Right align so the menu doesn't cover it", + action="store_true", + ) parser.add_argument( "-v", "--verbose", help="be verbose", action="store_true" ) @@ -108,6 +114,9 @@ def main(): args = parse_args() cookiejar = None + if args.center and args.right: + exception("Can't center and right align at the same time!") + if LocalFile.validate(args.input): # input is a local file provider = LocalFile @@ -130,6 +139,7 @@ def main(): upload=not args.no_upload, debug=args.debug, center=args.center, + right=args.right, blank=args.blank, remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, -- cgit v1.2.3 From ec402f2e0e930ac35ca6ca83f326ffcf675ec1a7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 23 Feb 2020 15:51:16 +0000 Subject: Bugfix for jmlr validation --- paper2remarkable/providers/jmlr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/jmlr.py b/paper2remarkable/providers/jmlr.py index 3634b4f..8b121cb 100644 --- a/paper2remarkable/providers/jmlr.py +++ b/paper2remarkable/providers/jmlr.py @@ -67,4 +67,9 @@ class JMLR(Provider): return abs_url, pdf_url def validate(src): - return re.match(JMLR.re_abs, src) or re.match(JMLR.re_pdf, src) + return ( + re.match(JMLR.re_abs_1, src) + or re.match(JMLR.re_abs_2, src) + or re.match(JMLR.re_pdf_1, src) + or re.match(JMLR.re_pdf_2, src) + ) -- cgit v1.2.3 From 6c2ccf8be3221ae0a31cb16fb059e74fef64076c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 23 Feb 2020 16:00:09 +0000 Subject: Fix typo --- paper2remarkable/ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index a8218ec..12443d4 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -127,7 +127,7 @@ def main(): if args.center and args.no_crop: exception("Can't center and not crop at the same time!") - if args.right_align and args.no_crop: + if args.right and 
args.no_crop: exception("Can't right align and not crop at the same time!") if LocalFile.validate(args.input): -- cgit v1.2.3 From 1bdf321d81a733a4ae9547f655501ddf92de0343 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 23 Feb 2020 16:10:47 +0000 Subject: Bump version and update changelog and readme --- CHANGELOG.md | 8 ++++++++ README.md | 6 ++++-- paper2remarkable/__version__.py | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8147f83..fc9d5fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## Version 0.5.4 + +* Add the option to not crop the file at all + ([#30](https://github.com/GjjvdBurg/paper2remarkable/pull/30)). +* Add the option to right-align the file so the menu doesn't overlap + ([#31](https://github.com/GjjvdBurg/paper2remarkable/pull/31)). +* Bugfix for validation for the JMLR provider + ## Version 0.5.3 * Significantly speed up the program diff --git a/README.md b/README.md index aa884eb..18753af 100644 --- a/README.md +++ b/README.md @@ -63,12 +63,12 @@ Optionally, you can: Here's the full help of the script: ```text -usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] [-V] +usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] [--pdftk PDFTK] [--rmapi RMAPI] input -Paper2reMarkable version 0.5.3 +Paper2reMarkable version 0.5.4 positional arguments: input URL to a paper or the path of a local PDF file @@ -83,6 +83,8 @@ optional arguments: -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR directory on reMarkable to put the file (created if missing, default: /) + -r, --right Right align so the menu doesn't cover it + -k, --no-crop Don't crop the pdf file -v, --verbose be verbose -V, --version Show version and exit --filename FILENAME Filename to use for the file on reMarkable diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index
da6d107..28ed0b8 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 3) +VERSION = (0, 5, 4) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 780411e46e07168cc2321e7c62eef244420fccc6 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 23 Feb 2020 16:38:25 +0000 Subject: Merge pdf cropping functions --- paper2remarkable/pdf_ops.py | 72 +++++++++---------------------------- paper2remarkable/providers/_base.py | 10 +++--- 2 files changed, 22 insertions(+), 60 deletions(-) diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index c7561e3..5e7e111 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -19,64 +19,24 @@ from .log import Logger logger = Logger() -def crop_pdf(filepath, pdftoppm_path="pdftoppm"): - """Crop the pdf file using Cropper - """ - logger.info("Cropping pdf file") - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - - cropper = Cropper(filepath, cropped_file, pdftoppm_path=pdftoppm_path,) - status = cropper.crop(margins=15) - - if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." % cropped_file - ) - return filepath - return cropped_file - - -def center_pdf(filepath, pdftoppm_path="pdftoppm"): - """Center the pdf file on the reMarkable - """ - logger.info("Centering pdf file") - centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - - cropper = Cropper(filepath, centered_file, pdftoppm_path=pdftoppm_path,) - status = cropper.center() - - if not status == 0: - logger.warning("Failed to center the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(centered_file): - logger.warning( - "Can't find centered file '%s' where expected." 
% centered_file - ) - return filepath - return centered_file - - -def right_pdf(filepath, pdftoppm_path="pdftoppm"): - """Right-align the pdf file on the reMarkable - """ - logger.info("Right-aligning pdf file") - righted_file = os.path.splitext(filepath)[0] + "-right.pdf" - - cropper = Cropper(filepath, righted_file, pdftoppm_path=pdftoppm_path) - status = cropper.right() - - if not status == 0: - logger.warning("Failed to right-align the pdf file at: %s" % filepath) +def prepare_pdf(filepath, operation, pdftoppm_path="pdftoppm"): + """Prepare pdf by cropping, centering, or right-aligning the file""" + logger.info("Preparing PDF using %s operation" % operation) + prepared_file = os.path.splitext(filepath)[0] + "-prep.pdf" + cropper = Cropper(filepath, prepared_file, pdftoppm_path=pdftoppm_path) + if operation == "crop": + status = cropper.crop(margins=15) + elif operation == "center": + status = cropper.center() + elif operation == "right": + status = cropper.right() + else: + logger.warning("Unknown operation: %s" % operation) return filepath - if not os.path.exists(righted_file): - logger.warning( - "Can't find right-aligned file '%s' where expected" % righted_file - ) + if not status == 0 or not os.path.exists(prepared_file): + logger.warning("PDF prepare operation failed") return filepath - return righted_file + return prepared_file def blank_pdf(filepath): diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index c68caab..f8b895b 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -15,7 +15,7 @@ import tempfile import time from ._info import Informer -from ..pdf_ops import crop_pdf, center_pdf, right_pdf, blank_pdf, shrink_pdf +from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, download_url, @@ -90,13 +90,15 @@ class Provider(metaclass=abc.ABCMeta): # Wrappers for pdf operations that have additional arguments def crop_pdf(self,
filepath): - return crop_pdf(filepath, pdftoppm_path=self.pdftoppm_path) + return prepare_pdf(filepath, "crop", pdftoppm_path=self.pdftoppm_path) def center_pdf(self, filepath): - return center_pdf(filepath, pdftoppm_path=self.pdftoppm_path) + return prepare_pdf( + filepath, "center", pdftoppm_path=self.pdftoppm_path + ) def right_pdf(self, filepath): - return right_pdf(filepath, pdftoppm_path=self.pdftoppm_path) + return prepare_pdf(filepath, "right", pdftoppm_path=self.pdftoppm_path) def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) -- cgit v1.2.3 From 2dd984eb80bff1a6828a197e9d225c4676e1d432 Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Sun, 23 Feb 2020 14:08:52 -0500 Subject: alfred: add alfred workflow --- .github/alfred.png | Bin 0 -> 135792 bytes README.md | 20 ++++++++++++++++++-- Remarkable.alfredworkflow | Bin 0 -> 30549 bytes 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 .github/alfred.png create mode 100644 Remarkable.alfredworkflow diff --git a/.github/alfred.png b/.github/alfred.png new file mode 100644 index 0000000..78a95d9 Binary files /dev/null and b/.github/alfred.png differ diff --git a/README.md b/README.md index 18753af..c2b3447 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,9 @@ There is also support for transferring an article from a website: $ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines ``` -The script can be run through the ``p2r`` command line program or via Docker -(see below). +The script can be run through the ``p2r`` command line program or via Docker +(see below). If you're using MacOS, you might be interested in the [Alfred +workflow](#alfred). ``paper2remarkable`` makes it as easy as possible to get a PDF on your reMarkable from any of the following sources: @@ -134,6 +135,21 @@ pip install paper2remarkable This installs the ``p2r`` command line program. 
+## Alfred + +Install the [Alfred workflow][workflow], which is [a launcher for +MacOS](https://www.alfredapp.com/). + +Once installed, you can use `rm` command and `rmb` (for the `--blank` pages to +insert blank pages between pages for notes) with a URL passed. The global +shortcut `Alt-P` will send the current selection to `p2r`. Note that by default +`--right` is passed and `p2r` is executed in your `bash` environment. You can +edit the Workflow in Alfred if this doesn't work for your setup. + +![Alfred Screenshot](https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/.github/alfred.png) + +[workflow]: https://github.com/GjjvdBurg/paper2remarkable/blob/master/Remarkable.alfredworkflow?raw=true + ## Docker You can also use the Dockerfile to avoid installing dependencies on your diff --git a/Remarkable.alfredworkflow b/Remarkable.alfredworkflow new file mode 100644 index 0000000..6ad331e Binary files /dev/null and b/Remarkable.alfredworkflow differ -- cgit v1.2.3 From 74c0240b7022e199124acd8941d9fa3a1bea228f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 25 Feb 2020 11:41:04 +0000 Subject: Minor readme updates --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c2b3447..fcb1743 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,8 @@ Optionally, you can: - Download a paper but not upload to the reMarkable using the ``-n`` switch. - Insert a blank page after each page using the ``-b`` switch (useful for note taking!) -- Center the pdf on the reMarkable (default is left-aligned) +- Center (``-c``) or right-align (``-r``) the pdf on the reMarkable (default + is left-aligned), or disable cropping altogether (``-k``). 
- Provide an explicit filename using the ``--filename`` parameter - Specify the location on the reMarkable to place the file (default ``/``) @@ -118,12 +119,18 @@ $ p2r -v https://arxiv.org/abs/1811.11242 The script requires the following external programs to be available: -- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) +- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/), or + ``pdftk-java``, whichever your package manager provides. - [GhostScript](https://www.ghostscript.com/) -- [rMAPI](https://github.com/juruen/rmapi) - [pdftoppm](https://linux.die.net/man/1/pdftoppm) Optional, but recommended for speed. Usually part of a [Poppler](https://poppler.freedesktop.org/) installation. +- [rMAPI](https://github.com/juruen/rmapi) + +On Arch, use ``pacman -S pdftk ghostscript poppler``, on Ubuntu try ``apt-get +install pdftk ghostscript poppler-utils``, and on Mac: ``brew install +pdftk-java poppler``. For [rMAPI](https://github.com/juruen/rmapi), use ``go +get -u github.com/juruen/rmapi``. If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. 
Then, you can install -- cgit v1.2.3 From 91cb0d496aadb56a547746cf7ff778a64cac7178 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 26 Feb 2020 16:37:09 +0000 Subject: Only use shrunk file if it is indeed smaller --- paper2remarkable/pdf_ops.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index 5e7e111..41cb85f 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -58,7 +58,8 @@ def blank_pdf(filepath): def shrink_pdf(filepath, gs_path="gs"): """Shrink the PDF file size using Ghostscript """ - logger.info("Shrinking pdf file") + logger.info("Shrinking pdf file ...") + size_before = os.path.getsize(filepath) output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" status = subprocess.call( [ @@ -78,4 +79,8 @@ def shrink_pdf(filepath, gs_path="gs"): if not status == 0: logger.warning("Failed to shrink the pdf file") return filepath + size_after = os.path.getsize(output_file) + if size_after > size_before: + logger.info("Shrinking has no effect for this file, using original.") + return filepath return output_file -- cgit v1.2.3 From c4a52f1297e993390e6ba31180976da8c945db38 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 26 Feb 2020 16:37:36 +0000 Subject: Use copy instead of move to keep intermediates This is mostly useful for debugging and has no effect on the working of the program. 
--- paper2remarkable/providers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index f8b895b..96fb151 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -134,7 +134,7 @@ class Provider(metaclass=abc.ABCMeta): intermediate_fname = tmp_filename for opname, op in self.operations: intermediate_fname = op(intermediate_fname) - shutil.move(intermediate_fname, clean_filename) + shutil.copy(intermediate_fname, clean_filename) if self.debug: print("Paused in debug mode in dir: %s" % working_dir) -- cgit v1.2.3 From 2a2991dd7a9df8d7d1595a4088f49bbde91184c5 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 26 Feb 2020 16:46:15 +0000 Subject: Bump version and update changelog --- CHANGELOG.md | 5 +++++ paper2remarkable/__version__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc9d5fe..0c230a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.5.5 + +* Fix bug for when the shrink operation returns bigger files + ([#33](https://github.com/GjjvdBurg/paper2remarkable/issues/33)). 
+ ## Version 0.5.4 * Add the option to not crop the file at all diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 28ed0b8..b3f9426 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 4) +VERSION = (0, 5, 5) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From c4cd0543d6d88975f006869b379ff2cc23d22fa4 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 3 Mar 2020 11:02:22 +0000 Subject: Add ghostscript to install instructions homebrew --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fcb1743..3d59768 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,9 @@ The script requires the following external programs to be available: On Arch, use ``pacman -S pdftk ghostscript poppler``, on Ubuntu try ``apt-get install pdftk ghostscript poppler-utils``, and on Mac: ``brew install -pdftk-java poppler``. For [rMAPI](https://github.com/juruen/rmapi), use ``go -get -u github.com/juruen/rmapi``. +pdftk-java ghostscript poppler``. For +[rMAPI](https://github.com/juruen/rmapi), use ``go get -u +github.com/juruen/rmapi``. If these scripts are not available on the ``PATH`` variable, you can supply them with the relevant options to the script. 
Then, you can install -- cgit v1.2.3 From 0a72c6b0dcb047ca6bfc11ae876a33f26325a2ef Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 3 Mar 2020 11:02:45 +0000 Subject: Be more robust against missing pdftoppm --- paper2remarkable/crop.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 02c6757..dc4b31c 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -45,6 +45,17 @@ def find_offset_byte_line(line): return off +def check_pdftoppm(pth): + """Check that we can run the provided pdftoppm executable + """ + try: + subprocess.check_output([pth, "-v"], stderr=subprocess.DEVNULL) + except (subprocess.CalledProcessError, FileNotFoundError, PermissionError): + logger.info("pdftoppm not found, using pdfplumber instead (slower)") + return False + return True + + class Cropper(object): def __init__( self, input_file=None, output_file=None, pdftoppm_path="pdftoppm", @@ -55,6 +66,9 @@ class Cropper(object): if not output_file is None: self.output_file = os.path.abspath(output_file) + if pdftoppm_path and not check_pdftoppm(pdftoppm_path): + pdftoppm_path = None + self.pdftoppm_path = pdftoppm_path self.writer = PyPDF2.PdfFileWriter() @@ -116,15 +130,11 @@ class Cropper(object): def get_raw_bbox(self, filename, resolution=72): """Get the basic bounding box of a pdf file""" - # We try to use pdftoppm, but if it's not available or fails, we - # default to pdfplumber. 
- try: - bbox = self.get_raw_bbox_pdftoppm(filename, resolution=resolution) - except subprocess.CalledProcessError: - bbox = self.get_raw_bbox_pdfplumber( - filename, resolution=resolution - ) - return bbox + if self.pdftoppm_path is None: + box = self.get_raw_bbox_pdfplumber(filename, resolution=resolution) + else: + box = self.get_raw_bbox_pdftoppm(filename, resolution=resolution) + return box def get_raw_bbox_pdfplumber(self, filename, resolution=72): """Get the basic bounding box with pdfplumber""" -- cgit v1.2.3 From 1038554d6c812a32648f87521c3567c614a5d9e7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 3 Mar 2020 11:07:12 +0000 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ paper2remarkable/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c230a7..49bb980 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.5.6 + +* Be more robust against missing pdftoppm executable. 
+ ## Version 0.5.5 * Fix bug for when the shrink operation returns bigger files diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index b3f9426..fdbb314 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 5) +VERSION = (0, 5, 6) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 1117b973fe14b018376359176a27a906e76c6391 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 20 Mar 2020 20:09:34 +0000 Subject: add instructions for local file with docker --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 3d59768..2bc4e35 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,9 @@ docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r --help # equivalent to above usage docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r -v https://arxiv.org/abs/1811.11242 + +# to transfer a local file in the current directory +docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" -v "$(pwd):/home/user:r" p2r -v localfile.pdf ``` You can also create an [alias](http://tldp.org/LDP/abs/html/aliases.html) in -- cgit v1.2.3 From a964e42f144951bf752e1f995a6eea0ffc0c7743 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 20 Mar 2020 20:09:45 +0000 Subject: clean makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ffee6c3..bedbaae 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ PACKAGE=paper2remarkable DOC_DIR='./docs/' VENV_DIR=/tmp/p2r_venv/ -.PHONY: help cover dist venv +.PHONY: help dist venv .DEFAULT_GOAL := help -- cgit v1.2.3 From 721cf5ffb2ebac345f405dd2b2e0d38ba8a3e1ae Mon Sep 17 00:00:00 2001 From: Christophe Delaere Date: Wed, 1 Apr 2020 09:13:48 +0200 Subject: replaced pdftk by qpdf --- paper2remarkable/providers/_base.py | 4 ++-- paper2remarkable/providers/arxiv.py | 11 +++++------ paper2remarkable/ui.py | 8 
++++---- paper2remarkable/utils.py | 1 + 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 96fb151..fbe5308 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -42,7 +42,7 @@ class Provider(metaclass=abc.ABCMeta): remarkable_dir="/", rmapi_path="rmapi", pdftoppm_path="pdftoppm", - pdftk_path="pdftk", + qpdf_path="qpdf", gs_path="gs", cookiejar=None, ): @@ -51,7 +51,7 @@ class Provider(metaclass=abc.ABCMeta): self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdftoppm_path = pdftoppm_path - self.pdftk_path = pdftk_path + self.qpdf_path = qpdf_path self.gs_path = gs_path self.informer = Informer() self.cookiejar = cookiejar diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 913e015..06bfdec 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -75,16 +75,15 @@ class Arxiv(Provider): status = subprocess.call( [ - self.pdftk_path, + self.qpdf_path, + "--stream-data=uncompress", input_file, - "output", uncompress_file, - "uncompress", ] ) if not status == 0: raise CalledProcessError( - "pdftk failed to uncompress the PDF file." + "qpdf failed to uncompress the PDF file." 
) with open(uncompress_file, "rb") as fid: @@ -104,9 +103,9 @@ class Arxiv(Provider): output_file = basename + "_dearxiv.pdf" status = subprocess.call( - [self.pdftk_path, removed_file, "output", output_file, "compress"] + [self.qpdf_path, "--stream-data=compress", removed_file, output_file] ) if not status == 0: - raise CalledProcessError("pdftk failed to compress the PDF file.") + raise CalledProcessError("qpdf failed to compress the PDF file.") return output_file diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 12443d4..e17bffb 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -89,9 +89,9 @@ def parse_args(): default="pdftoppm", ) parser.add_argument( - "--pdftk", - help="path to pdftk executable (default: pdftk)", - default="pdftk", + "--qpdf", + help="path to qpdf executable (default: qpdf)", + default="qpdf", ) parser.add_argument( "--rmapi", @@ -158,7 +158,7 @@ def main(): remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, pdftoppm_path=args.pdftoppm, - pdftk_path=args.pdftk, + qpdf_path=args.qpdf, gs_path=args.gs, cookiejar=cookiejar, ) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 592dcd3..f1447d9 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -148,6 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) # Upload the file + logger.info("%s put %s %s/"%(rmapi_path,filepath,remarkable_dir)) status = subprocess.call( [rmapi_path, "put", filepath, remarkable_dir + "/"], stdout=subprocess.DEVNULL, -- cgit v1.2.3 From f0489693569a8ac5ac201034190f71d21c74c1b3 Mon Sep 17 00:00:00 2001 From: Clayton Yochum Date: Thu, 2 Apr 2020 01:15:32 -0400 Subject: Update Dockerfile to use Debian Buster base images. This allows Cairo to be updated, removing a warning from Weasyprint. 
--- Dockerfile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 86743a2..e6fc152 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:stretch AS rmapi +FROM golang:buster AS rmapi ENV GOPATH /go ENV PATH ${GOPATH}/bin:/usr/local/go/bin:$PATH @@ -7,18 +7,21 @@ ENV RMAPIREPO github.com/juruen/rmapi RUN go get -u ${RMAPIREPO} -FROM python:3.7-slim-stretch +FROM python:3.7-slim-buster # rmapi COPY --from=rmapi /go/bin/rmapi /usr/bin/rmapi -# imagemagick, pdftk, ghostscript, pdfcrop +# needed to install openjdk-11-jre-headless +RUN mkdir -p /usr/share/man/man1 + +# imagemagick, pdftk, ghostscript, pdfcrop, weasyprint RUN apt-get update \ && apt-get install --no-install-recommends -y \ libmagickwand-dev \ pdftk \ ghostscript \ - poppler-utils + poppler-utils RUN pip install --no-cache-dir paper2remarkable -- cgit v1.2.3 From 96a1526c8daf51ab151e6ccea325be8694d8f2ef Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 3 Apr 2020 10:52:16 +0100 Subject: fix name in makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bedbaae..769fc87 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ doc: install ## Build documentation with Sphinx cd $(DOC_DIR) && \ rm source/* && \ source $(VENV_DIR)/bin/activate && \ - sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \ + sphinx-apidoc -H 'Paper2Remarkable API Documentation' -o source ../$(PACKAGE) && \ touch source/AUTOGENERATED $(MAKE) -C $(DOC_DIR) html -- cgit v1.2.3 From 726d4c42dde92c67131ee0311e7f965dd2ea13ad Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:48:48 +0100 Subject: Fix the calledprocesserror by not inheriting Turns out this never actually worked as intended. 
--- paper2remarkable/exceptions.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index a608bcc..66a329f 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -90,16 +90,13 @@ class RemarkableError(Error): return msg -class _CalledProcessError(CalledProcessError): - """Exception raised when subprocesses fail. +class _CalledProcessError(Error): + """Exception raised when subprocesses fail. """ - We subclass the CalledProcessError so we can add our custom error message. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, message): + self.message = message def __str__(self): - parent = super().__str__() - msg = parent + GH_MSG + msg = "ERROR: {message}".format(message=self.message) + msg += GH_MSG return msg -- cgit v1.2.3 From 2d5060549eccf173498b1db85788032bb0730e10 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:49:58 +0100 Subject: Add the pdftk path back to the ui --- paper2remarkable/providers/_base.py | 2 ++ paper2remarkable/ui.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index fbe5308..1337201 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -42,6 +42,7 @@ class Provider(metaclass=abc.ABCMeta): remarkable_dir="/", rmapi_path="rmapi", pdftoppm_path="pdftoppm", + pdftk_path="pdftk", qpdf_path="qpdf", gs_path="gs", cookiejar=None, @@ -51,6 +52,7 @@ class Provider(metaclass=abc.ABCMeta): self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdftoppm_path = pdftoppm_path + self.pdftk_path = pdftk_path self.qpdf_path = qpdf_path self.gs_path = gs_path self.informer = Informer() diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index e17bffb..a3bf3c2 100644 --- a/paper2remarkable/ui.py +++ 
b/paper2remarkable/ui.py @@ -88,6 +88,11 @@ def parse_args(): help="path to pdftoppm executable (default: pdftoppm)", default="pdftoppm", ) + parser.add_argument( + "--pdftk", + help="path to pdftk executable (default: pdftk)", + default="pdftk", + ) parser.add_argument( "--qpdf", help="path to qpdf executable (default: qpdf)", @@ -158,6 +163,7 @@ def main(): remarkable_dir=args.remarkable_dir, rmapi_path=args.rmapi, pdftoppm_path=args.pdftoppm, + pdftk_path=args.pdftk, qpdf_path=args.qpdf, gs_path=args.gs, cookiejar=cookiejar, -- cgit v1.2.3 From 8d5ce28ed6a4cf52ae10bf4bed197cd00c529218 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:53:15 +0100 Subject: Enable both pdftk and qpdf This adds a function that checks which pdf tool is available and moves the compress/uncompress code to the base class of the providers for cleaner code. A new exception is added in case neither pdf tool can be found. --- paper2remarkable/exceptions.py | 16 +++++++++++++ paper2remarkable/providers/_base.py | 46 +++++++++++++++++++++++++++++++++++++ paper2remarkable/providers/arxiv.py | 27 ++++------------------ paper2remarkable/utils.py | 22 +++++++++++++++++- 4 files changed, 87 insertions(+), 24 deletions(-) diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 66a329f..5ea9a78 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -100,3 +100,19 @@ class _CalledProcessError(Error): msg = "ERROR: {message}".format(message=self.message) msg += GH_MSG return msg + + +class NoPDFToolError(Error): + """Exception raised when neither pdftk or qpdf is found.""" + + def __init__(self): + pass + + def __str__(self): + msg = ( + "ERROR: Neither pdftk or qpdf could be found. Install " + "either of these or ensure that they can be found using " + "the --pdftk or --qpdf options." 
+ ) + msg += GH_MSG + return msg diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 1337201..0cab6b7 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -11,13 +11,16 @@ Copyright: 2019, G.J.J. van den Burg import abc import os import shutil +import subprocess import tempfile import time +from ..exceptions import _CalledProcessError from ._info import Informer from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, + check_pdftool, download_url, upload_to_remarkable, follow_redirects, @@ -58,6 +61,8 @@ class Provider(metaclass=abc.ABCMeta): self.informer = Informer() self.cookiejar = cookiejar + self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path) + # wait time to not hit the server too frequently self.server_delay = 0 @@ -110,6 +115,47 @@ class Provider(metaclass=abc.ABCMeta): # This must exist so that the LocalFile provider can overwrite it download_url(pdf_url, filename, cookiejar=self.cookiejar) + def compress_pdf(self, in_pdf, out_pdf): + """ Compress a pdf file, returns subprocess status """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "compress"] + ) + elif self.pdftool == "qpdf": + # TODO: the --no-warn option is only needed because when we remove + # the arXiv stamp we don't fix the length of the pdf object. This + # causes qpdf to raise a warning and give a nonzero exit status + # (3). Fixing the pdf object is the right approach, but this does + # work as qpdf fixes the file. + status = subprocess.call( + [ + self.qpdf_path, + "--no-warn", + "--stream-data=compress", + in_pdf, + out_pdf, + ] + ) + if not (status == 0 or status == 3): + raise _CalledProcessError( + "%s failed to compress the PDF file." 
% self.pdftool + ) + + def uncompress_pdf(self, in_pdf, out_pdf): + """ Uncompress a pdf file """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",] + ) + elif self.pdftool == "qpdf": + status = subprocess.call( + [self.qpdf_path, "--stream-data=uncompress", in_pdf, out_pdf,] + ) + if not status == 0: + raise _CalledProcessError( + "%s failed to uncompress the PDF file." % self.pdftool + ) + def run(self, src, filename=None): # follow_redirects here is needed with library use if os.path.exists(src): diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 06bfdec..74043ed 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -10,14 +10,10 @@ Copyright: 2019, G.J.J. van den Burg import os import re -import subprocess from ._info import Informer from ._base import Provider -from ..exceptions import ( - URLResolutionError, - _CalledProcessError as CalledProcessError, -) +from ..exceptions import URLResolutionError from ..log import Logger logger = Logger() @@ -71,20 +67,9 @@ class Arxiv(Provider): """Remove the arXiv timestamp from a pdf""" logger.info("Removing arXiv timestamp") basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - status = subprocess.call( - [ - self.qpdf_path, - "--stream-data=uncompress", - input_file, - uncompress_file, - ] - ) - if not status == 0: - raise CalledProcessError( - "qpdf failed to uncompress the PDF file." 
- ) + uncompress_file = basename + "_uncompress.pdf" + self.uncompress_pdf(input_file, uncompress_file) with open(uncompress_file, "rb") as fid: data = fid.read() @@ -102,10 +87,6 @@ class Arxiv(Provider): oid.write(data) output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.qpdf_path, "--stream-data=compress", removed_file, output_file] - ) - if not status == 0: - raise CalledProcessError("qpdf failed to compress the PDF file.") + self.compress_pdf(removed_file, output_file) return output_file diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index f1447d9..9bfeec6 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -17,7 +17,7 @@ import time import unidecode from .log import Logger -from .exceptions import FileTypeError, RemarkableError +from .exceptions import FileTypeError, RemarkableError, NoPDFToolError HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -166,3 +166,23 @@ def is_url(string): string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None + + +def check_pdftool(pdftk_path, qpdf_path): + """Check whether we have pdftk or qpdf available""" + # set defaults in case either is set to None or something + pdftk_path = pdftk_path or 'false' + qpdf_path = qpdf_path or 'false' + + status = subprocess.call( + [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + if status == 0: + return "pdftk" + status = subprocess.call( + [qpdf_path, '--help'], stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ) + if status == 0: + return "qpdf" + raise NoPDFToolError -- cgit v1.2.3 From 01c294bccd10f8c430e1c959fbb5ebacea8f3c3a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:53:26 +0100 Subject: Add unit test --- .travis.yml | 2 +- tests/test_providers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f412f9b..e2edaaa 100644 --- a/.travis.yml 
+++ b/.travis.yml @@ -6,7 +6,7 @@ python: before_install: - sudo apt-get update - - sudo apt-get install ghostscript pdftk poppler-utils + - sudo apt-get install ghostscript pdftk poppler-utils qpdf install: - pip install -e .[dev] diff --git a/tests/test_providers.py b/tests/test_providers.py index 2bf7507..e539949 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -69,7 +69,8 @@ class TestProviders(unittest.TestCase): shutil.rmtree(self.test_dir) def test_arxiv_1(self): - prov = Arxiv(upload=False, verbose=VERBOSE) + # check with qpdf + prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) -- cgit v1.2.3 From d9a41be6b304b39730839096e1e2ddaff1f379b6 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 14:54:17 +0100 Subject: Code formatting --- paper2remarkable/providers/_base.py | 6 +++--- paper2remarkable/ui.py | 8 ++------ paper2remarkable/utils.py | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 0cab6b7..0374213 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -16,16 +16,16 @@ import tempfile import time from ..exceptions import _CalledProcessError -from ._info import Informer +from ..log import Logger from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, check_pdftool, download_url, - upload_to_remarkable, follow_redirects, + upload_to_remarkable, ) -from ..log import Logger +from ._info import Informer logger = Logger() diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index a3bf3c2..bfb3647 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -59,12 +59,8 @@ def parse_args(): action="store_true", ) parser.add_argument( - 
'-k', - '--no-crop', - help="Don't crop the pdf file", - action="store_true" - ) - + "-k", "--no-crop", help="Don't crop the pdf file", action="store_true" + ) parser.add_argument( "-v", "--verbose", help="be verbose", action="store_true" ) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 9bfeec6..39cf547 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -148,7 +148,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) # Upload the file - logger.info("%s put %s %s/"%(rmapi_path,filepath,remarkable_dir)) + logger.info("%s put %s %s/" % (rmapi_path, filepath, remarkable_dir)) status = subprocess.call( [rmapi_path, "put", filepath, remarkable_dir + "/"], stdout=subprocess.DEVNULL, -- cgit v1.2.3 From 1e04daa6004538f57c4c51d663b4f13daf8af2e7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 15:02:40 +0100 Subject: Remove --no-warn as it's unavailable on old versions (i.e. on travis) --- paper2remarkable/providers/_base.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 0374213..53ad78e 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -122,19 +122,14 @@ class Provider(metaclass=abc.ABCMeta): [self.pdftk_path, in_pdf, "output", out_pdf, "compress"] ) elif self.pdftool == "qpdf": - # TODO: the --no-warn option is only needed because when we remove + # TODO: the status == 3 is only needed because when we remove # the arXiv stamp we don't fix the length of the pdf object. This - # causes qpdf to raise a warning and give a nonzero exit status - # (3). Fixing the pdf object is the right approach, but this does - # work as qpdf fixes the file. + # causes qpdf to raise a warning and give a nonzero exit status. 
+ # Fixing the pdf object is the right approach, but this does + # work as it is since qpdf fixes the file for us. status = subprocess.call( - [ - self.qpdf_path, - "--no-warn", - "--stream-data=compress", - in_pdf, - out_pdf, - ] + [self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,], + stderr=subprocess.DEVNULL, ) if not (status == 0 or status == 3): raise _CalledProcessError( -- cgit v1.2.3 From f24d1b3fdba482e69f7cfc7a6fb7ecabbcba069d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 16:20:15 +0100 Subject: Move arXiv tests to a separate file --- tests/test_arxiv.py | 29 +++++++++++++++++++++++++++++ tests/test_providers.py | 14 -------------- 2 files changed, 29 insertions(+), 14 deletions(-) create mode 100644 tests/test_arxiv.py diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py new file mode 100644 index 0000000..beb9baa --- /dev/null +++ b/tests/test_arxiv.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for arXiv provider + +This file is part of paper2remarkable. + +""" + +import re +import unittest + +from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX + + +class TestArxiv(unittest.TestCase): + def test_text_regex_1(self): + key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_2(self): + key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py index e539949..e0239ed 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,7 +7,6 @@ __author__ = "G.J.J. 
van den Burg" import hashlib import os -import re import shutil import tempfile import unittest @@ -27,7 +26,6 @@ from paper2remarkable.providers import ( PubMed, Springer, ) -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX VERBOSE = False @@ -43,18 +41,6 @@ def md5sum(filename): return hasher.hexdigest() -class TestArxiv(unittest.TestCase): - def test_text_regex_1(self): - key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - def test_text_regex_2(self): - key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - class TestProviders(unittest.TestCase): @classmethod def setUpClass(cls): -- cgit v1.2.3 From d2574dc53761d7e1379ce9569ae24508102a8aea Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 16:58:56 +0100 Subject: Properly update cookiejar This fixes a bug for Springer urls where the cookies wouldn't be properly carried over, resulting in a redirection failure. --- paper2remarkable/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 39cf547..969dc32 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -122,8 +122,10 @@ def follow_redirects(url): if not "Location" in req.headers: break url = req.headers["Location"] - jar = req.cookies + jar.update(req.cookies) it += 1 + if it == 100: + logger.warning("Max redirects reached. There may be a problem.") jar = jar or req.cookies return url, jar -- cgit v1.2.3 From ed9b8252a2361604331f7a275a7625b3de9017ff Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 17:20:47 +0100 Subject: Fix provider selection for redirected urls Some urls, such as the arXiv urls with the : in the identifier, didn't work when using the UI interface because the redirected url wasn't passed to the provider, but the original url was.
This commit fixes that issue and adds unit tests for the provider selection function, hopefully making this more robust in the future. --- paper2remarkable/exceptions.py | 22 +++++ paper2remarkable/ui.py | 71 ++++++++++---- tests/test_ui.py | 203 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 278 insertions(+), 18 deletions(-) create mode 100644 tests/test_ui.py diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 5ea9a78..94e9470 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -116,3 +116,25 @@ class NoPDFToolError(Error): ) msg += GH_MSG return msg + +class UnidentifiedSourceError(Error): + """Exception raised when the input is neither a local file nor a url """ + + def __str__(self): + msg = ( + "ERROR: Couldn't figure out what source you mean. If it's a " + "local file, please make sure it exists." + ) + msg += GH_MSG + return msg + +class InvalidURLError(Error): + """Exception raised when no provider can handle a url source """ + + def __str__(self): + msg = ( + "ERROR: Input URL is not valid, no provider can handle " + "this source." + ) + msg += GH_MSG + return msg diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index bfb3647..bf57552 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,6 +13,7 @@ import sys from . import __version__, GITHUB_URL +from .exceptions import UnidentifiedSourceError, InvalidURLError from .providers import providers, LocalFile from .utils import follow_redirects, is_url @@ -118,9 +119,58 @@ def exception(msg): raise SystemExit(1) +def choose_provider(cli_input): + """Choose the provider to use for the given source + + This function first tries to check if the input is a local file, by + checking if the path exists. Next, it checks if the input is a "valid" url + using a regex test. If it is, the registered provider classes are checked + to see which provider can handle this url. 
+ + Returns + ------- + provider : class + The class of the provider than can handle the source. A subclass of the + Provider abc. + + new_input : str + The updated input to the provider. This only has an effect for the url + providers, where this will be the url after following all redirects. + + cookiejar : dict or requests.RequestsCookieJar + Cookies picked up when following redirects. These are needed for some + providers to ensure later requests have the right cookie settings. + + Raises + ------ + UnidentifiedSourceError + Raised when the input is neither an existing local file nor a valid url + + InvalidURLError + Raised when the input *is* a valid url, but no provider can handle it. + + """ + provider = cookiejar = None + if LocalFile.validate(cli_input): + # input is a local file + new_input = cli_input + provider = LocalFile + elif is_url(cli_input): + # input is a url + new_input, cookiejar = follow_redirects(cli_input) + provider = next((p for p in providers if p.validate(new_input)), None) + else: + # not a proper URL or non-existent file + raise UnidentifiedSourceError + + if provider is None: + raise InvalidURLError + + return provider, new_input, cookiejar + + def main(): args = parse_args() - cookiejar = None if args.center and args.right: exception("Can't center and right align at the same time!") @@ -131,22 +181,7 @@ def main(): if args.right and args.no_crop: exception("Can't right align and not crop at the same time!") - if LocalFile.validate(args.input): - # input is a local file - provider = LocalFile - elif is_url(args.input): - # input is a url - url, cookiejar = follow_redirects(args.input) - provider = next((p for p in providers if p.validate(url)), None) - else: - # not a proper URL or non-existent file - exception( - "Couldn't figure out what source you mean. If it's a " - "local file, make sure it exists." 
- ) - - if provider is None: - exception("Input not valid, no provider can handle this source.") + provider, new_input, cookiejar = choose_provider(args.input) prov = provider( verbose=args.verbose, @@ -165,4 +200,4 @@ def main(): cookiejar=cookiejar, ) - prov.run(args.input, filename=args.filename) + prov.run(new_input, filename=args.filename) diff --git a/tests/test_ui.py b/tests/test_ui.py new file mode 100644 index 0000000..fc362a0 --- /dev/null +++ b/tests/test_ui.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for command line interface + +This file is part of paper2remarkable. + +""" + +import os +import shutil +import tempfile +import unittest + +from paper2remarkable.exceptions import ( + InvalidURLError, + UnidentifiedSourceError, +) +from paper2remarkable.providers import ( + ACM, + Arxiv, + CiteSeerX, + HTML, + JMLR, + LocalFile, + NBER, + NeurIPS, + OpenReview, + PMLR, + PdfUrl, + PubMed, + Springer, +) +from paper2remarkable.ui import choose_provider + + +class TestUI(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_choose_provider_1(self): + tests = [ + ( + Arxiv, + "https://arxiv.org/abs/1811.11242v1", + "https://arxiv.org/abs/1811.11242v1", + ), + ( + Arxiv, + "http://arxiv.org/abs/arXiv:1908.03213", + "https://arxiv.org/abs/1908.03213", + ), + ( + Arxiv, + "https://arxiv.org/abs/math/0309285", + "https://arxiv.org/abs/math/0309285", + ), + ( + Arxiv, + "https://arxiv.org/pdf/physics/0605197v1.pdf", + "https://arxiv.org/pdf/physics/0605197v1.pdf", + ), + ( + PubMed, + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + ), + ( + ACM, + "https://dl.acm.org/citation.cfm?id=3025626", + 
"https://dl.acm.org/doi/10.1145/3025453.3025626", + ), + ( + ACM, + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true", + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true&", + ), + ( + OpenReview, + "http://openreview.net/forum?id=S1x4ghC9tQ", + "https://openreview.net/forum?id=S1x4ghC9tQ", + ), + ( + Springer, + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + ), + ( + PdfUrl, + "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", + "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", + ), + ( + JMLR, + "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + ), + ( + JMLR, + "http://www.jmlr.org/papers/v10/xu09a.html", + "http://www.jmlr.org/papers/v10/xu09a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v97/behrmann19a.html", + "http://proceedings.mlr.press/v97/behrmann19a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/melnyk16.pdf", + "http://proceedings.mlr.press/v48/melnyk16.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/zhangf16.html", + "http://proceedings.mlr.press/v48/zhangf16.html", + ), + ( + NBER, + "https://www.nber.org/papers/w26752", + "https://www.nber.org/papers/w26752", + ), + ( + NBER, + "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/papers/w19152.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + ), + ( + NeurIPS, + 
"https://papers.nips.cc/paper/7796-middle-out-decoding", + "https://papers.nips.cc/paper/7796-middle-out-decoding", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + ), + ( + HTML, + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + ), + ( + HTML, + "https://www.nature.com/articles/d41586-020-00176-4", + "https://www.nature.com/articles/d41586-020-00176-4" + ), + ] + for exp_prov, url, exp_url in tests: + prov, new_url, jar = choose_provider(url) + with self.subTest(url=url): + self.assertEqual(exp_url, new_url) + self.assertEqual(prov, exp_prov) + + def test_choose_provider_2(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + + prov, new_input, jar = choose_provider(local_filename) + self.assertEqual(prov, LocalFile) + self.assertEqual(new_input, local_filename) + self.assertIsNone(jar) + + def test_choose_provider_3(self): + local_filename = "/tmp/abcdef.pdf" + with 
self.assertRaises(UnidentifiedSourceError): + choose_provider(local_filename) + + def test_choose_provider_4(self): + url = "https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/README.md" + with self.assertRaises(InvalidURLError): + choose_provider(url) + + +if __name__ == "__main__": + unittest.main() -- cgit v1.2.3 From ee0ca97a81e2510d9427c65ca092a64618a87f17 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 17:29:52 +0100 Subject: Code formatting with Black --- paper2remarkable/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 969dc32..38c8735 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -173,8 +173,8 @@ def is_url(string): def check_pdftool(pdftk_path, qpdf_path): """Check whether we have pdftk or qpdf available""" # set defaults in case either is set to None or something - pdftk_path = pdftk_path or 'false' - qpdf_path = qpdf_path or 'false' + pdftk_path = pdftk_path or "false" + qpdf_path = qpdf_path or "false" status = subprocess.call( [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL @@ -182,8 +182,9 @@ def check_pdftool(pdftk_path, qpdf_path): if status == 0: return "pdftk" status = subprocess.call( - [qpdf_path, '--help'], stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL + [qpdf_path, "--help"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) if status == 0: return "qpdf" -- cgit v1.2.3 From fdbc77f3832b9071359b8e2e73ea1570ef691718 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 19:54:05 +0100 Subject: Fix the pdf output of dearxiv The dearxiv function removes the arXiv stamp from the pdf, but in the past would return a somewhat broken pdf. When it deleted the stamp the positions of objects and the length of the stream of the arXiv stamp object would be incorrect. This wasn't an issue for pdftk but gives a warning for qpdf. 
Because we don't want to mask qpdf warnings in general (something might actually be wrong), it was desirable to return a valid pdf file from dearxiv. This commit does exactly that. --- paper2remarkable/providers/_base.py | 7 +-- paper2remarkable/providers/arxiv.py | 109 ++++++++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 18 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 53ad78e..20349c2 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -122,16 +122,11 @@ class Provider(metaclass=abc.ABCMeta): [self.pdftk_path, in_pdf, "output", out_pdf, "compress"] ) elif self.pdftool == "qpdf": - # TODO: the status == 3 is only needed because when we remove - # the arXiv stamp we don't fix the length of the pdf object. This - # causes qpdf to raise a warning and give a nonzero exit status. - # Fixing the pdf object is the right approach, but this does - # work as it is since qpdf fixes the file for us. status = subprocess.call( [self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,], stderr=subprocess.DEVNULL, ) - if not (status == 0 or status == 3): + if not status == 0: raise _CalledProcessError( "%s failed to compress the PDF file." 
% self.pdftool ) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 74043ed..7f3d554 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -71,22 +71,107 @@ class Arxiv(Provider): uncompress_file = basename + "_uncompress.pdf" self.uncompress_pdf(input_file, uncompress_file) - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) + new_data = [] + current_obj = [] + replaced_arXiv = False + char_count = skip_n = startxref = 0 + xref = {} + + with open(uncompress_file, "rb") as fp: + for line in fp: + if skip_n: + # Skip a line + skip_n -= 1 + continue + + if line.endswith(b" obj\n"): + # Start a new object. Add it to the current object and + # record its position for the xref table. + current_obj.append(line) + objid = int(line.split(b" ")[0]) + xref[objid] = char_count + elif current_obj and line == b"endobj\n": + # End the current object. If needed, replace the arXiv + # stamp in the block (done only once). Reset current + # object. 
+ current_obj.append(line) + block = b"".join(current_obj) + if not replaced_arXiv: + # remove the text + block, n_subs1 = re.subn( + b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", + b"()Tj", + block, + ) + # remove the url + block, n_subs2 = re.subn( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", + b"", + block, + ) + if n_subs1 or n_subs2: + # fix the length of the object stream + block = fix_stream_length(block) + replaced_arXiv = True + new_data.append(block) + char_count += len(block) + current_obj = [] + elif current_obj: + # If we're recording an object, simply add the line to it + current_obj.append(line) + elif line == b"xref\n": + # We found the xref table, record its position and write it + # out using our updated indices. + startxref = sum(map(len, new_data)) + new_data.append(line) + new_data.append(b"0 %i\n" % (len(xref) + 1)) + new_data.append(b"0000000000 65535 f \n") + for objid in sorted(xref): + new_data.append(b"%010d 00000 n \n" % xref[objid]) + + # skip the appropriate number of lines + skip_n = len(xref) + 2 + elif line == b"startxref\n": + # Write out our recorded startxref position, skip the old + # position. + new_data.append(b"startxref\n%i\n" % startxref) + skip_n = 1 + else: + # Anything else passes through + new_data.append(line) + char_count += len(line) removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) + with open(removed_file, "wb") as fp: + fp.write(b"".join(new_data)) output_file = basename + "_dearxiv.pdf" self.compress_pdf(removed_file, output_file) return output_file + + +def fix_stream_length(block): + # This fixes the stream length of a block, which is needed after we have + # removed the arXiv stamp. 
+ count = 0 + block = block.split(b"\n") + do_count = False + + for line in block: + if line in [b"stream", b"endstream"]: + do_count = not do_count + continue + + if do_count: + # +1 for the newline character + count += len(line) + 1 + + new_block = [] + for line in block: + if b" /Length " in line: + new_block.append(b"<< /Length %i >>" % count) + else: + new_block.append(line) + + return b"\n".join(new_block) -- cgit v1.2.3 From 504180c138172d7c371a7e9ad984915ab048c64f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 22:38:58 +0100 Subject: Replace excepthook for nicer errors I think that generally a traceback is too much info for an end-user, and the error message we provide can get lost in the noise. This commit disables the standard excepthook with its traceback, and replaces it with just the message of the exception. Using the debug flag enables the original traceback. --- paper2remarkable/ui.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index bf57552..2fbf49f 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -169,8 +169,21 @@ def choose_provider(cli_input): return provider, new_input, cookiejar +def set_excepthook(debug): + sys_hook = sys.excepthook + + def exception_handler(exception_type, value, traceback): + if debug: + sys_hook(exception_type, value, traceback) + else: + print(value, file=sys.stderr) + + sys.excepthook = exception_handler + + def main(): args = parse_args() + set_excepthook(args.debug) if args.center and args.right: exception("Can't center and right align at the same time!") -- cgit v1.2.3 From b8452034ed3a503e06e58f524ac322a4ab0203bb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 22:43:11 +0100 Subject: Code formatting --- paper2remarkable/exceptions.py | 21 +++++++++++---------- tests/test_ui.py | 18 +++++++++--------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git 
a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 94e9470..b433ad4 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -6,10 +6,9 @@ from . import GITHUB_URL -from subprocess import CalledProcessError - -GH_MSG = "\n\nIf you think this might be a bug, please raise an issue on GitHub at: {url}".format( - url=GITHUB_URL +GH_MSG = ( + "\n\nIf you think this might be a bug, please raise an issue on " + "GitHub at:\n{url}\n".format(url=GITHUB_URL) ) @@ -117,24 +116,26 @@ class NoPDFToolError(Error): msg += GH_MSG return msg + class UnidentifiedSourceError(Error): """Exception raised when the input is neither a local file nor a url """ def __str__(self): msg = ( - "ERROR: Couldn't figure out what source you mean. If it's a " - "local file, please make sure it exists." - ) + "ERROR: Couldn't figure out what source you mean. If it's a " + "local file, please make sure it exists." + ) msg += GH_MSG return msg + class InvalidURLError(Error): """Exception raised when no provider can handle a url source """ def __str__(self): msg = ( - "ERROR: Input URL is not valid, no provider can handle " - "this source." - ) + "ERROR: Input URL is not valid, no provider can handle " + "this source." 
+ ) msg += GH_MSG return msg diff --git a/tests/test_ui.py b/tests/test_ui.py index fc362a0..11ed87a 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -160,15 +160,15 @@ class TestUI(unittest.TestCase): "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", ), ( - HTML, - "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", - "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" - ), - ( - HTML, - "https://www.nature.com/articles/d41586-020-00176-4", - "https://www.nature.com/articles/d41586-020-00176-4" - ), + HTML, + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + ), + ( + HTML, + "https://www.nature.com/articles/d41586-020-00176-4", + "https://www.nature.com/articles/d41586-020-00176-4", + ), ] for exp_prov, url, exp_url in tests: prov, new_url, jar = choose_provider(url) -- cgit v1.2.3 From c4ff7ee4533e13c0d81a138955d1f7dfe724c2ff Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 23:20:50 +0100 Subject: Bump version and update changelog and readme --- CHANGELOG.md | 19 ++++++++++ README.md | 77 +++++++++++++++++++++++------------------ paper2remarkable/__version__.py | 2 +- 3 files changed, 64 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49bb980..924abb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # Changelog +## Version 0.6.0 + +* The Dockerfile has been updated to use a more recent version of Cairo + ([#35](https://github.com/GjjvdBurg/paper2remarkable/issues/35)). Thanks to + @ClaytonJY. +* We've added support for optionally using qpdf instead of pdftk + ([#36](https://github.com/GjjvdBurg/paper2remarkable/pull/36)). Thanks to + @delaere. 
+* Resolving redirects has been improved, which solves an issue for the + Springer provider + ([#38](https://github.com/GjjvdBurg/paper2remarkable/pull/38)) and an issue + with some arXiv urls + ([#39](https://github.com/GjjvdBurg/paper2remarkable/pull/39)). +* Unit tests were added for the provider selection. +* The code that removes the arXiv stamp has been improved + ([#40](https://github.com/GjjvdBurg/paper2remarkable/pull/40)). +* Tracebacks have been disabled outside of debug mode, showing clearer errors + ([#41](https://github.com/GjjvdBurg/paper2remarkable/pull/41)). + ## Version 0.5.6 * Be more robust against missing pdftoppm executable. diff --git a/README.md b/README.md index 2bc4e35..5de3ba9 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,8 @@ $ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines The script can be run through the ``p2r`` command line program or via Docker (see below). If you're using MacOS, you might be interested in the [Alfred -workflow](#alfred). +workflow](#alfred). On Linux, a background terminal such as +[Guake](http://guake-project.org/) can be very handy. ``paper2remarkable`` makes it as easy as possible to get a PDF on your reMarkable from any of the following sources: @@ -44,7 +45,7 @@ let me know! ``paper2remarkable`` takes the source URL and: -1. Downloads the pdf if necessary +1. Downloads the pdf 2. Removes the arXiv timestamp (for arXiv sources) 3. Crops the pdf to remove unnecessary borders 4. 
Shrinks the pdf file to reduce the filesize @@ -67,10 +68,10 @@ Here's the full help of the script: ```text usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] - [--pdftk PDFTK] [--rmapi RMAPI] + [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.5.4 +Paper2reMarkable version 0.6.0 positional arguments: input URL to a paper or the path of a local PDF file @@ -93,6 +94,7 @@ optional arguments: --gs GS path to gs executable (default: gs) --pdftoppm PDFTOPPM path to pdftoppm executable (default: pdftoppm) --pdftk PDFTK path to pdftk executable (default: pdftk) + --qpdf QPDF path to qpdf executable (default: qpdf) --rmapi RMAPI path to rmapi executable (default: rmapi) ``` @@ -119,40 +121,48 @@ $ p2r -v https://arxiv.org/abs/1811.11242 The script requires the following external programs to be available: -- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/), or - ``pdftk-java``, whichever your package manager provides. +- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/), + [qpdf](http://qpdf.sourceforge.net/), or + [pdftk-java](https://gitlab.com/pdftk-java/pdftk), whichever your package + manager provides. - [GhostScript](https://www.ghostscript.com/) -- [pdftoppm](https://linux.die.net/man/1/pdftoppm) Optional, but recommended - for speed. Usually part of a [Poppler](https://poppler.freedesktop.org/) - installation. - [rMAPI](https://github.com/juruen/rmapi) +- Optional: [pdftoppm](https://linux.die.net/man/1/pdftoppm) (recommended for + speed). Usually part of a [Poppler](https://poppler.freedesktop.org/) + installation. -On Arch, use ``pacman -S pdftk ghostscript poppler``, on Ubuntu try ``apt-get -install pdftk ghostscript poppler-utils``, and on Mac: ``brew install -pdftk-java ghostscript poppler``. For -[rMAPI](https://github.com/juruen/rmapi), use ``go get -u -github.com/juruen/rmapi``. 
- -If these scripts are not available on the ``PATH`` variable, you can supply -them with the relevant options to the script. Then, you can install -``paper2remarkable`` from PyPI: - -``` -pip install paper2remarkable -``` - -This installs the ``p2r`` command line program. +Specifically: + +1. First install [rMAPI](https://github.com/juruen/rmapi), using + ``` + $ go get -u github.com/juruen/rmapi + ``` +2. Then install system dependencies: + - **Arch Linux:** ``pacman -S pdftk ghostscript poppler`` + - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace + ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``. + - **MacOs:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). +3. Finally, install ``paper2remarkable``: + ``` + $ pip install paper2remarkable + ``` + this installs the ``p2r`` command line program. + +If any of the dependencies are not available on the ``PATH`` variable, you can +supply them with the relevant options to the script (for instance ``p2r +--rmapi /path/to/rmapi``). If you run into trouble with the installation, +please let me know! ## Alfred -Install the [Alfred workflow][workflow], which is [a launcher for -MacOS](https://www.alfredapp.com/). +On MacOS, you can optionally install [this Alfred workflow][workflow]. Alfred +is [a launcher for MacOS](https://www.alfredapp.com/). -Once installed, you can use `rm` command and `rmb` (for the `--blank` pages to -insert blank pages between pages for notes) with a URL passed. The global -shortcut `Alt-P` will send the current selection to `p2r`. Note that by default -`--right` is passed and `p2r` is executed in your `bash` environment. You can -edit the Workflow in Alfred if this doesn't work for your setup. +Once installed, you can then use `rm` command and `rmb` (for the `--blank` +pages to insert blank pages between pages for notes) with a URL passed. The +global shortcut `Alt-P` will send the current selection to `p2r`. 
Note that by +default `--right` is passed and `p2r` is executed in your `bash` environment. +You can edit the Workflow in Alfred if this doesn't work for your setup. ![Alfred Screenshot](https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/.github/alfred.png) @@ -160,8 +170,9 @@ edit the Workflow in Alfred if this doesn't work for your setup. ## Docker -You can also use the Dockerfile to avoid installing dependencies on your -machine. You will need `git` and `docker` installed. +If you'd like to avoid installing the dependencies directly on your machine, +you can use the Dockerfile. To make this work you will need ``git`` and +``docker`` installed. First clone this repository with `git clone` and `cd` inside of it, then build the container: diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index fdbb314..ea46c5a 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 5, 6) +VERSION = (0, 6, 0) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 207c94e509b108849fb14b604755a34d6a821a70 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 23:21:09 +0100 Subject: update release script --- make_release.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/make_release.py b/make_release.py index a19b5fd..f335059 100644 --- a/make_release.py +++ b/make_release.py @@ -96,6 +96,12 @@ class UpdateChangelog(Step): self.print_run("vi CHANGELOG.md") +class UpdateReadme(Step): + def action(self, context): + self.instruct(f"Update readme if necessary") + self.print_run("vi README.md") + + class RunTests(Step): def action(self, context): self.instruct("Run the unit tests") @@ -159,7 +165,7 @@ class TestPackage(Step): self.instruct( f"Ensure that the following command gives version {context['version']}" ) - self.print_run(f"p2r -h") + self.print_run(f"p2r -V") class DeactivateVenv(Step): @@ -218,6 +224,7 
@@ def main(): PushToGitHub(), BumpVersionPackage(), UpdateChangelog(), + UpdateReadme(), MakeClean(), RunTests(), MakeDist(), -- cgit v1.2.3 From c68b475ea9c7d11e309d8c997924b9f4f0e44575 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 4 Apr 2020 23:24:30 +0100 Subject: readme spacing --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5de3ba9..9fdda6b 100644 --- a/README.md +++ b/README.md @@ -137,11 +137,13 @@ Specifically: ``` $ go get -u github.com/juruen/rmapi ``` + 2. Then install system dependencies: - **Arch Linux:** ``pacman -S pdftk ghostscript poppler`` - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``. - **MacOs:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). + 3. Finally, install ``paper2remarkable``: ``` $ pip install paper2remarkable -- cgit v1.2.3 From ee2d9a6051f8dce54e7b20dc04f9f9adc4317bb7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 8 Apr 2020 21:15:09 +0100 Subject: Change wd back to initial directory --- paper2remarkable/providers/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 20349c2..c3abe19 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -191,4 +191,5 @@ class Provider(metaclass=abc.ABCMeta): base = os.path.splitext(target_path)[0] target_path = base + "_.pdf" shutil.move(clean_filename, target_path) - return target_path + os.chdir(self.initial_dir) + return target_path -- cgit v1.2.3 From 62d72c8c073376a036df66d872ffd6149374fd7b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 8 Apr 2020 21:15:48 +0100 Subject: Be more robust against spaces in pdf file This caused problems where the arxiv stamp was not removed for some files. 
This commit adds tests for this and fixes the issue. --- paper2remarkable/providers/arxiv.py | 8 ++++---- tests/test_arxiv.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 7f3d554..572c2bf 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -84,13 +84,13 @@ class Arxiv(Provider): skip_n -= 1 continue - if line.endswith(b" obj\n"): + if line.endswith(b" obj\n") or line.endswith(b" obj \n"): # Start a new object. Add it to the current object and # record its position for the xref table. current_obj.append(line) objid = int(line.split(b" ")[0]) xref[objid] = char_count - elif current_obj and line == b"endobj\n": + elif current_obj and line.startswith(b'endobj'): # End the current object. If needed, replace the arXiv # stamp in the block (done only once). Reset current # object. @@ -119,7 +119,7 @@ class Arxiv(Provider): elif current_obj: # If we're recording an object, simply add the line to it current_obj.append(line) - elif line == b"xref\n": + elif line in [b"xref\n", b"endobj xref\n"]: # We found the xref table, record its position and write it # out using our updated indices. startxref = sum(map(len, new_data)) @@ -159,7 +159,7 @@ def fix_stream_length(block): do_count = False for line in block: - if line in [b"stream", b"endstream"]: + if line.strip(b" ") in [b"stream", b"endstream"]: do_count = not do_count continue diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py index beb9baa..08ea2c4 100644 --- a/tests/test_arxiv.py +++ b/tests/test_arxiv.py @@ -7,13 +7,28 @@ This file is part of paper2remarkable. 
""" +import os import re +import shutil +import tempfile import unittest -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX +from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv class TestArxiv(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + def test_text_regex_1(self): key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" m = re.fullmatch(DEARXIV_TEXT_REGEX, key) @@ -24,6 +39,24 @@ class TestArxiv(unittest.TestCase): m = re.fullmatch(DEARXIV_TEXT_REGEX, key) self.assertIsNotNone(m) + def test_stamp_removed_1(self): + url = "https://arxiv.org/pdf/1703.06103.pdf" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data) + + def test_stamp_removed_2(self): + url = "https://arxiv.org/abs/2003.06222" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 90e97824287c192d3d99cae2981cbd35905cb91d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 8 Apr 2020 21:21:52 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 4 ++++ paper2remarkable/__version__.py | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 924abb0..cd62d25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.6.1 + +* Bugfix that makes removing the arXiv stamp more robust. 
+ ## Version 0.6.0 * The Dockerfile has been updated to use a more recent version of Cairo diff --git a/README.md b/README.md index 9fdda6b..3a3819d 100644 --- a/README.md +++ b/README.md @@ -216,6 +216,10 @@ docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r -v https://arxiv.or docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" -v "$(pwd):/home/user:r" p2r -v localfile.pdf ``` +For transferring local files using the Docker image, you may find [this helper +function](https://github.com/GjjvdBurg/paper2remarkable/issues/34#issuecomment-610852258) +useful. + You can also create an [alias](http://tldp.org/LDP/abs/html/aliases.html) in your ``~/.bashrc`` file to abstract away the Docker commands: diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index ea46c5a..98ffeff 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 0) +VERSION = (0, 6, 1) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From ea5dc5be95a1bc8a8c8caf42ecf19b12a01e25ff Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 9 Apr 2020 14:14:13 +0100 Subject: Log whether removing arXiv stamp was successful --- paper2remarkable/log.py | 24 ++++++++++++++++-------- paper2remarkable/providers/arxiv.py | 8 +++++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py index bae1cbf..3a2fcc5 100644 --- a/paper2remarkable/log.py +++ b/paper2remarkable/log.py @@ -38,19 +38,27 @@ class Logger(metaclass=Singleton): def disable(self): self.enabled = False - def _log(self, msg, mode): + def _log(self, msg, mode, end='\n', add_prefix=True): if not self.enabled: return if not mode in ("info", "warn"): raise ValueError("Unknown logging mode: %s" % mode) file = sys.stdout if mode == "info" else sys.stderr - now = datetime.datetime.now() - nowstr = now.strftime("%Y-%m-%d %H:%M:%S") - print("%s - %s - %s" % 
(nowstr, mode.upper(), msg), file=file) + if add_prefix: + now = datetime.datetime.now() + nowstr = now.strftime("%Y-%m-%d %H:%M:%S") + prefix = "%s - %s - " % (nowstr, mode.upper()) + else: + prefix = "" + print("%s%s" % (prefix, msg), end=end, file=file) file.flush() - def info(self, msg): - self._log(msg, "info") + def info(self, msg, end='\n'): + self._log(msg, "info", end=end) + + def warning(self, msg, end='\n'): + self._log(msg, "warn", end=end) + + def append(self, msg, mode, end='\n'): + self._log(msg, mode, end=end, add_prefix=False) - def warning(self, msg): - self._log(msg, "warn") diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 572c2bf..47da448 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -65,7 +65,7 @@ class Arxiv(Provider): def dearxiv(self, input_file): """Remove the arXiv timestamp from a pdf""" - logger.info("Removing arXiv timestamp") + logger.info("Removing arXiv timestamp ... ", end="") basename = os.path.splitext(input_file)[0] uncompress_file = basename + "_uncompress.pdf" @@ -90,13 +90,13 @@ class Arxiv(Provider): current_obj.append(line) objid = int(line.split(b" ")[0]) xref[objid] = char_count - elif current_obj and line.startswith(b'endobj'): + elif current_obj and line.startswith(b"endobj"): # End the current object. If needed, replace the arXiv # stamp in the block (done only once). Reset current # object. 
current_obj.append(line) block = b"".join(current_obj) - if not replaced_arXiv: + if not replaced_arXiv and b"arXivStAmP" in block: # remove the text block, n_subs1 = re.subn( b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", @@ -148,6 +148,8 @@ class Arxiv(Provider): output_file = basename + "_dearxiv.pdf" self.compress_pdf(removed_file, output_file) + logger.append("success" if replaced_arXiv else "failed", "info") + return output_file -- cgit v1.2.3 From 3151b28a062ca8771a80cfb9aaf41d6907034e43 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 9 Apr 2020 14:14:22 +0100 Subject: remove unnecessary logging --- paper2remarkable/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 38c8735..cca904b 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -150,7 +150,6 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) # Upload the file - logger.info("%s put %s %s/" % (rmapi_path, filepath, remarkable_dir)) status = subprocess.call( [rmapi_path, "put", filepath, remarkable_dir + "/"], stdout=subprocess.DEVNULL, -- cgit v1.2.3 From e0aba92623d9961602d37a5e3f6ce01403e3598a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Apr 2020 11:05:19 +0100 Subject: Properly check for the installed pdf tool This fixes #42. 
--- paper2remarkable/utils.py | 14 ++++++++++---- tests/test_utils.py | 21 +++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/test_utils.py diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index cca904b..791e81a 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -175,16 +175,22 @@ def check_pdftool(pdftk_path, qpdf_path): pdftk_path = pdftk_path or "false" qpdf_path = qpdf_path or "false" - status = subprocess.call( - [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL - ) + try: + status = subprocess.call( + [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + except FileNotFoundError: + status = 1 if status == 0: return "pdftk" - status = subprocess.call( + try: + status = subprocess.call( [qpdf_path, "--help"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) + except FileNotFoundError: + status = 1 if status == 0: return "qpdf" raise NoPDFToolError diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..4c122e0 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest + +from paper2remarkable.exceptions import NoPDFToolError +from paper2remarkable.utils import check_pdftool + + +class TestUtils(unittest.TestCase): + def test_check_pdftool(self): + # Needs a system with both pdftk and qpdf available + self.assertEqual(check_pdftool("pdftk", "qpdf"), "pdftk") + self.assertEqual(check_pdftool("pdftk_xyz", "qpdf"), "qpdf") + self.assertEqual(check_pdftool("pdftk", "qpdf_xyz"), "pdftk") + with self.assertRaises(NoPDFToolError): + check_pdftool("pdftk_xyz", "qpdf_xyz") + + +if __name__ == "__main__": + unittest.main() -- cgit v1.2.3 From 3999c7e59dbdf50a35bbf3c6efb9af872befde95 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Apr 2020 11:18:32 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 6 ++++++ 
README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd62d25..8a05abc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Version 0.6.2 + +* Print to log whether removing arXiv stamp was successful. +* Fix bug that failed to correctly detect the pdf tool + ([#42](https://github.com/GjjvdBurg/paper2remarkable/issues/42)). + ## Version 0.6.1 * Bugfix that makes removing the arXiv stamp more robust. diff --git a/README.md b/README.md index 3a3819d..cf82989 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.0 +Paper2reMarkable version 0.6.2 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 98ffeff..68746e4 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 1) +VERSION = (0, 6, 2) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 2cfcf8444a1bab64f8744ac5d24c53a12ec3448d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 16 Apr 2020 10:16:33 +0100 Subject: Allow + in urls --- paper2remarkable/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 791e81a..97ff713 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -163,7 +163,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): def is_url(string): # pattern adapted from CleverCSV - pattern = 
"((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*\-?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:]*)?(\.[a-z]+)?" + pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*\-?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?" string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None -- cgit v1.2.3 From 73903815c774332a8bf41a1d8c8a0c09eaeda5b3 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 25 Apr 2020 14:37:29 +0100 Subject: Update release script --- make_release.py | 56 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/make_release.py b/make_release.py index f335059..b3ea935 100644 --- a/make_release.py +++ b/make_release.py @@ -14,6 +14,7 @@ Date: 2019-07-23 import colorama import os +import sys def colored(msg, color=None, style=None): @@ -51,6 +52,12 @@ def get_package_name(): ) return nameline.split("=")[-1].strip().strip('"') +def get_package_version(): + ctx = {} + with open(f"{pkgname.lower()}/__version__.py", "r") as fp: + exec(fp.read(), ctx) + return ctx['__version__'] + class Step: def pre(self, context): @@ -119,10 +126,7 @@ class BumpVersionPackage(Step): def _get_version(self, context): # Get the version from the version file - about = {} - with open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp: - exec(fp.read(), about) - return about["__version__"] + return get_package_version(context['pkgname']) class MakeClean(Step): @@ -219,30 +223,36 @@ class WaitForRTD(Step): def main(): colorama.init() procedure = [ - GitToMaster(), - GitAdd(), - PushToGitHub(), - BumpVersionPackage(), - UpdateChangelog(), - UpdateReadme(), - MakeClean(), - RunTests(), - MakeDist(), - PushToTestPyPI(), - InstallFromTestPyPI(), - TestPackage(), - 
DeactivateVenv(), - GitAdd(), - PushToPyPI(), - GitTagVersion(), - PushToGitHub(), + ("gittomaster", GitToMaster()), + ("gitadd1", GitAdd()), + ("push1", PushToGitHub()), + ("bumpversion", BumpVersionPackage()), + ("changelog", UpdateChangelog()), + ("readme", UpdateReadme()), + ("clean", MakeClean()), + ("tests", RunTests()), + ("dist", MakeDist()), + ("testpypi", PushToTestPyPI()), + ("install", InstallFromTestPyPI()), + ("testpkg", TestPackage()), + ("deactivate", DeactivateVenv()), + ("gitadd2", GitAdd()), + ("pypi", PushToPyPI()), + ("tag", GitTagVersion()), + ("push2", PushToGitHub()), ] context = {} context["pkgname"] = get_package_name() - for step in procedure: + context["version"] = get_package_version(context["pkgname"]) + skip = True if target else False + for name, step in procedure: + if not name == target and skip: + continue + skip = False step.run(context) cprint("\nDone!", color="yellow", style="bright") if __name__ == "__main__": - main() + target = sys.argv[1] if len(sys.argv) > 1 else None + main(target=target) -- cgit v1.2.3 From cfa9f9170a9cdfad5dbe88f6c76d2f939953724b Mon Sep 17 00:00:00 2001 From: Simon Eskildsen Date: Mon, 27 Apr 2020 08:34:30 -0400 Subject: readme: add printing section --- README.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cf82989..c74ffe4 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ $ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines The script can be run through the ``p2r`` command line program or via Docker (see below). If you're using MacOS, you might be interested in the [Alfred -workflow](#alfred). On Linux, a background terminal such as -[Guake](http://guake-project.org/) can be very handy. +workflow](#alfred) or [Printing to p2r](#printing). On Linux, a background +terminal such as [Guake](http://guake-project.org/) can be very handy. 
``paper2remarkable`` makes it as easy as possible to get a PDF on your reMarkable from any of the following sources: @@ -170,6 +170,23 @@ You can edit the Workflow in Alfred if this doesn't work for your setup. [workflow]: https://github.com/GjjvdBurg/paper2remarkable/blob/master/Remarkable.alfredworkflow?raw=true +## Printing + +Printing to `p2r` allows printing prompts to save directly to your reMarkable +tablet, passing through `p2r` for processing. + +For MacOS, you can follow [the guide][print-guide] for printing with `rmapi`, +but for the bash script, instead use this script: + +``` +for f in "$@" +do + bash -c -l "p2r --right '$f'" +done +``` + +[print-guide]: https://github.com/juruen/rmapi/blob/master/docs/tutorial-print-macosx.md + ## Docker If you'd like to avoid installing the dependencies directly on your machine, -- cgit v1.2.3 From 71fb1f0e8433705e891aa4b6e176e26b62e5a9bb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 14:14:25 +0100 Subject: Properly resolve image urls (fixes #45) --- paper2remarkable/providers/html.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index d0d55f4..bbafe10 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -56,6 +56,26 @@ def my_fetcher(url): return weasyprint.default_url_fetcher(url) +class ImgProcessor(markdown.treeprocessors.Treeprocessor): + def __init__(self, base_url, *args, **kwargs): + self._base_url = base_url + super().__init__(*args, **kwargs) + + def _find_img(self, node): + """ Find img nodes recursively """ + for img in node.findall("img"): + yield img + for child in node: + yield from self._find_img(child) + + def run(self, root): + """ Ensure all img src urls are absolute """ + for img in self._find_img(root): + img.attrib["src"] = urllib.parse.urljoin( + self._base_url, img.attrib["src"] + ) + + class 
HTMLInformer(Informer): def __init__(self): super().__init__() @@ -105,15 +125,10 @@ class HTML(Provider): # Add the title back to the document article = "# {title}\n\n{text}".format(title=title, text=text) - # fix relative urls - base_url = "{0.scheme}://{0.netloc}".format( - urllib.parse.urlsplit(pdf_url) - ) - html_article = markdown.markdown(article) - html_article = html_article.replace(' src="//', ' src="https://') - html_article = html_article.replace( - ' src="/', ' src="{base}/'.format(base=base_url) - ) + # Convert to html, fixing relative image urls. + md = markdown.Markdown() + md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10) + html_article = md.convert(article) if self.debug: with open("./paper.html", "w") as fp: -- cgit v1.2.3 From 944dcb1827905c2b87b6b062e33e66841c434747 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 17:17:09 +0100 Subject: minor fixes to release script --- make_release.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/make_release.py b/make_release.py index b3ea935..cb4af59 100644 --- a/make_release.py +++ b/make_release.py @@ -52,7 +52,7 @@ def get_package_name(): ) return nameline.split("=")[-1].strip().strip('"') -def get_package_version(): +def get_package_version(pkgname): ctx = {} with open(f"{pkgname.lower()}/__version__.py", "r") as fp: exec(fp.read(), ctx) @@ -220,7 +220,7 @@ class WaitForRTD(Step): ) -def main(): +def main(target=None): colorama.init() procedure = [ ("gittomaster", GitToMaster()), -- cgit v1.2.3 From 3224b3857cc2f11226043ced1da586756403cbb1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 17:26:42 +0100 Subject: Use builtin iter() function to find img elements --- paper2remarkable/providers/html.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index bbafe10..ba250e7 100644 --- a/paper2remarkable/providers/html.py +++ 
b/paper2remarkable/providers/html.py @@ -61,16 +61,9 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor): self._base_url = base_url super().__init__(*args, **kwargs) - def _find_img(self, node): - """ Find img nodes recursively """ - for img in node.findall("img"): - yield img - for child in node: - yield from self._find_img(child) - def run(self, root): """ Ensure all img src urls are absolute """ - for img in self._find_img(root): + for img in root.iter("img"): img.attrib["src"] = urllib.parse.urljoin( self._base_url, img.attrib["src"] ) -- cgit v1.2.3 From fb825cab2e4681a6e6cae8cf32adeeb880a4910c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 17:33:13 +0100 Subject: Add unit test for this bug --- tests/test_providers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index e0239ed..0787792 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,6 +7,7 @@ __author__ = "G.J.J. 
van den Burg" import hashlib import os +import pdfplumber import shutil import tempfile import unittest @@ -238,6 +239,15 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_html_3(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://conclave-team.github.io/conclave-site/" + exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + # this is a proxy test to check that all images are included + self.assertEqual(32, len(pdfplumber.open(filename).pages)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 2ac1948030b549c9b93ede4fa390a3b7b3411a9f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 17:37:47 +0100 Subject: code formatting --- paper2remarkable/providers/html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index d0d55f4..6e08f1c 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -48,7 +48,7 @@ code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; } """ -def my_fetcher(url): +def url_fetcher(url): if url.startswith("//"): url = "https:" + url elif url.startswith("file:///"): @@ -120,7 +120,7 @@ class HTML(Provider): fp.write(html_article) font_config = weasyprint.fonts.FontConfiguration() - html = weasyprint.HTML(string=html_article, url_fetcher=my_fetcher) + html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher) css = weasyprint.CSS(string=CSS, font_config=font_config) html.write_pdf(filename, stylesheets=[css], font_config=font_config) -- cgit v1.2.3 From 26ea8d0691b9574561a1afb519956c2b0c6513da Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 17:48:53 +0100 Subject: upgrade travis distribution --- .travis.yml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e2edaaa..8399160 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -dist: trusty +dist: xenial language: python python: -- cgit v1.2.3 From 8e720efe6604306e32875912ca5b1ff5d0fd891c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 18:20:34 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 7 +++++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a05abc..b727cb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## Version 0.6.3 + +* Properly resolve image urls in HTML sources + ([#45](https://github.com/GjjvdBurg/paper2remarkable/issues/45)). Thanks to + @sirupsen. +* Allow ``+`` in urls + ## Version 0.6.2 * Print to log whether removing arXiv stamp was successful. diff --git a/README.md b/README.md index c74ffe4..e23c1a3 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.2 +Paper2reMarkable version 0.6.3 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 68746e4..e6d205f 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 2) +VERSION = (0, 6, 3) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 1630200c652029f42b36bf2982534638bb185287 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 27 Apr 2020 18:22:16 +0100 Subject: Update release script with more automation --- make_release.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/make_release.py b/make_release.py index 
cb4af59..b0c01d1 100644 --- a/make_release.py +++ b/make_release.py @@ -15,6 +15,7 @@ Date: 2019-07-23 import colorama import os import sys +import tempfile def colored(msg, color=None, style=None): @@ -52,11 +53,12 @@ def get_package_name(): ) return nameline.split("=")[-1].strip().strip('"') + def get_package_version(pkgname): ctx = {} with open(f"{pkgname.lower()}/__version__.py", "r") as fp: exec(fp.read(), ctx) - return ctx['__version__'] + return ctx["__version__"] class Step: @@ -118,7 +120,7 @@ class RunTests(Step): class BumpVersionPackage(Step): def action(self, context): self.instruct(f"Update __version__.py with new version") - self.print_run(f"vi {context['pkgname']}/__version__.py") + self.do_cmd(f"vi {context['pkgname']}/__version__.py") def post(self, context): wait_for_enter() @@ -126,7 +128,7 @@ class BumpVersionPackage(Step): def _get_version(self, context): # Get the version from the version file - return get_package_version(context['pkgname']) + return get_package_version(context["pkgname"]) class MakeClean(Step): @@ -153,15 +155,14 @@ class PushToTestPyPI(Step): class InstallFromTestPyPI(Step): def action(self, context): - self.print_run("cd /tmp/") - self.print_cmd("rm -rf ./venv") - self.print_cmd("virtualenv ./venv") - self.print_cmd("cd ./venv") - self.print_cmd("source bin/activate") - self.print_cmd( + tmpvenv = tempfile.mkdtemp(prefix="p2r_venv_") + self.do_cmd( + f"virtualenv {tmpvenv} && source {tmpvenv}/bin/activate && " "pip install --index-url https://test.pypi.org/simple/ " - + f"--extra-index-url https://pypi.org/simple {context['pkgname']}=={context['version']}" + "--extra-index-url https://pypi.org/simple " + f"{context['pkgname']}=={context['version']}" ) + context["tmpvenv"] = tmpvenv class TestPackage(Step): @@ -169,13 +170,12 @@ class TestPackage(Step): self.instruct( f"Ensure that the following command gives version {context['version']}" ) - self.print_run(f"p2r -V") + self.do_cmd(f"source {context['tmpvenv']}/bin/activate 
&& p2r -V") -class DeactivateVenv(Step): +class RemoveVenv(Step): def action(self, context): - self.print_run("deactivate") - self.instruct("Go back to the project directory") + self.do_cmd(f"rm -rf {context['tmpvenv']}") class GitTagVersion(Step): @@ -235,7 +235,7 @@ def main(target=None): ("testpypi", PushToTestPyPI()), ("install", InstallFromTestPyPI()), ("testpkg", TestPackage()), - ("deactivate", DeactivateVenv()), + ("remove_venv", RemoveVenv()), ("gitadd2", GitAdd()), ("pypi", PushToPyPI()), ("tag", GitTagVersion()), -- cgit v1.2.3 From 2667c488ac02d4a346bc3b89ff11b7f27125acc0 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 28 Apr 2020 13:15:54 +0100 Subject: Strip trailing slash from image urls --- paper2remarkable/providers/html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index 9f8394c..d71f210 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -67,6 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor): img.attrib["src"] = urllib.parse.urljoin( self._base_url, img.attrib["src"] ) + img.attrib["src"] = img.attrib['src'].rstrip('/') class HTMLInformer(Informer): -- cgit v1.2.3 From 58be7d78a105c0b0f871b339daa29cdf8f6557d4 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 28 Apr 2020 13:18:42 +0100 Subject: Add unit test for image urls with trailing slash --- tests/test_providers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index 0787792..a7f17ff 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -248,6 +248,14 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(32, len(pdfplumber.open(filename).pages)) + def test_html_4(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://sirupsen.com/2019/" + filename = prov.run(url) + # this is 
a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From adeda34e6a488192835df1ab1fd56377182e559e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 28 Apr 2020 13:35:08 +0100 Subject: Add a comment regarding use without a remarkable --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e23c1a3..0bbc3ef 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,9 @@ $ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines The script can be run through the ``p2r`` command line program or via Docker (see below). If you're using MacOS, you might be interested in the [Alfred workflow](#alfred) or [Printing to p2r](#printing). On Linux, a background -terminal such as [Guake](http://guake-project.org/) can be very handy. +terminal such as [Guake](http://guake-project.org/) can be very handy. Note +that even without a reMarkable, this program can make downloading papers +easier (just use the `-n` flag). ``paper2remarkable`` makes it as easy as possible to get a PDF on your reMarkable from any of the following sources: -- cgit v1.2.3 From 9bf3cc745e9bc01cb8e9ed3e8c14407f639b233e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 2 May 2020 13:05:37 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 6 ++++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b727cb7..37ed104 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Version 0.6.4 + +* Further fixes for images in HTML sources + ([#45](https://github.com/GjjvdBurg/paper2remarkable/issues/45)). Thanks to + @sirupsen. 
+ ## Version 0.6.3 * Properly resolve image urls in HTML sources diff --git a/README.md b/README.md index 0bbc3ef..853808f 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.3 +Paper2reMarkable version 0.6.4 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index e6d205f..146c34c 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 3) +VERSION = (0, 6, 4) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From f8c0e4b2c953d617ffea4a09a7373f697a5eb104 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 2 May 2020 19:34:14 +0100 Subject: Various improvements to dearxiv functionality --- paper2remarkable/providers/_base.py | 21 ++++++++++++ paper2remarkable/providers/arxiv.py | 67 ++++++++++++++++++++++--------------- tests/test_arxiv.py | 62 +++++++++++++++++++++++++++++++++- tests/test_providers.py | 8 ++++- 4 files changed, 129 insertions(+), 29 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index c3abe19..5ca3588 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -131,8 +131,29 @@ class Provider(metaclass=abc.ABCMeta): "%s failed to compress the PDF file." % self.pdftool ) + def rewrite_pdf(self, in_pdf, out_pdf): + """ Re-write the pdf using Ghostscript + + This helps avoid issues in dearxiv due to nested pdfs. 
+ """ + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dQUIET", + "-o", + out_pdf, + in_pdf, + ] + ) + if not status == 0: + raise _CalledProcessError( + "Failed to rewrite the pdf with GhostScript" + ) + def uncompress_pdf(self, in_pdf, out_pdf): """ Uncompress a pdf file """ + if self.pdftool == "pdftk": status = subprocess.call( [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",] diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 47da448..4d0bc19 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -18,8 +18,9 @@ from ..log import Logger logger = Logger() -DEARXIV_TEXT_REGEX = ( - b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}" +DEARXIV_TEXT_REGEX = b"ar(x|X)iv:(\d{4}\.|[\w\-]+\/)\d+v\d+(\s+\[[\w\-]+\.[\w\-]+\])?\s+\d{1,2}\s\w{3}\s\d{4}" +DEARXIV_URI_REGEX = ( + b"https?://ar(x|X)iv\.org\/abs\/([\w\-]+\/\d+|\d{4}\.\d{4,5})v\d+" ) @@ -32,8 +33,8 @@ class Arxiv(Provider): re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" - re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?" - re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf" + re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?" + re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -68,8 +69,11 @@ class Arxiv(Provider): logger.info("Removing arXiv timestamp ... 
", end="") basename = os.path.splitext(input_file)[0] + recoded_file = basename + "_rewrite.pdf" + self.rewrite_pdf(input_file, recoded_file) + uncompress_file = basename + "_uncompress.pdf" - self.uncompress_pdf(input_file, uncompress_file) + self.uncompress_pdf(recoded_file, uncompress_file) new_data = [] current_obj = [] @@ -90,36 +94,42 @@ class Arxiv(Provider): current_obj.append(line) objid = int(line.split(b" ")[0]) xref[objid] = char_count - elif current_obj and line.startswith(b"endobj"): + elif current_obj and ( + line.startswith(b"endobj") + and not line.startswith(b"endobj xref") + ): # End the current object. If needed, replace the arXiv # stamp in the block (done only once). Reset current # object. current_obj.append(line) block = b"".join(current_obj) - if not replaced_arXiv and b"arXivStAmP" in block: - # remove the text - block, n_subs1 = re.subn( - b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", - b"()Tj", - block, - ) - # remove the url - block, n_subs2 = re.subn( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - block, - ) - if n_subs1 or n_subs2: - # fix the length of the object stream - block = fix_stream_length(block) - replaced_arXiv = True + # remove the text + block, n_subs1 = re.subn( + b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", block, + ) + # remove the url + block, n_subs2 = re.subn( + b"<<\n\/URI \(" + + DEARXIV_URI_REGEX + + b"\)\n\/S /URI\n>>\n", + b"", + block, + ) + if n_subs1 or n_subs2: + # fix the length of the object stream + block = fix_stream_length(block) + replaced_arXiv = True new_data.append(block) char_count += len(block) current_obj = [] - elif current_obj: - # If we're recording an object, simply add the line to it - current_obj.append(line) elif line in [b"xref\n", b"endobj xref\n"]: + if b"endobj" in line and current_obj: + current_obj.append(b"endobj\n") + block = b"".join(current_obj) + new_data.append(block) + char_count += len(block) + current_obj = [] + line = b"xref\n" # We 
found the xref table, record its position and write it # out using our updated indices. startxref = sum(map(len, new_data)) @@ -131,6 +141,9 @@ class Arxiv(Provider): # skip the appropriate number of lines skip_n = len(xref) + 2 + elif current_obj: + # If we're recording an object, simply add the line to it + current_obj.append(line) elif line == b"startxref\n": # Write out our recorded startxref position, skip the old # position. @@ -148,7 +161,7 @@ class Arxiv(Provider): output_file = basename + "_dearxiv.pdf" self.compress_pdf(removed_file, output_file) - logger.append("success" if replaced_arXiv else "failed", "info") + logger.append("success" if replaced_arXiv else "none found", "info") return output_file diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py index 08ea2c4..2cb84cf 100644 --- a/tests/test_arxiv.py +++ b/tests/test_arxiv.py @@ -13,7 +13,11 @@ import shutil import tempfile import unittest -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX, Arxiv +from paper2remarkable.providers.arxiv import ( + DEARXIV_TEXT_REGEX, + DEARXIV_URI_REGEX, + Arxiv, +) class TestArxiv(unittest.TestCase): @@ -39,6 +43,26 @@ class TestArxiv(unittest.TestCase): m = re.fullmatch(DEARXIV_TEXT_REGEX, key) self.assertIsNotNone(m) + def test_text_regex_3(self): + key = b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_4(self): + key = b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_1(self): + key = b"http://arxiv.org/abs/physics/0605197v1" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_2(self): + key = b"https://arxiv.org/abs/1101.0028v3" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + def test_stamp_removed_1(self): url = "https://arxiv.org/pdf/1703.06103.pdf" prov = Arxiv(upload=False) @@ -57,6 +81,42 @@ 
class TestArxiv(unittest.TestCase): data = fp.read() self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data) + def test_stamp_removed_3(self): + url = "https://arxiv.org/abs/physics/0605197v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006", data + ) + self.assertNotIn( + b"/URI (http://arxiv.org/abs/physics/0605197v1)", data + ) + + def test_stamp_removed_4(self): + url = "https://arxiv.org/abs/math/0309285v2" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004", data) + self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data) + + def test_stamp_removed_5(self): + url = "https://arxiv.org/abs/astro-ph/9207001v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data + ) + self.assertNotIn(b"arXiv:astro-ph/9207001v1 13 Jul 1992", data) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py index a7f17ff..d2fdb0d 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -86,6 +86,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_5(self): + prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None) + url = "https://arxiv.org/abs/2002.11523" + exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf" + filename = prov.run(url) + 
self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" @@ -256,6 +263,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(4, len(pdfplumber.open(filename).pages)) - if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From ac387eb56a1c6aa3273571d233ec385d7d6a94f9 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 3 May 2020 22:24:36 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 6 ++++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37ed104..e665198 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Version 0.6.5 + +* Corrections to code that removes the arXiv stamp + ([#49](https://github.com/GjjvdBurg/paper2remarkable/issues/49)). Thanks to + @mr-ubik. + ## Version 0.6.4 * Further fixes for images in HTML sources diff --git a/README.md b/README.md index 853808f..4ddcc30 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.4 +Paper2reMarkable version 0.6.5 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 146c34c..0123897 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 4) +VERSION = (0, 6, 5) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 0fdbbdc7f2618bf6830b99591aacab513290ad8b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 5 May 2020 11:45:13 +0100 Subject: Add badges to readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/README.md b/README.md index 4ddcc30..869e501 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # paper2remarkable -[![PyPI -version](https://badge.fury.io/py/paper2remarkable.svg)](https://pypi.org/project/paper2remarkable) +[![PyPI version](https://badge.fury.io/py/paper2remarkable.svg)](https://pypi.org/project/paper2remarkable) +[![Build Status](https://travis-ci.org/GjjvdBurg/paper2remarkable.svg?branch=master)](https://travis-ci.org/GjjvdBurg/paper2remarkable) +[![Downloads](https://pepy.tech/badge/paper2remarkable/month)](https://pepy.tech/project/paper2remarkable/month) ``paper2remarkable`` is a command line program for quickly and easily transferring an academic paper to your [reMarkable](https://remarkable.com/): -- cgit v1.2.3 From d4f76fe6a4749e2ae02fa03c33c29bdbc711adef Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 5 May 2020 11:45:20 +0100 Subject: Minor fix to release script --- make_release.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/make_release.py b/make_release.py index b0c01d1..d729c8a 100644 --- a/make_release.py +++ b/make_release.py @@ -158,7 +158,8 @@ class InstallFromTestPyPI(Step): tmpvenv = tempfile.mkdtemp(prefix="p2r_venv_") self.do_cmd( f"virtualenv {tmpvenv} && source {tmpvenv}/bin/activate && " - "pip install --index-url https://test.pypi.org/simple/ " + "pip install --no-cache-dir --index-url " + "https://test.pypi.org/simple/ " "--extra-index-url https://pypi.org/simple " f"{context['pkgname']}=={context['version']}" ) -- cgit v1.2.3 From 7cc0b6e320e45b9ce442425a04ac4708fb3df077 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 11 May 2020 17:32:21 +0100 Subject: Allow underscore in urls --- paper2remarkable/utils.py | 2 +- tests/test_ui.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 97ff713..c2917d5 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -163,7 
+163,7 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): def is_url(string): # pattern adapted from CleverCSV - pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*\-?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?" + pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*[\-\_]?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?" string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None diff --git a/tests/test_ui.py b/tests/test_ui.py index 11ed87a..7ae1e79 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -99,6 +99,11 @@ class TestUI(unittest.TestCase): "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", ), + ( + PdfUrl, + "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", + "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", + ), ( JMLR, "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", -- cgit v1.2.3 From 22ed62b8b5918324929abfc19d4798bad4d07a90 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 11 May 2020 18:00:25 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e665198..eee15fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.6.6 + +* Bugfix to url validation: allow underscore in subdomains. 
+ ## Version 0.6.5 * Corrections to code that removes the arXiv stamp diff --git a/README.md b/README.md index 869e501..78bb0da 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.5 +Paper2reMarkable version 0.6.6 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 0123897..089d249 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 5) +VERSION = (0, 6, 6) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 96de81d4158f7779132c9f7883c62bc3f15b6915 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 20:45:44 +0100 Subject: Add test for pdf reading issue --- tests/test_providers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index d2fdb0d..fb75fbd 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -142,12 +142,18 @@ class TestProviders(unittest.TestCase): filename = prov.run(local_filename) self.assertEqual("test_.pdf", os.path.basename(filename)) - def test_pdfurl(self): + def test_pdfurl_1(self): prov = PdfUrl(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" filename = prov.run(url) self.assertEqual("14-526.pdf", os.path.basename(filename)) + def test_pdfurl_2(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "https://www.manuelrigger.at/preprints/NoREC.pdf" + filename = prov.run(url) + self.assertEqual("NoREC.pdf", os.path.basename(filename)) + def test_jmlr_1(self): prov = JMLR(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" -- cgit v1.2.3 From 
17a1e7392e0d08bf820252b90ee3509e59ff4bbf Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 20:50:17 +0100 Subject: Always rewrite pdfs with GhostScript as a cleaning step --- paper2remarkable/providers/_base.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 5ca3588..1625432 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -71,14 +71,13 @@ class Provider(metaclass=abc.ABCMeta): logger.disable() # Define the operations to run on the pdf. Providers can add others. - if no_crop: - self.operations = [] - elif center: - self.operations = [("center", self.center_pdf)] + self.operations = [("rewrite", self.rewrite_pdf)] + if center: + self.operations.append(("center", self.center_pdf)) elif right: - self.operations = [("right", self.right_pdf)] + self.operations.append(("right", self.right_pdf)) else: - self.operations = [("crop", self.crop_pdf)] + self.operations.append(("crop", self.crop_pdf)) if blank: self.operations.append(("blank", blank_pdf)) @@ -131,11 +130,14 @@ class Provider(metaclass=abc.ABCMeta): "%s failed to compress the PDF file." % self.pdftool ) - def rewrite_pdf(self, in_pdf, out_pdf): + def rewrite_pdf(self, in_pdf, out_pdf=None): """ Re-write the pdf using Ghostscript This helps avoid issues in dearxiv due to nested pdfs. 
""" + if out_pdf is None: + out_pdf = os.path.splitext(in_pdf)[0] + "-rewrite.pdf" + status = subprocess.call( [ self.gs_path, @@ -150,6 +152,7 @@ class Provider(metaclass=abc.ABCMeta): raise _CalledProcessError( "Failed to rewrite the pdf with GhostScript" ) + return out_pdf def uncompress_pdf(self, in_pdf, out_pdf): """ Uncompress a pdf file """ -- cgit v1.2.3 From a5522a9cc39b61d0d26705f99279381dcb9e7f9f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 21:07:46 +0100 Subject: Remove extra pdfurl test This test seems to fail repeatedly on Travis, for no clear reason (it works locally). Since we have another PdfUrl test I don't think it's necessary to have this one too, so I'll remove it. --- tests/test_ui.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test_ui.py b/tests/test_ui.py index 7ae1e79..7ab5099 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -94,11 +94,6 @@ class TestUI(unittest.TestCase): "https://link.springer.com/article/10.1007/s10618-019-00631-5", "https://link.springer.com/article/10.1007/s10618-019-00631-5", ), - ( - PdfUrl, - "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", - "https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf", - ), ( PdfUrl, "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", -- cgit v1.2.3 From 8252be3737fbc5e5cfd8cb8084a571fa0435800b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 21:28:23 +0100 Subject: Correctly remove alternative arXiv stamp --- paper2remarkable/providers/arxiv.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 4d0bc19..0385f94 100644 --- a/paper2remarkable/providers/arxiv.py +++ 
b/paper2remarkable/providers/arxiv.py @@ -107,7 +107,7 @@ class Arxiv(Provider): block, n_subs1 = re.subn( b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", block, ) - # remove the url + # remove the url (type 1) block, n_subs2 = re.subn( b"<<\n\/URI \(" + DEARXIV_URI_REGEX @@ -115,6 +115,16 @@ class Arxiv(Provider): b"", block, ) + # remove the url (type 2, i.e. Jackson arXiv 0309285v2) + block, n_subs3 = re.subn( + b"<<\n\/S \/URI\n" + + b"/URI \(" + + DEARXIV_URI_REGEX + + b"\)\n>>\n", + b"", + block, + ) + if n_subs1 or n_subs2: # fix the length of the object stream block = fix_stream_length(block) -- cgit v1.2.3 From b0d61a1a6bffadb01ab62405d4c9a3818a3fec62 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 21:39:12 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 7 +++++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eee15fd..6d13d43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## Version 0.6.7 + +* Increase robustness to PDF issues by passing through GhostScript (fixes + [#51](https://github.com/GjjvdBurg/paper2remarkable/issues/51)). Thanks to + @sirupsen. +* Bugfix for code that removes arXiv stamp. + ## Version 0.6.6 * Bugfix to url validation: allow underscore in subdomains. 
diff --git a/README.md b/README.md index 78bb0da..bd733d6 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.6 +Paper2reMarkable version 0.6.7 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 089d249..69cf861 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 6) +VERSION = (0, 6, 7) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 9deed77ca86c025d3420eef5f34f1894aae3dba2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 24 May 2020 22:16:25 +0100 Subject: Fix spelling --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bd733d6..2aa56d7 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ reMarkable from any of the following sources: The program aims to be flexible to the exact source URL, so for many of the academic sources you can either provide a URL to the abstract page or to the -PDF file. If you have an source that you would like to see added to the list, +PDF file. If you have a source that you would like to see added to the list, let me know! ``paper2remarkable`` takes the source URL and: -- cgit v1.2.3 From 0402313618e0f50dcde147f6fbaa1c730b68e93b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 16 Jun 2020 16:17:14 +0100 Subject: Fix no_crop bug This fixes a bug where no_crop no longer worked. 
--- paper2remarkable/providers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 1625432..107f006 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -76,7 +76,7 @@ class Provider(metaclass=abc.ABCMeta): self.operations.append(("center", self.center_pdf)) elif right: self.operations.append(("right", self.right_pdf)) - else: + elif not no_crop: self.operations.append(("crop", self.crop_pdf)) if blank: -- cgit v1.2.3 From ec000de563a32b4e757c9afde5a1b1b5ac80a511 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 22:42:10 +0100 Subject: Add support for using ReadabiliPy --- paper2remarkable/providers/html.py | 35 ++++++++++++++++++++++++++++++----- tests/test_providers.py | 7 +++++++ 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index d71f210..abe30ba 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -67,7 +67,7 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor): img.attrib["src"] = urllib.parse.urljoin( self._base_url, img.attrib["src"] ) - img.attrib["src"] = img.attrib['src'].rstrip('/') + img.attrib["src"] = img.attrib["src"].rstrip("/") class HTMLInformer(Informer): @@ -98,6 +98,32 @@ class HTML(Provider): def get_abs_pdf_urls(self, url): return url, url + def make_readable(self, request_html): + have_readabilipy = False + try: + from readabilipy import simple_json_from_html_string + + have_readabilipy = True + except ImportError: + pass + + logger.info( + "Converting HTML using %s" + % ("ReadabiliPy" if have_readabilipy else "readability") + ) + + if have_readabilipy: + article = simple_json_from_html_string( + request_html, use_readability=True + ) + title = article["title"] + raw_html = article["content"] + else: + doc = 
readability.Document(request_html) + title = doc.title() + raw_html = doc.summary(html_partial=True) + return title, raw_html + def retrieve_pdf(self, pdf_url, filename): """Turn the HTML article in a clean pdf file""" # Steps @@ -107,10 +133,9 @@ class HTML(Provider): # 4. Convert the markdown back to HTML (this is done to sanitize HTML) # 4. Convert the HTML to PDF, pulling in images where needed # 5. Save the PDF to the specified filename. - request_text = get_page_with_retry(pdf_url, return_text=True) - doc = readability.Document(request_text) - title = doc.title() - raw_html = doc.summary(html_partial=True) + + request_html = get_page_with_retry(pdf_url, return_text=True) + title, raw_html = self.make_readable(request_html) h2t = html2text.HTML2Text() h2t.wrap_links = False diff --git a/tests/test_providers.py b/tests/test_providers.py index fb75fbd..ca6c1ae 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -268,6 +268,13 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + def test_html_5(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 603353cd2cf16f99cc5eb823918105146fea6bcb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 22:59:40 +0100 Subject: Make readabilipy an optional dependency --- setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 82a693a..d1de5bd 100644 --- a/setup.py +++ b/setup.py @@ -29,18 +29,23 @@ REQUIRED = [ "readability-lxml>=0.7.1", "html2text>=2020.1.16", 
"weasyprint>=51", - "markdown>=3.1.1" + "markdown>=3.1.1", ] +full_require = [ + # TEMPORARY: Until ReadabiliPy is available on PyPI + "readabilipy @ git+https://git@github.com/GjjvdBurg/ReadabiliPy@packaging#egg=readabilipy", +] docs_require = [] test_require = [] dev_require = ["green"] # What packages are optional? EXTRAS = { + "full": full_require, "docs": docs_require, "tests": test_require, - "dev": docs_require + test_require + dev_require, + "dev": docs_require + test_require + dev_require + full_require, } # The rest you shouldn't have to touch too much :) -- cgit v1.2.3 From 65c2ad9c4be36fc10ba06579baf1fdc549dae99d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 20 Jun 2020 23:15:00 +0100 Subject: Upgrade nvm on travis --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8399160..7d220e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,8 @@ python: before_install: - sudo apt-get update - sudo apt-get install ghostscript pdftk poppler-utils qpdf + - nvm install v12.18.1 + - nvm use v12.18.1 install: - pip install -e .[dev] -- cgit v1.2.3 From d5230d43d58c992212c89f3c221f72784a3a309d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 5 Jul 2020 23:33:11 +0100 Subject: Add provider for Semantic Scholar --- paper2remarkable/providers/__init__.py | 2 + paper2remarkable/providers/semantic_scholar.py | 65 ++++++++++++++++++++++++++ tests/test_providers.py | 15 ++++++ 3 files changed, 82 insertions(+) create mode 100644 paper2remarkable/providers/semantic_scholar.py diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index e4fa1bd..c4e3eb5 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -13,6 +13,7 @@ from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed from .springer import Springer +from .semantic_scholar import SemanticScholar # NOTE: Order matters here, PdfUrl and 
HTML should be last providers = [ @@ -26,6 +27,7 @@ providers = [ PMLR, PubMed, Springer, + SemanticScholar, LocalFile, PdfUrl, HTML, diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py new file mode 100644 index 0000000..0a1b414 --- /dev/null +++ b/paper2remarkable/providers/semantic_scholar.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +"""Provider for SemanticScholar + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re +import bs4 + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError +from ..utils import get_page_with_retry + + +class SemanticScholarInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class SemanticScholar(Provider): + + re_abs = ( + "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}" + ) + re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SemanticScholarInformer() + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract urls from a SemanticScholar url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = self._get_pdf_url(abs_url) + elif re.match(self.re_pdf, url): + pdf_url = url + remainder = pdf_url.split("/")[-1][: -len(".pdf")] + first_four = pdf_url.split("/")[-2] + paper_id = first_four + remainder + abs_url = f"https://www.semanticscholar.org/paper/{paper_id}" + else: + raise URLResolutionError("SemanticScholar", url) + return abs_url, pdf_url + + def _get_pdf_url(self, url): + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + meta = soup.find_all("meta", {"name": "citation_pdf_url"}) + if not meta: + raise URLResolutionError("SemanticScholar", url) 
+ return meta[0]["content"] + + def validate(src): + return re.match(SemanticScholar.re_abs, src) or re.match( + SemanticScholar.re_pdf, src + ) diff --git a/tests/test_providers.py b/tests/test_providers.py index fb75fbd..1a6f84f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -26,6 +26,7 @@ from paper2remarkable.providers import ( PdfUrl, PubMed, Springer, + SemanticScholar ) VERBOSE = False @@ -268,6 +269,20 @@ class TestProviders(unittest.TestCase): # this is a proxy test to check that all images are included self.assertEqual(4, len(pdfplumber.open(filename).pages)) + def test_semantic_scholar_1(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" + exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_semantic_scholar_2(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f" + exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From e298f1cfd64253347ec81cadf5324a32d81ec2e5 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 11 Jul 2020 22:21:55 +0100 Subject: Add semantic scholar to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2aa56d7..7de9c40 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ reMarkable from any of the following sources: * [OpenReview](https://openreview.net/) * [PMLR](http://proceedings.mlr.press/) * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) +* 
[SemanticScholar](https://www.semanticscholar.org/) * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file * A local PDF file -- cgit v1.2.3 From 76bd4412abed0108b4589c84783602447f824d5d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 11 Jul 2020 23:43:17 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 5 +++++ paper2remarkable/__version__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d13d43..bc58bb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## Version 0.6.8 + +* Add provider for SemanticScholar papers +* Fix bug that made ``no_crop`` option no longer work + ## Version 0.6.7 * Increase robustness to PDF issues by passing through GhostScript (fixes diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 69cf861..d1ac661 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 7) +VERSION = (0, 6, 8) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 0a6a4ff3893474e33f71ef2d8a881cc360a29094 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:23:12 +0100 Subject: Improve robustness of springer provider Adds support for downloading chapters --- paper2remarkable/providers/springer.py | 37 +++++++++++++++++++++++++++++----- tests/test_providers.py | 9 ++++++++- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index 5ce2564..dea8bd5 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -10,10 +10,12 @@ Copyright: 2019, G.J.J. 
van den Burg import re import urllib +import requests from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError +from ..utils import HEADERS class SpringerInformer(Informer): @@ -26,24 +28,49 @@ class SpringerInformer(Informer): class Springer(Provider): - re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" - re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" + re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" + re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+" + re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.informer = SpringerInformer() + def _get_abs_url(self, pdf_url): + article_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")] + req = requests.head( + article_url, headers=HEADERS, cookies=self.cookiejar + ) + if req.status_code == 200: + return article_url + + chapter_url = pdf_url.replace("content/pdf", "chapter")[: -len(".pdf")] + req = requests.head( + chapter_url, headers=HEADERS, cookies=self.cookiejar + ) + if req.status_code == 200: + return chapter_url + + raise URLResolutionError("Springer", pdf_url) + def get_abs_pdf_urls(self, url): """ Get the pdf and abstract urls from a Springer url """ - if re.match(self.re_abs, url): + if re.match(self.re_abs_1, url): abs_url = url pdf_url = url.replace("article", "content/pdf") + elif re.match(self.re_abs_2, url): + abs_url = url + pdf_url = url.replace("chapter", "content/pdf") elif re.match(self.re_pdf, url): - abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] + abs_url = self._get_abs_url(url) pdf_url = urllib.parse.unquote(url) else: raise URLResolutionError("Springer", url) return abs_url, pdf_url def validate(src): - return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) + return ( + 
re.match(Springer.re_abs_1, src) + or re.match(Springer.re_abs_2, src) + or re.match(Springer.re_pdf, src) + ) diff --git a/tests/test_providers.py b/tests/test_providers.py index 1a6f84f..5c8a8e4 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -126,13 +126,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_springer(self): + def test_springer_1(self): prov = Springer(upload=False, verbose=VERBOSE) url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_springer_2(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf" + exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_local(self): local_filename = "test.pdf" with open(local_filename, "w") as fp: -- cgit v1.2.3 From d36bda173d5488e23ec918d4bd51c3e6fd76ae06 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:37:16 +0100 Subject: Improve publication date extraction --- paper2remarkable/providers/springer.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index dea8bd5..f9dc952 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -20,11 +20,23 @@ from ..utils import HEADERS class SpringerInformer(Informer): - meta_date_key = "citation_online_date" + meta_date_key = None def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) + def get_year(self, soup): + meta 
= soup.find_all('meta', {'name': 'citation_online_date'}) + if meta: + date = meta[0]['content'] + return self._format_year(date) + meta = soup.find_all('meta', {'name': 'citation_publication_date'}) + if meta: + date = meta[0]['content'] + return self._format_year(date) + return '' + + class Springer(Provider): -- cgit v1.2.3 From f1f6ec91ca263e2e47357f4ddfd7e0e746fd93e7 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:42:25 +0100 Subject: simplify code --- paper2remarkable/providers/springer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index f9dc952..31f0a67 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -26,16 +26,12 @@ class SpringerInformer(Informer): return super()._format_authors(soup_authors, sep=" ", idx=-1) def get_year(self, soup): - meta = soup.find_all('meta', {'name': 'citation_online_date'}) - if meta: - date = meta[0]['content'] - return self._format_year(date) - meta = soup.find_all('meta', {'name': 'citation_publication_date'}) - if meta: - date = meta[0]['content'] - return self._format_year(date) - return '' - + for key in ["citation_online_date", "citation_publication_date"]: + meta = soup.find_all("meta", {"name": key}) + if not meta: + continue + return self._format_year(meta[0]["content"]) + return "" class Springer(Provider): -- cgit v1.2.3 From 8f6f3c433ce37c0205144b56cd48ea1ecc661e67 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 16:56:05 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc58bb8..3fbf726 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.6.9 + +* Improve robustness of Springer provider 
+ ## Version 0.6.8 * Add provider for SemanticScholar papers diff --git a/README.md b/README.md index 7de9c40..7108c3e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.7 +Paper2reMarkable version 0.6.9 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index d1ac661..214d6b7 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 8) +VERSION = (0, 6, 9) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 071b5a0f2958c34f1a189259346a8732a1110de2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 22:44:33 +0100 Subject: Add provider for SagePub --- README.md | 1 + paper2remarkable/providers/__init__.py | 2 ++ paper2remarkable/providers/sagepub.py | 52 ++++++++++++++++++++++++++++++++++ paper2remarkable/utils.py | 1 + tests/test_providers.py | 17 ++++++++++- 5 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 paper2remarkable/providers/sagepub.py diff --git a/README.md b/README.md index 7108c3e..62c2b0b 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ reMarkable from any of the following sources: * [OpenReview](https://openreview.net/) * [PMLR](http://proceedings.mlr.press/) * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) +* [SagePub](https://journals.sagepub.com/) * [SemanticScholar](https://www.semanticscholar.org/) * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index c4e3eb5..e3075f0 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -12,6 +12,7 @@ from .openreview import OpenReview from 
.pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed +from .sagepub import SagePub from .springer import Springer from .semantic_scholar import SemanticScholar @@ -26,6 +27,7 @@ providers = [ OpenReview, PMLR, PubMed, + SagePub, Springer, SemanticScholar, LocalFile, diff --git a/paper2remarkable/providers/sagepub.py b/paper2remarkable/providers/sagepub.py new file mode 100644 index 0000000..7e76df8 --- /dev/null +++ b/paper2remarkable/providers/sagepub.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +"""Provider for SagePub + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class SagePubInformer(Informer): + + meta_author_key = "dc.Creator" + meta_title_key = "dc.Title" + meta_date_key = "dc.Date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + def _format_year(self, soup_date): + return soup_date.split("-")[0] + + +class SagePub(Provider): + + re_abs = "https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+" + re_pdf = "https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SagePubInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("full", "pdf") + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url.replace("pdf", "full") + else: + raise URLResolutionError("SagePub", url) + return abs_url, pdf_url + + def validate(src): + return re.match(SagePub.re_abs, src) or re.match(SagePub.re_pdf, src) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index c2917d5..07b1524 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -38,6 +38,7 @@ def clean_string(s): cleaned = "".join(c if c in 
allowed else "_" for c in normalized) while "__" in cleaned: cleaned = cleaned.replace("__", "_") + cleaned = cleaned.strip('_') return cleaned diff --git a/tests/test_providers.py b/tests/test_providers.py index 5c8a8e4..ba1cc3a 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -25,8 +25,9 @@ from paper2remarkable.providers import ( PMLR, PdfUrl, PubMed, + SagePub, Springer, - SemanticScholar + SemanticScholar, ) VERBOSE = False @@ -290,6 +291,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_sagepub_1(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" + exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_2(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432" + exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 848cbf8bfb82c568c94ff3842ee538dc5c990120 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Wed, 15 Jul 2020 22:55:28 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fbf726..723b38f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.7.0 + +* Add provider for SagePub + ## Version 0.6.9 * Improve robustness of Springer provider diff --git a/README.md b/README.md index 62c2b0b..eaec8c4 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] 
[-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.6.9 +Paper2reMarkable version 0.7.0 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 214d6b7..1020fb7 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 6, 9) +VERSION = (0, 7, 0) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From b56d376ff87cfc7fc599f40e13338a3c1a489877 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 11 Aug 2020 22:18:30 +0100 Subject: Fix failing unit test --- tests/test_ui.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_ui.py b/tests/test_ui.py index 7ab5099..5747eb9 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -101,13 +101,13 @@ class TestUI(unittest.TestCase): ), ( JMLR, - "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", - "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", ), ( JMLR, - "http://www.jmlr.org/papers/v10/xu09a.html", - "http://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", ), ( PMLR, -- cgit v1.2.3 From 0e7b27b4e34e52744f6037f78024e1df2ee26a0c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 11 Aug 2020 22:18:40 +0100 Subject: Readme formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eaec8c4..0f23c4f 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Optionally, you can: Here's the full help of the script: -```text +``` usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] [--pdftk PDFTK] [--qpdf 
QPDF] [--rmapi RMAPI] -- cgit v1.2.3 From 7ae25e6f86dcd1da60cdb40d2d12ca45c4b68201 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 27 Aug 2020 13:17:03 +0100 Subject: Rewrite author info function for OpenReview --- paper2remarkable/providers/openreview.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py index 47c0555..8c44f45 100644 --- a/paper2remarkable/providers/openreview.py +++ b/paper2remarkable/providers/openreview.py @@ -8,17 +8,49 @@ Copyright: 2019, G.J.J. van den Burg """ +import json import re from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() class OpenReviewInformer(Informer): meta_date_key = "citation_publication_date" + def get_authors(self, soup): + # Get the authors for OpenReview by parsing the JSON payload + # + # This may not be super robust long term, but works for now. + warning = ( + "Couldn't determine author information, maybe provide " + "the desired filename using '--filename'?" 
+ ) + + script = soup.find("script", {"id": "__NEXT_DATA__"}) + if not script: + logger.warning(warning) + return "" + + try: + paper_data = json.loads(script.contents[0]) + except json.JSONDecodeError: + logger.warning(warning) + return "" + + try: + content = paper_data["props"]["pageProps"]["forumNote"]["content"] + authors = content["authors"] + except KeyError: + logger.warning(warning) + return "" + return self._format_authors(authors) + def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) -- cgit v1.2.3 From 3a5260a3a1311bf589fe7a4ef221939f8c9727d1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 27 Aug 2020 13:37:33 +0100 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 723b38f..5dfd414 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.7.1 + +* Fix OpenReview provider after site change + ## Version 0.7.0 * Add provider for SagePub diff --git a/README.md b/README.md index 0f23c4f..2cfe192 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.7.0 +Paper2reMarkable version 0.7.1 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 1020fb7..e501a41 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 7, 0) +VERSION = (0, 7, 1) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From f242e29147ac8ec1450a0bdb90b1dc0da3aa4e85 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sun, 30 Aug 2020 13:20:45 +0100 Subject: Add tentative windows 
instructions --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2cfe192..8b1db06 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,14 @@ Specifically: - **Arch Linux:** ``pacman -S pdftk ghostscript poppler`` - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``. - - **MacOs:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). + - **MacOS:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). + - **Windows:** Installers or executables are available for + [qpdf](https://github.com/qpdf/qpdf/releases) (for instance the mingw + binary executables) and + [GhostScript](https://www.ghostscript.com/download/gsdnld.html). + Importantly, Windows support is untested and these are generic + instructions, so we welcome clarifications where needed. The Docker + instructions below may be more convenient on Windows. 3. 
Finally, install ``paper2remarkable``: ``` -- cgit v1.2.3 From 25f372c69dfc846faebb4763ecc60e9e0750021b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 20:54:38 +0200 Subject: Improve support for Neurips provider (fixes #59) --- paper2remarkable/providers/neurips.py | 4 ++-- tests/test_providers.py | 14 ++++++++++++++ tests/test_ui.py | 7 ++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/paper2remarkable/providers/neurips.py b/paper2remarkable/providers/neurips.py index 87cf2c1..d76202c 100644 --- a/paper2remarkable/providers/neurips.py +++ b/paper2remarkable/providers/neurips.py @@ -25,8 +25,8 @@ class NeurIPSInformer(Informer): class NeurIPS(Provider): - re_abs = "^https?://papers.nips.cc/paper/[\d\w\-]+$" - re_pdf = "^https?://papers.nips.cc/paper/[\d\w\-]+.pdf$" + re_abs = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+$" + re_pdf = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+.pdf$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/tests/test_providers.py b/tests/test_providers.py index ba1cc3a..eeaef82 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -233,6 +233,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_3(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits" + exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_neurips_4(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf" + exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_citeseerx_1(self): 
prov = CiteSeerX(upload=False, verbose=VERBOSE) url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548" diff --git a/tests/test_ui.py b/tests/test_ui.py index 5747eb9..61b371d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -98,7 +98,7 @@ class TestUI(unittest.TestCase): PdfUrl, "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", - ), + ), ( JMLR, "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", @@ -149,6 +149,11 @@ class TestUI(unittest.TestCase): "https://papers.nips.cc/paper/7796-middle-out-decoding", "https://papers.nips.cc/paper/7796-middle-out-decoding", ), + ( + NeurIPS, + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + ), ( CiteSeerX, "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", -- cgit v1.2.3 From 6bf72b6f8c08c7949b5efe4ef244cb0671bf5bf8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 20:57:49 +0200 Subject: Use python builtin venv instead of virtualenv in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 769fc87..bcbc420 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ doc: install ## Build documentation with Sphinx venv: $(VENV_DIR)/bin/activate $(VENV_DIR)/bin/activate: - test -d $(VENV_DIR) || virtualenv $(VENV_DIR) + test -d $(VENV_DIR) || python -m venv $(VENV_DIR) source $(VENV_DIR)/bin/activate && pip install -e .[dev] touch $(VENV_DIR)/bin/activate -- cgit v1.2.3 From 6338388cea254ba4c6090eb17a8942a13b7a2b1c Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:25:41 +0200 Subject: Clean up readability providers This reorganizes the code a bit to ensure we only pull the HTML page once, and use the same 
readability provider for both the informer and the converter. --- paper2remarkable/providers/html.py | 78 ++++++++++++++++++++++---------------- tests/test_providers.py | 5 ++- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index abe30ba..b734bd1 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -56,6 +56,37 @@ def url_fetcher(url): return weasyprint.default_url_fetcher(url) +def make_readable(request_html): + """Use an extraction method to get the main article html + + This function checks if ReadabiliPy is installed with NodeJS support, as + that generally yields better results. If that is not available, it falls + back on readability. + """ + + have_readabilipy_js = False + try: + import readabilipy + + have_readabilipy_js = readabilipy.simple_json.have_node() + except ImportError: + pass + + if have_readabilipy_js: + logger.info("Converting HTML using Readability.js") + article = readabilipy.simple_json_from_html_string( + request_html, use_readability=True + ) + title = article["title"] + raw_html = article["content"] + else: + logger.info("Converting HTML using readability") + doc = readability.Document(request_html) + title = doc.title() + raw_html = doc.summary(html_partial=True) + return title, raw_html + + class ImgProcessor(markdown.treeprocessors.Treeprocessor): def __init__(self, base_url, *args, **kwargs): self._base_url = base_url @@ -73,11 +104,15 @@ class ImgProcessor(markdown.treeprocessors.Treeprocessor): class HTMLInformer(Informer): def __init__(self): super().__init__() + self._cached_title = None + self._cached_article = None def get_filename(self, abs_url): - request_text = get_page_with_retry(abs_url, return_text=True) - doc = readability.Document(request_text) - title = doc.title() + request_html = get_page_with_retry(abs_url, return_text=True) + title, article = make_readable(request_html) + + 
self._cached_title = title + self._cached_article = article # Clean the title and make it titlecase title = clean_string(title) @@ -98,32 +133,6 @@ class HTML(Provider): def get_abs_pdf_urls(self, url): return url, url - def make_readable(self, request_html): - have_readabilipy = False - try: - from readabilipy import simple_json_from_html_string - - have_readabilipy = True - except ImportError: - pass - - logger.info( - "Converting HTML using %s" - % ("ReadabiliPy" if have_readabilipy else "readability") - ) - - if have_readabilipy: - article = simple_json_from_html_string( - request_html, use_readability=True - ) - title = article["title"] - raw_html = article["content"] - else: - doc = readability.Document(request_html) - title = doc.title() - raw_html = doc.summary(html_partial=True) - return title, raw_html - def retrieve_pdf(self, pdf_url, filename): """Turn the HTML article in a clean pdf file""" # Steps @@ -133,13 +142,16 @@ class HTML(Provider): # 4. Convert the markdown back to HTML (this is done to sanitize HTML) # 4. Convert the HTML to PDF, pulling in images where needed # 5. Save the PDF to the specified filename. 
- - request_html = get_page_with_retry(pdf_url, return_text=True) - title, raw_html = self.make_readable(request_html) + if self.informer._cached_title and self.informer._cached_article: + title = self.informer._cached_title + article = self.informer._cached_article + else: + request_html = get_page_with_retry(pdf_url, return_text=True) + title, article = make_readable(request_html) h2t = html2text.HTML2Text() h2t.wrap_links = False - text = h2t.handle(raw_html) + text = h2t.handle(article) # Add the title back to the document article = "# {title}\n\n{text}".format(title=title, text=text) diff --git a/tests/test_providers.py b/tests/test_providers.py index ca6c1ae..479fb84 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -255,7 +255,10 @@ class TestProviders(unittest.TestCase): def test_html_3(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://conclave-team.github.io/conclave-site/" - exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we + # assume that testing is done with Readability.JS + exp = "Conclave.pdf" filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) # this is a proxy test to check that all images are included -- cgit v1.2.3 From 0bf303a5607f42658252ef27e9f3fee3e6b84d19 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:26:14 +0200 Subject: Clean up "full" installation mode --- .travis.yml | 2 +- setup.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7d220e0..a1cb636 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: - nvm use v12.18.1 install: - - pip install -e .[dev] + - pip install -e .[full] script: - green -vv -a ./tests diff --git a/setup.py b/setup.py index d1de5bd..0635253 100644 
--- a/setup.py +++ b/setup.py @@ -32,10 +32,7 @@ REQUIRED = [ "markdown>=3.1.1", ] -full_require = [ - # TEMPORARY: Until ReadabiliPy is available on PyPI - "readabilipy @ git+https://git@github.com/GjjvdBurg/ReadabiliPy@packaging#egg=readabilipy", -] +full_require = ["readabilipy"] docs_require = [] test_require = [] dev_require = ["green"] -- cgit v1.2.3 From 8e0804ad491f2179135a138f9656088213ae8431 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:29:44 +0200 Subject: Ensure we test the test version on travis --- .travis.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index a1cb636..6a57cd3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: - nvm use v12.18.1 install: - - pip install -e .[full] + - pip install -e .[test] script: - green -vv -a ./tests diff --git a/setup.py b/setup.py index 0635253..25b6895 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ dev_require = ["green"] EXTRAS = { "full": full_require, "docs": docs_require, - "tests": test_require, + "test": test_require + full_require, "dev": docs_require + test_require + dev_require + full_require, } -- cgit v1.2.3 From 7e1c84db7d11541062709eb5208c2f804fac4da8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 22:34:21 +0200 Subject: Move green to test dependencies --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 25b6895..54a8cb1 100644 --- a/setup.py +++ b/setup.py @@ -34,8 +34,8 @@ REQUIRED = [ full_require = ["readabilipy"] docs_require = [] -test_require = [] -dev_require = ["green"] +test_require = ["green"] +dev_require = [] # What packages are optional? 
EXTRAS = { -- cgit v1.2.3 From 3b5e7eb5f34f92496aa96ee088db2925eadafd65 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Sep 2020 23:36:47 +0200 Subject: Improve docs --- paper2remarkable/providers/html.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py index b734bd1..e050ea3 100644 --- a/paper2remarkable/providers/html.py +++ b/paper2remarkable/providers/html.py @@ -134,14 +134,17 @@ class HTML(Provider): return url, url def retrieve_pdf(self, pdf_url, filename): - """Turn the HTML article in a clean pdf file""" - # Steps - # 1. Pull the HTML page using requests - # 2. Extract the article part of the page using readability - # 3. Convert the article HTML to markdown using html2text - # 4. Convert the markdown back to HTML (this is done to sanitize HTML) - # 4. Convert the HTML to PDF, pulling in images where needed - # 5. Save the PDF to the specified filename. + """Turn the HTML article in a clean pdf file + + This function takes the following steps: + + 1. Pull the HTML page using requests, if not done in Informer + 2. Extract the article part of the page using readability/readabiliPy + 3. Convert the article HTML to markdown using html2text + 4. Convert the markdown back to HTML (done to sanitize the HTML) + 4. Convert the HTML to PDF, pulling in images where needed + 5. Save the PDF to the specified filename. 
+ """ if self.informer._cached_title and self.informer._cached_article: title = self.informer._cached_title article = self.informer._cached_article -- cgit v1.2.3 From 8999a8e83fdafda758881fbb762ad4d88686ccfc Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 26 Sep 2020 00:24:59 +0200 Subject: Readme updates --- README.md | 118 +++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 8b1db06..341b588 100644 --- a/README.md +++ b/README.md @@ -19,10 +19,12 @@ $ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines The script can be run through the ``p2r`` command line program or via Docker (see below). If you're using MacOS, you might be interested in the [Alfred -workflow](#alfred) or [Printing to p2r](#printing). On Linux, a background -terminal such as [Guake](http://guake-project.org/) can be very handy. Note -that even without a reMarkable, this program can make downloading papers -easier (just use the `-n` flag). +workflow](#alfred-workflow) or [Printing to p2r](#printing). On Linux, a +background terminal such as [Guake](http://guake-project.org/) can be very +handy. Note that even without a reMarkable, this program can make downloading +papers easier (just use the `-n` flag). 
+ +## Introduction ``paper2remarkable`` makes it as easy as possible to get a PDF on your reMarkable from any of the following sources: @@ -68,43 +70,8 @@ Optionally, you can: - Provide an explicit filename using the ``--filename`` parameter - Specify the location on the reMarkable to place the file (default ``/``) -Here's the full help of the script: - -``` -usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] - [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] - [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] - input - -Paper2reMarkable version 0.7.1 - -positional arguments: - input URL to a paper or the path of a local PDF file - -optional arguments: - -h, --help show this help message and exit - -b, --blank Add a blank page after every page of the PDF - -c, --center Center the PDF on the page, instead of left align - -d, --debug debug mode, doesn't upload to reMarkable - -n, --no-upload don't upload to the reMarkable, save the output in - current working dir - -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR - directory on reMarkable to put the file (created if - missing, default: /) - -r, --right Right align so the menu doesn't cover it - -k, --no-crop Don't crop the pdf file - -v, --verbose be verbose - -V, --version Show version and exit - --filename FILENAME Filename to use for the file on reMarkable - --gs GS path to gs executable (default: gs) - --pdftoppm PDFTOPPM path to pdftoppm executable (default: pdftoppm) - --pdftk PDFTK path to pdftk executable (default: pdftk) - --qpdf QPDF path to qpdf executable (default: qpdf) - --rmapi RMAPI path to rmapi executable (default: rmapi) -``` - -And here's an example with verbose mode enabled that shows everything the -script does by default: +Here's an example with verbose mode enabled that shows everything the script +does by default: ``` $ p2r -v https://arxiv.org/abs/1811.11242 @@ -132,9 +99,6 @@ The script requires the following external programs to be available: manager provides. 
- [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -- Optional: [pdftoppm](https://linux.die.net/man/1/pdftoppm) (recommended for - speed). Usually part of a [Poppler](https://poppler.freedesktop.org/) - installation. Specifically: @@ -162,12 +126,62 @@ Specifically: ``` this installs the ``p2r`` command line program. -If any of the dependencies are not available on the ``PATH`` variable, you can -supply them with the relevant options to the script (for instance ``p2r ---rmapi /path/to/rmapi``). If you run into trouble with the installation, -please let me know! +**Optionally**, you can install: + +- [pdftoppm](https://linux.die.net/man/1/pdftoppm) (recommended for speed). + Usually part of a [Poppler](https://poppler.freedesktop.org/) installation. + +- the [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy) + package with Node.js support, to allow using + [Readability.js](https://github.com/mozilla/readability) for HTML articles. + This is known to improve the output of certain web articles. -## Alfred +If any of the dependencies (such as rmapi or ghostscript) are not available on +the ``PATH`` variable, you can supply them with the relevant options to the +script (for instance ``p2r --rmapi /path/to/rmapi``). If you run into trouble +with the installation, please let me know by opening an issue [on +Github][github-url]. + +## Usage + +The full help of the script is as follows. Hopefully the various command line +flags are self-explanatory, but if you'd like more information, please open an +issue [on GitHub][github-url]. 
+ +``` +usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] + [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] + [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] + input + +Paper2reMarkable version 0.7.1 + +positional arguments: + input URL to a paper or the path of a local PDF file + +optional arguments: + -h, --help show this help message and exit + -b, --blank Add a blank page after every page of the PDF + -c, --center Center the PDF on the page, instead of left align + -d, --debug debug mode, doesn't upload to reMarkable + -n, --no-upload don't upload to the reMarkable, save the output in + current working dir + -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR + directory on reMarkable to put the file (created if + missing, default: /) + -r, --right Right align so the menu doesn't cover it + -k, --no-crop Don't crop the pdf file + -v, --verbose be verbose + -V, --version Show version and exit + --filename FILENAME Filename to use for the file on reMarkable + --gs GS path to gs executable (default: gs) + --pdftoppm PDFTOPPM path to pdftoppm executable (default: pdftoppm) + --pdftk PDFTK path to pdftk executable (default: pdftk) + --qpdf QPDF path to qpdf executable (default: qpdf) + --rmapi RMAPI path to rmapi executable (default: rmapi) +``` + +## Alfred Workflow On MacOS, you can optionally install [this Alfred workflow][workflow]. Alfred is [a launcher for MacOS](https://www.alfredapp.com/). @@ -264,5 +278,7 @@ Then you can use ``paper2remarkable`` from the command line as ``p2r``! License: MIT -If you find a problem or want to suggest a feature, please let us know! You're -helping to make this project better! +If you find a problem or want to suggest a feature, please open an issue [on +Github][github-url]. You're helping to make this project better for everyone! 
+ +[github-url]: https://github.com/GjjvdBurg/paper2remarkable -- cgit v1.2.3 From f02325944489a12b481ca3877cda0b4a44c93203 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 26 Sep 2020 00:55:10 +0200 Subject: Bump version and update changelog --- CHANGELOG.md | 11 +++++++++++ README.md | 2 +- paper2remarkable/__version__.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5dfd414..b3cc27b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## Version 0.7.2 + +* Add support to optionally use + [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy), a + wrapper around Mozilla's + [Readability.js](https://github.com/mozilla/readability), to improve text + extraction of web articles. This closes + [#53](https://github.com/GjjvdBurg/paper2remarkable/issues/53), thanks to + @sirupsen for reporting the problem. +* Improve NeurIPS provider to add support for papers.neurips.cc + ## Version 0.7.1 * Fix OpenReview provider after site change diff --git a/README.md b/README.md index 341b588..957efe9 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] input -Paper2reMarkable version 0.7.1 +Paper2reMarkable version 0.7.2 positional arguments: input URL to a paper or the path of a local PDF file diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index e501a41..79572fa 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 7, 1) +VERSION = (0, 7, 2) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 9dca9ef0b1eaec44e8c521811b4e8370ba84375b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 21:27:06 +0200 Subject: Remove use of virtualenv in release script --- make_release.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/make_release.py b/make_release.py index d729c8a..f3bc9f2 100644 --- a/make_release.py +++ b/make_release.py @@ -157,7 +157,7 @@ class InstallFromTestPyPI(Step): def action(self, context): tmpvenv = tempfile.mkdtemp(prefix="p2r_venv_") self.do_cmd( - f"virtualenv {tmpvenv} && source {tmpvenv}/bin/activate && " + f"python -m venv {tmpvenv} && source {tmpvenv}/bin/activate && " "pip install --no-cache-dir --index-url " "https://test.pypi.org/simple/ " "--extra-index-url https://pypi.org/simple " -- cgit v1.2.3 From fcd8d3cd1d94780315a82655ce6b9571534c0a7d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:00:58 +0200 Subject: Updates to NBER provider after site updates --- paper2remarkable/providers/nber.py | 23 ++++++++++++++++++++--- tests/test_ui.py | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py index 76bc85f..fa51e8a 100644 --- a/paper2remarkable/providers/nber.py +++ b/paper2remarkable/providers/nber.py @@ -18,8 +18,11 @@ from ..exceptions import URLResolutionError class NBERInformer(Informer): - def _format_year(self, soup_date): - return soup_date.split("-")[0] + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors, sep=" ", idx=0, op=None): + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=None) class NBER(Provider): @@ -27,10 +30,20 @@ class NBER(Provider): re_abs = "https?://www\.nber\.org/papers/(?P[a-z0-9]+)$" re_pdf = "https?://www\.nber\.org/papers/(?P[a-z0-9]+)\.pdf$" + re_pdf_2 = "https://www.nber.org/system/files/working_papers/(?P[a-z0-9]+)/(?P=ref).pdf" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.informer = NBERInformer() + def get_report_no(self, url): + m = re.match(self.re_pdf_2, url) + if m: + return m["ref"] + raise URLResolutionError( + "NBER", url, reason="Failed to retrieve report number." 
+ ) + def get_abs_pdf_urls(self, url): if re.match(self.re_abs, url): abs_url = url @@ -38,9 +51,13 @@ class NBER(Provider): elif re.match(self.re_pdf, url): pdf_url = url abs_url = url[: -len(".pdf")] + elif re.match(self.re_pdf_2, url): + ref = self.get_report_no(url) + abs_url = f"https://www.nber.org/papers/{ref}" + pdf_url = url else: raise URLResolutionError("NBER", url) return abs_url, pdf_url def validate(src): - return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src) + return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src) or re.match(NBER.re_pdf_2, src) diff --git a/tests/test_ui.py b/tests/test_ui.py index 61b371d..97ec44d 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -137,7 +137,7 @@ class TestUI(unittest.TestCase): ( NBER, "https://www.nber.org/papers/w19152.pdf", - "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/system/files/working_papers/w19152/w19152.pdf", ), ( NeurIPS, -- cgit v1.2.3 From b77c06ad3deb27b90a91f468b0123923d217d53d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:11:18 +0200 Subject: Increase robustness for arXiv urls --- paper2remarkable/providers/arxiv.py | 2 ++ tests/test_providers.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 0385f94..317452e 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -45,6 +45,8 @@ class Arxiv(Provider): def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ + if '?' 
in url: + url = url[:url.index('?')] if re.match(self.re_abs_1, url) or re.match(self.re_abs_2, url): abs_url = url pdf_url = url.replace("abs", "pdf") + ".pdf" diff --git a/tests/test_providers.py b/tests/test_providers.py index 70d012a..12f748e 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -95,6 +95,13 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_6(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------" + exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_with_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" -- cgit v1.2.3 From 1f07867ec7aebb1b1aa6b806e35a46e73c034605 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:15:43 +0200 Subject: fix typo --- tests/test_providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index 12f748e..b6cce59 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -98,7 +98,7 @@ class TestProviders(unittest.TestCase): def test_arxiv_6(self): prov = Arxiv(upload=False, verbose=VERBOSE) url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------" - exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_with_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" + exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_With_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) -- cgit v1.2.3 From 
a33321dfbad489cfce75d0c11fcbce3d21e64b02 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 22:51:44 +0200 Subject: Add support for handling multiple inputs simultaneously --- paper2remarkable/ui.py | 66 +++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 2fbf49f..ea24403 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -75,7 +75,7 @@ def parse_args(): parser.add_argument( "--filename", help="Filename to use for the file on reMarkable", - default=None, + action="append", ) parser.add_argument( "--gs", help="path to gs executable (default: gs)", default="gs" @@ -101,7 +101,9 @@ def parse_args(): default="rmapi", ) parser.add_argument( - "input", help="URL to a paper or the path of a local PDF file" + "input", + help="One or more URLs to a paper or paths to local PDF files", + nargs="+", ) return parser.parse_args() @@ -122,23 +124,23 @@ def exception(msg): def choose_provider(cli_input): """Choose the provider to use for the given source - This function first tries to check if the input is a local file, by - checking if the path exists. Next, it checks if the input is a "valid" url - using a regex test. If it is, the registered provider classes are checked + This function first tries to check if the input is a local file, by + checking if the path exists. Next, it checks if the input is a "valid" url + using a regex test. If it is, the registered provider classes are checked to see which provider can handle this url. Returns ------- provider : class - The class of the provider than can handle the source. A subclass of the + The class of the provider than can handle the source. A subclass of the Provider abc. new_input : str - The updated input to the provider. This only has an effect for the url + The updated input to the provider. 
This only has an effect for the url providers, where this will be the url after following all redirects. cookiejar : dict or requests.RequestsCookieJar - Cookies picked up when following redirects. These are needed for some + Cookies picked up when following redirects. These are needed for some providers to ensure later requests have the right cookie settings. Raises @@ -194,23 +196,31 @@ def main(): if args.right and args.no_crop: exception("Can't right align and not crop at the same time!") - provider, new_input, cookiejar = choose_provider(args.input) - - prov = provider( - verbose=args.verbose, - upload=not args.no_upload, - debug=args.debug, - center=args.center, - right=args.right, - blank=args.blank, - no_crop=args.no_crop, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - pdftoppm_path=args.pdftoppm, - pdftk_path=args.pdftk, - qpdf_path=args.qpdf, - gs_path=args.gs, - cookiejar=cookiejar, - ) - - prov.run(new_input, filename=args.filename) + if args.filename and not len(args.filename) == len(args.input): + exception( + "When providing --filename and multiple inputs, their number must match." 
+ ) + + filenames = ( + [None] * len(args.input) if not args.filename else args.filename + ) + + for cli_input, filename in zip(args.input, filenames): + provider, new_input, cookiejar = choose_provider(cli_input) + prov = provider( + verbose=args.verbose, + upload=not args.no_upload, + debug=args.debug, + center=args.center, + right=args.right, + blank=args.blank, + no_crop=args.no_crop, + remarkable_dir=args.remarkable_dir, + rmapi_path=args.rmapi, + pdftoppm_path=args.pdftoppm, + pdftk_path=args.pdftk, + qpdf_path=args.qpdf, + gs_path=args.gs, + cookiejar=cookiejar, + ) + prov.run(new_input, filename=filename) -- cgit v1.2.3 From 0ad030303295e98330c9fd21703b8977e85e9a82 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 23:17:32 +0200 Subject: formatting --- paper2remarkable/providers/nber.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py index fa51e8a..28e0973 100644 --- a/paper2remarkable/providers/nber.py +++ b/paper2remarkable/providers/nber.py @@ -60,4 +60,8 @@ class NBER(Provider): return abs_url, pdf_url def validate(src): - return re.match(NBER.re_abs, src) or re.match(NBER.re_pdf, src) or re.match(NBER.re_pdf_2, src) + return ( + re.match(NBER.re_abs, src) + or re.match(NBER.re_pdf, src) + or re.match(NBER.re_pdf_2, src) + ) -- cgit v1.2.3 From 13a5a27233c650612253264a2f2818550df8b883 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Tue, 20 Oct 2020 23:26:02 +0200 Subject: Bump version and update changelog --- CHANGELOG.md | 6 ++++++ README.md | 18 +++++++++--------- paper2remarkable/__version__.py | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3cc27b..9a9afc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Version 0.7.3 + +* Increase robustness for arXiv sources +* Fix NBER provider after site update +* Add support for multiple command 
line inputs + ## Version 0.7.2 * Add support to optionally use diff --git a/README.md b/README.md index 957efe9..20a0978 100644 --- a/README.md +++ b/README.md @@ -150,25 +150,25 @@ issue [on GitHub][github-url]. ``` usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] - [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] - [--pdftk PDFTK] [--qpdf QPDF] [--rmapi RMAPI] - input + [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] [--pdftk PDFTK] + [--qpdf QPDF] [--rmapi RMAPI] + input [input ...] -Paper2reMarkable version 0.7.2 +Paper2reMarkable version 0.7.3 positional arguments: - input URL to a paper or the path of a local PDF file + input One or more URLs to a paper or paths to local PDF files optional arguments: -h, --help show this help message and exit -b, --blank Add a blank page after every page of the PDF -c, --center Center the PDF on the page, instead of left align -d, --debug debug mode, doesn't upload to reMarkable - -n, --no-upload don't upload to the reMarkable, save the output in - current working dir + -n, --no-upload don't upload to the reMarkable, save the output in current working + dir -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR - directory on reMarkable to put the file (created if - missing, default: /) + directory on reMarkable to put the file (created if missing, default: + /) -r, --right Right align so the menu doesn't cover it -k, --no-crop Don't crop the pdf file -v, --verbose be verbose diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 79572fa..2010c24 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 7, 2) +VERSION = (0, 7, 3) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3 From 8dc5bd11c7de301a70fd5e2b5cf36e84590c2c69 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 23 Oct 2020 12:24:50 +0200 Subject: Code formatting --- paper2remarkable/log.py | 9 
++++----- paper2remarkable/providers/arxiv.py | 8 ++++---- paper2remarkable/utils.py | 10 +++++----- tests/test_providers.py | 5 ++--- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py index 3a2fcc5..fb9d8a3 100644 --- a/paper2remarkable/log.py +++ b/paper2remarkable/log.py @@ -38,7 +38,7 @@ class Logger(metaclass=Singleton): def disable(self): self.enabled = False - def _log(self, msg, mode, end='\n', add_prefix=True): + def _log(self, msg, mode, end="\n", add_prefix=True): if not self.enabled: return if not mode in ("info", "warn"): @@ -53,12 +53,11 @@ class Logger(metaclass=Singleton): print("%s%s" % (prefix, msg), end=end, file=file) file.flush() - def info(self, msg, end='\n'): + def info(self, msg, end="\n"): self._log(msg, "info", end=end) - def warning(self, msg, end='\n'): + def warning(self, msg, end="\n"): self._log(msg, "warn", end=end) - def append(self, msg, mode, end='\n'): + def append(self, msg, mode, end="\n"): self._log(msg, mode, end=end, add_prefix=False) - diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 317452e..865a3f8 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -45,8 +45,8 @@ class Arxiv(Provider): def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ - if '?' in url: - url = url[:url.index('?')] + if "?" in url: + url = url[: url.index("?")] if re.match(self.re_abs_1, url) or re.match(self.re_abs_2, url): abs_url = url pdf_url = url.replace("abs", "pdf") + ".pdf" @@ -119,8 +119,8 @@ class Arxiv(Provider): ) # remove the url (type 2, i.e. 
Jackson arXiv 0309285v2) block, n_subs3 = re.subn( - b"<<\n\/S \/URI\n" + - b"/URI \(" + b"<<\n\/S \/URI\n" + + b"/URI \(" + DEARXIV_URI_REGEX + b"\)\n>>\n", b"", diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 07b1524..a29be3c 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -38,7 +38,7 @@ def clean_string(s): cleaned = "".join(c if c in allowed else "_" for c in normalized) while "__" in cleaned: cleaned = cleaned.replace("__", "_") - cleaned = cleaned.strip('_') + cleaned = cleaned.strip("_") return cleaned @@ -186,10 +186,10 @@ def check_pdftool(pdftk_path, qpdf_path): return "pdftk" try: status = subprocess.call( - [qpdf_path, "--help"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) + [qpdf_path, "--help"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) except FileNotFoundError: status = 1 if status == 0: diff --git a/tests/test_providers.py b/tests/test_providers.py index b6cce59..546794c 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -285,8 +285,8 @@ class TestProviders(unittest.TestCase): def test_html_3(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://conclave-team.github.io/conclave-site/" - #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" - # NOTE: Title differs between Readability.JS and readability-lxml, we + # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we # assume that testing is done with Readability.JS exp = "Conclave.pdf" filename = prov.run(url) @@ -337,6 +337,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(exp, os.path.basename(filename)) - if __name__ == "__main__": unittest.main() -- cgit v1.2.3 From 1e83f9f6537fa108d7a157daaaeb3dc06e80fdce Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 23 Oct 2020 12:24:50 +0200 Subject: Code formatting 
--- paper2remarkable/crop.py | 19 ++++++++++--------- paper2remarkable/log.py | 9 ++++----- paper2remarkable/pdf_ops.py | 6 ++---- paper2remarkable/providers/_base.py | 24 ++++++++++++++++++++---- paper2remarkable/providers/_info.py | 12 ++++++------ paper2remarkable/providers/arxiv.py | 12 +++++++----- paper2remarkable/providers/pdf_url.py | 4 +++- paper2remarkable/utils.py | 17 +++++++++-------- tests/test_providers.py | 5 ++--- 9 files changed, 63 insertions(+), 45 deletions(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index dc4b31c..573225b 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -27,8 +27,8 @@ logger = Logger() def find_offset_byte_line(line): """Find index of first nonzero bit in a line of bytes - The given line is a string of bytes, each representing 8 pixels. This code - finds the index of the first bit that is not zero. Used when find the + The given line is a string of bytes, each representing 8 pixels. This code + finds the index of the first bit that is not zero. Used when find the cropbox with pdftoppm. 
""" off = 0 @@ -46,8 +46,7 @@ def find_offset_byte_line(line): def check_pdftoppm(pth): - """Check that we can run the provided pdftoppm executable - """ + """Check that we can run the provided pdftoppm executable""" try: subprocess.check_output([pth, "-v"], stderr=subprocess.DEVNULL) except (subprocess.CalledProcessError, FileNotFoundError, PermissionError): @@ -58,7 +57,10 @@ def check_pdftoppm(pth): class Cropper(object): def __init__( - self, input_file=None, output_file=None, pdftoppm_path="pdftoppm", + self, + input_file=None, + output_file=None, + pdftoppm_path="pdftoppm", ): if not input_file is None: self.input_file = os.path.abspath(input_file) @@ -220,7 +222,7 @@ class Cropper(object): if margins is integer, used for all margins, else margins = [left, top, right, bottom] - We get the bounding box by finding the smallest rectangle that is + We get the bounding box by finding the smallest rectangle that is completely surrounded by white pixels. """ if isinstance(margins, int): @@ -254,7 +256,7 @@ class Cropper(object): return [a0, b0, a1, b1] def get_center_bbox(self, filename, padding=15): - """Compute a bounding box that will center the page file on the + """Compute a bounding box that will center the page file on the reMarkable """ bbox = self.get_bbox(filename, margins=0) @@ -279,8 +281,7 @@ class Cropper(object): return self.get_bbox(filename, margins=margins) def get_right_bbox(self, filename, padding=15): - """Get the bounding box that ensures the menu doesn't hide the text - """ + """Get the bounding box that ensures the menu doesn't hide the text""" bbox = self.get_bbox(filename, margins=0) diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py index 3a2fcc5..fb9d8a3 100644 --- a/paper2remarkable/log.py +++ b/paper2remarkable/log.py @@ -38,7 +38,7 @@ class Logger(metaclass=Singleton): def disable(self): self.enabled = False - def _log(self, msg, mode, end='\n', add_prefix=True): + def _log(self, msg, mode, end="\n", add_prefix=True): if 
not self.enabled: return if not mode in ("info", "warn"): @@ -53,12 +53,11 @@ class Logger(metaclass=Singleton): print("%s%s" % (prefix, msg), end=end, file=file) file.flush() - def info(self, msg, end='\n'): + def info(self, msg, end="\n"): self._log(msg, "info", end=end) - def warning(self, msg, end='\n'): + def warning(self, msg, end="\n"): self._log(msg, "warn", end=end) - def append(self, msg, mode, end='\n'): + def append(self, msg, mode, end="\n"): self._log(msg, mode, end=end, add_prefix=False) - diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index 41cb85f..c365920 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -40,8 +40,7 @@ def prepare_pdf(filepath, operation, pdftoppm_path="pdftoppm"): def blank_pdf(filepath): - """Add blank pages to PDF - """ + """Add blank pages to PDF""" logger.info("Adding blank pages") input_pdf = PyPDF2.PdfFileReader(filepath) output_pdf = PyPDF2.PdfFileWriter() @@ -56,8 +55,7 @@ def blank_pdf(filepath): def shrink_pdf(filepath, gs_path="gs"): - """Shrink the PDF file size using Ghostscript - """ + """Shrink the PDF file size using Ghostscript""" logger.info("Shrinking pdf file ...") size_before = os.path.getsize(filepath) output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 107f006..8f82f1d 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -122,7 +122,12 @@ class Provider(metaclass=abc.ABCMeta): ) elif self.pdftool == "qpdf": status = subprocess.call( - [self.qpdf_path, "--stream-data=compress", in_pdf, out_pdf,], + [ + self.qpdf_path, + "--stream-data=compress", + in_pdf, + out_pdf, + ], stderr=subprocess.DEVNULL, ) if not status == 0: @@ -131,7 +136,7 @@ class Provider(metaclass=abc.ABCMeta): ) def rewrite_pdf(self, in_pdf, out_pdf=None): - """ Re-write the pdf using Ghostscript + """Re-write the pdf using Ghostscript This helps 
avoid issues in dearxiv due to nested pdfs. """ @@ -159,11 +164,22 @@ class Provider(metaclass=abc.ABCMeta): if self.pdftool == "pdftk": status = subprocess.call( - [self.pdftk_path, in_pdf, "output", out_pdf, "uncompress",] + [ + self.pdftk_path, + in_pdf, + "output", + out_pdf, + "uncompress", + ] ) elif self.pdftool == "qpdf": status = subprocess.call( - [self.qpdf_path, "--stream-data=uncompress", in_pdf, out_pdf,] + [ + self.qpdf_path, + "--stream-data=uncompress", + in_pdf, + out_pdf, + ] ) if not status == 0: raise _CalledProcessError( diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py index 10b6959..8cffc60 100644 --- a/paper2remarkable/providers/_info.py +++ b/paper2remarkable/providers/_info.py @@ -16,12 +16,12 @@ logger = Logger() class Informer: """Base class for the informers. - The "informer" class is used to retrieve the title, authors, and year of + The "informer" class is used to retrieve the title, authors, and year of publication of the provided paper. - This base class provides the main functionality, but because various - outlets use different conventions to embed author, title, and publication - year information, we expect that individual providers will subclass this + This base class provides the main functionality, but because various + outlets use different conventions to embed author, title, and publication + year information, we expect that individual providers will subclass this class and overwrite some of the methods. """ @@ -35,9 +35,9 @@ class Informer: self.year = year def get_filename(self, abs_url): - """ Generate nice filename using the paper information + """Generate nice filename using the paper information - The provided url must be to a HTMl page where this information can be + The provided url must be to a HTMl page where this information can be found, not to the PDF file itself. 
""" logger.info("Generating output filename") diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 317452e..6ec1796 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -45,8 +45,8 @@ class Arxiv(Provider): def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ - if '?' in url: - url = url[:url.index('?')] + if "?" in url: + url = url[: url.index("?")] if re.match(self.re_abs_1, url) or re.match(self.re_abs_2, url): abs_url = url pdf_url = url.replace("abs", "pdf") + ".pdf" @@ -107,7 +107,9 @@ class Arxiv(Provider): block = b"".join(current_obj) # remove the text block, n_subs1 = re.subn( - b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", block, + b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", + b"()Tj", + block, ) # remove the url (type 1) block, n_subs2 = re.subn( @@ -119,8 +121,8 @@ class Arxiv(Provider): ) # remove the url (type 2, i.e. Jackson arXiv 0309285v2) block, n_subs3 = re.subn( - b"<<\n\/S \/URI\n" + - b"/URI \(" + b"<<\n\/S \/URI\n" + + b"/URI \(" + DEARXIV_URI_REGEX + b"\)\n>>\n", b"", diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index b86c7c3..d20d4a5 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -28,7 +28,9 @@ class PdfUrlInformer(Informer): path_parts = parsed.path.split("/") if not path_parts: raise FilenameMissingError( - provider="PdfUrl", url=abs_url, reason="No URL parts", + provider="PdfUrl", + url=abs_url, + reason="No URL parts", ) filename = path_parts[-1] diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 07b1524..0b4be07 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -30,15 +30,15 @@ logger = Logger() def clean_string(s): - """ Clean a string by replacing accented characters with equivalents and - keeping only the allowed characters (ascii letters, digits, underscore, + """Clean a string by 
replacing accented characters with equivalents and + keeping only the allowed characters (ascii letters, digits, underscore, space, dash, and period)""" normalized = unidecode.unidecode(s) allowed = string.ascii_letters + string.digits + "_ .-" cleaned = "".join(c if c in allowed else "_" for c in normalized) while "__" in cleaned: cleaned = cleaned.replace("__", "_") - cleaned = cleaned.strip('_') + cleaned = cleaned.strip("_") return cleaned @@ -142,7 +142,8 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): while parts: rmdir += "/" + parts.pop(0) status = subprocess.call( - [rmapi_path, "mkdir", rmdir], stdout=subprocess.DEVNULL, + [rmapi_path, "mkdir", rmdir], + stdout=subprocess.DEVNULL, ) if not status == 0: raise RemarkableError( @@ -186,10 +187,10 @@ def check_pdftool(pdftk_path, qpdf_path): return "pdftk" try: status = subprocess.call( - [qpdf_path, "--help"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) + [qpdf_path, "--help"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) except FileNotFoundError: status = 1 if status == 0: diff --git a/tests/test_providers.py b/tests/test_providers.py index b6cce59..546794c 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -285,8 +285,8 @@ class TestProviders(unittest.TestCase): def test_html_3(self): prov = HTML(upload=False, verbose=VERBOSE) url = "https://conclave-team.github.io/conclave-site/" - #exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" - # NOTE: Title differs between Readability.JS and readability-lxml, we + # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we # assume that testing is done with Readability.JS exp = "Conclave.pdf" filename = prov.run(url) @@ -337,6 +337,5 @@ class TestProviders(unittest.TestCase): self.assertEqual(exp, os.path.basename(filename)) - if __name__ 
== "__main__": unittest.main() -- cgit v1.2.3 From 015f229b7706390fd5dadb8701e382d19fae4f68 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 23 Oct 2020 12:38:36 +0200 Subject: Add pre-commit config for code formatting This is mainly to try out pre-commit --- .pre-commit-config.yaml | 6 ++++++ .travis.yml | 2 ++ pyproject.toml | 2 ++ 3 files changed, 10 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3cb791c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: + - repo: https://github.com/psf/black + rev: 20.8b1 + hooks: + - id: black + language_version: python3 diff --git a/.travis.yml b/.travis.yml index 6a57cd3..32a2a1e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,9 @@ before_install: - nvm use v12.18.1 install: + - pip install pre-commit - pip install -e .[test] script: + - pre-commit run --all-files --show-diff-on-failure - green -vv -a ./tests diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a8f43fe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 79 -- cgit v1.2.3 From 14cacacf3fd7b78b287ec7e6b127bd24f0ea4f56 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 23 Oct 2020 17:36:20 +0200 Subject: Add CVF provider --- README.md | 1 + paper2remarkable/providers/__init__.py | 2 ++ paper2remarkable/providers/cvf.py | 51 ++++++++++++++++++++++++++++++++++ tests/test_providers.py | 19 +++++++++++++ tests/test_ui.py | 6 ++++ 5 files changed, 79 insertions(+) create mode 100644 paper2remarkable/providers/cvf.py diff --git a/README.md b/README.md index 20a0978..dfb9be1 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ reMarkable from any of the following sources: * [arXiv](https://arxiv.org/) * [ACM Digital Library](https://dl.acm.org/dl.cfm) * [CiteSeerX](http://citeseerx.ist.psu.edu/index) +* 
[CVF](https://openaccess.thecvf.com/menu) * [JMLR](http://jmlr.org) * [NBER](https://www.nber.org) * [NeurIPS](https://papers.nips.cc/) diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index e3075f0..78fa370 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -3,6 +3,7 @@ from .acm import ACM from .arxiv import Arxiv from .citeseerx import CiteSeerX +from .cvf import CVF from .html import HTML from .jmlr import JMLR from .local import LocalFile @@ -21,6 +22,7 @@ providers = [ ACM, Arxiv, CiteSeerX, + CVF, JMLR, NBER, NeurIPS, diff --git a/paper2remarkable/providers/cvf.py b/paper2remarkable/providers/cvf.py new file mode 100644 index 0000000..76ca9c0 --- /dev/null +++ b/paper2remarkable/providers/cvf.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +"""Provider for CVF + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer + +from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() + + +class CVFInformer(Informer): + + meta_date_key = "citation_publication_date" + + +class CVF(Provider): + + re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$" + re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = CVFInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url[: -len(".html")] + pdf_url += ".pdf" + pdf_url = pdf_url.replace("html", "papers") + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url.replace("papers", "html").replace(".pdf", ".html") + else: + raise URLResolutionError("CVF", url) + return abs_url, pdf_url + + def validate(src): + m = re.match(CVF.re_abs, src) or re.match(CVF.re_pdf, src) + 
return not m is None diff --git a/tests/test_providers.py b/tests/test_providers.py index 546794c..e701234 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -16,6 +16,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + CVF, HTML, JMLR, LocalFile, @@ -336,6 +337,24 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_cvf_1(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_ICCV_2019/html/Muhammad_Goal-Driven_Sequential_Data_Abstraction_ICCV_2019_paper.html" + exp = ( + "Muhammad_et_al_-_Goal-Driven_Sequential_Data_Abstraction_2019.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_cvf_2(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_CVPR_2020/papers/Park_Seeing_the_World_in_a_Bag_of_Chips_CVPR_2020_paper.pdf" + exp = ( + "Park_Holynski_Seitz_-_Seeing_the_World_in_a_Bag_of_Chips_2020.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_ui.py b/tests/test_ui.py index 97ec44d..835f594 100644 --- a/tests/test_ui.py +++ b/tests/test_ui.py @@ -20,6 +20,7 @@ from paper2remarkable.providers import ( ACM, Arxiv, CiteSeerX, + CVF, HTML, JMLR, LocalFile, @@ -174,6 +175,11 @@ class TestUI(unittest.TestCase): "https://www.nature.com/articles/d41586-020-00176-4", "https://www.nature.com/articles/d41586-020-00176-4", ), + ( + CVF, + "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html", + "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html", + ), ] for exp_prov, url, exp_url in tests: prov, new_url, jar = choose_provider(url) -- cgit v1.2.3 From ee2e846372e950bfce554d86648abfb9ac42d189 Mon Sep 17 00:00:00 2001 From: 
Gertjan van den Burg Date: Sat, 24 Oct 2020 00:38:38 +0200 Subject: fix comment --- paper2remarkable/crop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index 573225b..623d29f 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -28,7 +28,7 @@ def find_offset_byte_line(line): """Find index of first nonzero bit in a line of bytes The given line is a string of bytes, each representing 8 pixels. This code - finds the index of the first bit that is not zero. Used when find the + finds the index of the first bit that is not zero. Used when finding the cropbox with pdftoppm. """ off = 0 -- cgit v1.2.3 From bb43b8e634bab85d6fdff2fa7b47a1884041ae10 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Sat, 24 Oct 2020 01:04:29 +0200 Subject: Bump version and update changelog --- CHANGELOG.md | 4 ++++ README.md | 6 ++---- paper2remarkable/__version__.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a9afc0..6518b8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.7.4 + +* Add provider for CVF + ## Version 0.7.3 * Increase robustness for arXiv sources diff --git a/README.md b/README.md index dfb9be1..1d74caa 100644 --- a/README.md +++ b/README.md @@ -165,11 +165,9 @@ optional arguments: -b, --blank Add a blank page after every page of the PDF -c, --center Center the PDF on the page, instead of left align -d, --debug debug mode, doesn't upload to reMarkable - -n, --no-upload don't upload to the reMarkable, save the output in current working - dir + -n, --no-upload don't upload to the reMarkable, save the output in current working dir -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR - directory on reMarkable to put the file (created if missing, default: - /) + directory on reMarkable to put the file (created if missing, default: /) -r, --right Right align so the menu doesn't cover it 
-k, --no-crop Don't crop the pdf file -v, --verbose be verbose diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 2010c24..5c0adff 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 7, 3) +VERSION = (0, 7, 4) __version__ = ".".join(map(str, VERSION)) -- cgit v1.2.3