aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2020-10-27 20:59:17 +0100
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2020-10-27 20:59:17 +0100
commit282de79f44e95b539c1788de8a71177b5a023557 (patch)
tree6a6a26e210dc32d4d6a5ed4d8bc0b581af9bbc0e
parent[WIP] Provider for Taylor and Francis Online (diff)
parentBump version and update changelog (diff)
downloadpaper2remarkable-282de79f44e95b539c1788de8a71177b5a023557.tar.gz
paper2remarkable-282de79f44e95b539c1788de8a71177b5a023557.zip
Merge branch 'master' into feature/tandfonline
-rw-r--r--.github/alfred.pngbin0 -> 135792 bytes
-rw-r--r--.pre-commit-config.yaml6
-rw-r--r--.travis.yml11
-rw-r--r--CHANGELOG.md144
-rw-r--r--Dockerfile11
-rw-r--r--Makefile8
-rw-r--r--README.md209
-rw-r--r--Remarkable.alfredworkflowbin0 -> 30549 bytes
-rw-r--r--make_release.py90
-rw-r--r--paper2remarkable/__version__.py2
-rw-r--r--paper2remarkable/crop.py216
-rw-r--r--paper2remarkable/exceptions.py70
-rw-r--r--paper2remarkable/log.py23
-rw-r--r--paper2remarkable/pdf_ops.py65
-rw-r--r--paper2remarkable/providers/__init__.py14
-rw-r--r--paper2remarkable/providers/_base.py113
-rw-r--r--paper2remarkable/providers/_info.py40
-rw-r--r--paper2remarkable/providers/arxiv.py178
-rw-r--r--paper2remarkable/providers/citeseerx.py4
-rw-r--r--paper2remarkable/providers/cvf.py51
-rw-r--r--paper2remarkable/providers/html.py186
-rw-r--r--paper2remarkable/providers/jmlr.py75
-rw-r--r--paper2remarkable/providers/nber.py67
-rw-r--r--paper2remarkable/providers/neurips.py4
-rw-r--r--paper2remarkable/providers/openreview.py32
-rw-r--r--paper2remarkable/providers/pdf_url.py46
-rw-r--r--paper2remarkable/providers/sagepub.py52
-rw-r--r--paper2remarkable/providers/semantic_scholar.py65
-rw-r--r--paper2remarkable/providers/springer.py47
-rw-r--r--paper2remarkable/ui.py161
-rw-r--r--paper2remarkable/utils.py84
-rw-r--r--pyproject.toml2
-rw-r--r--setup.py16
-rw-r--r--tests/test_arxiv.py122
-rw-r--r--tests/test_providers.py185
-rw-r--r--tests/test_ui.py214
-rw-r--r--tests/test_utils.py21
37 files changed, 2275 insertions, 359 deletions
diff --git a/.github/alfred.png b/.github/alfred.png
new file mode 100644
index 0000000..78a95d9
--- /dev/null
+++ b/.github/alfred.png
Binary files differ
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..3cb791c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+ - repo: https://github.com/psf/black
+ rev: 20.8b1
+ hooks:
+ - id: black
+ language_version: python3
diff --git a/.travis.yml b/.travis.yml
index 5551597..32a2a1e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: trusty
+dist: xenial
language: python
python:
@@ -6,11 +6,14 @@ python:
before_install:
- sudo apt-get update
- - sudo apt-get install ghostscript pdftk texlive-extra-utils poppler-utils
+ - sudo apt-get install ghostscript pdftk poppler-utils qpdf
+ - nvm install v12.18.1
+ - nvm use v12.18.1
install:
- - pip install six
- - pip install -e .[dev]
+ - pip install pre-commit
+ - pip install -e .[test]
script:
+ - pre-commit run --all-files --show-diff-on-failure
- green -vv -a ./tests
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 79ea620..6518b8e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,149 @@
# Changelog
+## Version 0.7.4
+
+* Add provider for CVF
+
+## Version 0.7.3
+
+* Increase robustness for arXiv sources
+* Fix NBER provider after site update
+* Add support for multiple command line inputs
+
+## Version 0.7.2
+
+* Add support to optionally use
+ [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy), a
+ wrapper around Mozilla's
+ [Readability.js](https://github.com/mozilla/readability), to improve text
+ extraction of web articles. This closes
+ [#53](https://github.com/GjjvdBurg/paper2remarkable/issues/53), thanks to
+ @sirupsen for reporting the problem.
+* Improve NeurIPS provider to add support for papers.neurips.cc
+
+## Version 0.7.1
+
+* Fix OpenReview provider after site change
+
+## Version 0.7.0
+
+* Add provider for SagePub
+
+## Version 0.6.9
+
+* Improve robustness of Springer provider
+
+## Version 0.6.8
+
+* Add provider for SemanticScholar papers
+* Fix bug that made ``no_crop`` option no longer work
+
+## Version 0.6.7
+
+* Increase robustness to PDF issues by passing through GhostScript (fixes
+ [#51](https://github.com/GjjvdBurg/paper2remarkable/issues/51)). Thanks to
+ @sirupsen.
+* Bugfix for code that removes arXiv stamp.
+
+## Version 0.6.6
+
+* Bugfix to url validation: allow underscore in subdomains.
+
+## Version 0.6.5
+
+* Corrections to code that removes the arXiv stamp
+ ([#49](https://github.com/GjjvdBurg/paper2remarkable/issues/49)). Thanks to
+ @mr-ubik.
+
+## Version 0.6.4
+
+* Further fixes for images in HTML sources
+ ([#45](https://github.com/GjjvdBurg/paper2remarkable/issues/45)). Thanks to
+ @sirupsen.
+
+## Version 0.6.3
+
+* Properly resolve image urls in HTML sources
+ ([#45](https://github.com/GjjvdBurg/paper2remarkable/issues/45)). Thanks to
+ @sirupsen.
+* Allow ``+`` in urls
+
+## Version 0.6.2
+
+* Print to log whether removing arXiv stamp was successful.
+* Fix bug that failed to correctly detect the pdf tool
+ ([#42](https://github.com/GjjvdBurg/paper2remarkable/issues/42)).
+
+## Version 0.6.1
+
+* Bugfix that makes removing the arXiv stamp more robust.
+
+## Version 0.6.0
+
+* The Dockerfile has been updated to use a more recent version of Cairo
+ ([#35](https://github.com/GjjvdBurg/paper2remarkable/issues/35)). Thanks to
+ @ClaytonJY.
+* We've added support for optionally using qpdf instead of pdftk
+ ([#36](https://github.com/GjjvdBurg/paper2remarkable/pull/36)). Thanks to
+ @delaere.
+* Resolving redirects has been improved, which solves an issue for the
+ Springer provider
+ ([#38](https://github.com/GjjvdBurg/paper2remarkable/pull/38)) and an issue
+ with some arXiv urls
+ ([#39](https://github.com/GjjvdBurg/paper2remarkable/pull/39)).
+* Unit tests were added for the provider selection.
+* The code that removes the arXiv stamp has been improved
+ ([#40](https://github.com/GjjvdBurg/paper2remarkable/pull/40)).
+* Tracebacks have been disabled outside of debug mode, showing clearer errors
+ ([#41](https://github.com/GjjvdBurg/paper2remarkable/pull/41)).
+
+## Version 0.5.6
+
+* Be more robust against missing pdftoppm executable.
+
+## Version 0.5.5
+
+* Fix bug for when the shrink operation returns bigger files
+ ([#33](https://github.com/GjjvdBurg/paper2remarkable/issues/33)).
+
+## Version 0.5.4
+
+* Add the option to not crop the file at all
+ ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/30)).
+* Add the option to right-align the file so the menu doesn't overlap
+ ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/31)).
+* Bugfix for validation for the JMLR provider
+
+## Version 0.5.3
+
+* Significantly speed up the program
+ ([#26](https://github.com/GjjvdBurg/paper2remarkable/issues/26))
+* Add provider for JMLR
+ ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/28)).
+* Bugfix for creating nested directories with ``-p`` option.
+
+## Version 0.5.2
+
+* Add provider for US National Bureau of Economic Research
+ ([#27](https://github.com/GjjvdBurg/paper2remarkable/pull/27)).
+* Automatically extract the filename from a pdf url where possible
+ ([#25](https://github.com/GjjvdBurg/paper2remarkable/issues/25)).
+* Speed up centering of pdfs by removing unnecessary cropping operation.
+* Improve robustness against missing metadata, remove spaces in author names,
+ and other minor improvements.
+
+## Version 0.5.1
+
+* Automatically detect when a HTML source is provided
+ ([#24](https://github.com/GjjvdBurg/paper2remarkable/pull/24))
+
+## Version 0.5.0
+
+* Add support for articles from the web using the ``--html`` flag
+ ([#23](https://github.com/GjjvdBurg/paper2remarkable/pull/23))
+* Add ``--version`` command to command line interface
+* Fix cropping bug that resulted in occassional rotated pages
+
## Version 0.4.6
* Add support for older arXiv URL scheme
diff --git a/Dockerfile b/Dockerfile
index 38db46b..e6fc152 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM golang:stretch AS rmapi
+FROM golang:buster AS rmapi
ENV GOPATH /go
ENV PATH ${GOPATH}/bin:/usr/local/go/bin:$PATH
@@ -7,18 +7,21 @@ ENV RMAPIREPO github.com/juruen/rmapi
RUN go get -u ${RMAPIREPO}
-FROM python:3.7-slim-stretch
+FROM python:3.7-slim-buster
# rmapi
COPY --from=rmapi /go/bin/rmapi /usr/bin/rmapi
-# imagemagick, pdftk, ghostscript, pdfcrop
+# needed to install openjdk-11-jre-headless
+RUN mkdir -p /usr/share/man/man1
+
+# imagemagick, pdftk, ghostscript, pdfcrop, weasyprint
RUN apt-get update \
&& apt-get install --no-install-recommends -y \
libmagickwand-dev \
pdftk \
ghostscript \
- texlive-extra-utils # contains pdfcrop
+ poppler-utils
RUN pip install --no-cache-dir paper2remarkable
diff --git a/Makefile b/Makefile
index eb3ce93..bcbc420 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ PACKAGE=paper2remarkable
DOC_DIR='./docs/'
VENV_DIR=/tmp/p2r_venv/
-.PHONY: help cover dist venv
+.PHONY: help dist venv
.DEFAULT_GOAL := help
@@ -48,15 +48,15 @@ doc: install ## Build documentation with Sphinx
cd $(DOC_DIR) && \
rm source/* && \
source $(VENV_DIR)/bin/activate && \
- sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \
+ sphinx-apidoc -H 'Paper2Remarkable API Documentation' -o source ../$(PACKAGE) && \
touch source/AUTOGENERATED
$(MAKE) -C $(DOC_DIR) html
venv: $(VENV_DIR)/bin/activate
$(VENV_DIR)/bin/activate:
- test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
- source $(VENV_DIR)/bin/activate && pip install -e .[dev] && pip install six
+ test -d $(VENV_DIR) || python -m venv $(VENV_DIR)
+ source $(VENV_DIR)/bin/activate && pip install -e .[dev]
touch $(VENV_DIR)/bin/activate
clean_venv:
diff --git a/README.md b/README.md
index dc05a23..1d74caa 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
# paper2remarkable
-[![PyPI
-version](https://badge.fury.io/py/paper2remarkable.svg)](https://pypi.org/project/paper2remarkable)
+[![PyPI version](https://badge.fury.io/py/paper2remarkable.svg)](https://pypi.org/project/paper2remarkable)
+[![Build Status](https://travis-ci.org/GjjvdBurg/paper2remarkable.svg?branch=master)](https://travis-ci.org/GjjvdBurg/paper2remarkable)
+[![Downloads](https://pepy.tech/badge/paper2remarkable/month)](https://pepy.tech/project/paper2remarkable/month)
``paper2remarkable`` is a command line program for quickly and easily
transferring an academic paper to your [reMarkable](https://remarkable.com/):
@@ -10,8 +11,20 @@ transferring an academic paper to your [reMarkable](https://remarkable.com/):
$ p2r https://arxiv.org/abs/1811.11242
```
-The script can be run through the ``p2r`` command line program or via Docker
-(see below).
+There is also support for transferring an article from a website:
+
+```
+$ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines
+```
+
+The script can be run through the ``p2r`` command line program or via Docker
+(see below). If you're using MacOS, you might be interested in the [Alfred
+workflow](#alfred-workflow) or [Printing to p2r](#printing). On Linux, a
+background terminal such as [Guake](http://guake-project.org/) can be very
+handy. Note that even without a reMarkable, this program can make downloading
+papers easier (just use the `-n` flag).
+
+## Introduction
``paper2remarkable`` makes it as easy as possible to get a PDF on your
reMarkable from any of the following sources:
@@ -19,22 +32,28 @@ reMarkable from any of the following sources:
* [arXiv](https://arxiv.org/)
* [ACM Digital Library](https://dl.acm.org/dl.cfm)
* [CiteSeerX](http://citeseerx.ist.psu.edu/index)
+* [CVF](https://openaccess.thecvf.com/menu)
+* [JMLR](http://jmlr.org)
+* [NBER](https://www.nber.org)
* [NeurIPS](https://papers.nips.cc/)
* [OpenReview](https://openreview.net/)
* [PMLR](http://proceedings.mlr.press/)
* [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/)
+* [SagePub](https://journals.sagepub.com/)
+* [SemanticScholar](https://www.semanticscholar.org/)
* [SpringerLink](https://link.springer.com/)
* A generic URL to a PDF file
* A local PDF file
+* Any article on a website
The program aims to be flexible to the exact source URL, so for many of the
-sources you can either provide a URL to the abstract page or to the PDF file.
-If you have an source that you would like to see added to the list, let me
-know!
+academic sources you can either provide a URL to the abstract page or to the
+PDF file. If you have a source that you would like to see added to the list,
+let me know!
``paper2remarkable`` takes the source URL and:
-1. Downloads the pdf if necessary
+1. Downloads the pdf
2. Removes the arXiv timestamp (for arXiv sources)
3. Crops the pdf to remove unnecessary borders
4. Shrinks the pdf file to reduce the filesize
@@ -47,43 +66,13 @@ Optionally, you can:
- Download a paper but not upload to the reMarkable using the ``-n`` switch.
- Insert a blank page after each page using the ``-b`` switch (useful for note
taking!)
-- Center the pdf on the reMarkable (default is left-aligned)
+- Center (``-c``) or right-align (``-r``) the pdf on the reMarkable (default
+ is left-aligned), or disable cropping altogether (``-k``).
- Provide an explicit filename using the ``--filename`` parameter
- Specify the location on the reMarkable to place the file (default ``/``)
-Here's the full help of the script:
-
-```text
-usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v]
- [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK]
- [--rmapi RMAPI]
- input
-
-Paper2reMarkable version 0.4.0
-
-positional arguments:
- input URL to a paper or the path of a local PDF file
-
-optional arguments:
- -h, --help show this help message and exit
- -b, --blank Add a blank page after every page of the PDF
- -c, --center Center the PDF on the page, instead of left align
- -d, --debug debug mode, doesn't upload to reMarkable
- -n, --no-upload don't upload to the reMarkable, save the output in
- current working dir
- -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR
- directory on reMarkable to put the file (created if
- missing, default: /)
- -v, --verbose be verbose
- --filename FILENAME Filename to use for the file on reMarkable
- --gs GS path to gs executable (default: gs)
- --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop)
- --pdftk PDFTK path to pdftk executable (default: pdftk)
- --rmapi RMAPI path to rmapi executable (default: rmapi)
-```
-
-And here's an example with verbose mode enabled that shows everything the
-script does by default:
+Here's an example with verbose mode enabled that shows everything the script
+does by default:
```
$ p2r -v https://arxiv.org/abs/1811.11242
@@ -105,26 +94,129 @@ $ p2r -v https://arxiv.org/abs/1811.11242
The script requires the following external programs to be available:
-- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
-- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a
- LaTeX installation.
+- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/),
+ [qpdf](http://qpdf.sourceforge.net/), or
+ [pdftk-java](https://gitlab.com/pdftk-java/pdftk), whichever your package
+ manager provides.
- [GhostScript](https://www.ghostscript.com/)
- [rMAPI](https://github.com/juruen/rmapi)
-If these scripts are not available on the ``PATH`` variable, you can supply
-them with the relevant options to the script. Then, you can install
-``paper2remarkable`` from PyPI:
+Specifically:
+
+1. First install [rMAPI](https://github.com/juruen/rmapi), using
+ ```
+ $ go get -u github.com/juruen/rmapi
+ ```
+
+2. Then install system dependencies:
+ - **Arch Linux:** ``pacman -S pdftk ghostscript poppler``
+ - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace
+ ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``.
+ - **MacOS:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)).
+ - **Windows:** Installers or executables are available for
+ [qpdf](https://github.com/qpdf/qpdf/releases) (for instance the mingw
+ binary executables) and
+ [GhostScript](https://www.ghostscript.com/download/gsdnld.html).
+ Importantly, Windows support is untested and these are generic
+ instructions, so we welcome clarifications where needed. The Docker
+ instructions below may be more convenient on Windows.
+
+3. Finally, install ``paper2remarkable``:
+ ```
+ $ pip install paper2remarkable
+ ```
+ this installs the ``p2r`` command line program.
+
+**Optionally**, you can install:
+
+- [pdftoppm](https://linux.die.net/man/1/pdftoppm) (recommended for speed).
+ Usually part of a [Poppler](https://poppler.freedesktop.org/) installation.
+
+- the [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy)
+ package with Node.js support, to allow using
+ [Readability.js](https://github.com/mozilla/readability) for HTML articles.
+ This is known to improve the output of certain web articles.
+
+If any of the dependencies (such as rmapi or ghostscript) are not available on
+the ``PATH`` variable, you can supply them with the relevant options to the
+script (for instance ``p2r --rmapi /path/to/rmapi``). If you run into trouble
+with the installation, please let me know by opening an issue [on
+Github][github-url].
+
+## Usage
+
+The full help of the script is as follows. Hopefully the various command line
+flags are self-explanatory, but if you'd like more information, please open an
+issue [on GitHub][github-url].
+
+```
+usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V]
+ [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] [--pdftk PDFTK]
+ [--qpdf QPDF] [--rmapi RMAPI]
+ input [input ...]
+
+Paper2reMarkable version 0.7.3
+positional arguments:
+ input One or more URLs to a paper or paths to local PDF files
+
+optional arguments:
+ -h, --help show this help message and exit
+ -b, --blank Add a blank page after every page of the PDF
+ -c, --center Center the PDF on the page, instead of left align
+ -d, --debug debug mode, doesn't upload to reMarkable
+ -n, --no-upload don't upload to the reMarkable, save the output in current working dir
+ -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR
+ directory on reMarkable to put the file (created if missing, default: /)
+ -r, --right Right align so the menu doesn't cover it
+ -k, --no-crop Don't crop the pdf file
+ -v, --verbose be verbose
+ -V, --version Show version and exit
+ --filename FILENAME Filename to use for the file on reMarkable
+ --gs GS path to gs executable (default: gs)
+ --pdftoppm PDFTOPPM path to pdftoppm executable (default: pdftoppm)
+ --pdftk PDFTK path to pdftk executable (default: pdftk)
+ --qpdf QPDF path to qpdf executable (default: qpdf)
+ --rmapi RMAPI path to rmapi executable (default: rmapi)
```
-pip install paper2remarkable
+
+## Alfred Workflow
+
+On MacOS, you can optionally install [this Alfred workflow][workflow]. Alfred
+is [a launcher for MacOS](https://www.alfredapp.com/).
+
+Once installed, you can then use `rm` command and `rmb` (for the `--blank`
+pages to insert blank pages between pages for notes) with a URL passed. The
+global shortcut `Alt-P` will send the current selection to `p2r`. Note that by
+default `--right` is passed and `p2r` is executed in your `bash` environment.
+You can edit the Workflow in Alfred if this doesn't work for your setup.
+
+![Alfred Screenshot](https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/.github/alfred.png)
+
+[workflow]: https://github.com/GjjvdBurg/paper2remarkable/blob/master/Remarkable.alfredworkflow?raw=true
+
+## Printing
+
+Printing to `p2r` allows printing prompts to save directly to your reMarkable
+tablet, passing through `p2r` for processing.
+
+For MacOS, you can follow [the guide][print-guide] for printing with `rmapi`,
+but for the bash script, instead use this script:
+
+```
+for f in "$@"
+do
+ bash -c -l "p2r --right '$f'"
+done
```
-This installs the ``p2r`` command line program.
+[print-guide]: https://github.com/juruen/rmapi/blob/master/docs/tutorial-print-macosx.md
## Docker
-You can also use our Dockerfile to avoid installing dependencies on your
-machine. You will need `git` and `docker` installed.
+If you'd like to avoid installing the dependencies directly on your machine,
+you can use the Dockerfile. To make this work you will need ``git`` and
+``docker`` installed.
First clone this repository with `git clone` and `cd` inside of it, then build
the container:
@@ -161,8 +253,15 @@ docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r --help
# equivalent to above usage
docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r -v https://arxiv.org/abs/1811.11242
+
+# to transfer a local file in the current directory
+docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" -v "$(pwd):/home/user:r" p2r -v localfile.pdf
```
+For transferring local files using the Docker image, you may find [this helper
+function](https://github.com/GjjvdBurg/paper2remarkable/issues/34#issuecomment-610852258)
+useful.
+
You can also create an [alias](http://tldp.org/LDP/abs/html/aliases.html) in
your ``~/.bashrc`` file to abstract away the Docker commands:
@@ -178,5 +277,7 @@ Then you can use ``paper2remarkable`` from the command line as ``p2r``!
License: MIT
-If you find a problem or want to suggest a feature, please let us know! You're
-helping to make this project better!
+If you find a problem or want to suggest a feature, please open an issue [on
+Github][github-url]. You're helping to make this project better for everyone!
+
+[github-url]: https://github.com/GjjvdBurg/paper2remarkable
diff --git a/Remarkable.alfredworkflow b/Remarkable.alfredworkflow
new file mode 100644
index 0000000..6ad331e
--- /dev/null
+++ b/Remarkable.alfredworkflow
Binary files differ
diff --git a/make_release.py b/make_release.py
index a19b5fd..f3bc9f2 100644
--- a/make_release.py
+++ b/make_release.py
@@ -14,6 +14,8 @@ Date: 2019-07-23
import colorama
import os
+import sys
+import tempfile
def colored(msg, color=None, style=None):
@@ -52,6 +54,13 @@ def get_package_name():
return nameline.split("=")[-1].strip().strip('"')
+def get_package_version(pkgname):
+ ctx = {}
+ with open(f"{pkgname.lower()}/__version__.py", "r") as fp:
+ exec(fp.read(), ctx)
+ return ctx["__version__"]
+
+
class Step:
def pre(self, context):
pass
@@ -96,6 +105,12 @@ class UpdateChangelog(Step):
self.print_run("vi CHANGELOG.md")
+class UpdateReadme(Step):
+ def action(self, context):
+ self.instruct(f"Update readme if necessary")
+ self.print_run("vi README.md")
+
+
class RunTests(Step):
def action(self, context):
self.instruct("Run the unit tests")
@@ -105,7 +120,7 @@ class RunTests(Step):
class BumpVersionPackage(Step):
def action(self, context):
self.instruct(f"Update __version__.py with new version")
- self.print_run(f"vi {context['pkgname']}/__version__.py")
+ self.do_cmd(f"vi {context['pkgname']}/__version__.py")
def post(self, context):
wait_for_enter()
@@ -113,10 +128,7 @@ class BumpVersionPackage(Step):
def _get_version(self, context):
# Get the version from the version file
- about = {}
- with open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp:
- exec(fp.read(), about)
- return about["__version__"]
+ return get_package_version(context["pkgname"])
class MakeClean(Step):
@@ -143,15 +155,15 @@ class PushToTestPyPI(Step):
class InstallFromTestPyPI(Step):
def action(self, context):
- self.print_run("cd /tmp/")
- self.print_cmd("rm -rf ./venv")
- self.print_cmd("virtualenv ./venv")
- self.print_cmd("cd ./venv")
- self.print_cmd("source bin/activate")
- self.print_cmd(
- "pip install --index-url https://test.pypi.org/simple/ "
- + f"--extra-index-url https://pypi.org/simple {context['pkgname']}=={context['version']}"
+ tmpvenv = tempfile.mkdtemp(prefix="p2r_venv_")
+ self.do_cmd(
+ f"python -m venv {tmpvenv} && source {tmpvenv}/bin/activate && "
+ "pip install --no-cache-dir --index-url "
+ "https://test.pypi.org/simple/ "
+ "--extra-index-url https://pypi.org/simple "
+ f"{context['pkgname']}=={context['version']}"
)
+ context["tmpvenv"] = tmpvenv
class TestPackage(Step):
@@ -159,13 +171,12 @@ class TestPackage(Step):
self.instruct(
f"Ensure that the following command gives version {context['version']}"
)
- self.print_run(f"p2r -h")
+ self.do_cmd(f"source {context['tmpvenv']}/bin/activate && p2r -V")
-class DeactivateVenv(Step):
+class RemoveVenv(Step):
def action(self, context):
- self.print_run("deactivate")
- self.instruct("Go back to the project directory")
+ self.do_cmd(f"rm -rf {context['tmpvenv']}")
class GitTagVersion(Step):
@@ -210,32 +221,39 @@ class WaitForRTD(Step):
)
-def main():
+def main(target=None):
colorama.init()
procedure = [
- GitToMaster(),
- GitAdd(),
- PushToGitHub(),
- BumpVersionPackage(),
- UpdateChangelog(),
- MakeClean(),
- RunTests(),
- MakeDist(),
- PushToTestPyPI(),
- InstallFromTestPyPI(),
- TestPackage(),
- DeactivateVenv(),
- GitAdd(),
- PushToPyPI(),
- GitTagVersion(),
- PushToGitHub(),
+ ("gittomaster", GitToMaster()),
+ ("gitadd1", GitAdd()),
+ ("push1", PushToGitHub()),
+ ("bumpversion", BumpVersionPackage()),
+ ("changelog", UpdateChangelog()),
+ ("readme", UpdateReadme()),
+ ("clean", MakeClean()),
+ ("tests", RunTests()),
+ ("dist", MakeDist()),
+ ("testpypi", PushToTestPyPI()),
+ ("install", InstallFromTestPyPI()),
+ ("testpkg", TestPackage()),
+ ("remove_venv", RemoveVenv()),
+ ("gitadd2", GitAdd()),
+ ("pypi", PushToPyPI()),
+ ("tag", GitTagVersion()),
+ ("push2", PushToGitHub()),
]
context = {}
context["pkgname"] = get_package_name()
- for step in procedure:
+ context["version"] = get_package_version(context["pkgname"])
+ skip = True if target else False
+ for name, step in procedure:
+ if not name == target and skip:
+ continue
+ skip = False
step.run(context)
cprint("\nDone!", color="yellow", style="bright")
if __name__ == "__main__":
- main()
+ target = sys.argv[1] if len(sys.argv) > 1 else None
+ main(target=target)
diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py
index 6540db2..5c0adff 100644
--- a/paper2remarkable/__version__.py
+++ b/paper2remarkable/__version__.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-VERSION = (0, 4, 6)
+VERSION = (0, 7, 4)
__version__ = ".".join(map(str, VERSION))
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
index d1a94d8..623d29f 100644
--- a/paper2remarkable/crop.py
+++ b/paper2remarkable/crop.py
@@ -9,9 +9,12 @@ Copyright: 2019, G.J.J. van den Burg
"""
import PyPDF2
+import io
import os
-import subprocess
import pdfplumber
+import subprocess
+
+from PyPDF2.generic import RectangleObject
from .log import Logger
@@ -21,17 +24,54 @@ RM_HEIGHT = 1872
logger = Logger()
+def find_offset_byte_line(line):
+ """Find index of first nonzero bit in a line of bytes
+
+ The given line is a string of bytes, each representing 8 pixels. This code
+ finds the index of the first bit that is not zero. Used when finding the
+ cropbox with pdftoppm.
+ """
+ off = 0
+ for c in line:
+ if c == 0:
+ off += 8
+ else:
+ k = 0
+ while c > 0:
+ k += 1
+ c >>= 1
+ off += k
+ break
+ return off
+
+
+def check_pdftoppm(pth):
+ """Check that we can run the provided pdftoppm executable"""
+ try:
+ subprocess.check_output([pth, "-v"], stderr=subprocess.DEVNULL)
+ except (subprocess.CalledProcessError, FileNotFoundError, PermissionError):
+ logger.info("pdftoppm not found, using pdfplumber instead (slower)")
+ return False
+ return True
+
+
class Cropper(object):
def __init__(
- self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+ self,
+ input_file=None,
+ output_file=None,
+ pdftoppm_path="pdftoppm",
):
if not input_file is None:
self.input_file = os.path.abspath(input_file)
self.reader = PyPDF2.PdfFileReader(self.input_file)
if not output_file is None:
self.output_file = os.path.abspath(output_file)
- self.pdfcrop_path = pdfcrop_path
+ if pdftoppm_path and not check_pdftoppm(pdftoppm_path):
+ pdftoppm_path = None
+
+ self.pdftoppm_path = pdftoppm_path
self.writer = PyPDF2.PdfFileWriter()
def crop(self, margins=1):
@@ -40,6 +80,9 @@ class Cropper(object):
def center(self, padding=15):
return self.process_file(self.center_page, padding=padding)
+ def right(self, padding=15):
+ return self.process_file(self.right_page, padding=padding)
+
def process_file(self, page_func, *args, **kwargs):
n = self.reader.getNumPages()
for page_idx in range(n):
@@ -54,13 +97,18 @@ class Cropper(object):
logger.info("Processing pages ... (%i/%i)" % (n, n))
return 0
+ def crop_page(self, page_idx, margins):
+ return self.process_page(page_idx, self.get_bbox, margins=margins)
+
def center_page(self, page_idx, padding):
return self.process_page(
page_idx, self.get_center_bbox, padding=padding
)
- def crop_page(self, page_idx, margins):
- return self.process_page(page_idx, self.get_bbox, margins=margins)
+ def right_page(self, page_idx, padding):
+ return self.process_page(
+ page_idx, self.get_right_bbox, padding=padding
+ )
def export_page(self, page_idx):
"""Helper function that exports a single page given by index """
@@ -75,38 +123,23 @@ class Cropper(object):
def process_page(self, page_idx, bbox_func, *args, **kwargs):
"""Process a single page and add it to the writer """
tmpfname = self.export_page(page_idx)
- tmpfout = "./output.pdf"
bbox = bbox_func(tmpfname, *args, **kwargs)
- status = subprocess.call(
- [
- self.pdfcrop_path,
- "--bbox",
- " ".join(map(str, bbox)),
- tmpfname,
- tmpfout,
- ],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- return status
- reader = PyPDF2.PdfFileReader(tmpfout)
- page = reader.getPage(0)
- self.writer.addPage(page)
+ thepage = self.reader.getPage(page_idx)
+ thepage.cropBox = RectangleObject(bbox)
+ self.writer.addPage(thepage)
os.unlink(tmpfname)
- os.unlink(tmpfout)
return 0
- def get_bbox(self, filename, margins=1, resolution=72):
- """Get the bounding box, with optional margins
-
- if margins is integer, used for all margins, else
- margins = [left, top, right, bottom]
+ def get_raw_bbox(self, filename, resolution=72):
+ """Get the basic bounding box of a pdf file"""
+ if self.pdftoppm_path is None:
+ box = self.get_raw_bbox_pdfplumber(filename, resolution=resolution)
+ else:
+ box = self.get_raw_bbox_pdftoppm(filename, resolution=resolution)
+ return box
- We get the bounding box by finding the smallest rectangle that is
- completely surrounded by white pixels.
- """
- if isinstance(margins, int):
- margins = [margins for _ in range(4)]
+ def get_raw_bbox_pdfplumber(self, filename, resolution=72):
+ """Get the basic bounding box with pdfplumber"""
pdf = pdfplumber.open(filename)
im = pdf.pages[0].to_image(resolution=resolution)
pdf.close()
@@ -131,20 +164,99 @@ class Cropper(object):
while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
right += 1
+ return left, right, top, bottom, W, H
+
+ def get_raw_bbox_pdftoppm(self, filename, resolution=72):
+ """Get the basic bounding box using pdftoppm """
+ cmd = [
+ self.pdftoppm_path,
+ "-r",
+ str(resolution),
+ "-singlefile",
+ "-mono",
+ filename,
+ ]
+
+ im = subprocess.check_output(cmd)
+ im = io.BytesIO(im)
+
+ id_ = im.readline().rstrip(b"\n")
+ if not id_ == b"P4":
+ raise ValueError("Not in P4 format")
+ wh = im.readline().rstrip(b"\n").split(b" ")
+ width, height = int(wh[0]), int(wh[1])
+ imdata = im.read()
+
+ pad = width % 8
+ padwidth = width + pad
+ stepsize = padwidth // 8
+
+ for top in range(height):
+ if sum(imdata[top * stepsize : (top + 1) * stepsize]) > 0:
+ break
+
+ for bottom in reversed(range(height)):
+ if sum(imdata[bottom * stepsize : (bottom + 1) * stepsize]) > 0:
+ break
+
+ left = width
+ right = 0
+ for i in range(top, bottom):
+ lline = imdata[i * stepsize : (i + 1) * stepsize]
+ rline = reversed(imdata[i * stepsize : (i + 1) * stepsize])
+ l = find_offset_byte_line(lline)
+ left = min(left, l)
+ r = padwidth + pad - find_offset_byte_line(rline)
+ right = max(right, r)
+
+ top += 1
+ left += 1
+ right = width - right + 2
+ bottom = height - bottom - 2
+
+ return left, right, top, bottom, width, height
+
+ def get_bbox(self, filename, margins=1, resolution=72):
+ """Get the bounding box, with optional margins
+
+ if margins is integer, used for all margins, else
+ margins = [left, top, right, bottom]
+
+ We get the bounding box by finding the smallest rectangle that is
+ completely surrounded by white pixels.
+ """
+ if isinstance(margins, int):
+ margins = [margins for _ in range(4)]
+
+ left, right, top, bottom, W, H = self.get_raw_bbox(
+ filename, resolution=resolution
+ )
+
left -= margins[0]
+ left = max(left, 0)
top -= margins[1]
+ top = max(top, 0)
right -= margins[2]
bottom -= margins[3]
# This is the bounding box in PIL format: (0, 0) top left
x0, y0, x1, y1 = left, top, W - right, H - bottom
+ # The remarkable changes the orientation of a portrait page if the
+ # width is greater than the height. To prevent this, we pad the height
+ # with extra whitespace. This should only occur if the original
+ # orientation of the page would be changed by cropping.
+ w, h = x1 - x0, y1 - y0
+ if H > W and w > h:
+ y1 = y0 + w + 10
+ h = y1 - y0
+
# Get the bbox in Ghostscript format: (0, 0) bottom left
a0, b0, a1, b1 = x0, H - y1, x1, H - y0
return [a0, b0, a1, b1]
def get_center_bbox(self, filename, padding=15):
- """Compute a bounding box that will center the page file on the
+ """Compute a bounding box that will center the page file on the
reMarkable
"""
bbox = self.get_bbox(filename, margins=0)
@@ -159,7 +271,7 @@ class Cropper(object):
# if the document is wider than the remarkable, we add top-padding to
# center it, otherwise we add left-padding
- x, y = 0, 0
+ x = y = 0
if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
else:
@@ -167,3 +279,39 @@ class Cropper(object):
margins = [padding + x, padding + y, padding, padding]
return self.get_bbox(filename, margins=margins)
+
+ def get_right_bbox(self, filename, padding=15):
+ """Get the bounding box that ensures the menu doesn't hide the text"""
+
+ bbox = self.get_bbox(filename, margins=0)
+
+ h = bbox[3] - bbox[1]
+ w = bbox[2] - bbox[0]
+
+ # Note, the menu width is about 12mm and the entire screen is about
+ # 156mm. This informs the width of the left padding we'll add.
+ menu_width = 12 / 156 * RM_WIDTH
+
+ H = RM_HEIGHT
+ W = RM_WIDTH
+
+ # TODO: This math is approximate. The goal is to get the page centered
+ # in the remaining space after taking the menu width into account,
+ # while also providing equal padding at the top and bottom. This seems
+ # to give too much padding on the left for some pages, but I'm not sure
+ # why. Pull requests welcome!
+ rho_rm = H / (W - menu_width)
+ rho_page = (h + 2 * padding) / (w + 2 * padding)
+ x = y = 0
+ if rho_rm < rho_page:
+ x = -w - 2 * padding + (h + 2 * padding) * (W - menu_width) / H
+ elif rho_rm > rho_page:
+ y = -h - 2 * padding + H * (w + 2 * padding) / (W - menu_width)
+
+ margins = [
+ menu_width + x + padding,
+ padding + y,
+ padding,
+ padding,
+ ]
+ return self.get_bbox(filename, margins=margins)
diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py
index 86f39b4..b433ad4 100644
--- a/paper2remarkable/exceptions.py
+++ b/paper2remarkable/exceptions.py
@@ -6,10 +6,9 @@
from . import GITHUB_URL
-from subprocess import CalledProcessError
-
-GH_MSG = "\n\nIf you think this might be a bug, please raise an issue on GitHub at: {url}".format(
- url=GITHUB_URL
+GH_MSG = (
+ "\n\nIf you think this might be a bug, please raise an issue on "
+ "GitHub at:\n{url}\n".format(url=GITHUB_URL)
)
@@ -48,13 +47,17 @@ class URLResolutionError(Error):
class FilenameMissingError(Error):
"""Exception raised for providers that need a filename to be provided"""
- def __init__(self, provider):
+ def __init__(self, provider, url, reason=None):
self.provider = provider
+ self.url = url
+ self.reason = reason
def __str__(self):
- msg = "ERROR: Filename must be given with the {provider} provider (hint: use --filename)".format(
- provider=self.provider
+ msg = "ERROR: Couldn't determine a filename from {url} for provider {provider}".format(
+ provider=self.provider, url=self.url
)
+ if self.reason:
+ msg += "\nReason: {reason}".format(reason=self.reason)
msg += GH_MSG
return msg
@@ -86,16 +89,53 @@ class RemarkableError(Error):
return msg
-class _CalledProcessError(CalledProcessError):
- """Exception raised when subprocesses fail.
+class _CalledProcessError(Error):
+ """Exception raised when subprocesses fail. """
- We subclass the CalledProcessError so we can add our custom error message.
- """
+ def __init__(self, message):
+ self.message = message
+
+ def __str__(self):
+ msg = "ERROR: {message}".format(message=self.message)
+ msg += GH_MSG
+ return msg
+
+
+class NoPDFToolError(Error):
+    """Exception raised when neither pdftk nor qpdf is found."""
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
+ def __init__(self):
+ pass
def __str__(self):
- parent = super().__str__()
- msg = parent + GH_MSG
+ msg = (
+ "ERROR: Neither pdftk or qpdf could be found. Install "
+ "either of these or ensure that they can be found using "
+ "the --pdftk or --qpdf options."
+ )
+ msg += GH_MSG
+ return msg
+
+
+class UnidentifiedSourceError(Error):
+ """Exception raised when the input is neither a local file nor a url """
+
+ def __str__(self):
+ msg = (
+ "ERROR: Couldn't figure out what source you mean. If it's a "
+ "local file, please make sure it exists."
+ )
+ msg += GH_MSG
+ return msg
+
+
+class InvalidURLError(Error):
+ """Exception raised when no provider can handle a url source """
+
+ def __str__(self):
+ msg = (
+ "ERROR: Input URL is not valid, no provider can handle "
+ "this source."
+ )
+ msg += GH_MSG
return msg
diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py
index bae1cbf..fb9d8a3 100644
--- a/paper2remarkable/log.py
+++ b/paper2remarkable/log.py
@@ -38,19 +38,26 @@ class Logger(metaclass=Singleton):
def disable(self):
self.enabled = False
- def _log(self, msg, mode):
+ def _log(self, msg, mode, end="\n", add_prefix=True):
if not self.enabled:
return
if not mode in ("info", "warn"):
raise ValueError("Unknown logging mode: %s" % mode)
file = sys.stdout if mode == "info" else sys.stderr
- now = datetime.datetime.now()
- nowstr = now.strftime("%Y-%m-%d %H:%M:%S")
- print("%s - %s - %s" % (nowstr, mode.upper(), msg), file=file)
+ if add_prefix:
+ now = datetime.datetime.now()
+ nowstr = now.strftime("%Y-%m-%d %H:%M:%S")
+ prefix = "%s - %s - " % (nowstr, mode.upper())
+ else:
+ prefix = ""
+ print("%s%s" % (prefix, msg), end=end, file=file)
file.flush()
- def info(self, msg):
- self._log(msg, "info")
+ def info(self, msg, end="\n"):
+ self._log(msg, "info", end=end)
- def warning(self, msg):
- self._log(msg, "warn")
+ def warning(self, msg, end="\n"):
+ self._log(msg, "warn", end=end)
+
+ def append(self, msg, mode, end="\n"):
+ self._log(msg, mode, end=end, add_prefix=False)
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
index c660452..c365920 100644
--- a/paper2remarkable/pdf_ops.py
+++ b/paper2remarkable/pdf_ops.py
@@ -19,49 +19,28 @@ from .log import Logger
logger = Logger()
-def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
- """Crop the pdf file using Cropper
- """
- logger.info("Cropping pdf file")
- cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
-
- cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path)
- status = cropper.crop(margins=15)
-
- if not status == 0:
- logger.warning("Failed to crop the pdf file at: %s" % filepath)
+def prepare_pdf(filepath, operation, pdftoppm_path="pdftoppm"):
+    """Prepare pdf by cropping, centering, or right-aligning the file"""
+ logger.info("Preparing PDF using %s operation" % operation)
+ prepared_file = os.path.splitext(filepath)[0] + "-prep.pdf"
+ cropper = Cropper(filepath, prepared_file, pdftoppm_path=pdftoppm_path)
+ if operation == "crop":
+ status = cropper.crop(margins=15)
+ elif operation == "center":
+ status = cropper.center()
+ elif operation == "right":
+ status = cropper.right()
+ else:
+ logger.warning("Unknown operation: %s" % operation)
return filepath
- if not os.path.exists(cropped_file):
- logger.warning(
- "Can't find cropped file '%s' where expected." % cropped_file
- )
+ if not status == 0 or not os.path.exists(prepared_file):
+ logger.warning("PDF prepare operation failed")
return filepath
- return cropped_file
-
-
-def center_pdf(filepath, pdfcrop_path="pdfcrop"):
- """Center the pdf file on the reMarkable
- """
- logger.info("Centering pdf file")
- centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
-
- cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path)
- status = cropper.center()
-
- if not status == 0:
- logger.warning("Failed to center the pdf file at: %s" % filepath)
- return filepath
- if not os.path.exists(centered_file):
- logger.warning(
- "Can't find centered file '%s' where expected." % centered_file
- )
- return filepath
- return centered_file
+ return prepared_file
def blank_pdf(filepath):
- """Add blank pages to PDF
- """
+ """Add blank pages to PDF"""
logger.info("Adding blank pages")
input_pdf = PyPDF2.PdfFileReader(filepath)
output_pdf = PyPDF2.PdfFileWriter()
@@ -76,9 +55,9 @@ def blank_pdf(filepath):
def shrink_pdf(filepath, gs_path="gs"):
- """Shrink the PDF file size using Ghostscript
- """
- logger.info("Shrinking pdf file")
+ """Shrink the PDF file size using Ghostscript"""
+ logger.info("Shrinking pdf file ...")
+ size_before = os.path.getsize(filepath)
output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
status = subprocess.call(
[
@@ -98,4 +77,8 @@ def shrink_pdf(filepath, gs_path="gs"):
if not status == 0:
logger.warning("Failed to shrink the pdf file")
return filepath
+ size_after = os.path.getsize(output_file)
+ if size_after > size_before:
+ logger.info("Shrinking has no effect for this file, using original.")
+ return filepath
return output_file
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
index 53fda1f..935b889 100644
--- a/paper2remarkable/providers/__init__.py
+++ b/paper2remarkable/providers/__init__.py
@@ -3,26 +3,38 @@
from .acm import ACM
from .arxiv import Arxiv
from .citeseerx import CiteSeerX
+from .cvf import CVF
+from .html import HTML
+from .jmlr import JMLR
from .local import LocalFile
+from .nber import NBER
from .neurips import NeurIPS
from .openreview import OpenReview
from .pdf_url import PdfUrl
from .pmlr import PMLR
from .pubmed import PubMed
+from .sagepub import SagePub
+from .semantic_scholar import SemanticScholar
from .springer import Springer
from .tandfonline import TandFOnline
-# NOTE: Order matters here, PdfUrl should be last
+# NOTE: Order matters here, PdfUrl and HTML should be last
providers = [
ACM,
Arxiv,
CiteSeerX,
+ CVF,
+ JMLR,
+ NBER,
NeurIPS,
OpenReview,
PMLR,
PubMed,
+ SagePub,
Springer,
+ SemanticScholar,
TandFOnline,
LocalFile,
PdfUrl,
+ HTML,
]
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
index 596af98..74ab9e6 100644
--- a/paper2remarkable/providers/_base.py
+++ b/paper2remarkable/providers/_base.py
@@ -11,18 +11,21 @@ Copyright: 2019, G.J.J. van den Burg
import abc
import os
import shutil
+import subprocess
import tempfile
import time
-from ._info import Informer
-from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
+from ..exceptions import _CalledProcessError
+from ..log import Logger
+from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf
from ..utils import (
assert_file_is_pdf,
+ check_pdftool,
download_url,
- upload_to_remarkable,
follow_redirects,
+ upload_to_remarkable,
)
-from ..log import Logger
+from ._info import Informer
logger = Logger()
@@ -36,11 +39,14 @@ class Provider(metaclass=abc.ABCMeta):
upload=True,
debug=False,
center=False,
+ right=False,
blank=False,
+ no_crop=False,
remarkable_dir="/",
rmapi_path="rmapi",
- pdfcrop_path="pdfcrop",
+ pdftoppm_path="pdftoppm",
pdftk_path="pdftk",
+ qpdf_path="qpdf",
gs_path="gs",
cookiejar=None,
):
@@ -48,12 +54,15 @@ class Provider(metaclass=abc.ABCMeta):
self.debug = debug
self.remarkable_dir = remarkable_dir
self.rmapi_path = rmapi_path
- self.pdfcrop_path = pdfcrop_path
+ self.pdftoppm_path = pdftoppm_path
self.pdftk_path = pdftk_path
+ self.qpdf_path = qpdf_path
self.gs_path = gs_path
self.informer = Informer()
self.cookiejar = cookiejar
+ self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path)
+
# wait time to not hit the server too frequently
self.server_delay = 0
@@ -62,9 +71,13 @@ class Provider(metaclass=abc.ABCMeta):
logger.disable()
# Define the operations to run on the pdf. Providers can add others.
- self.operations = [("crop", self.crop_pdf)]
+ self.operations = [("rewrite", self.rewrite_pdf)]
if center:
self.operations.append(("center", self.center_pdf))
+ elif right:
+ self.operations.append(("right", self.right_pdf))
+ elif not no_crop:
+ self.operations.append(("crop", self.crop_pdf))
if blank:
self.operations.append(("blank", blank_pdf))
@@ -83,10 +96,15 @@ class Provider(metaclass=abc.ABCMeta):
# Wrappers for pdf operations that have additional arguments
def crop_pdf(self, filepath):
- return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+ return prepare_pdf(filepath, "crop", pdftoppm_path=self.pdftoppm_path)
def center_pdf(self, filepath):
- return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+ return prepare_pdf(
+ filepath, "center", pdftoppm_path=self.pdftoppm_path
+ )
+
+ def right_pdf(self, filepath):
+ return prepare_pdf(filepath, "right", pdftoppm_path=self.pdftoppm_path)
def shrink_pdf(self, filepath):
return shrink_pdf(filepath, gs_path=self.gs_path)
@@ -96,6 +114,78 @@ class Provider(metaclass=abc.ABCMeta):
# This must exist so that the LocalFile provider can overwrite it
download_url(pdf_url, filename, cookiejar=self.cookiejar)
+ def compress_pdf(self, in_pdf, out_pdf):
+        """ Compress a pdf file; raises _CalledProcessError if the tool fails """
+ if self.pdftool == "pdftk":
+ status = subprocess.call(
+ [self.pdftk_path, in_pdf, "output", out_pdf, "compress"]
+ )
+ elif self.pdftool == "qpdf":
+ status = subprocess.call(
+ [
+ self.qpdf_path,
+ "--stream-data=compress",
+ in_pdf,
+ out_pdf,
+ ],
+ stderr=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ raise _CalledProcessError(
+ "%s failed to compress the PDF file." % self.pdftool
+ )
+
+ def rewrite_pdf(self, in_pdf, out_pdf=None):
+ """Re-write the pdf using Ghostscript
+
+ This helps avoid issues in dearxiv due to nested pdfs.
+ """
+ if out_pdf is None:
+ out_pdf = os.path.splitext(in_pdf)[0] + "-rewrite.pdf"
+
+ status = subprocess.call(
+ [
+ self.gs_path,
+ "-sDEVICE=pdfwrite",
+ "-dQUIET",
+ "-o",
+ out_pdf,
+ in_pdf,
+ ]
+ )
+ if not status == 0:
+ raise _CalledProcessError(
+ "Failed to rewrite the pdf with GhostScript"
+ )
+ return out_pdf
+
+ def uncompress_pdf(self, in_pdf, out_pdf):
+ """ Uncompress a pdf file """
+
+ if self.pdftool == "pdftk":
+ status = subprocess.call(
+ [
+ self.pdftk_path,
+ in_pdf,
+ "output",
+ out_pdf,
+ "uncompress",
+ ]
+ )
+ elif self.pdftool == "qpdf":
+ status = subprocess.call(
+ [
+ self.qpdf_path,
+ "--stream-data=uncompress",
+ in_pdf,
+ out_pdf,
+ ]
+ )
+ if not status == 0:
+ raise _CalledProcessError(
+ "%s failed to uncompress the PDF file." % self.pdftool
+ )
+
def run(self, src, filename=None):
# follow_redirects here is needed with library use
if os.path.exists(src):
@@ -124,7 +214,7 @@ class Provider(metaclass=abc.ABCMeta):
intermediate_fname = tmp_filename
for opname, op in self.operations:
intermediate_fname = op(intermediate_fname)
- shutil.move(intermediate_fname, clean_filename)
+ shutil.copy(intermediate_fname, clean_filename)
if self.debug:
print("Paused in debug mode in dir: %s" % working_dir)
@@ -143,4 +233,5 @@ class Provider(metaclass=abc.ABCMeta):
base = os.path.splitext(target_path)[0]
target_path = base + "_.pdf"
shutil.move(clean_filename, target_path)
- return target_path
+ os.chdir(self.initial_dir)
+ return target_path
diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py
index 746c436..8cffc60 100644
--- a/paper2remarkable/providers/_info.py
+++ b/paper2remarkable/providers/_info.py
@@ -16,12 +16,12 @@ logger = Logger()
class Informer:
"""Base class for the informers.
- The "informer" class is used to retrieve the title, authors, and year of
+ The "informer" class is used to retrieve the title, authors, and year of
publication of the provided paper.
- This base class provides the main functionality, but because various
- outlets use different conventions to embed author, title, and publication
- year information, we expect that individual providers will subclass this
+ This base class provides the main functionality, but because various
+ outlets use different conventions to embed author, title, and publication
+ year information, we expect that individual providers will subclass this
class and overwrite some of the methods.
"""
@@ -35,9 +35,9 @@ class Informer:
self.year = year
def get_filename(self, abs_url):
- """ Generate nice filename using the paper information
+ """Generate nice filename using the paper information
- The provided url must be to a HTMl page where this information can be
+        The provided url must be to an HTML page where this information can be
found, not to the PDF file itself.
"""
logger.info("Generating output filename")
@@ -50,6 +50,7 @@ class Informer:
authors = self.authors[0] + "_et_al"
else:
authors = "_".join(self.authors)
+ authors = authors.replace(" ", "_")
authors = clean_string(authors)
# Clean the title and make it titlecase
@@ -76,8 +77,13 @@ class Informer:
## Title
def get_title(self, soup):
- target = soup.find_all("meta", {"name": self.meta_title_key})
- return target[0]["content"]
+ meta = soup.find_all("meta", {"name": self.meta_title_key})
+ if not meta:
+ logger.warning(
+ "Couldn't determine title information, maybe provide the desired filename using '--filename'?"
+ )
+ return ""
+ return meta[0]["content"]
## Authors
@@ -87,10 +93,13 @@ class Informer:
return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
def get_authors(self, soup):
- authors = [
- x["content"]
- for x in soup.find_all("meta", {"name": self.meta_author_key})
- ]
+ meta = soup.find_all("meta", {"name": self.meta_author_key})
+ if not meta:
+ logger.warning(
+ "Couldn't determine author information, maybe provide the desired filename using '--filename'?"
+ )
+ return ""
+ authors = [x["content"] for x in meta]
return self._format_authors(authors)
## Year
@@ -100,7 +109,8 @@ class Informer:
def get_year(self, soup):
""" Retrieve the contents of the meta_date_key field and format it """
- date = soup.find_all("meta", {"name": self.meta_date_key})[0][
- "content"
- ]
+ meta = soup.find_all("meta", {"name": self.meta_date_key})
+ if not meta:
+ return ""
+ date = meta[0]["content"]
return self._format_year(date)
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
index 913e015..6ec1796 100644
--- a/paper2remarkable/providers/arxiv.py
+++ b/paper2remarkable/providers/arxiv.py
@@ -10,20 +10,17 @@ Copyright: 2019, G.J.J. van den Burg
import os
import re
-import subprocess
from ._info import Informer
from ._base import Provider
-from ..exceptions import (
- URLResolutionError,
- _CalledProcessError as CalledProcessError,
-)
+from ..exceptions import URLResolutionError
from ..log import Logger
logger = Logger()
-DEARXIV_TEXT_REGEX = (
- b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}"
+DEARXIV_TEXT_REGEX = b"ar(x|X)iv:(\d{4}\.|[\w\-]+\/)\d+v\d+(\s+\[[\w\-]+\.[\w\-]+\])?\s+\d{1,2}\s\w{3}\s\d{4}"
+DEARXIV_URI_REGEX = (
+ b"https?://ar(x|X)iv\.org\/abs\/([\w\-]+\/\d+|\d{4}\.\d{4,5})v\d+"
)
@@ -36,8 +33,8 @@ class Arxiv(Provider):
re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
- re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?"
- re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf"
+ re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?"
+ re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -48,6 +45,8 @@ class Arxiv(Provider):
def get_abs_pdf_urls(self, url):
"""Get the pdf and abs url from any given arXiv url """
+ if "?" in url:
+ url = url[: url.index("?")]
if re.match(self.re_abs_1, url) or re.match(self.re_abs_2, url):
abs_url = url
pdf_url = url.replace("abs", "pdf") + ".pdf"
@@ -69,44 +68,139 @@ class Arxiv(Provider):
def dearxiv(self, input_file):
"""Remove the arXiv timestamp from a pdf"""
- logger.info("Removing arXiv timestamp")
+ logger.info("Removing arXiv timestamp ... ", end="")
basename = os.path.splitext(input_file)[0]
- uncompress_file = basename + "_uncompress.pdf"
- status = subprocess.call(
- [
- self.pdftk_path,
- input_file,
- "output",
- uncompress_file,
- "uncompress",
- ]
- )
- if not status == 0:
- raise CalledProcessError(
- "pdftk failed to uncompress the PDF file."
- )
-
- with open(uncompress_file, "rb") as fid:
- data = fid.read()
- # Remove the text element
- data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data)
- # Remove the URL element
- data = re.sub(
- b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
- b"",
- data,
- )
+ recoded_file = basename + "_rewrite.pdf"
+ self.rewrite_pdf(input_file, recoded_file)
+
+ uncompress_file = basename + "_uncompress.pdf"
+ self.uncompress_pdf(recoded_file, uncompress_file)
+
+ new_data = []
+ current_obj = []
+ replaced_arXiv = False
+ char_count = skip_n = startxref = 0
+ xref = {}
+
+ with open(uncompress_file, "rb") as fp:
+ for line in fp:
+ if skip_n:
+ # Skip a line
+ skip_n -= 1
+ continue
+
+ if line.endswith(b" obj\n") or line.endswith(b" obj \n"):
+ # Start a new object. Add it to the current object and
+ # record its position for the xref table.
+ current_obj.append(line)
+ objid = int(line.split(b" ")[0])
+ xref[objid] = char_count
+ elif current_obj and (
+ line.startswith(b"endobj")
+ and not line.startswith(b"endobj xref")
+ ):
+ # End the current object. If needed, replace the arXiv
+ # stamp in the block (done only once). Reset current
+ # object.
+ current_obj.append(line)
+ block = b"".join(current_obj)
+ # remove the text
+ block, n_subs1 = re.subn(
+ b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj",
+ b"()Tj",
+ block,
+ )
+ # remove the url (type 1)
+ block, n_subs2 = re.subn(
+ b"<<\n\/URI \("
+ + DEARXIV_URI_REGEX
+ + b"\)\n\/S /URI\n>>\n",
+ b"",
+ block,
+ )
+ # remove the url (type 2, i.e. Jackson arXiv 0309285v2)
+ block, n_subs3 = re.subn(
+ b"<<\n\/S \/URI\n"
+ + b"/URI \("
+ + DEARXIV_URI_REGEX
+ + b"\)\n>>\n",
+ b"",
+ block,
+ )
+
+ if n_subs1 or n_subs2:
+ # fix the length of the object stream
+ block = fix_stream_length(block)
+ replaced_arXiv = True
+ new_data.append(block)
+ char_count += len(block)
+ current_obj = []
+ elif line in [b"xref\n", b"endobj xref\n"]:
+ if b"endobj" in line and current_obj:
+ current_obj.append(b"endobj\n")
+ block = b"".join(current_obj)
+ new_data.append(block)
+ char_count += len(block)
+ current_obj = []
+ line = b"xref\n"
+ # We found the xref table, record its position and write it
+ # out using our updated indices.
+ startxref = sum(map(len, new_data))
+ new_data.append(line)
+ new_data.append(b"0 %i\n" % (len(xref) + 1))
+ new_data.append(b"0000000000 65535 f \n")
+ for objid in sorted(xref):
+ new_data.append(b"%010d 00000 n \n" % xref[objid])
+
+ # skip the appropriate number of lines
+ skip_n = len(xref) + 2
+ elif current_obj:
+ # If we're recording an object, simply add the line to it
+ current_obj.append(line)
+ elif line == b"startxref\n":
+ # Write out our recorded startxref position, skip the old
+ # position.
+ new_data.append(b"startxref\n%i\n" % startxref)
+ skip_n = 1
+ else:
+ # Anything else passes through
+ new_data.append(line)
+ char_count += len(line)
removed_file = basename + "_removed.pdf"
- with open(removed_file, "wb") as oid:
- oid.write(data)
+ with open(removed_file, "wb") as fp:
+ fp.write(b"".join(new_data))
output_file = basename + "_dearxiv.pdf"
- status = subprocess.call(
- [self.pdftk_path, removed_file, "output", output_file, "compress"]
- )
- if not status == 0:
- raise CalledProcessError("pdftk failed to compress the PDF file.")
+ self.compress_pdf(removed_file, output_file)
+
+ logger.append("success" if replaced_arXiv else "none found", "info")
return output_file
+
+
+def fix_stream_length(block):
+ # This fixes the stream length of a block, which is needed after we have
+ # removed the arXiv stamp.
+ count = 0
+ block = block.split(b"\n")
+ do_count = False
+
+ for line in block:
+ if line.strip(b" ") in [b"stream", b"endstream"]:
+ do_count = not do_count
+ continue
+
+ if do_count:
+ # +1 for the newline character
+ count += len(line) + 1
+
+ new_block = []
+ for line in block:
+ if b" /Length " in line:
+ new_block.append(b"<< /Length %i >>" % count)
+ else:
+ new_block.append(line)
+
+ return b"\n".join(new_block)
diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py
index e483f28..e819c30 100644
--- a/paper2remarkable/providers/citeseerx.py
+++ b/paper2remarkable/providers/citeseerx.py
@@ -49,10 +49,6 @@ class CiteSeerX(Provider):
)
time.sleep(30)
- # NOTE: The delay should only be hit twice when p2r is used as a
- # library (e.g. during testing). Otherwise the ``server_delay`` is
- # never reached in run().
-
def _get_doi(self, url):
m = re.match(self.re_abs, url) or re.match(self.re_pdf, url)
if m:
diff --git a/paper2remarkable/providers/cvf.py b/paper2remarkable/providers/cvf.py
new file mode 100644
index 0000000..76ca9c0
--- /dev/null
+++ b/paper2remarkable/providers/cvf.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for CVF
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+
+from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
+
+
+class CVFInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+
+class CVF(Provider):
+
+ re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$"
+ re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = CVFInformer()
+
+ def get_abs_pdf_urls(self, url):
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = url[: -len(".html")]
+ pdf_url += ".pdf"
+ pdf_url = pdf_url.replace("html", "papers")
+ elif re.match(self.re_pdf, url):
+ pdf_url = url
+ abs_url = url.replace("papers", "html").replace(".pdf", ".html")
+ else:
+ raise URLResolutionError("CVF", url)
+ return abs_url, pdf_url
+
+ def validate(src):
+ m = re.match(CVF.re_abs, src) or re.match(CVF.re_pdf, src)
+ return not m is None
diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py
new file mode 100644
index 0000000..e050ea3
--- /dev/null
+++ b/paper2remarkable/providers/html.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for HTML documents
+
+This provider is a little bit special, in that it isn't simply pulling an
+academic paper from a site, but instead aims to pull a HTML article.
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import html2text
+import markdown
+import readability
+import titlecase
+import unidecode
+import urllib
+import weasyprint
+import weasyprint.fonts
+
+from ._base import Provider
+from ._info import Informer
+
+from ..utils import (
+ clean_string,
+ get_page_with_retry,
+ get_content_type_with_retry,
+)
+from ..log import Logger
+
+logger = Logger()
+
+CSS = """
+@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap');
+@page { size: 702px 936px; margin: 1in; }
+a { color: black; }
+img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
+p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
+h1,h2,h3 { font-family: 'Noto Serif'; }
+h1 { font-size: 26px; }
+h2 { font-size: 18px; }
+h3 { font-size: 14px; }
+blockquote { font-style: italic; }
+pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; }
+code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; }
+"""
+
+
+def url_fetcher(url):
+ if url.startswith("//"):
+ url = "https:" + url
+ elif url.startswith("file:///"):
+ url = "https:" + url[len("file:/") :]
+ return weasyprint.default_url_fetcher(url)
+
+
+def make_readable(request_html):
+ """Use an extraction method to get the main article html
+
+ This function checks if ReadabiliPy is installed with NodeJS support, as
+ that generally yields better results. If that is not available, it falls
+ back on readability.
+ """
+
+ have_readabilipy_js = False
+ try:
+ import readabilipy
+
+ have_readabilipy_js = readabilipy.simple_json.have_node()
+ except ImportError:
+ pass
+
+ if have_readabilipy_js:
+ logger.info("Converting HTML using Readability.js")
+ article = readabilipy.simple_json_from_html_string(
+ request_html, use_readability=True
+ )
+ title = article["title"]
+ raw_html = article["content"]
+ else:
+ logger.info("Converting HTML using readability")
+ doc = readability.Document(request_html)
+ title = doc.title()
+ raw_html = doc.summary(html_partial=True)
+ return title, raw_html
+
+
+class ImgProcessor(markdown.treeprocessors.Treeprocessor):
+ def __init__(self, base_url, *args, **kwargs):
+ self._base_url = base_url
+ super().__init__(*args, **kwargs)
+
+ def run(self, root):
+ """ Ensure all img src urls are absolute """
+ for img in root.iter("img"):
+ img.attrib["src"] = urllib.parse.urljoin(
+ self._base_url, img.attrib["src"]
+ )
+ img.attrib["src"] = img.attrib["src"].rstrip("/")
+
+
+class HTMLInformer(Informer):
+ def __init__(self):
+ super().__init__()
+ self._cached_title = None
+ self._cached_article = None
+
+ def get_filename(self, abs_url):
+ request_html = get_page_with_retry(abs_url, return_text=True)
+ title, article = make_readable(request_html)
+
+ self._cached_title = title
+ self._cached_article = article
+
+ # Clean the title and make it titlecase
+ title = clean_string(title)
+ title = titlecase.titlecase(title)
+ title = title.replace(" ", "_")
+ title = clean_string(title)
+ name = title.strip("_") + ".pdf"
+ name = unidecode.unidecode(name)
+ logger.info("Created filename: %s" % name)
+ return name
+
+
+class HTML(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = HTMLInformer()
+
+ def get_abs_pdf_urls(self, url):
+ return url, url
+
+ def retrieve_pdf(self, pdf_url, filename):
+        """Turn the HTML article into a clean pdf file
+
+ This function takes the following steps:
+
+ 1. Pull the HTML page using requests, if not done in Informer
+ 2. Extract the article part of the page using readability/readabiliPy
+ 3. Convert the article HTML to markdown using html2text
+        4. Convert the markdown back to HTML (done to sanitize the HTML)
+        5. Convert the HTML to PDF, pulling in images where needed
+        6. Save the PDF to the specified filename.
+ """
+ if self.informer._cached_title and self.informer._cached_article:
+ title = self.informer._cached_title
+ article = self.informer._cached_article
+ else:
+ request_html = get_page_with_retry(pdf_url, return_text=True)
+ title, article = make_readable(request_html)
+
+ h2t = html2text.HTML2Text()
+ h2t.wrap_links = False
+ text = h2t.handle(article)
+
+ # Add the title back to the document
+ article = "# {title}\n\n{text}".format(title=title, text=text)
+
+ # Convert to html, fixing relative image urls.
+ md = markdown.Markdown()
+ md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
+ html_article = md.convert(article)
+
+ if self.debug:
+ with open("./paper.html", "w") as fp:
+ fp.write(html_article)
+
+ font_config = weasyprint.fonts.FontConfiguration()
+ html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
+ css = weasyprint.CSS(string=CSS, font_config=font_config)
+
+ html.write_pdf(filename, stylesheets=[css], font_config=font_config)
+
+ def validate(src):
+ # first check if it is a valid url
+ parsed = urllib.parse.urlparse(src)
+ if not all([parsed.scheme, parsed.netloc, parsed.path]):
+ return False
+ # next, get the header and check the content type
+ ct = get_content_type_with_retry(src)
+ if ct is None:
+ return False
+ return ct.startswith("text/html")
diff --git a/paper2remarkable/providers/jmlr.py b/paper2remarkable/providers/jmlr.py
new file mode 100644
index 0000000..8b121cb
--- /dev/null
+++ b/paper2remarkable/providers/jmlr.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for JMLR
+
+Journal of Machine Learning Research
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class JMLRInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+ def _format_authors(self, soup_authors):
+ have_comma = any(("," in auth for auth in soup_authors))
+ if have_comma:
+ return super()._format_authors(soup_authors, sep=",", idx=0)
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class JMLR(Provider):
+
+ re_abs_1 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3}).html$"
+ re_pdf_1 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3})/(?P=pid).pdf$"
+
+ re_abs_2 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\w+\d{2}\w).html$"
+ re_pdf_2 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\w+\d{2}\w)/(?P=pid).pdf$"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = JMLRInformer()
+
+ def get_abs_pdf_urls(self, url):
+ abs_url = pdf_url = None
+ abs_fmt = "http://jmlr.org/papers/v{vol}/{pid}.html"
+ pdf_fmt = "http://jmlr.org/papers/volume{vol}/{pid}/{pid}.pdf"
+ formats = [
+ (self.re_abs_1, self.re_pdf_1),
+ (self.re_abs_2, self.re_pdf_2),
+ ]
+
+ for re_abs, re_pdf in formats:
+ ma = re.match(re_abs, url)
+ mp = re.match(re_pdf, url)
+ if ma:
+ abs_url = url
+ pdf_url = pdf_fmt.format(
+ vol=ma.group("vol"), pid=ma.group("pid")
+ )
+ elif mp:
+ abs_url = abs_fmt.format(
+ vol=mp.group("vol"), pid=mp.group("pid")
+ )
+ pdf_url = url
+ if abs_url is None or pdf_url is None:
+ raise URLResolutionError("JMLR", url)
+ return abs_url, pdf_url
+
+ def validate(src):
+ return (
+ re.match(JMLR.re_abs_1, src)
+ or re.match(JMLR.re_abs_2, src)
+ or re.match(JMLR.re_pdf_1, src)
+ or re.match(JMLR.re_pdf_2, src)
+ )
diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py
new file mode 100644
index 0000000..28e0973
--- /dev/null
+++ b/paper2remarkable/providers/nber.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for NBER
+
+(US) National Bureau of Economic Research
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class NBERInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+ def _format_authors(self, soup_authors, sep=" ", idx=0, op=None):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1, op=None)
+
+
+class NBER(Provider):
+
+ re_abs = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)$"
+ re_pdf = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)\.pdf$"
+
+ re_pdf_2 = "https://www.nber.org/system/files/working_papers/(?P<ref>[a-z0-9]+)/(?P=ref).pdf"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = NBERInformer()
+
+ def get_report_no(self, url):
+ m = re.match(self.re_pdf_2, url)
+ if m:
+ return m["ref"]
+ raise URLResolutionError(
+ "NBER", url, reason="Failed to retrieve report number."
+ )
+
+ def get_abs_pdf_urls(self, url):
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = url + ".pdf"
+ elif re.match(self.re_pdf, url):
+ pdf_url = url
+ abs_url = url[: -len(".pdf")]
+ elif re.match(self.re_pdf_2, url):
+ ref = self.get_report_no(url)
+ abs_url = f"https://www.nber.org/papers/{ref}"
+ pdf_url = url
+ else:
+ raise URLResolutionError("NBER", url)
+ return abs_url, pdf_url
+
+ def validate(src):
+ return (
+ re.match(NBER.re_abs, src)
+ or re.match(NBER.re_pdf, src)
+ or re.match(NBER.re_pdf_2, src)
+ )
diff --git a/paper2remarkable/providers/neurips.py b/paper2remarkable/providers/neurips.py
index 87cf2c1..d76202c 100644
--- a/paper2remarkable/providers/neurips.py
+++ b/paper2remarkable/providers/neurips.py
@@ -25,8 +25,8 @@ class NeurIPSInformer(Informer):
class NeurIPS(Provider):
- re_abs = "^https?://papers.nips.cc/paper/[\d\w\-]+$"
- re_pdf = "^https?://papers.nips.cc/paper/[\d\w\-]+.pdf$"
+ re_abs = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+$"
+ re_pdf = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+.pdf$"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
index 47c0555..8c44f45 100644
--- a/paper2remarkable/providers/openreview.py
+++ b/paper2remarkable/providers/openreview.py
@@ -8,17 +8,49 @@ Copyright: 2019, G.J.J. van den Burg
"""
+import json
import re
from ._base import Provider
from ._info import Informer
from ..exceptions import URLResolutionError
+from ..log import Logger
+
+logger = Logger()
class OpenReviewInformer(Informer):
meta_date_key = "citation_publication_date"
+ def get_authors(self, soup):
+ # Get the authors for OpenReview by parsing the JSON payload
+ #
+ # This may not be super robust long term, but works for now.
+ warning = (
+ "Couldn't determine author information, maybe provide "
+ "the desired filename using '--filename'?"
+ )
+
+ script = soup.find("script", {"id": "__NEXT_DATA__"})
+ if not script:
+ logger.warning(warning)
+ return ""
+
+ try:
+ paper_data = json.loads(script.contents[0])
+ except json.JSONDecodeError:
+ logger.warning(warning)
+ return ""
+
+ try:
+ content = paper_data["props"]["pageProps"]["forumNote"]["content"]
+ authors = content["authors"]
+ except KeyError:
+ logger.warning(warning)
+ return ""
+ return self._format_authors(authors)
+
def _format_authors(self, soup_authors):
return super()._format_authors(soup_authors, sep=" ", idx=-1)
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
index 5314ec7..d20d4a5 100644
--- a/paper2remarkable/providers/pdf_url.py
+++ b/paper2remarkable/providers/pdf_url.py
@@ -12,13 +12,41 @@ import urllib
from ._base import Provider
from ._info import Informer
+
+from .. import GITHUB_URL
from ..exceptions import FilenameMissingError
+from ..log import Logger
+from ..utils import get_content_type_with_retry
+
+logger = Logger()
class PdfUrlInformer(Informer):
def get_filename(self, abs_url):
- # if this is called, filename must not have been provided
- raise FilenameMissingError(provider="PDFUrl")
+ # try to get a nice filename by parsing the url
+ parsed = urllib.parse.urlparse(abs_url)
+ path_parts = parsed.path.split("/")
+ if not path_parts:
+ raise FilenameMissingError(
+ provider="PdfUrl",
+ url=abs_url,
+ reason="No URL parts",
+ )
+
+ filename = path_parts[-1]
+ if not filename.endswith(".pdf"):
+ raise FilenameMissingError(
+ provider="PdfUrl",
+ url=abs_url,
+ reason="URL path didn't end in .pdf",
+ )
+ logger.warning(
+ "Using filename {filename} extracted from url. "
+ "You might want to provide a nicer one using --filename "
+ "or request this paper source to be added "
+ "(see: {github}).".format(filename=filename, github=GITHUB_URL)
+ )
+ return filename
class PdfUrl(Provider):
@@ -27,11 +55,15 @@ class PdfUrl(Provider):
self.informer = PdfUrlInformer()
def get_abs_pdf_urls(self, url):
- return (None, url)
+ return (url, url)
def validate(src):
- try:
- result = urllib.parse.urlparse(src)
- return all([result.scheme, result.netloc, result.path])
- except:
+ # first check if it is a valid url
+ parsed = urllib.parse.urlparse(src)
+ if not all([parsed.scheme, parsed.netloc, parsed.path]):
+ return False
+ # next, get the header and check the content type
+ ct = get_content_type_with_retry(src)
+ if ct is None:
return False
+ return ct.startswith("application/pdf")
diff --git a/paper2remarkable/providers/sagepub.py b/paper2remarkable/providers/sagepub.py
new file mode 100644
index 0000000..7e76df8
--- /dev/null
+++ b/paper2remarkable/providers/sagepub.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for SagePub
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+
+
+class SagePubInformer(Informer):
+
+ meta_author_key = "dc.Creator"
+ meta_title_key = "dc.Title"
+ meta_date_key = "dc.Date"
+
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+ def _format_year(self, soup_date):
+ return soup_date.split("-")[0]
+
+
+class SagePub(Provider):
+
+ re_abs = "https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+"
+ re_pdf = "https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = SagePubInformer()
+
+ def get_abs_pdf_urls(self, url):
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = url.replace("full", "pdf")
+ elif re.match(self.re_pdf, url):
+ pdf_url = url
+ abs_url = url.replace("pdf", "full")
+ else:
+ raise URLResolutionError("SagePub", url)
+ return abs_url, pdf_url
+
+ def validate(src):
+ return re.match(SagePub.re_abs, src) or re.match(SagePub.re_pdf, src)
diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py
new file mode 100644
index 0000000..0a1b414
--- /dev/null
+++ b/paper2remarkable/providers/semantic_scholar.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for SemanticScholar
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2020, G.J.J. van den Burg
+
+"""
+
+import re
+import bs4
+
+from ._base import Provider
+from ._info import Informer
+from ..exceptions import URLResolutionError
+from ..utils import get_page_with_retry
+
+
+class SemanticScholarInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class SemanticScholar(Provider):
+
+ re_abs = (
+ "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}"
+ )
+ re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = SemanticScholarInformer()
+
+ def get_abs_pdf_urls(self, url):
+ """ Get the pdf and abstract urls from a SemanticScholar url """
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = self._get_pdf_url(abs_url)
+ elif re.match(self.re_pdf, url):
+ pdf_url = url
+ remainder = pdf_url.split("/")[-1][: -len(".pdf")]
+ first_four = pdf_url.split("/")[-2]
+ paper_id = first_four + remainder
+ abs_url = f"https://www.semanticscholar.org/paper/{paper_id}"
+ else:
+ raise URLResolutionError("SemanticScholar", url)
+ return abs_url, pdf_url
+
+ def _get_pdf_url(self, url):
+ page = get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ meta = soup.find_all("meta", {"name": "citation_pdf_url"})
+ if not meta:
+ raise URLResolutionError("SemanticScholar", url)
+ return meta[0]["content"]
+
+ def validate(src):
+ return re.match(SemanticScholar.re_abs, src) or re.match(
+ SemanticScholar.re_pdf, src
+ )
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
index 5ce2564..31f0a67 100644
--- a/paper2remarkable/providers/springer.py
+++ b/paper2remarkable/providers/springer.py
@@ -10,40 +10,75 @@ Copyright: 2019, G.J.J. van den Burg
import re
import urllib
+import requests
from ._base import Provider
from ._info import Informer
from ..exceptions import URLResolutionError
+from ..utils import HEADERS
class SpringerInformer(Informer):
- meta_date_key = "citation_online_date"
+ meta_date_key = None
def _format_authors(self, soup_authors):
return super()._format_authors(soup_authors, sep=" ", idx=-1)
+ def get_year(self, soup):
+ for key in ["citation_online_date", "citation_publication_date"]:
+ meta = soup.find_all("meta", {"name": key})
+ if not meta:
+ continue
+ return self._format_year(meta[0]["content"])
+ return ""
+
class Springer(Provider):
- re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
- re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
+ re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
+ re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+"
+ re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.informer = SpringerInformer()
+ def _get_abs_url(self, pdf_url):
+ article_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")]
+ req = requests.head(
+ article_url, headers=HEADERS, cookies=self.cookiejar
+ )
+ if req.status_code == 200:
+ return article_url
+
+ chapter_url = pdf_url.replace("content/pdf", "chapter")[: -len(".pdf")]
+ req = requests.head(
+ chapter_url, headers=HEADERS, cookies=self.cookiejar
+ )
+ if req.status_code == 200:
+ return chapter_url
+
+ raise URLResolutionError("Springer", pdf_url)
+
def get_abs_pdf_urls(self, url):
""" Get the pdf and abstract urls from a Springer url """
- if re.match(self.re_abs, url):
+ if re.match(self.re_abs_1, url):
abs_url = url
pdf_url = url.replace("article", "content/pdf")
+ elif re.match(self.re_abs_2, url):
+ abs_url = url
+ pdf_url = url.replace("chapter", "content/pdf")
elif re.match(self.re_pdf, url):
- abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
+ abs_url = self._get_abs_url(url)
pdf_url = urllib.parse.unquote(url)
else:
raise URLResolutionError("Springer", url)
return abs_url, pdf_url
def validate(src):
- return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
+ return (
+ re.match(Springer.re_abs_1, src)
+ or re.match(Springer.re_abs_2, src)
+ or re.match(Springer.re_pdf, src)
+ )
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
index 032bf99..ea24403 100644
--- a/paper2remarkable/ui.py
+++ b/paper2remarkable/ui.py
@@ -13,6 +13,7 @@ import sys
from . import __version__, GITHUB_URL
+from .exceptions import UnidentifiedSourceError, InvalidURLError
from .providers import providers, LocalFile
from .utils import follow_redirects, is_url
@@ -53,20 +54,36 @@ def parse_args():
default="/",
)
parser.add_argument(
+ "-r",
+ "--right",
+ help="Right align so the menu doesn't cover it",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-k", "--no-crop", help="Don't crop the pdf file", action="store_true"
+ )
+ parser.add_argument(
"-v", "--verbose", help="be verbose", action="store_true"
)
parser.add_argument(
+ "-V",
+ "--version",
+ help="Show version and exit",
+ action="version",
+ version=__version__,
+ )
+ parser.add_argument(
"--filename",
help="Filename to use for the file on reMarkable",
- default=None,
+ action="append",
)
parser.add_argument(
"--gs", help="path to gs executable (default: gs)", default="gs"
)
parser.add_argument(
- "--pdfcrop",
- help="path to pdfcrop executable (default: pdfcrop)",
- default="pdfcrop",
+ "--pdftoppm",
+ help="path to pdftoppm executable (default: pdftoppm)",
+ default="pdftoppm",
)
parser.add_argument(
"--pdftk",
@@ -74,12 +91,19 @@ def parse_args():
default="pdftk",
)
parser.add_argument(
+ "--qpdf",
+ help="path to qpdf executable (default: qpdf)",
+ default="qpdf",
+ )
+ parser.add_argument(
"--rmapi",
help="path to rmapi executable (default: rmapi)",
default="rmapi",
)
parser.add_argument(
- "input", help="URL to a paper or the path of a local PDF file"
+ "input",
+ help="One or more URLs to a paper or paths to local PDF files",
+ nargs="+",
)
return parser.parse_args()
@@ -90,44 +114,113 @@ def exception(msg):
print("", file=sys.stderr)
print(
"If you think this might be a bug, please raise an issue on GitHub: %s"
- % GITHUB_URL
+ % GITHUB_URL,
+ file=sys.stderr,
)
+ print("", file=sys.stderr)
raise SystemExit(1)
-def main():
- args = parse_args()
- cookiejar = None
+def choose_provider(cli_input):
+ """Choose the provider to use for the given source
- if is_url(args.input):
- # input is a url
- url, cookiejar = follow_redirects(args.input)
- provider = next((p for p in providers if p.validate(url)), None)
- elif LocalFile.validate(args.input):
+ This function first tries to check if the input is a local file, by
+ checking if the path exists. Next, it checks if the input is a "valid" url
+ using a regex test. If it is, the registered provider classes are checked
+ to see which provider can handle this url.
+
+ Returns
+ -------
+ provider : class
+ The class of the provider that can handle the source. A subclass of the
+ Provider abc.
+
+ new_input : str
+ The updated input to the provider. This only has an effect for the url
+ providers, where this will be the url after following all redirects.
+
+ cookiejar : dict or requests.RequestsCookieJar
+ Cookies picked up when following redirects. These are needed for some
+ providers to ensure later requests have the right cookie settings.
+
+ Raises
+ ------
+ UnidentifiedSourceError
+ Raised when the input is neither an existing local file nor a valid url
+
+ InvalidURLError
+ Raised when the input *is* a valid url, but no provider can handle it.
+
+ """
+ provider = cookiejar = None
+ if LocalFile.validate(cli_input):
# input is a local file
+ new_input = cli_input
provider = LocalFile
+ elif is_url(cli_input):
+ # input is a url
+ new_input, cookiejar = follow_redirects(cli_input)
+ provider = next((p for p in providers if p.validate(new_input)), None)
else:
# not a proper URL or non-existent file
+ raise UnidentifiedSourceError
+
+ if provider is None:
+ raise InvalidURLError
+
+ return provider, new_input, cookiejar
+
+
+def set_excepthook(debug):
+ sys_hook = sys.excepthook
+
+ def exception_handler(exception_type, value, traceback):
+ if debug:
+ sys_hook(exception_type, value, traceback)
+ else:
+ print(value, file=sys.stderr)
+
+ sys.excepthook = exception_handler
+
+
+def main():
+ args = parse_args()
+ set_excepthook(args.debug)
+
+ if args.center and args.right:
+ exception("Can't center and right align at the same time!")
+
+ if args.center and args.no_crop:
+ exception("Can't center and not crop at the same time!")
+
+ if args.right and args.no_crop:
+ exception("Can't right align and not crop at the same time!")
+
+ if args.filename and not len(args.filename) == len(args.input):
exception(
- "Couldn't figure out what source you mean. If it's a "
- "local file, make sure it exists."
+ "When providing --filename and multiple inputs, their number must match."
)
- if provider is None:
- exception("Input not valid, no provider can handle this source.")
-
- prov = provider(
- verbose=args.verbose,
- upload=not args.no_upload,
- debug=args.debug,
- center=args.center,
- blank=args.blank,
- remarkable_dir=args.remarkable_dir,
- rmapi_path=args.rmapi,
- pdfcrop_path=args.pdfcrop,
- pdftk_path=args.pdftk,
- gs_path=args.gs,
- cookiejar=cookiejar,
- )
-
- prov.run(args.input, filename=args.filename)
+ filenames = (
+ [None] * len(args.input) if not args.filename else args.filename
+ )
+
+ for cli_input, filename in zip(args.input, filenames):
+ provider, new_input, cookiejar = choose_provider(cli_input)
+ prov = provider(
+ verbose=args.verbose,
+ upload=not args.no_upload,
+ debug=args.debug,
+ center=args.center,
+ right=args.right,
+ blank=args.blank,
+ no_crop=args.no_crop,
+ remarkable_dir=args.remarkable_dir,
+ rmapi_path=args.rmapi,
+ pdftoppm_path=args.pdftoppm,
+ pdftk_path=args.pdftk,
+ qpdf_path=args.qpdf,
+ gs_path=args.gs,
+ cookiejar=cookiejar,
+ )
+ prov.run(new_input, filename=filename)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
index 79421df..0b4be07 100644
--- a/paper2remarkable/utils.py
+++ b/paper2remarkable/utils.py
@@ -17,7 +17,7 @@ import time
import unidecode
from .log import Logger
-from .exceptions import FileTypeError, RemarkableError
+from .exceptions import FileTypeError, RemarkableError, NoPDFToolError
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
@@ -30,14 +30,15 @@ logger = Logger()
def clean_string(s):
- """ Clean a string by replacing accented characters with equivalents and
- keeping only the allowed characters (ascii letters, digits, underscore,
+ """Clean a string by replacing accented characters with equivalents and
+ keeping only the allowed characters (ascii letters, digits, underscore,
space, dash, and period)"""
normalized = unidecode.unidecode(s)
allowed = string.ascii_letters + string.digits + "_ .-"
cleaned = "".join(c if c in allowed else "_" for c in normalized)
while "__" in cleaned:
cleaned = cleaned.replace("__", "_")
+ cleaned = cleaned.strip("_")
return cleaned
@@ -64,7 +65,7 @@ def download_url(url, filename, cookiejar=None):
fid.write(content)
-def get_page_with_retry(url, tries=5, cookiejar=None):
+def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False):
count = 0
jar = {} if cookiejar is None else cookiejar
while count < tries:
@@ -82,9 +83,33 @@ def get_page_with_retry(url, tries=5, cookiejar=None):
time.sleep(5)
continue
logger.info("Downloaded url: %s" % url)
+ if return_text:
+ return res.text
return res.content
+def get_content_type_with_retry(url, tries=5, cookiejar=None):
+ count = 0
+ jar = {} if cookiejar is None else cookiejar
+ while count < tries:
+ count += 1
+ error = False
+ try:
+ res = requests.head(
+ url, headers=HEADERS, cookies=jar, allow_redirects=True
+ )
+ except requests.exceptions.ConnectionError:
+ error = True
+ if error or not res.ok:
+ logger.warning(
+ "(%i/%i) Error getting headers for %s. Retrying in 5 seconds."
+ % (count, tries, url)
+ )
+ time.sleep(5)
+ continue
+ return res.headers.get("Content-Type", None)
+
+
def follow_redirects(url):
"""Follow redirects from the URL (at most 100)"""
it = 0
@@ -98,8 +123,10 @@ def follow_redirects(url):
if not "Location" in req.headers:
break
url = req.headers["Location"]
- jar = req.cookies
+ jar.update(req.cookies)
it += 1
+ if it == 100:
+ logger.warning("Max redirects reached. There may be a problem.")
jar = jar or req.cookies
return url, jar
@@ -110,13 +137,19 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
# Create the reMarkable dir if it doesn't exist
remarkable_dir = remarkable_dir.rstrip("/")
if remarkable_dir:
- status = subprocess.call(
- [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- raise RemarkableError(
- "Creating directory %s on reMarkable failed" % remarkable_dir
+ parts = remarkable_dir.split("/")
+ rmdir = ""
+ while parts:
+ rmdir += "/" + parts.pop(0)
+ status = subprocess.call(
+ [rmapi_path, "mkdir", rmdir],
+ stdout=subprocess.DEVNULL,
)
+ if not status == 0:
+ raise RemarkableError(
+ "Creating directory %s on reMarkable failed"
+ % remarkable_dir
+ )
# Upload the file
status = subprocess.call(
@@ -132,7 +165,34 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
def is_url(string):
# pattern adapted from CleverCSV
- pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*\-?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:]*)?(\.[a-z]+)?"
+ pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*[\-\_]?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?"
string = string.strip(" ")
match = regex.fullmatch(pattern, string)
return match is not None
+
+
+def check_pdftool(pdftk_path, qpdf_path):
+ """Check whether we have pdftk or qpdf available"""
+ # set defaults in case either is set to None or something
+ pdftk_path = pdftk_path or "false"
+ qpdf_path = qpdf_path or "false"
+
+ try:
+ status = subprocess.call(
+ [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+ )
+ except FileNotFoundError:
+ status = 1
+ if status == 0:
+ return "pdftk"
+ try:
+ status = subprocess.call(
+ [qpdf_path, "--help"],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+ except FileNotFoundError:
+ status = 1
+ if status == 0:
+ return "qpdf"
+ raise NoPDFToolError
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..a8f43fe
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,2 @@
+[tool.black]
+line-length = 79
diff --git a/setup.py b/setup.py
index bddbd24..54a8cb1 100644
--- a/setup.py
+++ b/setup.py
@@ -25,18 +25,24 @@ REQUIRED = [
"unidecode>=1.1",
"titlecase>=0.12",
"PyPDF2>=1.26",
- "regex>=2018.11"
+ "regex>=2018.11",
+ "readability-lxml>=0.7.1",
+ "html2text>=2020.1.16",
+ "weasyprint>=51",
+ "markdown>=3.1.1",
]
+full_require = ["readabilipy"]
docs_require = []
-test_require = []
-dev_require = ["green"]
+test_require = ["green"]
+dev_require = []
# What packages are optional?
EXTRAS = {
+ "full": full_require,
"docs": docs_require,
- "tests": test_require,
- "dev": docs_require + test_require + dev_require,
+ "test": test_require + full_require,
+ "dev": docs_require + test_require + dev_require + full_require,
}
# The rest you shouldn't have to touch too much :)
diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py
new file mode 100644
index 0000000..2cb84cf
--- /dev/null
+++ b/tests/test_arxiv.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Unit tests for arXiv provider
+
+This file is part of paper2remarkable.
+
+"""
+
+import os
+import re
+import shutil
+import tempfile
+import unittest
+
+from paper2remarkable.providers.arxiv import (
+ DEARXIV_TEXT_REGEX,
+ DEARXIV_URI_REGEX,
+ Arxiv,
+)
+
+
+class TestArxiv(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.original_dir = os.getcwd()
+
+ def setUp(self):
+ self.test_dir = tempfile.mkdtemp()
+ os.chdir(self.test_dir)
+
+ def tearDown(self):
+ os.chdir(self.original_dir)
+ shutil.rmtree(self.test_dir)
+
+ def test_text_regex_1(self):
+ key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_text_regex_2(self):
+ key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_text_regex_3(self):
+ key = b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_text_regex_4(self):
+ key = b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004"
+ m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_uri_regex_1(self):
+ key = b"http://arxiv.org/abs/physics/0605197v1"
+ m = re.fullmatch(DEARXIV_URI_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_uri_regex_2(self):
+ key = b"https://arxiv.org/abs/1101.0028v3"
+ m = re.fullmatch(DEARXIV_URI_REGEX, key)
+ self.assertIsNotNone(m)
+
+ def test_stamp_removed_1(self):
+ url = "https://arxiv.org/pdf/1703.06103.pdf"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data)
+
+ def test_stamp_removed_2(self):
+ url = "https://arxiv.org/abs/2003.06222"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data)
+
+ def test_stamp_removed_3(self):
+ url = "https://arxiv.org/abs/physics/0605197v1"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(
+ b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006", data
+ )
+ self.assertNotIn(
+ b"/URI (http://arxiv.org/abs/physics/0605197v1)", data
+ )
+
+ def test_stamp_removed_4(self):
+ url = "https://arxiv.org/abs/math/0309285v2"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004", data)
+ self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data)
+
+ def test_stamp_removed_5(self):
+ url = "https://arxiv.org/abs/astro-ph/9207001v1"
+ prov = Arxiv(upload=False)
+ filename = prov.run(url, filename="./target.pdf")
+ prov.uncompress_pdf(filename, "unc.pdf")
+ with open("unc.pdf", "rb") as fp:
+ data = fp.read()
+ self.assertNotIn(
+ b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data
+ )
+ self.assertNotIn(b"arXiv:astro-ph/9207001v1 13 Jul 1992", data)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 3204768..4ee6773 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -7,7 +7,7 @@ __author__ = "G.J.J. van den Burg"
import hashlib
import os
-import re
+import pdfplumber
import shutil
import tempfile
import unittest
@@ -15,17 +15,22 @@ import unittest
from paper2remarkable.providers import (
ACM,
Arxiv,
+ CVF,
CiteSeerX,
+ HTML,
+ JMLR,
LocalFile,
+ NBER,
NeurIPS,
OpenReview,
PMLR,
PdfUrl,
PubMed,
+ SagePub,
+ SemanticScholar,
Springer,
TandFOnline,
)
-from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX
VERBOSE = False
@@ -41,18 +46,6 @@ def md5sum(filename):
return hasher.hexdigest()
-class TestArxiv(unittest.TestCase):
- def test_text_regex_1(self):
- key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019"
- m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
- self.assertIsNotNone(m)
-
- def test_text_regex_2(self):
- key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019"
- m = re.fullmatch(DEARXIV_TEXT_REGEX, key)
- self.assertIsNotNone(m)
-
-
class TestProviders(unittest.TestCase):
@classmethod
def setUpClass(cls):
@@ -67,7 +60,8 @@ class TestProviders(unittest.TestCase):
shutil.rmtree(self.test_dir)
def test_arxiv_1(self):
- prov = Arxiv(upload=False, verbose=VERBOSE)
+ # check with qpdf
+ prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None)
url = "https://arxiv.org/abs/1811.11242v1"
exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf"
filename = prov.run(url)
@@ -96,6 +90,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_arxiv_5(self):
+ prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None)
+ url = "https://arxiv.org/abs/2002.11523"
+ exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
+ def test_arxiv_6(self):
+ prov = Arxiv(upload=False, verbose=VERBOSE)
+ url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------"
+ exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_With_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_pmc(self):
prov = PubMed(upload=False, verbose=VERBOSE)
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
@@ -128,13 +136,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
- def test_springer(self):
+ def test_springer_1(self):
prov = Springer(upload=False, verbose=VERBOSE)
url = "https://link.springer.com/article/10.1007/s10618-019-00631-5"
exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf"
filename = prov.run(url)
self.assertEqual(exp_filename, os.path.basename(filename))
+ def test_springer_2(self):
+ prov = Springer(upload=False, verbose=VERBOSE)
+ url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf"
+ exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp_filename, os.path.basename(filename))
+
def test_local(self):
local_filename = "test.pdf"
with open(local_filename, "w") as fp:
@@ -145,11 +160,31 @@ class TestProviders(unittest.TestCase):
filename = prov.run(local_filename)
self.assertEqual("test_.pdf", os.path.basename(filename))
- def test_pdfurl(self):
+ def test_pdfurl_1(self):
+ prov = PdfUrl(upload=False, verbose=VERBOSE)
+ url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
+ filename = prov.run(url)
+ self.assertEqual("14-526.pdf", os.path.basename(filename))
+
+ def test_pdfurl_2(self):
prov = PdfUrl(upload=False, verbose=VERBOSE)
+ url = "https://www.manuelrigger.at/preprints/NoREC.pdf"
+ filename = prov.run(url)
+ self.assertEqual("NoREC.pdf", os.path.basename(filename))
+
+ def test_jmlr_1(self):
+ prov = JMLR(upload=False, verbose=VERBOSE)
url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf"
- filename = prov.run(url, filename="test.pdf")
- self.assertEqual("test.pdf", os.path.basename(filename))
+ exp = "Burg_Groenen_-_GenSVM_a_Generalized_Multiclass_Support_Vector_Machine_2016.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_jmlr_2(self):
+ prov = JMLR(upload=False, verbose=VERBOSE)
+ url = "http://www.jmlr.org/papers/v10/xu09a.html"
+ exp = "Xu_Zhang_-_Refinement_of_Reproducing_Kernels_2009.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
def test_pmlr_1(self):
prov = PMLR(upload=False, verbose=VERBOSE)
@@ -179,6 +214,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_nber_1(self):
+ prov = NBER(upload=False, verbose=VERBOSE)
+ url = "https://www.nber.org/papers/w26752"
+ exp = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_nber_2(self):
+ prov = NBER(upload=False, verbose=VERBOSE)
+ url = "https://www.nber.org/papers/w19152.pdf"
+ exp = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
def test_neurips_1(self):
prov = NeurIPS(upload=False, verbose=VERBOSE)
url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf"
@@ -193,6 +242,20 @@ class TestProviders(unittest.TestCase):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
+ def test_neurips_3(self):
+ prov = NeurIPS(upload=False, verbose=VERBOSE)
+ url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits"
+ exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_neurips_4(self):
+ prov = NeurIPS(upload=False, verbose=VERBOSE)
+ url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf"
+ exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
def test_citeseerx_1(self):
prov = CiteSeerX(upload=False, verbose=VERBOSE)
url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548"
@@ -218,6 +281,90 @@ class TestProviders(unittest.TestCase):
prov = TandFOnline(upload=False, verbose=VERBOSE)
url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true"
exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf"
+
+ def test_html_1(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines"
+ exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_html_2(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://www.nature.com/articles/d41586-020-00176-4"
+ exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_html_3(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://conclave-team.github.io/conclave-site/"
+ # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf"
+ # NOTE: Title differs between Readability.JS and readability-lxml, we
+ # assume that testing is done with Readability.JS
+ exp = "Conclave.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+ # this is a proxy test to check that all images are included
+ self.assertEqual(32, len(pdfplumber.open(filename).pages))
+
+ def test_html_4(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://sirupsen.com/2019/"
+ filename = prov.run(url)
+ # this is a proxy test to check that all images are included
+ self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
+ def test_html_5(self):
+ prov = HTML(upload=False, verbose=VERBOSE)
+ url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#"
+ filename = prov.run(url)
+ # this is a proxy test to check that all images are included
+ self.assertEqual(4, len(pdfplumber.open(filename).pages))
+
+ def test_semantic_scholar_1(self):
+ prov = SemanticScholar(upload=False, verbose=VERBOSE)
+ url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf"
+ exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_semantic_scholar_2(self):
+ prov = SemanticScholar(upload=False, verbose=VERBOSE)
+ url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f"
+ exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_sagepub_1(self):
+ prov = SagePub(upload=False, verbose=VERBOSE)
+ url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679"
+ exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_sagepub_2(self):
+ prov = SagePub(upload=False, verbose=VERBOSE)
+ url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432"
+ exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf"
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_cvf_1(self):
+ prov = CVF(upload=False, verbose=VERBOSE)
+ url = "https://openaccess.thecvf.com/content_ICCV_2019/html/Muhammad_Goal-Driven_Sequential_Data_Abstraction_ICCV_2019_paper.html"
+ exp = (
+ "Muhammad_et_al_-_Goal-Driven_Sequential_Data_Abstraction_2019.pdf"
+ )
+ filename = prov.run(url)
+ self.assertEqual(exp, os.path.basename(filename))
+
+ def test_cvf_2(self):
+ prov = CVF(upload=False, verbose=VERBOSE)
+ url = "https://openaccess.thecvf.com/content_CVPR_2020/papers/Park_Seeing_the_World_in_a_Bag_of_Chips_CVPR_2020_paper.pdf"
+ exp = (
+ "Park_Holynski_Seitz_-_Seeing_the_World_in_a_Bag_of_Chips_2020.pdf"
+ )
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))
diff --git a/tests/test_ui.py b/tests/test_ui.py
new file mode 100644
index 0000000..835f594
--- /dev/null
+++ b/tests/test_ui.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Unit tests for command line interface
+
+This file is part of paper2remarkable.
+
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+
+from paper2remarkable.exceptions import (
+ InvalidURLError,
+ UnidentifiedSourceError,
+)
+from paper2remarkable.providers import (
+ ACM,
+ Arxiv,
+ CiteSeerX,
+ CVF,
+ HTML,
+ JMLR,
+ LocalFile,
+ NBER,
+ NeurIPS,
+ OpenReview,
+ PMLR,
+ PdfUrl,
+ PubMed,
+ Springer,
+)
+from paper2remarkable.ui import choose_provider
+
+
+class TestUI(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ cls.original_dir = os.getcwd()
+
+ def setUp(self):
+ self.test_dir = tempfile.mkdtemp()
+ os.chdir(self.test_dir)
+
+ def tearDown(self):
+ os.chdir(self.original_dir)
+ shutil.rmtree(self.test_dir)
+
+ def test_choose_provider_1(self):
+ tests = [
+ (
+ Arxiv,
+ "https://arxiv.org/abs/1811.11242v1",
+ "https://arxiv.org/abs/1811.11242v1",
+ ),
+ (
+ Arxiv,
+ "http://arxiv.org/abs/arXiv:1908.03213",
+ "https://arxiv.org/abs/1908.03213",
+ ),
+ (
+ Arxiv,
+ "https://arxiv.org/abs/math/0309285",
+ "https://arxiv.org/abs/math/0309285",
+ ),
+ (
+ Arxiv,
+ "https://arxiv.org/pdf/physics/0605197v1.pdf",
+ "https://arxiv.org/pdf/physics/0605197v1.pdf",
+ ),
+ (
+ PubMed,
+ "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/",
+ "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/",
+ ),
+ (
+ ACM,
+ "https://dl.acm.org/citation.cfm?id=3025626",
+ "https://dl.acm.org/doi/10.1145/3025453.3025626",
+ ),
+ (
+ ACM,
+ "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true",
+ "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true&",
+ ),
+ (
+ OpenReview,
+ "http://openreview.net/forum?id=S1x4ghC9tQ",
+ "https://openreview.net/forum?id=S1x4ghC9tQ",
+ ),
+ (
+ Springer,
+ "https://link.springer.com/article/10.1007/s10618-019-00631-5",
+ "https://link.springer.com/article/10.1007/s10618-019-00631-5",
+ ),
+ (
+ PdfUrl,
+ "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf",
+ "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf",
+ ),
+ (
+ JMLR,
+ "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
+ "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf",
+ ),
+ (
+ JMLR,
+ "https://www.jmlr.org/papers/v10/xu09a.html",
+ "https://www.jmlr.org/papers/v10/xu09a.html",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v97/behrmann19a.html",
+ "http://proceedings.mlr.press/v97/behrmann19a.html",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf",
+ "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v48/melnyk16.pdf",
+ "http://proceedings.mlr.press/v48/melnyk16.pdf",
+ ),
+ (
+ PMLR,
+ "http://proceedings.mlr.press/v48/zhangf16.html",
+ "http://proceedings.mlr.press/v48/zhangf16.html",
+ ),
+ (
+ NBER,
+ "https://www.nber.org/papers/w26752",
+ "https://www.nber.org/papers/w26752",
+ ),
+ (
+ NBER,
+ "https://www.nber.org/papers/w19152.pdf",
+ "https://www.nber.org/system/files/working_papers/w19152/w19152.pdf",
+ ),
+ (
+ NeurIPS,
+ "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf",
+ "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf",
+ ),
+ (
+ NeurIPS,
+ "https://papers.nips.cc/paper/7796-middle-out-decoding",
+ "https://papers.nips.cc/paper/7796-middle-out-decoding",
+ ),
+ (
+ NeurIPS,
+ "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf",
+ "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf",
+ ),
+ (
+ CiteSeerX,
+ "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548",
+ "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548",
+ ),
+ (
+ CiteSeerX,
+ "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf",
+ "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf",
+ ),
+ (
+ HTML,
+ "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines",
+ "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines",
+ ),
+ (
+ HTML,
+ "https://www.nature.com/articles/d41586-020-00176-4",
+ "https://www.nature.com/articles/d41586-020-00176-4",
+ ),
+ (
+ CVF,
+ "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html",
+ "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html",
+ ),
+ ]
+ for exp_prov, url, exp_url in tests:
+ prov, new_url, jar = choose_provider(url)
+ with self.subTest(url=url):
+ self.assertEqual(exp_url, new_url)
+ self.assertEqual(prov, exp_prov)
+
+ def test_choose_provider_2(self):
+ local_filename = "test.pdf"
+ with open(local_filename, "w") as fp:
+ fp.write(
+ "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF"
+ )
+
+ prov, new_input, jar = choose_provider(local_filename)
+ self.assertEqual(prov, LocalFile)
+ self.assertEqual(new_input, local_filename)
+ self.assertIsNone(jar)
+
+ def test_choose_provider_3(self):
+ local_filename = "/tmp/abcdef.pdf"
+ with self.assertRaises(UnidentifiedSourceError):
+ choose_provider(local_filename)
+
+ def test_choose_provider_4(self):
+ url = "https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/README.md"
+ with self.assertRaises(InvalidURLError):
+ choose_provider(url)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..4c122e0
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import unittest
+
+from paper2remarkable.exceptions import NoPDFToolError
+from paper2remarkable.utils import check_pdftool
+
+
+class TestUtils(unittest.TestCase):
+ def test_check_pdftool(self):
+ # Needs a system with both pdftk and qpdf available
+ self.assertEqual(check_pdftool("pdftk", "qpdf"), "pdftk")
+ self.assertEqual(check_pdftool("pdftk_xyz", "qpdf"), "qpdf")
+ self.assertEqual(check_pdftool("pdftk", "qpdf_xyz"), "pdftk")
+ with self.assertRaises(NoPDFToolError):
+ check_pdftool("pdftk_xyz", "qpdf_xyz")
+
+
+if __name__ == "__main__":
+ unittest.main()