| author    | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-10-27 20:59:17 +0100 |
|-----------|----------------------------------------------------|---------------------------|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-10-27 20:59:17 +0100 |
| commit    | 282de79f44e95b539c1788de8a71177b5a023557           |                           |
| tree      | 6a6a26e210dc32d4d6a5ed4d8bc0b581af9bbc0e           |                           |
| parent    | [WIP] Provider for Taylor and Francis Online       |                           |
| parent    | Bump version and update changelog                  |                           |
Merge branch 'master' into feature/tandfonline
37 files changed, 2275 insertions, 359 deletions
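The merge recorded here can be reproduced with standard git commands (a sketch, assuming a local clone containing both branches; the branch names are taken from the commit subject above):

```
$ git checkout feature/tandfonline
$ git merge master    # records a merge commit like 282de79
```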
diff --git a/.github/alfred.png b/.github/alfred.png
new file mode 100644
index 0000000..78a95d9
--- /dev/null
+++ b/.github/alfred.png
Binary files differ
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..3cb791c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 20.8b1
+    hooks:
+      - id: black
+        language_version: python3
diff --git a/.travis.yml b/.travis.yml
index 5551597..32a2a1e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,4 @@
-dist: trusty
+dist: xenial
 
 language: python
 python:
@@ -6,11 +6,14 @@ python:
 
 before_install:
   - sudo apt-get update
-  - sudo apt-get install ghostscript pdftk texlive-extra-utils poppler-utils
+  - sudo apt-get install ghostscript pdftk poppler-utils qpdf
+  - nvm install v12.18.1
+  - nvm use v12.18.1
 
 install:
-  - pip install six
-  - pip install -e .[dev]
+  - pip install pre-commit
+  - pip install -e .[test]
 
 script:
+  - pre-commit run --all-files --show-diff-on-failure
   - green -vv -a ./tests
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 79ea620..6518b8e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,149 @@
 # Changelog
 
+## Version 0.7.4
+
+* Add provider for CVF
+
+## Version 0.7.3
+
+* Increase robustness for arXiv sources
+* Fix NBER provider after site update
+* Add support for multiple command line inputs
+
+## Version 0.7.2
+
+* Add support to optionally use
+  [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy), a
+  wrapper around Mozilla's
+  [Readability.js](https://github.com/mozilla/readability), to improve text
+  extraction of web articles. This closes
+  [#53](https://github.com/GjjvdBurg/paper2remarkable/issues/53), thanks to
+  @sirupsen for reporting the problem.
+* Improve NeurIPS provider to add support for papers.neurips.cc
+
+## Version 0.7.1
+
+* Fix OpenReview provider after site change
+
+## Version 0.7.0
+
+* Add provider for SagePub
+
+## Version 0.6.9
+
+* Improve robustness of Springer provider
+
+## Version 0.6.8
+
+* Add provider for SemanticScholar papers
+* Fix bug that made ``no_crop`` option no longer work
+
+## Version 0.6.7
+
+* Increase robustness to PDF issues by passing through GhostScript (fixes
+  [#51](https://github.com/GjjvdBurg/paper2remarkable/issues/51)). Thanks to
+  @sirupsen.
+* Bugfix for code that removes arXiv stamp.
+
+## Version 0.6.6
+
+* Bugfix to url validation: allow underscore in subdomains.
+
+## Version 0.6.5
+
+* Corrections to code that removes the arXiv stamp
+  ([#49](https://github.com/GjjvdBurg/paper2remarkable/issues/49)). Thanks to
+  @mr-ubik.
+
+## Version 0.6.4
+
+* Further fixes for images in HTML sources
+  ([#45](https://github.com/GjjvdBurg/paper2remarkable/issues/45)). Thanks to
+  @sirupsen.
+
+## Version 0.6.3
+
+* Properly resolve image urls in HTML sources
+  ([#45](https://github.com/GjjvdBurg/paper2remarkable/issues/45)). Thanks to
+  @sirupsen.
+* Allow ``+`` in urls
+
+## Version 0.6.2
+
+* Print to log whether removing arXiv stamp was successful.
+* Fix bug that failed to correctly detect the pdf tool
+  ([#42](https://github.com/GjjvdBurg/paper2remarkable/issues/42)).
+
+## Version 0.6.1
+
+* Bugfix that makes removing the arXiv stamp more robust.
+
+## Version 0.6.0
+
+* The Dockerfile has been updated to use a more recent version of Cairo
+  ([#35](https://github.com/GjjvdBurg/paper2remarkable/issues/35)). Thanks to
+  @ClaytonJY.
+* We've added support for optionally using qpdf instead of pdftk
+  ([#36](https://github.com/GjjvdBurg/paper2remarkable/pull/36)). Thanks to
+  @delaere.
+* Resolving redirects has been improved, which solves an issue for the
+  Springer provider
+  ([#38](https://github.com/GjjvdBurg/paper2remarkable/pull/38)) and an issue
+  with some arXiv urls
+  ([#39](https://github.com/GjjvdBurg/paper2remarkable/pull/39)).
+* Unit tests were added for the provider selection.
+* The code that removes the arXiv stamp has been improved
+  ([#40](https://github.com/GjjvdBurg/paper2remarkable/pull/40)).
+* Tracebacks have been disabled outside of debug mode, showing clearer errors
+  ([#41](https://github.com/GjjvdBurg/paper2remarkable/pull/41)).
+
+## Version 0.5.6
+
+* Be more robust against missing pdftoppm executable.
+
+## Version 0.5.5
+
+* Fix bug for when the shrink operation returns bigger files
+  ([#33](https://github.com/GjjvdBurg/paper2remarkable/issues/33)).
+
+## Version 0.5.4
+
+* Add the option to not crop the file at all
+  ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/30)).
+* Add the option to right-align the file so the menu doesn't overlap
+  ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/31)).
+* Bugfix for validation for the JMLR provider
+
+## Version 0.5.3
+
+* Significantly speed up the program
+  ([#26](https://github.com/GjjvdBurg/paper2remarkable/issues/26))
+* Add provider for JMLR
+  ([#28](https://github.com/GjjvdBurg/paper2remarkable/pull/28)).
+* Bugfix for creating nested directories with ``-p`` option.
+
+## Version 0.5.2
+
+* Add provider for US National Bureau of Economic Research
+  ([#27](https://github.com/GjjvdBurg/paper2remarkable/pull/27)).
+* Automatically extract the filename from a pdf url where possible
+  ([#25](https://github.com/GjjvdBurg/paper2remarkable/issues/25)).
+* Speed up centering of pdfs by removing unnecessary cropping operation.
+* Improve robustness against missing metadata, remove spaces in author names,
+  and other minor improvements.
+
+## Version 0.5.1
+
+* Automatically detect when a HTML source is provided
+  ([#24](https://github.com/GjjvdBurg/paper2remarkable/pull/24))
+
+## Version 0.5.0
+
+* Add support for articles from the web using the ``--html`` flag
+  ([#23](https://github.com/GjjvdBurg/paper2remarkable/pull/23))
+* Add ``--version`` command to command line interface
+* Fix cropping bug that resulted in occasional rotated pages
+
 ## Version 0.4.6
 
 * Add support for older arXiv URL scheme
diff --git a/Dockerfile b/Dockerfile
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM golang:stretch AS rmapi
+FROM golang:buster AS rmapi
 
 ENV GOPATH /go
 ENV PATH ${GOPATH}/bin:/usr/local/go/bin:$PATH
@@ -7,18 +7,21 @@ ENV RMAPIREPO github.com/juruen/rmapi
 
 RUN go get -u ${RMAPIREPO}
 
-FROM python:3.7-slim-stretch
+FROM python:3.7-slim-buster
 
 # rmapi
 COPY --from=rmapi /go/bin/rmapi /usr/bin/rmapi
 
-# imagemagick, pdftk, ghostscript, pdfcrop
+# needed to install openjdk-11-jre-headless
+RUN mkdir -p /usr/share/man/man1
+
+# imagemagick, pdftk, ghostscript, pdfcrop, weasyprint
 RUN apt-get update \
     && apt-get install --no-install-recommends -y \
     libmagickwand-dev \
     pdftk \
     ghostscript \
-    texlive-extra-utils # contains pdfcrop
+    poppler-utils
 
 RUN pip install --no-cache-dir paper2remarkable
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ PACKAGE=paper2remarkable
 DOC_DIR='./docs/'
 VENV_DIR=/tmp/p2r_venv/
 
-.PHONY: help cover dist venv
+.PHONY: help dist venv
 
 .DEFAULT_GOAL := help
 
@@ -48,15 +48,15 @@ doc: install ## Build documentation with Sphinx
	cd $(DOC_DIR) && \
		rm source/* && \
		source $(VENV_DIR)/bin/activate && \
-		sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \
+		sphinx-apidoc -H 'Paper2Remarkable API Documentation' -o source ../$(PACKAGE) && \
		touch source/AUTOGENERATED
	$(MAKE) -C $(DOC_DIR) html
 
 venv: $(VENV_DIR)/bin/activate
 
 $(VENV_DIR)/bin/activate:
-	test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
-	source $(VENV_DIR)/bin/activate && pip install -e .[dev] && pip install six
+	test -d $(VENV_DIR) || python -m venv $(VENV_DIR)
+	source $(VENV_DIR)/bin/activate && pip install -e .[dev]
	touch $(VENV_DIR)/bin/activate
 
 clean_venv:
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
 # paper2remarkable
 
-[](https://pypi.org/project/paper2remarkable)
+[](https://pypi.org/project/paper2remarkable)
+[](https://travis-ci.org/GjjvdBurg/paper2remarkable)
+[](https://pepy.tech/project/paper2remarkable/month)
 
 ``paper2remarkable`` is a command line program for quickly and easily
 transferring an academic paper to your [reMarkable](https://remarkable.com/):
@@ -10,8 +11,20 @@ transferring an academic paper to your [reMarkable](https://remarkable.com/):
 $ p2r https://arxiv.org/abs/1811.11242
 ```
 
-The script can be run through the ``p2r`` command line program or via Docker
-(see below).
+There is also support for transferring an article from a website:
+
+```
+$ p2r https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines
+```
+
+The script can be run through the ``p2r`` command line program or via Docker
+(see below). If you're using MacOS, you might be interested in the [Alfred
+workflow](#alfred-workflow) or [Printing to p2r](#printing). On Linux, a
+background terminal such as [Guake](http://guake-project.org/) can be very
+handy. Note that even without a reMarkable, this program can make downloading
+papers easier (just use the `-n` flag).
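For example, a minimal sketch of that no-upload workflow, using the `-n` (``--no-upload``) flag documented in the help text further below and the arXiv URL used throughout this README:

```
$ p2r -n https://arxiv.org/abs/1811.11242
```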
+ +## Introduction ``paper2remarkable`` makes it as easy as possible to get a PDF on your reMarkable from any of the following sources: @@ -19,22 +32,28 @@ reMarkable from any of the following sources: * [arXiv](https://arxiv.org/) * [ACM Digital Library](https://dl.acm.org/dl.cfm) * [CiteSeerX](http://citeseerx.ist.psu.edu/index) +* [CVF](https://openaccess.thecvf.com/menu) +* [JMLR](http://jmlr.org) +* [NBER](https://www.nber.org) * [NeurIPS](https://papers.nips.cc/) * [OpenReview](https://openreview.net/) * [PMLR](http://proceedings.mlr.press/) * [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) +* [SagePub](https://journals.sagepub.com/) +* [SemanticScholar](https://www.semanticscholar.org/) * [SpringerLink](https://link.springer.com/) * A generic URL to a PDF file * A local PDF file +* Any article on a website The program aims to be flexible to the exact source URL, so for many of the -sources you can either provide a URL to the abstract page or to the PDF file. -If you have an source that you would like to see added to the list, let me -know! +academic sources you can either provide a URL to the abstract page or to the +PDF file. If you have a source that you would like to see added to the list, +let me know! ``paper2remarkable`` takes the source URL and: -1. Downloads the pdf if necessary +1. Downloads the pdf 2. Removes the arXiv timestamp (for arXiv sources) 3. Crops the pdf to remove unnecessary borders 4. Shrinks the pdf file to reduce the filesize @@ -47,43 +66,13 @@ Optionally, you can: - Download a paper but not upload to the reMarkable using the ``-n`` switch. - Insert a blank page after each page using the ``-b`` switch (useful for note taking!) -- Center the pdf on the reMarkable (default is left-aligned) +- Center (``-c``) or right-align (``-r``) the pdf on the reMarkable (default + is left-aligned), or disable cropping altogether (``-k``). 
- Provide an explicit filename using the ``--filename`` parameter - Specify the location on the reMarkable to place the file (default ``/``) -Here's the full help of the script: - -```text -usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] - [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK] - [--rmapi RMAPI] - input - -Paper2reMarkable version 0.4.0 - -positional arguments: - input URL to a paper or the path of a local PDF file - -optional arguments: - -h, --help show this help message and exit - -b, --blank Add a blank page after every page of the PDF - -c, --center Center the PDF on the page, instead of left align - -d, --debug debug mode, doesn't upload to reMarkable - -n, --no-upload don't upload to the reMarkable, save the output in - current working dir - -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR - directory on reMarkable to put the file (created if - missing, default: /) - -v, --verbose be verbose - --filename FILENAME Filename to use for the file on reMarkable - --gs GS path to gs executable (default: gs) - --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) - --pdftk PDFTK path to pdftk executable (default: pdftk) - --rmapi RMAPI path to rmapi executable (default: rmapi) -``` - -And here's an example with verbose mode enabled that shows everything the -script does by default: +Here's an example with verbose mode enabled that shows everything the script +does by default: ``` $ p2r -v https://arxiv.org/abs/1811.11242 @@ -105,26 +94,129 @@ $ p2r -v https://arxiv.org/abs/1811.11242 The script requires the following external programs to be available: -- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/) -- [pdfcrop](https://ctan.org/pkg/pdfcrop?lang=en): usually included with a - LaTeX installation. +- [pdftk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/), + [qpdf](http://qpdf.sourceforge.net/), or + [pdftk-java](https://gitlab.com/pdftk-java/pdftk), whichever your package + manager provides. - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -If these scripts are not available on the ``PATH`` variable, you can supply -them with the relevant options to the script. Then, you can install -``paper2remarkable`` from PyPI: +Specifically: + +1. First install [rMAPI](https://github.com/juruen/rmapi), using + ``` + $ go get -u github.com/juruen/rmapi + ``` + +2. Then install system dependencies: + - **Arch Linux:** ``pacman -S pdftk ghostscript poppler`` + - **Ubuntu:** ``apt-get install pdftk ghostscript poppler-utils``. Replace + ``pdftk`` with ``qpdf`` if your distribution doesn't package ``pdftk``. + - **MacOS:** ``brew install pdftk-java ghostscript poppler`` (using [HomeBrew](https://brew.sh/)). + - **Windows:** Installers or executables are available for + [qpdf](https://github.com/qpdf/qpdf/releases) (for instance the mingw + binary executables) and + [GhostScript](https://www.ghostscript.com/download/gsdnld.html). + Importantly, Windows support is untested and these are generic + instructions, so we welcome clarifications where needed. The Docker + instructions below may be more convenient on Windows. + +3. Finally, install ``paper2remarkable``: + ``` + $ pip install paper2remarkable + ``` + this installs the ``p2r`` command line program. + +**Optionally**, you can install: + +- [pdftoppm](https://linux.die.net/man/1/pdftoppm) (recommended for speed). + Usually part of a [Poppler](https://poppler.freedesktop.org/) installation. 
+ +- the [ReadabiliPy](https://github.com/alan-turing-institute/ReadabiliPy) + package with Node.js support, to allow using + [Readability.js](https://github.com/mozilla/readability) for HTML articles. + This is known to improve the output of certain web articles. + +If any of the dependencies (such as rmapi or ghostscript) are not available on +the ``PATH`` variable, you can supply them with the relevant options to the +script (for instance ``p2r --rmapi /path/to/rmapi``). If you run into trouble +with the installation, please let me know by opening an issue [on +Github][github-url]. + +## Usage + +The full help of the script is as follows. Hopefully the various command line +flags are self-explanatory, but if you'd like more information, please open an +issue [on GitHub][github-url]. + +``` +usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-r] [-k] [-v] [-V] + [--filename FILENAME] [--gs GS] [--pdftoppm PDFTOPPM] [--pdftk PDFTK] + [--qpdf QPDF] [--rmapi RMAPI] + input [input ...] + +Paper2reMarkable version 0.7.3 +positional arguments: + input One or more URLs to a paper or paths to local PDF files + +optional arguments: + -h, --help show this help message and exit + -b, --blank Add a blank page after every page of the PDF + -c, --center Center the PDF on the page, instead of left align + -d, --debug debug mode, doesn't upload to reMarkable + -n, --no-upload don't upload to the reMarkable, save the output in current working dir + -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR + directory on reMarkable to put the file (created if missing, default: /) + -r, --right Right align so the menu doesn't cover it + -k, --no-crop Don't crop the pdf file + -v, --verbose be verbose + -V, --version Show version and exit + --filename FILENAME Filename to use for the file on reMarkable + --gs GS path to gs executable (default: gs) + --pdftoppm PDFTOPPM path to pdftoppm executable (default: pdftoppm) + --pdftk PDFTK path to pdftk executable (default: pdftk) + --qpdf QPDF path to qpdf executable (default: qpdf) + --rmapi RMAPI path to rmapi executable (default: rmapi) ``` -pip install paper2remarkable + +## Alfred Workflow + +On MacOS, you can optionally install [this Alfred workflow][workflow]. Alfred +is [a launcher for MacOS](https://www.alfredapp.com/). + +Once installed, you can then use `rm` command and `rmb` (for the `--blank` +pages to insert blank pages between pages for notes) with a URL passed. The +global shortcut `Alt-P` will send the current selection to `p2r`. Note that by +default `--right` is passed and `p2r` is executed in your `bash` environment. +You can edit the Workflow in Alfred if this doesn't work for your setup. + + + +[workflow]: https://github.com/GjjvdBurg/paper2remarkable/blob/master/Remarkable.alfredworkflow?raw=true + +## Printing + +Printing to `p2r` allows printing prompts to save directly to your reMarkable +tablet, passing through `p2r` for processing. + +For MacOS, you can follow [the guide][print-guide] for printing with `rmapi`, +but for the bash script, instead use this script: + +``` +for f in "$@" +do + bash -c -l "p2r --right '$f'" +done ``` -This installs the ``p2r`` command line program. +[print-guide]: https://github.com/juruen/rmapi/blob/master/docs/tutorial-print-macosx.md ## Docker -You can also use our Dockerfile to avoid installing dependencies on your -machine. You will need `git` and `docker` installed. +If you'd like to avoid installing the dependencies directly on your machine, +you can use the Dockerfile. 
To make this work you will need ``git`` and +``docker`` installed. First clone this repository with `git clone` and `cd` inside of it, then build the container: @@ -161,8 +253,15 @@ docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r --help # equivalent to above usage docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" p2r -v https://arxiv.org/abs/1811.11242 + +# to transfer a local file in the current directory +docker run --rm -v "${HOME}/.rmapi:/home/user/.rmapi:rw" -v "$(pwd):/home/user:r" p2r -v localfile.pdf ``` +For transferring local files using the Docker image, you may find [this helper +function](https://github.com/GjjvdBurg/paper2remarkable/issues/34#issuecomment-610852258) +useful. + You can also create an [alias](http://tldp.org/LDP/abs/html/aliases.html) in your ``~/.bashrc`` file to abstract away the Docker commands: @@ -178,5 +277,7 @@ Then you can use ``paper2remarkable`` from the command line as ``p2r``! License: MIT -If you find a problem or want to suggest a feature, please let us know! You're -helping to make this project better! +If you find a problem or want to suggest a feature, please open an issue [on +Github][github-url]. You're helping to make this project better for everyone! + +[github-url]: https://github.com/GjjvdBurg/paper2remarkable diff --git a/Remarkable.alfredworkflow b/Remarkable.alfredworkflow Binary files differnew file mode 100644 index 0000000..6ad331e --- /dev/null +++ b/Remarkable.alfredworkflow diff --git a/make_release.py b/make_release.py index a19b5fd..f3bc9f2 100644 --- a/make_release.py +++ b/make_release.py @@ -14,6 +14,8 @@ Date: 2019-07-23 import colorama import os +import sys +import tempfile def colored(msg, color=None, style=None): @@ -52,6 +54,13 @@ def get_package_name(): return nameline.split("=")[-1].strip().strip('"') +def get_package_version(pkgname): + ctx = {} + with open(f"{pkgname.lower()}/__version__.py", "r") as fp: + exec(fp.read(), ctx) + return ctx["__version__"] + + class Step: def pre(self, context): pass @@ -96,6 +105,12 @@ class UpdateChangelog(Step): self.print_run("vi CHANGELOG.md") +class UpdateReadme(Step): + def action(self, context): + self.instruct(f"Update readme if necessary") + self.print_run("vi README.md") + + class RunTests(Step): def action(self, context): self.instruct("Run the unit tests") @@ -105,7 +120,7 @@ class RunTests(Step): class BumpVersionPackage(Step): def action(self, context): self.instruct(f"Update __version__.py with new version") - self.print_run(f"vi {context['pkgname']}/__version__.py") + self.do_cmd(f"vi {context['pkgname']}/__version__.py") def post(self, context): wait_for_enter() @@ -113,10 +128,7 @@ class BumpVersionPackage(Step): def _get_version(self, context): # Get the version from the version file - about = {} - with open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp: - exec(fp.read(), about) - return about["__version__"] + return get_package_version(context["pkgname"]) class MakeClean(Step): @@ -143,15 +155,15 @@ class PushToTestPyPI(Step): class InstallFromTestPyPI(Step): def action(self, context): - self.print_run("cd /tmp/") - self.print_cmd("rm -rf ./venv") - self.print_cmd("virtualenv ./venv") - self.print_cmd("cd ./venv") - self.print_cmd("source bin/activate") - self.print_cmd( - "pip install --index-url https://test.pypi.org/simple/ " - + f"--extra-index-url https://pypi.org/simple {context['pkgname']}=={context['version']}" + tmpvenv = tempfile.mkdtemp(prefix="p2r_venv_") + self.do_cmd( + f"python -m venv {tmpvenv} && source 
{tmpvenv}/bin/activate && " + "pip install --no-cache-dir --index-url " + "https://test.pypi.org/simple/ " + "--extra-index-url https://pypi.org/simple " + f"{context['pkgname']}=={context['version']}" ) + context["tmpvenv"] = tmpvenv class TestPackage(Step): @@ -159,13 +171,12 @@ class TestPackage(Step): self.instruct( f"Ensure that the following command gives version {context['version']}" ) - self.print_run(f"p2r -h") + self.do_cmd(f"source {context['tmpvenv']}/bin/activate && p2r -V") -class DeactivateVenv(Step): +class RemoveVenv(Step): def action(self, context): - self.print_run("deactivate") - self.instruct("Go back to the project directory") + self.do_cmd(f"rm -rf {context['tmpvenv']}") class GitTagVersion(Step): @@ -210,32 +221,39 @@ class WaitForRTD(Step): ) -def main(): +def main(target=None): colorama.init() procedure = [ - GitToMaster(), - GitAdd(), - PushToGitHub(), - BumpVersionPackage(), - UpdateChangelog(), - MakeClean(), - RunTests(), - MakeDist(), - PushToTestPyPI(), - InstallFromTestPyPI(), - TestPackage(), - DeactivateVenv(), - GitAdd(), - PushToPyPI(), - GitTagVersion(), - PushToGitHub(), + ("gittomaster", GitToMaster()), + ("gitadd1", GitAdd()), + ("push1", PushToGitHub()), + ("bumpversion", BumpVersionPackage()), + ("changelog", UpdateChangelog()), + ("readme", UpdateReadme()), + ("clean", MakeClean()), + ("tests", RunTests()), + ("dist", MakeDist()), + ("testpypi", PushToTestPyPI()), + ("install", InstallFromTestPyPI()), + ("testpkg", TestPackage()), + ("remove_venv", RemoveVenv()), + ("gitadd2", GitAdd()), + ("pypi", PushToPyPI()), + ("tag", GitTagVersion()), + ("push2", PushToGitHub()), ] context = {} context["pkgname"] = get_package_name() - for step in procedure: + context["version"] = get_package_version(context["pkgname"]) + skip = True if target else False + for name, step in procedure: + if not name == target and skip: + continue + skip = False step.run(context) cprint("\nDone!", color="yellow", style="bright") if __name__ == "__main__": - main() + target = sys.argv[1] if len(sys.argv) > 1 else None + main(target=target) diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py index 6540db2..5c0adff 100644 --- a/paper2remarkable/__version__.py +++ b/paper2remarkable/__version__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -VERSION = (0, 4, 6) +VERSION = (0, 7, 4) __version__ = ".".join(map(str, VERSION)) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py index d1a94d8..623d29f 100644 --- a/paper2remarkable/crop.py +++ b/paper2remarkable/crop.py @@ -9,9 +9,12 @@ Copyright: 2019, G.J.J. van den Burg """ import PyPDF2 +import io import os -import subprocess import pdfplumber +import subprocess + +from PyPDF2.generic import RectangleObject from .log import Logger @@ -21,17 +24,54 @@ RM_HEIGHT = 1872 logger = Logger() +def find_offset_byte_line(line): + """Find index of first nonzero bit in a line of bytes + + The given line is a string of bytes, each representing 8 pixels. This code + finds the index of the first bit that is not zero. Used when finding the + cropbox with pdftoppm. 
+ """ + off = 0 + for c in line: + if c == 0: + off += 8 + else: + k = 0 + while c > 0: + k += 1 + c >>= 1 + off += k + break + return off + + +def check_pdftoppm(pth): + """Check that we can run the provided pdftoppm executable""" + try: + subprocess.check_output([pth, "-v"], stderr=subprocess.DEVNULL) + except (subprocess.CalledProcessError, FileNotFoundError, PermissionError): + logger.info("pdftoppm not found, using pdfplumber instead (slower)") + return False + return True + + class Cropper(object): def __init__( - self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + self, + input_file=None, + output_file=None, + pdftoppm_path="pdftoppm", ): if not input_file is None: self.input_file = os.path.abspath(input_file) self.reader = PyPDF2.PdfFileReader(self.input_file) if not output_file is None: self.output_file = os.path.abspath(output_file) - self.pdfcrop_path = pdfcrop_path + if pdftoppm_path and not check_pdftoppm(pdftoppm_path): + pdftoppm_path = None + + self.pdftoppm_path = pdftoppm_path self.writer = PyPDF2.PdfFileWriter() def crop(self, margins=1): @@ -40,6 +80,9 @@ class Cropper(object): def center(self, padding=15): return self.process_file(self.center_page, padding=padding) + def right(self, padding=15): + return self.process_file(self.right_page, padding=padding) + def process_file(self, page_func, *args, **kwargs): n = self.reader.getNumPages() for page_idx in range(n): @@ -54,13 +97,18 @@ class Cropper(object): logger.info("Processing pages ... (%i/%i)" % (n, n)) return 0 + def crop_page(self, page_idx, margins): + return self.process_page(page_idx, self.get_bbox, margins=margins) + def center_page(self, page_idx, padding): return self.process_page( page_idx, self.get_center_bbox, padding=padding ) - def crop_page(self, page_idx, margins): - return self.process_page(page_idx, self.get_bbox, margins=margins) + def right_page(self, page_idx, padding): + return self.process_page( + page_idx, self.get_right_bbox, padding=padding + ) def export_page(self, page_idx): """Helper function that exports a single page given by index """ @@ -75,38 +123,23 @@ class Cropper(object): def process_page(self, page_idx, bbox_func, *args, **kwargs): """Process a single page and add it to the writer """ tmpfname = self.export_page(page_idx) - tmpfout = "./output.pdf" bbox = bbox_func(tmpfname, *args, **kwargs) - status = subprocess.call( - [ - self.pdfcrop_path, - "--bbox", - " ".join(map(str, bbox)), - tmpfname, - tmpfout, - ], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - return status - reader = PyPDF2.PdfFileReader(tmpfout) - page = reader.getPage(0) - self.writer.addPage(page) + thepage = self.reader.getPage(page_idx) + thepage.cropBox = RectangleObject(bbox) + self.writer.addPage(thepage) os.unlink(tmpfname) - os.unlink(tmpfout) return 0 - def get_bbox(self, filename, margins=1, resolution=72): - """Get the bounding box, with optional margins - - if margins is integer, used for all margins, else - margins = [left, top, right, bottom] + def get_raw_bbox(self, filename, resolution=72): + """Get the basic bounding box of a pdf file""" + if self.pdftoppm_path is None: + box = self.get_raw_bbox_pdfplumber(filename, resolution=resolution) + else: + box = self.get_raw_bbox_pdftoppm(filename, resolution=resolution) + return box - We get the bounding box by finding the smallest rectangle that is - completely surrounded by white pixels. 
- """ - if isinstance(margins, int): - margins = [margins for _ in range(4)] + def get_raw_bbox_pdfplumber(self, filename, resolution=72): + """Get the basic bounding box with pdfplumber""" pdf = pdfplumber.open(filename) im = pdf.pages[0].to_image(resolution=resolution) pdf.close() @@ -131,20 +164,99 @@ class Cropper(object): while right < W and sum(M[W - 1 - right]) == H * 255 * 3: right += 1 + return left, right, top, bottom, W, H + + def get_raw_bbox_pdftoppm(self, filename, resolution=72): + """Get the basic bounding box using pdftoppm """ + cmd = [ + self.pdftoppm_path, + "-r", + str(resolution), + "-singlefile", + "-mono", + filename, + ] + + im = subprocess.check_output(cmd) + im = io.BytesIO(im) + + id_ = im.readline().rstrip(b"\n") + if not id_ == b"P4": + raise ValueError("Not in P4 format") + wh = im.readline().rstrip(b"\n").split(b" ") + width, height = int(wh[0]), int(wh[1]) + imdata = im.read() + + pad = width % 8 + padwidth = width + pad + stepsize = padwidth // 8 + + for top in range(height): + if sum(imdata[top * stepsize : (top + 1) * stepsize]) > 0: + break + + for bottom in reversed(range(height)): + if sum(imdata[bottom * stepsize : (bottom + 1) * stepsize]) > 0: + break + + left = width + right = 0 + for i in range(top, bottom): + lline = imdata[i * stepsize : (i + 1) * stepsize] + rline = reversed(imdata[i * stepsize : (i + 1) * stepsize]) + l = find_offset_byte_line(lline) + left = min(left, l) + r = padwidth + pad - find_offset_byte_line(rline) + right = max(right, r) + + top += 1 + left += 1 + right = width - right + 2 + bottom = height - bottom - 2 + + return left, right, top, bottom, width, height + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. + """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + + left, right, top, bottom, W, H = self.get_raw_bbox( + filename, resolution=resolution + ) + left -= margins[0] + left = max(left, 0) top -= margins[1] + top = max(top, 0) right -= margins[2] bottom -= margins[3] # This is the bounding box in PIL format: (0, 0) top left x0, y0, x1, y1 = left, top, W - right, H - bottom + # The remarkable changes the orientation of a portrait page if the + # width is greater than the height. To prevent this, we pad the height + # with extra whitespace. This should only occur if the original + # orientation of the page would be changed by cropping. 
+ w, h = x1 - x0, y1 - y0 + if H > W and w > h: + y1 = y0 + w + 10 + h = y1 - y0 + # Get the bbox in Ghostscript format: (0, 0) bottom left a0, b0, a1, b1 = x0, H - y1, x1, H - y0 return [a0, b0, a1, b1] def get_center_bbox(self, filename, padding=15): - """Compute a bounding box that will center the page file on the + """Compute a bounding box that will center the page file on the reMarkable """ bbox = self.get_bbox(filename, margins=0) @@ -159,7 +271,7 @@ class Cropper(object): # if the document is wider than the remarkable, we add top-padding to # center it, otherwise we add left-padding - x, y = 0, 0 + x = y = 0 if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 else: @@ -167,3 +279,39 @@ class Cropper(object): margins = [padding + x, padding + y, padding, padding] return self.get_bbox(filename, margins=margins) + + def get_right_bbox(self, filename, padding=15): + """Get the bounding box that ensures the menu doesn't hide the text""" + + bbox = self.get_bbox(filename, margins=0) + + h = bbox[3] - bbox[1] + w = bbox[2] - bbox[0] + + # Note, the menu width is about 12mm and the entire screen is about + # 156mm. This informs the width of the left padding we'll add. + menu_width = 12 / 156 * RM_WIDTH + + H = RM_HEIGHT + W = RM_WIDTH + + # TODO: This math is approximate. The goal is to get the page centered + # in the remaining space after taking the menu width into account, + # while also providing equal padding at the top and bottom. This seems + # to give too much padding on the left for some pages, but I'm not sure + # why. Pull requests welcome! + rho_rm = H / (W - menu_width) + rho_page = (h + 2 * padding) / (w + 2 * padding) + x = y = 0 + if rho_rm < rho_page: + x = -w - 2 * padding + (h + 2 * padding) * (W - menu_width) / H + elif rho_rm > rho_page: + y = -h - 2 * padding + H * (w + 2 * padding) / (W - menu_width) + + margins = [ + menu_width + x + padding, + padding + y, + padding, + padding, + ] + return self.get_bbox(filename, margins=margins) diff --git a/paper2remarkable/exceptions.py b/paper2remarkable/exceptions.py index 86f39b4..b433ad4 100644 --- a/paper2remarkable/exceptions.py +++ b/paper2remarkable/exceptions.py @@ -6,10 +6,9 @@ from . import GITHUB_URL -from subprocess import CalledProcessError - -GH_MSG = "\n\nIf you think this might be a bug, please raise an issue on GitHub at: {url}".format( - url=GITHUB_URL +GH_MSG = ( + "\n\nIf you think this might be a bug, please raise an issue on " + "GitHub at:\n{url}\n".format(url=GITHUB_URL) ) @@ -48,13 +47,17 @@ class URLResolutionError(Error): class FilenameMissingError(Error): """Exception raised for providers that need a filename to be provided""" - def __init__(self, provider): + def __init__(self, provider, url, reason=None): self.provider = provider + self.url = url + self.reason = reason def __str__(self): - msg = "ERROR: Filename must be given with the {provider} provider (hint: use --filename)".format( - provider=self.provider + msg = "ERROR: Couldn't determine a filename from {url} for provider {provider}".format( + provider=self.provider, url=self.url ) + if self.reason: + msg += "\nReason: {reason}".format(reason=self.reason) msg += GH_MSG return msg @@ -86,16 +89,53 @@ class RemarkableError(Error): return msg -class _CalledProcessError(CalledProcessError): - """Exception raised when subprocesses fail. +class _CalledProcessError(Error): + """Exception raised when subprocesses fail. """ - We subclass the CalledProcessError so we can add our custom error message. 
- """ + def __init__(self, message): + self.message = message + + def __str__(self): + msg = "ERROR: {message}".format(message=self.message) + msg += GH_MSG + return msg + + +class NoPDFToolError(Error): + """Exception raised when neither pdftk or qpdf is found.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self): + pass def __str__(self): - parent = super().__str__() - msg = parent + GH_MSG + msg = ( + "ERROR: Neither pdftk or qpdf could be found. Install " + "either of these or ensure that they can be found using " + "the --pdftk or --qpdf options." + ) + msg += GH_MSG + return msg + + +class UnidentifiedSourceError(Error): + """Exception raised when the input is neither a local file nor a url """ + + def __str__(self): + msg = ( + "ERROR: Couldn't figure out what source you mean. If it's a " + "local file, please make sure it exists." + ) + msg += GH_MSG + return msg + + +class InvalidURLError(Error): + """Exception raised when no provider can handle a url source """ + + def __str__(self): + msg = ( + "ERROR: Input URL is not valid, no provider can handle " + "this source." + ) + msg += GH_MSG return msg diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py index bae1cbf..fb9d8a3 100644 --- a/paper2remarkable/log.py +++ b/paper2remarkable/log.py @@ -38,19 +38,26 @@ class Logger(metaclass=Singleton): def disable(self): self.enabled = False - def _log(self, msg, mode): + def _log(self, msg, mode, end="\n", add_prefix=True): if not self.enabled: return if not mode in ("info", "warn"): raise ValueError("Unknown logging mode: %s" % mode) file = sys.stdout if mode == "info" else sys.stderr - now = datetime.datetime.now() - nowstr = now.strftime("%Y-%m-%d %H:%M:%S") - print("%s - %s - %s" % (nowstr, mode.upper(), msg), file=file) + if add_prefix: + now = datetime.datetime.now() + nowstr = now.strftime("%Y-%m-%d %H:%M:%S") + prefix = "%s - %s - " % (nowstr, mode.upper()) + else: + prefix = "" + print("%s%s" % (prefix, msg), end=end, file=file) file.flush() - def info(self, msg): - self._log(msg, "info") + def info(self, msg, end="\n"): + self._log(msg, "info", end=end) - def warning(self, msg): - self._log(msg, "warn") + def warning(self, msg, end="\n"): + self._log(msg, "warn", end=end) + + def append(self, msg, mode, end="\n"): + self._log(msg, mode, end=end, add_prefix=False) diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index c660452..c365920 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -19,49 +19,28 @@ from .log import Logger logger = Logger() -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): - """Crop the pdf file using Cropper - """ - logger.info("Cropping pdf file") - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - - cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path) - status = cropper.crop(margins=15) - - if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) +def prepare_pdf(filepath, operation, pdftoppm_path="pdftoppm"): + """Prepare pdf by cropping, centering, or right-aligning the flie""" + logger.info("Preparing PDF using %s operation" % operation) + prepared_file = os.path.splitext(filepath)[0] + "-prep.pdf" + cropper = Cropper(filepath, prepared_file, pdftoppm_path=pdftoppm_path) + if operation == "crop": + status = cropper.crop(margins=15) + elif operation == "center": + status = cropper.center() + elif operation == "right": + status = cropper.right() + else: + logger.warning("Unknown operation: %s" % 
operation) return filepath - if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." % cropped_file - ) + if not status == 0 or not os.path.exists(prepared_file): + logger.warning("PDF prepare operation failed") return filepath - return cropped_file - - -def center_pdf(filepath, pdfcrop_path="pdfcrop"): - """Center the pdf file on the reMarkable - """ - logger.info("Centering pdf file") - centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - - cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path) - status = cropper.center() - - if not status == 0: - logger.warning("Failed to center the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(centered_file): - logger.warning( - "Can't find centered file '%s' where expected." % centered_file - ) - return filepath - return centered_file + return prepared_file def blank_pdf(filepath): - """Add blank pages to PDF - """ + """Add blank pages to PDF""" logger.info("Adding blank pages") input_pdf = PyPDF2.PdfFileReader(filepath) output_pdf = PyPDF2.PdfFileWriter() @@ -76,9 +55,9 @@ def blank_pdf(filepath): def shrink_pdf(filepath, gs_path="gs"): - """Shrink the PDF file size using Ghostscript - """ - logger.info("Shrinking pdf file") + """Shrink the PDF file size using Ghostscript""" + logger.info("Shrinking pdf file ...") + size_before = os.path.getsize(filepath) output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" status = subprocess.call( [ @@ -98,4 +77,8 @@ def shrink_pdf(filepath, gs_path="gs"): if not status == 0: logger.warning("Failed to shrink the pdf file") return filepath + size_after = os.path.getsize(output_file) + if size_after > size_before: + logger.info("Shrinking has no effect for this file, using original.") + return filepath return output_file diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 53fda1f..935b889 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -3,26 +3,38 @@ from .acm import ACM from .arxiv import Arxiv from .citeseerx import CiteSeerX +from .cvf import CVF +from .html import HTML +from .jmlr import JMLR from .local import LocalFile +from .nber import NBER from .neurips import NeurIPS from .openreview import OpenReview from .pdf_url import PdfUrl from .pmlr import PMLR from .pubmed import PubMed +from .sagepub import SagePub +from .semantic_scholar import SemanticScholar from .springer import Springer from .tandfonline import TandFOnline -# NOTE: Order matters here, PdfUrl should be last +# NOTE: Order matters here, PdfUrl and HTML should be last providers = [ ACM, Arxiv, CiteSeerX, + CVF, + JMLR, + NBER, NeurIPS, OpenReview, PMLR, PubMed, + SagePub, Springer, + SemanticScholar, TandFOnline, LocalFile, PdfUrl, + HTML, ] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 596af98..74ab9e6 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -11,18 +11,21 @@ Copyright: 2019, G.J.J. 
van den Burg import abc import os import shutil +import subprocess import tempfile import time -from ._info import Informer -from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf +from ..exceptions import _CalledProcessError +from ..log import Logger +from ..pdf_ops import prepare_pdf, blank_pdf, shrink_pdf from ..utils import ( assert_file_is_pdf, + check_pdftool, download_url, - upload_to_remarkable, follow_redirects, + upload_to_remarkable, ) -from ..log import Logger +from ._info import Informer logger = Logger() @@ -36,11 +39,14 @@ class Provider(metaclass=abc.ABCMeta): upload=True, debug=False, center=False, + right=False, blank=False, + no_crop=False, remarkable_dir="/", rmapi_path="rmapi", - pdfcrop_path="pdfcrop", + pdftoppm_path="pdftoppm", pdftk_path="pdftk", + qpdf_path="qpdf", gs_path="gs", cookiejar=None, ): @@ -48,12 +54,15 @@ class Provider(metaclass=abc.ABCMeta): self.debug = debug self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path - self.pdfcrop_path = pdfcrop_path + self.pdftoppm_path = pdftoppm_path self.pdftk_path = pdftk_path + self.qpdf_path = qpdf_path self.gs_path = gs_path self.informer = Informer() self.cookiejar = cookiejar + self.pdftool = check_pdftool(self.pdftk_path, self.qpdf_path) + # wait time to not hit the server too frequently self.server_delay = 0 @@ -62,9 +71,13 @@ class Provider(metaclass=abc.ABCMeta): logger.disable() # Define the operations to run on the pdf. Providers can add others. - self.operations = [("crop", self.crop_pdf)] + self.operations = [("rewrite", self.rewrite_pdf)] if center: self.operations.append(("center", self.center_pdf)) + elif right: + self.operations.append(("right", self.right_pdf)) + elif not no_crop: + self.operations.append(("crop", self.crop_pdf)) if blank: self.operations.append(("blank", blank_pdf)) @@ -83,10 +96,15 @@ class Provider(metaclass=abc.ABCMeta): # Wrappers for pdf operations that have additional arguments def crop_pdf(self, filepath): - return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + return prepare_pdf(filepath, "crop", pdftoppm_path=self.pdftoppm_path) def center_pdf(self, filepath): - return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + return prepare_pdf( + filepath, "center", pdftoppm_path=self.pdftoppm_path + ) + + def right_pdf(self, filepath): + return prepare_pdf(filepath, "right", pdftoppm_path=self.pdftoppm_path) def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) @@ -96,6 +114,78 @@ class Provider(metaclass=abc.ABCMeta): # This must exist so that the LocalFile provider can overwrite it download_url(pdf_url, filename, cookiejar=self.cookiejar) + def compress_pdf(self, in_pdf, out_pdf): + """ Compress a pdf file, returns subprocess status """ + if self.pdftool == "pdftk": + status = subprocess.call( + [self.pdftk_path, in_pdf, "output", out_pdf, "compress"] + ) + elif self.pdftool == "qpdf": + status = subprocess.call( + [ + self.qpdf_path, + "--stream-data=compress", + in_pdf, + out_pdf, + ], + stderr=subprocess.DEVNULL, + ) + if not status == 0: + raise _CalledProcessError( + "%s failed to compress the PDF file." % self.pdftool + ) + + def rewrite_pdf(self, in_pdf, out_pdf=None): + """Re-write the pdf using Ghostscript + + This helps avoid issues in dearxiv due to nested pdfs. 
+ """ + if out_pdf is None: + out_pdf = os.path.splitext(in_pdf)[0] + "-rewrite.pdf" + + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dQUIET", + "-o", + out_pdf, + in_pdf, + ] + ) + if not status == 0: + raise _CalledProcessError( + "Failed to rewrite the pdf with GhostScript" + ) + return out_pdf + + def uncompress_pdf(self, in_pdf, out_pdf): + """ Uncompress a pdf file """ + + if self.pdftool == "pdftk": + status = subprocess.call( + [ + self.pdftk_path, + in_pdf, + "output", + out_pdf, + "uncompress", + ] + ) + elif self.pdftool == "qpdf": + status = subprocess.call( + [ + self.qpdf_path, + "--stream-data=uncompress", + in_pdf, + out_pdf, + ] + ) + if not status == 0: + raise _CalledProcessError( + "%s failed to uncompress the PDF file." % self.pdftool + ) + def run(self, src, filename=None): # follow_redirects here is needed with library use if os.path.exists(src): @@ -124,7 +214,7 @@ class Provider(metaclass=abc.ABCMeta): intermediate_fname = tmp_filename for opname, op in self.operations: intermediate_fname = op(intermediate_fname) - shutil.move(intermediate_fname, clean_filename) + shutil.copy(intermediate_fname, clean_filename) if self.debug: print("Paused in debug mode in dir: %s" % working_dir) @@ -143,4 +233,5 @@ class Provider(metaclass=abc.ABCMeta): base = os.path.splitext(target_path)[0] target_path = base + "_.pdf" shutil.move(clean_filename, target_path) - return target_path + os.chdir(self.initial_dir) + return target_path diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py index 746c436..8cffc60 100644 --- a/paper2remarkable/providers/_info.py +++ b/paper2remarkable/providers/_info.py @@ -16,12 +16,12 @@ logger = Logger() class Informer: """Base class for the informers. - The "informer" class is used to retrieve the title, authors, and year of + The "informer" class is used to retrieve the title, authors, and year of publication of the provided paper. - This base class provides the main functionality, but because various - outlets use different conventions to embed author, title, and publication - year information, we expect that individual providers will subclass this + This base class provides the main functionality, but because various + outlets use different conventions to embed author, title, and publication + year information, we expect that individual providers will subclass this class and overwrite some of the methods. """ @@ -35,9 +35,9 @@ class Informer: self.year = year def get_filename(self, abs_url): - """ Generate nice filename using the paper information + """Generate nice filename using the paper information - The provided url must be to a HTMl page where this information can be + The provided url must be to a HTMl page where this information can be found, not to the PDF file itself. """ logger.info("Generating output filename") @@ -50,6 +50,7 @@ class Informer: authors = self.authors[0] + "_et_al" else: authors = "_".join(self.authors) + authors = authors.replace(" ", "_") authors = clean_string(authors) # Clean the title and make it titlecase @@ -76,8 +77,13 @@ class Informer: ## Title def get_title(self, soup): - target = soup.find_all("meta", {"name": self.meta_title_key}) - return target[0]["content"] + meta = soup.find_all("meta", {"name": self.meta_title_key}) + if not meta: + logger.warning( + "Couldn't determine title information, maybe provide the desired filename using '--filename'?" 
+ ) + return "" + return meta[0]["content"] ## Authors @@ -87,10 +93,13 @@ class Informer: return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] def get_authors(self, soup): - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": self.meta_author_key}) - ] + meta = soup.find_all("meta", {"name": self.meta_author_key}) + if not meta: + logger.warning( + "Couldn't determine author information, maybe provide the desired filename using '--filename'?" + ) + return "" + authors = [x["content"] for x in meta] return self._format_authors(authors) ## Year @@ -100,7 +109,8 @@ class Informer: def get_year(self, soup): """ Retrieve the contents of the meta_date_key field and format it """ - date = soup.find_all("meta", {"name": self.meta_date_key})[0][ - "content" - ] + meta = soup.find_all("meta", {"name": self.meta_date_key}) + if not meta: + return "" + date = meta[0]["content"] return self._format_year(date) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index 913e015..6ec1796 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -10,20 +10,17 @@ Copyright: 2019, G.J.J. van den Burg import os import re -import subprocess from ._info import Informer from ._base import Provider -from ..exceptions import ( - URLResolutionError, - _CalledProcessError as CalledProcessError, -) +from ..exceptions import URLResolutionError from ..log import Logger logger = Logger() -DEARXIV_TEXT_REGEX = ( - b"arXiv:\d{4}\.\d{4,5}v\d+\s+\[[\w\-]+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}" +DEARXIV_TEXT_REGEX = b"ar(x|X)iv:(\d{4}\.|[\w\-]+\/)\d+v\d+(\s+\[[\w\-]+\.[\w\-]+\])?\s+\d{1,2}\s\w{3}\s\d{4}" +DEARXIV_URI_REGEX = ( + b"https?://ar(x|X)iv\.org\/abs\/([\w\-]+\/\d+|\d{4}\.\d{4,5})v\d+" ) @@ -36,8 +33,8 @@ class Arxiv(Provider): re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" - re_abs_2 = "https?://arxiv.org/abs/\w+/\d{7}(v\d+)?" - re_pdf_2 = "https?://arxiv.org/pdf/\w+/\d{7}(v\d+)?.pdf" + re_abs_2 = "https?://arxiv.org/abs/[\w\-]+/\d{7}(v\d+)?" + re_pdf_2 = "https?://arxiv.org/pdf/[\w\-]+/\d{7}(v\d+)?.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -48,6 +45,8 @@ class Arxiv(Provider): def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ + if "?" in url: + url = url[: url.index("?")] if re.match(self.re_abs_1, url) or re.match(self.re_abs_2, url): abs_url = url pdf_url = url.replace("abs", "pdf") + ".pdf" @@ -69,44 +68,139 @@ class Arxiv(Provider): def dearxiv(self, input_file): """Remove the arXiv timestamp from a pdf""" - logger.info("Removing arXiv timestamp") + logger.info("Removing arXiv timestamp ... ", end="") basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - status = subprocess.call( - [ - self.pdftk_path, - input_file, - "output", - uncompress_file, - "uncompress", - ] - ) - if not status == 0: - raise CalledProcessError( - "pdftk failed to uncompress the PDF file." 
- ) - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub(b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", b"()Tj", data) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) + recoded_file = basename + "_rewrite.pdf" + self.rewrite_pdf(input_file, recoded_file) + + uncompress_file = basename + "_uncompress.pdf" + self.uncompress_pdf(recoded_file, uncompress_file) + + new_data = [] + current_obj = [] + replaced_arXiv = False + char_count = skip_n = startxref = 0 + xref = {} + + with open(uncompress_file, "rb") as fp: + for line in fp: + if skip_n: + # Skip a line + skip_n -= 1 + continue + + if line.endswith(b" obj\n") or line.endswith(b" obj \n"): + # Start a new object. Add it to the current object and + # record its position for the xref table. + current_obj.append(line) + objid = int(line.split(b" ")[0]) + xref[objid] = char_count + elif current_obj and ( + line.startswith(b"endobj") + and not line.startswith(b"endobj xref") + ): + # End the current object. If needed, replace the arXiv + # stamp in the block (done only once). Reset current + # object. + current_obj.append(line) + block = b"".join(current_obj) + # remove the text + block, n_subs1 = re.subn( + b"\(" + DEARXIV_TEXT_REGEX + b"\)Tj", + b"()Tj", + block, + ) + # remove the url (type 1) + block, n_subs2 = re.subn( + b"<<\n\/URI \(" + + DEARXIV_URI_REGEX + + b"\)\n\/S /URI\n>>\n", + b"", + block, + ) + # remove the url (type 2, i.e. Jackson arXiv 0309285v2) + block, n_subs3 = re.subn( + b"<<\n\/S \/URI\n" + + b"/URI \(" + + DEARXIV_URI_REGEX + + b"\)\n>>\n", + b"", + block, + ) + + if n_subs1 or n_subs2: + # fix the length of the object stream + block = fix_stream_length(block) + replaced_arXiv = True + new_data.append(block) + char_count += len(block) + current_obj = [] + elif line in [b"xref\n", b"endobj xref\n"]: + if b"endobj" in line and current_obj: + current_obj.append(b"endobj\n") + block = b"".join(current_obj) + new_data.append(block) + char_count += len(block) + current_obj = [] + line = b"xref\n" + # We found the xref table, record its position and write it + # out using our updated indices. + startxref = sum(map(len, new_data)) + new_data.append(line) + new_data.append(b"0 %i\n" % (len(xref) + 1)) + new_data.append(b"0000000000 65535 f \n") + for objid in sorted(xref): + new_data.append(b"%010d 00000 n \n" % xref[objid]) + + # skip the appropriate number of lines + skip_n = len(xref) + 2 + elif current_obj: + # If we're recording an object, simply add the line to it + current_obj.append(line) + elif line == b"startxref\n": + # Write out our recorded startxref position, skip the old + # position. 
+ new_data.append(b"startxref\n%i\n" % startxref) + skip_n = 1 + else: + # Anything else passes through + new_data.append(line) + char_count += len(line) removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) + with open(removed_file, "wb") as fp: + fp.write(b"".join(new_data)) output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - raise CalledProcessError("pdftk failed to compress the PDF file.") + self.compress_pdf(removed_file, output_file) + + logger.append("success" if replaced_arXiv else "none found", "info") return output_file + + +def fix_stream_length(block): + # This fixes the stream length of a block, which is needed after we have + # removed the arXiv stamp. + count = 0 + block = block.split(b"\n") + do_count = False + + for line in block: + if line.strip(b" ") in [b"stream", b"endstream"]: + do_count = not do_count + continue + + if do_count: + # +1 for the newline character + count += len(line) + 1 + + new_block = [] + for line in block: + if b" /Length " in line: + new_block.append(b"<< /Length %i >>" % count) + else: + new_block.append(line) + + return b"\n".join(new_block) diff --git a/paper2remarkable/providers/citeseerx.py b/paper2remarkable/providers/citeseerx.py index e483f28..e819c30 100644 --- a/paper2remarkable/providers/citeseerx.py +++ b/paper2remarkable/providers/citeseerx.py @@ -49,10 +49,6 @@ class CiteSeerX(Provider): ) time.sleep(30) - # NOTE: The delay should only be hit twice when p2r is used as a - # library (e.g. during testing). Otherwise the ``server_delay`` is - # never reached in run(). - def _get_doi(self, url): m = re.match(self.re_abs, url) or re.match(self.re_pdf, url) if m: diff --git a/paper2remarkable/providers/cvf.py b/paper2remarkable/providers/cvf.py new file mode 100644 index 0000000..76ca9c0 --- /dev/null +++ b/paper2remarkable/providers/cvf.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +"""Provider for CVF + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer + +from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() + + +class CVFInformer(Informer): + + meta_date_key = "citation_publication_date" + + +class CVF(Provider): + + re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$" + re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = CVFInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url[: -len(".html")] + pdf_url += ".pdf" + pdf_url = pdf_url.replace("html", "papers") + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url.replace("papers", "html").replace(".pdf", ".html") + else: + raise URLResolutionError("CVF", url) + return abs_url, pdf_url + + def validate(src): + m = re.match(CVF.re_abs, src) or re.match(CVF.re_pdf, src) + return not m is None diff --git a/paper2remarkable/providers/html.py b/paper2remarkable/providers/html.py new file mode 100644 index 0000000..e050ea3 --- /dev/null +++ b/paper2remarkable/providers/html.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- + +"""Provider for HTML documents + +This provider is a little bit special, in that it isn't simply pulling an +academic paper from a site, but instead aims to pull a HTML article. + +Author: G.J.J. van den Burg +License: See LICENSE file. +Copyright: 2020, G.J.J. van den Burg + +""" + +import html2text +import markdown +import readability +import titlecase +import unidecode +import urllib +import weasyprint +import weasyprint.fonts + +from ._base import Provider +from ._info import Informer + +from ..utils import ( + clean_string, + get_page_with_retry, + get_content_type_with_retry, +) +from ..log import Logger + +logger = Logger() + +CSS = """ +@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap'); +@page { size: 702px 936px; margin: 1in; } +a { color: black; } +img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; } +p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; } +h1,h2,h3 { font-family: 'Noto Serif'; } +h1 { font-size: 26px; } +h2 { font-size: 18px; } +h3 { font-size: 14px; } +blockquote { font-style: italic; } +pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; } +code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; } +""" + + +def url_fetcher(url): + if url.startswith("//"): + url = "https:" + url + elif url.startswith("file:///"): + url = "https:" + url[len("file:/") :] + return weasyprint.default_url_fetcher(url) + + +def make_readable(request_html): + """Use an extraction method to get the main article html + + This function checks if ReadabiliPy is installed with NodeJS support, as + that generally yields better results. If that is not available, it falls + back on readability. 
+ """ + + have_readabilipy_js = False + try: + import readabilipy + + have_readabilipy_js = readabilipy.simple_json.have_node() + except ImportError: + pass + + if have_readabilipy_js: + logger.info("Converting HTML using Readability.js") + article = readabilipy.simple_json_from_html_string( + request_html, use_readability=True + ) + title = article["title"] + raw_html = article["content"] + else: + logger.info("Converting HTML using readability") + doc = readability.Document(request_html) + title = doc.title() + raw_html = doc.summary(html_partial=True) + return title, raw_html + + +class ImgProcessor(markdown.treeprocessors.Treeprocessor): + def __init__(self, base_url, *args, **kwargs): + self._base_url = base_url + super().__init__(*args, **kwargs) + + def run(self, root): + """ Ensure all img src urls are absolute """ + for img in root.iter("img"): + img.attrib["src"] = urllib.parse.urljoin( + self._base_url, img.attrib["src"] + ) + img.attrib["src"] = img.attrib["src"].rstrip("/") + + +class HTMLInformer(Informer): + def __init__(self): + super().__init__() + self._cached_title = None + self._cached_article = None + + def get_filename(self, abs_url): + request_html = get_page_with_retry(abs_url, return_text=True) + title, article = make_readable(request_html) + + self._cached_title = title + self._cached_article = article + + # Clean the title and make it titlecase + title = clean_string(title) + title = titlecase.titlecase(title) + title = title.replace(" ", "_") + title = clean_string(title) + name = title.strip("_") + ".pdf" + name = unidecode.unidecode(name) + logger.info("Created filename: %s" % name) + return name + + +class HTML(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = HTMLInformer() + + def get_abs_pdf_urls(self, url): + return url, url + + def retrieve_pdf(self, pdf_url, filename): + """Turn the HTML article in a clean pdf file + + This function takes the following steps: + + 1. Pull the HTML page using requests, if not done in Informer + 2. Extract the article part of the page using readability/readabiliPy + 3. Convert the article HTML to markdown using html2text + 4. Convert the markdown back to HTML (done to sanitize the HTML) + 4. Convert the HTML to PDF, pulling in images where needed + 5. Save the PDF to the specified filename. + """ + if self.informer._cached_title and self.informer._cached_article: + title = self.informer._cached_title + article = self.informer._cached_article + else: + request_html = get_page_with_retry(pdf_url, return_text=True) + title, article = make_readable(request_html) + + h2t = html2text.HTML2Text() + h2t.wrap_links = False + text = h2t.handle(article) + + # Add the title back to the document + article = "# {title}\n\n{text}".format(title=title, text=text) + + # Convert to html, fixing relative image urls. 
+ md = markdown.Markdown() + md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10) + html_article = md.convert(article) + + if self.debug: + with open("./paper.html", "w") as fp: + fp.write(html_article) + + font_config = weasyprint.fonts.FontConfiguration() + html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher) + css = weasyprint.CSS(string=CSS, font_config=font_config) + + html.write_pdf(filename, stylesheets=[css], font_config=font_config) + + def validate(src): + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): + return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: + return False + return ct.startswith("text/html") diff --git a/paper2remarkable/providers/jmlr.py b/paper2remarkable/providers/jmlr.py new file mode 100644 index 0000000..8b121cb --- /dev/null +++ b/paper2remarkable/providers/jmlr.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- + +"""Provider for JMLR + +Journal of Machine Learning Research + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class JMLRInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors): + have_comma = any(("," in auth for auth in soup_authors)) + if have_comma: + return super()._format_authors(soup_authors, sep=",", idx=0) + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class JMLR(Provider): + + re_abs_1 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3}).html$" + re_pdf_1 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3})/(?P=pid).pdf$" + + re_abs_2 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\w+\d{2}\w).html$" + re_pdf_2 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\w+\d{2}\w)/(?P=pid).pdf$" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = JMLRInformer() + + def get_abs_pdf_urls(self, url): + abs_url = pdf_url = None + abs_fmt = "http://jmlr.org/papers/v{vol}/{pid}.html" + pdf_fmt = "http://jmlr.org/papers/volume{vol}/{pid}/{pid}.pdf" + formats = [ + (self.re_abs_1, self.re_pdf_1), + (self.re_abs_2, self.re_pdf_2), + ] + + for re_abs, re_pdf in formats: + ma = re.match(re_abs, url) + mp = re.match(re_pdf, url) + if ma: + abs_url = url + pdf_url = pdf_fmt.format( + vol=ma.group("vol"), pid=ma.group("pid") + ) + elif mp: + abs_url = abs_fmt.format( + vol=mp.group("vol"), pid=mp.group("pid") + ) + pdf_url = url + if abs_url is None or pdf_url is None: + raise URLResolutionError("JMLR", url) + return abs_url, pdf_url + + def validate(src): + return ( + re.match(JMLR.re_abs_1, src) + or re.match(JMLR.re_abs_2, src) + or re.match(JMLR.re_pdf_1, src) + or re.match(JMLR.re_pdf_2, src) + ) diff --git a/paper2remarkable/providers/nber.py b/paper2remarkable/providers/nber.py new file mode 100644 index 0000000..28e0973 --- /dev/null +++ b/paper2remarkable/providers/nber.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +"""Provider for NBER + +(US) National Bureau of Economic Research + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class NBERInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors, sep=" ", idx=0, op=None): + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=None) + + +class NBER(Provider): + + re_abs = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)$" + re_pdf = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)\.pdf$" + + re_pdf_2 = "https://www.nber.org/system/files/working_papers/(?P<ref>[a-z0-9]+)/(?P=ref).pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = NBERInformer() + + def get_report_no(self, url): + m = re.match(self.re_pdf_2, url) + if m: + return m["ref"] + raise URLResolutionError( + "NBER", url, reason="Failed to retrieve report number." + ) + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url + ".pdf" + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url[: -len(".pdf")] + elif re.match(self.re_pdf_2, url): + ref = self.get_report_no(url) + abs_url = f"https://www.nber.org/papers/{ref}" + pdf_url = url + else: + raise URLResolutionError("NBER", url) + return abs_url, pdf_url + + def validate(src): + return ( + re.match(NBER.re_abs, src) + or re.match(NBER.re_pdf, src) + or re.match(NBER.re_pdf_2, src) + ) diff --git a/paper2remarkable/providers/neurips.py b/paper2remarkable/providers/neurips.py index 87cf2c1..d76202c 100644 --- a/paper2remarkable/providers/neurips.py +++ b/paper2remarkable/providers/neurips.py @@ -25,8 +25,8 @@ class NeurIPSInformer(Informer): class NeurIPS(Provider): - re_abs = "^https?://papers.nips.cc/paper/[\d\w\-]+$" - re_pdf = "^https?://papers.nips.cc/paper/[\d\w\-]+.pdf$" + re_abs = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+$" + re_pdf = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+.pdf$" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py index 47c0555..8c44f45 100644 --- a/paper2remarkable/providers/openreview.py +++ b/paper2remarkable/providers/openreview.py @@ -8,17 +8,49 @@ Copyright: 2019, G.J.J. van den Burg """ +import json import re from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError +from ..log import Logger + +logger = Logger() class OpenReviewInformer(Informer): meta_date_key = "citation_publication_date" + def get_authors(self, soup): + # Get the authors for OpenReview by parsing the JSON payload + # + # This may not be super robust long term, but works for now. + warning = ( + "Couldn't determine author information, maybe provide " + "the desired filename using '--filename'?" 
+ ) + + script = soup.find("script", {"id": "__NEXT_DATA__"}) + if not script: + logger.warning(warning) + return "" + + try: + paper_data = json.loads(script.contents[0]) + except json.JSONDecodeError: + logger.warning(warning) + return "" + + try: + content = paper_data["props"]["pageProps"]["forumNote"]["content"] + authors = content["authors"] + except KeyError: + logger.warning(warning) + return "" + return self._format_authors(authors) + def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 5314ec7..d20d4a5 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -12,13 +12,41 @@ import urllib from ._base import Provider from ._info import Informer + +from .. import GITHUB_URL from ..exceptions import FilenameMissingError +from ..log import Logger +from ..utils import get_content_type_with_retry + +logger = Logger() class PdfUrlInformer(Informer): def get_filename(self, abs_url): - # if this is called, filename must not have been provided - raise FilenameMissingError(provider="PDFUrl") + # try to get a nice filename by parsing the url + parsed = urllib.parse.urlparse(abs_url) + path_parts = parsed.path.split("/") + if not path_parts: + raise FilenameMissingError( + provider="PdfUrl", + url=abs_url, + reason="No URL parts", + ) + + filename = path_parts[-1] + if not filename.endswith(".pdf"): + raise FilenameMissingError( + provider="PdfUrl", + url=abs_url, + reason="URL path didn't end in .pdf", + ) + logger.warning( + "Using filename {filename} extracted from url. " + "You might want to provide a nicer one using --filename " + "or request this paper source to be added " + "(see: {github}).".format(filename=filename, github=GITHUB_URL) + ) + return filename class PdfUrl(Provider): @@ -27,11 +55,15 @@ class PdfUrl(Provider): self.informer = PdfUrlInformer() def get_abs_pdf_urls(self, url): - return (None, url) + return (url, url) def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: + # first check if it is a valid url + parsed = urllib.parse.urlparse(src) + if not all([parsed.scheme, parsed.netloc, parsed.path]): + return False + # next, get the header and check the content type + ct = get_content_type_with_retry(src) + if ct is None: return False + return ct.startswith("application/pdf") diff --git a/paper2remarkable/providers/sagepub.py b/paper2remarkable/providers/sagepub.py new file mode 100644 index 0000000..7e76df8 --- /dev/null +++ b/paper2remarkable/providers/sagepub.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +"""Provider for SagePub + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError + + +class SagePubInformer(Informer): + + meta_author_key = "dc.Creator" + meta_title_key = "dc.Title" + meta_date_key = "dc.Date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + def _format_year(self, soup_date): + return soup_date.split("-")[0] + + +class SagePub(Provider): + + re_abs = "https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+" + re_pdf = "https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SagePubInformer() + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("full", "pdf") + elif re.match(self.re_pdf, url): + pdf_url = url + abs_url = url.replace("pdf", "full") + else: + raise URLResolutionError("SagePub", url) + return abs_url, pdf_url + + def validate(src): + return re.match(SagePub.re_abs, src) or re.match(SagePub.re_pdf, src) diff --git a/paper2remarkable/providers/semantic_scholar.py b/paper2remarkable/providers/semantic_scholar.py new file mode 100644 index 0000000..0a1b414 --- /dev/null +++ b/paper2remarkable/providers/semantic_scholar.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +"""Provider for SemanticScholar + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2020, G.J.J. van den Burg + +""" + +import re +import bs4 + +from ._base import Provider +from ._info import Informer +from ..exceptions import URLResolutionError +from ..utils import get_page_with_retry + + +class SemanticScholarInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class SemanticScholar(Provider): + + re_abs = ( + "https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}" + ) + re_pdf = "https?:\/\/pdfs.semanticscholar.org/[0-9a-f]{4}/[0-9a-f]{36}.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SemanticScholarInformer() + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract urls from a SemanticScholar url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = self._get_pdf_url(abs_url) + elif re.match(self.re_pdf, url): + pdf_url = url + remainder = pdf_url.split("/")[-1][: -len(".pdf")] + first_four = pdf_url.split("/")[-2] + paper_id = first_four + remainder + abs_url = f"https://www.semanticscholar.org/paper/{paper_id}" + else: + raise URLResolutionError("SemanticScholar", url) + return abs_url, pdf_url + + def _get_pdf_url(self, url): + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + meta = soup.find_all("meta", {"name": "citation_pdf_url"}) + if not meta: + raise URLResolutionError("SemanticScholar", url) + return meta[0]["content"] + + def validate(src): + return re.match(SemanticScholar.re_abs, src) or re.match( + SemanticScholar.re_pdf, src + ) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index 5ce2564..31f0a67 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -10,40 +10,75 @@ Copyright: 2019, G.J.J. 
van den Burg import re import urllib +import requests from ._base import Provider from ._info import Informer from ..exceptions import URLResolutionError +from ..utils import HEADERS class SpringerInformer(Informer): - meta_date_key = "citation_online_date" + meta_date_key = None def _format_authors(self, soup_authors): return super()._format_authors(soup_authors, sep=" ", idx=-1) + def get_year(self, soup): + for key in ["citation_online_date", "citation_publication_date"]: + meta = soup.find_all("meta", {"name": key}) + if not meta: + continue + return self._format_year(meta[0]["content"]) + return "" + class Springer(Provider): - re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" - re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" + re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" + re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+" + re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.informer = SpringerInformer() + def _get_abs_url(self, pdf_url): + article_url = pdf_url.replace("content/pdf", "article")[: -len(".pdf")] + req = requests.head( + article_url, headers=HEADERS, cookies=self.cookiejar + ) + if req.status_code == 200: + return article_url + + chapter_url = pdf_url.replace("content/pdf", "chapter")[: -len(".pdf")] + req = requests.head( + chapter_url, headers=HEADERS, cookies=self.cookiejar + ) + if req.status_code == 200: + return chapter_url + + raise URLResolutionError("Springer", pdf_url) + def get_abs_pdf_urls(self, url): """ Get the pdf and abstract urls from a Springer url """ - if re.match(self.re_abs, url): + if re.match(self.re_abs_1, url): abs_url = url pdf_url = url.replace("article", "content/pdf") + elif re.match(self.re_abs_2, url): + abs_url = url + pdf_url = url.replace("chapter", "content/pdf") elif re.match(self.re_pdf, url): - abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] + abs_url = self._get_abs_url(url) pdf_url = urllib.parse.unquote(url) else: raise URLResolutionError("Springer", url) return abs_url, pdf_url def validate(src): - return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) + return ( + re.match(Springer.re_abs_1, src) + or re.match(Springer.re_abs_2, src) + or re.match(Springer.re_pdf, src) + ) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 032bf99..ea24403 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -13,6 +13,7 @@ import sys from . 
import __version__, GITHUB_URL
+from .exceptions import UnidentifiedSourceError, InvalidURLError
 from .providers import providers, LocalFile
 from .utils import follow_redirects, is_url
@@ -53,20 +54,36 @@ def parse_args():
         default="/",
     )
     parser.add_argument(
+        "-r",
+        "--right",
+        help="Right align so the menu doesn't cover it",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-k", "--no-crop", help="Don't crop the pdf file", action="store_true"
+    )
+    parser.add_argument(
         "-v", "--verbose", help="be verbose", action="store_true"
     )
     parser.add_argument(
+        "-V",
+        "--version",
+        help="Show version and exit",
+        action="version",
+        version=__version__,
+    )
+    parser.add_argument(
         "--filename",
         help="Filename to use for the file on reMarkable",
-        default=None,
+        action="append",
     )
     parser.add_argument(
         "--gs", help="path to gs executable (default: gs)", default="gs"
     )
     parser.add_argument(
-        "--pdfcrop",
-        help="path to pdfcrop executable (default: pdfcrop)",
-        default="pdfcrop",
+        "--pdftoppm",
+        help="path to pdftoppm executable (default: pdftoppm)",
+        default="pdftoppm",
     )
     parser.add_argument(
         "--pdftk",
@@ -74,12 +91,19 @@
         default="pdftk",
     )
     parser.add_argument(
+        "--qpdf",
+        help="path to qpdf executable (default: qpdf)",
+        default="qpdf",
+    )
+    parser.add_argument(
         "--rmapi",
         help="path to rmapi executable (default: rmapi)",
         default="rmapi",
     )
     parser.add_argument(
-        "input", help="URL to a paper or the path of a local PDF file"
+        "input",
+        help="One or more URLs to a paper or paths to local PDF files",
+        nargs="+",
     )
     return parser.parse_args()
@@ -90,44 +114,113 @@ def exception(msg):
     print("", file=sys.stderr)
     print(
         "If you think this might be a bug, please raise an issue on GitHub: %s"
-        % GITHUB_URL
+        % GITHUB_URL,
+        file=sys.stderr,
     )
+    print("", file=sys.stderr)
     raise SystemExit(1)
 
 
-def main():
-    args = parse_args()
-    cookiejar = None
+def choose_provider(cli_input):
+    """Choose the provider to use for the given source
 
-    if is_url(args.input):
-        # input is a url
-        url, cookiejar = follow_redirects(args.input)
-        provider = next((p for p in providers if p.validate(url)), None)
-    elif LocalFile.validate(args.input):
+    This function first checks if the input is a local file, by checking if
+    the path exists. Next, it checks if the input is a "valid" url using a
+    regex test. If it is, the registered provider classes are checked to see
+    which provider can handle this url.
+
+    Returns
+    -------
+    provider : class
+        The class of the provider that can handle the source. A subclass of
+        the Provider ABC.
+
+    new_input : str
+        The updated input to the provider. This only has an effect for the url
+        providers, where this will be the url after following all redirects.
+
+    cookiejar : dict or requests.RequestsCookieJar
+        Cookies picked up when following redirects. These are needed for some
+        providers to ensure later requests have the right cookie settings.
+
+    Raises
+    ------
+    UnidentifiedSourceError
+        Raised when the input is neither an existing local file nor a valid
+        url.
+
+    InvalidURLError
+        Raised when the input *is* a valid url, but no provider can handle it.
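+
+    Examples
+    --------
+    An illustrative round trip for a url source (following redirects needs
+    network access)::
+
+        provider, new_input, cookiejar = choose_provider(
+            "https://arxiv.org/abs/1811.11242v1"
+        )
+        # provider is now the Arxiv class; instantiate and run it:
+        prov = provider(upload=False, cookiejar=cookiejar)
+        prov.run(new_input)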
+ + """ + provider = cookiejar = None + if LocalFile.validate(cli_input): # input is a local file + new_input = cli_input provider = LocalFile + elif is_url(cli_input): + # input is a url + new_input, cookiejar = follow_redirects(cli_input) + provider = next((p for p in providers if p.validate(new_input)), None) else: # not a proper URL or non-existent file + raise UnidentifiedSourceError + + if provider is None: + raise InvalidURLError + + return provider, new_input, cookiejar + + +def set_excepthook(debug): + sys_hook = sys.excepthook + + def exception_handler(exception_type, value, traceback): + if debug: + sys_hook(exception_type, value, traceback) + else: + print(value, file=sys.stderr) + + sys.excepthook = exception_handler + + +def main(): + args = parse_args() + set_excepthook(args.debug) + + if args.center and args.right: + exception("Can't center and right align at the same time!") + + if args.center and args.no_crop: + exception("Can't center and not crop at the same time!") + + if args.right and args.no_crop: + exception("Can't right align and not crop at the same time!") + + if args.filename and not len(args.filename) == len(args.input): exception( - "Couldn't figure out what source you mean. If it's a " - "local file, make sure it exists." + "When providing --filename and multiple inputs, their number must match." ) - if provider is None: - exception("Input not valid, no provider can handle this source.") - - prov = provider( - verbose=args.verbose, - upload=not args.no_upload, - debug=args.debug, - center=args.center, - blank=args.blank, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - pdfcrop_path=args.pdfcrop, - pdftk_path=args.pdftk, - gs_path=args.gs, - cookiejar=cookiejar, - ) - - prov.run(args.input, filename=args.filename) + filenames = ( + [None] * len(args.input) if not args.filename else args.filename + ) + + for cli_input, filename in zip(args.input, filenames): + provider, new_input, cookiejar = choose_provider(cli_input) + prov = provider( + verbose=args.verbose, + upload=not args.no_upload, + debug=args.debug, + center=args.center, + right=args.right, + blank=args.blank, + no_crop=args.no_crop, + remarkable_dir=args.remarkable_dir, + rmapi_path=args.rmapi, + pdftoppm_path=args.pdftoppm, + pdftk_path=args.pdftk, + qpdf_path=args.qpdf, + gs_path=args.gs, + cookiejar=cookiejar, + ) + prov.run(new_input, filename=filename) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 79421df..0b4be07 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -17,7 +17,7 @@ import time import unidecode from .log import Logger -from .exceptions import FileTypeError, RemarkableError +from .exceptions import FileTypeError, RemarkableError, NoPDFToolError HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -30,14 +30,15 @@ logger = Logger() def clean_string(s): - """ Clean a string by replacing accented characters with equivalents and - keeping only the allowed characters (ascii letters, digits, underscore, + """Clean a string by replacing accented characters with equivalents and + keeping only the allowed characters (ascii letters, digits, underscore, space, dash, and period)""" normalized = unidecode.unidecode(s) allowed = string.ascii_letters + string.digits + "_ .-" cleaned = "".join(c if c in allowed else "_" for c in normalized) while "__" in cleaned: cleaned = cleaned.replace("__", "_") + cleaned = cleaned.strip("_") return cleaned @@ -64,7 +65,7 @@ def download_url(url, filename, 
cookiejar=None): fid.write(content) -def get_page_with_retry(url, tries=5, cookiejar=None): +def get_page_with_retry(url, tries=5, cookiejar=None, return_text=False): count = 0 jar = {} if cookiejar is None else cookiejar while count < tries: @@ -82,9 +83,33 @@ def get_page_with_retry(url, tries=5, cookiejar=None): time.sleep(5) continue logger.info("Downloaded url: %s" % url) + if return_text: + return res.text return res.content +def get_content_type_with_retry(url, tries=5, cookiejar=None): + count = 0 + jar = {} if cookiejar is None else cookiejar + while count < tries: + count += 1 + error = False + try: + res = requests.head( + url, headers=HEADERS, cookies=jar, allow_redirects=True + ) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting headers for %s. Retrying in 5 seconds." + % (count, tries, url) + ) + time.sleep(5) + continue + return res.headers.get("Content-Type", None) + + def follow_redirects(url): """Follow redirects from the URL (at most 100)""" it = 0 @@ -98,8 +123,10 @@ def follow_redirects(url): if not "Location" in req.headers: break url = req.headers["Location"] - jar = req.cookies + jar.update(req.cookies) it += 1 + if it == 100: + logger.warning("Max redirects reached. There may be a problem.") jar = jar or req.cookies return url, jar @@ -110,13 +137,19 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): # Create the reMarkable dir if it doesn't exist remarkable_dir = remarkable_dir.rstrip("/") if remarkable_dir: - status = subprocess.call( - [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL, - ) - if not status == 0: - raise RemarkableError( - "Creating directory %s on reMarkable failed" % remarkable_dir + parts = remarkable_dir.split("/") + rmdir = "" + while parts: + rmdir += "/" + parts.pop(0) + status = subprocess.call( + [rmapi_path, "mkdir", rmdir], + stdout=subprocess.DEVNULL, ) + if not status == 0: + raise RemarkableError( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) # Upload the file status = subprocess.call( @@ -132,7 +165,34 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): def is_url(string): # pattern adapted from CleverCSV - pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*\-?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:]*)?(\.[a-z]+)?" + pattern = "((https?|ftp):\/\/(?!\-))?(((([\p{L}\p{N}]*[\-\_]?[\p{L}\p{N}]+)+\.)+([a-z]{2,}|local)(\.[a-z]{2,3})?)|localhost|(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(\:\d{1,5})?))(\/[\p{L}\p{N}_\/()~?=&%\-\#\.:+]*)?(\.[a-z]+)?" 
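+    # Compared to the previous pattern, this one also allows underscores in
+    # subdomains and a literal "+" in the path component.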
string = string.strip(" ") match = regex.fullmatch(pattern, string) return match is not None + + +def check_pdftool(pdftk_path, qpdf_path): + """Check whether we have pdftk or qpdf available""" + # set defaults in case either is set to None or something + pdftk_path = pdftk_path or "false" + qpdf_path = qpdf_path or "false" + + try: + status = subprocess.call( + [pdftk_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + except FileNotFoundError: + status = 1 + if status == 0: + return "pdftk" + try: + status = subprocess.call( + [qpdf_path, "--help"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + except FileNotFoundError: + status = 1 + if status == 0: + return "qpdf" + raise NoPDFToolError diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a8f43fe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +line-length = 79 @@ -25,18 +25,24 @@ REQUIRED = [ "unidecode>=1.1", "titlecase>=0.12", "PyPDF2>=1.26", - "regex>=2018.11" + "regex>=2018.11", + "readability-lxml>=0.7.1", + "html2text>=2020.1.16", + "weasyprint>=51", + "markdown>=3.1.1", ] +full_require = ["readabilipy"] docs_require = [] -test_require = [] -dev_require = ["green"] +test_require = ["green"] +dev_require = [] # What packages are optional? EXTRAS = { + "full": full_require, "docs": docs_require, - "tests": test_require, - "dev": docs_require + test_require + dev_require, + "test": test_require + full_require, + "dev": docs_require + test_require + dev_require + full_require, } # The rest you shouldn't have to touch too much :) diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py new file mode 100644 index 0000000..2cb84cf --- /dev/null +++ b/tests/test_arxiv.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for arXiv provider + +This file is part of paper2remarkable. 
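+
+Note: the stamp-removal tests below download the papers they check from
+arxiv.org, so they require network access.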
+ +""" + +import os +import re +import shutil +import tempfile +import unittest + +from paper2remarkable.providers.arxiv import ( + DEARXIV_TEXT_REGEX, + DEARXIV_URI_REGEX, + Arxiv, +) + + +class TestArxiv(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_text_regex_1(self): + key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_2(self): + key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_3(self): + key = b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_text_regex_4(self): + key = b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004" + m = re.fullmatch(DEARXIV_TEXT_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_1(self): + key = b"http://arxiv.org/abs/physics/0605197v1" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + + def test_uri_regex_2(self): + key = b"https://arxiv.org/abs/1101.0028v3" + m = re.fullmatch(DEARXIV_URI_REGEX, key) + self.assertIsNotNone(m) + + def test_stamp_removed_1(self): + url = "https://arxiv.org/pdf/1703.06103.pdf" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:1703.06103v4 [stat.ML] 26 Oct 2017", data) + + def test_stamp_removed_2(self): + url = "https://arxiv.org/abs/2003.06222" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:2003.06222v1 [stat.ML] 13 Mar 2020", data) + + def test_stamp_removed_3(self): + url = "https://arxiv.org/abs/physics/0605197v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"arXiv:physics/0605197v1 [physics.data-an] 23 May 2006", data + ) + self.assertNotIn( + b"/URI (http://arxiv.org/abs/physics/0605197v1)", data + ) + + def test_stamp_removed_4(self): + url = "https://arxiv.org/abs/math/0309285v2" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn(b"arXiv:math/0309285v2 [math.NA] 9 Apr 2004", data) + self.assertNotIn(b"/URI (http://arXiv.org/abs/math/0309285v2)", data) + + def test_stamp_removed_5(self): + url = "https://arxiv.org/abs/astro-ph/9207001v1" + prov = Arxiv(upload=False) + filename = prov.run(url, filename="./target.pdf") + prov.uncompress_pdf(filename, "unc.pdf") + with open("unc.pdf", "rb") as fp: + data = fp.read() + self.assertNotIn( + b"/URI (http://arxiv.org/abs/astro-ph/9207001v1)", data + ) + self.assertNotIn(b"arXiv:astro-ph/9207001v1 13 Jul 1992", data) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py index 3204768..4ee6773 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -7,7 +7,7 @@ __author__ = "G.J.J. 
van den Burg" import hashlib import os -import re +import pdfplumber import shutil import tempfile import unittest @@ -15,17 +15,22 @@ import unittest from paper2remarkable.providers import ( ACM, Arxiv, + CVF, CiteSeerX, + HTML, + JMLR, LocalFile, + NBER, NeurIPS, OpenReview, PMLR, PdfUrl, PubMed, + SagePub, + SemanticScholar, Springer, TandFOnline, ) -from paper2remarkable.providers.arxiv import DEARXIV_TEXT_REGEX VERBOSE = False @@ -41,18 +46,6 @@ def md5sum(filename): return hasher.hexdigest() -class TestArxiv(unittest.TestCase): - def test_text_regex_1(self): - key = b"arXiv:1908.03213v1 [astro.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - def test_text_regex_2(self): - key = b"arXiv:1908.03213v1 [astro-ph.HE] 8 Aug 2019" - m = re.fullmatch(DEARXIV_TEXT_REGEX, key) - self.assertIsNotNone(m) - - class TestProviders(unittest.TestCase): @classmethod def setUpClass(cls): @@ -67,7 +60,8 @@ class TestProviders(unittest.TestCase): shutil.rmtree(self.test_dir) def test_arxiv_1(self): - prov = Arxiv(upload=False, verbose=VERBOSE) + # check with qpdf + prov = Arxiv(upload=False, verbose=VERBOSE, pdftk_path=None) url = "https://arxiv.org/abs/1811.11242v1" exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" filename = prov.run(url) @@ -96,6 +90,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_arxiv_5(self): + prov = Arxiv(upload=False, verbose=VERBOSE, qpdf_path=None) + url = "https://arxiv.org/abs/2002.11523" + exp_filename = "Ponomarev_Oseledets_Cichocki_-_Using_Reinforcement_Learning_in_the_Algorithmic_Trading_Problem_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_arxiv_6(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/pdf/1701.05517.pdf?source=post_page---------------------------" + exp_filename = "Salimans_et_al_-_PixelCNN_Improving_the_PixelCNN_With_Discretized_Logistic_Mixture_Likelihood_and_Other_Modifications_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_pmc(self): prov = PubMed(upload=False, verbose=VERBOSE) url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" @@ -128,13 +136,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) - def test_springer(self): + def test_springer_1(self): prov = Springer(upload=False, verbose=VERBOSE) url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" filename = prov.run(url) self.assertEqual(exp_filename, os.path.basename(filename)) + def test_springer_2(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/content/pdf/10.1007%2F11681878_14.pdf" + exp_filename = "Dwork_et_al_-_Calibrating_Noise_to_Sensitivity_in_Private_Data_Analysis_2006.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + def test_local(self): local_filename = "test.pdf" with open(local_filename, "w") as fp: @@ -145,11 +160,31 @@ class TestProviders(unittest.TestCase): filename = prov.run(local_filename) self.assertEqual("test_.pdf", os.path.basename(filename)) - def test_pdfurl(self): + def test_pdfurl_1(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = 
"http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url) + self.assertEqual("14-526.pdf", os.path.basename(filename)) + + def test_pdfurl_2(self): prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "https://www.manuelrigger.at/preprints/NoREC.pdf" + filename = prov.run(url) + self.assertEqual("NoREC.pdf", os.path.basename(filename)) + + def test_jmlr_1(self): + prov = JMLR(upload=False, verbose=VERBOSE) url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" - filename = prov.run(url, filename="test.pdf") - self.assertEqual("test.pdf", os.path.basename(filename)) + exp = "Burg_Groenen_-_GenSVM_a_Generalized_Multiclass_Support_Vector_Machine_2016.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_jmlr_2(self): + prov = JMLR(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/v10/xu09a.html" + exp = "Xu_Zhang_-_Refinement_of_Reproducing_Kernels_2009.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) def test_pmlr_1(self): prov = PMLR(upload=False, verbose=VERBOSE) @@ -179,6 +214,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_nber_1(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w26752" + exp = "Bhattacharya_Packalen_-_Stagnation_and_Scientific_Incentives_2020.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_nber_2(self): + prov = NBER(upload=False, verbose=VERBOSE) + url = "https://www.nber.org/papers/w19152.pdf" + exp = "Herbst_Schorfheide_-_Sequential_Monte_Carlo_Sampling_for_DSGE_Models_2013.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_1(self): prov = NeurIPS(upload=False, verbose=VERBOSE) url = "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf" @@ -193,6 +242,20 @@ class TestProviders(unittest.TestCase): filename = prov.run(url) self.assertEqual(exp, os.path.basename(filename)) + def test_neurips_3(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/5433-combinatorial-pure-exploration-of-multi-armed-bandits" + exp = "Chen_et_al_-_Combinatorial_Pure_Exploration_of_Multi-Armed_Bandits_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_neurips_4(self): + prov = NeurIPS(upload=False, verbose=VERBOSE) + url = "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf" + exp = "Yin_Shen_-_On_the_Dimensionality_of_Word_Embedding_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + def test_citeseerx_1(self): prov = CiteSeerX(upload=False, verbose=VERBOSE) url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548" @@ -218,6 +281,90 @@ class TestProviders(unittest.TestCase): prov = TandFOnline(upload=False, verbose=VERBOSE) url = "https://www.tandfonline.com/doi/pdf/10.1080/03610918.2012.625790?scroll=top&needAccess=true" exp = "Huskova_Marusiakova_-_M-Procedures_for_Detection_of_Changes_for_Dependent_Observations_2012.pdf" + + def test_html_1(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines" + exp = "Getting_Your_Team_to_Do_More_Than_Meet_Deadlines.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def 
test_html_2(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.nature.com/articles/d41586-020-00176-4" + exp = "Isaac_Asimov_Centenary_of_the_Great_Explainer.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_html_3(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://conclave-team.github.io/conclave-site/" + # exp = "Conclave_Case_Study_-_A_Private_and_Secure_Real-Time_Collaborative_Text_Editor.pdf" + # NOTE: Title differs between Readability.JS and readability-lxml, we + # assume that testing is done with Readability.JS + exp = "Conclave.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + # this is a proxy test to check that all images are included + self.assertEqual(32, len(pdfplumber.open(filename).pages)) + + def test_html_4(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://sirupsen.com/2019/" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + + def test_html_5(self): + prov = HTML(upload=False, verbose=VERBOSE) + url = "https://www.spiegel.de/panorama/london-tausende-rechtsextreme-demonstranten-wollen-statuen-schuetzen-a-2a1ed9b9-708a-40dc-a5ff-f312e97a60ca#" + filename = prov.run(url) + # this is a proxy test to check that all images are included + self.assertEqual(4, len(pdfplumber.open(filename).pages)) + + def test_semantic_scholar_1(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://pdfs.semanticscholar.org/1b01/dea77e9cbf049b4ee8b68dc4d43529d06299.pdf" + exp = "Dong_et_al_-_TableSense_Spreadsheet_Table_Detection_With_Convolutional_Neural_Networks_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_semantic_scholar_2(self): + prov = SemanticScholar(upload=False, verbose=VERBOSE) + url = "https://www.semanticscholar.org/paper/Fast-Meta-Learning-for-Adaptive-Hierarchical-Design-Burg-Hero/90759dc4ab0ce8d3564044ef92a91080a4f3e55f" + exp = "Burg_Hero_-_Fast_Meta-Learning_for_Adaptive_Hierarchical_Classifier_Design_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_1(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/full/10.1177/0306312714535679" + exp = "Rekdal_-_Academic_Urban_Legends_2014.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_sagepub_2(self): + prov = SagePub(upload=False, verbose=VERBOSE) + url = "https://journals.sagepub.com/doi/pdf/10.1177/1352458517694432" + exp = "Kobelt_et_al_-_New_Insights_Into_the_Burden_and_Costs_of_Multiple_Sclerosis_in_Europe_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_cvf_1(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_ICCV_2019/html/Muhammad_Goal-Driven_Sequential_Data_Abstraction_ICCV_2019_paper.html" + exp = ( + "Muhammad_et_al_-_Goal-Driven_Sequential_Data_Abstraction_2019.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp, os.path.basename(filename)) + + def test_cvf_2(self): + prov = CVF(upload=False, verbose=VERBOSE) + url = "https://openaccess.thecvf.com/content_CVPR_2020/papers/Park_Seeing_the_World_in_a_Bag_of_Chips_CVPR_2020_paper.pdf" + exp = ( + "Park_Holynski_Seitz_-_Seeing_the_World_in_a_Bag_of_Chips_2020.pdf" + ) filename = prov.run(url) 
self.assertEqual(exp, os.path.basename(filename)) diff --git a/tests/test_ui.py b/tests/test_ui.py new file mode 100644 index 0000000..835f594 --- /dev/null +++ b/tests/test_ui.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Unit tests for command line interface + +This file is part of paper2remarkable. + +""" + +import os +import shutil +import tempfile +import unittest + +from paper2remarkable.exceptions import ( + InvalidURLError, + UnidentifiedSourceError, +) +from paper2remarkable.providers import ( + ACM, + Arxiv, + CiteSeerX, + CVF, + HTML, + JMLR, + LocalFile, + NBER, + NeurIPS, + OpenReview, + PMLR, + PdfUrl, + PubMed, + Springer, +) +from paper2remarkable.ui import choose_provider + + +class TestUI(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_choose_provider_1(self): + tests = [ + ( + Arxiv, + "https://arxiv.org/abs/1811.11242v1", + "https://arxiv.org/abs/1811.11242v1", + ), + ( + Arxiv, + "http://arxiv.org/abs/arXiv:1908.03213", + "https://arxiv.org/abs/1908.03213", + ), + ( + Arxiv, + "https://arxiv.org/abs/math/0309285", + "https://arxiv.org/abs/math/0309285", + ), + ( + Arxiv, + "https://arxiv.org/pdf/physics/0605197v1.pdf", + "https://arxiv.org/pdf/physics/0605197v1.pdf", + ), + ( + PubMed, + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/", + ), + ( + ACM, + "https://dl.acm.org/citation.cfm?id=3025626", + "https://dl.acm.org/doi/10.1145/3025453.3025626", + ), + ( + ACM, + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true", + "https://dl.acm.org/doi/pdf/10.1145/3219819.3220081?download=true&", + ), + ( + OpenReview, + "http://openreview.net/forum?id=S1x4ghC9tQ", + "https://openreview.net/forum?id=S1x4ghC9tQ", + ), + ( + Springer, + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + "https://link.springer.com/article/10.1007/s10618-019-00631-5", + ), + ( + PdfUrl, + "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", + "https://confcats_isif.s3.amazonaws.com/web-files/journals/entries/Nonlinear%20Kalman%20Filters.pdf", + ), + ( + JMLR, + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + "https://www.jmlr.org/papers/volume17/14-526/14-526.pdf", + ), + ( + JMLR, + "https://www.jmlr.org/papers/v10/xu09a.html", + "https://www.jmlr.org/papers/v10/xu09a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v97/behrmann19a.html", + "http://proceedings.mlr.press/v97/behrmann19a.html", + ), + ( + PMLR, + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + "http://proceedings.mlr.press/v15/maaten11b/maaten11b.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/melnyk16.pdf", + "http://proceedings.mlr.press/v48/melnyk16.pdf", + ), + ( + PMLR, + "http://proceedings.mlr.press/v48/zhangf16.html", + "http://proceedings.mlr.press/v48/zhangf16.html", + ), + ( + NBER, + "https://www.nber.org/papers/w26752", + "https://www.nber.org/papers/w26752", + ), + ( + NBER, + "https://www.nber.org/papers/w19152.pdf", + "https://www.nber.org/system/files/working_papers/w19152/w19152.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + 
"https://papers.nips.cc/paper/325-leaning-by-combining-memorization-and-gradient-descent.pdf", + ), + ( + NeurIPS, + "https://papers.nips.cc/paper/7796-middle-out-decoding", + "https://papers.nips.cc/paper/7796-middle-out-decoding", + ), + ( + NeurIPS, + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + "http://papers.neurips.cc/paper/7368-on-the-dimensionality-of-word-embedding.pdf", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.89.6548", + ), + ( + CiteSeerX, + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.123.7607&rep=rep1&type=pdf", + ), + ( + HTML, + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + "https://hbr.org/2019/11/getting-your-team-to-do-more-than-meet-deadlines", + ), + ( + HTML, + "https://www.nature.com/articles/d41586-020-00176-4", + "https://www.nature.com/articles/d41586-020-00176-4", + ), + ( + CVF, + "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html", + "https://openaccess.thecvf.com/content_cvpr_2018/html/Cheng_Dual_Skipping_Networks_CVPR_2018_paper.html", + ), + ] + for exp_prov, url, exp_url in tests: + prov, new_url, jar = choose_provider(url) + with self.subTest(url=url): + self.assertEqual(exp_url, new_url) + self.assertEqual(prov, exp_prov) + + def test_choose_provider_2(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + + prov, new_input, jar = choose_provider(local_filename) + self.assertEqual(prov, LocalFile) + self.assertEqual(new_input, local_filename) + self.assertIsNone(jar) + + def test_choose_provider_3(self): + local_filename = "/tmp/abcdef.pdf" + with self.assertRaises(UnidentifiedSourceError): + choose_provider(local_filename) + + def test_choose_provider_4(self): + url = "https://raw.githubusercontent.com/GjjvdBurg/paper2remarkable/master/README.md" + with self.assertRaises(InvalidURLError): + choose_provider(url) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..4c122e0 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest + +from paper2remarkable.exceptions import NoPDFToolError +from paper2remarkable.utils import check_pdftool + + +class TestUtils(unittest.TestCase): + def test_check_pdftool(self): + # Needs a system with both pdftk and qpdf available + self.assertEqual(check_pdftool("pdftk", "qpdf"), "pdftk") + self.assertEqual(check_pdftool("pdftk_xyz", "qpdf"), "qpdf") + self.assertEqual(check_pdftool("pdftk", "qpdf_xyz"), "pdftk") + with self.assertRaises(NoPDFToolError): + 
check_pdftool("pdftk_xyz", "qpdf_xyz")
+
+
+if __name__ == "__main__":
+    unittest.main()
