author     Gertjan van den Burg <gertjanvandenburg@gmail.com>   2019-10-25 11:40:05 +0100
committer  GitHub <noreply@github.com>                          2019-10-25 11:40:05 +0100
commit     8ca58a5a7a3e7ca846c2e5ad8ccf354f1f3f1594 (patch)
tree       2639307f4198185de38f40621de8930bc9f6a01b
parent     Bump version (diff)
parent     update dockerfile (diff)
download   paper2remarkable-8ca58a5a7a3e7ca846c2e5ad8ccf354f1f3f1594.tar.gz
           paper2remarkable-8ca58a5a7a3e7ca846c2e5ad8ccf354f1f3f1594.zip
Merge pull request #14 from GjjvdBurg/refactorv0.4.0
Refactor to package
-rw-r--r--  .gitignore                                        5
-rw-r--r--  CHANGELOG.md                                      6
-rw-r--r--  Dockerfile                                       12
-rw-r--r--  MANIFEST.in                                      10
-rw-r--r--  Makefile                                         60
-rw-r--r--  README.md                                       100
-rwxr-xr-x  arxiv2remarkable.py                             859
-rw-r--r--  make_release.py                                 240
-rw-r--r--  paper2remarkable/__init__.py                      5
-rw-r--r--  paper2remarkable/__main__.py                     15
-rw-r--r--  paper2remarkable/__version__.py                   5
-rw-r--r--  paper2remarkable/crop.py                        160
-rw-r--r--  paper2remarkable/log.py                          56
-rw-r--r--  paper2remarkable/pdf_ops.py                      98
-rw-r--r--  paper2remarkable/providers/__init__.py           11
-rw-r--r--  paper2remarkable/providers/_base.py             117
-rw-r--r--  paper2remarkable/providers/_info.py             106
-rw-r--r--  paper2remarkable/providers/acm.py                80
-rw-r--r--  paper2remarkable/providers/arxiv.py              99
-rw-r--r--  paper2remarkable/providers/local.py              38
-rw-r--r--  paper2remarkable/providers/openreview.py         51
-rw-r--r--  paper2remarkable/providers/pdf_url.py            39
-rw-r--r--  paper2remarkable/providers/pubmed.py             57
-rw-r--r--  paper2remarkable/providers/springer.py           49
-rw-r--r--  paper2remarkable/ui.py                          107
-rw-r--r--  paper2remarkable/utils.py                       120
-rw-r--r--  poetry.lock                                     183
-rw-r--r--  pyproject.toml                                   19
-rw-r--r--  setup.py                                        100
-rw-r--r--  tests/test_providers.py (renamed from test.py)    8
30 files changed, 1688 insertions, 1127 deletions
diff --git a/.gitignore b/.gitignore
index c18dd8d..558dbc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
__pycache__/
+paper2remarkable.egg-info/
+dist/*
+build/*
+*.pyc
+*/__pycache__/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..ac4f357
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,6 @@
+# Changelog
+
+## Version 0.4.0
+
+* Refactor code to make it a real Python package
+* Rename to ``paper2remarkable``
diff --git a/Dockerfile b/Dockerfile
index 6578db3..cb7cb19 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,14 +19,6 @@ RUN apt-get update \
pdftk \
texlive-extra-utils # contains pdfcrop
-RUN pip install \
- bs4 \
- requests \
- PyPDF2 \
- titlecase \
- pdfplumber \
- unidecode
+RUN pip install paper2remarkable
-COPY arxiv2remarkable.py ./
-
-ENTRYPOINT ["python", "arxiv2remarkable.py"]
+ENTRYPOINT ["p2r"]
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..021523f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
+include setup.py
+include README.md
+include LICENSE
+recursive-include paper2remarkable *.py
+recursive-include tests *.py
+exclude Makefile
+exclude .gitignore
+exclude Dockerfile
+exclude make_release.py
+prune old
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2a656d4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,60 @@
+# Makefile for easier installation and cleanup.
+#
+# Uses self-documenting macros from here:
+# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+
+PACKAGE=paper2remarkable
+DOC_DIR='./docs/'
+VENV_DIR=/tmp/p2r_venv/
+
+.PHONY: help cover dist
+
+.DEFAULT_GOAL := help
+
+help:
+ @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\
+ awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\
+ %s\n", $$1, $$2}'
+
+release: ## Make a release
+ python make_release.py
+
+
+install: ## Install for the current user using the default python command
+ python setup.py build_ext --inplace
+ python setup.py install --user
+
+
+test: venv ## Run unit tests
+ source $(VENV_DIR)/bin/activate && green -f -vv -a ./tests
+
+
+clean: ## Clean build dist and egg directories left after install
+ rm -rf ./dist
+ rm -rf ./build
+ rm -rf ./$(PACKAGE).egg-info
+ rm -rf $(VENV_DIR)
+ rm -f MANIFEST
+ find . -type f -iname '*.pyc' -delete
+ find . -type d -name '__pycache__' -empty -delete
+
+dist: ## Make Python source distribution
+ python setup.py sdist
+ python setup.py bdist_wheel --universal
+
+docs: doc
+doc: install ## Build documentation with Sphinx
+ m2r README.md && mv README.rst $(DOC_DIR)
+ m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR)
+ cd $(DOC_DIR) && \
+ rm source/* && \
sphinx-apidoc -H 'paper2remarkable API Documentation' -o source ../$(PACKAGE) && \
+ touch source/AUTOGENERATED
+ $(MAKE) -C $(DOC_DIR) html
+
+venv: $(VENV_DIR)/bin/activate
+
+$(VENV_DIR)/bin/activate:
+ test -d $(VENV_DIR) || virtualenv $(VENV_DIR)
+ source $(VENV_DIR)/bin/activate && pip install -e .[dev]
+ touch $(VENV_DIR)/bin/activate
diff --git a/README.md b/README.md
index a01665c..8295e37 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,21 @@
-# arxiv2remarkable.py
+# paper2remarkable
-``arxiv2remarkable`` is a command line program to quickly transfer a paper to
-your reMarkable. The script can be run as a plain Python script or via Docker
+*Note: ``paper2remarkable`` is the new name for the ``arxiv2remarkable``
+script. The name was changed because it better captures what the program
+does.*
+
+``paper2remarkable`` is a command line program for quickly and easily
+transferring an academic paper to your reMarkable:
+
+```
+$ p2r https://arxiv.org/abs/1811.11242
+```
+
+The script can be run through the ``p2r`` command line program or via Docker
(see below).
-This script makes it as easy as possible to get a PDF on your reMarkable from
-any of the following sources:
+paper2remarkable makes it as easy as possible to get a PDF on your reMarkable
+from any of the following sources:
- an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``)
- a PubMed Central url (either to the HTML or the PDF)
@@ -16,10 +26,10 @@ any of the following sources:
- a url to a PDF file
- a local file.
-The script takes the source and:
+When called, paper2remarkable takes the source and:
1. Downloads the pdf if necessary
-2. Removes the arXiv timestamp
+2. Removes the arXiv timestamp (for arXiv sources)
3. Crops the pdf to remove unnecessary borders
4. Shrinks the pdf file to reduce the filesize
5. Generates a nice filename based on author/title/year of the paper
@@ -37,41 +47,39 @@ Optionally, you can:
Here's the full help of the script:
```text
-usage: arxiv2remarkable.py [-h] [-b] [-v] [-n] [-d] [-c] [--filename FILENAME]
- [-p REMARKABLE_DIR] [--rmapi RMAPI]
- [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS]
- input
+usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v]
+ [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK]
+ [--rmapi RMAPI]
+ input
+
+Paper2reMarkable version 0.4.0
positional arguments:
input URL to a paper or the path of a local PDF file
optional arguments:
-h, --help show this help message and exit
- -b, --blank Add a blank page after every page of the PDF (default:
- False)
- -v, --verbose be verbose (default: False)
- -n, --no-upload don't upload to the reMarkable, save the output in
- current working dir (default: False)
- -d, --debug debug mode, doesn't upload to reMarkable (default:
- False)
+ -b, --blank Add a blank page after every page of the PDF
-c, --center Center the PDF on the page, instead of left align
- (default: False)
- --filename FILENAME Filename to use for the file on reMarkable (default:
- None)
+ -d, --debug debug mode, doesn't upload to reMarkable
+ -n, --no-upload don't upload to the reMarkable, save the output in
+ current working dir
-p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR
directory on reMarkable to put the file (created if
- missing) (default: /)
- --rmapi RMAPI path to rmapi executable (default: rmapi)
+ missing, default: /)
+ -v, --verbose be verbose
+ --filename FILENAME Filename to use for the file on reMarkable
+ --gs GS path to gs executable (default: gs)
--pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop)
--pdftk PDFTK path to pdftk executable (default: pdftk)
- --gs GS path to gs executable (default: gs)
+ --rmapi RMAPI path to rmapi executable (default: rmapi)
```
And here's an example with verbose mode enabled that shows everything the
script does by default:
-```bash
-$ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242
+```
+$ p2r -v https://arxiv.org/abs/1811.11242
2019-05-30 00:38:27 - INFO - Starting ArxivProvider
2019-05-30 00:38:27 - INFO - Getting paper info from arXiv
2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242
@@ -86,7 +94,7 @@ $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242
2019-05-30 00:38:42 - INFO - Upload successful.
```
-## Dependencies
+## Installation
The script requires the following external programs to be available:
@@ -96,27 +104,15 @@ The script requires the following external programs to be available:
- [GhostScript](https://www.ghostscript.com/)
- [rMAPI](https://github.com/juruen/rmapi)
-If these scripts are not available on the ``PATH`` variable, you can supply them
-with the relevant options to the script.
-
-The script also needs the following Python packages:
+If these programs are not available on the ``PATH`` variable, you can supply
+them with the relevant options to the script. You can then install
+paper2remarkable from PyPI:
-- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML
-- [requests](https://pypi.org/project/requests/): getting HTML
-- [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF
-- [titlecase](https://pypi.org/project/titlecase/): fancy titles
-- [pdfplumber](https://github.com/jsvine/pdfplumber): used for better page
- cropping
-- [unidecode](https://pypi.org/project/Unidecode/): clean accented characters
- from the filename
-
-If you use [Poetry](https://poetry.eustace.io/) you can install these
-dependencies using ``poetry install`` in the project directory. Alternatively,
-you can use ``pip`` with the following command:
-
-```bash
-pip install --user bs4 requests PyPDF2 titlecase pdfplumber unidecode
```
+pip install paper2remarkable
+```
+
+This installs the ``p2r`` command line program.
## Docker
@@ -127,7 +123,7 @@ First clone this repository with `git clone` and `cd` inside of it, then build
the container:
```bash
-docker build -t arxiv2remarkable .
+docker build -t paper2remarkable .
```
### Authorization
@@ -137,7 +133,7 @@ we'll use `rmapi` to create it.
```bash
touch ${HOME}/.rmapi
-docker run --rm --it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi arxiv2remarkable version
+docker run --rm --it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi paper2remarkable version
```
which should end with output like
@@ -149,15 +145,15 @@ rmapi version: 0.0.5
### Usage
-Use the container by replacing `python arxiv2remarkable.py` with `docker run
---rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable`, e.g.
+Use the container by replacing `p2r` with `docker run --rm -v
+"${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable`, e.g.
```
# print help and exit
-docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable --help
+docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable --help
# equivalent to above usage via `python`
-docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable -v https://arxiv.org/abs/1811.11242
+docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable -v https://arxiv.org/abs/1811.11242
```
# Notes
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py
deleted file mode 100755
index 5694e1b..0000000
--- a/arxiv2remarkable.py
+++ /dev/null
@@ -1,859 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__version__ = "0.3.5"
-__author__ = "G.J.J. van den Burg"
-
-"""
-Download a paper from various sources and send it to the reMarkable.
-
-Author: G.J.J. van den Burg
-Date: 2019-02-02
-License: MIT
-
-"""
-
-import PyPDF2
-import abc
-import argparse
-import bs4
-import datetime
-import os
-import pdfplumber
-import re
-import requests
-import shutil
-import string
-import subprocess
-import sys
-import tempfile
-import time
-import titlecase
-import unidecode
-import urllib.parse
-
-GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
-
-HEADERS = {
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
- "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
- "Safari/537.36"
-}
-
-RM_WIDTH = 1404
-RM_HEIGHT = 1872
-
-
-class Provider(metaclass=abc.ABCMeta):
- """ ABC for providers of pdf sources """
-
- meta_author_key = "citation_author"
- meta_title_key = "citation_title"
- meta_date_key = "citation_date"
-
- def __init__(
- self,
- verbose=False,
- upload=True,
- debug=False,
- center=False,
- blank=False,
- remarkable_dir="/",
- rmapi_path="rmapi",
- pdfcrop_path="pdfcrop",
- pdftk_path="pdftk",
- gs_path="gs",
- ):
- self.verbose = verbose
- self.upload = upload
- self.debug = debug
- self.center = center
- self.blank = blank
- self.remarkable_dir = remarkable_dir
- self.rmapi_path = rmapi_path
- self.pdfcrop_path = pdfcrop_path
- self.pdftk_path = pdftk_path
- self.gs_path = gs_path
-
- self.log("Starting %s" % type(self).__name__)
-
- def log(self, msg, mode="info"):
- if not self.verbose:
- return
- if not mode in ["info", "warning"]:
- raise ValueError("unknown logging mode.")
- now = datetime.datetime.now()
- print(
- now.strftime("%Y-%m-%d %H:%M:%S")
- + " - "
- + mode.upper()
- + " - "
- + msg
- )
-
- def warn(self, msg):
- self.log(msg, mode="warning")
-
- @staticmethod
- @abc.abstractmethod
- def validate(src):
- """ Validate whether ``src`` is appropriate for this provider """
-
- def retrieve_pdf(self, src, filename):
- """ Download pdf from src and save to filename """
- _, pdf_url = self.get_abs_pdf_urls(src)
- self.download_url(pdf_url, filename)
-
- def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
- op = (lambda x: x) if op is None else op
- # format the author list retrieved by bs4
- return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
-
- def get_authors(self, soup):
- authors = [
- x["content"]
- for x in soup.find_all("meta", {"name": self.meta_author_key})
- ]
- return self._format_authors(authors)
-
- def get_title(self, soup):
- target = soup.find_all("meta", {"name": self.meta_title_key})
- return target[0]["content"]
-
- def _format_date(self, soup_date):
- return soup_date
-
- def get_date(self, soup):
- date = soup.find_all("meta", {"name": self.meta_date_key})[0][
- "content"
- ]
- return self._format_date(date)
-
- def get_paper_info(
- self,
- src,
- author_key="citation_author",
- title_key="citation_title",
- date_key="citation_date",
- ):
- """ Retrieve the title/author (surnames)/year information """
- abs_url, _ = self.get_abs_pdf_urls(src)
- self.log("Getting paper info")
- page = self.get_page_with_retry(abs_url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- authors = self.get_authors(soup)
- title = self.get_title(soup)
- date = self.get_date(soup)
- return dict(title=title, date=date, authors=authors)
-
- def string_clean(self, s):
- """ Clean a string to replace accented characters with equivalents and
- keep only the allowed characters """
- normalized = unidecode.unidecode(s)
- allowed = string.ascii_letters + string.digits + "_ ."
- cleaned = "".join(c if c in allowed else "_" for c in normalized)
- return cleaned
-
- def create_filename(self, info, filename=None):
- """ Generate filename using the info dict or filename if provided """
- if not filename is None:
- return filename
- # we assume that the list of authors is surname only.
- self.log("Generating output filename")
-
- if len(info["authors"]) > 3:
- author_part = info["authors"][0] + "_et_al"
- else:
- author_part = "_".join(info["authors"])
- author_part = self.string_clean(author_part)
-
- title_part = self.string_clean(info["title"])
- title_part = titlecase.titlecase(title_part).replace(" ", "_")
-
- year_part = info["date"].split("/")[0]
-
- name = author_part + "_-_" + title_part + "_" + year_part + ".pdf"
- name = unidecode.unidecode(name)
- self.log("Created filename: %s" % name)
- return name
-
- def blank_pdf(self, filepath):
- if not self.blank:
- return filepath
-
- self.log("Adding blank pages")
- input_pdf = PyPDF2.PdfFileReader(filepath)
- output_pdf = PyPDF2.PdfFileWriter()
- for page in input_pdf.pages:
- output_pdf.addPage(page)
- output_pdf.addBlankPage()
-
- output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
- with open(output_file, "wb") as fp:
- output_pdf.write(fp)
- return output_file
-
- def crop_pdf(self, filepath):
- self.log("Cropping pdf file")
- cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
- cropper = Cropper(
- filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
- )
- status = cropper.crop(margins=15)
-
- if not status == 0:
- self.warn("Failed to crop the pdf file at: %s" % filepath)
- return filepath
- if not os.path.exists(cropped_file):
- self.warn(
- "Can't find cropped file '%s' where expected." % cropped_file
- )
- return filepath
- return cropped_file
-
- def center_pdf(self, filepath):
- if not self.center:
- return filepath
-
- self.log("Centering pdf file")
- centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
- cropper = Cropper(
- filepath, centered_file, pdfcrop_path=self.pdfcrop_path
- )
- status = cropper.center()
- if not status == 0:
- self.warn("Failed to center the pdf file at: %s" % filepath)
- return filepath
- if not os.path.exists(centered_file):
- self.warn(
- "Can't find centered file '%s' where expected." % centered_file
- )
- return filepath
- return centered_file
-
- def shrink_pdf(self, filepath):
- self.log("Shrinking pdf file")
- output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
- status = subprocess.call(
- [
- self.gs_path,
- "-sDEVICE=pdfwrite",
- "-dCompatibilityLevel=1.4",
- "-dPDFSETTINGS=/printer",
- "-dNOPAUSE",
- "-dBATCH",
- "-dQUIET",
- "-sOutputFile=%s" % output_file,
- filepath,
- ]
- )
- if not status == 0:
- self.warn("Failed to shrink the pdf file")
- return filepath
- return output_file
-
- def check_file_is_pdf(self, filename):
- try:
- fp = open(filename, "rb")
- pdf = PyPDF2.PdfFileReader(fp, strict=False)
- fp.close()
- del pdf
- return True
- except PyPDF2.utils.PdfReadError:
- exception("Downloaded file isn't a valid pdf file.")
-
- def download_url(self, url, filename):
- """Download the content of an url and save it to a filename """
- self.log("Downloading file at url: %s" % url)
- content = self.get_page_with_retry(url)
- with open(filename, "wb") as fid:
- fid.write(content)
-
- def get_page_with_retry(self, url, tries=5):
- count = 0
- while count < tries:
- count += 1
- error = False
- try:
- res = requests.get(url, headers=HEADERS)
- except requests.exceptions.ConnectionError:
- error = True
- if error or not res.ok:
- self.warn("Error getting url %s. Retrying in 5 seconds" % url)
- time.sleep(5)
- continue
- self.log("Downloading url: %s" % url)
- return res.content
-
- def upload_to_rm(self, filepath):
- remarkable_dir = self.remarkable_dir.rstrip("/")
- self.log("Starting upload to reMarkable")
- if remarkable_dir:
- status = subprocess.call(
- [self.rmapi_path, "mkdir", remarkable_dir + "/"],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- exception(
- "Creating directory %s on reMarkable failed"
- % remarkable_dir
- )
- status = subprocess.call(
- [self.rmapi_path, "put", filepath, remarkable_dir + "/"],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- exception("Uploading file %s to reMarkable failed" % filepath)
- self.log("Upload successful.")
-
- def dearxiv(self, input_file):
- """Remove the arXiv timestamp from a pdf"""
- self.log("Removing arXiv timestamp")
- basename = os.path.splitext(input_file)[0]
- uncompress_file = basename + "_uncompress.pdf"
-
- status = subprocess.call(
- [
- self.pdftk_path,
- input_file,
- "output",
- uncompress_file,
- "uncompress",
- ]
- )
- if not status == 0:
- exception("pdftk failed to uncompress the pdf.")
-
- with open(uncompress_file, "rb") as fid:
- data = fid.read()
- # Remove the text element
- data = re.sub(
- b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
- b"()Tj",
- data,
- )
- # Remove the URL element
- data = re.sub(
- b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
- b"",
- data,
- )
-
- removed_file = basename + "_removed.pdf"
- with open(removed_file, "wb") as oid:
- oid.write(data)
-
- output_file = basename + "_dearxiv.pdf"
- status = subprocess.call(
- [self.pdftk_path, removed_file, "output", output_file, "compress"]
- )
- if not status == 0:
- exception("pdftk failed to compress the pdf.")
-
- return output_file
-
- def run(self, src, filename=None):
- info = self.get_paper_info(src)
- clean_filename = self.create_filename(info, filename)
- tmp_filename = "paper.pdf"
-
- self.initial_dir = os.getcwd()
- with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir:
- os.chdir(working_dir)
- self.retrieve_pdf(src, tmp_filename)
- self.check_file_is_pdf(tmp_filename)
-
- ops = [
- self.dearxiv,
- self.crop_pdf,
- self.center_pdf,
- self.blank_pdf,
- self.shrink_pdf,
- ]
- intermediate_fname = tmp_filename
- for op in ops:
- intermediate_fname = op(intermediate_fname)
- shutil.move(intermediate_fname, clean_filename)
-
- if self.debug:
- print("Paused in debug mode in dir: %s" % working_dir)
- print("Press enter to exit.")
- return input()
-
- if self.upload:
- return self.upload_to_rm(clean_filename)
-
- target_path = os.path.join(self.initial_dir, clean_filename)
- while os.path.exists(target_path):
- base = os.path.splitext(target_path)[0]
- target_path = base + "_.pdf"
- shutil.move(clean_filename, target_path)
- return target_path
-
-
-class Arxiv(Provider):
-
- re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
- re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def get_abs_pdf_urls(self, url):
- """Get the pdf and abs url from any given arXiv url """
- if re.match(self.re_abs, url):
- abs_url = url
- pdf_url = url.replace("abs", "pdf") + ".pdf"
- elif re.match(self.re_pdf, url):
- abs_url = url[:-4].replace("pdf", "abs")
- pdf_url = url
- else:
- exception("Couldn't figure out arXiv urls.")
- return abs_url, pdf_url
-
- def validate(src):
- """Check if the url is to an arXiv page. """
- return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src)
-
-
-class Pubmed(Provider):
-
- meta_author_key = "citation_authors"
-
- re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?"
- re_pdf = (
- "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
- )
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def get_abs_pdf_urls(self, url):
- """Get the pdf and html url from a given PMC url """
- if re.match(self.re_pdf, url):
- idx = url.index("pdf")
- abs_url = url[: idx - 1]
- pdf_url = url
- elif re.match(self.re_abs, url):
- abs_url = url
- pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually
- else:
- exception("Couldn't figure out PMC urls.")
- return abs_url, pdf_url
-
- def validate(src):
- return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src)
-
- def _format_authors(self, soup_authors):
- op = lambda x: x[0].split(",")
- return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
-
- def _format_date(self, soup_date):
- if re.match("\w+\ \d{4}", soup_date):
- return soup_date.split(" ")[-1]
- return soup_date.replace(" ", "_")
-
-
-class ACM(Provider):
-
- meta_author_key = "citation_authors"
-
- re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+"
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def get_acm_pdf_url(self, url):
- page = self.get_page_with_retry(url)
- soup = bs4.BeautifulSoup(page, "html.parser")
- thea = None
- for a in soup.find_all("a"):
- if a.get("name") == "FullTextPDF":
- thea = a
- break
- if thea is None:
- return None
- href = thea.get("href")
- if href.startswith("http"):
- return href
- else:
- return "https://dl.acm.org/" + href
-
- def get_abs_pdf_urls(self, url):
- if re.match(self.re_abs, url):
- abs_url = url
- pdf_url = self.get_acm_pdf_url(url)
- if pdf_url is None:
- exception(
- "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
- )
- else:
- exception(
- "Couldn't figure out ACM urls, please provide a URL of the "
- "format: http(s)://dl.acm.org/citation.cfm?id=..."
- )
- return abs_url, pdf_url
-
- def validate(src):
- m = re.fullmatch(ACM.re_abs, src)
- return not m is None
-
- def _format_authors(self, soup_authors):
- op = lambda x: x[0].split(";")
- return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
-
- def _format_date(self, soup_date):
- if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
- self.warn(
- "Couldn't extract year from ACM page, please raise an "
- "issue on GitHub so it can be fixed: %s" % GITHUB_URL
- )
- return soup_date.strip().split("/")[-1]
-
-
-class OpenReview(Provider):
-
- meta_date_key = "citation_publication_date"
-
- re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+"
- re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+"
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def get_abs_pdf_urls(self, url):
- """ Get the pdf and abstract url from a OpenReview url """
- if re.match(self.re_abs, url):
- abs_url = url
- pdf_url = url.replace("forum", "pdf")
- elif re.match(self.re_pdf, url):
- abs_url = url.replace("pdf", "forum")
- pdf_url = url
- else:
- exception("Couldn't figure out OpenReview urls.")
- return abs_url, pdf_url
-
- def validate(src):
- """ Check if the url is a valid OpenReview url. """
- return re.match(OpenReview.re_abs, src) or re.match(
- OpenReview.re_pdf, src
- )
-
- def _format_authors(self, soup_authors):
- return super()._format_authors(soup_authors, sep=" ", idx=-1)
-
-
-class Springer(Provider):
-
- meta_date_key = "citation_online_date"
-
- re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
- re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def get_abs_pdf_urls(self, url):
- """ Get the pdf and abstract urls from a Springer url """
- if re.match(self.re_abs, url):
- abs_url = url
- pdf_url = url.replace("article", "content/pdf")
- elif re.match(self.re_pdf, url):
- abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
- pdf_url = urllib.parse.unquote(url)
- else:
- exception("Couldn't figure out Springer urls.")
- return abs_url, pdf_url
-
- def validate(src):
- return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
-
- def _format_authors(self, soup_authors):
- return super()._format_authors(soup_authors, sep=" ", idx=-1)
-
-
-class LocalFile(Provider):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def validate(src):
- return os.path.exists(src)
-
- def retrieve_pdf(self, src, filename):
- source = os.path.join(self.initial_dir, src)
- shutil.copy(source, filename)
-
- def get_paper_info(self, src):
- return {"filename": src}
-
- def create_filename(self, info, filename=None):
- if not filename is None:
- return filename
- return os.path.basename(info["filename"])
-
-
-class PdfUrl(Provider):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def validate(src):
- try:
- result = urllib.parse.urlparse(src)
- return all([result.scheme, result.netloc, result.path])
- except:
- return False
-
- def retrieve_pdf(self, url, filename):
- self.download_url(url, filename)
-
- def get_paper_info(self, src):
- return None
-
- def create_filename(self, info, filename=None):
- if filename is None:
- exception(
- "Filename must be provided with PDFUrlProvider (use --filename)"
- )
- return filename
-
-
-class Cropper(object):
- def __init__(
- self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
- ):
- if not input_file is None:
- self.input_file = os.path.abspath(input_file)
- self.reader = PyPDF2.PdfFileReader(self.input_file)
- if not output_file is None:
- self.output_file = os.path.abspath(output_file)
- self.pdfcrop_path = pdfcrop_path
-
- self.writer = PyPDF2.PdfFileWriter()
-
- def crop(self, margins=1):
- return self.process_file(self.crop_page, margins=margins)
-
- def center(self, padding=15):
- return self.process_file(self.center_page, padding=padding)
-
- def process_file(self, page_func, *args, **kwargs):
- for page_idx in range(self.reader.getNumPages()):
- status = page_func(page_idx, *args, **kwargs)
- if not status == 0:
- return status
- with open(self.output_file, "wb") as fp:
- self.writer.write(fp)
- return 0
-
- def center_page(self, page_idx, padding):
- return self.process_page(
- page_idx, self.get_center_bbox, padding=padding
- )
-
- def crop_page(self, page_idx, margins):
- return self.process_page(page_idx, self.get_bbox, margins=margins)
-
- def export_page(self, page_idx):
- """Helper function that exports a single page given by index """
- page = self.reader.getPage(page_idx)
- writer = PyPDF2.PdfFileWriter()
- writer.addPage(page)
- tmpfname = "./page.pdf"
- with open(tmpfname, "wb") as fp:
- writer.write(fp)
- return tmpfname
-
- def process_page(self, page_idx, bbox_func, *args, **kwargs):
- """Process a single page and add it to the writer """
- tmpfname = self.export_page(page_idx)
- tmpfout = "./output.pdf"
- bbox = bbox_func(tmpfname, *args, **kwargs)
- status = subprocess.call(
- [
- self.pdfcrop_path,
- "--bbox",
- " ".join(map(str, bbox)),
- tmpfname,
- tmpfout,
- ],
- stdout=subprocess.DEVNULL,
- )
- if not status == 0:
- return status
- reader = PyPDF2.PdfFileReader(tmpfout)
- page = reader.getPage(0)
- self.writer.addPage(page)
- os.unlink(tmpfname)
- os.unlink(tmpfout)
- return 0
-
- def get_bbox(self, filename, margins=1, resolution=72):
- """Get the bounding box, with optional margins
-
- if margins is integer, used for all margins, else
- margins = [left, top, right, bottom]
-
- We get the bounding box by finding the smallest rectangle that is
- completely surrounded by white pixels.
- """
- if isinstance(margins, int):
- margins = [margins for _ in range(4)]
- pdf = pdfplumber.open(filename)
- im = pdf.pages[0].to_image(resolution=resolution)
- pdf.close()
-
- pixels = list(im.original.getdata())
- W, H = im.original.size
-
- # M is a list of H lists with each W integers that equal the sum of the
- # pixel values
- M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
-
- left, top, bottom, right = 0, 0, 0, 0
- while top < H and sum(M[top]) == W * 255 * 3:
- top += 1
- while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3:
- bottom += 1
-
- # Transpose M
- M = list(zip(*M))
- while left < W and sum(M[left]) == H * 255 * 3:
- left += 1
- while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
- right += 1
-
- left -= margins[0]
- top -= margins[1]
- right -= margins[2]
- bottom -= margins[3]
-
- # This is the bounding box in PIL format: (0, 0) top left
- x0, y0, x1, y1 = left, top, W - right, H - bottom
-
- # Get the bbox in Ghostscript format: (0, 0) bottom left
- a0, b0, a1, b1 = x0, H - y1, x1, H - y0
- return [a0, b0, a1, b1]
-
- def get_center_bbox(self, filename, padding=15):
- """Compute a bounding box that will center the page file on the
- reMarkable
- """
- bbox = self.get_bbox(filename, margins=0)
-
- h = bbox[3] - bbox[1]
- w = bbox[2] - bbox[0]
-
- # we want some minimal padding all around, because it is visually more
- # pleasing.
- h_prime = h + 2 * padding
- w_prime = w + 2 * padding
-
- # if the document is wider than the remarkable, we add top-padding to
- # center it, otherwise we add left-padding
- x, y = 0, 0
- if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
- y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
- else:
- x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
-
- margins = [padding + x, padding + y, padding, padding]
- return self.get_bbox(filename, margins=margins)
-
-
-def exception(msg):
- print("ERROR: " + msg, file=sys.stderr)
- print("Error occurred. Exiting.", file=sys.stderr)
- print("", file=sys.stderr)
- print(
- "If you think this might be a bug, please raise an issue on GitHub: %s"
- % GITHUB_URL
- )
- raise SystemExit(1)
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument(
- "-b",
- "--blank",
- help="Add a blank page after every page of the PDF",
- action="store_true",
- )
- parser.add_argument(
- "-v", "--verbose", help="be verbose", action="store_true"
- )
- parser.add_argument(
- "-n",
- "--no-upload",
- help="don't upload to the reMarkable, save the output in current working dir",
- action="store_true",
- )
- parser.add_argument(
- "-d",
- "--debug",
- help="debug mode, doesn't upload to reMarkable",
- action="store_true",
- )
- parser.add_argument(
- "-c",
- "--center",
- help="Center the PDF on the page, instead of left align",
- action="store_true",
- )
- parser.add_argument(
- "--filename",
- help="Filename to use for the file on reMarkable",
- default=None,
- )
- parser.add_argument(
- "-p",
- "--remarkable-path",
- help="directory on reMarkable to put the file (created if missing)",
- dest="remarkable_dir",
- default="/",
- )
- parser.add_argument(
- "--rmapi", help="path to rmapi executable", default="rmapi"
- )
- parser.add_argument(
- "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
- )
- parser.add_argument(
- "--pdftk", help="path to pdftk executable", default="pdftk"
- )
- parser.add_argument("--gs", help="path to gs executable", default="gs")
- parser.add_argument(
- "input", help="URL to a paper or the path of a local PDF file"
- )
- return parser.parse_args()
-
-
-def main():
- args = parse_args()
-
- providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
-
- provider = next((p for p in providers if p.validate(args.input)), None)
- if provider is None:
- exception("Input not valid, no provider can handle this source.")
-
- prov = provider(
- verbose=args.verbose,
- upload=not args.no_upload,
- debug=args.debug,
- center=args.center,
- blank=args.blank,
- remarkable_dir=args.remarkable_dir,
- rmapi_path=args.rmapi,
- pdfcrop_path=args.pdfcrop,
- pdftk_path=args.pdftk,
- gs_path=args.gs,
- )
-
- prov.run(args.input, filename=args.filename)
-
-
-if __name__ == "__main__":
- main()
diff --git a/make_release.py b/make_release.py
new file mode 100644
index 0000000..932209a
--- /dev/null
+++ b/make_release.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Do-nothing script for making a release
+
+This idea comes from here:
+https://blog.danslimmon.com/2019/07/15/do-nothing-scripting-the-key-to-gradual-automation/
+
+Author: Gertjan van den Burg
+Date: 2019-07-23
+
+"""
+
+import colorama
+import os
+
+
+def colored(msg, color=None, style=None):
+ colors = {
+ "red": colorama.Fore.RED,
+ "green": colorama.Fore.GREEN,
+ "cyan": colorama.Fore.CYAN,
+ "yellow": colorama.Fore.YELLOW,
+ "magenta": colorama.Fore.MAGENTA,
+ None: "",
+ }
+ styles = {
+ "bright": colorama.Style.BRIGHT,
+ "dim": colorama.Style.DIM,
+ None: "",
+ }
+ pre = colors[color] + styles[style]
+ post = colorama.Style.RESET_ALL
+ return f"{pre}{msg}{post}"
+
+
+def cprint(msg, color=None, style=None):
+ print(colored(msg, color=color, style=style))
+
+
+def wait_for_enter():
+ input(colored("\nPress Enter to continue", style="dim"))
+ print()
+
+
+def get_package_name():
+ with open("./setup.py", "r") as fp:
+ nameline = next(
+ (l.strip() for l in fp if l.startswith("NAME = ")), None
+ )
+ return nameline.split("=")[-1].strip().strip('"')
+
+
+class Step:
+ def pre(self, context):
+ pass
+
+ def post(self, context):
+ wait_for_enter()
+
+ def run(self, context):
+ try:
+ self.pre(context)
+ self.action(context)
+ self.post(context)
+ except KeyboardInterrupt:
+ cprint("\nInterrupted.", color="red")
+ raise SystemExit(1)
+
+ def instruct(self, msg):
+ cprint(msg, color="green")
+
+ def print_run(self, msg):
+ cprint("Run:", color="cyan", style="bright")
+ self.print_cmd(msg)
+
+ def print_cmd(self, msg):
+ cprint("\t" + msg, color="cyan", style="bright")
+
+ def do_cmd(self, cmd):
+ cprint(f"Going to run: {cmd}", color="magenta", style="bright")
+ wait_for_enter()
+ os.system(cmd)
+
+
+class GitToMaster(Step):
+ def action(self, context):
+ self.instruct("Make sure you're on master and changes are merged in")
+ self.print_run("git checkout master")
+
+
+class UpdateChangelog(Step):
+ def action(self, context):
+ self.instruct(f"Update change log for version {context['version']}")
+ self.print_run("vi CHANGELOG.md")
+
+
+class RunTests(Step):
+ def action(self, context):
+ self.do_cmd("make test")
+
+
+class BumpVersionPackage(Step):
+ def action(self, context):
+ self.instruct(f"Update __version__.py with new version")
+ self.print_run(f"vi {context['pkgname']}/__version__.py")
+
+ def post(self, context):
+ wait_for_enter()
+ context["version"] = self._get_version(context)
+
+ def _get_version(self, context):
+ # Get the version from the version file
+ about = {}
+ with open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp:
+ exec(fp.read(), about)
+ return about["__version__"]
+
+
+class MakeClean(Step):
+ def action(self, context):
+ self.do_cmd("make clean")
+
+
+class MakeDocs(Step):
+ def action(self, context):
+ self.do_cmd("make docs")
+
+
+class MakeDist(Step):
+ def action(self, context):
+ self.do_cmd("make dist")
+
+
+class PushToTestPyPI(Step):
+ def action(self, context):
+ self.do_cmd(
+ "twine upload --repository-url https://test.pypi.org/legacy/ dist/*"
+ )
+
+
+class InstallFromTestPyPI(Step):
+ def action(self, context):
+ self.print_run("cd /tmp/")
+ self.print_cmd("rm -rf ./venv")
+ self.print_cmd("virtualenv ./venv")
+ self.print_cmd("cd ./venv")
+ self.print_cmd("source bin/activate")
+ self.print_cmd(
+ "pip install --index-url https://test.pypi.org/simple/ "
+ + f"--extra-index-url https://pypi.org/simple {context['pkgname']}=={context['version']}"
+ )
+
+
+class TestPackage(Step):
+ def action(self, context):
+ self.instruct(
+ f"Ensure that the following command gives version {context['version']}"
+ )
+ self.print_run(f"{context['pkgname']} -h")
+
+
+class DeactivateVenv(Step):
+ def action(self, context):
+ self.print_run("deactivate")
+ self.instruct("Go back to the project directory")
+
+
+class GitTagVersion(Step):
+ def action(self, context):
+ self.do_cmd(f"git tag v{context['version']}")
+
+
+class GitAdd(Step):
+ def action(self, context):
+ self.instruct("Add everything to git and commit")
+ self.print_run("git gui")
+
+
+class PushToPyPI(Step):
+ def action(self, context):
+ self.do_cmd("twine upload dist/*")
+
+
+class PushToGitHub(Step):
+ def action(self, context):
+ self.do_cmd("git push -u --tags origin master")
+
+
+class WaitForTravis(Step):
+ def action(self, context):
+ self.instruct(
+ "Wait for Travis to complete and verify that its successful"
+ )
+
+
+class WaitForAppVeyor(Step):
+ def action(self, context):
+ self.instruct(
+ "Wait for AppVeyor to complete and verify that its successful"
+ )
+
+
+class WaitForRTD(Step):
+ def action(self, context):
+ self.instruct(
+ "Wait for ReadTheDocs to complete and verify that its successful"
+ )
+
+
+def main():
+ colorama.init()
+ procedure = [
+ GitToMaster(),
+ GitAdd(),
+ PushToGitHub(),
+ BumpVersionPackage(),
+ UpdateChangelog(),
+ MakeClean(),
+ RunTests(),
+ MakeDist(),
+ PushToTestPyPI(),
+ InstallFromTestPyPI(),
+ TestPackage(),
+ DeactivateVenv(),
+ GitAdd(),
+ PushToPyPI(),
+ GitTagVersion(),
+ PushToGitHub(),
+ ]
+ context = {}
+ context["pkgname"] = get_package_name()
+ for step in procedure:
+ step.run(context)
+ cprint("\nDone!", color="yellow", style="bright")
+
+
+if __name__ == "__main__":
+ main()
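Each release task above is a small ``Step`` subclass whose ``action`` either instructs the maintainer or runs a shell command after confirmation, and the checklist is just the ordered ``procedure`` list in ``main()``. A sketch of how a hypothetical extra step could be slotted in after ``BumpVersionPackage``, so that ``context['version']`` is already available:

```python
class CheckChangelogEntry(Step):
    """Hypothetical extra step: confirm the changelog mentions the new version."""

    def action(self, context):
        # instruct() prints a green prompt; print_run() suggests a command to run
        self.instruct(
            f"Check that CHANGELOG.md has an entry for version {context['version']}"
        )
        self.print_run("head CHANGELOG.md")
```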
diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py
new file mode 100644
index 0000000..113fc83
--- /dev/null
+++ b/paper2remarkable/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+from .__version__ import __version__
+
+GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"
diff --git a/paper2remarkable/__main__.py b/paper2remarkable/__main__.py
new file mode 100644
index 0000000..b97d538
--- /dev/null
+++ b/paper2remarkable/__main__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+"""
+Caller for the command line application
+"""
+
+import sys
+
+def main():
+ from .ui import main as realmain
+
+ sys.exit(realmain())
+
+if __name__ == '__main__':
+ main()
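Since ``__main__.py`` simply forwards to ``ui.main``, the package can presumably also be invoked with ``python -m paper2remarkable`` in addition to the ``p2r`` entry point.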
diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py
new file mode 100644
index 0000000..5bee2af
--- /dev/null
+++ b/paper2remarkable/__version__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+VERSION = (0, 4, 0)
+
+__version__ = '.'.join(map(str, VERSION))
diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py
new file mode 100644
index 0000000..b25b178
--- /dev/null
+++ b/paper2remarkable/crop.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+
+"""Code for cropping a PDF file
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import PyPDF2
+import os
+import subprocess
+import pdfplumber
+
+RM_WIDTH = 1404
+RM_HEIGHT = 1872
+
+
+class Cropper(object):
+ def __init__(
+ self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
+ ):
+ if not input_file is None:
+ self.input_file = os.path.abspath(input_file)
+ self.reader = PyPDF2.PdfFileReader(self.input_file)
+ if not output_file is None:
+ self.output_file = os.path.abspath(output_file)
+ self.pdfcrop_path = pdfcrop_path
+
+ self.writer = PyPDF2.PdfFileWriter()
+
+ def crop(self, margins=1):
+ return self.process_file(self.crop_page, margins=margins)
+
+ def center(self, padding=15):
+ return self.process_file(self.center_page, padding=padding)
+
+ def process_file(self, page_func, *args, **kwargs):
+ for page_idx in range(self.reader.getNumPages()):
+ status = page_func(page_idx, *args, **kwargs)
+ if not status == 0:
+ return status
+ with open(self.output_file, "wb") as fp:
+ self.writer.write(fp)
+ return 0
+
+ def center_page(self, page_idx, padding):
+ return self.process_page(
+ page_idx, self.get_center_bbox, padding=padding
+ )
+
+ def crop_page(self, page_idx, margins):
+ return self.process_page(page_idx, self.get_bbox, margins=margins)
+
+ def export_page(self, page_idx):
+ """Helper function that exports a single page given by index """
+ page = self.reader.getPage(page_idx)
+ writer = PyPDF2.PdfFileWriter()
+ writer.addPage(page)
+ tmpfname = "./page.pdf"
+ with open(tmpfname, "wb") as fp:
+ writer.write(fp)
+ return tmpfname
+
+ def process_page(self, page_idx, bbox_func, *args, **kwargs):
+ """Process a single page and add it to the writer """
+ tmpfname = self.export_page(page_idx)
+ tmpfout = "./output.pdf"
+ bbox = bbox_func(tmpfname, *args, **kwargs)
+ status = subprocess.call(
+ [
+ self.pdfcrop_path,
+ "--bbox",
+ " ".join(map(str, bbox)),
+ tmpfname,
+ tmpfout,
+ ],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ return status
+ reader = PyPDF2.PdfFileReader(tmpfout)
+ page = reader.getPage(0)
+ self.writer.addPage(page)
+ os.unlink(tmpfname)
+ os.unlink(tmpfout)
+ return 0
+
+ def get_bbox(self, filename, margins=1, resolution=72):
+ """Get the bounding box, with optional margins
+
+ if margins is integer, used for all margins, else
+ margins = [left, top, right, bottom]
+
+ We get the bounding box by finding the smallest rectangle that is
+ completely surrounded by white pixels.
+ """
+ if isinstance(margins, int):
+ margins = [margins for _ in range(4)]
+ pdf = pdfplumber.open(filename)
+ im = pdf.pages[0].to_image(resolution=resolution)
+ pdf.close()
+
+ pixels = list(im.original.getdata())
+ W, H = im.original.size
+
+ # M is a list of H lists with each W integers that equal the sum of the
+ # pixel values
+ M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)]
+
+ left, top, bottom, right = 0, 0, 0, 0
+ while top < H and sum(M[top]) == W * 255 * 3:
+ top += 1
+ while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3:
+ bottom += 1
+
+ # Transpose M
+ M = list(zip(*M))
+ while left < W and sum(M[left]) == H * 255 * 3:
+ left += 1
+ while right < W and sum(M[W - 1 - right]) == H * 255 * 3:
+ right += 1
+
+ left -= margins[0]
+ top -= margins[1]
+ right -= margins[2]
+ bottom -= margins[3]
+
+ # This is the bounding box in PIL format: (0, 0) top left
+ x0, y0, x1, y1 = left, top, W - right, H - bottom
+
+ # Get the bbox in Ghostscript format: (0, 0) bottom left
+ a0, b0, a1, b1 = x0, H - y1, x1, H - y0
+ return [a0, b0, a1, b1]
+
+ def get_center_bbox(self, filename, padding=15):
+ """Compute a bounding box that will center the page file on the
+ reMarkable
+ """
+ bbox = self.get_bbox(filename, margins=0)
+
+ h = bbox[3] - bbox[1]
+ w = bbox[2] - bbox[0]
+
+ # we want some minimal padding all around, because it is visually more
+ # pleasing.
+ h_prime = h + 2 * padding
+ w_prime = w + 2 * padding
+
+ # if the document is wider than the remarkable, we add top-padding to
+ # center it, otherwise we add left-padding
+ x, y = 0, 0
+ if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
+ y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
+ else:
+ x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2
+
+ margins = [padding + x, padding + y, padding, padding]
+ return self.get_bbox(filename, margins=margins)
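``Cropper`` exports each page, computes a whitespace bounding box with pdfplumber, and reassembles the document by running the external ``pdfcrop`` tool per page. A minimal usage sketch (the file names are hypothetical and ``pdfcrop`` must be installed):

```python
from paper2remarkable.crop import Cropper

# Crop "input.pdf" with a margin of 15 (pixels at the default 72 dpi render)
# and write the result to "cropped.pdf". crop() returns 0 on success,
# mirroring the pdfcrop exit status.
cropper = Cropper("input.pdf", "cropped.pdf", pdfcrop_path="pdfcrop")
if cropper.crop(margins=15) != 0:
    print("pdfcrop failed; keep using the original file")
```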
diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py
new file mode 100644
index 0000000..bae1cbf
--- /dev/null
+++ b/paper2remarkable/log.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+"""Just a simple logger
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+# NOTE: I know about the logging module, but this was easier because one of the
+# dependencies was using that and it became complicated. This one is obviously
+# not thread-safe and is very simple.
+
+import datetime
+import sys
+
+
+class Singleton(type):
+ # https://stackoverflow.com/q/6760685
+ _instances = {}
+
+ def __call__(cls, *args, **kwargs):
+ if cls not in cls._instances:
+ cls._instances[cls] = super(Singleton, cls).__call__(
+ *args, **kwargs
+ )
+ return cls._instances[cls]
+
+
+class Logger(metaclass=Singleton):
+ def __init__(self):
+ self.enabled = True
+
+ def enable(self):
+ self.enabled = True
+
+ def disable(self):
+ self.enabled = False
+
+ def _log(self, msg, mode):
+ if not self.enabled:
+ return
+ if not mode in ("info", "warn"):
+ raise ValueError("Unknown logging mode: %s" % mode)
+ file = sys.stdout if mode == "info" else sys.stderr
+ now = datetime.datetime.now()
+ nowstr = now.strftime("%Y-%m-%d %H:%M:%S")
+ print("%s - %s - %s" % (nowstr, mode.upper(), msg), file=file)
+ file.flush()
+
+ def info(self, msg):
+ self._log(msg, "info")
+
+ def warning(self, msg):
+ self._log(msg, "warn")
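Because ``Logger`` uses the ``Singleton`` metaclass, every module that constructs ``Logger()`` gets the same instance, so disabling it once (as the providers do when verbose mode is off) silences output everywhere. A small sketch of that behaviour:

```python
from paper2remarkable.log import Logger

a = Logger()
b = Logger()
assert a is b  # the Singleton metaclass returns the same instance

a.info("printed to stdout with a timestamp")
a.disable()                   # e.g. when --verbose is not given
b.info("suppressed: b shares the disabled state")
b.warning("also suppressed")  # warnings go to stderr when enabled
```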
diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py
new file mode 100644
index 0000000..8636017
--- /dev/null
+++ b/paper2remarkable/pdf_ops.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+"""Operations on PDF files
+
+Author: G.J.J. van den Burg
+License: See LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+
+import PyPDF2
+import os
+import subprocess
+
+from .crop import Cropper
+from .log import Logger
+
+logger = Logger()
+
+def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
+ """Crop the pdf file using Cropper
+ """
+ logger.info("Cropping pdf file")
+ cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
+
+ cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path)
+ status = cropper.crop(margins=15)
+
+ if not status == 0:
+ logger.warning("Failed to crop the pdf file at: %s" % filepath)
+ return filepath
+ if not os.path.exists(cropped_file):
+ logger.warning(
+ "Can't find cropped file '%s' where expected." % cropped_file
+ )
+ return filepath
+ return cropped_file
+
+
+def center_pdf(filepath, pdfcrop_path="pdfcrop"):
+ """Center the pdf file on the reMarkable
+ """
+ logger.info("Centering pdf file")
+ centered_file = os.path.splitext(filepath)[0] + "-center.pdf"
+
+ cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path)
+ status = cropper.center()
+
+ if not status == 0:
+ logger.warning("Failed to center the pdf file at: %s" % filepath)
+ return filepath
+ if not os.path.exists(centered_file):
+ logger.warning(
+ "Can't find centered file '%s' where expected." % centered_file
+ )
+ return filepath
+ return centered_file
+
+
+def blank_pdf(filepath):
+ """Add blank pages to PDF
+ """
+ logger.info("Adding blank pages")
+ input_pdf = PyPDF2.PdfFileReader(filepath)
+ output_pdf = PyPDF2.PdfFileWriter()
+ for page in input_pdf.pages:
+ output_pdf.addPage(page)
+ output_pdf.addBlankPage()
+
+ output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
+ with open(output_file, "wb") as fp:
+ output_pdf.write(fp)
+ return output_file
+
+
+def shrink_pdf(filepath, gs_path="gs"):
+ """Shrink the PDF file size using Ghostscript
+ """
+ logger.info("Shrinking pdf file")
+ output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
+ status = subprocess.call(
+ [
+ gs_path,
+ "-sDEVICE=pdfwrite",
+ "-dCompatibilityLevel=1.4",
+ "-dPDFSETTINGS=/printer",
+ "-dNOPAUSE",
+ "-dBATCH",
+ "-dQUIET",
+ "-sOutputFile=%s" % output_file,
+ filepath,
+ ]
+ )
+ if not status == 0:
+ logger.warning("Failed to shrink the pdf file")
+ return filepath
+ return output_file
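Each function in ``pdf_ops`` takes a file path and returns the path of the processed file, falling back to the input path when the external tool fails, so the operations chain naturally. A minimal sketch, assuming ``pdfcrop`` and ``gs`` are installed and a ``paper.pdf`` exists in the working directory:

```python
from paper2remarkable.pdf_ops import crop_pdf, blank_pdf, shrink_pdf

fname = "paper.pdf"        # hypothetical input file
fname = crop_pdf(fname)    # -> paper-crop.pdf (or the input path on failure)
fname = blank_pdf(fname)   # -> ...-blank.pdf, a blank page after every page
fname = shrink_pdf(fname)  # -> ...-shrink.pdf, compressed via Ghostscript
print("final file:", fname)
```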
diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py
new file mode 100644
index 0000000..f6f93f9
--- /dev/null
+++ b/paper2remarkable/providers/__init__.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+from .arxiv import Arxiv
+from .pubmed import PubMed
+from .acm import ACM
+from .openreview import OpenReview
+from .springer import Springer
+from .local import LocalFile
+from .pdf_url import PdfUrl
+
+providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl]
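The ``providers`` list makes source dispatch a one-liner: each class exposes a ``validate`` check, so a URL or path can be matched to the first provider that accepts it, much like the old ``main()`` did. A minimal sketch (the URL is only an example):

```python
from paper2remarkable.providers import providers

src = "https://arxiv.org/abs/1811.11242"
provider_cls = next((p for p in providers if p.validate(src)), None)
if provider_cls is None:
    raise ValueError("no provider can handle this source")

# upload=False keeps the processed PDF in the current directory
prov = provider_cls(verbose=True, upload=False)
prov.run(src)
```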
diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py
new file mode 100644
index 0000000..bdc9558
--- /dev/null
+++ b/paper2remarkable/providers/_base.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+
+"""Base for the Provider class
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import abc
+import os
+import shutil
+import tempfile
+
+from ._info import Informer
+from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf
+from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable
+from ..log import Logger
+
+logger = Logger()
+
+
+class Provider(metaclass=abc.ABCMeta):
+ """ ABC for providers of pdf sources """
+
+ def __init__(
+ self,
+ verbose=False,
+ upload=True,
+ debug=False,
+ center=False,
+ blank=False,
+ remarkable_dir="/",
+ rmapi_path="rmapi",
+ pdfcrop_path="pdfcrop",
+ pdftk_path="pdftk",
+ gs_path="gs",
+ ):
+ self.upload = upload
+ self.debug = debug
+ self.remarkable_dir = remarkable_dir
+ self.rmapi_path = rmapi_path
+ self.pdfcrop_path = pdfcrop_path
+ self.pdftk_path = pdftk_path
+ self.gs_path = gs_path
+ self.informer = Informer()
+
+ # disable logging if requested
+ if not verbose:
+ logger.disable()
+
+ # Define the operations to run on the pdf. Providers can add others.
+ self.operations = [("crop", self.crop_pdf)]
+ if center:
+ self.operations.append(("center", self.center_pdf))
+
+ if blank:
+ self.operations.append(("blank", blank_pdf))
+ self.operations.append(("shrink", self.shrink_pdf))
+
+ logger.info("Starting %s" % type(self).__name__)
+
+ @staticmethod
+ @abc.abstractmethod
+ def validate(src):
+ """ Validate whether ``src`` is appropriate for this provider """
+
+ # Wrappers for pdf operations that have additional arguments
+ def crop_pdf(self, filepath):
+ return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+
+ def center_pdf(self, filepath):
+ return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path)
+
+ def shrink_pdf(self, filepath):
+ return shrink_pdf(filepath, gs_path=self.gs_path)
+
+ def retrieve_pdf(self, pdf_url, filename):
+ """ Download pdf from src and save to filename """
+ # This must exist so that the LocalFile provider can overwrite it
+ download_url(pdf_url, filename)
+
+ def run(self, src, filename=None):
+ abs_url, pdf_url = self.get_abs_pdf_urls(src)
+ clean_filename = filename or self.informer.get_filename(abs_url)
+ tmp_filename = "paper.pdf"
+
+ self.initial_dir = os.getcwd()
+ with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir:
+ os.chdir(working_dir)
+ self.retrieve_pdf(pdf_url, tmp_filename)
+ assert_file_is_pdf(tmp_filename)
+
+ intermediate_fname = tmp_filename
+ for opname, op in self.operations:
+ intermediate_fname = op(intermediate_fname)
+ shutil.move(intermediate_fname, clean_filename)
+
+ if self.debug:
+ print("Paused in debug mode in dir: %s" % working_dir)
+ print("Press enter to exit.")
+ return input()
+
+ if self.upload:
+ return upload_to_remarkable(
+ clean_filename,
+ remarkable_dir=self.remarkable_dir,
+ rmapi_path=self.rmapi_path,
+ )
+
+ target_path = os.path.join(self.initial_dir, clean_filename)
+ while os.path.exists(target_path):
+ base = os.path.splitext(target_path)[0]
+ target_path = base + "_.pdf"
+ shutil.move(clean_filename, target_path)
+ return target_path
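``Provider.run`` drives the whole pipeline: it resolves the abstract/PDF URLs, builds a filename through the ``Informer``, applies the ``self.operations`` list (crop, optional center/blank, then shrink) inside a temporary directory, and finally uploads or saves the result. New sources only need to subclass it; a hypothetical provider might look like:

```python
import re

from paper2remarkable.providers._base import Provider
from paper2remarkable.providers._info import Informer


class ExampleProvider(Provider):
    """Hypothetical provider for an imaginary 'example.org' outlet."""

    re_abs = r"https?://example\.org/abs/\d+"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.informer = Informer()

    def get_abs_pdf_urls(self, url):
        # run() expects (abstract_page_url, pdf_url)
        return url, url + ".pdf"

    def validate(src):
        # called on the class, in the same style as the other providers
        return re.match(ExampleProvider.re_abs, src)
```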
diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py
new file mode 100644
index 0000000..0b28658
--- /dev/null
+++ b/paper2remarkable/providers/_info.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+"""Functionality for retrieving paper info
+"""
+
+import titlecase
+import unidecode
+import bs4
+
+from ..utils import clean_string, get_page_with_retry
+from ..log import Logger
+
+logger = Logger()
+
+
+class Informer:
+ """Base class for the informers.
+
+ The "informer" class is used to retrieve the title, authors, and year of
+ publication of the provided paper.
+
+ This base class provides the main functionality, but because various
+ outlets use different conventions to embed author, title, and publication
+ year information, we expect that individual providers will subclass this
+ class and override some of the methods.
+ """
+
+ meta_author_key = "citation_author"
+ meta_title_key = "citation_title"
+ meta_date_key = "citation_date"
+
+ def __init__(self, title=None, authors=None, year=None):
+ self.title = title
+ self.authors = authors or []
+ self.year = year
+
+ def get_filename(self, abs_url):
+ """ Generate nice filename using the paper information
+
+ The provided url must be to a HTMl page where this information can be
+ found, not to the PDF file itself.
+ """
+ logger.info("Generating output filename")
+
+ # Retrieve the paper information
+ self.get_info(abs_url)
+
+ # we assume that the list of authors is surname only.
+ if len(self.authors) > 3:
+ authors = self.authors[0] + "_et_al"
+ else:
+ authors = "_".join(self.authors)
+ authors = clean_string(authors)
+
+ # Clean the title and make it titlecase
+ title = clean_string(self.title)
+ title = titlecase.titlecase(title)
+ title = title.replace(" ", "_")
+ title = clean_string(title)
+
+ year = str(self.year)
+
+ name = authors + "_-_" + title + "_" + year + ".pdf"
+ name = unidecode.unidecode(name)
+ logger.info("Created filename: %s" % name)
+ return name
+
+ def get_info(self, url):
+ logger.info("Getting paper info")
+ page = get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ self.authors = self.authors or self.get_authors(soup)
+ self.title = self.title or self.get_title(soup)
+ self.year = self.year or self.get_year(soup)
+
+ ## Title
+
+ def get_title(self, soup):
+ target = soup.find_all("meta", {"name": self.meta_title_key})
+ return target[0]["content"]
+
+ ## Authors
+
+ def get_authors(self, soup):
+ authors = [
+ x["content"]
+ for x in soup.find_all("meta", {"name": self.meta_author_key})
+ ]
+ return self._format_authors(authors)
+
+ def _format_authors(self, soup_authors, sep=",", idx=0, op=None):
+ op = (lambda x: x) if op is None else op
+ # format the author list retrieved by bs4
+ return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)]
+
+ ## Year
+
+ def _format_year(self, soup_date):
+ return soup_date.split("/")[0]
+
+ def get_year(self, soup):
+ """ Retrieve the contents of the meta_date_key field and format it """
+ date = soup.find_all("meta", {"name": self.meta_date_key})[0][
+ "content"
+ ]
+ return self._format_year(date)
diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py
new file mode 100644
index 0000000..a0d79bd
--- /dev/null
+++ b/paper2remarkable/providers/acm.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for ACM
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import bs4
+import re
+
+from ._base import Provider
+from ._info import Informer
+from .. import GITHUB_URL
+from ..utils import exception, get_page_with_retry
+from ..log import Logger
+
+logger = Logger()
+
+
+class ACMInformer(Informer):
+ meta_author_key = "citation_authors"
+
+ def _format_authors(self, soup_authors):
+ op = lambda x: x[0].split(";")
+ return super()._format_authors(soup_authors, sep=",", idx=0, op=op)
+
+ def _format_year(self, soup_date):
+ if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()):
+ logger.warning(
+ "Couldn't extract year from ACM page, please raise an "
+ "issue on GitHub so it can be fixed: %s" % GITHUB_URL
+ )
+ return soup_date.strip().split("/")[-1]
+
+
+class ACM(Provider):
+
+ re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = ACMInformer()
+
+ def get_acm_pdf_url(self, url):
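+        """Extract the PDF link from an ACM citation page.
+
+        The page marks the full-text link with an anchor named "FullTextPDF";
+        if no such anchor is found this returns None.
+        """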
+ page = get_page_with_retry(url)
+ soup = bs4.BeautifulSoup(page, "html.parser")
+ thea = None
+ for a in soup.find_all("a"):
+ if a.get("name") == "FullTextPDF":
+ thea = a
+ break
+ if thea is None:
+ return None
+ href = thea.get("href")
+ if href.startswith("http"):
+ return href
+ else:
+ return "https://dl.acm.org/" + href
+
+ def get_abs_pdf_urls(self, url):
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = self.get_acm_pdf_url(url)
+ if pdf_url is None:
+ exception(
+ "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?"
+ )
+ else:
+ exception(
+ "Couldn't figure out ACM urls, please provide a URL of the "
+ "format: http(s)://dl.acm.org/citation.cfm?id=..."
+ )
+ return abs_url, pdf_url
+
+ def validate(src):
+ m = re.fullmatch(ACM.re_abs, src)
+        return m is not None
diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py
new file mode 100644
index 0000000..e022658
--- /dev/null
+++ b/paper2remarkable/providers/arxiv.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for arxiv.org
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import os
+import re
+import subprocess
+
+from ._info import Informer
+from ._base import Provider
+from ..utils import exception
+from ..log import Logger
+
+logger = Logger()
+
+
+class ArxivInformer(Informer):
+ pass
+
+
+class Arxiv(Provider):
+
+ re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
+ re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = ArxivInformer()
+
+ # register the dearxiv operation
+ self.operations.insert(0, ("dearxiv", self.dearxiv))
+
+ def get_abs_pdf_urls(self, url):
+ """Get the pdf and abs url from any given arXiv url """
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = url.replace("abs", "pdf") + ".pdf"
+ elif re.match(self.re_pdf, url):
+ abs_url = url[:-4].replace("pdf", "abs")
+ pdf_url = url
+ else:
+ exception("Couldn't figure out arXiv urls.")
+ return abs_url, pdf_url
+
+ def validate(src):
+ """Check if the url is to an arXiv page. """
+ return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src)
+
+ def dearxiv(self, input_file):
+ """Remove the arXiv timestamp from a pdf"""
+ logger.info("Removing arXiv timestamp")
+ basename = os.path.splitext(input_file)[0]
+ uncompress_file = basename + "_uncompress.pdf"
+
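+        # pdftk's "uncompress" rewrites the PDF with plain (uncompressed)
+        # content streams, so the regexes below can find and strip the
+        # arXiv stamp and its link annotation.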
+ status = subprocess.call(
+ [
+ self.pdftk_path,
+ input_file,
+ "output",
+ uncompress_file,
+ "uncompress",
+ ]
+ )
+ if not status == 0:
+ exception("pdftk failed to uncompress the pdf.")
+
+ with open(uncompress_file, "rb") as fid:
+ data = fid.read()
+ # Remove the text element
+ data = re.sub(
+ b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
+ b"()Tj",
+ data,
+ )
+ # Remove the URL element
+ data = re.sub(
+ b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n",
+ b"",
+ data,
+ )
+
+ removed_file = basename + "_removed.pdf"
+ with open(removed_file, "wb") as oid:
+ oid.write(data)
+
+ output_file = basename + "_dearxiv.pdf"
+ status = subprocess.call(
+ [self.pdftk_path, removed_file, "output", output_file, "compress"]
+ )
+ if not status == 0:
+ exception("pdftk failed to compress the pdf.")
+
+ return output_file
diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py
new file mode 100644
index 0000000..3f581b2
--- /dev/null
+++ b/paper2remarkable/providers/local.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for local files
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import os
+import shutil
+
+from ._base import Provider
+from ._info import Informer
+
+
+class LocalFileInformer(Informer):
+ def get_filename(self, abs_url):
+ return os.path.basename(abs_url)
+
+
+class LocalFile(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = LocalFileInformer()
+
+ def get_abs_pdf_urls(self, url):
+ # The 'url' is the path to the local file. We use this as abs_url and
+ # pdf_url.
+ return url, url
+
+ def validate(src):
+ return os.path.exists(src)
+
+ def retrieve_pdf(self, pdf_url, filename):
+ source = os.path.join(self.initial_dir, pdf_url)
+ shutil.copy(source, filename)
diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py
new file mode 100644
index 0000000..bfb139d
--- /dev/null
+++ b/paper2remarkable/providers/openreview.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for OpenReview
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class OpenReviewInformer(Informer):
+
+ meta_date_key = "citation_publication_date"
+
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class OpenReview(Provider):
+
+ re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+"
+ re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = OpenReviewInformer()
+
+ def get_abs_pdf_urls(self, url):
+ """ Get the pdf and abstract url from a OpenReview url """
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = url.replace("forum", "pdf")
+ elif re.match(self.re_pdf, url):
+ abs_url = url.replace("pdf", "forum")
+ pdf_url = url
+ else:
+ exception("Couldn't figure out OpenReview urls.")
+ return abs_url, pdf_url
+
+ def validate(src):
+ """ Check if the url is a valid OpenReview url. """
+ return re.match(OpenReview.re_abs, src) or re.match(
+ OpenReview.re_pdf, src
+ )
diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py
new file mode 100644
index 0000000..d80b1a9
--- /dev/null
+++ b/paper2remarkable/providers/pdf_url.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for generic PDF url
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import urllib
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class PdfUrlInformer(Informer):
+ def get_filename(self, abs_url):
+        # if we end up here, no filename was provided on the command line
+ exception(
+ "Filename must be provided with PDFUrlProvider (use --filename)"
+ )
+
+
+class PdfUrl(Provider):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = PdfUrlInformer()
+
+ def get_abs_pdf_urls(self, url):
+ return (None, url)
+
+ def validate(src):
+ try:
+ result = urllib.parse.urlparse(src)
+ return all([result.scheme, result.netloc, result.path])
+        except Exception:
+ return False
diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py
new file mode 100644
index 0000000..ba4cca0
--- /dev/null
+++ b/paper2remarkable/providers/pubmed.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for PubMed
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class PubMedInformer(Informer):
+
+ meta_author_key = "citation_authors"
+
+ def _format_authors(self, soup_authors):
+ op = lambda x: x[0].split(",")
+ return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op)
+
+ def _format_year(self, soup_date):
+ if re.match("\w+\ \d{4}", soup_date):
+ return soup_date.split(" ")[-1]
+ return soup_date.replace(" ", "_")
+
+
+class PubMed(Provider):
+
+ re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?"
+ re_pdf = (
+ "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
+ )
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = PubMedInformer()
+
+ def get_abs_pdf_urls(self, url):
+ """Get the pdf and html url from a given PMC url """
+ if re.match(self.re_pdf, url):
+ idx = url.index("pdf")
+ abs_url = url[: idx - 1]
+ pdf_url = url
+ elif re.match(self.re_abs, url):
+ abs_url = url
+            pdf_url = url.rstrip("/") + "/pdf"  # PMC usually redirects this to the full PDF
+ else:
+ exception("Couldn't figure out PMC urls.")
+ return abs_url, pdf_url
+
+ def validate(src):
+ return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src)
diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py
new file mode 100644
index 0000000..ce4acdd
--- /dev/null
+++ b/paper2remarkable/providers/springer.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+"""Provider for Springer
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import re
+import urllib
+
+from ._base import Provider
+from ._info import Informer
+from ..utils import exception
+
+
+class SpringerInformer(Informer):
+
+ meta_date_key = "citation_online_date"
+
+ def _format_authors(self, soup_authors):
+ return super()._format_authors(soup_authors, sep=" ", idx=-1)
+
+
+class Springer(Provider):
+
+ re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
+ re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.informer = SpringerInformer()
+
+ def get_abs_pdf_urls(self, url):
+ """ Get the pdf and abstract urls from a Springer url """
+ if re.match(self.re_abs, url):
+ abs_url = url
+ pdf_url = url.replace("article", "content/pdf")
+ elif re.match(self.re_pdf, url):
+ abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
+ pdf_url = urllib.parse.unquote(url)
+ else:
+ exception("Couldn't figure out Springer urls.")
+ return abs_url, pdf_url
+
+ def validate(src):
+ return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py
new file mode 100644
index 0000000..5323996
--- /dev/null
+++ b/paper2remarkable/ui.py
@@ -0,0 +1,107 @@
+# -*- coding: utf-8 -*-
+
+"""Command line interface
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import argparse
+
+from . import __version__
+
+from .providers import providers
+from .utils import exception
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="Paper2reMarkable version %s" % __version__
+ )
+ parser.add_argument(
+ "-b",
+ "--blank",
+ help="Add a blank page after every page of the PDF",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-c",
+ "--center",
+ help="Center the PDF on the page, instead of left align",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-d",
+ "--debug",
+ help="debug mode, doesn't upload to reMarkable",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-n",
+ "--no-upload",
+ help="don't upload to the reMarkable, save the output in current working dir",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-p",
+ "--remarkable-path",
+ help="directory on reMarkable to put the file (created if missing, default: /)",
+ dest="remarkable_dir",
+ default="/",
+ )
+ parser.add_argument(
+ "-v", "--verbose", help="be verbose", action="store_true"
+ )
+ parser.add_argument(
+ "--filename",
+ help="Filename to use for the file on reMarkable",
+ default=None,
+ )
+ parser.add_argument(
+ "--gs", help="path to gs executable (default: gs)", default="gs"
+ )
+ parser.add_argument(
+ "--pdfcrop",
+ help="path to pdfcrop executable (default: pdfcrop)",
+ default="pdfcrop",
+ )
+ parser.add_argument(
+ "--pdftk",
+ help="path to pdftk executable (default: pdftk)",
+ default="pdftk",
+ )
+ parser.add_argument(
+ "--rmapi",
+ help="path to rmapi executable (default: rmapi)",
+ default="rmapi",
+ )
+ parser.add_argument(
+ "input",
+ help="URL to a paper or the path of a local PDF file",
+ )
+ return parser.parse_args()
+
+
+def main():
+ args = parse_args()
+
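+    # Use the first provider class that recognizes the input URL or file path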
+ provider = next((p for p in providers if p.validate(args.input)), None)
+ if provider is None:
+ exception("Input not valid, no provider can handle this source.")
+
+ prov = provider(
+ verbose=args.verbose,
+ upload=not args.no_upload,
+ debug=args.debug,
+ center=args.center,
+ blank=args.blank,
+ remarkable_dir=args.remarkable_dir,
+ rmapi_path=args.rmapi,
+ pdfcrop_path=args.pdfcrop,
+ pdftk_path=args.pdftk,
+ gs_path=args.gs,
+ )
+
+ prov.run(args.input, filename=args.filename)
diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py
new file mode 100644
index 0000000..a313ffe
--- /dev/null
+++ b/paper2remarkable/utils.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+"""Utility functions for a2r
+
+Author: G.J.J. van den Burg
+License: See LICENSE file
+Copyright: 2019, G.J.J. van den Burg
+
+"""
+
+import PyPDF2
+import requests
+import string
+import subprocess
+import sys
+import time
+import unidecode
+
+from . import GITHUB_URL
+from .log import Logger
+
+HEADERS = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+ "Safari/537.36"
+}
+
+
+logger = Logger()
+
+
+def exception(msg):
+ print("ERROR: " + msg, file=sys.stderr)
+ print("Error occurred. Exiting.", file=sys.stderr)
+ print("", file=sys.stderr)
+    print(
+        "If you think this might be a bug, please raise an issue on GitHub: %s"
+        % GITHUB_URL,
+        file=sys.stderr,
+    )
+ raise SystemExit(1)
+
+
+def clean_string(s):
+ """ Clean a string by replacing accented characters with equivalents and
+ keeping only the allowed characters (ascii letters, digits, underscore,
+ space, dash, and period)"""
+ normalized = unidecode.unidecode(s)
+ allowed = string.ascii_letters + string.digits + "_ .-"
+ cleaned = "".join(c if c in allowed else "_" for c in normalized)
+ while "__" in cleaned:
+ cleaned = cleaned.replace("__", "_")
+ return cleaned
+
+
+def assert_file_is_pdf(filename):
+ """Assert that a given file is a PDF file.
+
+ This is done by trying to open it using PyPDF2.
+ """
+ try:
+ fp = open(filename, "rb")
+ pdf = PyPDF2.PdfFileReader(fp, strict=False)
+ fp.close()
+ del pdf
+ return True
+ except PyPDF2.utils.PdfReadError:
+ exception("File %s isn't a valid pdf file." % filename)
+
+
+def download_url(url, filename):
+ """Download the content of an url and save it to a filename """
+ logger.info("Downloading file at url: %s" % url)
+ content = get_page_with_retry(url)
+ with open(filename, "wb") as fid:
+ fid.write(content)
+
+
+def get_page_with_retry(url, tries=5):
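+    """Fetch a url, retrying on connection errors or non-OK responses.
+
+    Waits 5 seconds between attempts and returns the page content, or None if
+    all ``tries`` attempts fail.
+    """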
+ count = 0
+ while count < tries:
+ count += 1
+ error = False
+ try:
+ res = requests.get(url, headers=HEADERS)
+ except requests.exceptions.ConnectionError:
+ error = True
+ if error or not res.ok:
+ logger.warning(
+ "(%i/%i) Error getting url %s. Retrying in 5 seconds."
+ % (count, tries, url)
+ )
+ time.sleep(5)
+ continue
+ logger.info("Downloading url: %s" % url)
+ return res.content
+
+
+def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"):
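+    """Upload a file to the reMarkable using rmapi.
+
+    When ``remarkable_dir`` is not the root directory, it is created on the
+    device before the file is uploaded.
+    """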
+ logger.info("Starting upload to reMarkable")
+
+ # Create the reMarkable dir if it doesn't exist
+ remarkable_dir = remarkable_dir.rstrip("/")
+ if remarkable_dir:
+ status = subprocess.call(
+ [rmapi_path, "mkdir", remarkable_dir + "/"],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ exception(
+ "Creating directory %s on reMarkable failed" % remarkable_dir
+ )
+
+ # Upload the file
+ status = subprocess.call(
+ [rmapi_path, "put", filepath, remarkable_dir + "/"],
+ stdout=subprocess.DEVNULL,
+ )
+ if not status == 0:
+ exception("Uploading file %s to reMarkable failed" % filepath)
+ logger.info("Upload successful.")
diff --git a/poetry.lock b/poetry.lock
deleted file mode 100644
index 272967c..0000000
--- a/poetry.lock
+++ /dev/null
@@ -1,183 +0,0 @@
-[[package]]
-category = "main"
-description = "Screen-scraping library"
-name = "beautifulsoup4"
-optional = false
-python-versions = "*"
-version = "4.7.1"
-
-[package.dependencies]
-soupsieve = ">=1.2"
-
-[[package]]
-category = "main"
-description = "Dummy package for Beautiful Soup"
-name = "bs4"
-optional = false
-python-versions = "*"
-version = "0.0.1"
-
-[package.dependencies]
-beautifulsoup4 = "*"
-
-[[package]]
-category = "main"
-description = "Python package for providing Mozilla's CA Bundle."
-name = "certifi"
-optional = false
-python-versions = "*"
-version = "2018.11.29"
-
-[[package]]
-category = "main"
-description = "Universal encoding detector for Python 2 and 3"
-name = "chardet"
-optional = false
-python-versions = "*"
-version = "3.0.4"
-
-[[package]]
-category = "main"
-description = "Internationalized Domain Names in Applications (IDNA)"
-name = "idna"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "2.8"
-
-[[package]]
-category = "main"
-description = "PDF parser and analyzer"
-name = "pdfminer.six"
-optional = false
-python-versions = "*"
-version = "20181108"
-
-[package.dependencies]
-pycryptodome = "*"
-six = "*"
-sortedcontainers = "*"
-
-[[package]]
-category = "main"
-description = "Plumb a PDF for detailed information about each char, rectangle, and line."
-name = "pdfplumber"
-optional = false
-python-versions = "*"
-version = "0.5.12"
-
-[package.dependencies]
-chardet = "*"
-"pdfminer.six" = "20181108"
-pillow = ">=3.0.0"
-pycryptodome = "*"
-unicodecsv = ">=0.14.1"
-wand = "*"
-
-[[package]]
-category = "main"
-description = "Python Imaging Library (Fork)"
-name = "pillow"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
-version = "6.0.0"
-
-[[package]]
-category = "main"
-description = "Cryptographic library for Python"
-name = "pycryptodome"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "3.8.2"
-
-[[package]]
-category = "main"
-description = "Python HTTP for Humans."
-name = "requests"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "2.21.0"
-
-[package.dependencies]
-certifi = ">=2017.4.17"
-chardet = ">=3.0.2,<3.1.0"
-idna = ">=2.5,<2.9"
-urllib3 = ">=1.21.1,<1.25"
-
-[[package]]
-category = "main"
-description = "Python 2 and 3 compatibility utilities"
-name = "six"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*"
-version = "1.12.0"
-
-[[package]]
-category = "main"
-description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
-name = "sortedcontainers"
-optional = false
-python-versions = "*"
-version = "2.1.0"
-
-[[package]]
-category = "main"
-description = "A CSS4 selector implementation for Beautiful Soup."
-name = "soupsieve"
-optional = false
-python-versions = "*"
-version = "1.7.3"
-
-[[package]]
-category = "main"
-description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*."
-name = "unicodecsv"
-optional = false
-python-versions = "*"
-version = "0.14.1"
-
-[[package]]
-category = "main"
-description = "ASCII transliterations of Unicode text"
-name = "unidecode"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-version = "1.1.1"
-
-[[package]]
-category = "main"
-description = "HTTP library with thread-safe connection pooling, file post, and more."
-name = "urllib3"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
-version = "1.24.1"
-
-[[package]]
-category = "main"
-description = "Ctypes-based simple MagickWand API binding for Python"
-name = "wand"
-optional = false
-python-versions = "*"
-version = "0.5.4"
-
-[metadata]
-content-hash = "51a0dc0e8f6e6e23395cd5aca6a81e9b3aa121ec86f120f1304f2142eb2b65b0"
-python-versions = "^3.5"
-
-[metadata.hashes]
-beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"]
-bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"]
-certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"]
-chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"]
-idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"]
-"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"]
-pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"]
-pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", "10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", "258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", "9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", "c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", "d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"]
-pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", "a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", "c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"]
-requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"]
-six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"]
-sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"]
-soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"]
-unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"]
-unidecode = ["1d7a042116536098d05d599ef2b8616759f02985c85b4fef50c78a5aaf10822a", "2b6aab710c2a1647e928e36d69c21e76b453cd455f4e2621000e54b2a9b8cce8"]
-urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"]
-wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"]
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 7e9c629..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,19 +0,0 @@
-[tool.poetry]
-name = "arxiv2remarkable"
-version = "0.1.0"
-description = "Download an arXiv paper and send it to reMarkable"
-authors = ["Gertjan van den Burg <gertjanvandenburg@gmail.com>"]
-license = "MIT"
-
-[tool.poetry.dependencies]
-python = "^3.5"
-bs4 = "^0.0.1"
-requests = "^2.21"
-pdfplumber = "^0.5.12"
-unidecode = "^1.1"
-
-[tool.poetry.dev-dependencies]
-
-[build-system]
-requires = ["poetry>=0.12"]
-build-backend = "poetry.masonry.api"
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f54170a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import io
+import os
+
+from setuptools import find_packages, setup
+
+# Package meta-data.
+AUTHOR = "Gertjan van den Burg"
+DESCRIPTION = "Easily download an academic paper and send it to the reMarkable"
+EMAIL = "gertjanvandenburg@gmail.com"
+LICENSE = "MIT"
+LICENSE_TROVE = "License :: OSI Approved :: MIT License"
+NAME = "paper2remarkable"
+REQUIRES_PYTHON = ">=3.5.0"
+URL = "https://github.com/GjjvdBurg/paper2remarkable"
+VERSION = None
+
+# What packages are required for this module to be executed?
+REQUIRED = [
+ "beautifulsoup4>=4.8",
+ "requests>=2.21",
+ "pdfplumber>=0.5",
+ "unidecode>=1.1",
+ "titlecase>=0.12",
+ "PyPDF2>=1.26",
+]
+
+docs_require = []
+test_require = []
+dev_require = ["green"]
+
+# What packages are optional?
+EXTRAS = {
+ "docs": docs_require,
+ "tests": test_require,
+ "dev": docs_require + test_require + dev_require,
+}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
+try:
+ with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
+ long_description = "\n" + f.read()
+except FileNotFoundError:
+ long_description = DESCRIPTION
+
+# Load the package's __version__.py module as a dictionary.
+about = {}
+if not VERSION:
+ project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
+ with open(os.path.join(here, project_slug, "__version__.py")) as f:
+ exec(f.read(), about)
+else:
+ about["__version__"] = VERSION
+
+# Where the magic happens:
+setup(
+ name=NAME,
+ version=about["__version__"],
+ description=DESCRIPTION,
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ author=AUTHOR,
+ author_email=EMAIL,
+ python_requires=REQUIRES_PYTHON,
+ url=URL,
+ packages=find_packages(
+ exclude=["tests", "*.tests", "*.tests.*", "tests.*"]
+ ),
+ install_requires=REQUIRED,
+ extras_require=EXTRAS,
+ include_package_data=True,
+ license=LICENSE,
+ ext_modules=[],
+ entry_points={"console_scripts": ["p2r = paper2remarkable.__main__:main"]},
+ classifiers=[
+ # Trove classifiers
+ # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+ LICENSE_TROVE,
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Programming Language :: Python :: Implementation :: PyPy",
+ "Intended Audience :: Education",
+ "Intended Audience :: Science/Research",
+ "Topic :: Education",
+ "Topic :: Scientific/Engineering",
+ "Topic :: Utilities",
+ ],
+)
diff --git a/test.py b/tests/test_providers.py
index 83c74af..bb793b3 100644
--- a/test.py
+++ b/tests/test_providers.py
@@ -11,17 +11,17 @@ import hashlib
import shutil
import os
-from arxiv2remarkable import (
+from paper2remarkable.providers import (
ACM,
Arxiv,
LocalFile,
OpenReview,
PdfUrl,
- Pubmed,
+ PubMed,
Springer,
)
-VERBOSE = False
+VERBOSE = True
def md5sum(filename):
@@ -56,7 +56,7 @@ class Tests(unittest.TestCase):
self.assertEqual(exp_filename, os.path.basename(filename))
def test_pmc(self):
- prov = Pubmed(upload=False, verbose=VERBOSE)
+ prov = PubMed(upload=False, verbose=VERBOSE)
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/"
exp_filename = (
"Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf"