diff options
30 files changed, 1688 insertions, 1127 deletions
@@ -1 +1,6 @@ __pycache__/ +paper2remarkable.egg-info/ +dist/* +build/* +*.pyc +*/__pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ac4f357 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,6 @@ +# Changelog + +## Version 0.4.0 + +* Refactor code to make it a real Python package +* Rename to ``paper2remarkable`` @@ -19,14 +19,6 @@ RUN apt-get update \ pdftk \ texlive-extra-utils # contains pdfcrop -RUN pip install \ - bs4 \ - requests \ - PyPDF2 \ - titlecase \ - pdfplumber \ - unidecode +RUN pip install paper2remarkable -COPY arxiv2remarkable.py ./ - -ENTRYPOINT ["python", "arxiv2remarkable.py"] +ENTRYPOINT ["p2r"] diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..021523f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,10 @@ +include setup.py +include README.md +include LICENSE +recursive-include paper2remarkable *.py +recursive-include tests *.py +exclude Makefile +exclude .gitignore +exclude Dockerfile +exclude make_release.py +prune old diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2a656d4 --- /dev/null +++ b/Makefile @@ -0,0 +1,60 @@ +# Makefile for easier installation and cleanup. 
+# +# Uses self-documenting macros from here: +# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html + +PACKAGE=paper2remarkable +DOC_DIR='./docs/' +VENV_DIR=/tmp/p2r_venv/ + +.PHONY: help cover dist + +.DEFAULT_GOAL := help + +help: + @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ + awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\ + %s\n", $$1, $$2}' + +release: ## Make a release + python make_release.py + + +install: ## Install for the current user using the default python command + python setup.py build_ext --inplace + python setup.py install --user + + +test: venv ## Run unit tests + source $(VENV_DIR)/bin/activate && green -f -vv -a ./tests + + +clean: ## Clean build dist and egg directories left after install + rm -rf ./dist + rm -rf ./build + rm -rf ./$(PACKAGE).egg-info + rm -rf $(VENV_DIR) + rm -f MANIFEST + find . -type f -iname '*.pyc' -delete + find . -type d -name '__pycache__' -empty -delete + +dist: ## Make Python source distribution + python setup.py sdist + python setup.py bdist_wheel --universal + +docs: doc +doc: install ## Build documentation with Sphinx + m2r README.md && mv README.rst $(DOC_DIR) + m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR) + cd $(DOC_DIR) && \ + rm source/* && \ + sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \ + touch source/AUTOGENERATED + $(MAKE) -C $(DOC_DIR) html + +venv: $(VENV_DIR)/bin/activate + +$(VENV_DIR)/bin/activate: + test -d $(VENV_DIR) || virtualenv $(VENV_DIR) + source $(VENV_DIR)/bin/activate && pip install -e .[dev] + touch $(VENV_DIR)/bin/activate @@ -1,11 +1,21 @@ -# arxiv2remarkable.py +# paper2remarkable -``arxiv2remarkable`` is a command line program to quickly transfer a paper to -your reMarkable. The script can be run as a plain Python script or via Docker +*Note: ``paper2remarkable`` is the new name for the ``arxiv2remarkable`` +script. 
The name was changed because it better captures what the program +does.* + +``paper2remarkable`` is a command line program for quickly and easily +transferring an academic paper to your reMarkable: + +``` +$ p2r https://arxiv.org/abs/1811.11242 +``` + +The script can be run through the ``p2r`` command line program or via Docker (see below). -This script makes it as easy as possible to get a PDF on your reMarkable from -any of the following sources: +paper2remarkable makes it as easy as possible to get a PDF on your reMarkable +from any of the following sources: - an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``) - a PubMed Central url (either to the HTML or the PDF) @@ -16,10 +26,10 @@ any of the following sources: - a url to a PDF file - a local file. -The script takes the source and: +When called, paper2remarkable takes the source and: 1. Downloads the pdf if necessary -2. Removes the arXiv timestamp +2. Removes the arXiv timestamp (for arXiv sources) 3. Crops the pdf to remove unnecessary borders 4. Shrinks the pdf file to reduce the filesize 5. 
Generates a nice filename based on author/title/year of the paper @@ -37,41 +47,39 @@ Optionally, you can: Here's the full help of the script: ```text -usage: arxiv2remarkable.py [-h] [-b] [-v] [-n] [-d] [-c] [--filename FILENAME] - [-p REMARKABLE_DIR] [--rmapi RMAPI] - [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS] - input +usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] + [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK] + [--rmapi RMAPI] + input + +Paper2reMarkable version 0.4.0 positional arguments: input URL to a paper or the path of a local PDF file optional arguments: -h, --help show this help message and exit - -b, --blank Add a blank page after every page of the PDF (default: - False) - -v, --verbose be verbose (default: False) - -n, --no-upload don't upload to the reMarkable, save the output in - current working dir (default: False) - -d, --debug debug mode, doesn't upload to reMarkable (default: - False) + -b, --blank Add a blank page after every page of the PDF -c, --center Center the PDF on the page, instead of left align - (default: False) - --filename FILENAME Filename to use for the file on reMarkable (default: - None) + -d, --debug debug mode, doesn't upload to reMarkable + -n, --no-upload don't upload to the reMarkable, save the output in + current working dir -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR directory on reMarkable to put the file (created if - missing) (default: /) - --rmapi RMAPI path to rmapi executable (default: rmapi) + missing, default: /) + -v, --verbose be verbose + --filename FILENAME Filename to use for the file on reMarkable + --gs GS path to gs executable (default: gs) --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) --pdftk PDFTK path to pdftk executable (default: pdftk) - --gs GS path to gs executable (default: gs) + --rmapi RMAPI path to rmapi executable (default: rmapi) ``` And here's an example with verbose mode enabled that shows everything the script does by 
default: -```bash -$ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 +``` +$ p2r -v https://arxiv.org/abs/1811.11242 2019-05-30 00:38:27 - INFO - Starting ArxivProvider 2019-05-30 00:38:27 - INFO - Getting paper info from arXiv 2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242 @@ -86,7 +94,7 @@ $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 2019-05-30 00:38:42 - INFO - Upload successful. ``` -## Dependencies +## Installation The script requires the following external programs to be available: @@ -96,27 +104,15 @@ The script requires the following external programs to be available: - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -If these scripts are not available on the ``PATH`` variable, you can supply them -with the relevant options to the script. - -The script also needs the following Python packages: +If these scripts are not available on the ``PATH`` variable, you can supply +them with the relevant options to the script. Then, you can install +paper2remarkable from PyPI: -- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML -- [requests](https://pypi.org/project/requests/): getting HTML -- [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF -- [titlecase](https://pypi.org/project/titlecase/): fancy titles -- [pdfplumber](https://github.com/jsvine/pdfplumber): used for better page - cropping -- [unidecode](https://pypi.org/project/Unidecode/): clean accented characters - from the filename - -If you use [Poetry](https://poetry.eustace.io/) you can install these -dependencies using ``poetry install`` in the project directory. Alternatively, -you can use ``pip`` with the following command: - -```bash -pip install --user bs4 requests PyPDF2 titlecase pdfplumber unidecode ``` +pip install paper2remarkable +``` + +This installs the ``p2r`` command line program. 
## Docker @@ -127,7 +123,7 @@ First clone this repository with `git clone` and `cd` inside of it, then build the container: ```bash -docker build -t arxiv2remarkable . +docker build -t paper2remarkable . ``` ### Authorization @@ -137,7 +133,7 @@ we'll use `rmapi` to create it. ```bash touch ${HOME}/.rmapi -docker run --rm --it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi arxiv2remarkable version +docker run --rm -it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi paper2remarkable version ``` which should end with output like @@ -149,15 +145,15 @@ rmapi version: 0.0.5 ### Usage -Use the container by replacing `python arxiv2remarkable.py` with `docker run ---rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable`, e.g. +Use the container by replacing `p2r` with `docker run --rm -v +"${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable`, e.g. ``` # print help and exit -docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable --help +docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable --help # equivalent to above usage via `p2r` -docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable -v https://arxiv.org/abs/1811.11242 +docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable -v https://arxiv.org/abs/1811.11242 ``` # Notes diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py deleted file mode 100755 index 5694e1b..0000000 --- a/arxiv2remarkable.py +++ /dev/null @@ -1,859 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__version__ = "0.3.5" -__author__ = "G.J.J. van den Burg" - -""" -Download a paper from various sources and send it to the reMarkable. - -Author: G.J.J. 
van den Burg -Date: 2019-02-02 -License: MIT - -""" - -import PyPDF2 -import abc -import argparse -import bs4 -import datetime -import os -import pdfplumber -import re -import requests -import shutil -import string -import subprocess -import sys -import tempfile -import time -import titlecase -import unidecode -import urllib.parse - -GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" - -HEADERS = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " - "Safari/537.36" -} - -RM_WIDTH = 1404 -RM_HEIGHT = 1872 - - -class Provider(metaclass=abc.ABCMeta): - """ ABC for providers of pdf sources """ - - meta_author_key = "citation_author" - meta_title_key = "citation_title" - meta_date_key = "citation_date" - - def __init__( - self, - verbose=False, - upload=True, - debug=False, - center=False, - blank=False, - remarkable_dir="/", - rmapi_path="rmapi", - pdfcrop_path="pdfcrop", - pdftk_path="pdftk", - gs_path="gs", - ): - self.verbose = verbose - self.upload = upload - self.debug = debug - self.center = center - self.blank = blank - self.remarkable_dir = remarkable_dir - self.rmapi_path = rmapi_path - self.pdfcrop_path = pdfcrop_path - self.pdftk_path = pdftk_path - self.gs_path = gs_path - - self.log("Starting %s" % type(self).__name__) - - def log(self, msg, mode="info"): - if not self.verbose: - return - if not mode in ["info", "warning"]: - raise ValueError("unknown logging mode.") - now = datetime.datetime.now() - print( - now.strftime("%Y-%m-%d %H:%M:%S") - + " - " - + mode.upper() - + " - " - + msg - ) - - def warn(self, msg): - self.log(msg, mode="warning") - - @staticmethod - @abc.abstractmethod - def validate(src): - """ Validate whether ``src`` is appropriate for this provider """ - - def retrieve_pdf(self, src, filename): - """ Download pdf from src and save to filename """ - _, pdf_url = self.get_abs_pdf_urls(src) - self.download_url(pdf_url, filename) - - def 
_format_authors(self, soup_authors, sep=",", idx=0, op=None): - op = (lambda x: x) if op is None else op - # format the author list retrieved by bs4 - return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] - - def get_authors(self, soup): - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": self.meta_author_key}) - ] - return self._format_authors(authors) - - def get_title(self, soup): - target = soup.find_all("meta", {"name": self.meta_title_key}) - return target[0]["content"] - - def _format_date(self, soup_date): - return soup_date - - def get_date(self, soup): - date = soup.find_all("meta", {"name": self.meta_date_key})[0][ - "content" - ] - return self._format_date(date) - - def get_paper_info( - self, - src, - author_key="citation_author", - title_key="citation_title", - date_key="citation_date", - ): - """ Retrieve the title/author (surnames)/year information """ - abs_url, _ = self.get_abs_pdf_urls(src) - self.log("Getting paper info") - page = self.get_page_with_retry(abs_url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = self.get_authors(soup) - title = self.get_title(soup) - date = self.get_date(soup) - return dict(title=title, date=date, authors=authors) - - def string_clean(self, s): - """ Clean a string to replace accented characters with equivalents and - keep only the allowed characters """ - normalized = unidecode.unidecode(s) - allowed = string.ascii_letters + string.digits + "_ ." - cleaned = "".join(c if c in allowed else "_" for c in normalized) - return cleaned - - def create_filename(self, info, filename=None): - """ Generate filename using the info dict or filename if provided """ - if not filename is None: - return filename - # we assume that the list of authors is surname only. 
- self.log("Generating output filename") - - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = self.string_clean(author_part) - - title_part = self.string_clean(info["title"]) - title_part = titlecase.titlecase(title_part).replace(" ", "_") - - year_part = info["date"].split("/")[0] - - name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" - name = unidecode.unidecode(name) - self.log("Created filename: %s" % name) - return name - - def blank_pdf(self, filepath): - if not self.blank: - return filepath - - self.log("Adding blank pages") - input_pdf = PyPDF2.PdfFileReader(filepath) - output_pdf = PyPDF2.PdfFileWriter() - for page in input_pdf.pages: - output_pdf.addPage(page) - output_pdf.addBlankPage() - - output_file = os.path.splitext(filepath)[0] + "-blank.pdf" - with open(output_file, "wb") as fp: - output_pdf.write(fp) - return output_file - - def crop_pdf(self, filepath): - self.log("Cropping pdf file") - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - cropper = Cropper( - filepath, cropped_file, pdfcrop_path=self.pdfcrop_path - ) - status = cropper.crop(margins=15) - - if not status == 0: - self.warn("Failed to crop the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(cropped_file): - self.warn( - "Can't find cropped file '%s' where expected." % cropped_file - ) - return filepath - return cropped_file - - def center_pdf(self, filepath): - if not self.center: - return filepath - - self.log("Centering pdf file") - centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - cropper = Cropper( - filepath, centered_file, pdfcrop_path=self.pdfcrop_path - ) - status = cropper.center() - if not status == 0: - self.warn("Failed to center the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(centered_file): - self.warn( - "Can't find centered file '%s' where expected." 
% centered_file - ) - return filepath - return centered_file - - def shrink_pdf(self, filepath): - self.log("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - self.gs_path, - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - self.warn("Failed to shrink the pdf file") - return filepath - return output_file - - def check_file_is_pdf(self, filename): - try: - fp = open(filename, "rb") - pdf = PyPDF2.PdfFileReader(fp, strict=False) - fp.close() - del pdf - return True - except PyPDF2.utils.PdfReadError: - exception("Downloaded file isn't a valid pdf file.") - - def download_url(self, url, filename): - """Download the content of an url and save it to a filename """ - self.log("Downloading file at url: %s" % url) - content = self.get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) - - def get_page_with_retry(self, url, tries=5): - count = 0 - while count < tries: - count += 1 - error = False - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - error = True - if error or not res.ok: - self.warn("Error getting url %s. 
Retrying in 5 seconds" % url) - time.sleep(5) - continue - self.log("Downloading url: %s" % url) - return res.content - - def upload_to_rm(self, filepath): - remarkable_dir = self.remarkable_dir.rstrip("/") - self.log("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [self.rmapi_path, "mkdir", remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception( - "Creating directory %s on reMarkable failed" - % remarkable_dir - ) - status = subprocess.call( - [self.rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - self.log("Upload successful.") - - def dearxiv(self, input_file): - """Remove the arXiv timestamp from a pdf""" - self.log("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - - status = subprocess.call( - [ - self.pdftk_path, - input_file, - "output", - uncompress_file, - "uncompress", - ] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) - - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) - - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - - return output_file - - def run(self, src, filename=None): - info = self.get_paper_info(src) - clean_filename = self.create_filename(info, filename) - tmp_filename = "paper.pdf" - - 
self.initial_dir = os.getcwd() - with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: - os.chdir(working_dir) - self.retrieve_pdf(src, tmp_filename) - self.check_file_is_pdf(tmp_filename) - - ops = [ - self.dearxiv, - self.crop_pdf, - self.center_pdf, - self.blank_pdf, - self.shrink_pdf, - ] - intermediate_fname = tmp_filename - for op in ops: - intermediate_fname = op(intermediate_fname) - shutil.move(intermediate_fname, clean_filename) - - if self.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if self.upload: - return self.upload_to_rm(clean_filename) - - target_path = os.path.join(self.initial_dir, clean_filename) - while os.path.exists(target_path): - base = os.path.splitext(target_path)[0] - target_path = base + "_.pdf" - shutil.move(clean_filename, target_path) - return target_path - - -class Arxiv(Provider): - - re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" - re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """Get the pdf and abs url from any given arXiv url """ - if re.match(self.re_abs, url): - abs_url = url - pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match(self.re_pdf, url): - abs_url = url[:-4].replace("pdf", "abs") - pdf_url = url - else: - exception("Couldn't figure out arXiv urls.") - return abs_url, pdf_url - - def validate(src): - """Check if the url is to an arXiv page. """ - return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) - - -class Pubmed(Provider): - - meta_author_key = "citation_authors" - - re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" 
- re_pdf = ( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" - ) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """Get the pdf and html url from a given PMC url """ - if re.match(self.re_pdf, url): - idx = url.index("pdf") - abs_url = url[: idx - 1] - pdf_url = url - elif re.match(self.re_abs, url): - abs_url = url - pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually - else: - exception("Couldn't figure out PMC urls.") - return abs_url, pdf_url - - def validate(src): - return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src) - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(",") - return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) - - def _format_date(self, soup_date): - if re.match("\w+\ \d{4}", soup_date): - return soup_date.split(" ")[-1] - return soup_date.replace(" ", "_") - - -class ACM(Provider): - - meta_author_key = "citation_authors" - - re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_acm_pdf_url(self, url): - page = self.get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: - return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href - - def get_abs_pdf_urls(self, url): - if re.match(self.re_abs, url): - abs_url = url - pdf_url = self.get_acm_pdf_url(url) - if pdf_url is None: - exception( - "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" - ) - else: - exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." 
- ) - return abs_url, pdf_url - - def validate(src): - m = re.fullmatch(ACM.re_abs, src) - return not m is None - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(";") - return super()._format_authors(soup_authors, sep=",", idx=0, op=op) - - def _format_date(self, soup_date): - if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): - self.warn( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so it can be fixed: %s" % GITHUB_URL - ) - return soup_date.strip().split("/")[-1] - - -class OpenReview(Provider): - - meta_date_key = "citation_publication_date" - - re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" - re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """ Get the pdf and abstract url from a OpenReview url """ - if re.match(self.re_abs, url): - abs_url = url - pdf_url = url.replace("forum", "pdf") - elif re.match(self.re_pdf, url): - abs_url = url.replace("pdf", "forum") - pdf_url = url - else: - exception("Couldn't figure out OpenReview urls.") - return abs_url, pdf_url - - def validate(src): - """ Check if the url is a valid OpenReview url. 
""" - return re.match(OpenReview.re_abs, src) or re.match( - OpenReview.re_pdf, src - ) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) - - -class Springer(Provider): - - meta_date_key = "citation_online_date" - - re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" - re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """ Get the pdf and abstract urls from a Springer url """ - if re.match(self.re_abs, url): - abs_url = url - pdf_url = url.replace("article", "content/pdf") - elif re.match(self.re_pdf, url): - abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] - pdf_url = urllib.parse.unquote(url) - else: - exception("Couldn't figure out Springer urls.") - return abs_url, pdf_url - - def validate(src): - return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) - - -class LocalFile(Provider): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def validate(src): - return os.path.exists(src) - - def retrieve_pdf(self, src, filename): - source = os.path.join(self.initial_dir, src) - shutil.copy(source, filename) - - def get_paper_info(self, src): - return {"filename": src} - - def create_filename(self, info, filename=None): - if not filename is None: - return filename - return os.path.basename(info["filename"]) - - -class PdfUrl(Provider): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: - return False - - def retrieve_pdf(self, url, filename): - self.download_url(url, filename) - - def get_paper_info(self, src): - 
return None - - def create_filename(self, info, filename=None): - if filename is None: - exception( - "Filename must be provided with PDFUrlProvider (use --filename)" - ) - return filename - - -class Cropper(object): - def __init__( - self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" - ): - if not input_file is None: - self.input_file = os.path.abspath(input_file) - self.reader = PyPDF2.PdfFileReader(self.input_file) - if not output_file is None: - self.output_file = os.path.abspath(output_file) - self.pdfcrop_path = pdfcrop_path - - self.writer = PyPDF2.PdfFileWriter() - - def crop(self, margins=1): - return self.process_file(self.crop_page, margins=margins) - - def center(self, padding=15): - return self.process_file(self.center_page, padding=padding) - - def process_file(self, page_func, *args, **kwargs): - for page_idx in range(self.reader.getNumPages()): - status = page_func(page_idx, *args, **kwargs) - if not status == 0: - return status - with open(self.output_file, "wb") as fp: - self.writer.write(fp) - return 0 - - def center_page(self, page_idx, padding): - return self.process_page( - page_idx, self.get_center_bbox, padding=padding - ) - - def crop_page(self, page_idx, margins): - return self.process_page(page_idx, self.get_bbox, margins=margins) - - def export_page(self, page_idx): - """Helper function that exports a single page given by index """ - page = self.reader.getPage(page_idx) - writer = PyPDF2.PdfFileWriter() - writer.addPage(page) - tmpfname = "./page.pdf" - with open(tmpfname, "wb") as fp: - writer.write(fp) - return tmpfname - - def process_page(self, page_idx, bbox_func, *args, **kwargs): - """Process a single page and add it to the writer """ - tmpfname = self.export_page(page_idx) - tmpfout = "./output.pdf" - bbox = bbox_func(tmpfname, *args, **kwargs) - status = subprocess.call( - [ - self.pdfcrop_path, - "--bbox", - " ".join(map(str, bbox)), - tmpfname, - tmpfout, - ], - stdout=subprocess.DEVNULL, - ) - if not status == 0: 
- return status - reader = PyPDF2.PdfFileReader(tmpfout) - page = reader.getPage(0) - self.writer.addPage(page) - os.unlink(tmpfname) - os.unlink(tmpfout) - return 0 - - def get_bbox(self, filename, margins=1, resolution=72): - """Get the bounding box, with optional margins - - if margins is integer, used for all margins, else - margins = [left, top, right, bottom] - - We get the bounding box by finding the smallest rectangle that is - completely surrounded by white pixels. - """ - if isinstance(margins, int): - margins = [margins for _ in range(4)] - pdf = pdfplumber.open(filename) - im = pdf.pages[0].to_image(resolution=resolution) - pdf.close() - - pixels = list(im.original.getdata()) - W, H = im.original.size - - # M is a list of H lists with each W integers that equal the sum of the - # pixel values - M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)] - - left, top, bottom, right = 0, 0, 0, 0 - while top < H and sum(M[top]) == W * 255 * 3: - top += 1 - while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3: - bottom += 1 - - # Transpose M - M = list(zip(*M)) - while left < W and sum(M[left]) == H * 255 * 3: - left += 1 - while right < W and sum(M[W - 1 - right]) == H * 255 * 3: - right += 1 - - left -= margins[0] - top -= margins[1] - right -= margins[2] - bottom -= margins[3] - - # This is the bounding box in PIL format: (0, 0) top left - x0, y0, x1, y1 = left, top, W - right, H - bottom - - # Get the bbox in Ghostscript format: (0, 0) bottom left - a0, b0, a1, b1 = x0, H - y1, x1, H - y0 - return [a0, b0, a1, b1] - - def get_center_bbox(self, filename, padding=15): - """Compute a bounding box that will center the page file on the - reMarkable - """ - bbox = self.get_bbox(filename, margins=0) - - h = bbox[3] - bbox[1] - w = bbox[2] - bbox[0] - - # we want some minimal padding all around, because it is visually more - # pleasing. 
- h_prime = h + 2 * padding - w_prime = w + 2 * padding - - # if the document is wider than the remarkable, we add top-padding to - # center it, otherwise we add left-padding - x, y = 0, 0 - if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: - y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 - else: - x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2 - - margins = [padding + x, padding + y, padding, padding] - return self.get_bbox(filename, margins=margins) - - -def exception(msg): - print("ERROR: " + msg, file=sys.stderr) - print("Error occurred. Exiting.", file=sys.stderr) - print("", file=sys.stderr) - print( - "If you think this might be a bug, please raise an issue on GitHub: %s" - % GITHUB_URL - ) - raise SystemExit(1) - - -def parse_args(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "-b", - "--blank", - help="Add a blank page after every page of the PDF", - action="store_true", - ) - parser.add_argument( - "-v", "--verbose", help="be verbose", action="store_true" - ) - parser.add_argument( - "-n", - "--no-upload", - help="don't upload to the reMarkable, save the output in current working dir", - action="store_true", - ) - parser.add_argument( - "-d", - "--debug", - help="debug mode, doesn't upload to reMarkable", - action="store_true", - ) - parser.add_argument( - "-c", - "--center", - help="Center the PDF on the page, instead of left align", - action="store_true", - ) - parser.add_argument( - "--filename", - help="Filename to use for the file on reMarkable", - default=None, - ) - parser.add_argument( - "-p", - "--remarkable-path", - help="directory on reMarkable to put the file (created if missing)", - dest="remarkable_dir", - default="/", - ) - parser.add_argument( - "--rmapi", help="path to rmapi executable", default="rmapi" - ) - parser.add_argument( - "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop" - ) - parser.add_argument( - "--pdftk", help="path to pdftk 
executable", default="pdftk" - ) - parser.add_argument("--gs", help="path to gs executable", default="gs") - parser.add_argument( - "input", help="URL to a paper or the path of a local PDF file" - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl] - - provider = next((p for p in providers if p.validate(args.input)), None) - if provider is None: - exception("Input not valid, no provider can handle this source.") - - prov = provider( - verbose=args.verbose, - upload=not args.no_upload, - debug=args.debug, - center=args.center, - blank=args.blank, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - pdfcrop_path=args.pdfcrop, - pdftk_path=args.pdftk, - gs_path=args.gs, - ) - - prov.run(args.input, filename=args.filename) - - -if __name__ == "__main__": - main() diff --git a/make_release.py b/make_release.py new file mode 100644 index 0000000..932209a --- /dev/null +++ b/make_release.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Do-nothing script for making a release + +This idea comes from here: +https://blog.danslimmon.com/2019/07/15/do-nothing-scripting-the-key-to-gradual-automation/ + +Author: Gertjan van den Burg +Date: 2019-07-23 + +""" + +import colorama +import os + + +def colored(msg, color=None, style=None): + colors = { + "red": colorama.Fore.RED, + "green": colorama.Fore.GREEN, + "cyan": colorama.Fore.CYAN, + "yellow": colorama.Fore.YELLOW, + "magenta": colorama.Fore.MAGENTA, + None: "", + } + styles = { + "bright": colorama.Style.BRIGHT, + "dim": colorama.Style.DIM, + None: "", + } + pre = colors[color] + styles[style] + post = colorama.Style.RESET_ALL + return f"{pre}{msg}{post}" + + +def cprint(msg, color=None, style=None): + print(colored(msg, color=color, style=style)) + + +def wait_for_enter(): + input(colored("\nPress Enter to continue", style="dim")) + print() + + +def get_package_name(): + with 
open("./setup.py", "r") as fp: + nameline = next( + (l.strip() for l in fp if l.startswith("NAME = ")), None + ) + return nameline.split("=")[-1].strip().strip('"') + + +class Step: + def pre(self, context): + pass + + def post(self, context): + wait_for_enter() + + def run(self, context): + try: + self.pre(context) + self.action(context) + self.post(context) + except KeyboardInterrupt: + cprint("\nInterrupted.", color="red") + raise SystemExit(1) + + def instruct(self, msg): + cprint(msg, color="green") + + def print_run(self, msg): + cprint("Run:", color="cyan", style="bright") + self.print_cmd(msg) + + def print_cmd(self, msg): + cprint("\t" + msg, color="cyan", style="bright") + + def do_cmd(self, cmd): + cprint(f"Going to run: {cmd}", color="magenta", style="bright") + wait_for_enter() + os.system(cmd) + + +class GitToMaster(Step): + def action(self, context): + self.instruct("Make sure you're on master and changes are merged in") + self.print_run("git checkout master") + + +class UpdateChangelog(Step): + def action(self, context): + self.instruct(f"Update change log for version {context['version']}") + self.print_run("vi CHANGELOG.md") + + +class RunTests(Step): + def action(self, context): + self.do_cmd("make test") + + +class BumpVersionPackage(Step): + def action(self, context): + self.instruct(f"Update __version__.py with new version") + self.print_run(f"vi {context['pkgname']}/__version__.py") + + def post(self, context): + wait_for_enter() + context["version"] = self._get_version(context) + + def _get_version(self, context): + # Get the version from the version file + about = {} + with open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp: + exec(fp.read(), about) + return about["__version__"] + + +class MakeClean(Step): + def action(self, context): + self.do_cmd("make clean") + + +class MakeDocs(Step): + def action(self, context): + self.do_cmd("make docs") + + +class MakeDist(Step): + def action(self, context): + self.do_cmd("make dist") + + 
+class PushToTestPyPI(Step): + def action(self, context): + self.do_cmd( + "twine upload --repository-url https://test.pypi.org/legacy/ dist/*" + ) + + +class InstallFromTestPyPI(Step): + def action(self, context): + self.print_run("cd /tmp/") + self.print_cmd("rm -rf ./venv") + self.print_cmd("virtualenv ./venv") + self.print_cmd("cd ./venv") + self.print_cmd("source bin/activate") + self.print_cmd( + "pip install --index-url https://test.pypi.org/simple/ " + + f"--extra-index-url https://pypi.org/simple {context['pkgname']}=={context['version']}" + ) + + +class TestPackage(Step): + def action(self, context): + self.instruct( + f"Ensure that the following command gives version {context['version']}" + ) + self.print_run(f"{context['pkgname']} -h") + + +class DeactivateVenv(Step): + def action(self, context): + self.print_run("deactivate") + self.instruct("Go back to the project directory") + + +class GitTagVersion(Step): + def action(self, context): + self.do_cmd(f"git tag v{context['version']}") + + +class GitAdd(Step): + def action(self, context): + self.instruct("Add everything to git and commit") + self.print_run("git gui") + + +class PushToPyPI(Step): + def action(self, context): + self.do_cmd("twine upload dist/*") + + +class PushToGitHub(Step): + def action(self, context): + self.do_cmd("git push -u --tags origin master") + + +class WaitForTravis(Step): + def action(self, context): + self.instruct( + "Wait for Travis to complete and verify that its successful" + ) + + +class WaitForAppVeyor(Step): + def action(self, context): + self.instruct( + "Wait for AppVeyor to complete and verify that its successful" + ) + + +class WaitForRTD(Step): + def action(self, context): + self.instruct( + "Wait for ReadTheDocs to complete and verify that its successful" + ) + + +def main(): + colorama.init() + procedure = [ + GitToMaster(), + GitAdd(), + PushToGitHub(), + BumpVersionPackage(), + UpdateChangelog(), + MakeClean(), + RunTests(), + MakeDist(), + PushToTestPyPI(), + 
InstallFromTestPyPI(), + TestPackage(), + DeactivateVenv(), + GitAdd(), + PushToPyPI(), + GitTagVersion(), + PushToGitHub(), + ] + context = {} + context["pkgname"] = get_package_name() + for step in procedure: + step.run(context) + cprint("\nDone!", color="yellow", style="bright") + + +if __name__ == "__main__": + main() diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py new file mode 100644 index 0000000..113fc83 --- /dev/null +++ b/paper2remarkable/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +from .__version__ import __version__ + +GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" diff --git a/paper2remarkable/__main__.py b/paper2remarkable/__main__.py new file mode 100644 index 0000000..b97d538 --- /dev/null +++ b/paper2remarkable/__main__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +""" +Caller for the command line application +""" + +import sys + +def main(): + from .ui import main as realmain + + sys.exit(realmain()) + +if __name__ == '__main__': + main() diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py new file mode 100644 index 0000000..5bee2af --- /dev/null +++ b/paper2remarkable/__version__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +VERSION = (0, 4, 0) + +__version__ = '.'.join(map(str, VERSION)) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py new file mode 100644 index 0000000..b25b178 --- /dev/null +++ b/paper2remarkable/crop.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +"""Code for cropping a PDF file + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. 
van den Burg + +""" + +import PyPDF2 +import os +import subprocess +import pdfplumber + +RM_WIDTH = 1404 +RM_HEIGHT = 1872 + + +class Cropper(object): + def __init__( + self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + ): + if not input_file is None: + self.input_file = os.path.abspath(input_file) + self.reader = PyPDF2.PdfFileReader(self.input_file) + if not output_file is None: + self.output_file = os.path.abspath(output_file) + self.pdfcrop_path = pdfcrop_path + + self.writer = PyPDF2.PdfFileWriter() + + def crop(self, margins=1): + return self.process_file(self.crop_page, margins=margins) + + def center(self, padding=15): + return self.process_file(self.center_page, padding=padding) + + def process_file(self, page_func, *args, **kwargs): + for page_idx in range(self.reader.getNumPages()): + status = page_func(page_idx, *args, **kwargs) + if not status == 0: + return status + with open(self.output_file, "wb") as fp: + self.writer.write(fp) + return 0 + + def center_page(self, page_idx, padding): + return self.process_page( + page_idx, self.get_center_bbox, padding=padding + ) + + def crop_page(self, page_idx, margins): + return self.process_page(page_idx, self.get_bbox, margins=margins) + + def export_page(self, page_idx): + """Helper function that exports a single page given by index """ + page = self.reader.getPage(page_idx) + writer = PyPDF2.PdfFileWriter() + writer.addPage(page) + tmpfname = "./page.pdf" + with open(tmpfname, "wb") as fp: + writer.write(fp) + return tmpfname + + def process_page(self, page_idx, bbox_func, *args, **kwargs): + """Process a single page and add it to the writer """ + tmpfname = self.export_page(page_idx) + tmpfout = "./output.pdf" + bbox = bbox_func(tmpfname, *args, **kwargs) + status = subprocess.call( + [ + self.pdfcrop_path, + "--bbox", + " ".join(map(str, bbox)), + tmpfname, + tmpfout, + ], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + return status + reader = PyPDF2.PdfFileReader(tmpfout) + page = 
reader.getPage(0) + self.writer.addPage(page) + os.unlink(tmpfname) + os.unlink(tmpfout) + return 0 + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. + """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + pdf = pdfplumber.open(filename) + im = pdf.pages[0].to_image(resolution=resolution) + pdf.close() + + pixels = list(im.original.getdata()) + W, H = im.original.size + + # M is a list of H lists with each W integers that equal the sum of the + # pixel values + M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)] + + left, top, bottom, right = 0, 0, 0, 0 + while top < H and sum(M[top]) == W * 255 * 3: + top += 1 + while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3: + bottom += 1 + + # Transpose M + M = list(zip(*M)) + while left < W and sum(M[left]) == H * 255 * 3: + left += 1 + while right < W and sum(M[W - 1 - right]) == H * 255 * 3: + right += 1 + + left -= margins[0] + top -= margins[1] + right -= margins[2] + bottom -= margins[3] + + # This is the bounding box in PIL format: (0, 0) top left + x0, y0, x1, y1 = left, top, W - right, H - bottom + + # Get the bbox in Ghostscript format: (0, 0) bottom left + a0, b0, a1, b1 = x0, H - y1, x1, H - y0 + return [a0, b0, a1, b1] + + def get_center_bbox(self, filename, padding=15): + """Compute a bounding box that will center the page file on the + reMarkable + """ + bbox = self.get_bbox(filename, margins=0) + + h = bbox[3] - bbox[1] + w = bbox[2] - bbox[0] + + # we want some minimal padding all around, because it is visually more + # pleasing. 
+ h_prime = h + 2 * padding + w_prime = w + 2 * padding + + # if the document is wider than the remarkable, we add top-padding to + # center it, otherwise we add left-padding + x, y = 0, 0 + if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: + y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 + else: + x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2 + + margins = [padding + x, padding + y, padding, padding] + return self.get_bbox(filename, margins=margins) diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py new file mode 100644 index 0000000..bae1cbf --- /dev/null +++ b/paper2remarkable/log.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +"""Just a simple logger + +Author: G.J.J. van den Burg +License: See LICENSE file. +Copyright: 2019, G.J.J. van den Burg + +""" + +# NOTE: I know about the logging module, but this was easier because one of the +# dependencies was using that and it became complicated. This one is obviously +# not thread-safe and is very simple. + +import datetime +import sys + + +class Singleton(type): + # https://stackoverflow.com/q/6760685 + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__( + *args, **kwargs + ) + return cls._instances[cls] + + +class Logger(metaclass=Singleton): + def __init__(self): + self.enabled = True + + def enable(self): + self.enabled = True + + def disable(self): + self.enabled = False + + def _log(self, msg, mode): + if not self.enabled: + return + if not mode in ("info", "warn"): + raise ValueError("Unknown logging mode: %s" % mode) + file = sys.stdout if mode == "info" else sys.stderr + now = datetime.datetime.now() + nowstr = now.strftime("%Y-%m-%d %H:%M:%S") + print("%s - %s - %s" % (nowstr, mode.upper(), msg), file=file) + file.flush() + + def info(self, msg): + self._log(msg, "info") + + def warning(self, msg): + self._log(msg, "warn") diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py new 
file mode 100644 index 0000000..8636017 --- /dev/null +++ b/paper2remarkable/pdf_ops.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +"""Operations on PDF files + +Author: G.J.J. van den Burg +License: See LICENSE file. +Copyright: 2019, The Alan Turing Institute + +""" + + +import PyPDF2 +import os +import subprocess + +from .crop import Cropper +from .log import Logger + +logger = Logger() + +def crop_pdf(filepath, pdfcrop_path="pdfcrop"): + """Crop the pdf file using Cropper + """ + logger.info("Cropping pdf file") + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + + cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path) + status = cropper.crop(margins=15) + + if not status == 0: + logger.warning("Failed to crop the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(cropped_file): + logger.warning( + "Can't find cropped file '%s' where expected." % cropped_file + ) + return filepath + return cropped_file + + +def center_pdf(filepath, pdfcrop_path="pdfcrop"): + """Center the pdf file on the reMarkable + """ + logger.info("Centering pdf file") + centered_file = os.path.splitext(filepath)[0] + "-center.pdf" + + cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path) + status = cropper.center() + + if not status == 0: + logger.warning("Failed to center the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(centered_file): + logger.warning( + "Can't find centered file '%s' where expected." 
% centered_file + ) + return filepath + return centered_file + + +def blank_pdf(filepath): + """Add blank pages to PDF + """ + logger.info("Adding blank pages") + input_pdf = PyPDF2.PdfFileReader(filepath) + output_pdf = PyPDF2.PdfFileWriter() + for page in input_pdf.pages: + output_pdf.addPage(page) + output_pdf.addBlankPage() + + output_file = os.path.splitext(filepath)[0] + "-blank.pdf" + with open(output_file, "wb") as fp: + output_pdf.write(fp) + return output_file + + +def shrink_pdf(filepath, gs_path="gs"): + """Shrink the PDF file size using Ghostscript + """ + logger.info("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + logger.warning("Failed to shrink the pdf file") + return filepath + return output_file diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py new file mode 100644 index 0000000..f6f93f9 --- /dev/null +++ b/paper2remarkable/providers/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +from .arxiv import Arxiv +from .pubmed import PubMed +from .acm import ACM +from .openreview import OpenReview +from .springer import Springer +from .local import LocalFile +from .pdf_url import PdfUrl + +providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py new file mode 100644 index 0000000..bdc9558 --- /dev/null +++ b/paper2remarkable/providers/_base.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- + +"""Base for the Provider class + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. 
van den Burg + +""" + +import abc +import os +import shutil +import tempfile + +from ._info import Informer +from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf +from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable +from ..log import Logger + +logger = Logger() + + +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ + + def __init__( + self, + verbose=False, + upload=True, + debug=False, + center=False, + blank=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.upload = upload + self.debug = debug + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + self.informer = Informer() + + # disable logging if requested + if not verbose: + logger.disable() + + # Define the operations to run on the pdf. Providers can add others. + self.operations = [("crop", self.crop_pdf)] + if center: + self.operations.append(("center", self.center_pdf)) + + if blank: + self.operations.append(("blank", blank_pdf)) + self.operations.append(("shrink", self.shrink_pdf)) + + logger.info("Starting %s" % type(self).__name__) + + @staticmethod + @abc.abstractmethod + def validate(src): + """ Validate whether ``src`` is appropriate for this provider """ + + # Wrappers for pdf operations that have additional arguments + def crop_pdf(self, filepath): + return crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + + def center_pdf(self, filepath): + return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + + def shrink_pdf(self, filepath): + return shrink_pdf(filepath, gs_path=self.gs_path) + + def retrieve_pdf(self, pdf_url, filename): + """ Download pdf from src and save to filename """ + # This must exist so that the LocalFile provider can overwrite it + download_url(pdf_url, filename) + + def run(self, src, filename=None): + abs_url, pdf_url = 
self.get_abs_pdf_urls(src) + clean_filename = filename or self.informer.get_filename(abs_url) + tmp_filename = "paper.pdf" + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(pdf_url, tmp_filename) + assert_file_is_pdf(tmp_filename) + + intermediate_fname = tmp_filename + for opname, op in self.operations: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return upload_to_remarkable( + clean_filename, + remarkable_dir=self.remarkable_dir, + rmapi_path=self.rmapi_path, + ) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py new file mode 100644 index 0000000..0b28658 --- /dev/null +++ b/paper2remarkable/providers/_info.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +"""Functionality for retrieving paper info +""" + +import titlecase +import unidecode +import bs4 + +from ..utils import clean_string, get_page_with_retry +from ..log import Logger + +logger = Logger() + + +class Informer: + """Base class for the informers. + + The "informer" class is used to retrieve the title, authors, and year of + publication of the provided paper. + + This base class provides the main functionality, but because various + outlets use different conventions to embed author, title, and publication + year information, we expect that individual providers will subclass this + class and overwrite some of the methods. 
+ """ + + meta_author_key = "citation_author" + meta_title_key = "citation_title" + meta_date_key = "citation_date" + + def __init__(self, title=None, authors=None, year=None): + self.title = title + self.authors = authors or [] + self.year = year + + def get_filename(self, abs_url): + """ Generate nice filename using the paper information + + The provided url must be to a HTMl page where this information can be + found, not to the PDF file itself. + """ + logger.info("Generating output filename") + + # Retrieve the paper information + self.get_info(abs_url) + + # we assume that the list of authors is surname only. + if len(self.authors) > 3: + authors = self.authors[0] + "_et_al" + else: + authors = "_".join(self.authors) + authors = clean_string(authors) + + # Clean the title and make it titlecase + title = clean_string(self.title) + title = titlecase.titlecase(title) + title = title.replace(" ", "_") + title = clean_string(title) + + year = str(self.year) + + name = authors + "_-_" + title + "_" + year + ".pdf" + name = unidecode.unidecode(name) + logger.info("Created filename: %s" % name) + return name + + def get_info(self, url): + logger.info("Getting paper info") + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + self.authors = self.authors or self.get_authors(soup) + self.title = self.title or self.get_title(soup) + self.year = self.year or self.get_year(soup) + + ## Title + + def get_title(self, soup): + target = soup.find_all("meta", {"name": self.meta_title_key}) + return target[0]["content"] + + ## Authors + + def get_authors(self, soup): + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": self.meta_author_key}) + ] + return self._format_authors(authors) + + def _format_authors(self, soup_authors, sep=",", idx=0, op=None): + op = (lambda x: x) if op is None else op + # format the author list retrieved by bs4 + return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] + + ## Year + + def 
_format_year(self, soup_date): + return soup_date.split("/")[0] + + def get_year(self, soup): + """ Retrieve the contents of the meta_date_key field and format it """ + date = soup.find_all("meta", {"name": self.meta_date_key})[0][ + "content" + ] + return self._format_year(date) diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py new file mode 100644 index 0000000..a0d79bd --- /dev/null +++ b/paper2remarkable/providers/acm.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +"""Provider for ACM + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import bs4 +import re + +from ._base import Provider +from ._info import Informer +from .. import GITHUB_URL +from ..utils import exception, get_page_with_retry +from ..log import Logger + +logger = Logger() + + +class ACMInformer(Informer): + meta_author_key = "citation_authors" + + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(";") + return super()._format_authors(soup_authors, sep=",", idx=0, op=op) + + def _format_year(self, soup_date): + if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): + logger.warning( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so it can be fixed: %s" % GITHUB_URL + ) + return soup_date.strip().split("/")[-1] + + +class ACM(Provider): + + re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = ACMInformer() + + def get_acm_pdf_url(self, url): + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = 
self.get_acm_pdf_url(url) + if pdf_url is None: + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) + else: + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return abs_url, pdf_url + + def validate(src): + m = re.fullmatch(ACM.re_abs, src) + return not m is None diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py new file mode 100644 index 0000000..e022658 --- /dev/null +++ b/paper2remarkable/providers/arxiv.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +"""Provider for arxiv.org + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import os +import re +import subprocess + +from ._info import Informer +from ._base import Provider +from ..utils import exception +from ..log import Logger + +logger = Logger() + + +class ArxivInformer(Informer): + pass + + +class Arxiv(Provider): + + re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" + re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = ArxivInformer() + + # register the dearxiv operation + self.operations.insert(0, ("dearxiv", self.dearxiv)) + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match(self.re_pdf, url): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url + + def validate(src): + """Check if the url is to an arXiv page. 
""" + return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) + + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + logger.info("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" + + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] + ) + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) + + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py new file mode 100644 index 0000000..3f581b2 --- /dev/null +++ b/paper2remarkable/providers/local.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- + +"""Provider for local files + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import os +import shutil + +from ._base import Provider +from ._info import Informer + + +class LocalFileInformer(Informer): + def get_filename(self, abs_url): + return os.path.basename(abs_url) + + +class LocalFile(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = LocalFileInformer() + + def get_abs_pdf_urls(self, url): + # The 'url' is the path to the local file. 
We use this as abs_url and + # pdf_url. + return url, url + + def validate(src): + return os.path.exists(src) + + def retrieve_pdf(self, pdf_url, filename): + source = os.path.join(self.initial_dir, pdf_url) + shutil.copy(source, filename) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py new file mode 100644 index 0000000..bfb139d --- /dev/null +++ b/paper2remarkable/providers/openreview.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +"""Provider for OpenReview + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..utils import exception + + +class OpenReviewInformer(Informer): + + meta_date_key = "citation_publication_date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class OpenReview(Provider): + + re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" + re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = OpenReviewInformer() + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract url from a OpenReview url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("forum", "pdf") + elif re.match(self.re_pdf, url): + abs_url = url.replace("pdf", "forum") + pdf_url = url + else: + exception("Couldn't figure out OpenReview urls.") + return abs_url, pdf_url + + def validate(src): + """ Check if the url is a valid OpenReview url. 
""" + return re.match(OpenReview.re_abs, src) or re.match( + OpenReview.re_pdf, src + ) diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py new file mode 100644 index 0000000..d80b1a9 --- /dev/null +++ b/paper2remarkable/providers/pdf_url.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +"""Provider for generic PDF url + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import urllib + +from ._base import Provider +from ._info import Informer +from ..utils import exception + + +class PdfUrlInformer(Informer): + def get_filename(self, abs_url): + # if this is called, filename must not be provided + exception( + "Filename must be provided with PDFUrlProvider (use --filename)" + ) + + +class PdfUrl(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = PdfUrlInformer() + + def get_abs_pdf_urls(self, url): + return (None, url) + + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py new file mode 100644 index 0000000..ba4cca0 --- /dev/null +++ b/paper2remarkable/providers/pubmed.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +"""Provider for PubMed + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ._info import Informer +from ..utils import exception + + +class PubMedInformer(Informer): + + meta_author_key = "citation_authors" + + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(",") + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) + + def _format_year(self, soup_date): + if re.match("\w+\ \d{4}", soup_date): + return soup_date.split(" ")[-1] + return soup_date.replace(" ", "_") + + +class PubMed(Provider): + + re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" + re_pdf = ( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = PubMedInformer() + + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match(self.re_pdf, url): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match(self.re_abs, url): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually + else: + exception("Couldn't figure out PMC urls.") + return abs_url, pdf_url + + def validate(src): + return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py new file mode 100644 index 0000000..ce4acdd --- /dev/null +++ b/paper2remarkable/providers/springer.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +"""Provider for Springer + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. 
van den Burg + +""" + +import re +import urllib + +from ._base import Provider +from ._info import Informer +from ..utils import exception + + +class SpringerInformer(Informer): + + meta_date_key = "citation_online_date" + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class Springer(Provider): + + re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" + re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.informer = SpringerInformer() + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract urls from a Springer url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("article", "content/pdf") + elif re.match(self.re_pdf, url): + abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] + pdf_url = urllib.parse.unquote(url) + else: + exception("Couldn't figure out Springer urls.") + return abs_url, pdf_url + + def validate(src): + return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py new file mode 100644 index 0000000..5323996 --- /dev/null +++ b/paper2remarkable/ui.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +"""Command line interface + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import argparse + +from . 
import __version__ + +from .providers import providers +from .utils import exception + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Paper2reMarkable version %s" % __version__ + ) + parser.add_argument( + "-b", + "--blank", + help="Add a blank page after every page of the PDF", + action="store_true", + ) + parser.add_argument( + "-c", + "--center", + help="Center the PDF on the page, instead of left align", + action="store_true", + ) + parser.add_argument( + "-d", + "--debug", + help="debug mode, doesn't upload to reMarkable", + action="store_true", + ) + parser.add_argument( + "-n", + "--no-upload", + help="don't upload to the reMarkable, save the output in current working dir", + action="store_true", + ) + parser.add_argument( + "-p", + "--remarkable-path", + help="directory on reMarkable to put the file (created if missing, default: /)", + dest="remarkable_dir", + default="/", + ) + parser.add_argument( + "-v", "--verbose", help="be verbose", action="store_true" + ) + parser.add_argument( + "--filename", + help="Filename to use for the file on reMarkable", + default=None, + ) + parser.add_argument( + "--gs", help="path to gs executable (default: gs)", default="gs" + ) + parser.add_argument( + "--pdfcrop", + help="path to pdfcrop executable (default: pdfcrop)", + default="pdfcrop", + ) + parser.add_argument( + "--pdftk", + help="path to pdftk executable (default: pdftk)", + default="pdftk", + ) + parser.add_argument( + "--rmapi", + help="path to rmapi executable (default: rmapi)", + default="rmapi", + ) + parser.add_argument( + "input", + help="URL to a paper or the path of a local PDF file", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + provider = next((p for p in providers if p.validate(args.input)), None) + if provider is None: + exception("Input not valid, no provider can handle this source.") + + prov = provider( + verbose=args.verbose, + upload=not args.no_upload, + debug=args.debug, + 
center=args.center, + blank=args.blank, + remarkable_dir=args.remarkable_dir, + rmapi_path=args.rmapi, + pdfcrop_path=args.pdfcrop, + pdftk_path=args.pdftk, + gs_path=args.gs, + ) + + prov.run(args.input, filename=args.filename) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py new file mode 100644 index 0000000..a313ffe --- /dev/null +++ b/paper2remarkable/utils.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +"""Utility functions for a2r + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import PyPDF2 +import requests +import string +import subprocess +import sys +import time +import unidecode + +from . import GITHUB_URL +from .log import Logger + +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " + "Safari/537.36" +} + + +logger = Logger() + + +def exception(msg): + print("ERROR: " + msg, file=sys.stderr) + print("Error occurred. Exiting.", file=sys.stderr) + print("", file=sys.stderr) + print( + "If you think this might be a bug, please raise an issue on GitHub: %s" + % GITHUB_URL + ) + raise SystemExit(1) + + +def clean_string(s): + """ Clean a string by replacing accented characters with equivalents and + keeping only the allowed characters (ascii letters, digits, underscore, + space, dash, and period)""" + normalized = unidecode.unidecode(s) + allowed = string.ascii_letters + string.digits + "_ .-" + cleaned = "".join(c if c in allowed else "_" for c in normalized) + while "__" in cleaned: + cleaned = cleaned.replace("__", "_") + return cleaned + + +def assert_file_is_pdf(filename): + """Assert that a given file is a PDF file. + + This is done by trying to open it using PyPDF2. + """ + try: + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf + return True + except PyPDF2.utils.PdfReadError: + exception("File %s isn't a valid pdf file." 
% filename) + + +def download_url(url, filename): + """Download the content of an url and save it to a filename """ + logger.info("Downloading file at url: %s" % url) + content = get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + +def get_page_with_retry(url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logger.warning( + "(%i/%i) Error getting url %s. Retrying in 5 seconds." + % (count, tries, url) + ) + time.sleep(5) + continue + logger.info("Downloading url: %s" % url) + return res.content + + +def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): + logger.info("Starting upload to reMarkable") + + # Create the reMarkable dir if it doesn't exist + remarkable_dir = remarkable_dir.rstrip("/") + if remarkable_dir: + status = subprocess.call( + [rmapi_path, "mkdir", remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" % remarkable_dir + ) + + # Upload the file + status = subprocess.call( + [rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + logger.info("Upload successful.") diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index 272967c..0000000 --- a/poetry.lock +++ /dev/null @@ -1,183 +0,0 @@ -[[package]] -category = "main" -description = "Screen-scraping library" -name = "beautifulsoup4" -optional = false -python-versions = "*" -version = "4.7.1" - -[package.dependencies] -soupsieve = ">=1.2" - -[[package]] -category = "main" -description = "Dummy package for Beautiful Soup" -name = "bs4" -optional = false -python-versions = "*" -version = "0.0.1" - -[package.dependencies] -beautifulsoup4 = "*" - -[[package]] -category = 
"main" -description = "Python package for providing Mozilla's CA Bundle." -name = "certifi" -optional = false -python-versions = "*" -version = "2018.11.29" - -[[package]] -category = "main" -description = "Universal encoding detector for Python 2 and 3" -name = "chardet" -optional = false -python-versions = "*" -version = "3.0.4" - -[[package]] -category = "main" -description = "Internationalized Domain Names in Applications (IDNA)" -name = "idna" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.8" - -[[package]] -category = "main" -description = "PDF parser and analyzer" -name = "pdfminer.six" -optional = false -python-versions = "*" -version = "20181108" - -[package.dependencies] -pycryptodome = "*" -six = "*" -sortedcontainers = "*" - -[[package]] -category = "main" -description = "Plumb a PDF for detailed information about each char, rectangle, and line." -name = "pdfplumber" -optional = false -python-versions = "*" -version = "0.5.12" - -[package.dependencies] -chardet = "*" -"pdfminer.six" = "20181108" -pillow = ">=3.0.0" -pycryptodome = "*" -unicodecsv = ">=0.14.1" -wand = "*" - -[[package]] -category = "main" -description = "Python Imaging Library (Fork)" -name = "pillow" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "6.0.0" - -[[package]] -category = "main" -description = "Cryptographic library for Python" -name = "pycryptodome" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "3.8.2" - -[[package]] -category = "main" -description = "Python HTTP for Humans." 
-name = "requests" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.21.0" - -[package.dependencies] -certifi = ">=2017.4.17" -chardet = ">=3.0.2,<3.1.0" -idna = ">=2.5,<2.9" -urllib3 = ">=1.21.1,<1.25" - -[[package]] -category = "main" -description = "Python 2 and 3 compatibility utilities" -name = "six" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*" -version = "1.12.0" - -[[package]] -category = "main" -description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" -name = "sortedcontainers" -optional = false -python-versions = "*" -version = "2.1.0" - -[[package]] -category = "main" -description = "A CSS4 selector implementation for Beautiful Soup." -name = "soupsieve" -optional = false -python-versions = "*" -version = "1.7.3" - -[[package]] -category = "main" -description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*." -name = "unicodecsv" -optional = false -python-versions = "*" -version = "0.14.1" - -[[package]] -category = "main" -description = "ASCII transliterations of Unicode text" -name = "unidecode" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "1.1.1" - -[[package]] -category = "main" -description = "HTTP library with thread-safe connection pooling, file post, and more." 
-name = "urllib3" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" -version = "1.24.1" - -[[package]] -category = "main" -description = "Ctypes-based simple MagickWand API binding for Python" -name = "wand" -optional = false -python-versions = "*" -version = "0.5.4" - -[metadata] -content-hash = "51a0dc0e8f6e6e23395cd5aca6a81e9b3aa121ec86f120f1304f2142eb2b65b0" -python-versions = "^3.5" - -[metadata.hashes] -beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"] -bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] -certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] -chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] -idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] -"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"] -pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"] -pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", "10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", 
"258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", "9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", "c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", 
"d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"] -pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", "a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", 
"c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"] -requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] -six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"] -sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"] -soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] -unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"] -unidecode = ["1d7a042116536098d05d599ef2b8616759f02985c85b4fef50c78a5aaf10822a", "2b6aab710c2a1647e928e36d69c21e76b453cd455f4e2621000e54b2a9b8cce8"] -urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] -wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"] diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 7e9c629..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[tool.poetry] -name = "arxiv2remarkable" -version = "0.1.0" -description = "Download an arXiv paper and send it to reMarkable" -authors = ["Gertjan van den Burg <gertjanvandenburg@gmail.com>"] -license = "MIT" - -[tool.poetry.dependencies] -python = "^3.5" -bs4 = "^0.0.1" -requests = "^2.21" -pdfplumber = "^0.5.12" -unidecode = "^1.1" - -[tool.poetry.dev-dependencies] - -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" diff --git a/setup.py b/setup.py new file mode 100644 index 
0000000..f54170a --- /dev/null +++ b/setup.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import io +import os + +from setuptools import find_packages, setup + +# Package meta-data. +AUTHOR = "Gertjan van den Burg" +DESCRIPTION = "Easily download an academic paper and send it to the reMarkable" +EMAIL = "gertjanvandenburg@gmail.com" +LICENSE = "MIT" +LICENSE_TROVE = "License :: OSI Approved :: MIT License" +NAME = "paper2remarkable" +REQUIRES_PYTHON = ">=3.5.0" +URL = "https://github.com/GjjvdBurg/paper2remarkable" +VERSION = None + +# What packages are required for this module to be executed? +REQUIRED = [ + "beautifulsoup4>=4.8", + "requests>=2.21", + "pdfplumber>=0.5", + "unidecode>=1.1", + "titlecase>=0.12", + "PyPDF2>=1.26", +] + +docs_require = [] +test_require = [] +dev_require = ["green"] + +# What packages are optional? +EXTRAS = { + "docs": docs_require, + "tests": test_require, + "dev": docs_require + test_require + dev_require, +} + +# The rest you shouldn't have to touch too much :) +# ------------------------------------------------ +# Except, perhaps the License and Trove Classifiers! +# If you do change the License, remember to change the Trove Classifier for that! + +here = os.path.abspath(os.path.dirname(__file__)) + +# Import the README and use it as the long-description. +# Note: this will only work if 'README.md' is present in your MANIFEST.in file! +try: + with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: + long_description = "\n" + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + +# Load the package's __version__.py module as a dictionary. 
+about = {} +if not VERSION: + project_slug = NAME.lower().replace("-", "_").replace(" ", "_") + with open(os.path.join(here, project_slug, "__version__.py")) as f: + exec(f.read(), about) +else: + about["__version__"] = VERSION + +# Where the magic happens: +setup( + name=NAME, + version=about["__version__"], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages( + exclude=["tests", "*.tests", "*.tests.*", "tests.*"] + ), + install_requires=REQUIRED, + extras_require=EXTRAS, + include_package_data=True, + license=LICENSE, + ext_modules=[], + entry_points={"console_scripts": ["p2r = paper2remarkable.__main__:main"]}, + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + LICENSE_TROVE, + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Topic :: Utilities", + ], +) diff --git a/test.py b/tests/test_providers.py index 83c74af..bb793b3 100644 --- a/test.py +++ b/tests/test_providers.py @@ -11,17 +11,17 @@ import hashlib import shutil import os -from arxiv2remarkable import ( +from paper2remarkable.providers import ( ACM, Arxiv, LocalFile, OpenReview, PdfUrl, - Pubmed, + PubMed, Springer, ) -VERBOSE = False +VERBOSE = True def md5sum(filename): @@ -56,7 +56,7 @@ class Tests(unittest.TestCase): self.assertEqual(exp_filename, os.path.basename(filename)) def test_pmc(self): - prov = Pubmed(upload=False, verbose=VERBOSE) + prov = PubMed(upload=False, verbose=VERBOSE) url = 
"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" exp_filename = ( "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" |
