diff options
| -rw-r--r-- | Makefile | 62 | ||||
| -rw-r--r-- | paper2remarkable/__init__.py | 0 | ||||
| -rw-r--r-- | paper2remarkable/__main__.py | 15 | ||||
| -rw-r--r-- | paper2remarkable/__version__.py | 5 | ||||
| -rw-r--r-- | paper2remarkable/crop.py | 160 | ||||
| -rw-r--r-- | paper2remarkable/providers/__init__.py | 11 | ||||
| -rw-r--r-- | paper2remarkable/providers/_base.py | 380 | ||||
| -rw-r--r-- | paper2remarkable/providers/acm.py | 74 | ||||
| -rw-r--r-- | paper2remarkable/providers/arxiv.py | 39 | ||||
| -rw-r--r-- | paper2remarkable/providers/local.py | 34 | ||||
| -rw-r--r-- | paper2remarkable/providers/openreview.py | 46 | ||||
| -rw-r--r-- | paper2remarkable/providers/pdf_url.py | 39 | ||||
| -rw-r--r-- | paper2remarkable/providers/pubmed.py | 51 | ||||
| -rw-r--r-- | paper2remarkable/providers/springer.py | 44 | ||||
| -rw-r--r-- | paper2remarkable/ui.py | 96 | ||||
| -rw-r--r-- | paper2remarkable/utils.py | 25 | ||||
| -rw-r--r-- | setup.py | 98 |
17 files changed, 1179 insertions, 0 deletions
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ed2d040 --- /dev/null +++ b/Makefile @@ -0,0 +1,62 @@ +# Makefile for easier installation and cleanup. +# +# Uses self-documenting macros from here: +# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html + +PACKAGE=paper2remarkable +DOC_DIR='./docs/' +VENV_DIR='/tmp/p2r_venv/' + +.PHONY: help cover dist + +.DEFAULT_GOAL := help + +help: + @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ + awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\ + %s\n", $$1, $$2}' + +release: ## Make a release + python make_release.py + + +install: ## Install for the current user using the default python command + python setup.py build_ext --inplace + python setup.py install --user + + +test: venv ## Run unit tests + source $(VENV_DIR)/bin/activate && green -v ./tests/test_unit + + +clean: ## Clean build dist and egg directories left after install + rm -rf ./dist + rm -rf ./build + rm -rf ./$(PACKAGE).egg-info + rm -rf $(VENV_DIR) + rm -f MANIFEST + find . -type f -iname '*.pyc' -delete + find . -type d -name '__pycache__' -empty -delete + +dist: ## Make Python source distribution + python setup.py sdist + python setup.py bdist_wheel --universal + +docs: doc +doc: install ## Build documentation with Sphinx + m2r README.md && mv README.rst $(DOC_DIR) + m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR) + cd $(DOC_DIR) && \ + rm source/* && \ + sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \ + touch source/AUTOGENERATED + $(MAKE) -C $(DOC_DIR) html + + + +venv: $(VENV_DIR)/bin/activate + +$(VENV_DIR)/bin/activate: + test -d $(VENV_DIR) || virtualenv $(VENV_DIR) + source $(VENV_DIR)/bin/activate && pip install -q -e .[dev] + touch $(VENV_DIR)/bin/activate diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/paper2remarkable/__init__.py diff --git a/paper2remarkable/__main__.py b/paper2remarkable/__main__.py new file mode 100644 index 0000000..b97d538 --- /dev/null +++ b/paper2remarkable/__main__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +""" +Caller for the command line application +""" + +import sys + +def main(): + from .ui import main as realmain + + sys.exit(realmain()) + +if __name__ == '__main__': + main() diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py new file mode 100644 index 0000000..5bee2af --- /dev/null +++ b/paper2remarkable/__version__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +VERSION = (0, 4, 0) + +__version__ = '.'.join(map(str, VERSION)) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py new file mode 100644 index 0000000..b25b178 --- /dev/null +++ b/paper2remarkable/crop.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +"""Code for cropping a PDF file + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import PyPDF2 +import os +import subprocess +import pdfplumber + +RM_WIDTH = 1404 +RM_HEIGHT = 1872 + + +class Cropper(object): + def __init__( + self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + ): + if not input_file is None: + self.input_file = os.path.abspath(input_file) + self.reader = PyPDF2.PdfFileReader(self.input_file) + if not output_file is None: + self.output_file = os.path.abspath(output_file) + self.pdfcrop_path = pdfcrop_path + + self.writer = PyPDF2.PdfFileWriter() + + def crop(self, margins=1): + return self.process_file(self.crop_page, margins=margins) + + def center(self, padding=15): + return self.process_file(self.center_page, padding=padding) + + def process_file(self, page_func, *args, **kwargs): + for page_idx in range(self.reader.getNumPages()): + status = page_func(page_idx, *args, **kwargs) + if not status == 0: + return status + with open(self.output_file, "wb") as fp: + self.writer.write(fp) + return 0 + + def center_page(self, page_idx, padding): + return self.process_page( + page_idx, self.get_center_bbox, padding=padding + ) + + def crop_page(self, page_idx, margins): + return self.process_page(page_idx, self.get_bbox, margins=margins) + + def export_page(self, page_idx): + """Helper function that exports a single page given by index """ + page = self.reader.getPage(page_idx) + writer = PyPDF2.PdfFileWriter() + writer.addPage(page) + tmpfname = "./page.pdf" + with open(tmpfname, "wb") as fp: + writer.write(fp) + return tmpfname + + def process_page(self, page_idx, bbox_func, *args, **kwargs): + """Process a single page and add it to the writer """ + tmpfname = self.export_page(page_idx) + tmpfout = "./output.pdf" + bbox = bbox_func(tmpfname, *args, **kwargs) + status = subprocess.call( + [ + self.pdfcrop_path, + "--bbox", + " ".join(map(str, bbox)), + tmpfname, + tmpfout, + ], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + return status + reader = PyPDF2.PdfFileReader(tmpfout) + page = reader.getPage(0) + self.writer.addPage(page) + os.unlink(tmpfname) + os.unlink(tmpfout) + return 0 + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. + """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + pdf = pdfplumber.open(filename) + im = pdf.pages[0].to_image(resolution=resolution) + pdf.close() + + pixels = list(im.original.getdata()) + W, H = im.original.size + + # M is a list of H lists with each W integers that equal the sum of the + # pixel values + M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)] + + left, top, bottom, right = 0, 0, 0, 0 + while top < H and sum(M[top]) == W * 255 * 3: + top += 1 + while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3: + bottom += 1 + + # Transpose M + M = list(zip(*M)) + while left < W and sum(M[left]) == H * 255 * 3: + left += 1 + while right < W and sum(M[W - 1 - right]) == H * 255 * 3: + right += 1 + + left -= margins[0] + top -= margins[1] + right -= margins[2] + bottom -= margins[3] + + # This is the bounding box in PIL format: (0, 0) top left + x0, y0, x1, y1 = left, top, W - right, H - bottom + + # Get the bbox in Ghostscript format: (0, 0) bottom left + a0, b0, a1, b1 = x0, H - y1, x1, H - y0 + return [a0, b0, a1, b1] + + def get_center_bbox(self, filename, padding=15): + """Compute a bounding box that will center the page file on the + reMarkable + """ + bbox = self.get_bbox(filename, margins=0) + + h = bbox[3] - bbox[1] + w = bbox[2] - bbox[0] + + # we want some minimal padding all around, because it is visually more + # pleasing. + h_prime = h + 2 * padding + w_prime = w + 2 * padding + + # if the document is wider than the remarkable, we add top-padding to + # center it, otherwise we add left-padding + x, y = 0, 0 + if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: + y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 + else: + x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2 + + margins = [padding + x, padding + y, padding, padding] + return self.get_bbox(filename, margins=margins) diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py new file mode 100644 index 0000000..361c11e --- /dev/null +++ b/paper2remarkable/providers/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +from .arxiv import Arxiv +from .pubmed import Pubmed +from .acm import ACM +from .openreview import OpenReview +from .springer import Springer +from .local import LocalFile +from .pdf_url import PdfUrl + +providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py new file mode 100644 index 0000000..05fc0b7 --- /dev/null +++ b/paper2remarkable/providers/_base.py @@ -0,0 +1,380 @@ +# -*- coding: utf-8 -*- + +"""Base for the Provider class + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import PyPDF2 +import abc +import bs4 +import datetime +import os +import re +import requests +import shutil +import string +import subprocess +import tempfile +import time +import titlecase +import unidecode + +from ..crop import Cropper +from ..utils import exception + +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " + "Safari/537.36" +} + + +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ + + meta_author_key = "citation_author" + meta_title_key = "citation_title" + meta_date_key = "citation_date" + + def __init__( + self, + verbose=False, + upload=True, + debug=False, + center=False, + blank=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.verbose = verbose + self.upload = upload + self.debug = debug + self.center = center + self.blank = blank + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + + self.log("Starting %s" % type(self).__name__) + + def log(self, msg, mode="info"): + if not self.verbose: + return + if not mode in ["info", "warning"]: + raise ValueError("unknown logging mode.") + now = datetime.datetime.now() + print( + now.strftime("%Y-%m-%d %H:%M:%S") + + " - " + + mode.upper() + + " - " + + msg + ) + + def warn(self, msg): + self.log(msg, mode="warning") + + @staticmethod + @abc.abstractmethod + def validate(src): + """ Validate whether ``src`` is appropriate for this provider """ + + def retrieve_pdf(self, src, filename): + """ Download pdf from src and save to filename """ + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def _format_authors(self, soup_authors, sep=",", idx=0, op=None): + op = (lambda x: x) if op is None else op + # format the author list retrieved by bs4 + return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] + + def get_authors(self, soup): + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": self.meta_author_key}) + ] + return self._format_authors(authors) + + def get_title(self, soup): + target = soup.find_all("meta", {"name": self.meta_title_key}) + return target[0]["content"] + + def _format_date(self, soup_date): + return soup_date + + def get_date(self, soup): + date = soup.find_all("meta", {"name": self.meta_date_key})[0][ + "content" + ] + return self._format_date(date) + + def get_paper_info( + self, + src, + author_key="citation_author", + title_key="citation_title", + date_key="citation_date", + ): + """ Retrieve the title/author (surnames)/year information """ + abs_url, _ = self.get_abs_pdf_urls(src) + self.log("Getting paper info") + page = self.get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = self.get_authors(soup) + title = self.get_title(soup) + date = self.get_date(soup) + return dict(title=title, date=date, authors=authors) + + def string_clean(self, s): + """ Clean a string to replace accented characters with equivalents and + keep only the allowed characters """ + normalized = unidecode.unidecode(s) + allowed = string.ascii_letters + string.digits + "_ ." + cleaned = "".join(c if c in allowed else "_" for c in normalized) + return cleaned + + def create_filename(self, info, filename=None): + """ Generate filename using the info dict or filename if provided """ + if not filename is None: + return filename + # we assume that the list of authors is surname only. + self.log("Generating output filename") + + if len(info["authors"]) > 3: + author_part = info["authors"][0] + "_et_al" + else: + author_part = "_".join(info["authors"]) + author_part = self.string_clean(author_part) + + title_part = self.string_clean(info["title"]) + title_part = titlecase.titlecase(title_part).replace(" ", "_") + + year_part = info["date"].split("/")[0] + + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + name = unidecode.unidecode(name) + self.log("Created filename: %s" % name) + return name + + def blank_pdf(self, filepath): + if not self.blank: + return filepath + + self.log("Adding blank pages") + input_pdf = PyPDF2.PdfFileReader(filepath) + output_pdf = PyPDF2.PdfFileWriter() + for page in input_pdf.pages: + output_pdf.addPage(page) + output_pdf.addBlankPage() + + output_file = os.path.splitext(filepath)[0] + "-blank.pdf" + with open(output_file, "wb") as fp: + output_pdf.write(fp) + return output_file + + def crop_pdf(self, filepath): + self.log("Cropping pdf file") + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + cropper = Cropper( + filepath, cropped_file, pdfcrop_path=self.pdfcrop_path + ) + status = cropper.crop(margins=15) + + if not status == 0: + self.warn("Failed to crop the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(cropped_file): + self.warn( + "Can't find cropped file '%s' where expected." % cropped_file + ) + return filepath + return cropped_file + + def center_pdf(self, filepath): + if not self.center: + return filepath + + self.log("Centering pdf file") + centered_file = os.path.splitext(filepath)[0] + "-center.pdf" + cropper = Cropper( + filepath, centered_file, pdfcrop_path=self.pdfcrop_path + ) + status = cropper.center() + if not status == 0: + self.warn("Failed to center the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(centered_file): + self.warn( + "Can't find centered file '%s' where expected." % centered_file + ) + return filepath + return centered_file + + def shrink_pdf(self, filepath): + self.log("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + self.warn("Failed to shrink the pdf file") + return filepath + return output_file + + def check_file_is_pdf(self, filename): + try: + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf + return True + except PyPDF2.utils.PdfReadError: + exception("Downloaded file isn't a valid pdf file.") + + def download_url(self, url, filename): + """Download the content of an url and save it to a filename """ + self.log("Downloading file at url: %s" % url) + content = self.get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + def get_page_with_retry(self, url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + self.warn("Error getting url %s. Retrying in 5 seconds" % url) + time.sleep(5) + continue + self.log("Downloading url: %s" % url) + return res.content + + def upload_to_rm(self, filepath): + remarkable_dir = self.remarkable_dir.rstrip("/") + self.log("Starting upload to reMarkable") + if remarkable_dir: + status = subprocess.call( + [self.rmapi_path, "mkdir", remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) + status = subprocess.call( + [self.rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + self.log("Upload successful.") + + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" + + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] + ) + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) + + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file + + def run(self, src, filename=None): + info = self.get_paper_info(src) + clean_filename = self.create_filename(info, filename) + tmp_filename = "paper.pdf" + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [ + self.dearxiv, + self.crop_pdf, + self.center_pdf, + self.blank_pdf, + self.shrink_pdf, + ] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return self.upload_to_rm(clean_filename) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py new file mode 100644 index 0000000..be98e16 --- /dev/null +++ b/paper2remarkable/providers/acm.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +"""Provider for ACM + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import bs4 +import re + +from . import Provider +from ..utils import exception + +# TODO: put this somewhere central, now multiply defined +GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" + +class ACM(Provider): + + meta_author_key = "citation_authors" + + re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_acm_pdf_url(self, url): + page = self.get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = self.get_acm_pdf_url(url) + if pdf_url is None: + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) + else: + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return abs_url, pdf_url + + def validate(src): + m = re.fullmatch(ACM.re_abs, src) + return not m is None + + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(";") + return super()._format_authors(soup_authors, sep=",", idx=0, op=op) + + def _format_date(self, soup_date): + if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so it can be fixed: %s" % GITHUB_URL + ) + return soup_date.strip().split("/")[-1] diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py new file mode 100644 index 0000000..fc5c004 --- /dev/null +++ b/paper2remarkable/providers/arxiv.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +"""Provider for arxiv.org + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import re + +from ._base import Provider +from ..utils import exception + + +class Arxiv(Provider): + + re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" + re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match(self.re_pdf, url): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url + + def validate(src): + """Check if the url is to an arXiv page. """ + return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py new file mode 100644 index 0000000..68ce030 --- /dev/null +++ b/paper2remarkable/providers/local.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- + +"""Provider for local files + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import os +import shutil + +from . import Provider + + +class LocalFile(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def validate(src): + return os.path.exists(src) + + def retrieve_pdf(self, src, filename): + source = os.path.join(self.initial_dir, src) + shutil.copy(source, filename) + + def get_paper_info(self, src): + return {"filename": src} + + def create_filename(self, info, filename=None): + if not filename is None: + return filename + return os.path.basename(info["filename"]) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py new file mode 100644 index 0000000..b7e1d77 --- /dev/null +++ b/paper2remarkable/providers/openreview.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- + +"""Provider for OpenReview + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import re + +from . import Provider +from ..utils import exception + + +class OpenReview(Provider): + + meta_date_key = "citation_publication_date" + + re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" + re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract url from a OpenReview url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("forum", "pdf") + elif re.match(self.re_pdf, url): + abs_url = url.replace("pdf", "forum") + pdf_url = url + else: + exception("Couldn't figure out OpenReview urls.") + return abs_url, pdf_url + + def validate(src): + """ Check if the url is a valid OpenReview url. """ + return re.match(OpenReview.re_abs, src) or re.match( + OpenReview.re_pdf, src + ) + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py new file mode 100644 index 0000000..56427d3 --- /dev/null +++ b/paper2remarkable/providers/pdf_url.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +"""Provider for generic PDF url + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import urllib + +from . import Provider +from ..utils import exception + + +class PdfUrl(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False + + def retrieve_pdf(self, url, filename): + self.download_url(url, filename) + + def get_paper_info(self, src): + return None + + def create_filename(self, info, filename=None): + if filename is None: + exception( + "Filename must be provided with PDFUrlProvider (use --filename)" + ) + return filename diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py new file mode 100644 index 0000000..29bdb31 --- /dev/null +++ b/paper2remarkable/providers/pubmed.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +"""Provider for PubMed + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import re + +from . import Provider +from ..utils import exception + +class Pubmed(Provider): + + meta_author_key = "citation_authors" + + re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" + re_pdf = ( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match(self.re_pdf, url): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match(self.re_abs, url): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually + else: + exception("Couldn't figure out PMC urls.") + return abs_url, pdf_url + + def validate(src): + return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src) + + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(",") + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) + + def _format_date(self, soup_date): + if re.match("\w+\ \d{4}", soup_date): + return soup_date.split(" ")[-1] + return soup_date.replace(" ", "_") diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py new file mode 100644 index 0000000..ce16007 --- /dev/null +++ b/paper2remarkable/providers/springer.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +"""Provider for Springer + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import re +import urllib + +from . import Provider +from ..utils import exception + + +class Springer(Provider): + + meta_date_key = "citation_online_date" + + re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" + re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """ Get the pdf and abstract urls from a Springer url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("article", "content/pdf") + elif re.match(self.re_pdf, url): + abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] + pdf_url = urllib.parse.unquote(url) + else: + exception("Couldn't figure out Springer urls.") + return abs_url, pdf_url + + def validate(src): + return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) + + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py new file mode 100644 index 0000000..71fc655 --- /dev/null +++ b/paper2remarkable/ui.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +"""Command line interface + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import argparse + +from .providers import providers +from .utils import exception + + +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-b", + "--blank", + help="Add a blank page after every page of the PDF", + action="store_true", + ) + parser.add_argument( + "-v", "--verbose", help="be verbose", action="store_true" + ) + parser.add_argument( + "-n", + "--no-upload", + help="don't upload to the reMarkable, save the output in current working dir", + action="store_true", + ) + parser.add_argument( + "-d", + "--debug", + help="debug mode, doesn't upload to reMarkable", + action="store_true", + ) + parser.add_argument( + "-c", + "--center", + help="Center the PDF on the page, instead of left align", + action="store_true", + ) + parser.add_argument( + "--filename", + help="Filename to use for the file on reMarkable", + default=None, + ) + parser.add_argument( + "-p", + "--remarkable-path", + help="directory on reMarkable to put the file (created if missing)", + dest="remarkable_dir", + default="/", + ) + parser.add_argument( + "--rmapi", help="path to rmapi executable", default="rmapi" + ) + parser.add_argument( + "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop" + ) + parser.add_argument( + "--pdftk", help="path to pdftk executable", default="pdftk" + ) + parser.add_argument("--gs", help="path to gs executable", default="gs") + parser.add_argument( + "input", help="URL to a paper or the path of a local PDF file" + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + provider = next((p for p in providers if p.validate(args.input)), None) + if provider is None: + exception("Input not valid, no provider can handle this source.") + + prov = provider( + verbose=args.verbose, + upload=not args.no_upload, + debug=args.debug, + center=args.center, + blank=args.blank, + remarkable_dir=args.remarkable_dir, + rmapi_path=args.rmapi, + pdfcrop_path=args.pdfcrop, + pdftk_path=args.pdftk, + gs_path=args.gs, + ) + + prov.run(args.input, filename=args.filename) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py new file mode 100644 index 0000000..af19d22 --- /dev/null +++ b/paper2remarkable/utils.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +"""Utility functions for a2r + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + + +import sys + +GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" + + +def exception(msg): + print("ERROR: " + msg, file=sys.stderr) + print("Error occurred. Exiting.", file=sys.stderr) + print("", file=sys.stderr) + print( + "If you think this might be a bug, please raise an issue on GitHub: %s" + % GITHUB_URL + ) + raise SystemExit(1) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..e5a697e --- /dev/null +++ b/setup.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import io +import os + +from setuptools import find_packages, setup + +# Package meta-data. +AUTHOR = "Gertjan van den Burg" +DESCRIPTION = "Easily download an academic paper and send it to the reMarkable" +EMAIL = "gertjanvandenburg@gmail.com" +LICENSE = "MIT" +LICENSE_TROVE = "License :: OSI Approved :: MIT License" +NAME = "paper2remarkable" +REQUIRES_PYTHON = ">=3.5.0" +URL = "https://github.com/GjjvdBurg/paper2remarkable" +VERSION = None + +# What packages are required for this module to be executed? +REQUIRED = [ + "bs4>=4.8.0", + "requests>=2.21", + "pdfplumber>=0.5.12", + "unidecode>=1.1" +] + +docs_require = [] +test_require = [] +dev_require = [] + +# What packages are optional? +EXTRAS = { + "docs": docs_require, + "tests": test_require, + "dev": docs_require + test_require + dev_require, +} + +# The rest you shouldn't have to touch too much :) +# ------------------------------------------------ +# Except, perhaps the License and Trove Classifiers! +# If you do change the License, remember to change the Trove Classifier for that! + +here = os.path.abspath(os.path.dirname(__file__)) + +# Import the README and use it as the long-description. +# Note: this will only work if 'README.md' is present in your MANIFEST.in file! +try: + with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: + long_description = "\n" + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + +# Load the package's __version__.py module as a dictionary. +about = {} +if not VERSION: + project_slug = NAME.lower().replace("-", "_").replace(" ", "_") + with open(os.path.join(here, project_slug, "__version__.py")) as f: + exec(f.read(), about) +else: + about["__version__"] = VERSION + +# Where the magic happens: +setup( + name=NAME, + version=about["__version__"], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages( + exclude=["tests", "*.tests", "*.tests.*", "tests.*"] + ), + install_requires=REQUIRED, + extras_require=EXTRAS, + include_package_data=True, + license=LICENSE, + ext_modules=[], + entry_points={"console_scripts": ["p2r = paper2remarkable.__main__:main"]}, + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + LICENSE_TROVE, + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Topic :: Utilities", + ], +) |
