From 0105cd484921ca854a1489abcaa35d0167c85ceb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:01:27 +0100 Subject: Move tests to separate directory --- test.py | 106 ---------------------------------------------------------- tests/test.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 106 deletions(-) delete mode 100644 test.py create mode 100644 tests/test.py diff --git a/test.py b/test.py deleted file mode 100644 index 83c74af..0000000 --- a/test.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__author__ = "G.J.J. van den Burg" - -"""Tests""" - -import unittest -import tempfile -import hashlib -import shutil -import os - -from arxiv2remarkable import ( - ACM, - Arxiv, - LocalFile, - OpenReview, - PdfUrl, - Pubmed, - Springer, -) - -VERBOSE = False - - -def md5sum(filename): - blocksize = 65536 - hasher = hashlib.md5() - with open(filename, "rb") as fid: - buf = fid.read(blocksize) - while len(buf) > 0: - hasher.update(buf) - buf = fid.read(blocksize) - return hasher.hexdigest() - - -class Tests(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.original_dir = os.getcwd() - - def setUp(self): - self.test_dir = tempfile.mkdtemp() - os.chdir(self.test_dir) - - def tearDown(self): - os.chdir(self.original_dir) - shutil.rmtree(self.test_dir) - - def test_arxiv(self): - prov = Arxiv(upload=False, verbose=VERBOSE) - url = "https://arxiv.org/abs/1811.11242v1" - exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_pmc(self): - prov = Pubmed(upload=False, verbose=VERBOSE) - url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" - exp_filename = ( - "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" - ) - filename = prov.run(url) - self.assertEqual(exp_filename, 
os.path.basename(filename)) - - def test_acm(self): - prov = ACM(upload=False, verbose=VERBOSE) - url = "https://dl.acm.org/citation.cfm?id=3025626" - exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_openreview(self): - prov = OpenReview(upload=False, verbose=VERBOSE) - url = "https://openreview.net/forum?id=S1x4ghC9tQ" - exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_springer(self): - prov = Springer(upload=False, verbose=VERBOSE) - url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" - exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_local(self): - local_filename = "test.pdf" - with open(local_filename, "w") as fp: - fp.write( - "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" - ) - prov = LocalFile(upload=False, verbose=VERBOSE) - filename = prov.run(local_filename) - self.assertEqual("test_.pdf", os.path.basename(filename)) - - def test_pdfurl(self): - prov = PdfUrl(upload=False, verbose=VERBOSE) - url = 
"http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" - filename = prov.run(url, filename="test.pdf") - self.assertEqual("test.pdf", os.path.basename(filename)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test.py b/tests/test.py new file mode 100644 index 0000000..83c74af --- /dev/null +++ b/tests/test.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__author__ = "G.J.J. van den Burg" + +"""Tests""" + +import unittest +import tempfile +import hashlib +import shutil +import os + +from arxiv2remarkable import ( + ACM, + Arxiv, + LocalFile, + OpenReview, + PdfUrl, + Pubmed, + Springer, +) + +VERBOSE = False + + +def md5sum(filename): + blocksize = 65536 + hasher = hashlib.md5() + with open(filename, "rb") as fid: + buf = fid.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = fid.read(blocksize) + return hasher.hexdigest() + + +class Tests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_arxiv(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/abs/1811.11242v1" + exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_pmc(self): + prov = Pubmed(upload=False, verbose=VERBOSE) + url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" + exp_filename = ( + "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_acm(self): + prov = ACM(upload=False, verbose=VERBOSE) + url = "https://dl.acm.org/citation.cfm?id=3025626" + exp_filename = 
"Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_openreview(self): + prov = OpenReview(upload=False, verbose=VERBOSE) + url = "https://openreview.net/forum?id=S1x4ghC9tQ" + exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_springer(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" + exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_local(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + prov = LocalFile(upload=False, verbose=VERBOSE) + filename = prov.run(local_filename) + self.assertEqual("test_.pdf", os.path.basename(filename)) + + def test_pdfurl(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url, filename="test.pdf") + self.assertEqual("test.pdf", os.path.basename(filename)) + + +if 
__name__ == "__main__": + unittest.main() -- cgit v1.2.3 From 83df50f47426cefb71c2f4fde161c8fad934dba3 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:02:36 +0100 Subject: [wip] First commit of new code structure Not fully functional yet probably --- Makefile | 62 +++++ paper2remarkable/__init__.py | 0 paper2remarkable/__main__.py | 15 ++ paper2remarkable/__version__.py | 5 + paper2remarkable/crop.py | 160 +++++++++++++ paper2remarkable/providers/__init__.py | 11 + paper2remarkable/providers/_base.py | 380 +++++++++++++++++++++++++++++++ paper2remarkable/providers/acm.py | 74 ++++++ paper2remarkable/providers/arxiv.py | 39 ++++ paper2remarkable/providers/local.py | 34 +++ paper2remarkable/providers/openreview.py | 46 ++++ paper2remarkable/providers/pdf_url.py | 39 ++++ paper2remarkable/providers/pubmed.py | 51 +++++ paper2remarkable/providers/springer.py | 44 ++++ paper2remarkable/ui.py | 96 ++++++++ paper2remarkable/utils.py | 25 ++ setup.py | 98 ++++++++ 17 files changed, 1179 insertions(+) create mode 100644 Makefile create mode 100644 paper2remarkable/__init__.py create mode 100644 paper2remarkable/__main__.py create mode 100644 paper2remarkable/__version__.py create mode 100644 paper2remarkable/crop.py create mode 100644 paper2remarkable/providers/__init__.py create mode 100644 paper2remarkable/providers/_base.py create mode 100644 paper2remarkable/providers/acm.py create mode 100644 paper2remarkable/providers/arxiv.py create mode 100644 paper2remarkable/providers/local.py create mode 100644 paper2remarkable/providers/openreview.py create mode 100644 paper2remarkable/providers/pdf_url.py create mode 100644 paper2remarkable/providers/pubmed.py create mode 100644 paper2remarkable/providers/springer.py create mode 100644 paper2remarkable/ui.py create mode 100644 paper2remarkable/utils.py create mode 100644 setup.py diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ed2d040 --- /dev/null +++ b/Makefile @@ -0,0 +1,62 
@@ +# Makefile for easier installation and cleanup. +# +# Uses self-documenting macros from here: +# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html + +PACKAGE=paper2remarkable +DOC_DIR='./docs/' +VENV_DIR='/tmp/p2r_venv/' + +.PHONY: help cover dist + +.DEFAULT_GOAL := help + +help: + @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ + awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\ + %s\n", $$1, $$2}' + +release: ## Make a release + python make_release.py + + +install: ## Install for the current user using the default python command + python setup.py build_ext --inplace + python setup.py install --user + + +test: venv ## Run unit tests + source $(VENV_DIR)/bin/activate && green -v ./tests/test_unit + + +clean: ## Clean build dist and egg directories left after install + rm -rf ./dist + rm -rf ./build + rm -rf ./$(PACKAGE).egg-info + rm -rf $(VENV_DIR) + rm -f MANIFEST + find . -type f -iname '*.pyc' -delete + find . -type d -name '__pycache__' -empty -delete + +dist: ## Make Python source distribution + python setup.py sdist + python setup.py bdist_wheel --universal + +docs: doc +doc: install ## Build documentation with Sphinx + m2r README.md && mv README.rst $(DOC_DIR) + m2r CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR) + cd $(DOC_DIR) && \ + rm source/* && \ + sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \ + touch source/AUTOGENERATED + $(MAKE) -C $(DOC_DIR) html + + + +venv: $(VENV_DIR)/bin/activate + +$(VENV_DIR)/bin/activate: + test -d $(VENV_DIR) || virtualenv $(VENV_DIR) + source $(VENV_DIR)/bin/activate && pip install -q -e .[dev] + touch $(VENV_DIR)/bin/activate diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/paper2remarkable/__main__.py b/paper2remarkable/__main__.py new file mode 100644 index 0000000..b97d538 --- /dev/null +++ b/paper2remarkable/__main__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +""" 
+Caller for the command line application +""" + +import sys + +def main(): + from .ui import main as realmain + + sys.exit(realmain()) + +if __name__ == '__main__': + main() diff --git a/paper2remarkable/__version__.py b/paper2remarkable/__version__.py new file mode 100644 index 0000000..5bee2af --- /dev/null +++ b/paper2remarkable/__version__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +VERSION = (0, 4, 0) + +__version__ = '.'.join(map(str, VERSION)) diff --git a/paper2remarkable/crop.py b/paper2remarkable/crop.py new file mode 100644 index 0000000..b25b178 --- /dev/null +++ b/paper2remarkable/crop.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +"""Code for cropping a PDF file + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import PyPDF2 +import os +import subprocess +import pdfplumber + +RM_WIDTH = 1404 +RM_HEIGHT = 1872 + + +class Cropper(object): + def __init__( + self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" + ): + if not input_file is None: + self.input_file = os.path.abspath(input_file) + self.reader = PyPDF2.PdfFileReader(self.input_file) + if not output_file is None: + self.output_file = os.path.abspath(output_file) + self.pdfcrop_path = pdfcrop_path + + self.writer = PyPDF2.PdfFileWriter() + + def crop(self, margins=1): + return self.process_file(self.crop_page, margins=margins) + + def center(self, padding=15): + return self.process_file(self.center_page, padding=padding) + + def process_file(self, page_func, *args, **kwargs): + for page_idx in range(self.reader.getNumPages()): + status = page_func(page_idx, *args, **kwargs) + if not status == 0: + return status + with open(self.output_file, "wb") as fp: + self.writer.write(fp) + return 0 + + def center_page(self, page_idx, padding): + return self.process_page( + page_idx, self.get_center_bbox, padding=padding + ) + + def crop_page(self, page_idx, margins): + return self.process_page(page_idx, self.get_bbox, margins=margins) + 
+ def export_page(self, page_idx): + """Helper function that exports a single page given by index """ + page = self.reader.getPage(page_idx) + writer = PyPDF2.PdfFileWriter() + writer.addPage(page) + tmpfname = "./page.pdf" + with open(tmpfname, "wb") as fp: + writer.write(fp) + return tmpfname + + def process_page(self, page_idx, bbox_func, *args, **kwargs): + """Process a single page and add it to the writer """ + tmpfname = self.export_page(page_idx) + tmpfout = "./output.pdf" + bbox = bbox_func(tmpfname, *args, **kwargs) + status = subprocess.call( + [ + self.pdfcrop_path, + "--bbox", + " ".join(map(str, bbox)), + tmpfname, + tmpfout, + ], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + return status + reader = PyPDF2.PdfFileReader(tmpfout) + page = reader.getPage(0) + self.writer.addPage(page) + os.unlink(tmpfname) + os.unlink(tmpfout) + return 0 + + def get_bbox(self, filename, margins=1, resolution=72): + """Get the bounding box, with optional margins + + if margins is integer, used for all margins, else + margins = [left, top, right, bottom] + + We get the bounding box by finding the smallest rectangle that is + completely surrounded by white pixels. 
+ """ + if isinstance(margins, int): + margins = [margins for _ in range(4)] + pdf = pdfplumber.open(filename) + im = pdf.pages[0].to_image(resolution=resolution) + pdf.close() + + pixels = list(im.original.getdata()) + W, H = im.original.size + + # M is a list of H lists with each W integers that equal the sum of the + # pixel values + M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)] + + left, top, bottom, right = 0, 0, 0, 0 + while top < H and sum(M[top]) == W * 255 * 3: + top += 1 + while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3: + bottom += 1 + + # Transpose M + M = list(zip(*M)) + while left < W and sum(M[left]) == H * 255 * 3: + left += 1 + while right < W and sum(M[W - 1 - right]) == H * 255 * 3: + right += 1 + + left -= margins[0] + top -= margins[1] + right -= margins[2] + bottom -= margins[3] + + # This is the bounding box in PIL format: (0, 0) top left + x0, y0, x1, y1 = left, top, W - right, H - bottom + + # Get the bbox in Ghostscript format: (0, 0) bottom left + a0, b0, a1, b1 = x0, H - y1, x1, H - y0 + return [a0, b0, a1, b1] + + def get_center_bbox(self, filename, padding=15): + """Compute a bounding box that will center the page file on the + reMarkable + """ + bbox = self.get_bbox(filename, margins=0) + + h = bbox[3] - bbox[1] + w = bbox[2] - bbox[0] + + # we want some minimal padding all around, because it is visually more + # pleasing. 
+ h_prime = h + 2 * padding + w_prime = w + 2 * padding + + # if the document is wider than the remarkable, we add top-padding to + # center it, otherwise we add left-padding + x, y = 0, 0 + if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: + y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 + else: + x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2 + + margins = [padding + x, padding + y, padding, padding] + return self.get_bbox(filename, margins=margins) diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py new file mode 100644 index 0000000..361c11e --- /dev/null +++ b/paper2remarkable/providers/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +from .arxiv import Arxiv +from .pubmed import Pubmed +from .acm import ACM +from .openreview import OpenReview +from .springer import Springer +from .local import LocalFile +from .pdf_url import PdfUrl + +providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py new file mode 100644 index 0000000..05fc0b7 --- /dev/null +++ b/paper2remarkable/providers/_base.py @@ -0,0 +1,380 @@ +# -*- coding: utf-8 -*- + +"""Base for the Provider class + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. 
van den Burg + +""" + +import PyPDF2 +import abc +import bs4 +import datetime +import os +import re +import requests +import shutil +import string +import subprocess +import tempfile +import time +import titlecase +import unidecode + +from ..crop import Cropper +from ..utils import exception + +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " + "Safari/537.36" +} + + +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ + + meta_author_key = "citation_author" + meta_title_key = "citation_title" + meta_date_key = "citation_date" + + def __init__( + self, + verbose=False, + upload=True, + debug=False, + center=False, + blank=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.verbose = verbose + self.upload = upload + self.debug = debug + self.center = center + self.blank = blank + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + + self.log("Starting %s" % type(self).__name__) + + def log(self, msg, mode="info"): + if not self.verbose: + return + if not mode in ["info", "warning"]: + raise ValueError("unknown logging mode.") + now = datetime.datetime.now() + print( + now.strftime("%Y-%m-%d %H:%M:%S") + + " - " + + mode.upper() + + " - " + + msg + ) + + def warn(self, msg): + self.log(msg, mode="warning") + + @staticmethod + @abc.abstractmethod + def validate(src): + """ Validate whether ``src`` is appropriate for this provider """ + + def retrieve_pdf(self, src, filename): + """ Download pdf from src and save to filename """ + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def _format_authors(self, soup_authors, sep=",", idx=0, op=None): + op = (lambda x: x) if op is None else op + # format the author list retrieved by bs4 + 
return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] + + def get_authors(self, soup): + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": self.meta_author_key}) + ] + return self._format_authors(authors) + + def get_title(self, soup): + target = soup.find_all("meta", {"name": self.meta_title_key}) + return target[0]["content"] + + def _format_date(self, soup_date): + return soup_date + + def get_date(self, soup): + date = soup.find_all("meta", {"name": self.meta_date_key})[0][ + "content" + ] + return self._format_date(date) + + def get_paper_info( + self, + src, + author_key="citation_author", + title_key="citation_title", + date_key="citation_date", + ): + """ Retrieve the title/author (surnames)/year information """ + abs_url, _ = self.get_abs_pdf_urls(src) + self.log("Getting paper info") + page = self.get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = self.get_authors(soup) + title = self.get_title(soup) + date = self.get_date(soup) + return dict(title=title, date=date, authors=authors) + + def string_clean(self, s): + """ Clean a string to replace accented characters with equivalents and + keep only the allowed characters """ + normalized = unidecode.unidecode(s) + allowed = string.ascii_letters + string.digits + "_ ." + cleaned = "".join(c if c in allowed else "_" for c in normalized) + return cleaned + + def create_filename(self, info, filename=None): + """ Generate filename using the info dict or filename if provided """ + if not filename is None: + return filename + # we assume that the list of authors is surname only. 
+ self.log("Generating output filename") + + if len(info["authors"]) > 3: + author_part = info["authors"][0] + "_et_al" + else: + author_part = "_".join(info["authors"]) + author_part = self.string_clean(author_part) + + title_part = self.string_clean(info["title"]) + title_part = titlecase.titlecase(title_part).replace(" ", "_") + + year_part = info["date"].split("/")[0] + + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + name = unidecode.unidecode(name) + self.log("Created filename: %s" % name) + return name + + def blank_pdf(self, filepath): + if not self.blank: + return filepath + + self.log("Adding blank pages") + input_pdf = PyPDF2.PdfFileReader(filepath) + output_pdf = PyPDF2.PdfFileWriter() + for page in input_pdf.pages: + output_pdf.addPage(page) + output_pdf.addBlankPage() + + output_file = os.path.splitext(filepath)[0] + "-blank.pdf" + with open(output_file, "wb") as fp: + output_pdf.write(fp) + return output_file + + def crop_pdf(self, filepath): + self.log("Cropping pdf file") + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + cropper = Cropper( + filepath, cropped_file, pdfcrop_path=self.pdfcrop_path + ) + status = cropper.crop(margins=15) + + if not status == 0: + self.warn("Failed to crop the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(cropped_file): + self.warn( + "Can't find cropped file '%s' where expected." % cropped_file + ) + return filepath + return cropped_file + + def center_pdf(self, filepath): + if not self.center: + return filepath + + self.log("Centering pdf file") + centered_file = os.path.splitext(filepath)[0] + "-center.pdf" + cropper = Cropper( + filepath, centered_file, pdfcrop_path=self.pdfcrop_path + ) + status = cropper.center() + if not status == 0: + self.warn("Failed to center the pdf file at: %s" % filepath) + return filepath + if not os.path.exists(centered_file): + self.warn( + "Can't find centered file '%s' where expected." 
% centered_file + ) + return filepath + return centered_file + + def shrink_pdf(self, filepath): + self.log("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + self.warn("Failed to shrink the pdf file") + return filepath + return output_file + + def check_file_is_pdf(self, filename): + try: + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf + return True + except PyPDF2.utils.PdfReadError: + exception("Downloaded file isn't a valid pdf file.") + + def download_url(self, url, filename): + """Download the content of an url and save it to a filename """ + self.log("Downloading file at url: %s" % url) + content = self.get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + def get_page_with_retry(self, url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + self.warn("Error getting url %s. 
Retrying in 5 seconds" % url) + time.sleep(5) + continue + self.log("Downloading url: %s" % url) + return res.content + + def upload_to_rm(self, filepath): + remarkable_dir = self.remarkable_dir.rstrip("/") + self.log("Starting upload to reMarkable") + if remarkable_dir: + status = subprocess.call( + [self.rmapi_path, "mkdir", remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) + status = subprocess.call( + [self.rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + self.log("Upload successful.") + + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" + + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] + ) + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) + + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file + + def run(self, src, filename=None): + info = self.get_paper_info(src) + clean_filename = self.create_filename(info, filename) + tmp_filename = "paper.pdf" + + 
self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [ + self.dearxiv, + self.crop_pdf, + self.center_pdf, + self.blank_pdf, + self.shrink_pdf, + ] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return self.upload_to_rm(clean_filename) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py new file mode 100644 index 0000000..be98e16 --- /dev/null +++ b/paper2remarkable/providers/acm.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +"""Provider for ACM + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import bs4 +import re + +from . 
import Provider +from ..utils import exception + +# TODO: put this somewhere central, now multiply defined +GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" + +class ACM(Provider): + + meta_author_key = "citation_authors" + + re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_acm_pdf_url(self, url): + page = self.get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match(self.re_abs, url): + abs_url = url + pdf_url = self.get_acm_pdf_url(url) + if pdf_url is None: + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) + else: + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return abs_url, pdf_url + + def validate(src): + m = re.fullmatch(ACM.re_abs, src) + return not m is None + + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(";") + return super()._format_authors(soup_authors, sep=",", idx=0, op=op) + + def _format_date(self, soup_date): + if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so it can be fixed: %s" % GITHUB_URL + ) + return soup_date.strip().split("/")[-1] diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py new file mode 100644 index 0000000..fc5c004 --- /dev/null +++ b/paper2remarkable/providers/arxiv.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +"""Provider for arxiv.org + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. 
van den Burg + +""" + +import re + +from ._base import Provider +from ..utils import exception + + +class Arxiv(Provider): + + re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" + re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match(self.re_abs, url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match(self.re_pdf, url): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url + + def validate(src): + """Check if the url is to an arXiv page. """ + return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py new file mode 100644 index 0000000..68ce030 --- /dev/null +++ b/paper2remarkable/providers/local.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- + +"""Provider for local files + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import os +import shutil + +from . 
class LocalFile(Provider):
    """Provider that takes a PDF from the local file system."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def validate(src):
        """Return True when ``src`` is an existing regular file.

        Directories are rejected here: they would pass a plain existence
        check but fail later when ``retrieve_pdf`` tries to copy them.
        """
        return os.path.isfile(src)

    def retrieve_pdf(self, src, filename):
        """Copy ``src`` to ``filename``.

        ``src`` is resolved against ``self.initial_dir``, which is set outside
        this class (presumably the directory the program was started from).
        """
        source = os.path.join(self.initial_dir, src)
        shutil.copy(source, filename)

    def get_paper_info(self, src):
        """Return the minimal info dict for a local file: its path."""
        return {"filename": src}

    def create_filename(self, info, filename=None):
        """Return ``filename`` if given, else the base name of the source."""
        if filename is not None:
            return filename
        return os.path.basename(info["filename"])
class PdfUrl(Provider):
    """Provider for a generic url that points directly at a PDF file."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def validate(src):
        """Return True when ``src`` parses as a url with scheme, host and path."""
        # A bare ``import urllib`` does not load the ``parse`` submodule, so
        # import it explicitly here instead of relying on another module
        # having done so.
        import urllib.parse

        try:
            result = urllib.parse.urlparse(src)
        except (ValueError, TypeError, AttributeError):
            # Malformed url or a non-string source: not something we handle
            return False
        return all([result.scheme, result.netloc, result.path])

    def retrieve_pdf(self, url, filename):
        """Download ``url`` to ``filename`` via ``self.download_url``."""
        self.download_url(url, filename)

    def get_paper_info(self, src):
        """No metadata can be extracted from a bare PDF url."""
        return None

    def create_filename(self, info, filename=None):
        """Return the user-supplied ``filename``.

        There is no metadata to derive a name from, so a missing filename is
        reported through ``exception`` (which terminates the program).
        """
        if filename is None:
            exception(
                "Filename must be provided with PDFUrlProvider (use --filename)"
            )
        return filename
+ re_pdf = ( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match(self.re_pdf, url): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match(self.re_abs, url): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually + else: + exception("Couldn't figure out PMC urls.") + return abs_url, pdf_url + + def validate(src): + return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src) + + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(",") + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) + + def _format_date(self, soup_date): + if re.match("\w+\ \d{4}", soup_date): + return soup_date.split(" ")[-1] + return soup_date.replace(" ", "_") diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py new file mode 100644 index 0000000..ce16007 --- /dev/null +++ b/paper2remarkable/providers/springer.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +"""Provider for Springer + +Author: G.J.J. van den Burg +License: See LICENSE file +Copyright: 2019, G.J.J. van den Burg + +""" + +import re +import urllib + +from . 
class Springer(Provider):
    """Provider for link.springer.com article and PDF urls."""

    meta_date_key = "citation_online_date"

    # Raw strings avoid invalid escape sequences such as ``\/`` and ``\d`` in
    # ordinary string literals; dots in the host name are matched literally
    # in both patterns.
    re_abs = r"https?://link\.springer\.com/article/10\.\d{4}/[a-z0-9\-]+"
    re_pdf = r"https?://link\.springer\.com/content/pdf/10\.\d{4}(%2F|/)[a-z0-9\-]+\.pdf"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_abs_pdf_urls(self, url):
        """Get the pdf and abstract urls from a Springer url.

        Returns a tuple ``(abs_url, pdf_url)``. Calls ``exception`` (which
        terminates the program) when ``url`` matches neither pattern.
        """
        # ``import urllib`` alone does not guarantee that ``urllib.parse`` is
        # loaded, so import the function that is needed explicitly.
        from urllib.parse import unquote

        if re.match(self.re_abs, url):
            abs_url = url
            pdf_url = url.replace("article", "content/pdf")
        elif re.match(self.re_pdf, url):
            abs_url = url.replace("content/pdf", "article")[: -len(".pdf")]
            # The PDF url may carry a percent-encoded "/" (%2F) in the DOI
            pdf_url = unquote(url)
        else:
            exception("Couldn't figure out Springer urls.")
        return abs_url, pdf_url

    @staticmethod
    def validate(src):
        """Return True when ``src`` is a Springer article or PDF url."""
        return bool(
            re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src)
        )

    def _format_authors(self, soup_authors):
        """Delegate to the base class with a space separator and ``idx=-1``."""
        return super()._format_authors(soup_authors, sep=" ", idx=-1)
def parse_args(argv=None):
    """Parse the command line arguments of the program.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse (without the program name). When ``None``
        (the default) ``sys.argv[1:]`` is used, which is the behaviour of
        calling this function without arguments.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``blank``, ``verbose``, ``no_upload``,
        ``debug``, ``center``, ``filename``, ``remarkable_dir``, ``rmapi``,
        ``pdfcrop``, ``pdftk``, ``gs`` and ``input``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-b",
        "--blank",
        help="Add a blank page after every page of the PDF",
        action="store_true",
    )
    parser.add_argument(
        "-v", "--verbose", help="be verbose", action="store_true"
    )
    parser.add_argument(
        "-n",
        "--no-upload",
        help="don't upload to the reMarkable, save the output in current working dir",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="debug mode, doesn't upload to reMarkable",
        action="store_true",
    )
    parser.add_argument(
        "-c",
        "--center",
        help="Center the PDF on the page, instead of left align",
        action="store_true",
    )
    parser.add_argument(
        "--filename",
        help="Filename to use for the file on reMarkable",
        default=None,
    )
    parser.add_argument(
        "-p",
        "--remarkable-path",
        help="directory on reMarkable to put the file (created if missing)",
        dest="remarkable_dir",
        default="/",
    )
    parser.add_argument(
        "--rmapi", help="path to rmapi executable", default="rmapi"
    )
    parser.add_argument(
        "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
    )
    parser.add_argument(
        "--pdftk", help="path to pdftk executable", default="pdftk"
    )
    parser.add_argument("--gs", help="path to gs executable", default="gs")
    parser.add_argument(
        "input", help="URL to a paper or the path of a local PDF file"
    )
    # ``argv=None`` makes argparse fall back to ``sys.argv[1:]``
    return parser.parse_args(argv)
GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable"


def exception(msg):
    """Report a fatal error on stderr and terminate the program.

    Parameters
    ----------
    msg : str
        Description of the error, printed after an ``ERROR: `` prefix.

    Raises
    ------
    SystemExit
        Always, with exit code 1.
    """
    print("ERROR: " + msg, file=sys.stderr)
    print("Error occurred. Exiting.", file=sys.stderr)
    print("", file=sys.stderr)
    # Keep the whole error report on one stream so that redirecting stdout
    # (e.g. into a pipe) neither loses nor pollutes it.
    print(
        "If you think this might be a bug, please raise an issue on GitHub: %s"
        % GITHUB_URL,
        file=sys.stderr,
    )
    raise SystemExit(1)
+# If you do change the License, remember to change the Trove Classifier for that! + +here = os.path.abspath(os.path.dirname(__file__)) + +# Import the README and use it as the long-description. +# Note: this will only work if 'README.md' is present in your MANIFEST.in file! +try: + with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: + long_description = "\n" + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + +# Load the package's __version__.py module as a dictionary. +about = {} +if not VERSION: + project_slug = NAME.lower().replace("-", "_").replace(" ", "_") + with open(os.path.join(here, project_slug, "__version__.py")) as f: + exec(f.read(), about) +else: + about["__version__"] = VERSION + +# Where the magic happens: +setup( + name=NAME, + version=about["__version__"], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages( + exclude=["tests", "*.tests", "*.tests.*", "tests.*"] + ), + install_requires=REQUIRED, + extras_require=EXTRAS, + include_package_data=True, + license=LICENSE, + ext_modules=[], + entry_points={"console_scripts": ["p2r = paper2remarkable.__main__:main"]}, + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + LICENSE_TROVE, + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Topic :: Education", + "Topic :: Scientific/Engineering", + "Topic :: Utilities", + ], +) -- cgit v1.2.3 From 1bb2edea5723c8987de60f8783ba645df8e0cfd5 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:22:00 
+0100 Subject: Define operations in the init function This gives cleaner code and allows operations to be defined and registered by specific providers, such as the dearxiv functionality. --- paper2remarkable/providers/_base.py | 72 +++++-------------------------------- paper2remarkable/providers/arxiv.py | 51 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 63 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 05fc0b7..77413a9 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -13,7 +13,6 @@ import abc import bs4 import datetime import os -import re import requests import shutil import string @@ -56,14 +55,20 @@ class Provider(metaclass=abc.ABCMeta): self.verbose = verbose self.upload = upload self.debug = debug - self.center = center - self.blank = blank self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdfcrop_path = pdfcrop_path self.pdftk_path = pdftk_path self.gs_path = gs_path + # Define the operations to run on the pdf. 
Providers can add others + self.operations = [("crop", self.crop_pdf)] + if center: + self.operations.append(("center", self.center_pdf)) + if blank: + self.operations.append(("blank", self.blank_pdf)) + self.operations.append(("shrink", self.shrink_pdf)) + self.log("Starting %s" % type(self).__name__) def log(self, msg, mode="info"): @@ -167,9 +172,6 @@ class Provider(metaclass=abc.ABCMeta): return name def blank_pdf(self, filepath): - if not self.blank: - return filepath - self.log("Adding blank pages") input_pdf = PyPDF2.PdfFileReader(filepath) output_pdf = PyPDF2.PdfFileWriter() @@ -201,9 +203,6 @@ class Provider(metaclass=abc.ABCMeta): return cropped_file def center_pdf(self, filepath): - if not self.center: - return filepath - self.log("Centering pdf file") centered_file = os.path.splitext(filepath)[0] + "-center.pdf" cropper = Cropper( @@ -295,52 +294,6 @@ class Provider(metaclass=abc.ABCMeta): exception("Uploading file %s to reMarkable failed" % filepath) self.log("Upload successful.") - def dearxiv(self, input_file): - """Remove the arXiv timestamp from a pdf""" - self.log("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - - status = subprocess.call( - [ - self.pdftk_path, - input_file, - "output", - uncompress_file, - "uncompress", - ] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) - - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) - - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.pdftk_path, removed_file, "output", 
output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - - return output_file - def run(self, src, filename=None): info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) @@ -352,15 +305,8 @@ class Provider(metaclass=abc.ABCMeta): self.retrieve_pdf(src, tmp_filename) self.check_file_is_pdf(tmp_filename) - ops = [ - self.dearxiv, - self.crop_pdf, - self.center_pdf, - self.blank_pdf, - self.shrink_pdf, - ] intermediate_fname = tmp_filename - for op in ops: + for op in self.operations: intermediate_fname = op(intermediate_fname) shutil.move(intermediate_fname, clean_filename) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index fc5c004..b1982f4 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -8,7 +8,9 @@ Copyright: 2019, G.J.J. van den Burg """ +import os import re +import subprocess from ._base import Provider from ..utils import exception @@ -22,6 +24,9 @@ class Arxiv(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # register the dearxiv operation + self.operations.insert(0, ("dearxiv", self.dearxiv)) + def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ if re.match(self.re_abs, url): @@ -37,3 +42,49 @@ class Arxiv(Provider): def validate(src): """Check if the url is to an arXiv page. 
""" return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) + + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" + + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] + ) + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) + + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file -- cgit v1.2.3 From 1ac27a769c1fabd3f2339f7f929c4d39cf20564e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:49:13 +0100 Subject: Move pdf operations to a separate module --- paper2remarkable/pdf_ops.py | 97 +++++++++++++++++++++++++++++++++++++ paper2remarkable/providers/_base.py | 82 +++++-------------------------- 2 files changed, 109 insertions(+), 70 deletions(-) create mode 100644 paper2remarkable/pdf_ops.py diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py new file mode 100644 index 0000000..d1eae40 --- /dev/null +++ b/paper2remarkable/pdf_ops.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +"""Operations on PDF files + +Author: G.J.J. van den Burg +License: See LICENSE file. 
def _run_cropper(filepath, suffix, verb, participle, pdfcrop_path, action):
    """Run one Cropper action on ``filepath`` and return the resulting path.

    ``action`` receives the ``Cropper`` instance and returns its status code.
    ``verb`` / ``participle`` (e.g. "crop" / "cropped") are only used in the
    warning messages. On any failure the original ``filepath`` is returned so
    the processing pipeline can continue with the unmodified file.
    """
    output_file = os.path.splitext(filepath)[0] + suffix
    cropper = Cropper(filepath, output_file, pdfcrop_path=pdfcrop_path)
    status = action(cropper)

    if status != 0:
        logging.warning("Failed to %s the pdf file at: %s", verb, filepath)
        return filepath
    if not os.path.exists(output_file):
        logging.warning(
            "Can't find %s file '%s' where expected.", participle, output_file
        )
        return filepath
    return output_file


def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
    """Crop the pdf file using Cropper.

    Writes ``<name>-crop.pdf`` next to ``filepath`` and returns its path, or
    returns ``filepath`` unchanged when cropping fails.
    """
    logging.info("Cropping pdf file")
    return _run_cropper(
        filepath,
        "-crop.pdf",
        "crop",
        "cropped",
        pdfcrop_path,
        lambda cropper: cropper.crop(margins=15),
    )


def center_pdf(filepath, pdfcrop_path="pdfcrop"):
    """Center the pdf file on the reMarkable.

    Writes ``<name>-center.pdf`` next to ``filepath`` and returns its path, or
    returns ``filepath`` unchanged when centering fails.
    """
    logging.info("Centering pdf file")
    return _run_cropper(
        filepath,
        "-center.pdf",
        "center",
        "centered",
        pdfcrop_path,
        lambda cropper: cropper.center(),
    )


def blank_pdf(filepath):
    """Add a blank page after every page of the PDF.

    Writes ``<name>-blank.pdf`` next to ``filepath`` and returns its path.
    """
    logging.info("Adding blank pages")
    input_pdf = PyPDF2.PdfFileReader(filepath)
    output_pdf = PyPDF2.PdfFileWriter()
    for page in input_pdf.pages:
        output_pdf.addPage(page)
        output_pdf.addBlankPage()

    output_file = os.path.splitext(filepath)[0] + "-blank.pdf"
    with open(output_file, "wb") as fp:
        output_pdf.write(fp)
    return output_file


def shrink_pdf(filepath, gs_path="gs"):
    """Shrink the PDF file size using Ghostscript.

    Writes ``<name>-shrink.pdf`` next to ``filepath`` and returns its path, or
    returns ``filepath`` unchanged when Ghostscript exits with an error.
    """
    logging.info("Shrinking pdf file")
    output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
    status = subprocess.call(
        [
            gs_path,
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPDFSETTINGS=/printer",
            "-dNOPAUSE",
            "-dBATCH",
            "-dQUIET",
            "-sOutputFile=%s" % output_file,
            filepath,
        ]
    )
    if status != 0:
        logging.warning("Failed to shrink the pdf file")
        return filepath
    return output_file
crop_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + + def center_pdf(self, filepath): + return center_pdf(filepath, pdfcrop_path=self.pdfcrop_path) + + def shrink_pdf(self, filepath): + return shrink_pdf(filepath, gs_path=self.gs_path) + def retrieve_pdf(self, src, filename): """ Download pdf from src and save to filename """ _, pdf_url = self.get_abs_pdf_urls(src) @@ -171,74 +181,6 @@ class Provider(metaclass=abc.ABCMeta): self.log("Created filename: %s" % name) return name - def blank_pdf(self, filepath): - self.log("Adding blank pages") - input_pdf = PyPDF2.PdfFileReader(filepath) - output_pdf = PyPDF2.PdfFileWriter() - for page in input_pdf.pages: - output_pdf.addPage(page) - output_pdf.addBlankPage() - - output_file = os.path.splitext(filepath)[0] + "-blank.pdf" - with open(output_file, "wb") as fp: - output_pdf.write(fp) - return output_file - - def crop_pdf(self, filepath): - self.log("Cropping pdf file") - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - cropper = Cropper( - filepath, cropped_file, pdfcrop_path=self.pdfcrop_path - ) - status = cropper.crop(margins=15) - - if not status == 0: - self.warn("Failed to crop the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(cropped_file): - self.warn( - "Can't find cropped file '%s' where expected." % cropped_file - ) - return filepath - return cropped_file - - def center_pdf(self, filepath): - self.log("Centering pdf file") - centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - cropper = Cropper( - filepath, centered_file, pdfcrop_path=self.pdfcrop_path - ) - status = cropper.center() - if not status == 0: - self.warn("Failed to center the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(centered_file): - self.warn( - "Can't find centered file '%s' where expected." 
% centered_file - ) - return filepath - return centered_file - - def shrink_pdf(self, filepath): - self.log("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - self.gs_path, - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - self.warn("Failed to shrink the pdf file") - return filepath - return output_file def check_file_is_pdf(self, filename): try: -- cgit v1.2.3 From febe13fc7006db65f3a90bbb8e30d646fd0b72af Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:49:34 +0100 Subject: Move pdf file check to utils --- paper2remarkable/providers/_base.py | 11 ----------- paper2remarkable/utils.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index d427f9e..8e9223e 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -181,17 +181,6 @@ class Provider(metaclass=abc.ABCMeta): self.log("Created filename: %s" % name) return name - - def check_file_is_pdf(self, filename): - try: - fp = open(filename, "rb") - pdf = PyPDF2.PdfFileReader(fp, strict=False) - fp.close() - del pdf - return True - except PyPDF2.utils.PdfReadError: - exception("Downloaded file isn't a valid pdf file.") - def download_url(self, url, filename): """Download the content of an url and save it to a filename """ self.log("Downloading file at url: %s" % url) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index af19d22..5188afb 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -9,6 +9,7 @@ Copyright: 2019, G.J.J. 
def check_file_is_pdf(filename):
    """Check that ``filename`` can be opened as a PDF by PyPDF2.

    Returns True when the file parses. When PyPDF2 raises ``PdfReadError``
    the error is reported through ``exception`` (which terminates the
    program).
    """
    try:
        # The context manager closes the file even when parsing fails; the
        # reader object itself is not needed afterwards.
        with open(filename, "rb") as fp:
            PyPDF2.PdfFileReader(fp, strict=False)
    except PyPDF2.utils.PdfReadError:
        exception("Downloaded file isn't a valid pdf file.")
    return True
Providers can add others self.operations = [("crop", self.crop_pdf)] if center: self.operations.append(("center", self.center_pdf)) + if blank: self.operations.append(("blank", blank_pdf)) self.operations.append(("shrink", self.shrink_pdf)) - self.log("Starting %s" % type(self).__name__) - - def log(self, msg, mode="info"): - if not self.verbose: - return - if not mode in ["info", "warning"]: - raise ValueError("unknown logging mode.") - now = datetime.datetime.now() - print( - now.strftime("%Y-%m-%d %H:%M:%S") - + " - " - + mode.upper() - + " - " - + msg - ) - - def warn(self, msg): - self.log(msg, mode="warning") + logging.info("Starting %s" % type(self).__name__) @staticmethod @abc.abstractmethod @@ -142,7 +127,7 @@ class Provider(metaclass=abc.ABCMeta): ): """ Retrieve the title/author (surnames)/year information """ abs_url, _ = self.get_abs_pdf_urls(src) - self.log("Getting paper info") + logging.info("Getting paper info") page = self.get_page_with_retry(abs_url) soup = bs4.BeautifulSoup(page, "html.parser") authors = self.get_authors(soup) @@ -163,7 +148,7 @@ class Provider(metaclass=abc.ABCMeta): if not filename is None: return filename # we assume that the list of authors is surname only. 
- self.log("Generating output filename") + logging.info("Generating output filename") if len(info["authors"]) > 3: author_part = info["authors"][0] + "_et_al" @@ -178,12 +163,12 @@ class Provider(metaclass=abc.ABCMeta): name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" name = unidecode.unidecode(name) - self.log("Created filename: %s" % name) + logging.info("Created filename: %s" % name) return name def download_url(self, url, filename): """Download the content of an url and save it to a filename """ - self.log("Downloading file at url: %s" % url) + logging.info("Downloading file at url: %s" % url) content = self.get_page_with_retry(url) with open(filename, "wb") as fid: fid.write(content) @@ -198,15 +183,17 @@ class Provider(metaclass=abc.ABCMeta): except requests.exceptions.ConnectionError: error = True if error or not res.ok: - self.warn("Error getting url %s. Retrying in 5 seconds" % url) + logging.warning( + "Error getting url %s. Retrying in 5 seconds" % url + ) time.sleep(5) continue - self.log("Downloading url: %s" % url) + logging.info("Downloading url: %s" % url) return res.content def upload_to_rm(self, filepath): remarkable_dir = self.remarkable_dir.rstrip("/") - self.log("Starting upload to reMarkable") + logging.info("Starting upload to reMarkable") if remarkable_dir: status = subprocess.call( [self.rmapi_path, "mkdir", remarkable_dir + "/"], @@ -223,7 +210,7 @@ class Provider(metaclass=abc.ABCMeta): ) if not status == 0: exception("Uploading file %s to reMarkable failed" % filepath) - self.log("Upload successful.") + logging.info("Upload successful.") def run(self, src, filename=None): info = self.get_paper_info(src) -- cgit v1.2.3 From 2b8289495ff5910d75013b903d82085bcd7742a1 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:56:53 +0100 Subject: Move upload functionality to utils --- paper2remarkable/providers/_base.py | 30 ++++++------------------------ paper2remarkable/utils.py | 27 
+++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 3692924..85415a9 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -15,14 +15,13 @@ import os import requests import shutil import string -import subprocess import tempfile import time import titlecase import unidecode from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf -from ..utils import exception +from ..utils import upload_to_remarkable, check_file_is_pdf HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -191,27 +190,6 @@ class Provider(metaclass=abc.ABCMeta): logging.info("Downloading url: %s" % url) return res.content - def upload_to_rm(self, filepath): - remarkable_dir = self.remarkable_dir.rstrip("/") - logging.info("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [self.rmapi_path, "mkdir", remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception( - "Creating directory %s on reMarkable failed" - % remarkable_dir - ) - status = subprocess.call( - [self.rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - logging.info("Upload successful.") - def run(self, src, filename=None): info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) @@ -234,7 +212,11 @@ class Provider(metaclass=abc.ABCMeta): return input() if self.upload: - return self.upload_to_rm(clean_filename) + return upload_to_remarkable( + clean_filename, + remarkable_dir=self.remarkable_dir, + rmapi_path=self.rmapi_path, + ) target_path = os.path.join(self.initial_dir, clean_filename) while os.path.exists(target_path): diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 5188afb..26b024e 100644 --- 
a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -10,6 +10,8 @@ Copyright: 2019, G.J.J. van den Burg import PyPDF2 +import logging +import subprocess import sys GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" @@ -35,3 +37,28 @@ def check_file_is_pdf(filename): return True except PyPDF2.utils.PdfReadError: exception("Downloaded file isn't a valid pdf file.") + + +def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): + logging.info("Starting upload to reMarkable") + + # Create the reMarkable dir if it doesn't exist + remarkable_dir = remarkable_dir.rstrip("/") + if remarkable_dir: + status = subprocess.call( + [rmapi_path, "mkdir", remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" % remarkable_dir + ) + + # Upload the file + status = subprocess.call( + [rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + logging.info("Upload successful.") -- cgit v1.2.3 From 61807b2ce2d1d4c70016a114c77a8fe5da9fbcdb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:57:19 +0100 Subject: Minor fixes to check_file_is_pdf --- paper2remarkable/providers/_base.py | 2 +- paper2remarkable/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 85415a9..f703874 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -199,7 +199,7 @@ class Provider(metaclass=abc.ABCMeta): with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: os.chdir(working_dir) self.retrieve_pdf(src, tmp_filename) - self.check_file_is_pdf(tmp_filename) + check_file_is_pdf(tmp_filename) intermediate_fname = tmp_filename for op in self.operations: diff --git a/paper2remarkable/utils.py 
b/paper2remarkable/utils.py index 26b024e..110453b 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -29,6 +29,10 @@ def exception(msg): def check_file_is_pdf(filename): + """Check that a given file is a PDF file. + + This is done by trying to open it using PyPDF2. + """ try: fp = open(filename, "rb") pdf = PyPDF2.PdfFileReader(fp, strict=False) @@ -36,7 +40,7 @@ def check_file_is_pdf(filename): del pdf return True except PyPDF2.utils.PdfReadError: - exception("Downloaded file isn't a valid pdf file.") + exception("File %s isn't a valid pdf file." % filename) def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): -- cgit v1.2.3 From 7551591bf876f005c47a5fe98618e0ec6e2412d2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 15:01:32 +0100 Subject: Move download functionality to utils --- paper2remarkable/providers/_base.py | 44 +++++++------------------------------ paper2remarkable/utils.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index f703874..4354776 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -12,22 +12,19 @@ import abc import bs4 import logging import os -import requests import shutil import string import tempfile -import time import titlecase import unidecode from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf -from ..utils import upload_to_remarkable, check_file_is_pdf - -HEADERS = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " - "Safari/537.36" -} +from ..utils import ( + upload_to_remarkable, + check_file_is_pdf, + download_url, + get_page_with_retry, +) class Provider(metaclass=abc.ABCMeta): @@ -90,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta): def retrieve_pdf(self, src, filename): """ Download pdf from src 
and save to filename """ _, pdf_url = self.get_abs_pdf_urls(src) - self.download_url(pdf_url, filename) + download_url(pdf_url, filename) def _format_authors(self, soup_authors, sep=",", idx=0, op=None): op = (lambda x: x) if op is None else op @@ -127,7 +124,7 @@ class Provider(metaclass=abc.ABCMeta): """ Retrieve the title/author (surnames)/year information """ abs_url, _ = self.get_abs_pdf_urls(src) logging.info("Getting paper info") - page = self.get_page_with_retry(abs_url) + page = get_page_with_retry(abs_url) soup = bs4.BeautifulSoup(page, "html.parser") authors = self.get_authors(soup) title = self.get_title(soup) @@ -165,31 +162,6 @@ class Provider(metaclass=abc.ABCMeta): logging.info("Created filename: %s" % name) return name - def download_url(self, url, filename): - """Download the content of an url and save it to a filename """ - logging.info("Downloading file at url: %s" % url) - content = self.get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) - - def get_page_with_retry(self, url, tries=5): - count = 0 - while count < tries: - count += 1 - error = False - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - error = True - if error or not res.ok: - logging.warning( - "Error getting url %s. 
Retrying in 5 seconds" % url - ) - time.sleep(5) - continue - logging.info("Downloading url: %s" % url) - return res.content - def run(self, src, filename=None): info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 110453b..e2a714b 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -13,9 +13,17 @@ import PyPDF2 import logging import subprocess import sys +import requests +import time GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" +HEADERS = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " + "Safari/537.36" +} + def exception(msg): print("ERROR: " + msg, file=sys.stderr) @@ -43,6 +51,34 @@ def check_file_is_pdf(filename): exception("File %s isn't a valid pdf file." % filename) +def download_url(url, filename): + """Download the content of an url and save it to a filename """ + logging.info("Downloading file at url: %s" % url) + content = get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + +def get_page_with_retry(url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + logging.warning( + "(%i/%i) Error getting url %s. Retrying in 5 seconds." 
% + (count, tries, url) + ) + time.sleep(5) + continue + logging.info("Downloading url: %s" % url) + return res.content + + def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): logging.info("Starting upload to reMarkable") -- cgit v1.2.3 From eadbd95da32057e01c1b4d5f2cb554e4c0c0b292 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 15:10:47 +0100 Subject: Move string cleaning to utils --- paper2remarkable/providers/_base.py | 15 ++++----------- paper2remarkable/utils.py | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 4354776..db13434 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -13,17 +13,17 @@ import bs4 import logging import os import shutil -import string import tempfile import titlecase import unidecode from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf from ..utils import ( - upload_to_remarkable, check_file_is_pdf, + clean_string, download_url, get_page_with_retry, + upload_to_remarkable, ) @@ -131,13 +131,6 @@ class Provider(metaclass=abc.ABCMeta): date = self.get_date(soup) return dict(title=title, date=date, authors=authors) - def string_clean(self, s): - """ Clean a string to replace accented characters with equivalents and - keep only the allowed characters """ - normalized = unidecode.unidecode(s) - allowed = string.ascii_letters + string.digits + "_ ." 
- cleaned = "".join(c if c in allowed else "_" for c in normalized) - return cleaned def create_filename(self, info, filename=None): """ Generate filename using the info dict or filename if provided """ @@ -150,9 +143,9 @@ class Provider(metaclass=abc.ABCMeta): author_part = info["authors"][0] + "_et_al" else: author_part = "_".join(info["authors"]) - author_part = self.string_clean(author_part) + author_part = clean_string(author_part) - title_part = self.string_clean(info["title"]) + title_part = clean_string(info["title"]) title_part = titlecase.titlecase(title_part).replace(" ", "_") year_part = info["date"].split("/")[0] diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index e2a714b..15cac95 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -8,13 +8,14 @@ Copyright: 2019, G.J.J. van den Burg """ - import PyPDF2 import logging +import requests +import string import subprocess import sys -import requests import time +import unidecode GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" @@ -36,6 +37,16 @@ def exception(msg): raise SystemExit(1) +def clean_string(s): + """ Clean a string by replacing accented characters with equivalents and + keeping only the allowed characters (ascii letters, digits, underscore, + space, and period)""" + normalized = unidecode.unidecode(s) + allowed = string.ascii_letters + string.digits + "_ ." + cleaned = "".join(c if c in allowed else "_" for c in normalized) + return cleaned + + def check_file_is_pdf(filename): """Check that a given file is a PDF file. @@ -70,8 +81,8 @@ def get_page_with_retry(url, tries=5): error = True if error or not res.ok: logging.warning( - "(%i/%i) Error getting url %s. Retrying in 5 seconds." % - (count, tries, url) + "(%i/%i) Error getting url %s. Retrying in 5 seconds." 
+ % (count, tries, url) ) time.sleep(5) continue -- cgit v1.2.3 From 5a8b1f64445f55201999e3355589b83c01f05ba4 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 15:11:16 +0100 Subject: Simplify filename code --- paper2remarkable/providers/_base.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index db13434..8b454b0 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -131,11 +131,8 @@ class Provider(metaclass=abc.ABCMeta): date = self.get_date(soup) return dict(title=title, date=date, authors=authors) - - def create_filename(self, info, filename=None): + def create_filename(self, info): """ Generate filename using the info dict or filename if provided """ - if not filename is None: - return filename # we assume that the list of authors is surname only. logging.info("Generating output filename") @@ -157,7 +154,7 @@ class Provider(metaclass=abc.ABCMeta): def run(self, src, filename=None): info = self.get_paper_info(src) - clean_filename = self.create_filename(info, filename) + clean_filename = filename or self.create_filename(info) tmp_filename = "paper.pdf" self.initial_dir = os.getcwd() -- cgit v1.2.3 From 221a27aaf0b5e7746a790610fe568ed33dcfbd7a Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 22:56:18 +0100 Subject: Rename unit test file --- tests/test.py | 106 ------------------------------------------------ tests/test_providers.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 106 deletions(-) delete mode 100644 tests/test.py create mode 100644 tests/test_providers.py diff --git a/tests/test.py b/tests/test.py deleted file mode 100644 index 83c74af..0000000 --- a/tests/test.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__author__ = "G.J.J. 
van den Burg" - -"""Tests""" - -import unittest -import tempfile -import hashlib -import shutil -import os - -from arxiv2remarkable import ( - ACM, - Arxiv, - LocalFile, - OpenReview, - PdfUrl, - Pubmed, - Springer, -) - -VERBOSE = False - - -def md5sum(filename): - blocksize = 65536 - hasher = hashlib.md5() - with open(filename, "rb") as fid: - buf = fid.read(blocksize) - while len(buf) > 0: - hasher.update(buf) - buf = fid.read(blocksize) - return hasher.hexdigest() - - -class Tests(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.original_dir = os.getcwd() - - def setUp(self): - self.test_dir = tempfile.mkdtemp() - os.chdir(self.test_dir) - - def tearDown(self): - os.chdir(self.original_dir) - shutil.rmtree(self.test_dir) - - def test_arxiv(self): - prov = Arxiv(upload=False, verbose=VERBOSE) - url = "https://arxiv.org/abs/1811.11242v1" - exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_pmc(self): - prov = Pubmed(upload=False, verbose=VERBOSE) - url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" - exp_filename = ( - "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" - ) - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_acm(self): - prov = ACM(upload=False, verbose=VERBOSE) - url = "https://dl.acm.org/citation.cfm?id=3025626" - exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_openreview(self): - prov = OpenReview(upload=False, verbose=VERBOSE) - url = "https://openreview.net/forum?id=S1x4ghC9tQ" - exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, 
os.path.basename(filename)) - - def test_springer(self): - prov = Springer(upload=False, verbose=VERBOSE) - url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" - exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" - filename = prov.run(url) - self.assertEqual(exp_filename, os.path.basename(filename)) - - def test_local(self): - local_filename = "test.pdf" - with open(local_filename, "w") as fp: - fp.write( - "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" - ) - prov = LocalFile(upload=False, verbose=VERBOSE) - filename = prov.run(local_filename) - self.assertEqual("test_.pdf", os.path.basename(filename)) - - def test_pdfurl(self): - prov = PdfUrl(upload=False, verbose=VERBOSE) - url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" - filename = prov.run(url, filename="test.pdf") - self.assertEqual("test.pdf", os.path.basename(filename)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_providers.py b/tests/test_providers.py new file mode 100644 index 0000000..bb793b3 --- /dev/null +++ b/tests/test_providers.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +__author__ = "G.J.J. 
van den Burg" + +"""Tests""" + +import unittest +import tempfile +import hashlib +import shutil +import os + +from paper2remarkable.providers import ( + ACM, + Arxiv, + LocalFile, + OpenReview, + PdfUrl, + PubMed, + Springer, +) + +VERBOSE = True + + +def md5sum(filename): + blocksize = 65536 + hasher = hashlib.md5() + with open(filename, "rb") as fid: + buf = fid.read(blocksize) + while len(buf) > 0: + hasher.update(buf) + buf = fid.read(blocksize) + return hasher.hexdigest() + + +class Tests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.original_dir = os.getcwd() + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + os.chdir(self.test_dir) + + def tearDown(self): + os.chdir(self.original_dir) + shutil.rmtree(self.test_dir) + + def test_arxiv(self): + prov = Arxiv(upload=False, verbose=VERBOSE) + url = "https://arxiv.org/abs/1811.11242v1" + exp_filename = "Burg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_pmc(self): + prov = PubMed(upload=False, verbose=VERBOSE) + url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3474301/" + exp_filename = ( + "Hoogenboom_Manske_-_How_to_Write_a_Scientific_Article_2012.pdf" + ) + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_acm(self): + prov = ACM(upload=False, verbose=VERBOSE) + url = "https://dl.acm.org/citation.cfm?id=3025626" + exp_filename = "Kery_Horvath_Myers_-_Variolite_Supporting_Exploratory_Programming_by_Data_Scientists_2017.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_openreview(self): + prov = OpenReview(upload=False, verbose=VERBOSE) + url = "https://openreview.net/forum?id=S1x4ghC9tQ" + exp_filename = "Gregor_et_al_-_Temporal_Difference_Variational_Auto-Encoder_2018.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, 
os.path.basename(filename)) + + def test_springer(self): + prov = Springer(upload=False, verbose=VERBOSE) + url = "https://link.springer.com/article/10.1007/s10618-019-00631-5" + exp_filename = "Mauw_Ramirez-Cruz_Trujillo-Rasua_-_Robust_Active_Attacks_on_Social_Graphs_2019.pdf" + filename = prov.run(url) + self.assertEqual(exp_filename, os.path.basename(filename)) + + def test_local(self): + local_filename = "test.pdf" + with open(local_filename, "w") as fp: + fp.write( + "%PDF-1.1\n%¥±ë\n\n1 0 obj\n << /Type /Catalog\n /Pages 2 0 R\n >>\nendobj\n\n2 0 obj\n << /Type /Pages\n /Kids [3 0 R]\n /Count 1\n /MediaBox [0 0 300 144]\n >>\nendobj\n\n3 0 obj\n << /Type /Page\n /Parent 2 0 R\n /Resources\n << /Font\n << /F1\n << /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n >>\n >>\n >>\n /Contents 4 0 R\n >>\nendobj\n\n4 0 obj\n << /Length 55 >>\nstream\n BT\n /F1 18 Tf\n 0 0 Td\n (Hello World) Tj\n ET\nendstream\nendobj\n\nxref\n0 5\n0000000000 65535 f \n0000000018 00000 n \n0000000077 00000 n \n0000000178 00000 n \n0000000457 00000 n \ntrailer\n << /Root 1 0 R\n /Size 5\n >>\nstartxref\n565\n%%EOF" + ) + prov = LocalFile(upload=False, verbose=VERBOSE) + filename = prov.run(local_filename) + self.assertEqual("test_.pdf", os.path.basename(filename)) + + def test_pdfurl(self): + prov = PdfUrl(upload=False, verbose=VERBOSE) + url = "http://www.jmlr.org/papers/volume17/14-526/14-526.pdf" + filename = prov.run(url, filename="test.pdf") + self.assertEqual("test.pdf", os.path.basename(filename)) + + +if __name__ == "__main__": + unittest.main() -- cgit v1.2.3 From 058589548a6b91350e240468f5ddaa47e7a10abf Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 22:57:44 +0100 Subject: Move paper info functionality to Informer class --- paper2remarkable/__init__.py | 3 + paper2remarkable/providers/__init__.py | 4 +- paper2remarkable/providers/_base.py | 100 +++++------------------------- paper2remarkable/providers/_info.py | 103 
+++++++++++++++++++++++++++++++ paper2remarkable/providers/acm.py | 41 ++++++------ paper2remarkable/providers/arxiv.py | 9 ++- paper2remarkable/providers/local.py | 26 ++++---- paper2remarkable/providers/openreview.py | 15 +++-- paper2remarkable/providers/pdf_url.py | 27 ++++---- paper2remarkable/providers/pubmed.py | 30 +++++---- paper2remarkable/providers/springer.py | 15 +++-- paper2remarkable/utils.py | 6 +- 12 files changed, 221 insertions(+), 158 deletions(-) create mode 100644 paper2remarkable/providers/_info.py diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py index e69de29..71c1105 100644 --- a/paper2remarkable/__init__.py +++ b/paper2remarkable/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" diff --git a/paper2remarkable/providers/__init__.py b/paper2remarkable/providers/__init__.py index 361c11e..f6f93f9 100644 --- a/paper2remarkable/providers/__init__.py +++ b/paper2remarkable/providers/__init__.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- from .arxiv import Arxiv -from .pubmed import Pubmed +from .pubmed import PubMed from .acm import ACM from .openreview import OpenReview from .springer import Springer from .local import LocalFile from .pdf_url import PdfUrl -providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl] +providers = [Arxiv, PubMed, ACM, OpenReview, Springer, LocalFile, PdfUrl] diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 8b454b0..ca6ab70 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -9,31 +9,19 @@ Copyright: 2019, G.J.J. 
van den Burg """ import abc -import bs4 import logging import os import shutil import tempfile -import titlecase -import unidecode +from ._info import Informer from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf -from ..utils import ( - check_file_is_pdf, - clean_string, - download_url, - get_page_with_retry, - upload_to_remarkable, -) +from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable class Provider(metaclass=abc.ABCMeta): """ ABC for providers of pdf sources """ - meta_author_key = "citation_author" - meta_title_key = "citation_title" - meta_date_key = "citation_date" - def __init__( self, verbose=False, @@ -54,11 +42,14 @@ class Provider(metaclass=abc.ABCMeta): self.pdfcrop_path = pdfcrop_path self.pdftk_path = pdftk_path self.gs_path = gs_path + self.informer = Informer() - if not self.verbose: + # disable logging if requested + logging.basicConfig(level=logging.INFO) + if not verbose: logging.disable() - # Define the operations to run on the pdf. Providers can add others + # Define the operations to run on the pdf. Providers can add others. 
self.operations = [("crop", self.crop_pdf)] if center: self.operations.append(("center", self.center_pdf)) @@ -84,87 +75,24 @@ class Provider(metaclass=abc.ABCMeta): def shrink_pdf(self, filepath): return shrink_pdf(filepath, gs_path=self.gs_path) - def retrieve_pdf(self, src, filename): + def retrieve_pdf(self, pdf_url, filename): """ Download pdf from src and save to filename """ - _, pdf_url = self.get_abs_pdf_urls(src) + # This must exist so that the LocalFile provider can overwrite it download_url(pdf_url, filename) - def _format_authors(self, soup_authors, sep=",", idx=0, op=None): - op = (lambda x: x) if op is None else op - # format the author list retrieved by bs4 - return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] - - def get_authors(self, soup): - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": self.meta_author_key}) - ] - return self._format_authors(authors) - - def get_title(self, soup): - target = soup.find_all("meta", {"name": self.meta_title_key}) - return target[0]["content"] - - def _format_date(self, soup_date): - return soup_date - - def get_date(self, soup): - date = soup.find_all("meta", {"name": self.meta_date_key})[0][ - "content" - ] - return self._format_date(date) - - def get_paper_info( - self, - src, - author_key="citation_author", - title_key="citation_title", - date_key="citation_date", - ): - """ Retrieve the title/author (surnames)/year information """ - abs_url, _ = self.get_abs_pdf_urls(src) - logging.info("Getting paper info") - page = get_page_with_retry(abs_url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = self.get_authors(soup) - title = self.get_title(soup) - date = self.get_date(soup) - return dict(title=title, date=date, authors=authors) - - def create_filename(self, info): - """ Generate filename using the info dict or filename if provided """ - # we assume that the list of authors is surname only. 
- logging.info("Generating output filename") - - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = clean_string(author_part) - - title_part = clean_string(info["title"]) - title_part = titlecase.titlecase(title_part).replace(" ", "_") - - year_part = info["date"].split("/")[0] - - name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" - name = unidecode.unidecode(name) - logging.info("Created filename: %s" % name) - return name - def run(self, src, filename=None): - info = self.get_paper_info(src) - clean_filename = filename or self.create_filename(info) + abs_url, pdf_url = self.get_abs_pdf_urls(src) + clean_filename = filename or self.informer.get_filename(abs_url) tmp_filename = "paper.pdf" self.initial_dir = os.getcwd() with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: os.chdir(working_dir) - self.retrieve_pdf(src, tmp_filename) - check_file_is_pdf(tmp_filename) + self.retrieve_pdf(pdf_url, tmp_filename) + assert_file_is_pdf(tmp_filename) intermediate_fname = tmp_filename - for op in self.operations: + for opname, op in self.operations: intermediate_fname = op(intermediate_fname) shutil.move(intermediate_fname, clean_filename) diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py new file mode 100644 index 0000000..04efcb1 --- /dev/null +++ b/paper2remarkable/providers/_info.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +"""Functionality for retrieving paper info +""" + +import logging +import titlecase +import unidecode +import bs4 + +from ..utils import clean_string, get_page_with_retry + + +class Informer: + """Base class for the informers. + + The "informer" class is used to retrieve the title, authors, and year of + publication of the provided paper. 
+ + This base class provides the main functionality, but because various + outlets use different conventions to embed author, title, and publication + year information, we expect that individual providers will subclass this + class and overwrite some of the methods. + """ + + meta_author_key = "citation_author" + meta_title_key = "citation_title" + meta_date_key = "citation_date" + + def __init__(self, title=None, authors=None, year=None): + self.title = title + self.authors = authors or [] + self.year = year + + def get_filename(self, abs_url): + """ Generate nice filename using the paper information + + The provided url must be to a HTMl page where this information can be + found, not to the PDF file itself. + """ + logging.info("Generating output filename") + + # Retrieve the paper information + self.get_info(abs_url) + + # we assume that the list of authors is surname only. + if len(self.authors) > 3: + authors = self.authors[0] + "_et_al" + else: + authors = "_".join(self.authors) + authors = clean_string(authors) + + # Clean the title and make it titlecase + title = clean_string(self.title) + title = titlecase.titlecase(title) + title = title.replace(" ", "_") + + year = str(self.year) + + name = authors + "_-_" + title + "_" + year + ".pdf" + name = unidecode.unidecode(name) + logging.info("Created filename: %s" % name) + return name + + def get_info(self, url): + logging.info("Getting paper info") + page = get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + self.authors = self.authors or self.get_authors(soup) + self.title = self.title or self.get_title(soup) + self.year = self.year or self.get_year(soup) + + ## Title + + def get_title(self, soup): + target = soup.find_all("meta", {"name": self.meta_title_key}) + return target[0]["content"] + + ## Authors + + def get_authors(self, soup): + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": self.meta_author_key}) + ] + return self._format_authors(authors) + + def 
_format_authors(self, soup_authors, sep=",", idx=0, op=None): + op = (lambda x: x) if op is None else op + # format the author list retrieved by bs4 + return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] + + ## Year + + def _format_year(self, soup_date): + return soup_date.split("/")[0] + + def get_year(self, soup): + """ Retrieve the contents of the meta_date_key field and format it """ + date = soup.find_all("meta", {"name": self.meta_date_key})[0][ + "content" + ] + return self._format_year(date) diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py index be98e16..e14efa7 100644 --- a/paper2remarkable/providers/acm.py +++ b/paper2remarkable/providers/acm.py @@ -11,23 +11,38 @@ Copyright: 2019, G.J.J. van den Burg import bs4 import re -from . import Provider -from ..utils import exception +from ._base import Provider +from ._info import Informer +from .. import GITHUB_URL +from ..utils import exception, get_page_with_retry -# TODO: put this somewhere central, now multiply defined -GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" - -class ACM(Provider): +class ACMInformer(Informer): meta_author_key = "citation_authors" + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(";") + return super()._format_authors(soup_authors, sep=",", idx=0, op=op) + + def _format_year(self, soup_date): + if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so it can be fixed: %s" % GITHUB_URL + ) + return soup_date.strip().split("/")[-1] + + +class ACM(Provider): + re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = ACMInformer() def get_acm_pdf_url(self, url): - page = self.get_page_with_retry(url) + page = get_page_with_retry(url) soup = bs4.BeautifulSoup(page, "html.parser") thea = None for a in soup.find_all("a"): @@ -60,15 
+75,3 @@ class ACM(Provider): def validate(src): m = re.fullmatch(ACM.re_abs, src) return not m is None - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(";") - return super()._format_authors(soup_authors, sep=",", idx=0, op=op) - - def _format_date(self, soup_date): - if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): - self.warn( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so it can be fixed: %s" % GITHUB_URL - ) - return soup_date.strip().split("/")[-1] diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index b1982f4..d950e47 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -11,11 +11,17 @@ Copyright: 2019, G.J.J. van den Burg import os import re import subprocess +import logging +from ._info import Informer from ._base import Provider from ..utils import exception +class ArxivInformer(Informer): + pass + + class Arxiv(Provider): re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" @@ -23,6 +29,7 @@ class Arxiv(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = ArxivInformer() # register the dearxiv operation self.operations.insert(0, ("dearxiv", self.dearxiv)) @@ -45,7 +52,7 @@ class Arxiv(Provider): def dearxiv(self, input_file): """Remove the arXiv timestamp from a pdf""" - self.log("Removing arXiv timestamp") + logging.info("Removing arXiv timestamp") basename = os.path.splitext(input_file)[0] uncompress_file = basename + "_uncompress.pdf" diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py index 68ce030..b1201d3 100644 --- a/paper2remarkable/providers/local.py +++ b/paper2remarkable/providers/local.py @@ -11,24 +11,28 @@ Copyright: 2019, G.J.J. van den Burg import os import shutil -from . 
import Provider +from ._base import Provider +from ._info import Informer + + +class LocalFileInformer(Informer): + def get_filenames(self, abs_url): + return os.path.basename(abs_url) class LocalFile(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = LocalFileInformer() + + def get_abs_pdf_url(self, url): + # The 'url' is the path to the local file. We use this as abs_url and + # pdf_url. + return url, url def validate(src): return os.path.exists(src) - def retrieve_pdf(self, src, filename): - source = os.path.join(self.initial_dir, src) + def retrieve_pdf(self, pdf_url, filename): + source = os.path.join(self.initial_dir, pdf_url) shutil.copy(source, filename) - - def get_paper_info(self, src): - return {"filename": src} - - def create_filename(self, info, filename=None): - if not filename is None: - return filename - return os.path.basename(info["filename"]) diff --git a/paper2remarkable/providers/openreview.py b/paper2remarkable/providers/openreview.py index b7e1d77..bfb139d 100644 --- a/paper2remarkable/providers/openreview.py +++ b/paper2remarkable/providers/openreview.py @@ -10,19 +10,27 @@ Copyright: 2019, G.J.J. van den Burg import re -from . 
import Provider +from ._base import Provider +from ._info import Informer from ..utils import exception -class OpenReview(Provider): +class OpenReviewInformer(Informer): meta_date_key = "citation_publication_date" + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class OpenReview(Provider): + re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = OpenReviewInformer() def get_abs_pdf_urls(self, url): """ Get the pdf and abstract url from a OpenReview url """ @@ -41,6 +49,3 @@ class OpenReview(Provider): return re.match(OpenReview.re_abs, src) or re.match( OpenReview.re_pdf, src ) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index 56427d3..f28c742 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -10,13 +10,25 @@ Copyright: 2019, G.J.J. van den Burg import urllib -from . 
import Provider +from ._base import Provider +from ._info import Informer + from ..utils import exception +class PdfUrlInformer(Informer): + + def get_filename(self, abs_url): + # if this is called, filename must not be provided + exception( + "Filename must be provided with PDFUrlProvider (use --filename)" + ) + + class PdfUrl(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = PdfUrlInformer() def validate(src): try: @@ -24,16 +36,3 @@ class PdfUrl(Provider): return all([result.scheme, result.netloc, result.path]) except: return False - - def retrieve_pdf(self, url, filename): - self.download_url(url, filename) - - def get_paper_info(self, src): - return None - - def create_filename(self, info, filename=None): - if filename is None: - exception( - "Filename must be provided with PDFUrlProvider (use --filename)" - ) - return filename diff --git a/paper2remarkable/providers/pubmed.py b/paper2remarkable/providers/pubmed.py index 29bdb31..ba4cca0 100644 --- a/paper2remarkable/providers/pubmed.py +++ b/paper2remarkable/providers/pubmed.py @@ -10,13 +10,27 @@ Copyright: 2019, G.J.J. van den Burg import re -from . import Provider +from ._base import Provider +from ._info import Informer from ..utils import exception -class Pubmed(Provider): + +class PubMedInformer(Informer): meta_author_key = "citation_authors" + def _format_authors(self, soup_authors): + op = lambda x: x[0].split(",") + return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) + + def _format_year(self, soup_date): + if re.match("\w+\ \d{4}", soup_date): + return soup_date.split(" ")[-1] + return soup_date.replace(" ", "_") + + +class PubMed(Provider): + re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" 
re_pdf = ( "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" @@ -24,6 +38,7 @@ class Pubmed(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = PubMedInformer() def get_abs_pdf_urls(self, url): """Get the pdf and html url from a given PMC url """ @@ -39,13 +54,4 @@ class Pubmed(Provider): return abs_url, pdf_url def validate(src): - return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src) - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(",") - return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) - - def _format_date(self, soup_date): - if re.match("\w+\ \d{4}", soup_date): - return soup_date.split(" ")[-1] - return soup_date.replace(" ", "_") + return re.match(PubMed.re_abs, src) or re.match(PubMed.re_pdf, src) diff --git a/paper2remarkable/providers/springer.py b/paper2remarkable/providers/springer.py index ce16007..ce4acdd 100644 --- a/paper2remarkable/providers/springer.py +++ b/paper2remarkable/providers/springer.py @@ -11,19 +11,27 @@ Copyright: 2019, G.J.J. van den Burg import re import urllib -from . 
import Provider +from ._base import Provider +from ._info import Informer from ..utils import exception -class Springer(Provider): +class SpringerInformer(Informer): meta_date_key = "citation_online_date" + def _format_authors(self, soup_authors): + return super()._format_authors(soup_authors, sep=" ", idx=-1) + + +class Springer(Provider): + re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.informer = SpringerInformer() def get_abs_pdf_urls(self, url): """ Get the pdf and abstract urls from a Springer url """ @@ -39,6 +47,3 @@ class Springer(Provider): def validate(src): return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 15cac95..2bed231 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -17,7 +17,7 @@ import sys import time import unidecode -GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" +from . import GITHUB_URL HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -47,8 +47,8 @@ def clean_string(s): return cleaned -def check_file_is_pdf(filename): - """Check that a given file is a PDF file. +def assert_file_is_pdf(filename): + """Assert that a given file is a PDF file. This is done by trying to open it using PyPDF2. 
""" -- cgit v1.2.3 From 552fdeff2832bfe6dc71ebdfdaf92387f5cb98b0 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 22:57:50 +0100 Subject: fix dependencies --- setup.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index e5a697e..3d1fbc5 100644 --- a/setup.py +++ b/setup.py @@ -19,15 +19,19 @@ VERSION = None # What packages are required for this module to be executed? REQUIRED = [ - "bs4>=4.8.0", - "requests>=2.21", - "pdfplumber>=0.5.12", - "unidecode>=1.1" + "beautifulsoup4>=4.8", + "requests>=2.21", + "pdfplumber>=0.5", + "unidecode>=1.1", + "titlecase>=0.12", + "PyPDF2>=1.26" + ] docs_require = [] test_require = [] -dev_require = [] +dev_require = [ + 'green'] # What packages are optional? EXTRAS = { -- cgit v1.2.3 From db56f74e5430ac1f1a1b255db3dc3fe799bffbbb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 22:58:08 +0100 Subject: minor makefile fixes --- Makefile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index ed2d040..baccb92 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ PACKAGE=paper2remarkable DOC_DIR='./docs/' -VENV_DIR='/tmp/p2r_venv/' +VENV_DIR=/tmp/p2r_venv/ .PHONY: help cover dist @@ -26,7 +26,7 @@ install: ## Install for the current user using the default python command test: venv ## Run unit tests - source $(VENV_DIR)/bin/activate && green -v ./tests/test_unit + source $(VENV_DIR)/bin/activate && green -v ./tests clean: ## Clean build dist and egg directories left after install @@ -52,11 +52,9 @@ doc: install ## Build documentation with Sphinx touch source/AUTOGENERATED $(MAKE) -C $(DOC_DIR) html - - venv: $(VENV_DIR)/bin/activate $(VENV_DIR)/bin/activate: test -d $(VENV_DIR) || virtualenv $(VENV_DIR) - source $(VENV_DIR)/bin/activate && pip install -q -e .[dev] + source $(VENV_DIR)/bin/activate && pip install -e .[dev] touch $(VENV_DIR)/bin/activate -- cgit v1.2.3 From 
754ae016ae27a337bf230d162abf6ea1b423bd7d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 22:58:31 +0100 Subject: Remove poetry stuff --- poetry.lock | 183 --------------------------------------------------------- pyproject.toml | 19 ------ 2 files changed, 202 deletions(-) delete mode 100644 poetry.lock delete mode 100644 pyproject.toml diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index 272967c..0000000 --- a/poetry.lock +++ /dev/null @@ -1,183 +0,0 @@ -[[package]] -category = "main" -description = "Screen-scraping library" -name = "beautifulsoup4" -optional = false -python-versions = "*" -version = "4.7.1" - -[package.dependencies] -soupsieve = ">=1.2" - -[[package]] -category = "main" -description = "Dummy package for Beautiful Soup" -name = "bs4" -optional = false -python-versions = "*" -version = "0.0.1" - -[package.dependencies] -beautifulsoup4 = "*" - -[[package]] -category = "main" -description = "Python package for providing Mozilla's CA Bundle." -name = "certifi" -optional = false -python-versions = "*" -version = "2018.11.29" - -[[package]] -category = "main" -description = "Universal encoding detector for Python 2 and 3" -name = "chardet" -optional = false -python-versions = "*" -version = "3.0.4" - -[[package]] -category = "main" -description = "Internationalized Domain Names in Applications (IDNA)" -name = "idna" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.8" - -[[package]] -category = "main" -description = "PDF parser and analyzer" -name = "pdfminer.six" -optional = false -python-versions = "*" -version = "20181108" - -[package.dependencies] -pycryptodome = "*" -six = "*" -sortedcontainers = "*" - -[[package]] -category = "main" -description = "Plumb a PDF for detailed information about each char, rectangle, and line." 
-name = "pdfplumber" -optional = false -python-versions = "*" -version = "0.5.12" - -[package.dependencies] -chardet = "*" -"pdfminer.six" = "20181108" -pillow = ">=3.0.0" -pycryptodome = "*" -unicodecsv = ">=0.14.1" -wand = "*" - -[[package]] -category = "main" -description = "Python Imaging Library (Fork)" -name = "pillow" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "6.0.0" - -[[package]] -category = "main" -description = "Cryptographic library for Python" -name = "pycryptodome" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "3.8.2" - -[[package]] -category = "main" -description = "Python HTTP for Humans." -name = "requests" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.21.0" - -[package.dependencies] -certifi = ">=2017.4.17" -chardet = ">=3.0.2,<3.1.0" -idna = ">=2.5,<2.9" -urllib3 = ">=1.21.1,<1.25" - -[[package]] -category = "main" -description = "Python 2 and 3 compatibility utilities" -name = "six" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*" -version = "1.12.0" - -[[package]] -category = "main" -description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" -name = "sortedcontainers" -optional = false -python-versions = "*" -version = "2.1.0" - -[[package]] -category = "main" -description = "A CSS4 selector implementation for Beautiful Soup." -name = "soupsieve" -optional = false -python-versions = "*" -version = "1.7.3" - -[[package]] -category = "main" -description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*." 
-name = "unicodecsv" -optional = false -python-versions = "*" -version = "0.14.1" - -[[package]] -category = "main" -description = "ASCII transliterations of Unicode text" -name = "unidecode" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "1.1.1" - -[[package]] -category = "main" -description = "HTTP library with thread-safe connection pooling, file post, and more." -name = "urllib3" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" -version = "1.24.1" - -[[package]] -category = "main" -description = "Ctypes-based simple MagickWand API binding for Python" -name = "wand" -optional = false -python-versions = "*" -version = "0.5.4" - -[metadata] -content-hash = "51a0dc0e8f6e6e23395cd5aca6a81e9b3aa121ec86f120f1304f2142eb2b65b0" -python-versions = "^3.5" - -[metadata.hashes] -beautifulsoup4 = ["034740f6cb549b4e932ae1ab975581e6103ac8f942200a0e9759065984391858", "945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"] -bs4 = ["36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"] -certifi = ["47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", "993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"] -chardet = ["84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"] -idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"] -"pdfminer.six" = ["f04d029d1d3e58c87da51bdefef2e9a1dbf2d7b63f727dd2a3e36054f5ae96ea"] -pdfplumber = ["f2d6861c21730630347d4f97b2e635860e9cb4074972c60b7c443701f7bfa88f"] -pillow = ["0683e80d81e840d401b687ebc00a02bbb23d0793c34d0852a5af64cfa1589540", "09c4e81c3277199898e8dc2d85d94febad87c41251ecbd447ba7d64d94765bd8", "0ee74a23022af9baf997e3016b4e090e4ff08688d37a6f49010338ab46cfe101", 
"10860baedfe5da7c43cd17835b091494dcc59dda5ad176a011713fe398ea6ac2", "15c056bfa284c30a7f265a41ac4cbbc93bdbfc0dfe0613b9cb8a8581b51a9e55", "1a4e06ba4f74494ea0c58c24de2bb752818e9d504474ec95b0aa94f6b0a7e479", "1c3c707c76be43c9e99cb7e3d5f1bee1c8e5be8b8a2a5eeee665efbf8ddde91a", "1fd0b290203e3b0882d9605d807b03c0f47e3440f97824586c173eca0aadd99d", "24114e4a6e1870c5a24b1da8f60d0ba77a0b4027907860188ea82bd3508c80eb", "258d886a49b6b058cd7abb0ab4b2b85ce78669a857398e83e8b8e28b317b5abb", "2734c55f7d054b0ad889c971136cbb0a5b35a921e27beaa44fdc2436af529c6e", "2ac36ec56727a95bd5a04dfca6abce1db8042c31ee73b65796a42f31fd52d009", "2bc1002b573d107c0b172a5da0f34b4900b2ddc6c3296b82d601e966d5ac1959", "33c79b6dd6bc7f65079ab9ca5bebffb5f5d1141c689c9c6a7855776d1b09b7e8", "367385fc797b2c31564c427430c7a8630db1a00bd040555dfc1d5c52e39fcd72", "3c1884ff078fb8bf5f63d7d86921838b82ed4a7d0c027add773c2f38b3168754", "44e5240e8f4f8861d748f2a58b3f04daadab5e22bfec896bf5434745f788f33f", "46aa988e15f3ea72dddd81afe3839437b755fffddb5e173886f11460be909dce", "492e1e4df823b57f9334f591c78a1e0e65a361e92594534e0568eeeeea56bbba", "50fb9e25d25cfcb50b2e6842c4e104e4f0b424be4624e1724532bf005c67589a", "5ceadd60dbd1e56ab7faffbfee1df5ecb83c3f0420e47f652cd5306d70eb0296", "74d90d499c9c736d52dd6d9b7221af5665b9c04f1767e35f5dd8694324bd4601", "7eeac51fc37e6b19631a4b8e38b8261a074efcf7cc27fc16a6bee4697af7aaa5", "809c0a2ce9032cbcd7b5313f71af4bdc5c8c771cb86eb7559afd954cab82ebb5", "85d1ef2cdafd5507c4221d201aaf62fc9276f8b0f71bd3933363e62a33abc734", "8c3889c7681af77ecfa4431cd42a2885d093ecb811e81fbe5e203abc07e0995b", "9218d81b9fca98d2c47d35d688a0cea0c42fd473159dfd5612dcb0483c63e40b", "9319215530e236822169cbe92426cdc18d16b88c943fdf365a6309a89876e335", "96ec275c83bf839972d6a7dd7d685fdfb6a3233c3c382ecff839d04e7d53955d", "9aa4f3827992288edd37c9df345783a69ef58bd20cc02e64b36e44bcd157bbf1", "9d80f44137a70b6f84c750d11019a3419f409c944526a95219bea0ac31f4dd91", "b7ebd36128a2fe93991293f997e44be9286503c7530ace6a55b938b20be288d8", 
"c30857e1fbf7d4a4b79d7d376eefaf293ea4307b8293d00a62e6f517f51bfe9b", "c4c78e2c71c257c136cdd43869fd3d5e34fc2162dc22e4a5406b0ebe86958239", "c5472ea3945e8f9eb0659f37fc1f592fd06f4f725f0f03774a8999ad8c130334", "c6a842537f887be1fe115d8abb5daa9bc8cc124e455ff995830cc785624a97af", "cf0a2e040fdf5a6d95f4c286c6ef1df6b36c218b528c8a9158ec2452a804b9b8", "cfd28aad6fc61f7a5d4ee556a997dc6e5555d9381d1390c00ecaf984d57e4232", "d0fd1ec2e7c3e0aeaae999efe83f5d0f42c1160a1f8be5120d40857d20baa452", "dca5660e25932771460d4688ccbb515677caaf8595f3f3240ec16c117deff89a", "de7aedc85918c2f887886442e50f52c1b93545606317956d65f342bd81cb4fc3", "e6c0bbf8e277b74196e3140c35f9a1ae3eafd818f7f2d3a15819c49135d6c062"] -pycryptodome = ["0281dc6a65a4d0d9e439f54e0ad5faf27bfdc2ebe9ead36912bac74a0920fa2e", "02af9b284f5c9a55f06f5e4532c16c9b7bd958e293e93969934d864ef7bd87ee", "09da99372fb69762e4b9690291176a166cc351793e2e1c9405d29ca291503aa8", "0c2400ccfc049c3f24e65d4f02bb4208d86e408011019e455fab7f50d2b226c9", "2081dd6dce6b21bf3596427edaedd4f2561dce616893b162ed2c674f3a3ca70a", "28b86ec9fdb005a2a18e4862a3a7277046738825ee8dc89cda5657e75a396089", "2d790c0d4c0d5edcf5fbab4e2af7b03757e40c5ae8d217f0dfe9ddea37fe130f", "2f24906153dca16528cf5515b1afa9ef635423d5a654904e861765f88ca667b6", "30d283939896fa4bacbdb9fa86e6fd51e9a5b953a511e210b38481f697f289f5", "31f78b67f97830d137f74813c0502a181a03b43a32ed124049bb20428176c307", "33c1f3a380fd38ab4dd4372bef17e98002b360b52814bb1b077693b1bd06ec87", "34091e9a6650c44e25339f22fc821396f19f152f65be2546edd823a093fb5a04", "567fb73951ab6865a2eb1a0060b54be1e27302574f6c65879525bdf53fab49e1", "5bc40f8aa7ba8ca7f833ad2477b9d84e1bfd2630b22a46d9bbd221982f8c3ac0", "6b0a0ccf33c7a6100c569667c888335a4aaf0d22218cb97b4963a65d70f6c343", "71b93157f1ce93fc7cfff9359b76def2b4826a7ef7a7f95e070161368e7f584a", "7d939d511b7dac29b2d936706786771ecb8256e43fade5cdb0e8bc58f02b86cf", "7fbc5a93d52e4c51487f4648b00dc41700adb144d10fc567b05f852e76c243ad", "9cb94b8f9c915a5d2b273d612a25a8e5d67b49543f8eb6bcec0275ac46cda421", 
"a585ea1722f9731e75881d5ffcc51d11c794d244ac57e7c2a9cbb8d5ac729302", "a6458dd7a10ae51f6fce56bdfc79bf6d3b54556237045d09e77fbda9d6d37864", "a9fb92e948128bce0239b87c6efcf2cb1c5a703d0b41dd6835211e6fafd1c5df", "b0b6b4ca1c53e7d6ca9f2720919f63837f05e7a5f92912a2bc29bfd03ed3b54f", "b7d22c8d648aaa3a7ec785eda544402141eb78ac5ffbba4cbe2c3a1f52276870", "bc9560574a868cfa2ba781b7bb0b4685b08ea251697abfc49070ffc05e1cbee6", "c0c5a576f3f7b7de3f86889cb47eb51b59dc11db9cf1e2a0f51eb4d988010ea4", "e1c91c2fa942a71c98a7a1f462de6dbbe82f34b9267eb8131314d97bd13bf0d4", "ec936361ad78aa95382c313df95777795b8185aac5dd3ec5463363ea94b556fc"] -requests = ["502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"] -six = ["3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", "d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"] -sortedcontainers = ["974e9a32f56b17c1bac2aebd9dcf197f3eb9cd30553c5852a3187ad162e1a03a", "d9e96492dd51fae31e60837736b38fe42a187b5404c16606ff7ee7cd582d4c60"] -soupsieve = ["466910df7561796a60748826781ebe9a888f7a1668a636ae86783f44d10aae73", "87db12ae79194f0ff9808d2b1641c4f031ae39ffa3cab6b907ea7c1e5e5ed445"] -unicodecsv = ["018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"] -unidecode = ["1d7a042116536098d05d599ef2b8616759f02985c85b4fef50c78a5aaf10822a", "2b6aab710c2a1647e928e36d69c21e76b453cd455f4e2621000e54b2a9b8cce8"] -urllib3 = ["61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22"] -wand = ["c52d647a34205f9b3948baae739db461a7379a04818548fe8042b5ce751ea6ea", "e2e08e19a37c61e85eaa307fe319889af46fe4cac6c23e3ae668b96be3e497ff"] diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 7e9c629..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[tool.poetry] -name = "arxiv2remarkable" -version = "0.1.0" -description = "Download an arXiv paper and send it to 
reMarkable" -authors = ["Gertjan van den Burg "] -license = "MIT" - -[tool.poetry.dependencies] -python = "^3.5" -bs4 = "^0.0.1" -requests = "^2.21" -pdfplumber = "^0.5.12" -unidecode = "^1.1" - -[tool.poetry.dev-dependencies] - -[build-system] -requires = ["poetry>=0.12"] -build-backend = "poetry.masonry.api" -- cgit v1.2.3 From 317e79cc6aaa9572e4090dad653df8fd6eff9563 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 23:03:41 +0100 Subject: Remove old script as its no longer needed --- arxiv2remarkable.py | 859 ---------------------------------------------------- 1 file changed, 859 deletions(-) delete mode 100755 arxiv2remarkable.py diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py deleted file mode 100755 index 5694e1b..0000000 --- a/arxiv2remarkable.py +++ /dev/null @@ -1,859 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__version__ = "0.3.5" -__author__ = "G.J.J. van den Burg" - -""" -Download a paper from various sources and send it to the reMarkable. - -Author: G.J.J. 
van den Burg -Date: 2019-02-02 -License: MIT - -""" - -import PyPDF2 -import abc -import argparse -import bs4 -import datetime -import os -import pdfplumber -import re -import requests -import shutil -import string -import subprocess -import sys -import tempfile -import time -import titlecase -import unidecode -import urllib.parse - -GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" - -HEADERS = { - "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 " - "Safari/537.36" -} - -RM_WIDTH = 1404 -RM_HEIGHT = 1872 - - -class Provider(metaclass=abc.ABCMeta): - """ ABC for providers of pdf sources """ - - meta_author_key = "citation_author" - meta_title_key = "citation_title" - meta_date_key = "citation_date" - - def __init__( - self, - verbose=False, - upload=True, - debug=False, - center=False, - blank=False, - remarkable_dir="/", - rmapi_path="rmapi", - pdfcrop_path="pdfcrop", - pdftk_path="pdftk", - gs_path="gs", - ): - self.verbose = verbose - self.upload = upload - self.debug = debug - self.center = center - self.blank = blank - self.remarkable_dir = remarkable_dir - self.rmapi_path = rmapi_path - self.pdfcrop_path = pdfcrop_path - self.pdftk_path = pdftk_path - self.gs_path = gs_path - - self.log("Starting %s" % type(self).__name__) - - def log(self, msg, mode="info"): - if not self.verbose: - return - if not mode in ["info", "warning"]: - raise ValueError("unknown logging mode.") - now = datetime.datetime.now() - print( - now.strftime("%Y-%m-%d %H:%M:%S") - + " - " - + mode.upper() - + " - " - + msg - ) - - def warn(self, msg): - self.log(msg, mode="warning") - - @staticmethod - @abc.abstractmethod - def validate(src): - """ Validate whether ``src`` is appropriate for this provider """ - - def retrieve_pdf(self, src, filename): - """ Download pdf from src and save to filename """ - _, pdf_url = self.get_abs_pdf_urls(src) - self.download_url(pdf_url, filename) - - def 
_format_authors(self, soup_authors, sep=",", idx=0, op=None): - op = (lambda x: x) if op is None else op - # format the author list retrieved by bs4 - return [x.strip().split(sep)[idx].strip() for x in op(soup_authors)] - - def get_authors(self, soup): - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": self.meta_author_key}) - ] - return self._format_authors(authors) - - def get_title(self, soup): - target = soup.find_all("meta", {"name": self.meta_title_key}) - return target[0]["content"] - - def _format_date(self, soup_date): - return soup_date - - def get_date(self, soup): - date = soup.find_all("meta", {"name": self.meta_date_key})[0][ - "content" - ] - return self._format_date(date) - - def get_paper_info( - self, - src, - author_key="citation_author", - title_key="citation_title", - date_key="citation_date", - ): - """ Retrieve the title/author (surnames)/year information """ - abs_url, _ = self.get_abs_pdf_urls(src) - self.log("Getting paper info") - page = self.get_page_with_retry(abs_url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = self.get_authors(soup) - title = self.get_title(soup) - date = self.get_date(soup) - return dict(title=title, date=date, authors=authors) - - def string_clean(self, s): - """ Clean a string to replace accented characters with equivalents and - keep only the allowed characters """ - normalized = unidecode.unidecode(s) - allowed = string.ascii_letters + string.digits + "_ ." - cleaned = "".join(c if c in allowed else "_" for c in normalized) - return cleaned - - def create_filename(self, info, filename=None): - """ Generate filename using the info dict or filename if provided """ - if not filename is None: - return filename - # we assume that the list of authors is surname only. 
- self.log("Generating output filename") - - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = self.string_clean(author_part) - - title_part = self.string_clean(info["title"]) - title_part = titlecase.titlecase(title_part).replace(" ", "_") - - year_part = info["date"].split("/")[0] - - name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" - name = unidecode.unidecode(name) - self.log("Created filename: %s" % name) - return name - - def blank_pdf(self, filepath): - if not self.blank: - return filepath - - self.log("Adding blank pages") - input_pdf = PyPDF2.PdfFileReader(filepath) - output_pdf = PyPDF2.PdfFileWriter() - for page in input_pdf.pages: - output_pdf.addPage(page) - output_pdf.addBlankPage() - - output_file = os.path.splitext(filepath)[0] + "-blank.pdf" - with open(output_file, "wb") as fp: - output_pdf.write(fp) - return output_file - - def crop_pdf(self, filepath): - self.log("Cropping pdf file") - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - cropper = Cropper( - filepath, cropped_file, pdfcrop_path=self.pdfcrop_path - ) - status = cropper.crop(margins=15) - - if not status == 0: - self.warn("Failed to crop the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(cropped_file): - self.warn( - "Can't find cropped file '%s' where expected." % cropped_file - ) - return filepath - return cropped_file - - def center_pdf(self, filepath): - if not self.center: - return filepath - - self.log("Centering pdf file") - centered_file = os.path.splitext(filepath)[0] + "-center.pdf" - cropper = Cropper( - filepath, centered_file, pdfcrop_path=self.pdfcrop_path - ) - status = cropper.center() - if not status == 0: - self.warn("Failed to center the pdf file at: %s" % filepath) - return filepath - if not os.path.exists(centered_file): - self.warn( - "Can't find centered file '%s' where expected." 
% centered_file - ) - return filepath - return centered_file - - def shrink_pdf(self, filepath): - self.log("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - self.gs_path, - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - self.warn("Failed to shrink the pdf file") - return filepath - return output_file - - def check_file_is_pdf(self, filename): - try: - fp = open(filename, "rb") - pdf = PyPDF2.PdfFileReader(fp, strict=False) - fp.close() - del pdf - return True - except PyPDF2.utils.PdfReadError: - exception("Downloaded file isn't a valid pdf file.") - - def download_url(self, url, filename): - """Download the content of an url and save it to a filename """ - self.log("Downloading file at url: %s" % url) - content = self.get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) - - def get_page_with_retry(self, url, tries=5): - count = 0 - while count < tries: - count += 1 - error = False - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - error = True - if error or not res.ok: - self.warn("Error getting url %s. 
Retrying in 5 seconds" % url) - time.sleep(5) - continue - self.log("Downloading url: %s" % url) - return res.content - - def upload_to_rm(self, filepath): - remarkable_dir = self.remarkable_dir.rstrip("/") - self.log("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [self.rmapi_path, "mkdir", remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception( - "Creating directory %s on reMarkable failed" - % remarkable_dir - ) - status = subprocess.call( - [self.rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - self.log("Upload successful.") - - def dearxiv(self, input_file): - """Remove the arXiv timestamp from a pdf""" - self.log("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - - status = subprocess.call( - [ - self.pdftk_path, - input_file, - "output", - uncompress_file, - "uncompress", - ] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) - - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) - - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - - return output_file - - def run(self, src, filename=None): - info = self.get_paper_info(src) - clean_filename = self.create_filename(info, filename) - tmp_filename = "paper.pdf" - - 
self.initial_dir = os.getcwd() - with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: - os.chdir(working_dir) - self.retrieve_pdf(src, tmp_filename) - self.check_file_is_pdf(tmp_filename) - - ops = [ - self.dearxiv, - self.crop_pdf, - self.center_pdf, - self.blank_pdf, - self.shrink_pdf, - ] - intermediate_fname = tmp_filename - for op in ops: - intermediate_fname = op(intermediate_fname) - shutil.move(intermediate_fname, clean_filename) - - if self.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if self.upload: - return self.upload_to_rm(clean_filename) - - target_path = os.path.join(self.initial_dir, clean_filename) - while os.path.exists(target_path): - base = os.path.splitext(target_path)[0] - target_path = base + "_.pdf" - shutil.move(clean_filename, target_path) - return target_path - - -class Arxiv(Provider): - - re_abs = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?" - re_pdf = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """Get the pdf and abs url from any given arXiv url """ - if re.match(self.re_abs, url): - abs_url = url - pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match(self.re_pdf, url): - abs_url = url[:-4].replace("pdf", "abs") - pdf_url = url - else: - exception("Couldn't figure out arXiv urls.") - return abs_url, pdf_url - - def validate(src): - """Check if the url is to an arXiv page. """ - return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) - - -class Pubmed(Provider): - - meta_author_key = "citation_authors" - - re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?" 
- re_pdf = ( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf" - ) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """Get the pdf and html url from a given PMC url """ - if re.match(self.re_pdf, url): - idx = url.index("pdf") - abs_url = url[: idx - 1] - pdf_url = url - elif re.match(self.re_abs, url): - abs_url = url - pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually - else: - exception("Couldn't figure out PMC urls.") - return abs_url, pdf_url - - def validate(src): - return re.match(Pubmed.re_abs, src) or re.match(Pubmed.re_pdf, src) - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(",") - return super()._format_authors(soup_authors, sep=" ", idx=-1, op=op) - - def _format_date(self, soup_date): - if re.match("\w+\ \d{4}", soup_date): - return soup_date.split(" ")[-1] - return soup_date.replace(" ", "_") - - -class ACM(Provider): - - meta_author_key = "citation_authors" - - re_abs = "https?://dl.acm.org/citation.cfm\?id=\d+" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_acm_pdf_url(self, url): - page = self.get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: - return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href - - def get_abs_pdf_urls(self, url): - if re.match(self.re_abs, url): - abs_url = url - pdf_url = self.get_acm_pdf_url(url) - if pdf_url is None: - exception( - "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" - ) - else: - exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." 
- ) - return abs_url, pdf_url - - def validate(src): - m = re.fullmatch(ACM.re_abs, src) - return not m is None - - def _format_authors(self, soup_authors): - op = lambda x: x[0].split(";") - return super()._format_authors(soup_authors, sep=",", idx=0, op=op) - - def _format_date(self, soup_date): - if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): - self.warn( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so it can be fixed: %s" % GITHUB_URL - ) - return soup_date.strip().split("/")[-1] - - -class OpenReview(Provider): - - meta_date_key = "citation_publication_date" - - re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+" - re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """ Get the pdf and abstract url from a OpenReview url """ - if re.match(self.re_abs, url): - abs_url = url - pdf_url = url.replace("forum", "pdf") - elif re.match(self.re_pdf, url): - abs_url = url.replace("pdf", "forum") - pdf_url = url - else: - exception("Couldn't figure out OpenReview urls.") - return abs_url, pdf_url - - def validate(src): - """ Check if the url is a valid OpenReview url. 
""" - return re.match(OpenReview.re_abs, src) or re.match( - OpenReview.re_pdf, src - ) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) - - -class Springer(Provider): - - meta_date_key = "citation_online_date" - - re_abs = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+" - re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-]+\.pdf" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_abs_pdf_urls(self, url): - """ Get the pdf and abstract urls from a Springer url """ - if re.match(self.re_abs, url): - abs_url = url - pdf_url = url.replace("article", "content/pdf") - elif re.match(self.re_pdf, url): - abs_url = url.replace("content/pdf", "article")[: -len(".pdf")] - pdf_url = urllib.parse.unquote(url) - else: - exception("Couldn't figure out Springer urls.") - return abs_url, pdf_url - - def validate(src): - return re.match(Springer.re_abs, src) or re.match(Springer.re_pdf, src) - - def _format_authors(self, soup_authors): - return super()._format_authors(soup_authors, sep=" ", idx=-1) - - -class LocalFile(Provider): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def validate(src): - return os.path.exists(src) - - def retrieve_pdf(self, src, filename): - source = os.path.join(self.initial_dir, src) - shutil.copy(source, filename) - - def get_paper_info(self, src): - return {"filename": src} - - def create_filename(self, info, filename=None): - if not filename is None: - return filename - return os.path.basename(info["filename"]) - - -class PdfUrl(Provider): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def validate(src): - try: - result = urllib.parse.urlparse(src) - return all([result.scheme, result.netloc, result.path]) - except: - return False - - def retrieve_pdf(self, url, filename): - self.download_url(url, filename) - - def get_paper_info(self, src): - 
return None - - def create_filename(self, info, filename=None): - if filename is None: - exception( - "Filename must be provided with PDFUrlProvider (use --filename)" - ) - return filename - - -class Cropper(object): - def __init__( - self, input_file=None, output_file=None, pdfcrop_path="pdfcrop" - ): - if not input_file is None: - self.input_file = os.path.abspath(input_file) - self.reader = PyPDF2.PdfFileReader(self.input_file) - if not output_file is None: - self.output_file = os.path.abspath(output_file) - self.pdfcrop_path = pdfcrop_path - - self.writer = PyPDF2.PdfFileWriter() - - def crop(self, margins=1): - return self.process_file(self.crop_page, margins=margins) - - def center(self, padding=15): - return self.process_file(self.center_page, padding=padding) - - def process_file(self, page_func, *args, **kwargs): - for page_idx in range(self.reader.getNumPages()): - status = page_func(page_idx, *args, **kwargs) - if not status == 0: - return status - with open(self.output_file, "wb") as fp: - self.writer.write(fp) - return 0 - - def center_page(self, page_idx, padding): - return self.process_page( - page_idx, self.get_center_bbox, padding=padding - ) - - def crop_page(self, page_idx, margins): - return self.process_page(page_idx, self.get_bbox, margins=margins) - - def export_page(self, page_idx): - """Helper function that exports a single page given by index """ - page = self.reader.getPage(page_idx) - writer = PyPDF2.PdfFileWriter() - writer.addPage(page) - tmpfname = "./page.pdf" - with open(tmpfname, "wb") as fp: - writer.write(fp) - return tmpfname - - def process_page(self, page_idx, bbox_func, *args, **kwargs): - """Process a single page and add it to the writer """ - tmpfname = self.export_page(page_idx) - tmpfout = "./output.pdf" - bbox = bbox_func(tmpfname, *args, **kwargs) - status = subprocess.call( - [ - self.pdfcrop_path, - "--bbox", - " ".join(map(str, bbox)), - tmpfname, - tmpfout, - ], - stdout=subprocess.DEVNULL, - ) - if not status == 0: 
- return status - reader = PyPDF2.PdfFileReader(tmpfout) - page = reader.getPage(0) - self.writer.addPage(page) - os.unlink(tmpfname) - os.unlink(tmpfout) - return 0 - - def get_bbox(self, filename, margins=1, resolution=72): - """Get the bounding box, with optional margins - - if margins is integer, used for all margins, else - margins = [left, top, right, bottom] - - We get the bounding box by finding the smallest rectangle that is - completely surrounded by white pixels. - """ - if isinstance(margins, int): - margins = [margins for _ in range(4)] - pdf = pdfplumber.open(filename) - im = pdf.pages[0].to_image(resolution=resolution) - pdf.close() - - pixels = list(im.original.getdata()) - W, H = im.original.size - - # M is a list of H lists with each W integers that equal the sum of the - # pixel values - M = [[sum(x) for x in pixels[i * W : (i + 1) * W]] for i in range(H)] - - left, top, bottom, right = 0, 0, 0, 0 - while top < H and sum(M[top]) == W * 255 * 3: - top += 1 - while bottom < H and sum(M[H - 1 - bottom]) == W * 255 * 3: - bottom += 1 - - # Transpose M - M = list(zip(*M)) - while left < W and sum(M[left]) == H * 255 * 3: - left += 1 - while right < W and sum(M[W - 1 - right]) == H * 255 * 3: - right += 1 - - left -= margins[0] - top -= margins[1] - right -= margins[2] - bottom -= margins[3] - - # This is the bounding box in PIL format: (0, 0) top left - x0, y0, x1, y1 = left, top, W - right, H - bottom - - # Get the bbox in Ghostscript format: (0, 0) bottom left - a0, b0, a1, b1 = x0, H - y1, x1, H - y0 - return [a0, b0, a1, b1] - - def get_center_bbox(self, filename, padding=15): - """Compute a bounding box that will center the page file on the - reMarkable - """ - bbox = self.get_bbox(filename, margins=0) - - h = bbox[3] - bbox[1] - w = bbox[2] - bbox[0] - - # we want some minimal padding all around, because it is visually more - # pleasing. 
- h_prime = h + 2 * padding - w_prime = w + 2 * padding - - # if the document is wider than the remarkable, we add top-padding to - # center it, otherwise we add left-padding - x, y = 0, 0 - if h_prime / w_prime < RM_HEIGHT / RM_WIDTH: - y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2 - else: - x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2 - - margins = [padding + x, padding + y, padding, padding] - return self.get_bbox(filename, margins=margins) - - -def exception(msg): - print("ERROR: " + msg, file=sys.stderr) - print("Error occurred. Exiting.", file=sys.stderr) - print("", file=sys.stderr) - print( - "If you think this might be a bug, please raise an issue on GitHub: %s" - % GITHUB_URL - ) - raise SystemExit(1) - - -def parse_args(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "-b", - "--blank", - help="Add a blank page after every page of the PDF", - action="store_true", - ) - parser.add_argument( - "-v", "--verbose", help="be verbose", action="store_true" - ) - parser.add_argument( - "-n", - "--no-upload", - help="don't upload to the reMarkable, save the output in current working dir", - action="store_true", - ) - parser.add_argument( - "-d", - "--debug", - help="debug mode, doesn't upload to reMarkable", - action="store_true", - ) - parser.add_argument( - "-c", - "--center", - help="Center the PDF on the page, instead of left align", - action="store_true", - ) - parser.add_argument( - "--filename", - help="Filename to use for the file on reMarkable", - default=None, - ) - parser.add_argument( - "-p", - "--remarkable-path", - help="directory on reMarkable to put the file (created if missing)", - dest="remarkable_dir", - default="/", - ) - parser.add_argument( - "--rmapi", help="path to rmapi executable", default="rmapi" - ) - parser.add_argument( - "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop" - ) - parser.add_argument( - "--pdftk", help="path to pdftk 
executable", default="pdftk" - ) - parser.add_argument("--gs", help="path to gs executable", default="gs") - parser.add_argument( - "input", help="URL to a paper or the path of a local PDF file" - ) - return parser.parse_args() - - -def main(): - args = parse_args() - - providers = [Arxiv, Pubmed, ACM, OpenReview, Springer, LocalFile, PdfUrl] - - provider = next((p for p in providers if p.validate(args.input)), None) - if provider is None: - exception("Input not valid, no provider can handle this source.") - - prov = provider( - verbose=args.verbose, - upload=not args.no_upload, - debug=args.debug, - center=args.center, - blank=args.blank, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - pdfcrop_path=args.pdfcrop, - pdftk_path=args.pdftk, - gs_path=args.gs, - ) - - prov.run(args.input, filename=args.filename) - - -if __name__ == "__main__": - main() -- cgit v1.2.3 From b0b3b177dd2ee5555fb5a6a68c529d5673df83bb Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 23:39:03 +0100 Subject: Switch to a simple logger singleton The logging module gave problems because one of the pdf packages is based on a package that extensively used the info level of the logging module, and this seemed like the easiest solution. --- paper2remarkable/log.py | 56 +++++++++++++++++++++++++++++++++++ paper2remarkable/pdf_ops.py | 21 ++++++------- paper2remarkable/providers/_base.py | 9 +++--- paper2remarkable/providers/_info.py | 10 ++++--- paper2remarkable/providers/acm.py | 5 +++- paper2remarkable/providers/arxiv.py | 6 ++-- paper2remarkable/providers/pdf_url.py | 1 - paper2remarkable/utils.py | 15 ++++++---- 8 files changed, 95 insertions(+), 28 deletions(-) create mode 100644 paper2remarkable/log.py diff --git a/paper2remarkable/log.py b/paper2remarkable/log.py new file mode 100644 index 0000000..bae1cbf --- /dev/null +++ b/paper2remarkable/log.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +"""Just a simple logger + +Author: G.J.J. 
van den Burg +License: See LICENSE file. +Copyright: 2019, G.J.J. van den Burg + +""" + +# NOTE: I know about the logging module, but this was easier because one of the +# dependencies was using that and it became complicated. This one is obviously +# not thread-safe and is very simple. + +import datetime +import sys + + +class Singleton(type): + # https://stackoverflow.com/q/6760685 + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__( + *args, **kwargs + ) + return cls._instances[cls] + + +class Logger(metaclass=Singleton): + def __init__(self): + self.enabled = True + + def enable(self): + self.enabled = True + + def disable(self): + self.enabled = False + + def _log(self, msg, mode): + if not self.enabled: + return + if not mode in ("info", "warn"): + raise ValueError("Unknown logging mode: %s" % mode) + file = sys.stdout if mode == "info" else sys.stderr + now = datetime.datetime.now() + nowstr = now.strftime("%Y-%m-%d %H:%M:%S") + print("%s - %s - %s" % (nowstr, mode.upper(), msg), file=file) + file.flush() + + def info(self, msg): + self._log(msg, "info") + + def warning(self, msg): + self._log(msg, "warn") diff --git a/paper2remarkable/pdf_ops.py b/paper2remarkable/pdf_ops.py index d1eae40..8636017 100644 --- a/paper2remarkable/pdf_ops.py +++ b/paper2remarkable/pdf_ops.py @@ -10,27 +10,28 @@ Copyright: 2019, The Alan Turing Institute import PyPDF2 -import logging import os import subprocess from .crop import Cropper +from .log import Logger +logger = Logger() def crop_pdf(filepath, pdfcrop_path="pdfcrop"): """Crop the pdf file using Cropper """ - logging.info("Cropping pdf file") + logger.info("Cropping pdf file") cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" cropper = Cropper(filepath, cropped_file, pdfcrop_path=pdfcrop_path) status = cropper.crop(margins=15) if not status == 0: - logging.warning("Failed to crop the pdf file at: %s" % filepath) + 
logger.warning("Failed to crop the pdf file at: %s" % filepath) return filepath if not os.path.exists(cropped_file): - logging.warning( + logger.warning( "Can't find cropped file '%s' where expected." % cropped_file ) return filepath @@ -40,17 +41,17 @@ def crop_pdf(filepath, pdfcrop_path="pdfcrop"): def center_pdf(filepath, pdfcrop_path="pdfcrop"): """Center the pdf file on the reMarkable """ - logging.info("Centering pdf file") + logger.info("Centering pdf file") centered_file = os.path.splitext(filepath)[0] + "-center.pdf" cropper = Cropper(filepath, centered_file, pdfcrop_path=pdfcrop_path) status = cropper.center() if not status == 0: - logging.warning("Failed to center the pdf file at: %s" % filepath) + logger.warning("Failed to center the pdf file at: %s" % filepath) return filepath if not os.path.exists(centered_file): - logging.warning( + logger.warning( "Can't find centered file '%s' where expected." % centered_file ) return filepath @@ -60,7 +61,7 @@ def center_pdf(filepath, pdfcrop_path="pdfcrop"): def blank_pdf(filepath): """Add blank pages to PDF """ - logging.info("Adding blank pages") + logger.info("Adding blank pages") input_pdf = PyPDF2.PdfFileReader(filepath) output_pdf = PyPDF2.PdfFileWriter() for page in input_pdf.pages: @@ -76,7 +77,7 @@ def blank_pdf(filepath): def shrink_pdf(filepath, gs_path="gs"): """Shrink the PDF file size using Ghostscript """ - logging.info("Shrinking pdf file") + logger.info("Shrinking pdf file") output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" status = subprocess.call( [ @@ -92,6 +93,6 @@ def shrink_pdf(filepath, gs_path="gs"): ] ) if not status == 0: - logging.warning("Failed to shrink the pdf file") + logger.warning("Failed to shrink the pdf file") return filepath return output_file diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index ca6ab70..5432d48 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -9,7 +9,6 @@ 
Copyright: 2019, G.J.J. van den Burg """ import abc -import logging import os import shutil import tempfile @@ -17,6 +16,9 @@ import tempfile from ._info import Informer from ..pdf_ops import crop_pdf, center_pdf, blank_pdf, shrink_pdf from ..utils import assert_file_is_pdf, download_url, upload_to_remarkable +from ..log import Logger + +logger = Logger() class Provider(metaclass=abc.ABCMeta): @@ -45,9 +47,8 @@ class Provider(metaclass=abc.ABCMeta): self.informer = Informer() # disable logging if requested - logging.basicConfig(level=logging.INFO) if not verbose: - logging.disable() + logger.disable() # Define the operations to run on the pdf. Providers can add others. self.operations = [("crop", self.crop_pdf)] @@ -58,7 +59,7 @@ class Provider(metaclass=abc.ABCMeta): self.operations.append(("blank", blank_pdf)) self.operations.append(("shrink", self.shrink_pdf)) - logging.info("Starting %s" % type(self).__name__) + logger.info("Starting %s" % type(self).__name__) @staticmethod @abc.abstractmethod diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py index 04efcb1..9130e34 100644 --- a/paper2remarkable/providers/_info.py +++ b/paper2remarkable/providers/_info.py @@ -3,12 +3,14 @@ """Functionality for retrieving paper info """ -import logging import titlecase import unidecode import bs4 from ..utils import clean_string, get_page_with_retry +from ..log import Logger + +logger = Logger() class Informer: @@ -38,7 +40,7 @@ class Informer: The provided url must be to a HTMl page where this information can be found, not to the PDF file itself. 
""" - logging.info("Generating output filename") + logger.info("Generating output filename") # Retrieve the paper information self.get_info(abs_url) @@ -59,11 +61,11 @@ class Informer: name = authors + "_-_" + title + "_" + year + ".pdf" name = unidecode.unidecode(name) - logging.info("Created filename: %s" % name) + logger.info("Created filename: %s" % name) return name def get_info(self, url): - logging.info("Getting paper info") + logger.info("Getting paper info") page = get_page_with_retry(url) soup = bs4.BeautifulSoup(page, "html.parser") self.authors = self.authors or self.get_authors(soup) diff --git a/paper2remarkable/providers/acm.py b/paper2remarkable/providers/acm.py index e14efa7..a0d79bd 100644 --- a/paper2remarkable/providers/acm.py +++ b/paper2remarkable/providers/acm.py @@ -15,6 +15,9 @@ from ._base import Provider from ._info import Informer from .. import GITHUB_URL from ..utils import exception, get_page_with_retry +from ..log import Logger + +logger = Logger() class ACMInformer(Informer): @@ -26,7 +29,7 @@ class ACMInformer(Informer): def _format_year(self, soup_date): if not re.match("\d{2}/\d{2}/\d{4}", soup_date.strip()): - self.warn( + logger.warning( "Couldn't extract year from ACM page, please raise an " "issue on GitHub so it can be fixed: %s" % GITHUB_URL ) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index d950e47..e022658 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -11,11 +11,13 @@ Copyright: 2019, G.J.J. 
van den Burg import os import re import subprocess -import logging from ._info import Informer from ._base import Provider from ..utils import exception +from ..log import Logger + +logger = Logger() class ArxivInformer(Informer): @@ -52,7 +54,7 @@ class Arxiv(Provider): def dearxiv(self, input_file): """Remove the arXiv timestamp from a pdf""" - logging.info("Removing arXiv timestamp") + logger.info("Removing arXiv timestamp") basename = os.path.splitext(input_file)[0] uncompress_file = basename + "_uncompress.pdf" diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index f28c742..dfc8646 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -12,7 +12,6 @@ import urllib from ._base import Provider from ._info import Informer - from ..utils import exception class PdfUrlInformer(Informer): diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index 2bed231..d80c954 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -9,7 +9,6 @@ Copyright: 2019, G.J.J. van den Burg """ import PyPDF2 -import logging import requests import string import subprocess @@ -18,6 +17,7 @@ import time import unidecode from . import GITHUB_URL +from .log import Logger HEADERS = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " @@ -26,6 +26,8 @@ HEADERS = { } +logger = Logger() + def exception(msg): print("ERROR: " + msg, file=sys.stderr) print("Error occurred. 
Exiting.", file=sys.stderr) @@ -37,6 +39,7 @@ def exception(msg): raise SystemExit(1) + def clean_string(s): """ Clean a string by replacing accented characters with equivalents and keeping only the allowed characters (ascii letters, digits, underscore, @@ -64,7 +67,7 @@ def assert_file_is_pdf(filename): def download_url(url, filename): """Download the content of an url and save it to a filename """ - logging.info("Downloading file at url: %s" % url) + logger.info("Downloading file at url: %s" % url) content = get_page_with_retry(url) with open(filename, "wb") as fid: fid.write(content) @@ -80,18 +83,18 @@ def get_page_with_retry(url, tries=5): except requests.exceptions.ConnectionError: error = True if error or not res.ok: - logging.warning( + logger.warning( "(%i/%i) Error getting url %s. Retrying in 5 seconds." % (count, tries, url) ) time.sleep(5) continue - logging.info("Downloading url: %s" % url) + logger.info("Downloading url: %s" % url) return res.content def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): - logging.info("Starting upload to reMarkable") + logger.info("Starting upload to reMarkable") # Create the reMarkable dir if it doesn't exist remarkable_dir = remarkable_dir.rstrip("/") @@ -112,4 +115,4 @@ def upload_to_remarkable(filepath, remarkable_dir="/", rmapi_path="rmapi"): ) if not status == 0: exception("Uploading file %s to reMarkable failed" % filepath) - logging.info("Upload successful.") + logger.info("Upload successful.") -- cgit v1.2.3 From 89f3fb37ab5aad7284ca1da29aa610ae196b6fcf Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 00:03:17 +0100 Subject: Improve string cleaning --- paper2remarkable/providers/_info.py | 1 + paper2remarkable/utils.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/paper2remarkable/providers/_info.py b/paper2remarkable/providers/_info.py index 9130e34..0b28658 100644 --- a/paper2remarkable/providers/_info.py +++ 
b/paper2remarkable/providers/_info.py @@ -56,6 +56,7 @@ class Informer: title = clean_string(self.title) title = titlecase.titlecase(title) title = title.replace(" ", "_") + title = clean_string(title) year = str(self.year) diff --git a/paper2remarkable/utils.py b/paper2remarkable/utils.py index d80c954..a313ffe 100644 --- a/paper2remarkable/utils.py +++ b/paper2remarkable/utils.py @@ -28,6 +28,7 @@ HEADERS = { logger = Logger() + def exception(msg): print("ERROR: " + msg, file=sys.stderr) print("Error occurred. Exiting.", file=sys.stderr) @@ -39,14 +40,15 @@ def exception(msg): raise SystemExit(1) - def clean_string(s): """ Clean a string by replacing accented characters with equivalents and keeping only the allowed characters (ascii letters, digits, underscore, - space, and period)""" + space, dash, and period)""" normalized = unidecode.unidecode(s) - allowed = string.ascii_letters + string.digits + "_ ." + allowed = string.ascii_letters + string.digits + "_ .-" cleaned = "".join(c if c in allowed else "_" for c in normalized) + while "__" in cleaned: + cleaned = cleaned.replace("__", "_") return cleaned -- cgit v1.2.3 From f8ccc47b17a19655860fa16149420eb422d71c26 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 00:03:39 +0100 Subject: bugfixes --- paper2remarkable/providers/local.py | 2 +- paper2remarkable/providers/pdf_url.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py index b1201d3..68b88ea 100644 --- a/paper2remarkable/providers/local.py +++ b/paper2remarkable/providers/local.py @@ -25,7 +25,7 @@ class LocalFile(Provider): super().__init__(*args, **kwargs) self.informer = LocalFileInformer() - def get_abs_pdf_url(self, url): + def get_abs_pdf_urls(self, url): # The 'url' is the path to the local file. We use this as abs_url and # pdf_url. 
return url, url diff --git a/paper2remarkable/providers/pdf_url.py b/paper2remarkable/providers/pdf_url.py index dfc8646..d80b1a9 100644 --- a/paper2remarkable/providers/pdf_url.py +++ b/paper2remarkable/providers/pdf_url.py @@ -14,14 +14,13 @@ from ._base import Provider from ._info import Informer from ..utils import exception -class PdfUrlInformer(Informer): +class PdfUrlInformer(Informer): def get_filename(self, abs_url): # if this is called, filename must not be provided exception( - "Filename must be provided with PDFUrlProvider (use --filename)" - ) - + "Filename must be provided with PDFUrlProvider (use --filename)" + ) class PdfUrl(Provider): @@ -29,6 +28,9 @@ class PdfUrl(Provider): super().__init__(*args, **kwargs) self.informer = PdfUrlInformer() + def get_abs_pdf_urls(self, url): + return (None, url) + def validate(src): try: result = urllib.parse.urlparse(src) -- cgit v1.2.3 From 395ab716bb5c8ed74a4f0b447ec8243f64515ea8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 00:03:48 +0100 Subject: makefile changes --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index baccb92..2a656d4 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ install: ## Install for the current user using the default python command test: venv ## Run unit tests - source $(VENV_DIR)/bin/activate && green -v ./tests + source $(VENV_DIR)/bin/activate && green -f -vv -a ./tests clean: ## Clean build dist and egg directories left after install -- cgit v1.2.3 From a405d661552b2e574725fcfb9e75f54d3f3d86ca Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 09:42:43 +0100 Subject: fix typo --- paper2remarkable/providers/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/providers/local.py b/paper2remarkable/providers/local.py index 68b88ea..3f581b2 100644 --- a/paper2remarkable/providers/local.py +++ b/paper2remarkable/providers/local.py @@ -16,7 
+16,7 @@ from ._info import Informer class LocalFileInformer(Informer): - def get_filenames(self, abs_url): + def get_filename(self, abs_url): return os.path.basename(abs_url) -- cgit v1.2.3 From a02c4c27d81df8aa012f923d2b150db37e064c80 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 09:42:55 +0100 Subject: update tempdir prefix --- paper2remarkable/providers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 5432d48..bdc9558 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -87,7 +87,7 @@ class Provider(metaclass=abc.ABCMeta): tmp_filename = "paper.pdf" self.initial_dir = os.getcwd() - with tempfile.TemporaryDirectory(prefix="a2r_") as working_dir: + with tempfile.TemporaryDirectory(prefix="p2r_") as working_dir: os.chdir(working_dir) self.retrieve_pdf(pdf_url, tmp_filename) assert_file_is_pdf(tmp_filename) -- cgit v1.2.3 From a37dd132ba815e8c10cf3e2f4e8a928dae96ae2d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 09:43:50 +0100 Subject: Update gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index c18dd8d..558dbc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,6 @@ __pycache__/ +paper2remarkable.egg-info/ +dist/* +build/* +*.pyc +*/__pycache__/ -- cgit v1.2.3 From 7e544bb68e05cb3c1705c1a50076bdee33d759b2 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 10:05:10 +0100 Subject: Add description string to help --- paper2remarkable/__init__.py | 2 ++ paper2remarkable/ui.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/paper2remarkable/__init__.py b/paper2remarkable/__init__.py index 71c1105..113fc83 100644 --- a/paper2remarkable/__init__.py +++ b/paper2remarkable/__init__.py @@ -1,3 +1,5 @@ # -*- coding: utf-8 -*- +from .__version__ import __version__ + GITHUB_URL = 
"https://github.com/GjjvdBurg/arxiv2remarkable" diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 71fc655..1466ef4 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -10,12 +10,15 @@ Copyright: 2019, G.J.J. van den Burg import argparse +from . import __version__ + from .providers import providers from .utils import exception def parse_args(): parser = argparse.ArgumentParser( + description='Paper2reMarkable version %s' % __version__, formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( -- cgit v1.2.3 From 24707255dad3f065fe484a34c8dc2de5e371e419 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 10:05:22 +0100 Subject: Reorder command line arguments --- paper2remarkable/ui.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index 1466ef4..d51ae0a 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -28,12 +28,9 @@ def parse_args(): action="store_true", ) parser.add_argument( - "-v", "--verbose", help="be verbose", action="store_true" - ) - parser.add_argument( - "-n", - "--no-upload", - help="don't upload to the reMarkable, save the output in current working dir", + "-c", + "--center", + help="Center the PDF on the page, instead of left align", action="store_true", ) parser.add_argument( @@ -43,16 +40,11 @@ def parse_args(): action="store_true", ) parser.add_argument( - "-c", - "--center", - help="Center the PDF on the page, instead of left align", + "-n", + "--no-upload", + help="don't upload to the reMarkable, save the output in current working dir", action="store_true", ) - parser.add_argument( - "--filename", - help="Filename to use for the file on reMarkable", - default=None, - ) parser.add_argument( "-p", "--remarkable-path", @@ -61,15 +53,23 @@ def parse_args(): default="/", ) parser.add_argument( - "--rmapi", help="path to rmapi executable", default="rmapi" + 
"-v", "--verbose", help="be verbose", action="store_true" + ) + parser.add_argument( + "--filename", + help="Filename to use for the file on reMarkable", + default=None, ) + parser.add_argument("--gs", help="path to gs executable", default="gs") parser.add_argument( "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop" ) parser.add_argument( "--pdftk", help="path to pdftk executable", default="pdftk" ) - parser.add_argument("--gs", help="path to gs executable", default="gs") + parser.add_argument( + "--rmapi", help="path to rmapi executable", default="rmapi" + ) parser.add_argument( "input", help="URL to a paper or the path of a local PDF file" ) -- cgit v1.2.3 From 6e544fc055c9fc68857c7321a4f117042fe92565 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 10:15:32 +0100 Subject: Clarify some help text --- paper2remarkable/ui.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/paper2remarkable/ui.py b/paper2remarkable/ui.py index d51ae0a..5323996 100644 --- a/paper2remarkable/ui.py +++ b/paper2remarkable/ui.py @@ -18,8 +18,7 @@ from .utils import exception def parse_args(): parser = argparse.ArgumentParser( - description='Paper2reMarkable version %s' % __version__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter + description="Paper2reMarkable version %s" % __version__ ) parser.add_argument( "-b", @@ -48,7 +47,7 @@ def parse_args(): parser.add_argument( "-p", "--remarkable-path", - help="directory on reMarkable to put the file (created if missing)", + help="directory on reMarkable to put the file (created if missing, default: /)", dest="remarkable_dir", default="/", ) @@ -60,18 +59,27 @@ def parse_args(): help="Filename to use for the file on reMarkable", default=None, ) - parser.add_argument("--gs", help="path to gs executable", default="gs") parser.add_argument( - "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop" + "--gs", help="path to gs executable (default: 
gs)", default="gs" ) parser.add_argument( - "--pdftk", help="path to pdftk executable", default="pdftk" + "--pdfcrop", + help="path to pdfcrop executable (default: pdfcrop)", + default="pdfcrop", ) parser.add_argument( - "--rmapi", help="path to rmapi executable", default="rmapi" + "--pdftk", + help="path to pdftk executable (default: pdftk)", + default="pdftk", ) parser.add_argument( - "input", help="URL to a paper or the path of a local PDF file" + "--rmapi", + help="path to rmapi executable (default: rmapi)", + default="rmapi", + ) + parser.add_argument( + "input", + help="URL to a paper or the path of a local PDF file", ) return parser.parse_args() -- cgit v1.2.3 From 51bce213c917644ff9e512a3f81dd266477c19fe Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 11:09:09 +0100 Subject: Update readme --- README.md | 100 ++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 48 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index a01665c..8295e37 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,21 @@ -# arxiv2remarkable.py +# paper2remarkable -``arxiv2remarkable`` is a command line program to quickly transfer a paper to -your reMarkable. The script can be run as a plain Python script or via Docker +*Note: ``paper2remarkable`` is the new name for the ``arxiv2remarkable`` +script. The name was changed because it better captures what the program +does.* + +``paper2remarkable`` is a command line program for quickly and easily +transferring an academic paper to your reMarkable: + +``` +$ p2r https://arxiv.org/abs/1811.11242 +``` + +The script can be run through the ``p2r`` command line program or via Docker (see below). 
-This script makes it as easy as possible to get a PDF on your reMarkable from -any of the following sources: +paper2remarkable makes it as easy as possible to get a PDF on your reMarkable +from any of the following sources: - an arXiv url (either ``arxiv.org/abs/...`` or ``arxiv.org/pdf/...``) - a PubMed Central url (either to the HTML or the PDF) @@ -16,10 +26,10 @@ any of the following sources: - a url to a PDF file - a local file. -The script takes the source and: +When called, paper2remarkable takes the source and: 1. Downloads the pdf if necessary -2. Removes the arXiv timestamp +2. Removes the arXiv timestamp (for arXiv sources) 3. Crops the pdf to remove unnecessary borders 4. Shrinks the pdf file to reduce the filesize 5. Generates a nice filename based on author/title/year of the paper @@ -37,41 +47,39 @@ Optionally, you can: Here's the full help of the script: ```text -usage: arxiv2remarkable.py [-h] [-b] [-v] [-n] [-d] [-c] [--filename FILENAME] - [-p REMARKABLE_DIR] [--rmapi RMAPI] - [--pdfcrop PDFCROP] [--pdftk PDFTK] [--gs GS] - input +usage: p2r [-h] [-b] [-c] [-d] [-n] [-p REMARKABLE_DIR] [-v] + [--filename FILENAME] [--gs GS] [--pdfcrop PDFCROP] [--pdftk PDFTK] + [--rmapi RMAPI] + input + +Paper2reMarkable version 0.4.0 positional arguments: input URL to a paper or the path of a local PDF file optional arguments: -h, --help show this help message and exit - -b, --blank Add a blank page after every page of the PDF (default: - False) - -v, --verbose be verbose (default: False) - -n, --no-upload don't upload to the reMarkable, save the output in - current working dir (default: False) - -d, --debug debug mode, doesn't upload to reMarkable (default: - False) + -b, --blank Add a blank page after every page of the PDF -c, --center Center the PDF on the page, instead of left align - (default: False) - --filename FILENAME Filename to use for the file on reMarkable (default: - None) + -d, --debug debug mode, doesn't upload to reMarkable + -n,
--no-upload don't upload to the reMarkable, save the output in + current working dir -p REMARKABLE_DIR, --remarkable-path REMARKABLE_DIR directory on reMarkable to put the file (created if - missing) (default: /) - --rmapi RMAPI path to rmapi executable (default: rmapi) + missing, default: /) + -v, --verbose be verbose + --filename FILENAME Filename to use for the file on reMarkable + --gs GS path to gs executable (default: gs) --pdfcrop PDFCROP path to pdfcrop executable (default: pdfcrop) --pdftk PDFTK path to pdftk executable (default: pdftk) - --gs GS path to gs executable (default: gs) + --rmapi RMAPI path to rmapi executable (default: rmapi) ``` And here's an example with verbose mode enabled that shows everything the script does by default: -```bash -$ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 +``` +$ p2r -v https://arxiv.org/abs/1811.11242 2019-05-30 00:38:27 - INFO - Starting ArxivProvider 2019-05-30 00:38:27 - INFO - Getting paper info from arXiv 2019-05-30 00:38:27 - INFO - Downloading url: https://arxiv.org/abs/1811.11242 @@ -86,7 +94,7 @@ $ python arxiv2remarkable.py -v https://arxiv.org/abs/1811.11242 2019-05-30 00:38:42 - INFO - Upload successful. ``` -## Dependencies +## Installation The script requires the following external programs to be available: @@ -96,27 +104,15 @@ The script requires the following external programs to be available: - [GhostScript](https://www.ghostscript.com/) - [rMAPI](https://github.com/juruen/rmapi) -If these scripts are not available on the ``PATH`` variable, you can supply them -with the relevant options to the script. - -The script also needs the following Python packages: +If these scripts are not available on the ``PATH`` variable, you can supply +them with the relevant options to the script. 
Then, you can install +paper2remarkable from PyPI: -- [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/): parsing HTML -- [requests](https://pypi.org/project/requests/): getting HTML -- [PyPDF2](https://github.com/mstamy2/PyPDF2): verifying urls point to PDF -- [titlecase](https://pypi.org/project/titlecase/): fancy titles -- [pdfplumber](https://github.com/jsvine/pdfplumber): used for better page - cropping -- [unidecode](https://pypi.org/project/Unidecode/): clean accented characters - from the filename - -If you use [Poetry](https://poetry.eustace.io/) you can install these -dependencies using ``poetry install`` in the project directory. Alternatively, -you can use ``pip`` with the following command: - -```bash -pip install --user bs4 requests PyPDF2 titlecase pdfplumber unidecode ``` +pip install paper2remarkable +``` + +This installs the ``p2r`` command line program. ## Docker @@ -127,7 +123,7 @@ First clone this repository with `git clone` and `cd` inside of it, then build the container: ```bash -docker build -t arxiv2remarkable . +docker build -t paper2remarkable . ``` ### Authorization @@ -137,7 +133,7 @@ we'll use `rmapi` to create it. ```bash touch ${HOME}/.rmapi -docker run --rm --it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi arxiv2remarkable version +docker run --rm -it -v "${HOME}/.rmapi:/root/.rmapi:rw" --entrypoint=rmapi paper2remarkable version ``` which should end with output like @@ -149,15 +145,15 @@ rmapi version: 0.0.5 ### Usage -Use the container by replacing `python arxiv2remarkable.py` with `docker run ---rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable`, e.g. +Use the container by replacing `p2r` with `docker run --rm -v +"${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable`, e.g.
``` # print help and exit -docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable --help +docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable --help # equivalent to above usage via `python` -docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" arxiv2remarkable -v https://arxiv.org/abs/1811.11242 +docker run --rm -v "${HOME}/.rmapi:/root/.rmapi:rw" paper2remarkable -v https://arxiv.org/abs/1811.11242 ``` # Notes -- cgit v1.2.3 From 61fb99f59aa5456627ac4cb8ec14503862780462 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 11:09:17 +0100 Subject: setup.py formatting --- setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 3d1fbc5..f54170a 100644 --- a/setup.py +++ b/setup.py @@ -24,14 +24,12 @@ REQUIRED = [ "pdfplumber>=0.5", "unidecode>=1.1", "titlecase>=0.12", - "PyPDF2>=1.26" - + "PyPDF2>=1.26", ] docs_require = [] test_require = [] -dev_require = [ - 'green'] +dev_require = ["green"] # What packages are optional? 
EXTRAS = { -- cgit v1.2.3 From a38318b2e0df603b4e46a39781cf73cf6fa9a148 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 11:09:54 +0100 Subject: add changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ac4f357 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,6 @@ +# Changelog + +## Version 0.4.0 + +* Refactor code to make it a real Python package +* Rename to ``paper2remarkable`` -- cgit v1.2.3 From f5c5308083e80b3a717aa904131833fed12e98a8 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 11:10:19 +0100 Subject: add packaging code --- MANIFEST.in | 10 +++ make_release.py | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 250 insertions(+) create mode 100644 MANIFEST.in create mode 100644 make_release.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..021523f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,10 @@ +include setup.py +include README.md +include LICENSE +recursive-include paper2remarkable *.py +recursive-include tests *.py +exclude Makefile +exclude .gitignore +exclude Dockerfile +exclude make_release.py +prune old diff --git a/make_release.py b/make_release.py new file mode 100644 index 0000000..932209a --- /dev/null +++ b/make_release.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Do-nothing script for making a release + +This idea comes from here: +https://blog.danslimmon.com/2019/07/15/do-nothing-scripting-the-key-to-gradual-automation/ + +Author: Gertjan van den Burg +Date: 2019-07-23 + +""" + +import colorama +import os + + +def colored(msg, color=None, style=None): + colors = { + "red": colorama.Fore.RED, + "green": colorama.Fore.GREEN, + "cyan": colorama.Fore.CYAN, + "yellow": colorama.Fore.YELLOW, + "magenta": colorama.Fore.MAGENTA, + None: "", + } + styles = { + "bright": colorama.Style.BRIGHT, + 
"dim": colorama.Style.DIM, + None: "", + } + pre = colors[color] + styles[style] + post = colorama.Style.RESET_ALL + return f"{pre}{msg}{post}" + + +def cprint(msg, color=None, style=None): + print(colored(msg, color=color, style=style)) + + +def wait_for_enter(): + input(colored("\nPress Enter to continue", style="dim")) + print() + + +def get_package_name(): + with open("./setup.py", "r") as fp: + nameline = next( + (l.strip() for l in fp if l.startswith("NAME = ")), None + ) + return nameline.split("=")[-1].strip().strip('"') + + +class Step: + def pre(self, context): + pass + + def post(self, context): + wait_for_enter() + + def run(self, context): + try: + self.pre(context) + self.action(context) + self.post(context) + except KeyboardInterrupt: + cprint("\nInterrupted.", color="red") + raise SystemExit(1) + + def instruct(self, msg): + cprint(msg, color="green") + + def print_run(self, msg): + cprint("Run:", color="cyan", style="bright") + self.print_cmd(msg) + + def print_cmd(self, msg): + cprint("\t" + msg, color="cyan", style="bright") + + def do_cmd(self, cmd): + cprint(f"Going to run: {cmd}", color="magenta", style="bright") + wait_for_enter() + os.system(cmd) + + +class GitToMaster(Step): + def action(self, context): + self.instruct("Make sure you're on master and changes are merged in") + self.print_run("git checkout master") + + +class UpdateChangelog(Step): + def action(self, context): + self.instruct(f"Update change log for version {context['version']}") + self.print_run("vi CHANGELOG.md") + + +class RunTests(Step): + def action(self, context): + self.do_cmd("make test") + + +class BumpVersionPackage(Step): + def action(self, context): + self.instruct(f"Update __version__.py with new version") + self.print_run(f"vi {context['pkgname']}/__version__.py") + + def post(self, context): + wait_for_enter() + context["version"] = self._get_version(context) + + def _get_version(self, context): + # Get the version from the version file + about = {} + with 
open(f"{context['pkgname'].lower()}/__version__.py", "r") as fp: + exec(fp.read(), about) + return about["__version__"] + + +class MakeClean(Step): + def action(self, context): + self.do_cmd("make clean") + + +class MakeDocs(Step): + def action(self, context): + self.do_cmd("make docs") + + +class MakeDist(Step): + def action(self, context): + self.do_cmd("make dist") + + +class PushToTestPyPI(Step): + def action(self, context): + self.do_cmd( + "twine upload --repository-url https://test.pypi.org/legacy/ dist/*" + ) + + +class InstallFromTestPyPI(Step): + def action(self, context): + self.print_run("cd /tmp/") + self.print_cmd("rm -rf ./venv") + self.print_cmd("virtualenv ./venv") + self.print_cmd("cd ./venv") + self.print_cmd("source bin/activate") + self.print_cmd( + "pip install --index-url https://test.pypi.org/simple/ " + + f"--extra-index-url https://pypi.org/simple {context['pkgname']}=={context['version']}" + ) + + +class TestPackage(Step): + def action(self, context): + self.instruct( + f"Ensure that the following command gives version {context['version']}" + ) + self.print_run(f"{context['pkgname']} -h") + + +class DeactivateVenv(Step): + def action(self, context): + self.print_run("deactivate") + self.instruct("Go back to the project directory") + + +class GitTagVersion(Step): + def action(self, context): + self.do_cmd(f"git tag v{context['version']}") + + +class GitAdd(Step): + def action(self, context): + self.instruct("Add everything to git and commit") + self.print_run("git gui") + + +class PushToPyPI(Step): + def action(self, context): + self.do_cmd("twine upload dist/*") + + +class PushToGitHub(Step): + def action(self, context): + self.do_cmd("git push -u --tags origin master") + + +class WaitForTravis(Step): + def action(self, context): + self.instruct( + "Wait for Travis to complete and verify that its successful" + ) + + +class WaitForAppVeyor(Step): + def action(self, context): + self.instruct( + "Wait for AppVeyor to complete and verify that 
its successful" + ) + + +class WaitForRTD(Step): + def action(self, context): + self.instruct( + "Wait for ReadTheDocs to complete and verify that its successful" + ) + + +def main(): + colorama.init() + procedure = [ + GitToMaster(), + GitAdd(), + PushToGitHub(), + BumpVersionPackage(), + UpdateChangelog(), + MakeClean(), + RunTests(), + MakeDist(), + PushToTestPyPI(), + InstallFromTestPyPI(), + TestPackage(), + DeactivateVenv(), + GitAdd(), + PushToPyPI(), + GitTagVersion(), + PushToGitHub(), + ] + context = {} + context["pkgname"] = get_package_name() + for step in procedure: + step.run(context) + cprint("\nDone!", color="yellow", style="bright") + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 754c29c70fdea59b190cd2ff1f2b63e4a0efc9da Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 25 Oct 2019 11:36:23 +0100 Subject: update dockerfile --- Dockerfile | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6578db3..cb7cb19 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,14 +19,6 @@ RUN apt-get update \ pdftk \ texlive-extra-utils # contains pdfcrop -RUN pip install \ - bs4 \ - requests \ - PyPDF2 \ - titlecase \ - pdfplumber \ - unidecode +RUN pip install paper2remarkable -COPY arxiv2remarkable.py ./ - -ENTRYPOINT ["python", "arxiv2remarkable.py"] +ENTRYPOINT ["p2r"] -- cgit v1.2.3