From 1bb2edea5723c8987de60f8783ba645df8e0cfd5 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Thu, 24 Oct 2019 14:22:00 +0100 Subject: Define operations in the init function This gives cleaner code and allows operations to be defined and registered by specific providers, such as the dearxiv functionality. --- paper2remarkable/providers/_base.py | 72 +++++-------------------------------- paper2remarkable/providers/arxiv.py | 51 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 63 deletions(-) diff --git a/paper2remarkable/providers/_base.py b/paper2remarkable/providers/_base.py index 05fc0b7..77413a9 100644 --- a/paper2remarkable/providers/_base.py +++ b/paper2remarkable/providers/_base.py @@ -13,7 +13,6 @@ import abc import bs4 import datetime import os -import re import requests import shutil import string @@ -56,14 +55,20 @@ class Provider(metaclass=abc.ABCMeta): self.verbose = verbose self.upload = upload self.debug = debug - self.center = center - self.blank = blank self.remarkable_dir = remarkable_dir self.rmapi_path = rmapi_path self.pdfcrop_path = pdfcrop_path self.pdftk_path = pdftk_path self.gs_path = gs_path + # Define the operations to run on the pdf. 
Providers can add others + self.operations = [("crop", self.crop_pdf)] + if center: + self.operations.append(("center", self.center_pdf)) + if blank: + self.operations.append(("blank", self.blank_pdf)) + self.operations.append(("shrink", self.shrink_pdf)) + self.log("Starting %s" % type(self).__name__) def log(self, msg, mode="info"): @@ -167,9 +172,6 @@ class Provider(metaclass=abc.ABCMeta): return name def blank_pdf(self, filepath): - if not self.blank: - return filepath - self.log("Adding blank pages") input_pdf = PyPDF2.PdfFileReader(filepath) output_pdf = PyPDF2.PdfFileWriter() @@ -201,9 +203,6 @@ class Provider(metaclass=abc.ABCMeta): return cropped_file def center_pdf(self, filepath): - if not self.center: - return filepath - self.log("Centering pdf file") centered_file = os.path.splitext(filepath)[0] + "-center.pdf" cropper = Cropper( @@ -295,52 +294,6 @@ class Provider(metaclass=abc.ABCMeta): exception("Uploading file %s to reMarkable failed" % filepath) self.log("Upload successful.") - def dearxiv(self, input_file): - """Remove the arXiv timestamp from a pdf""" - self.log("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" - - status = subprocess.call( - [ - self.pdftk_path, - input_file, - "output", - uncompress_file, - "uncompress", - ] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) - - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) - - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [self.pdftk_path, removed_file, "output", 
output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - - return output_file - def run(self, src, filename=None): info = self.get_paper_info(src) clean_filename = self.create_filename(info, filename) @@ -352,15 +305,8 @@ class Provider(metaclass=abc.ABCMeta): self.retrieve_pdf(src, tmp_filename) self.check_file_is_pdf(tmp_filename) - ops = [ - self.dearxiv, - self.crop_pdf, - self.center_pdf, - self.blank_pdf, - self.shrink_pdf, - ] intermediate_fname = tmp_filename - for op in ops: + for _name, op in self.operations: intermediate_fname = op(intermediate_fname) shutil.move(intermediate_fname, clean_filename) diff --git a/paper2remarkable/providers/arxiv.py b/paper2remarkable/providers/arxiv.py index fc5c004..b1982f4 100644 --- a/paper2remarkable/providers/arxiv.py +++ b/paper2remarkable/providers/arxiv.py @@ -8,7 +8,9 @@ Copyright: 2019, G.J.J. van den Burg """ +import os import re +import subprocess from ._base import Provider from ..utils import exception @@ -22,6 +24,9 @@ class Arxiv(Provider): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # register the dearxiv operation + self.operations.insert(0, ("dearxiv", self.dearxiv)) + def get_abs_pdf_urls(self, url): """Get the pdf and abs url from any given arXiv url """ if re.match(self.re_abs, url): @@ -37,3 +42,49 @@ class Arxiv(Provider): def validate(src): """Check if the url is to an arXiv page. 
""" return re.match(Arxiv.re_abs, src) or re.match(Arxiv.re_pdf, src) + + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" + + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] + ) + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d+\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d+\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) + + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file -- cgit v1.2.3