Diffstat (limited to 'arxiv2remarkable.py')
| -rwxr-xr-x | arxiv2remarkable.py | 852 |
1 file changed, 466 insertions, 386 deletions
diff --git a/arxiv2remarkable.py b/arxiv2remarkable.py index 61a3667..08beaca 100755 --- a/arxiv2remarkable.py +++ b/arxiv2remarkable.py @@ -1,16 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" -Given an arXiv paper url this script: +__version__ = "0.2.0" +__author__ = "G.J.J. van den Burg" -1. Downloads the paper -2. Strips the timestamp -3. Crops the pdf to remove unnecessary borders -4. Shrinks the pdf to reduce the filesize -5. Renames it using the format: - '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf -6. Uploads it to the reMarkable using rMapi. +""" +Download a paper from various sources and send it to the reMarkable. Author: G.J.J. van den Burg Date: 2019-02-02 @@ -19,8 +14,10 @@ License: MIT """ import PyPDF2 +import abc import argparse import bs4 +import datetime import os import re import requests @@ -32,8 +29,6 @@ import time import titlecase import urllib.parse -from loguru import logger - GITHUB_URL = "https://github.com/GjjvdBurg/arxiv2remarkable" HEADERS = { @@ -43,334 +38,468 @@ HEADERS = { } -def exception(msg): - print("ERROR: " + msg, file=sys.stderr) - print("Error occurred. Exiting.", file=sys.stderr) - raise SystemExit(1) - - -def arxiv_url(url): - """Check if the url is to an arXiv page. - - >>> validate_url("https://arxiv.org/abs/1811.11242") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("http://arxiv.org/abs/1811.11242") - True - >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf") - True - >>> validate_url("https://arxiv.org/abs/1811.11242v1") - True - >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf") - True - >>> validate_url("https://gertjanvandenburg.com") - False - """ - m = re.match( - "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url - ) - return not m is None - +class Provider(metaclass=abc.ABCMeta): + """ ABC for providers of pdf sources """ -def pmc_url(url): - m = re.fullmatch( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", url - ) - return not m is None - - -def acm_url(url): - m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", url) - return not m is None - - -def valid_url(url): - try: - result = urllib.parse.urlparse(url) - return all([result.scheme, result.netloc, result.path]) - except: - return False + def __init__( + self, + verbose=False, + upload=True, + debug=False, + remarkable_dir="/", + rmapi_path="rmapi", + pdfcrop_path="pdfcrop", + pdftk_path="pdftk", + gs_path="gs", + ): + self.verbose = verbose + self.upload = upload + self.debug = debug + self.remarkable_dir = remarkable_dir + self.rmapi_path = rmapi_path + self.pdfcrop_path = pdfcrop_path + self.pdftk_path = pdftk_path + self.gs_path = gs_path + + self.log("Starting %s" % type(self).__name__) + + def log(self, msg, mode="info"): + if not self.verbose: + return + if not mode in ["info", "warning"]: + raise ValueError("unknown logging mode.") + now = datetime.datetime.now() + print( + now.strftime("%Y-%m-%d %H:%M:%S") + + " - " + + mode.upper() + + " - " + + msg + ) + def warn(self, msg): + self.log(msg, mode="warning") + + @staticmethod + @abc.abstractmethod + def validate(src): + """ Validate whether ``src`` is appropriate for this provider """ + + @abc.abstractmethod + def retrieve_pdf(self, src, filename): + """ Download pdf from src and save to filename """ + + @abc.abstractmethod + def get_paper_info(self, src): + """ Retrieve the title/author (surnames)/year information """ + + def create_filename(self, info, filename=None): + """ Generate filename using the 
info dict or filename if provided """ + if not filename is None: + return filename + # we assume that the list of authors is surname only. + self.log("Generating output filename") + if len(info["authors"]) > 3: + author_part = info["authors"][0] + "_et_al" + else: + author_part = "_".join(info["authors"]) + author_part = author_part.replace(" ", "_") + title = info["title"].replace(",", "").replace(":", "") + title_part = titlecase.titlecase(title).replace(" ", "_") + year_part = info["date"].split("/")[0] + name = author_part + "_-_" + title_part + "_" + year_part + ".pdf" + self.log("Created filename: %s" % name) + return name + + def crop_pdf(self, filepath): + self.log("Cropping pdf file") + status = subprocess.call( + [self.pdfcrop_path, "--margins", "15 40 15 15", filepath], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + self.warn("Failed to crop the pdf file at: %s" % filepath) + return filepath + cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" + if not os.path.exists(cropped_file): + self.warn( + "Can't find cropped file '%s' where expected." % cropped_file + ) + return filepath + return cropped_file -def check_file_is_pdf(filename): - try: - PyPDF2.PdfFileReader(open(filename, "rb")) - return True - except PyPDF2.utils.PdfReadError: - return False + def shrink_pdf(self, filepath): + self.log("Shrinking pdf file") + output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" + status = subprocess.call( + [ + self.gs_path, + "-sDEVICE=pdfwrite", + "-dCompatibilityLevel=1.4", + "-dPDFSETTINGS=/printer", + "-dNOPAUSE", + "-dBATCH", + "-dQUIET", + "-sOutputFile=%s" % output_file, + filepath, + ] + ) + if not status == 0: + self.warn("Failed to shrink the pdf file") + return filepath + return output_file + def check_file_is_pdf(self, filename): + try: + fp = open(filename, "rb") + pdf = PyPDF2.PdfFileReader(fp, strict=False) + fp.close() + del pdf + return True + except PyPDF2.utils.PdfReadError: + exception("Downloaded file isn't a valid pdf file.") + + def download_url(self, url, filename): + """Download the content of an url and save it to a filename """ + self.log("Downloading file at url: %s" % url) + content = self.get_page_with_retry(url) + with open(filename, "wb") as fid: + fid.write(content) + + def get_page_with_retry(self, url, tries=5): + count = 0 + while count < tries: + count += 1 + error = False + try: + res = requests.get(url, headers=HEADERS) + except requests.exceptions.ConnectionError: + error = True + if error or not res.ok: + time.sleep(5) + self.warn("Error getting url %s. 
Retrying in 5 seconds" % url) + continue + self.log("Downloading url: %s" % url) + return res.content -def get_arxiv_urls(url): - """Get the pdf and abs url from any given arXiv url """ - if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): - abs_url = url - pdf_url = url.replace("abs", "pdf") + ".pdf" - elif re.match("https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url): - abs_url = url[:-4].replace("pdf", "abs") - pdf_url = url - else: - exception("Couldn't figure out arXiv urls.") - return pdf_url, abs_url + def upload_to_rm(self, filepath): + remarkable_dir = self.remarkable_dir.rstrip("/") + self.log("Starting upload to reMarkable") + if remarkable_dir: + status = subprocess.call( + [self.rmapi_path, "mkdir", remarkable_dir], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception( + "Creating directory %s on reMarkable failed" + % remarkable_dir + ) + status = subprocess.call( + [self.rmapi_path, "put", filepath, remarkable_dir + "/"], + stdout=subprocess.DEVNULL, + ) + if not status == 0: + exception("Uploading file %s to reMarkable failed" % filepath) + self.log("Upload successful.") + def dearxiv(self, input_file): + """Remove the arXiv timestamp from a pdf""" + self.log("Removing arXiv timestamp") + basename = os.path.splitext(input_file)[0] + uncompress_file = basename + "_uncompress.pdf" -def get_pmc_urls(url): - """Get the pdf and html url from a given PMC url """ - if re.match( - "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", - url, - ): - idx = url.index("pdf") - abs_url = url[: idx - 1] - pdf_url = url - elif re.match("https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url): - abs_url = url - pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually - else: - exception("Couldn't figure out PMC urls.") - return pdf_url, abs_url - - -def get_acm_pdf_url(url): - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - thea = None - for a in soup.find_all("a"): - if a.get("name") == "FullTextPDF": - thea = a - break - if thea is None: - return None - href = thea.get("href") - if href.startswith("http"): - return href - else: - return "https://dl.acm.org/" + href - - -def get_acm_urls(url): - if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): - abs_url = url - pdf_url = get_acm_pdf_url(url) - if pdf_url is None: - exception("Couldn't extract PDF url from ACM citation page.") - else: - exception( - "Couldn't figure out ACM urls, please provide a URL of the " - "format: http(s)://dl.acm.org/citation.cfm?id=..." + status = subprocess.call( + [ + self.pdftk_path, + input_file, + "output", + uncompress_file, + "uncompress", + ] ) - return pdf_url, abs_url + if not status == 0: + exception("pdftk failed to uncompress the pdf.") + + with open(uncompress_file, "rb") as fid: + data = fid.read() + # Remove the text element + data = re.sub( + b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", + b"()Tj", + data, + ) + # Remove the URL element + data = re.sub( + b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", + b"", + data, + ) + removed_file = basename + "_removed.pdf" + with open(removed_file, "wb") as oid: + oid.write(data) -def get_page_with_retry(url): - """Get the content of an url, retrying up to five times on failure. 
""" + output_file = basename + "_dearxiv.pdf" + status = subprocess.call( + [self.pdftk_path, removed_file, "output", output_file, "compress"] + ) + if not status == 0: + exception("pdftk failed to compress the pdf.") + + return output_file + + def run(self, src, filename=None): + info = self.get_paper_info(src) + clean_filename = self.create_filename(info, filename) + tmp_filename = "paper.pdf" + + self.initial_dir = os.getcwd() + with tempfile.TemporaryDirectory() as working_dir: + os.chdir(working_dir) + self.retrieve_pdf(src, tmp_filename) + self.check_file_is_pdf(tmp_filename) + + ops = [self.dearxiv, self.crop_pdf, self.shrink_pdf] + intermediate_fname = tmp_filename + for op in ops: + intermediate_fname = op(intermediate_fname) + shutil.move(intermediate_fname, clean_filename) + + if self.debug: + print("Paused in debug mode in dir: %s" % working_dir) + print("Press enter to exit.") + return input() + + if self.upload: + return self.upload_to_rm(clean_filename) + + target_path = os.path.join(self.initial_dir, clean_filename) + while os.path.exists(target_path): + base = os.path.splitext(target_path)[0] + target_path = base + "_.pdf" + shutil.move(clean_filename, target_path) + return target_path + + +class ArxivProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and abs url from any given arXiv url """ + if re.match("https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url): + abs_url = url + pdf_url = url.replace("abs", "pdf") + ".pdf" + elif re.match( + "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url + ): + abs_url = url[:-4].replace("pdf", "abs") + pdf_url = url + else: + exception("Couldn't figure out arXiv urls.") + return abs_url, pdf_url - def retry(url, count): - if count < 5: - logger.info( - "Caught error for url %s. Retrying in 5 seconds." % url - ) - time.sleep(5) + def validate(src): + """Check if the url is to an arXiv page. 
""" + m = re.match( + "https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + """ Download the file and save as filename """ + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + abs_url, _ = self.get_abs_pdf_urls(src) + self.log("Getting paper info from arXiv") + page = self.get_page_with_retry(abs_url) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_author"}) + ] + authors = [x.split(",")[0].strip() for x in authors] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + return dict(title=title, date=date, authors=authors) + + +class PMCProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_abs_pdf_urls(self, url): + """Get the pdf and html url from a given PMC url """ + if re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf", + url, + ): + idx = url.index("pdf") + abs_url = url[: idx - 1] + pdf_url = url + elif re.match( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?", url + ): + abs_url = url + pdf_url = url.rstrip("/") + "/pdf" # it redirects, usually else: - exception("Failed to download url: %s" % url) + exception("Couldn't figure out PMC urls.") + return abs_url, pdf_url - count = 0 - while True: - count += 1 - try: - res = requests.get(url, headers=HEADERS) - except requests.exceptions.ConnectionError: - retry(url, count) - continue - if res.ok: - logger.info("Downloading url: %s" % url) - return res.content + def validate(src): + m = re.fullmatch( + "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+.*", src + ) + return not m is None + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + self.log("Getting paper info from PMC") + page = self.get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess at best. I'm open to + # more advanced approaches. 
+ authors = [ + x.strip().split(" ")[-1].strip() for x in authors[0].split(",") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if re.match("\w+\ \d{4}", date): + date = date.split(" ")[-1] + else: + date = date.replace(" ", "_") + return dict(title=title, date=date, authors=authors) + + +class ACMProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_acm_pdf_url(self, url): + page = self.get_page_with_retry(url) + soup = bs4.BeautifulSoup(page, "html.parser") + thea = None + for a in soup.find_all("a"): + if a.get("name") == "FullTextPDF": + thea = a + break + if thea is None: + return None + href = thea.get("href") + if href.startswith("http"): + return href + else: + return "https://dl.acm.org/" + href + + def get_abs_pdf_urls(self, url): + if re.match("https?://dl.acm.org/citation.cfm\?id=\d+", url): + abs_url = url + pdf_url = self.get_acm_pdf_url(url) + if pdf_url is None: + exception( + "Couldn't extract PDF url from ACM citation page. Maybe it's behind a paywall?" + ) else: - retry(url, count) + exception( + "Couldn't figure out ACM urls, please provide a URL of the " + "format: http(s)://dl.acm.org/citation.cfm?id=..." + ) + return abs_url, pdf_url + + def retrieve_pdf(self, src, filename): + _, pdf_url = self.get_abs_pdf_urls(src) + self.download_url(pdf_url, filename) + + def validate(src): + m = re.fullmatch("https?://dl.acm.org/citation.cfm\?id=\d+", src) + return not m is None + + def get_paper_info(self, src): + """ Extract the paper's authors, title, and publication year """ + self.log("Getting paper info from ACM") + page = self.get_page_with_retry(src) + soup = bs4.BeautifulSoup(page, "html.parser") + authors = [ + x["content"] + for x in soup.find_all("meta", {"name": "citation_authors"}) + ] + # We only use last names, and this method is a guess. I'm open to more + # advanced approaches. 
+ authors = [ + x.strip().split(",")[0].strip() for x in authors[0].split(";") + ] + title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] + date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] + if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): + self.warn( + "Couldn't extract year from ACM page, please raise an " + "issue on GitHub so I can fix it: %s" % GITHUB_URL + ) + date = date.strip().split("/")[-1] + return dict(title=title, date=date, authors=authors) -def download_url(url, filename): - """Download the content of an url and save it to a filename """ - logger.info("Downloading file at url: %s" % url) - content = get_page_with_retry(url) - with open(filename, "wb") as fid: - fid.write(content) +class LocalFileProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def validate(src): + return os.path.exists(src) -def dearxiv(input_file, pdftk_path="pdftk"): - """Remove the arXiv timestamp from a pdf""" - logger.info("Removing arXiv timestamp") - basename = os.path.splitext(input_file)[0] - uncompress_file = basename + "_uncompress.pdf" + def retrieve_pdf(self, src, filename): + source = os.path.join(self.initial_dir, src) + shutil.copy(source, filename) - status = subprocess.call( - [pdftk_path, input_file, "output", uncompress_file, "uncompress"] - ) - if not status == 0: - exception("pdftk failed to uncompress the pdf.") - - with open(uncompress_file, "rb") as fid: - data = fid.read() - # Remove the text element - data = re.sub( - b"\(arXiv:\d{4}\.\d{4,5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj", - b"()Tj", - data, - ) - # Remove the URL element - data = re.sub( - b"<<\\n\/URI \(http://arxiv\.org/abs/\d{4}\.\d{4,5}v\d\)\\n\/S /URI\\n>>\\n", - b"", - data, - ) + def get_paper_info(self, src): + return {"filename": src} - removed_file = basename + "_removed.pdf" - with open(removed_file, "wb") as oid: - oid.write(data) + def create_filename(self, info, filename=None): + if not filename is None: + return filename + return os.path.basename(info["filename"]) - output_file = basename + "_dearxiv.pdf" - status = subprocess.call( - [pdftk_path, removed_file, "output", output_file, "compress"] - ) - if not status == 0: - exception("pdftk failed to compress the pdf.") - return output_file +class PdfUrlProvider(Provider): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def validate(src): + try: + result = urllib.parse.urlparse(src) + return all([result.scheme, result.netloc, result.path]) + except: + return False -def crop_pdf(filepath, pdfcrop_path="pdfcrop"): - logger.info("Cropping pdf file") - status = subprocess.call( - [pdfcrop_path, "--margins", "15 40 15 15", filepath], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - logger.warning("Failed to crop the pdf file at: %s" % filepath) - return filepath - cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf" - if not os.path.exists(cropped_file): - logger.warning( - "Can't find cropped file '%s' where expected." 
% cropped_file - ) - return filepath - return cropped_file - - -def shrink_pdf(filepath, gs_path="gs"): - logger.info("Shrinking pdf file") - output_file = os.path.splitext(filepath)[0] + "-shrink.pdf" - status = subprocess.call( - [ - "gs", - "-sDEVICE=pdfwrite", - "-dCompatibilityLevel=1.4", - "-dPDFSETTINGS=/printer", - "-dNOPAUSE", - "-dBATCH", - "-dQUIET", - "-sOutputFile=%s" % output_file, - filepath, - ] - ) - if not status == 0: - logger.warning("Failed to shrink the pdf file") - return filepath - return output_file - - -def get_paper_info_arxiv(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from arXiv") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_author"}) - ] - authors = [x.split(",")[0].strip() for x in authors] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_pmc(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from PMC") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess at best. I'm open to - # more advanced approaches. - authors = [x.strip().split(" ")[-1].strip() for x in authors[0].split(",")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if re.match("\w+\ \d{4}", date): - date = date.split(" ")[-1] - else: - date = date.replace(" ", "_") - return dict(title=title, date=date, authors=authors) - - -def get_paper_info_acm(url): - """ Extract the paper's authors, title, and publication year """ - logger.info("Getting paper info from ACM") - page = get_page_with_retry(url) - soup = bs4.BeautifulSoup(page, "html.parser") - authors = [ - x["content"] - for x in soup.find_all("meta", {"name": "citation_authors"}) - ] - # We only use last names, and this method is a guess. I'm open to more - # advanced approaches. - authors = [x.strip().split(",")[0].strip() for x in authors[0].split(";")] - title = soup.find_all("meta", {"name": "citation_title"})[0]["content"] - date = soup.find_all("meta", {"name": "citation_date"})[0]["content"] - if not re.match("\d{2}/\d{2}/\d{4}", date.strip()): - logger.warning( - "Couldn't extract year from ACM page, please raise an " - "issue on GitHub so I can fix it: %s", - GITHUB_URL, - ) - date = date.strip().split("/")[-1] - return dict(title=title, date=date, authors=authors) - - -def generate_filename(info): - """ Generate a nice filename for a paper given the info dict """ - # we assume that the list of authors is lastname only. 
- logger.info("Generating output filename") - if len(info["authors"]) > 3: - author_part = info["authors"][0] + "_et_al" - else: - author_part = "_".join(info["authors"]) - author_part = author_part.replace(" ", "_") - title = info["title"].replace(",", "").replace(":", "").replace(" ", "_") - title_part = titlecase.titlecase(title) - year_part = info["date"].split("/")[0] - return author_part + "_-_" + title_part + "_" + year_part + ".pdf" - - -def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"): - remarkable_dir = remarkable_dir.rstrip("/") - logger.info("Starting upload to reMarkable") - if remarkable_dir: - status = subprocess.call( - [rmapi_path, "mkdir", remarkable_dir], stdout=subprocess.DEVNULL - ) - if not status == 0: + def retrieve_pdf(self, url, filename): + self.download_url(url, filename) + + def get_paper_info(self, src): + return None + + def create_filename(self, info, filename=None): + if filename is None: exception( - "Creating directory %s on reMarkable failed" % remarkable_dir + "Filename must be provided with PDFUrlProvider (use --filename)" ) - status = subprocess.call( - [rmapi_path, "put", filepath, remarkable_dir + "/"], - stdout=subprocess.DEVNULL, - ) - if not status == 0: - exception("Uploading file %s to reMarkable failed" % filepath) - logger.info("Upload successful.") + return filename + + +def exception(msg): + print("ERROR: " + msg, file=sys.stderr) + print("Error occurred. Exiting.", file=sys.stderr) + raise SystemExit(1) def parse_args(): @@ -420,82 +549,33 @@ def parse_args(): return parser.parse_args() -@logger.catch def main(): args = parse_args() - if os.path.exists(args.input): - mode = "local_file" - elif arxiv_url(args.input): - mode = "arxiv_url" - elif pmc_url(args.input): - mode = "pmc_url" - elif acm_url(args.input): - mode = "acm_url" - elif valid_url(args.input): - if args.filename is None: - exception( - "Filename must be provided with pdf url (use --filename)" - ) - mode = "pdf_url" - else: - exception("Input not a valid url, arxiv url, or existing file.") - - if not args.verbose: - logger.remove(0) - - start_wd = os.getcwd() - - with tempfile.TemporaryDirectory() as working_dir: - if mode == "local_file": - shutil.copy(args.input, working_dir) - filename = os.path.basename(args.input) - clean_filename = args.filename if args.filename else filename - - os.chdir(working_dir) - if mode in ["arxiv_url", "pmc_url", "acm_url", "pdf_url"]: - filename = "paper.pdf" - if mode == "arxiv_url": - pdf_url, abs_url = get_arxiv_urls(args.input) - paper_info = get_paper_info_arxiv(abs_url) - elif mode == "pmc_url": - pdf_url, abs_url = get_pmc_urls(args.input) - paper_info = get_paper_info_pmc(abs_url) - elif mode == "acm_url": - pdf_url, abs_url = get_acm_urls(args.input) - paper_info = get_paper_info_acm(abs_url) - else: - pdf_url = args.input - download_url(pdf_url, filename) - if not check_file_is_pdf(filename): - exception("Downloaded file isn't a valid pdf file.") - if args.filename: - clean_filename = args.filename - else: - clean_filename = generate_filename(paper_info) - - dearxived = dearxiv(filename, pdftk_path=args.pdftk) - cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop) - shrinked = shrink_pdf(cropped) - shutil.move(shrinked, clean_filename) - - if args.debug: - print("Paused in debug mode in dir: %s" % working_dir) - print("Press enter to exit.") - return input() - - if args.no_upload: - if os.path.exists(os.path.join(start_wd, clean_filename)): - tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf" - 
shutil.move(clean_filename, os.path.join(start_wd, tmpfname)) - else: - shutil.move(clean_filename, start_wd) - else: - upload_to_rm( - clean_filename, - remarkable_dir=args.remarkable_dir, - rmapi_path=args.rmapi, - ) + providers = [ + ArxivProvider, + PMCProvider, + ACMProvider, + LocalFileProvider, + PdfUrlProvider, + ] + + provider = next((p for p in providers if p.validate(args.input)), None) + if provider is None: + exception("Input not valid, no provider can handle this source.") + + prov = provider( + args.verbose, + not args.no_upload, + args.debug, + args.remarkable_dir, + args.rmapi, + args.pdfcrop, + args.pdftk, + args.gs, + ) + + prov.run(args.input, filename=args.filename) if __name__ == "__main__":
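
The diff replaces the old per-source functions with a Provider base class whose subclasses each expose validate(), retrieve_pdf(), and get_paper_info(), and main() now picks the first provider class whose validate() accepts the input. A minimal sketch of how a new source could be plugged into that design; the OpenReviewProvider name and its URL pattern are hypothetical, only the interface and the download_url() helper come from the code above, and the import assumes arxiv2remarkable.py is importable as a module:

    import re

    from arxiv2remarkable import Provider


    class OpenReviewProvider(Provider):
        @staticmethod
        def validate(src):
            # main() calls validate() on each provider class to pick the handler
            return re.match(r"https?://openreview\.net/(forum|pdf)\?id=\S+", src) is not None

        def retrieve_pdf(self, src, filename):
            # hypothetical: assume the forum url maps directly to a pdf url
            pdf_url = src.replace("/forum", "/pdf")
            self.download_url(pdf_url, filename)

        def get_paper_info(self, src):
            # hypothetical placeholder; a real provider would scrape the page
            # for citation_title / citation_author / citation_date metadata
            return dict(title="Unknown", date="2019", authors=["Unknown"])

Registering such a provider would then only require adding the class to the providers list in main().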
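
The refactored classes can also be driven without the command line. A small usage sketch, assuming the script is importable as a module and that the external tools it shells out to (pdftk, pdfcrop, gs, and rmapi when uploading) are on the PATH; the keyword arguments mirror the Provider constructor shown above:

    from arxiv2remarkable import ArxivProvider

    prov = ArxivProvider(
        verbose=True,        # timestamped log lines from Provider.log()
        upload=False,        # skip rmapi; run() then returns a local output path
        debug=False,
        remarkable_dir="/",
        rmapi_path="rmapi",
        pdfcrop_path="pdfcrop",
        pdftk_path="pdftk",
        gs_path="gs",
    )

    # Downloads the pdf, strips the arXiv stamp, crops, shrinks, and saves it
    # under an "Author_-_Title_Year.pdf" style name in the starting directory.
    print(prov.run("https://arxiv.org/abs/1811.11242"))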