#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Given an arXiv paper url this script:

1. Downloads the paper
2. Strips the timestamp
3. Crops the pdf to remove unnecessary borders
4. Shrinks the pdf to reduce the filesize
5. Renames it using the format:
    '_'.join(author_lastnames) + '_-_' + title + '_' + year.pdf
6. Uploads it to the reMarkable using rMapi.

Author: G.J.J. van den Burg
Date: 2019-02-02
License: MIT
"""

import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time

import bs4
import requests

from loguru import logger

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
    "Safari/537.36"
}


def exception(msg):
    print("ERROR: " + msg, file=sys.stderr)
    print("Error occurred. Exiting.", file=sys.stderr)
    raise SystemExit(1)


def validate_url(url):
    """Check if the url is to an arXiv page.

    >>> validate_url("https://arxiv.org/abs/1811.11242")
    True
    >>> validate_url("https://arxiv.org/pdf/1811.11242.pdf")
    True
    >>> validate_url("http://arxiv.org/abs/1811.11242")
    True
    >>> validate_url("http://arxiv.org/pdf/1811.11242.pdf")
    True
    >>> validate_url("https://arxiv.org/abs/1811.11242v1")
    True
    >>> validate_url("https://arxiv.org/pdf/1811.11242v1.pdf")
    True
    >>> validate_url("https://gertjanvandenburg.com")
    False
    """
    m = re.match(
        r"https?://arxiv.org/(abs|pdf)/\d{4}\.\d{4,5}(v\d+)?(\.pdf)?", url
    )
    return m is not None


def get_urls(url):
    """Get the pdf and abs urls from any given arXiv url"""
    if re.match(r"https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?", url):
        abs_url = url
        pdf_url = url.replace("abs", "pdf") + ".pdf"
    elif re.match(r"https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf", url):
        abs_url = url[:-4].replace("pdf", "abs")
        pdf_url = url
    else:
        exception("Couldn't figure out arXiv urls.")
    return pdf_url, abs_url


def get_page_with_retry(url):
    """Get the content of an url, retrying up to five times on failure."""
    count = 0
    while True:
        res = requests.get(url, headers=HEADERS)
        if res.ok:
            logger.info("Downloading url: %s" % url)
            return res.content
        # Increment the failure counter, otherwise the loop never terminates
        count += 1
        if count < 5:
            logger.info(
                "Caught error for url %s. Retrying in 5 seconds." % url
            )
            time.sleep(5)
        else:
            exception("Failed to download url: %s" % url)
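
# A quick sanity check for the url helpers above (a sketch; get_urls is pure
# string manipulation, so no network access is needed):
#
#   >>> get_urls("https://arxiv.org/abs/1811.11242")
#   ('https://arxiv.org/pdf/1811.11242.pdf', 'https://arxiv.org/abs/1811.11242')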

def download_url(url, filename):
    """Download the content of an url and save it to a filename"""
    logger.info("Downloading file at url: %s" % url)
    content = get_page_with_retry(url)
    with open(filename, "wb") as fid:
        fid.write(content)


def dearxiv(input_file, pdftk_path="pdftk"):
    """Remove the arXiv timestamp from a pdf"""
    logger.info("Removing arXiv timestamp")
    basename = os.path.splitext(input_file)[0]

    uncompress_file = basename + "_uncompress.pdf"
    status = subprocess.call(
        [pdftk_path, input_file, "output", uncompress_file, "uncompress"]
    )
    if status != 0:
        exception("pdftk failed to uncompress the pdf.")

    with open(uncompress_file, "rb") as fid:
        data = fid.read()

    # Remove the text element
    data = re.sub(
        rb"\(arXiv:\d{4}\.\d{5}v\d\s+\[\w+\.\w+\]\s+\d{1,2}\s\w{3}\s\d{4}\)Tj",
        b"()Tj",
        data,
    )

    # Remove the URL element
    data = re.sub(
        rb"<<\n/URI \(http://arxiv\.org/abs/\d{4}\.\d{5}v\d\)\n/S /URI\n>>\n",
        b"",
        data,
    )

    removed_file = basename + "_removed.pdf"
    with open(removed_file, "wb") as oid:
        oid.write(data)

    output_file = basename + "_dearxiv.pdf"
    status = subprocess.call(
        [pdftk_path, removed_file, "output", output_file, "compress"]
    )
    if status != 0:
        exception("pdftk failed to compress the pdf.")

    return output_file


def crop_pdf(filepath, pdfcrop_path="pdfcrop"):
    """Crop the pdf, leaving a larger top margin for the reMarkable toolbar"""
    logger.info("Cropping pdf file")
    status = subprocess.call(
        [pdfcrop_path, "--margins", "15 40 15 15", filepath],
        stdout=subprocess.DEVNULL,
    )
    if status != 0:
        logger.warning("Failed to crop the pdf file at: %s" % filepath)
        return filepath
    cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
    if not os.path.exists(cropped_file):
        logger.warning(
            "Can't find cropped file '%s' where expected." % cropped_file
        )
        return filepath
    return cropped_file


def shrink_pdf(filepath, gs_path="gs"):
    """Shrink the pdf file size using Ghostscript"""
    logger.info("Shrinking pdf file")
    output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
    status = subprocess.call(
        [
            gs_path,
            "-sDEVICE=pdfwrite",
            "-dCompatibilityLevel=1.4",
            "-dPDFSETTINGS=/printer",
            "-dNOPAUSE",
            "-dBATCH",
            "-dQUIET",
            "-sOutputFile=%s" % output_file,
            filepath,
        ]
    )
    if status != 0:
        logger.warning("Failed to shrink the pdf file")
        return filepath
    return output_file


def get_paper_info(url):
    """Extract the paper metadata from the arXiv abstract page"""
    logger.info("Getting paper info from arXiv")
    page = get_page_with_retry(url)
    soup = bs4.BeautifulSoup(page, "html.parser")
    authors = [
        x["content"]
        for x in soup.find_all("meta", {"name": "citation_author"})
    ]
    title = soup.find_all("meta", {"name": "citation_title"})[0]["content"]
    date = soup.find_all("meta", {"name": "citation_date"})[0]["content"]
    return dict(title=title, date=date, authors=authors)


def generate_filename(info):
    """Generate a filename of the form Lastnames_-_Title_Year.pdf"""
    logger.info("Generating output filename")
    if len(info["authors"]) > 3:
        author_part = info["authors"][0].split(",")[0] + "_et_al"
    else:
        author_part = "_".join([x.split(",")[0] for x in info["authors"]])
    author_part = author_part.replace(" ", "_")
    title_part = info["title"].replace(",", "").replace(" ", "_")
    year_part = info["date"].split("/")[0]
    return author_part + "_-_" + title_part + "_" + year_part + ".pdf"


def upload_to_rm(filepath, remarkable_dir="/", rmapi_path="rmapi"):
    """Upload a file to the reMarkable using rmapi"""
    logger.info("Starting upload to reMarkable")
    status = subprocess.call(
        [rmapi_path, "put", filepath, remarkable_dir],
        stdout=subprocess.DEVNULL,
    )
    if status != 0:
        exception("Uploading file %s to remarkable failed" % filepath)
    logger.info("Upload successful.")
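
# An illustration of generate_filename (hypothetical metadata; arXiv reports
# authors as "Lastname, Firstname" and dates as "YYYY/MM/DD"):
#
#   >>> generate_filename({
#   ...     "title": "A Great Paper",
#   ...     "date": "2019/02/02",
#   ...     "authors": ["Burg, Gertjan van den"],
#   ... })
#   'Burg_-_A_Great_Paper_2019.pdf'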

def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-v", "--verbose", help="be verbose", action="store_true"
    )
    parser.add_argument(
        "-n",
        "--no-upload",
        help="don't upload to the reMarkable, save the output in the "
        "current working dir",
        action="store_true",
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="debug mode, doesn't upload to the reMarkable",
        action="store_true",
    )
    parser.add_argument(
        "--rmapi", help="path to rmapi executable", default="rmapi"
    )
    parser.add_argument(
        "--pdfcrop", help="path to pdfcrop executable", default="pdfcrop"
    )
    parser.add_argument(
        "--pdftk", help="path to pdftk executable", default="pdftk"
    )
    parser.add_argument("--gs", help="path to gs executable", default="gs")
    parser.add_argument(
        "input", help="url to an arXiv paper or an existing pdf file"
    )
    return parser.parse_args()


@logger.catch
def main():
    args = parse_args()

    if not (os.path.exists(args.input) or validate_url(args.input)):
        exception("Input not a file or an arXiv url.")

    if not args.verbose:
        logger.remove(0)

    start_wd = os.getcwd()

    with tempfile.TemporaryDirectory() as working_dir:
        if os.path.exists(args.input):
            shutil.copy(args.input, working_dir)
            filename = os.path.basename(args.input)
            clean_filename = filename

        os.chdir(working_dir)
        if validate_url(args.input):
            pdf_url, abs_url = get_urls(args.input)
            filename = "paper.pdf"
            download_url(pdf_url, filename)
            paper_info = get_paper_info(abs_url)
            clean_filename = generate_filename(paper_info)

        dearxived = dearxiv(filename, pdftk_path=args.pdftk)
        cropped = crop_pdf(dearxived, pdfcrop_path=args.pdfcrop)
        shrunk = shrink_pdf(cropped, gs_path=args.gs)
        shutil.move(shrunk, clean_filename)

        if args.debug:
            print("Paused in debug mode in dir: %s" % working_dir)
            print("Press enter to exit.")
            return input()

        if args.no_upload:
            # Avoid overwriting an existing file in the start directory
            if os.path.exists(os.path.join(start_wd, clean_filename)):
                tmpfname = os.path.splitext(filename)[0] + "_cropped.pdf"
                shutil.move(clean_filename, os.path.join(start_wd, tmpfname))
            else:
                shutil.move(clean_filename, start_wd)
        else:
            upload_to_rm(clean_filename, rmapi_path=args.rmapi)


if __name__ == "__main__":
    main()
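
# Example usage (a sketch; "arxiv2remarkable.py" is a hypothetical name for
# this file, and the arXiv id is the one used in the doctests above):
#
#   $ python arxiv2remarkable.py https://arxiv.org/abs/1811.11242
#   $ python arxiv2remarkable.py --no-upload https://arxiv.org/pdf/1811.11242.pdf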