#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Collect the iceland_tourism dataset

See the README file for more information.

Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute

"""

import argparse
import hashlib
import json
import os
import xlrd

from functools import wraps
from urllib.request import urlretrieve

# Source spreadsheet, fetched from a web.archive.org snapshot of the
# ferdamalastofa.is file (the URL embeds the snapshot timestamp).
XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx"

# Expected MD5 hex digests, passed to the ``validate`` decorator to check the
# downloaded spreadsheet and the generated JSON file respectively.
MD5_XLSX = "ec777afd95b01ca901aa00475fc284e5"
MD5_JSON = "8bbac4ca95319a865f2d58ff564f063d"

# File names (joined onto the output directory) of the downloaded spreadsheet
# and of the JSON dataset that is written from it.
NAME_XLSX = "visitors-to-iceland-2002-2019-oct.xlsx"
NAME_JSON = "iceland_tourism.json"

# English month name -> month number (1-12); format_ym() looks month names up
# in this table.
MONTHS = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}


class ValidationError(Exception):
    """Raised when a file fails its checksum validation.

    Parameters
    ----------
    filename
        Path of the file that failed validation.

    Attributes
    ----------
    filename
        The path that was passed in.
    message
        Human-readable description of the failure; also used as the
        exception's string representation.
    """

    def __init__(self, filename):
        self.filename = filename
        self.message = (
            "Validating the file '%s' failed. \n"
            "Please raise an issue on the GitHub page for this project \n"
            "if the error persists." % filename
        )
        # Hand the message to Exception so that str(exc) and tracebacks show
        # the full explanation rather than only the bare filename.
        super().__init__(self.message)


def check_md5sum(filename, checksum):
    """Return True if the MD5 hex digest of ``filename`` equals ``checksum``.

    The file is hashed in fixed-size chunks so that its full contents never
    have to be held in memory at once.

    Parameters
    ----------
    filename
        Path of the file to hash.
    checksum
        Expected MD5 digest as a hexadecimal string.

    Returns
    -------
    bool
        Whether the computed digest matches ``checksum``.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as fp:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: fp.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest() == checksum


def validate(checksum):
    """Decorator that validates the target file.

    The decorated function must be called with a ``target_path`` keyword
    argument naming the file it produces. The wrapper then:

    * returns ``None`` without calling the function when the target already
      exists and its MD5 digest equals ``checksum``;
    * otherwise calls the function and checks the file it produced.

    Parameters
    ----------
    checksum
        Expected MD5 hex digest of the target file.

    Raises
    ------
    TypeError
        If the wrapped function is called without a ``target_path`` keyword
        argument.
    FileNotFoundError
        If the target file does not exist after the function has run.
    ValidationError
        If the target file exists after the function has run but its digest
        does not match ``checksum``.
    """

    def validate_decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            target = kwargs.get("target_path")
            if target is None:
                # Without this guard os.path.exists(None) fails with an
                # unrelated-looking TypeError; keep the type, fix the message.
                raise TypeError(
                    "%s() must be called with a 'target_path' keyword "
                    "argument" % func.__name__
                )
            # An existing file with the right checksum means no work is needed.
            if os.path.exists(target) and check_md5sum(target, checksum):
                return None
            out = func(*args, **kwargs)
            if not os.path.exists(target):
                raise FileNotFoundError("Target file expected at: %s" % target)
            if not check_md5sum(target, checksum):
                raise ValidationError(target)
            return out

        return wrapper

    return validate_decorator


@validate(MD5_XLSX)
def download_xlsx(target_path=None):
    """Download the spreadsheet at XLSX_URL and store it at ``target_path``.

    The ``validate`` decorator skips the download when a file with the
    expected checksum is already present, and checks the downloaded file
    against MD5_XLSX otherwise.
    """
    urlretrieve(url=XLSX_URL, filename=target_path)


def format_ym(year, month):
    """Format a year and an English month name as a ``YYYY-MM`` string.

    ``month`` is looked up in MONTHS (a KeyError is raised for unknown names)
    and ``year`` is converted with ``int()`` before formatting.
    """
    month_number = MONTHS[month]
    year_number = int(year)
    return "{:d}-{:02d}".format(year_number, month_number)


def _extract_visitors(ws):
    """Return the monthly observations of a worksheet in chronological order.

    Each observation is a dict with a ``"time"`` key (``YYYY-MM`` string) and
    a ``"value"`` key (cell value converted to ``int``).
    """
    # hardcoding these row indices, not worth doing it nicely
    header = ws.row(2)
    first_month_row = 3

    # Columns whose header cell is a number in [2003, 2020) are year columns.
    year_columns = [
        idx
        for idx, head in enumerate(header)
        if head.ctype == xlrd.XL_CELL_NUMBER and 2003 <= head.value < 2020
    ]

    # eliminate some observations that were not in the original dataset
    excluded = {"2019-08", "2019-09", "2019-10"}

    visitors = []
    for col in year_columns:
        year = header[col].value
        # Twelve consecutive rows, one per month; the month name is read
        # from column 0 of the same row.
        for row in range(first_month_row, first_month_row + 12):
            cell = ws.cell(row, col)
            if cell.ctype == xlrd.XL_CELL_EMPTY:
                continue
            datestr = format_ym(year, ws.cell(row, 0).value)
            if datestr in excluded:
                continue
            visitors.append({"time": datestr, "value": int(cell.value)})
    return visitors


def _build_dataset(visitors):
    """Assemble the JSON-serialisable dataset dict from the observations.

    The key order is kept fixed because the serialised file is compared
    against a checksum.
    """
    return {
        "name": "iceland_tourism",
        "longname": "Iceland Tourism",
        "n_obs": len(visitors),
        "n_dim": 1,
        "time": {
            "format": "%Y-%m",
            "index": list(range(len(visitors))),
            "raw": [obs["time"] for obs in visitors],
        },
        "series": [
            {
                "label": "Visitor Number",
                "type": "int",
                "raw": [obs["value"] for obs in visitors],
            }
        ],
    }


@validate(MD5_JSON)
def write_json(xlsx_path, target_path=None):
    """Convert the downloaded spreadsheet into the dataset JSON file.

    Reads the worksheet at index 2 of ``xlsx_path``, extracts the monthly
    values for the year columns, and writes them as tab-indented JSON to
    ``target_path``. The ``validate`` decorator skips the conversion when a
    file with the expected checksum already exists and checks the written
    file against MD5_JSON otherwise.

    Parameters
    ----------
    xlsx_path
        Path of the spreadsheet to read.
    target_path
        Path of the JSON file to write; must be passed by keyword.
    """
    # NOTE(review): xlrd 2.0 and later reportedly read only .xls files, so
    # opening this .xlsx presumably needs xlrd < 2.0 -- confirm the pinned
    # version.
    wb = xlrd.open_workbook(xlsx_path)
    ws = wb.sheet_by_index(2)

    data = _build_dataset(_extract_visitors(ws))

    # newline="\n" prevents text mode from translating line endings (e.g. to
    # CRLF on Windows), which would change the bytes that are checksummed.
    with open(target_path, "w", newline="\n") as fp:
        json.dump(data, fp, indent="\t")


def collect(output_dir="."):
    """Download the spreadsheet and write the JSON dataset into ``output_dir``.

    Both file names are fixed (NAME_XLSX and NAME_JSON); only the directory
    they are placed in is configurable.
    """
    spreadsheet_file = os.path.join(output_dir, NAME_XLSX)
    dataset_file = os.path.join(output_dir, NAME_JSON)

    # The conversion step reads the file that the download step produced.
    download_xlsx(target_path=spreadsheet_file)
    write_json(spreadsheet_file, target_path=dataset_file)


def clean(output_dir="."):
    """Delete the downloaded spreadsheet and the JSON dataset, if present.

    Files that do not exist in ``output_dir`` are silently skipped.
    """
    for filename in (NAME_XLSX, NAME_JSON):
        candidate = os.path.join(output_dir, filename)
        if not os.path.exists(candidate):
            continue
        os.unlink(candidate)


def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised arguments:

    * ``-o`` / ``--output-dir``: directory to work in (default ``"."``).
    * ``action``: optional positional, either ``collect`` or ``clean``
      (default ``collect``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-o",
        "--output-dir",
        default=".",
        help="output directory to use",
    )
    # nargs="?" makes the positional optional so the default can apply.
    cli.add_argument(
        "action",
        nargs="?",
        default="collect",
        choices=["collect", "clean"],
        help="Action to perform",
    )
    namespace = cli.parse_args()
    return namespace


def main(output_dir="."):
    """Command-line entry point: parse the arguments and run the action.

    The ``output_dir`` parameter is not read by this function; the directory
    used is always the one given on the command line (``--output-dir``).
    """
    args = parse_args()
    actions = {"collect": collect, "clean": clean}
    handler = actions.get(args.action)
    if handler is not None:
        handler(output_dir=args.output_dir)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()