diff options
Diffstat (limited to 'datasets/iceland_tourism/get_iceland_tourism.py')
| -rw-r--r-- | datasets/iceland_tourism/get_iceland_tourism.py | 193 |
1 files changed, 193 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Collect the iceland_tourism dataset

See the README file for more information.

Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute

"""

import argparse
import hashlib
import json
import os

from functools import wraps
from urllib.request import urlretrieve

XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx"

MD5_XLSX = "ec777afd95b01ca901aa00475fc284e5"
MD5_JSON = "8bbac4ca95319a865f2d58ff564f063d"

NAME_XLSX = "visitors-to-iceland-2002-2019-oct.xlsx"
NAME_JSON = "iceland_tourism.json"

# Maps the month names used in the first spreadsheet column to month numbers.
MONTHS = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}


class ValidationError(Exception):
    """Raised when a file does not match its expected MD5 checksum.

    Attributes
    ----------
    filename : path of the file that failed validation.
    message : human-readable description, also used as ``str(exc)``.
    """

    def __init__(self, filename):
        self.filename = filename
        self.message = (
            "Validating the file '%s' failed. \n"
            "Please raise an issue on the GitHub page for this project \n"
            "if the error persists." % filename
        )
        # Hand the message to the base class; otherwise str(exc) and the
        # traceback would only show the bare filename.
        super().__init__(self.message)


def check_md5sum(filename, checksum):
    """Return True if the MD5 hex digest of ``filename`` equals ``checksum``."""
    digest = hashlib.md5()
    with open(filename, "rb") as fp:
        # Hash in chunks so the whole file never has to sit in memory.
        for chunk in iter(lambda: fp.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest() == checksum


def validate(checksum):
    """Decorator that validates the target file.

    The decorated function must be called with a ``target_path`` keyword
    argument. If that file already exists and matches ``checksum`` the
    function is skipped (and ``None`` is returned). Otherwise the function
    is run and its output file is verified afterwards.

    Raises
    ------
    TypeError
        If ``target_path`` is not supplied as a keyword argument.
    FileNotFoundError
        If the function did not create the target file.
    ValidationError
        If the created file does not match ``checksum``.
    """

    def validate_decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            target = kwargs.get("target_path", None)
            if target is None:
                # Without this guard os.path.exists(None) fails with an
                # unhelpful TypeError; keep the type, improve the message.
                raise TypeError(
                    "%s() requires the 'target_path' keyword argument"
                    % func.__name__
                )
            if os.path.exists(target) and check_md5sum(target, checksum):
                return
            out = func(*args, **kwargs)
            if not os.path.exists(target):
                raise FileNotFoundError("Target file expected at: %s" % target)
            if not check_md5sum(target, checksum):
                raise ValidationError(target)
            return out

        return wrapper

    return validate_decorator


@validate(MD5_XLSX)
def download_xlsx(target_path=None):
    """Download the source spreadsheet from ``XLSX_URL`` to ``target_path``."""
    urlretrieve(XLSX_URL, target_path)


def format_ym(year, month):
    """Format a year and an English month name as ``YYYY-MM``.

    ``year`` may be anything ``int()`` accepts (the spreadsheet stores it as
    a float); ``month`` must be a key of ``MONTHS``, otherwise ``KeyError``
    is raised.
    """
    midx = MONTHS[month]
    return "%i-%02d" % (int(year), midx)


def _read_visitors(xlsx_path):
    """Extract the monthly visitor counts from the spreadsheet.

    Returns a list of ``{"time": "YYYY-MM", "value": int}`` dicts, ordered
    by year column and then by month row.
    """
    # Imported lazily so that ``clean`` and the checksum helpers work
    # without xlrd installed.
    # NOTE: reading .xlsx files requires xlrd < 2.0; later releases only
    # support the legacy .xls format.
    import xlrd

    wb = xlrd.open_workbook(xlsx_path)
    ws = wb.sheet_by_index(2)

    # hardcoding these row indices, not worth doing it nicely
    header = ws.row(2)
    column_idx = [
        i
        for i, c in enumerate(header)
        if c.ctype == xlrd.XL_CELL_NUMBER and 2003 <= c.value < 2020
    ]

    # eliminate some observations that were not in the original dataset
    excluded = {"2019-08", "2019-09", "2019-10"}

    visitors = []
    r_offset = 3  # first month row, directly below the header row
    for c in column_idx:
        year = header[c].value
        for r in range(r_offset, r_offset + 12):
            cell = ws.cell(r, c)
            if cell.ctype == xlrd.XL_CELL_EMPTY:
                continue
            month = ws.cell(r, 0).value
            datestr = format_ym(year, month)
            if datestr in excluded:
                continue
            visitors.append({"time": datestr, "value": int(cell.value)})
    return visitors


def _build_dataset(visitors):
    """Assemble the JSON-serialisable dataset from the visitor records."""
    return {
        "name": "iceland_tourism",
        "longname": "Iceland Tourism",
        "n_obs": len(visitors),
        "n_dim": 1,
        "time": {
            "format": "%Y-%m",
            "index": list(range(len(visitors))),
            "raw": [v["time"] for v in visitors],
        },
        "series": [
            {
                "label": "Visitor Number",
                "type": "int",
                "raw": [v["value"] for v in visitors],
            }
        ],
    }


@validate(MD5_JSON)
def write_json(xlsx_path, target_path=None):
    """Convert the spreadsheet at ``xlsx_path`` to the dataset JSON file.

    The result is written to ``target_path`` and, through the ``validate``
    decorator, checked against ``MD5_JSON``.
    """
    data = _build_dataset(_read_visitors(xlsx_path))

    # Force "\n" line endings so the bytes written (and thus the MD5
    # checksum) do not depend on the platform's newline translation.
    with open(target_path, "w", encoding="utf-8", newline="\n") as fp:
        json.dump(data, fp, indent="\t")


def collect(output_dir="."):
    """Download the spreadsheet and generate the JSON in ``output_dir``."""
    xlsx_path = os.path.join(output_dir, NAME_XLSX)
    json_path = os.path.join(output_dir, NAME_JSON)

    download_xlsx(target_path=xlsx_path)
    write_json(xlsx_path, target_path=json_path)


def clean(output_dir="."):
    """Remove the downloaded and generated files from ``output_dir``."""
    for name in (NAME_XLSX, NAME_JSON):
        path = os.path.join(output_dir, name)
        if os.path.exists(path):
            os.unlink(path)


def parse_args():
    """Parse the command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o", "--output-dir", help="output directory to use", default="."
    )
    parser.add_argument(
        "action",
        choices=["collect", "clean"],
        help="Action to perform",
        default="collect",
        nargs="?",
    )
    return parser.parse_args()


def main(output_dir="."):
    """Command line entry point.

    The ``output_dir`` parameter is kept for backward compatibility only;
    the directory actually used always comes from the command line.
    """
    args = parse_args()
    if args.action == "collect":
        collect(output_dir=args.output_dir)
    elif args.action == "clean":
        clean(output_dir=args.output_dir)


if __name__ == "__main__":
    main()
