about summary refs log tree commit diff
path: root/datasets/iceland_tourism/get_iceland_tourism.py
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2020-03-10 12:27:53 +0000
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2020-03-10 12:27:53 +0000
commit7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch)
tree10aa6710599230c889ec44407a065ee303a79348 /datasets/iceland_tourism/get_iceland_tourism.py
downloadTCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.tar.gz
TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.zip
Initial commit
Diffstat (limited to 'datasets/iceland_tourism/get_iceland_tourism.py')
-rw-r--r--datasets/iceland_tourism/get_iceland_tourism.py193
1 files changed, 193 insertions, 0 deletions
diff --git a/datasets/iceland_tourism/get_iceland_tourism.py b/datasets/iceland_tourism/get_iceland_tourism.py
new file mode 100644
index 0000000..752f07d
--- /dev/null
+++ b/datasets/iceland_tourism/get_iceland_tourism.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Collect the iceland_tourism dataset
+
+See the README file for more information.
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import hashlib
+import json
+import os
+import xlrd
+
+from functools import wraps
+from urllib.request import urlretrieve
+
+XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx"
+
+MD5_XLSX = "ec777afd95b01ca901aa00475fc284e5"
+MD5_JSON = "8bbac4ca95319a865f2d58ff564f063d"
+
+NAME_XLSX = "visitors-to-iceland-2002-2019-oct.xlsx"
+NAME_JSON = "iceland_tourism.json"
+
+MONTHS = {
+ "January": 1,
+ "February": 2,
+ "March": 3,
+ "April": 4,
+ "May": 5,
+ "June": 6,
+ "July": 7,
+ "August": 8,
+ "September": 9,
+ "October": 10,
+ "November": 11,
+ "December": 12,
+}
+
+
class ValidationError(Exception):
    """Raised when a downloaded or generated file fails its MD5 check.

    Parameters
    ----------
    filename : str
        Path of the file that failed validation.
    """

    def __init__(self, filename):
        self.message = (
            "Validating the file '%s' failed. \n"
            "Please raise an issue on the GitHub page for this project \n"
            "if the error persists." % filename
        )
        # Pass the message to Exception so str(e) and unhandled tracebacks
        # actually show it (the original stored it only on self.message,
        # leaving str(e) empty).
        super().__init__(self.message)
+
+
def check_md5sum(filename, checksum):
    """Return True iff the MD5 digest of *filename* equals *checksum* (hex)."""
    digest = hashlib.md5()
    with open(filename, "rb") as handle:
        digest.update(handle.read())
    return digest.hexdigest() == checksum
+
+
def validate(checksum):
    """Decorator that validates the target file against an MD5 checksum.

    The decorated function must be called with a ``target_path`` keyword
    argument. If the target already exists and matches *checksum*, the
    wrapped function is skipped entirely (cached result); otherwise the
    function runs and the resulting file is verified.

    Raises
    ------
    ValueError
        If ``target_path`` is not supplied as a keyword argument.
    FileNotFoundError
        If the wrapped function did not create the target file.
    ValidationError
        If the created file fails the checksum test.
    """

    def validate_decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            target = kwargs.get("target_path", None)
            # Fail early with a clear message instead of the confusing
            # TypeError that os.path.exists(None) would raise below when
            # target_path is omitted or passed positionally.
            if target is None:
                raise ValueError(
                    "%s must be called with a 'target_path' keyword argument"
                    % func.__name__
                )
            # Skip the (potentially expensive) download/conversion when a
            # valid copy of the target is already present.
            if os.path.exists(target) and check_md5sum(target, checksum):
                return
            out = func(*args, **kwargs)
            if not os.path.exists(target):
                raise FileNotFoundError("Target file expected at: %s" % target)
            if not check_md5sum(target, checksum):
                raise ValidationError(target)
            return out

        return wrapper

    return validate_decorator
+
+
@validate(MD5_XLSX)
def download_xlsx(target_path=None):
    """Fetch the archived visitor spreadsheet and store it at *target_path*."""
    urlretrieve(XLSX_URL, filename=target_path)
+
+
def format_ym(year, month):
    """Format *year* and an English *month* name as a 'YYYY-MM' string."""
    return "{:d}-{:02d}".format(int(year), MONTHS[month])
+
+
@validate(MD5_JSON)
def write_json(xlsx_path, target_path=None):
    """Convert the third sheet of the Excel file to the dataset JSON format.

    Reads monthly visitor counts from the workbook at *xlsx_path* and writes
    a JSON time series to *target_path* (validated against MD5_JSON).
    """
    book = xlrd.open_workbook(xlsx_path)
    sheet = book.sheet_by_index(2)

    # Hard-coded layout: row index 2 holds the year headers. Keep only
    # the columns whose header is a numeric year in [2003, 2020).
    header = sheet.row(2)
    year_columns = []
    for idx, cell in enumerate(header):
        if cell.ctype == xlrd.XL_CELL_NUMBER and 2003 <= cell.value < 2020:
            year_columns.append(idx)

    # The twelve month rows start at row index 3; column 0 holds the
    # month name.
    first_row = 3
    # These observations were not part of the original dataset, so they
    # are deliberately excluded.
    skip_dates = ("2019-08", "2019-09", "2019-10")

    visitors = []
    for col in year_columns:
        year = header[col].value
        for row in range(first_row, first_row + 12):
            cell = sheet.cell(row, col)
            if cell.ctype == xlrd.XL_CELL_EMPTY:
                continue
            datestr = format_ym(year, sheet.cell(row, 0).value)
            if datestr in skip_dates:
                continue
            visitors.append({"time": datestr, "value": int(cell.value)})

    data = {
        "name": "iceland_tourism",
        "longname": "Iceland Tourism",
        "n_obs": len(visitors),
        "n_dim": 1,
        "time": {
            "format": "%Y-%m",
            "index": list(range(len(visitors))),
            "raw": [v["time"] for v in visitors],
        },
        "series": [
            {
                "label": "Visitor Number",
                "type": "int",
                "raw": [v["value"] for v in visitors],
            }
        ],
    }

    with open(target_path, "w") as fp:
        json.dump(data, fp, indent="\t")
+
+
def collect(output_dir="."):
    """Download the spreadsheet and convert it to the dataset JSON file.

    Both files are placed in *output_dir*.
    """
    xlsx_path = os.path.join(output_dir, NAME_XLSX)
    download_xlsx(target_path=xlsx_path)
    write_json(xlsx_path, target_path=os.path.join(output_dir, NAME_JSON))
+
+
def clean(output_dir="."):
    """Remove the downloaded spreadsheet and generated JSON from *output_dir*."""
    for name in (NAME_XLSX, NAME_JSON):
        path = os.path.join(output_dir, name)
        if os.path.exists(path):
            os.unlink(path)
+
+
def parse_args():
    """Build the command line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o", "--output-dir", default=".", help="output directory to use"
    )
    parser.add_argument(
        "action",
        nargs="?",
        default="collect",
        choices=["collect", "clean"],
        help="Action to perform",
    )
    return parser.parse_args()
+
+
def main(output_dir="."):
    """Entry point: dispatch to collect() or clean() based on the CLI args.

    NOTE(review): the *output_dir* parameter is never used — the parsed
    ``--output-dir`` argument takes its place. Kept for interface
    compatibility; consider removing it.
    """
    args = parse_args()
    handlers = {"collect": collect, "clean": clean}
    handler = handlers.get(args.action)
    if handler is not None:
        handler(output_dir=args.output_dir)
+
+
+if __name__ == "__main__":
+ main()