4 files changed, 192 insertions, 0 deletions
diff --git a/datasets/occupancy/.gitignore b/datasets/occupancy/.gitignore
new file mode 100644
index 0000000..958035f
--- /dev/null
+++ b/datasets/occupancy/.gitignore
@@ -0,0 +1,3 @@
+datatraining.txt
+occupancy.json
+old/
diff --git a/datasets/occupancy/README.md b/datasets/occupancy/README.md
new file mode 100644
index 0000000..0e961a5
--- /dev/null
+++ b/datasets/occupancy/README.md
@@ -0,0 +1,29 @@
+# Room occupancy data
+
+Dataset on detecting room occupancy based on several variables. For our 
+dataset we use the Temperature, Humidity, Light, and CO2 variables from the 
+training dataset.
+
+This dataset is obtained from the [UCI 
+repository](https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+)
+on 2019-06-10. As it is unclear whether the data can be redistributed as part 
+of this repository, we download it locally instead.
+
+The data is sampled at every 16 observations to reduce the length of the 
+series.
+
+When using this particular time series, please cite:
+
+```bib
+@article{candanedo2016accurate,
+	title={Accurate occupancy detection of an office room from light, temperature, humidity and $\text{CO}_2$ measurements using statistical learning models},
+	author={Candanedo, L. M. and Feldheim, V.},
+	journal={Energy and Buildings},
+	volume={112},
+	pages={28--39},
+	year={2016},
+	publisher={Elsevier}
+}
+```
+
+![Plot of occupancy dataset](./occupancy.png)
diff --git a/datasets/occupancy/get_occupancy.py b/datasets/occupancy/get_occupancy.py
new file mode 100644
index 0000000..0b590fa
--- /dev/null
+++ b/datasets/occupancy/get_occupancy.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Collect the occupancy dataset.
+
+See the README file for more information.
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import clevercsv
+import hashlib
+import json
+import os
+
+from functools import wraps
+from urllib.request import urlretrieve
+
+SAMPLE = 16
+
+TXT_URL = "https://web.archive.org/web/20191128145102if_/https://raw.githubusercontent.com/LuisM78/Occupancy-detection-data/master/datatraining.txt"
+
+MD5_TXT = "e656cd731300cb444bd10fcd28071e37"
+MD5_JSON = "bc6cd9adaf496fe30bf0e417d2c3b0c6"
+
+NAME_TXT = "datatraining.txt"
+NAME_JSON = "occupancy.json"
+
+
+class ValidationError(Exception):
+    def __init__(self, filename):
+        message = (
+            "Validating the file '%s' failed. \n"
+            "Please raise an issue on the GitHub page for this project "
+            "if the error persists." % filename
+        )
+        super().__init__(message)
+
+
+def check_md5sum(filename, checksum):
+    with open(filename, "rb") as fp:
+        data = fp.read()
+    h = hashlib.md5(data).hexdigest()
+    return h == checksum
+
+
+def validate(checksum):
+    """Decorator that validates the target file."""
+
+    def validate_decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            target = kwargs.get("target_path", None)
+            if os.path.exists(target) and check_md5sum(target, checksum):
+                return
+            out = func(*args, **kwargs)
+            if not os.path.exists(target):
+                raise FileNotFoundError("Target file expected at: %s" % target)
+            if not check_md5sum(target, checksum):
+                raise ValidationError(target)
+            return out
+
+        return wrapper
+
+    return validate_decorator
+
+
+@validate(MD5_TXT)
+def download_txt(target_path=None):
+    urlretrieve(TXT_URL, target_path)
+
+
+@validate(MD5_JSON)
+def write_json(txt_path, target_path=None):
+    with open(txt_path, "r", newline="", encoding="ascii") as fp:
+        reader = clevercsv.reader(
+            fp, delimiter=",", quotechar='"', escapechar=""
+        )
+        rows = list(reader)
+
+    header = rows.pop(0)
+    header.insert(0, "id")
+    as_dicts = [dict(zip(header, r)) for r in rows]
+
+    var_include = ["Temperature", "Humidity", "Light", "CO2"]
+
+    time = [x["date"] for x in as_dicts]
+    time = [time[i] for i in range(0, len(time), SAMPLE)]
+
+    data = {
+        "name": "occupancy",
+        "longname": "Occupancy",
+        "n_obs": len(time),
+        "n_dim": len(var_include),
+        "time": {
+            "type": "string",
+            "format": "%Y-%m-%d %H:%M:%S",
+            "index": list(range(len(time))),
+            "raw": time,
+        },
+        "series": [],
+    }
+    for idx, var in enumerate(var_include, start=1):
+        lbl = "V%i" % idx
+        obs = [float(x[var]) for x in as_dicts]
+        obs = [obs[i] for i in range(0, len(obs), SAMPLE)]
+        data["series"].append({"label": lbl, "type": "float", "raw": obs})
+
+    with open(target_path, "w") as fp:
+        json.dump(data, fp, indent="\t")
+
+
+def collect(output_dir="."):
+    txt_path = os.path.join(output_dir, NAME_TXT)
+    json_path = os.path.join(output_dir, NAME_JSON)
+
+    download_txt(target_path=txt_path)
+    write_json(txt_path, target_path=json_path)
+
+
+def clean(output_dir="."):
+    txt_path = os.path.join(output_dir, NAME_TXT)
+    json_path = os.path.join(output_dir, NAME_JSON)
+
+    if os.path.exists(txt_path):
+        os.unlink(txt_path)
+    if os.path.exists(json_path):
+        os.unlink(json_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o", "--output-dir", help="output directory to use", default="."
+    )
+    parser.add_argument(
+        "action",
+        choices=["collect", "clean"],
+        help="Action to perform",
+        default="collect",
+        nargs="?",
+    )
+    return parser.parse_args()
+
+
+def main(output_dir="."):
+    args = parse_args()
+    if args.action == "collect":
+        collect(output_dir=args.output_dir)
+    elif args.action == "clean":
+        clean(output_dir=args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/datasets/occupancy/occupancy.png b/datasets/occupancy/occupancy.png
new file mode 100644
index 0000000..03fbc13
--- /dev/null
+++ b/datasets/occupancy/occupancy.png