# -*- coding: utf-8 -*-

"""
Dataset handling

The dataset model is defined in the adjacent 'dataset_schema.json' file, which
is a JSON Schema definition. It can be edited easily at www.jsonschema.net or
yapi.demo.qunar.com/editor/.

Missing values must be denoted by 'NaN' (this is understood by the JSON
decoder).

Author: Gertjan van den Burg

"""

import hashlib
import json
import logging
import math
import os

import jsonschema

from flask import current_app

LOGGER = logging.getLogger(__file__)


def load_schema():
    pth = os.path.abspath(__file__)
    basedir = os.path.dirname(pth)
    schema_file = os.path.join(basedir, "dataset_schema.json")
    if not os.path.exists(schema_file):
        raise FileNotFoundError(schema_file)
    with open(schema_file, "rb") as fp:
        schema = json.load(fp)
    return schema


def validate_dataset(filename):
    """Validate a dataset file, returning an error message or None if valid."""
    if not os.path.exists(filename):
        return "File not found."

    with open(filename, "rb") as fp:
        try:
            data = json.load(fp)
        except json.JSONDecodeError as err:
            return "JSON decoding error: %s" % err.msg

    try:
        schema = load_schema()
    except FileNotFoundError:
        return "Schema file not found."

    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as err:
        return "JSONSchema validation error: %s" % err.message

    if len(data["series"]) != data["n_dim"]:
        return "Number of dimensions and number of series don't match"

    if "time" in data.keys():
        if len(data["time"]["raw"]) != data["n_obs"]:
            return "Number of time points doesn't match number of observations"
        if None in data["time"]["raw"]:
            return "Null is not supported in time axis. Use 'NaN' instead."

    has_missing = False
    for var in data["series"]:
        if len(var["raw"]) != data["n_obs"]:
            return "Number of observations doesn't match for %s" % var["label"]
        if None in var["raw"]:
            return "Null is not supported in series. Use 'NaN' instead."
        has_missing = has_missing or any(map(math.isnan, var["raw"]))

    # this doesn't happen in any dataset yet, so let's not implement it until 
    # we need it.
    if data["n_dim"] > 1 and has_missing:
        return "Missing values are not yet supported for multidimensional data"

    return None
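
# A hypothetical caller might use the validator like this (a return value of
# None means the dataset is valid; any string is an error message):
#
#     error = validate_dataset("/path/to/dataset.json")
#     if error is not None:
#         raise ValueError("Invalid dataset: %s" % error)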


def get_name_from_dataset(filename):
    with open(filename, "rb") as fid:
        data = json.load(fid)
    return data["name"]


def dataset_is_demo(filename):
    with open(filename, "rb") as fid:
        data = json.load(fid)
    return "demo" in data.keys()


def get_demo_true_cps(name):
    """Return the 'true_CPs' field of a demo dataset, or None on error."""
    dataset_dir = os.path.join(
        current_app.instance_path, current_app.config["DATASET_DIR"]
    )
    target_filename = os.path.join(dataset_dir, name + ".json")
    if not os.path.exists(target_filename):
        LOGGER.error("Dataset with name '%s' can't be found!" % name)
        return None
    with open(target_filename, "rb") as fid:
        data = json.load(fid)
    if not "demo" in data:
        LOGGER.error("Asked for 'demo' key in non-demo dataset '%s'" % name)
        return None
    if not "true_CPs" in data["demo"]:
        LOGGER.error(
            "Expected field'true_cps' field missing for dataset '%s'" % name
        )
    return data["demo"]["true_CPs"]


def md5sum(filename):
    """ Compute the MD5 hash for a given filename """
    blocksize = 65536
    hasher = hashlib.md5()
    with open(filename, "rb") as fid:
        buf = fid.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = fid.read(blocksize)
    return hasher.hexdigest()


def load_data_for_chart(name, known_md5):
    """Load a dataset by name for charting, verifying its MD5 checksum.

    Returns a dict with the chart data, or None on error.
    """
    dataset_dir = os.path.join(
        current_app.instance_path, current_app.config["DATASET_DIR"]
    )
    target_filename = os.path.join(dataset_dir, name + ".json")
    if not os.path.exists(target_filename):
        LOGGER.error("Dataset with name '%s' can't be found!" % name)
        return None
    checksum = md5sum(target_filename)
    if checksum != known_md5:
        LOGGER.error(
            "MD5 checksum failed for dataset '%s'. Found: %s. Expected: %s."
            % (name, checksum, known_md5)
        )
        return None
    with open(target_filename, "rb") as fid:
        data = json.load(fid)

    chart_data = {
        "time": data["time"] if "time" in data else None,
        "values": data["series"],
    }
    return {"chart_data": chart_data}