aboutsummaryrefslogtreecommitdiff
path: root/app/utils/datasets.py
blob: 078535f0c852b1a7d8dc9ceea3ef116033300ec6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-

"""
Dataset handling

The dataset model is a JSON object as follows:

    {
        "name": "name of the dataset",
        "n_obs": number of observations,
        "n_dim": number of dimensions,
        "series": {
            "V1": {
                "type": "float",
                "raw": [list of observations]
                },
            "V2": {
                "type": "int",
                "raw": [list of observations]
            },
            "V3": {
                "type": "category",
                "levels": ["A", "B", "C"],
                "raw": [list of observations]
                }
            }
    }

Missing values must be denoted by NaN (written unquoted; this is not part of
strict JSON, but it is understood by Python's JSON decoder).

Author: Gertjan van den Burg

"""

import re
import json
import hashlib


def validate_dataset(filename):
    """ Validate a dataset uploaded to the webapp

    The file must contain a JSON object with the keys "name", "n_obs",
    "n_dim" and "series". The series must be named "V1", "V2", ... in file
    order, and each must provide a "type" and a "raw" list holding exactly
    "n_obs" observations. Series of type "category" must also provide
    "levels".

    Parameters
    ----------
    filename : str
        Path of the uploaded JSON file.

    Returns
    -------
    None or str
        None on success, otherwise a message describing the first problem
        that was found. Malformed content never raises; only failures to
        open or read the file itself (OSError) propagate to the caller.
    """

    with open(filename, "rb") as fid:
        try:
            data = json.load(fid)
        except json.JSONDecodeError as err:
            return "JSON decoding error: %s" % err.msg
        except UnicodeDecodeError:
            # Raised while decoding the raw bytes, before JSON parsing
            # starts, so it is not reported as a JSONDecodeError.
            return "JSON decoding error: file is not valid Unicode text"

    # Everything below indexes into the document by key, so a top-level
    # list, string or number has to be rejected first.
    if not isinstance(data, dict):
        return "Dataset must be a JSON object"

    for key in ("name", "n_obs", "n_dim", "series"):
        if key not in data:
            return "Required key missing: %s" % key

    # An explicit ASCII class is used because "\w" also matches non-ASCII
    # letters and digits for str patterns, contradicting the message below.
    name = data["name"]
    if not isinstance(name, str) or not re.fullmatch(r"[a-zA-Z0-9_]+", name):
        return "Name can only contain characters in the set [a-zA-Z0-9_]"

    # Counts are compared against len() and formatted with %i further down;
    # bool is excluded explicitly because it is a subclass of int.
    for key in ("n_obs", "n_dim"):
        value = data[key]
        if isinstance(value, bool) or not isinstance(value, int):
            return "Key '%s' must be an integer" % key
    n_obs = data["n_obs"]

    series = data["series"]
    if not isinstance(series, dict):
        return "Key 'series' must be a JSON object"

    if len(series) != data["n_dim"]:
        return "Number of dimensions and number of series don't match"

    # The JSON decoder keeps the key order of the file, so position idx
    # must hold the variable named "V<idx>".
    for idx, (var, vardict) in enumerate(series.items(), start=1):
        if var != "V%i" % idx:
            return "Unexpected variable name, expected 'V<int>', got %s" % var
        if not isinstance(vardict, dict):
            return "Variable '%s' must be a JSON object" % var
        for key in ("type", "raw"):
            if key not in vardict:
                return "Key '%s' missing for variable '%s'" % (key, var)
        if vardict["type"] == "category" and "levels" not in vardict:
            return (
                "Variable '%s' has categorical type but 'levels' is missing"
                % (var)
            )
        raw = vardict["raw"]
        if not isinstance(raw, list):
            return "Key 'raw' for variable '%s' must be a list" % var
        if len(raw) != n_obs:
            return (
                "Length of data for variable '%s' not equal to n_obs = %i"
                % (var, n_obs)
            )

    return None


def get_name_from_dataset(filename):
    """ Return the value stored under the "name" key of a dataset file

    The file is read in binary mode and decoded as JSON. No validation is
    done here: decoding errors and a missing "name" key propagate to the
    caller as exceptions.
    """
    with open(filename, "rb") as handle:
        return json.load(handle)["name"]


def md5sum(filename):
    """ Return the hexadecimal MD5 digest of the contents of a file

    The file is opened in binary mode and fed to the hasher in pieces of
    65536 bytes, so it is never loaded into memory as a whole.
    """
    chunk_size = 65536
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns the
        # empty bytes object, which signals end of file.
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()