From 3a2404010f8c0fdb3a9e9940202f59b84cb2791f Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 10 Jun 2019 14:34:48 +0100 Subject: Validate data according to a schema It became clear that a formal schema would make it easier to validate data. This is now added and the code is updated to work with this schema. --- app/utils/dataset_schema.json | 130 ++++++++++++++++++++++++++++++++++++++++++ app/utils/datasets.py | 93 ++++++++++++------------------ 2 files changed, 168 insertions(+), 55 deletions(-) create mode 100644 app/utils/dataset_schema.json (limited to 'app/utils') diff --git a/app/utils/dataset_schema.json b/app/utils/dataset_schema.json new file mode 100644 index 0000000..2aec504 --- /dev/null +++ b/app/utils/dataset_schema.json @@ -0,0 +1,130 @@ +{ + "definitions": {}, + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://example.com/root.json", + "type": "object", + "title": "Dataset Schema", + "default": null, + "required": [ + "name", + "n_obs", + "n_dim", + "series" + ], + "properties": { + "name": { + "$id": "#/properties/name", + "type": "string", + "title": "The Name Schema", + "default": "", + "pattern": "^(.*)$" + }, + "n_obs": { + "$id": "#/properties/n_obs", + "type": "integer", + "title": "The N_obs Schema", + "default": 0 + }, + "n_dim": { + "$id": "#/properties/n_dim", + "type": "integer", + "title": "The N_dim Schema", + "default": 0 + }, + "demo": { + "$id": "#/properties/demo", + "type": "object", + "title": "The Demo Schema", + "properties": { + "true_CPs": { + "$id": "#/properties/demo/properties/true_CPs", + "type": "array", + "items": { + "$id": "#/properties/demo/properties/true_CPs/items", + "type": "integer", + "title": "The Items Schema", + "default": null + } + } + } + }, + "time": { + "$id": "#/properties/time", + "type": "object", + "title": "The Time Schema", + "default": null, + "required": [ + "type", + "format", + "raw" + ], + "properties": { + "type": { + "$id": 
"#/properties/time/properties/type", + "type": "string", + "title": "The Type Schema", + "default": "", + "pattern": "^(.*)$" + }, + "format": { + "$id": "#/properties/time/properties/format", + "type": "string", + "title": "The Format Schema", + "default": "", + "pattern": "^(.*)$" + }, + "raw": { + "$id": "#/properties/time/properties/raw", + "type": "array", + "title": "The Raw Schema", + "items": { + "$id": "#/properties/time/properties/raw/items", + "title": "The Items Schema", + "default": "" + } + } + } + }, + "series": { + "$id": "#/properties/series", + "type": "array", + "title": "The Series Schema", + "items": { + "$id": "#/properties/series/items", + "type": "object", + "title": "The Variable Schema", + "default": null, + "properties": { + "label": { + "$id": "#/properties/series/items/properties/label", + "type": "string", + "title": "The Label Schema", + "default": "", + "pattern": "^(.*)$" + }, + "type": { + "$id": "#/properties/series/items/properties/type", + "type": "string", + "title": "The Type Schema", + "default": "", + "pattern": "^(.*)$" + }, + "raw": { + "$id": "#/properties/series/items/properties/raw", + "type": "array", + "title": "The Raw Schema", + "items": { + "$id": "#/properties/series/items/properties/raw/items", + "title": "The Items Schema", + "default": 0 + } + } + }, + "required": [ + "type", + "raw" + ] + } + } + } +} diff --git a/app/utils/datasets.py b/app/utils/datasets.py index 1fef85f..16647f0 100644 --- a/app/utils/datasets.py +++ b/app/utils/datasets.py @@ -3,28 +3,9 @@ """ Dataset handling -The dataset model is a JSON object as follows: - - { - "name": "name of the dataset", - "n_obs": number of observations, - "n_dim": number of dimensions, - "series": { - "V1": { - "type": "float", - "raw": [list of observations] - }, - "V2": { - "type": "int", - "raw": [list of observations] - }, - "V3": { - "type": "category", - "levels": ["A", "B", "C"], - "raw": [list of observations] - } - } - } +The dataset model is defined in 
def load_schema():
    """Load the dataset JSON Schema that ships next to this module.

    The dataset model is defined in the adjacent 'dataset_schema.json' file,
    which is a JSONSchema schema definition.

    Returns:
        The decoded schema document.

    Raises:
        FileNotFoundError: if 'dataset_schema.json' does not exist in the
            directory of this source file.
    """
    basedir = os.path.dirname(os.path.abspath(__file__))
    schema_file = os.path.join(basedir, "dataset_schema.json")
    # Raise explicitly so the exception carries the full path that was tried.
    if not os.path.exists(schema_file):
        raise FileNotFoundError(schema_file)
    with open(schema_file, "rb") as fp:
        return json.load(fp)


def check_dataset_consistency(data):
    """Check the cross-field constraints the schema itself cannot express.

    Expects ``data`` to have already passed schema validation, so the
    required keys ("n_obs", "n_dim", "series", and "raw" for the time axis
    and every series) are present.

    Args:
        data: the decoded dataset object.

    Returns:
        None when the dataset is consistent, otherwise an error string.
    """
    series = data["series"]
    n_obs = data["n_obs"]

    if len(series) != data["n_dim"]:
        return "Number of dimensions and number of series don't match"

    # The time axis is optional, but when given it must cover every
    # observation.
    if "time" in data and len(data["time"]["raw"]) != n_obs:
        return "Number of time points doesn't match number of observations"

    for idx, var in enumerate(series, start=1):
        if len(var["raw"]) != n_obs:
            # "label" is not required by the schema; fall back to the
            # position of the series so reporting the error cannot itself
            # fail with a KeyError.
            label = var.get("label", "series %i" % idx)
            return "Number of observations doesn't match for %s" % label

    return None


def validate_dataset(filename):
    """Validate a dataset uploaded to the webapp.

    The file must be a JSON document that satisfies the dataset schema and
    whose series (and optional time axis) all have ``n_obs`` entries.
    Missing values must be denoted by 'NaN', which the JSON decoder
    understands.

    Args:
        filename: path of the uploaded dataset file.

    Returns:
        None on success and a string describing the error on failure.
    """
    if not os.path.exists(filename):
        return "File not found."

    with open(filename, "rb") as fp:
        try:
            data = json.load(fp)
        except json.JSONDecodeError as err:
            return "JSON decoding error: %s" % err.msg
        except UnicodeDecodeError as err:
            # Bytes that are not valid UTF-8/16/32 fail before JSON parsing
            # starts and are not reported as a JSONDecodeError.
            return "JSON decoding error: %s" % err.reason

    try:
        schema = load_schema()
    except FileNotFoundError:
        return "Schema file not found."

    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as err:
        # jsonschema errors expose the text as ``message``; ``msg`` only
        # exists on json.JSONDecodeError.
        return "JSONSchema validation error: %s" % err.message

    return check_dataset_consistency(data)


def dataset_is_demo(filename):
    """Return True if the dataset file has a top-level "demo" entry."""
    with open(filename, "rb") as fid:
        data = json.load(fid)
    return "demo" in data