author     Gertjan van den Burg <gertjanvandenburg@gmail.com>   2019-06-10 14:34:48 +0100
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>   2019-06-10 14:34:48 +0100
commit     3a2404010f8c0fdb3a9e9940202f59b84cb2791f (patch)
tree       3de6bfe5db4bf623e38d335899a715d3e8c65c76 /app/utils
parent     Use tojson instead of safe (diff)
download   AnnotateChange-3a2404010f8c0fdb3a9e9940202f59b84cb2791f.tar.gz
           AnnotateChange-3a2404010f8c0fdb3a9e9940202f59b84cb2791f.zip
Validate data according to a schema
It became clear that a formal schema would make it easier to validate data. Such a schema is now added, and the code has been updated to work with it.
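
In miniature, the validation pattern this commit adopts (via the jsonschema package) is sketched below; the file names are placeholders, not paths from the repository:

import json

import jsonschema

# Load the schema and a candidate dataset (paths are illustrative).
with open("dataset_schema.json", "rb") as fp:
    schema = json.load(fp)
with open("dataset.json", "rb") as fp:
    data = json.load(fp)

try:
    jsonschema.validate(instance=data, schema=schema)
    print("dataset matches the schema")
except jsonschema.ValidationError as err:
    print("schema violation:", err.message)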
Diffstat (limited to 'app/utils')
-rw-r--r--  app/utils/dataset_schema.json  130
-rw-r--r--  app/utils/datasets.py           93
2 files changed, 168 insertions, 55 deletions
diff --git a/app/utils/dataset_schema.json b/app/utils/dataset_schema.json
new file mode 100644
index 0000000..2aec504
--- /dev/null
+++ b/app/utils/dataset_schema.json
@@ -0,0 +1,130 @@
+{
+ "definitions": {},
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "$id": "http://example.com/root.json",
+ "type": "object",
+ "title": "Dataset Schema",
+ "default": null,
+ "required": [
+ "name",
+ "n_obs",
+ "n_dim",
+ "series"
+ ],
+ "properties": {
+ "name": {
+ "$id": "#/properties/name",
+ "type": "string",
+ "title": "The Name Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "n_obs": {
+ "$id": "#/properties/n_obs",
+ "type": "integer",
+ "title": "The N_obs Schema",
+ "default": 0
+ },
+ "n_dim": {
+ "$id": "#/properties/n_dim",
+ "type": "integer",
+ "title": "The N_dim Schema",
+ "default": 0
+ },
+ "demo": {
+ "$id": "#/properties/demo",
+ "type": "object",
+ "title": "The Demo Schema",
+ "properties": {
+ "true_CPs": {
+ "$id": "#/properties/demo/properties/true_CPs",
+ "type": "array",
+ "items": {
+ "$id": "#/properties/demo/properties/true_CPs/items",
+ "type": "integer",
+ "title": "The Items Schema",
+ "default": null
+ }
+ }
+ }
+ },
+ "time": {
+ "$id": "#/properties/time",
+ "type": "object",
+ "title": "The Time Schema",
+ "default": null,
+ "required": [
+ "type",
+ "format",
+ "raw"
+ ],
+ "properties": {
+ "type": {
+ "$id": "#/properties/time/properties/type",
+ "type": "string",
+ "title": "The Type Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "format": {
+ "$id": "#/properties/time/properties/format",
+ "type": "string",
+ "title": "The Format Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "raw": {
+ "$id": "#/properties/time/properties/raw",
+ "type": "array",
+ "title": "The Raw Schema",
+ "items": {
+ "$id": "#/properties/time/properties/raw/items",
+ "title": "The Items Schema",
+ "default": ""
+ }
+ }
+ }
+ },
+ "series": {
+ "$id": "#/properties/series",
+ "type": "array",
+ "title": "The Series Schema",
+ "items": {
+ "$id": "#/properties/series/items",
+ "type": "object",
+ "title": "The Variable Schema",
+ "default": null,
+ "properties": {
+ "label": {
+ "$id": "#/properties/series/items/properties/label",
+ "type": "string",
+ "title": "The Label Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "type": {
+ "$id": "#/properties/series/items/properties/type",
+ "type": "string",
+ "title": "The Type Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "raw": {
+ "$id": "#/properties/series/items/properties/raw",
+ "type": "array",
+ "title": "The Raw Schema",
+ "items": {
+ "$id": "#/properties/series/items/properties/raw/items",
+ "title": "The Items Schema",
+ "default": 0
+ }
+ }
+ },
+ "required": [
+ "type",
+ "raw"
+ ]
+ }
+ }
+ }
+}
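
For illustration, a minimal dataset that should satisfy this schema could look like the sketch below, written as a Python dict that mirrors the JSON. All values are invented; only the field names and the 'NaN' convention for missing values come from this commit:

# Illustrative instance only; field names follow dataset_schema.json,
# values are made up. Missing values are encoded as NaN, which the JSON
# decoder understands.
example_dataset = {
    "name": "example",
    "n_obs": 3,
    "n_dim": 1,
    "time": {
        "type": "string",      # assumed value; any string passes the pattern
        "format": "%Y-%m-%d",  # assumed format string
        "raw": ["2019-01-01", "2019-01-02", "2019-01-03"],
    },
    "series": [
        {"label": "V1", "type": "float", "raw": [1.0, 2.5, float("nan")]},
    ],
}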
diff --git a/app/utils/datasets.py b/app/utils/datasets.py
index 1fef85f..16647f0 100644
--- a/app/utils/datasets.py
+++ b/app/utils/datasets.py
@@ -3,28 +3,9 @@
"""
Dataset handling
-The dataset model is a JSON object as follows:
-
- {
- "name": "name of the dataset",
- "n_obs": number of observations,
- "n_dim": number of dimensions,
- "series": {
- "V1": {
- "type": "float",
- "raw": [list of observations]
- },
- "V2": {
- "type": "int",
- "raw": [list of observations]
- },
- "V3": {
- "type": "category",
- "levels": ["A", "B", "C"],
- "raw": [list of observations]
- }
- }
- }
+The dataset model is defined in the adjacent 'dataset_schema.json' file, which
+is a JSONSchema schema definition. It can be easily edited at
+www.jsonschema.net or yapi.demo.qunar.com/editor/
Missing values must be denoted by 'NaN' (this is understood by the JSON
decoder).
@@ -35,56 +16,56 @@ Author: Gertjan van den Burg
import hashlib
import json
+import jsonschema
import logging
import os
-import re
from flask import current_app
LOGGER = logging.getLogger(__file__)
+def load_schema():
+ pth = os.path.abspath(__file__)
+ basedir = os.path.dirname(pth)
+ schema_file = os.path.join(basedir, "dataset_schema.json")
+ if not os.path.exists(schema_file):
+ raise FileNotFoundError(schema_file)
+ with open(schema_file, "rb") as fp:
+ schema = json.load(fp)
+ return schema
+
+
def validate_dataset(filename):
- """ Validate a dataset uploaded to the webapp
- Return None on success and a string error on failure
- """
+ if not os.path.exists(filename):
+ return "File not found."
- with open(filename, "rb") as fid:
+ with open(filename, "rb") as fp:
try:
- data = json.load(fid)
+ data = json.load(fp)
except json.JSONDecodeError as err:
return "JSON decoding error: %s" % err.msg
- required_keys = ["name", "n_obs", "n_dim", "series"]
- for key in required_keys:
- if not key in data:
- return "Required key missing: %s" % key
+ try:
+ schema = load_schema()
+ except FileNotFoundError:
+ return "Schema file not found."
- if not re.fullmatch("\w+", data["name"]):
- return "Name can only contain characters in the set [a-zA-Z0-9_]"
+ try:
+ jsonschema.validate(instance=data, schema=schema)
+ except jsonschema.ValidationError as err:
+ return "JSONSchema validation error: %s" % err.msg
if len(data["series"]) != data["n_dim"]:
return "Number of dimensions and number of series don't match"
- required_keys = ["type", "raw"]
- for idx, var in enumerate(data["series"]):
- if not var == "V%i" % (idx + 1):
- return "Unexpected variable name, expected 'V<int>', got %s" % var
- vardict = data["series"][var]
- for key in required_keys:
- if not key in vardict:
- return "Key '%s' missing for variable '%s'" % (key, var)
- if vardict["type"] == "category":
- if not "levels" in vardict:
- return (
- "Variable '%s' has categorical type but 'levels' is missing"
- % (var)
- )
- if not len(vardict["raw"]) == data["n_obs"]:
- return (
- "Length of data for variable '%s' not equal to n_obs = %i"
- % (var, data["n_obs"])
- )
+ if "time" in data.keys():
+ if len(data["time"]["raw"]) != data["n_obs"]:
+ return "Number of time points doesn't match number of observations"
+
+ for var in data["series"]:
+ if len(var["raw"]) != data["n_obs"]:
+ return "Number of observations doesn't match for %s" % var["label"]
return None
@@ -98,7 +79,7 @@ def get_name_from_dataset(filename):
def dataset_is_demo(filename):
with open(filename, "rb") as fid:
data = json.load(fid)
- return "demo" in data
+ return "demo" in data.keys()
def get_demo_true_cps(name):
@@ -153,5 +134,7 @@ def load_data_for_chart(name, known_md5):
return None
with open(target_filename, "rb") as fid:
data = json.load(fid)
- chart_data = [{"value": x} for x in data["series"]["V1"]["raw"]]
+
+ chart_data = {"time": data["time"] if "time" in data else None, "values":
+ data["series"]}
return {"chart_data": chart_data}