From 3a2404010f8c0fdb3a9e9940202f59b84cb2791f Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg <gertjanvandenburg@gmail.com>
Date: Mon, 10 Jun 2019 14:34:48 +0100
Subject: Validate data according to a schema

It became clear that a formal schema would
make it easier to validate data. This is now
added and the code is updated to work with
this schema.
---
 app/utils/dataset_schema.json | 130 ++++++++++++++++++++++++++++++++++++++++++
 app/utils/datasets.py         |  93 ++++++++++++------------------
 2 files changed, 168 insertions(+), 55 deletions(-)
 create mode 100644 app/utils/dataset_schema.json

(limited to 'app/utils')

diff --git a/app/utils/dataset_schema.json b/app/utils/dataset_schema.json
new file mode 100644
index 0000000..2aec504
--- /dev/null
+++ b/app/utils/dataset_schema.json
@@ -0,0 +1,130 @@
+{
+	"definitions": {},
+	"$schema": "http://json-schema.org/draft-07/schema#",
+	"$id": "http://example.com/root.json",
+	"type": "object",
+	"title": "Dataset Schema",
+	"default": null,
+	"required": [
+		"name",
+		"n_obs",
+		"n_dim",
+		"series"
+	],
+	"properties": {
+		"name": {
+			"$id": "#/properties/name",
+			"type": "string",
+			"title": "The Name Schema",
+			"default": "",
+			"pattern": "^(.*)$"
+		},
+		"n_obs": {
+			"$id": "#/properties/n_obs",
+			"type": "integer",
+			"title": "The N_obs Schema",
+			"default": 0
+		},
+		"n_dim": {
+			"$id": "#/properties/n_dim",
+			"type": "integer",
+			"title": "The N_dim Schema",
+			"default": 0
+		},
+		"demo": {
+			"$id": "#/properties/demo",
+			"type": "object",
+			"title": "The Demo Schema",
+			"properties": {
+				"true_CPs": {
+					"$id": "#/properties/demo/properties/true_CPs",
+					"type": "array",
+					"items": {
+						"$id": "#/properties/demo/properties/true_CPs/items",
+						"type": "integer",
+						"title": "The Items Schema",
+						"default": null
+					}
+				}
+			}
+		},
+		"time": {
+			"$id": "#/properties/time",
+			"type": "object",
+			"title": "The Time Schema",
+			"default": null,
+			"required": [
+				"type",
+				"format",
+				"raw"
+			],
+			"properties": {
+				"type": {
+					"$id": "#/properties/time/properties/type",
+					"type": "string",
+					"title": "The Type Schema",
+					"default": "",
+					"pattern": "^(.*)$"
+				},
+				"format": {
+					"$id": "#/properties/time/properties/format",
+					"type": "string",
+					"title": "The Format Schema",
+					"default": "",
+					"pattern": "^(.*)$"
+				},
+				"raw": {
+					"$id": "#/properties/time/properties/raw",
+					"type": "array",
+					"title": "The Raw Schema",
+					"items": {
+						"$id": "#/properties/time/properties/raw/items",
+						"title": "The Items Schema",
+						"default": ""
+					}
+				}
+			}
+		},
+		"series": {
+			"$id": "#/properties/series",
+			"type": "array",
+			"title": "The Series Schema",
+			"items": {
+				"$id": "#/properties/series/items",
+				"type": "object",
+				"title": "The Variable Schema",
+				"default": null,
+				"properties": {
+					"label": {
+						"$id": "#/properties/series/items/properties/label",
+						"type": "string",
+						"title": "The Label Schema",
+						"default": "",
+						"pattern": "^(.*)$"
+					},
+					"type": {
+						"$id": "#/properties/series/items/properties/type",
+						"type": "string",
+						"title": "The Type Schema",
+						"default": "",
+						"pattern": "^(.*)$"
+					},
+					"raw": {
+						"$id": "#/properties/series/items/properties/raw",
+						"type": "array",
+						"title": "The Raw Schema",
+						"items": {
+							"$id": "#/properties/series/items/properties/raw/items",
+							"title": "The Items Schema",
+							"default": 0
+						}
+					}
+				},
+				"required": [
+					"type",
+					"raw"
+				]
+			}
+		}
+	}
+}
diff --git a/app/utils/datasets.py b/app/utils/datasets.py
index 1fef85f..16647f0 100644
--- a/app/utils/datasets.py
+++ b/app/utils/datasets.py
@@ -3,28 +3,9 @@
 """
 Dataset handling
 
-The dataset model is a JSON object as follows:
-
-    {
-        "name": "name of the dataset",
-        "n_obs": number of observations,
-        "n_dim": number of dimensions,
-        "series": {
-            "V1": {
-                "type": "float",
-                "raw": [list of observations]
-                },
-            "V2": {
-                "type": "int",
-                "raw": [list of observations]
-            },
-            "V3": {
-                "type": "category",
-                "levels": ["A", "B", "C"],
-                "raw": [list of observations]
-                }
-            }
-    }
+The dataset model is defined in the adjacent 'dataset_schema.json' file, which
+is a JSON Schema (draft-07) definition. It can be edited easily at
+www.jsonschema.net or yapi.demo.qunar.com/editor/
 
 Missing values must be denoted by 'NaN' (this is understood by the JSON 
 decoder).
@@ -35,56 +16,56 @@ Author: Gertjan van den Burg
 
 import hashlib
 import json
+import jsonschema
 import logging
 import os
-import re
 
 from flask import current_app
 
 LOGGER = logging.getLogger(__file__)
 
 
+def load_schema():
+    """Load the dataset JSON schema stored next to this module.
+
+    Raises FileNotFoundError (from open) when the schema file is missing.
+    """
+    basedir = os.path.dirname(os.path.abspath(__file__))
+    schema_file = os.path.join(basedir, "dataset_schema.json")
+    with open(schema_file, "rb") as fp:
+        return json.load(fp)
+
+
 def validate_dataset(filename):
-    """ Validate a dataset uploaded to the webapp
-    Return None on success and a string error on failure
-    """
+    """Validate a dataset file; return None if valid, else an error string."""
+    if not os.path.exists(filename):
+        return "File not found."
 
-    with open(filename, "rb") as fid:
+    with open(filename, "rb") as fp:
         try:
-            data = json.load(fid)
+            data = json.load(fp)
         except json.JSONDecodeError as err:
             return "JSON decoding error: %s" % err.msg
 
-    required_keys = ["name", "n_obs", "n_dim", "series"]
-    for key in required_keys:
-        if not key in data:
-            return "Required key missing: %s" % key
+    try:
+        schema = load_schema()
+    except FileNotFoundError:
+        return "Schema file not found."
 
-    if not re.fullmatch("\w+", data["name"]):
-        return "Name can only contain characters in the set [a-zA-Z0-9_]"
+    try:
+        jsonschema.validate(instance=data, schema=schema)
+    except jsonschema.ValidationError as err:
+        return "JSONSchema validation error: %s" % err.message
 
     if len(data["series"]) != data["n_dim"]:
         return "Number of dimensions and number of series don't match"
 
-    required_keys = ["type", "raw"]
-    for idx, var in enumerate(data["series"]):
-        if not var == "V%i" % (idx + 1):
-            return "Unexpected variable name, expected 'V<int>', got %s" % var
-        vardict = data["series"][var]
-        for key in required_keys:
-            if not key in vardict:
-                return "Key '%s' missing for variable '%s'" % (key, var)
-        if vardict["type"] == "category":
-            if not "levels" in vardict:
-                return (
-                    "Variable '%s' has categorical type but 'levels' is missing"
-                    % (var)
-                )
-        if not len(vardict["raw"]) == data["n_obs"]:
-            return (
-                "Length of data for variable '%s' not equal to n_obs = %i"
-                % (var, data["n_obs"])
-            )
+    if "time" in data and len(data["time"]["raw"]) != data["n_obs"]:
+        return "Number of time points doesn't match number of observations"
+    for idx, var in enumerate(data["series"]):
+        if len(var["raw"]) != data["n_obs"]:
+            label = var.get("label", "series %i" % idx)
+            return "Number of observations doesn't match for %s" % label
 
     return None
 
@@ -98,7 +79,7 @@ def get_name_from_dataset(filename):
 def dataset_is_demo(filename):
     with open(filename, "rb") as fid:
         data = json.load(fid)
-    return "demo" in data
+    return "demo" in data.keys()  # True iff a top-level "demo" key exists
 
 
 def get_demo_true_cps(name):
@@ -153,5 +134,7 @@ def load_data_for_chart(name, known_md5):
         return None
     with open(target_filename, "rb") as fid:
         data = json.load(fid)
-    chart_data = [{"value": x} for x in data["series"]["V1"]["raw"]]
+
+    chart_data = {"time": data["time"] if "time" in data else None, "values": 
+            data["series"]}
     return {"chart_data": chart_data}
-- 
cgit v1.2.3