aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2019-06-10 14:34:48 +0100
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2019-06-10 14:34:48 +0100
commit3a2404010f8c0fdb3a9e9940202f59b84cb2791f (patch)
tree3de6bfe5db4bf623e38d335899a715d3e8c65c76
parentUse tojson instead of safe (diff)
downloadAnnotateChange-3a2404010f8c0fdb3a9e9940202f59b84cb2791f.tar.gz
AnnotateChange-3a2404010f8c0fdb3a9e9940202f59b84cb2791f.zip
Validate data according to a schema
It became clear that a formal schema would make it easier to validate data. This is now added and the code is updated to work with this schema.
-rw-r--r--app/static/js/makeChart.js12
-rw-r--r--app/utils/dataset_schema.json130
-rw-r--r--app/utils/datasets.py93
-rw-r--r--poetry.lock31
-rw-r--r--pyproject.toml1
5 files changed, 205 insertions, 62 deletions
diff --git a/app/static/js/makeChart.js b/app/static/js/makeChart.js
index 85d743b..1a0283d 100644
--- a/app/static/js/makeChart.js
+++ b/app/static/js/makeChart.js
@@ -3,10 +3,12 @@
function preprocessData(data) {
var n = 0;
- data.forEach(function(d) {
- d.X = n++;
- d.Y = d.value;
- });
+ // Build the chart points {X: index, Y: value} from the raw observations
+ // of the first series in data.values. All locals are declared with var so
+ // that cleanData, i and the raw array do not leak into the global scope.
+ var cleanData = [];
+ var raw = data.values[0].raw;
+ for (var i = 0; i < raw.length; i++) {
+ cleanData.push({"X": n++, "Y": raw[i]});
+ }
+ return cleanData;
}
function scaleAndAxis(data, width, height) {
@@ -50,7 +52,7 @@ function noZoom() {
function baseChart(selector, data, clickFunction, annotations, annotationFunction) {
// preprocess the data
- preprocessData(data);
+ data = preprocessData(data);
var divWidth = 1000;
var divHeight = 480;
diff --git a/app/utils/dataset_schema.json b/app/utils/dataset_schema.json
new file mode 100644
index 0000000..2aec504
--- /dev/null
+++ b/app/utils/dataset_schema.json
@@ -0,0 +1,130 @@
+{
+ "definitions": {},
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "$id": "http://example.com/root.json",
+ "type": "object",
+ "title": "Dataset Schema",
+ "default": null,
+ "required": [
+ "name",
+ "n_obs",
+ "n_dim",
+ "series"
+ ],
+ "properties": {
+ "name": {
+ "$id": "#/properties/name",
+ "type": "string",
+ "title": "The Name Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "n_obs": {
+ "$id": "#/properties/n_obs",
+ "type": "integer",
+ "title": "The N_obs Schema",
+ "default": 0
+ },
+ "n_dim": {
+ "$id": "#/properties/n_dim",
+ "type": "integer",
+ "title": "The N_dim Schema",
+ "default": 0
+ },
+ "demo": {
+ "$id": "#/properties/demo",
+ "type": "object",
+ "title": "The Demo Schema",
+ "properties": {
+ "true_CPs": {
+ "$id": "#/properties/demo/properties/true_CPs",
+ "type": "array",
+ "items": {
+ "$id": "#/properties/demo/properties/true_CPs/items",
+ "type": "integer",
+ "title": "The Items Schema",
+ "default": null
+ }
+ }
+ }
+ },
+ "time": {
+ "$id": "#/properties/time",
+ "type": "object",
+ "title": "The Time Schema",
+ "default": null,
+ "required": [
+ "type",
+ "format",
+ "raw"
+ ],
+ "properties": {
+ "type": {
+ "$id": "#/properties/time/properties/type",
+ "type": "string",
+ "title": "The Type Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "format": {
+ "$id": "#/properties/time/properties/format",
+ "type": "string",
+ "title": "The Format Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "raw": {
+ "$id": "#/properties/time/properties/raw",
+ "type": "array",
+ "title": "The Raw Schema",
+ "items": {
+ "$id": "#/properties/time/properties/raw/items",
+ "title": "The Items Schema",
+ "default": ""
+ }
+ }
+ }
+ },
+ "series": {
+ "$id": "#/properties/series",
+ "type": "array",
+ "title": "The Series Schema",
+ "items": {
+ "$id": "#/properties/series/items",
+ "type": "object",
+ "title": "The Variable Schema",
+ "default": null,
+ "properties": {
+ "label": {
+ "$id": "#/properties/series/items/properties/label",
+ "type": "string",
+ "title": "The Label Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "type": {
+ "$id": "#/properties/series/items/properties/type",
+ "type": "string",
+ "title": "The Type Schema",
+ "default": "",
+ "pattern": "^(.*)$"
+ },
+ "raw": {
+ "$id": "#/properties/series/items/properties/raw",
+ "type": "array",
+ "title": "The Raw Schema",
+ "items": {
+ "$id": "#/properties/series/items/properties/raw/items",
+ "title": "The Items Schema",
+ "default": 0
+ }
+ }
+ },
+ "required": [
+ "type",
+ "raw"
+ ]
+ }
+ }
+ }
+}
diff --git a/app/utils/datasets.py b/app/utils/datasets.py
index 1fef85f..16647f0 100644
--- a/app/utils/datasets.py
+++ b/app/utils/datasets.py
@@ -3,28 +3,9 @@
"""
Dataset handling
-The dataset model is a JSON object as follows:
-
- {
- "name": "name of the dataset",
- "n_obs": number of observations,
- "n_dim": number of dimensions,
- "series": {
- "V1": {
- "type": "float",
- "raw": [list of observations]
- },
- "V2": {
- "type": "int",
- "raw": [list of observations]
- },
- "V3": {
- "type": "category",
- "levels": ["A", "B", "C"],
- "raw": [list of observations]
- }
- }
- }
+The dataset model is defined in the adjacent 'dataset_schema.json' file, which
+is a JSON Schema definition. It can be edited online at, for example,
+www.jsonschema.net or yapi.demo.qunar.com/editor/
Missing values must be denoted by 'NaN' (this is understood by the JSON
decoder).
@@ -35,56 +16,56 @@ Author: Gertjan van den Burg
import hashlib
import json
+import jsonschema
import logging
import os
-import re
from flask import current_app
LOGGER = logging.getLogger(__file__)
+def load_schema():
+ """Load the dataset JSON Schema stored next to this module.
+
+ Reads 'dataset_schema.json' from the directory that contains this file
+ and returns the parsed JSON document. Raises FileNotFoundError (carrying
+ the schema path) when that file does not exist.
+ """
+ pth = os.path.abspath(__file__)
+ basedir = os.path.dirname(pth)
+ schema_file = os.path.join(basedir, "dataset_schema.json")
+ if not os.path.exists(schema_file):
+ raise FileNotFoundError(schema_file)
+ with open(schema_file, "rb") as fp:
+ schema = json.load(fp)
+ return schema
+
+
def validate_dataset(filename):
- """ Validate a dataset uploaded to the webapp
- Return None on success and a string error on failure
- """
+ """Validate a dataset file against the dataset JSON Schema.
+
+ Returns None when the file is valid, otherwise a string describing the
+ first problem found: a missing file, a JSON decoding error, a missing
+ schema file, a schema violation, or a length that disagrees with the
+ declared n_dim or n_obs.
+ """
+ if not os.path.exists(filename):
+ return "File not found."
- with open(filename, "rb") as fid:
+ with open(filename, "rb") as fp:
try:
- data = json.load(fid)
+ data = json.load(fp)
except json.JSONDecodeError as err:
return "JSON decoding error: %s" % err.msg
- required_keys = ["name", "n_obs", "n_dim", "series"]
- for key in required_keys:
- if not key in data:
- return "Required key missing: %s" % key
+ try:
+ schema = load_schema()
+ except FileNotFoundError:
+ return "Schema file not found."
- if not re.fullmatch("\w+", data["name"]):
- return "Name can only contain characters in the set [a-zA-Z0-9_]"
+ try:
+ jsonschema.validate(instance=data, schema=schema)
+ except jsonschema.ValidationError as err:
+ # jsonschema errors carry their text in 'message'; they have no 'msg'
+ return "JSONSchema validation error: %s" % err.message
if len(data["series"]) != data["n_dim"]:
return "Number of dimensions and number of series don't match"
- required_keys = ["type", "raw"]
- for idx, var in enumerate(data["series"]):
- if not var == "V%i" % (idx + 1):
- return "Unexpected variable name, expected 'V<int>', got %s" % var
- vardict = data["series"][var]
- for key in required_keys:
- if not key in vardict:
- return "Key '%s' missing for variable '%s'" % (key, var)
- if vardict["type"] == "category":
- if not "levels" in vardict:
- return (
- "Variable '%s' has categorical type but 'levels' is missing"
- % (var)
- )
- if not len(vardict["raw"]) == data["n_obs"]:
- return (
- "Length of data for variable '%s' not equal to n_obs = %i"
- % (var, data["n_obs"])
- )
+ if "time" in data:
+ if len(data["time"]["raw"]) != data["n_obs"]:
+ return "Number of time points doesn't match number of observations"
+
+ for idx, var in enumerate(data["series"]):
+ if len(var["raw"]) != data["n_obs"]:
+ # 'label' is optional in the schema, so fall back to the position
+ label = var.get("label", "series %i" % idx)
+ return "Number of observations doesn't match for %s" % label
return None
@@ -98,7 +79,7 @@ def get_name_from_dataset(filename):
def dataset_is_demo(filename):
+ """Return True when the JSON dataset in 'filename' has a top-level "demo" key."""
with open(filename, "rb") as fid:
data = json.load(fid)
- return "demo" in data
+ return "demo" in data.keys()
def get_demo_true_cps(name):
@@ -153,5 +134,7 @@ def load_data_for_chart(name, known_md5):
return None
with open(target_filename, "rb") as fid:
data = json.load(fid)
- chart_data = [{"value": x} for x in data["series"]["V1"]["raw"]]
+
+ chart_data = {"time": data["time"] if "time" in data else None, "values":
+ data["series"]}
return {"chart_data": chart_data}
diff --git a/poetry.lock b/poetry.lock
index 2ce758e..48b0d66 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -29,7 +29,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
version = "1.3.0"
[[package]]
-category = "dev"
+category = "main"
description = "Classes Without Boilerplate"
name = "attrs"
optional = false
@@ -237,6 +237,20 @@ MarkupSafe = ">=0.23"
[[package]]
category = "main"
+description = "An implementation of JSON Schema validation for Python"
+name = "jsonschema"
+optional = false
+python-versions = "*"
+version = "3.0.1"
+
+[package.dependencies]
+attrs = ">=17.4.0"
+pyrsistent = ">=0.14.0"
+setuptools = "*"
+six = ">=1.11.0"
+
+[[package]]
+category = "main"
description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
name = "mako"
optional = false
@@ -314,6 +328,17 @@ python-versions = "*"
version = "0.9.3"
[[package]]
+category = "main"
+description = "Persistent/Functional/Immutable data structures"
+name = "pyrsistent"
+optional = false
+python-versions = "*"
+version = "0.15.2"
+
+[package.dependencies]
+six = "*"
+
+[[package]]
category = "dev"
description = "pytest: simple powerful testing with Python"
name = "pytest"
@@ -391,7 +416,7 @@ python-versions = "*"
version = "2.2.1"
[metadata]
-content-hash = "0055b739d3afaaf6ffa9ca7720fc6869f5cb4d0b338beb28aa3d8c2b6f1e4bd9"
+content-hash = "4322d6cbe5b122bfc42ba5f35a3d465631c8ec8eb78dddadb17b01156e5805fa"
python-versions = "^3.7"
[metadata.hashes]
@@ -418,6 +443,7 @@ gunicorn = ["aa8e0b40b4157b36a5df5e599f45c9c76d6af43845ba3b3b0efe2c70473c2471",
idna = ["c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", "ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c"]
itsdangerous = ["321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19", "b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"]
jinja2 = ["74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", "f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4"]
+jsonschema = ["0c0a81564f181de3212efa2d17de1910f8732fa1b71c42266d983cd74304e20d", "a5f6559964a3851f59040d3b961de5e68e70971afb88ba519d27e6a039efff1a"]
mako = ["4e02fde57bd4abb5ec400181e4c314f56ac3e49ba4fb8b0d50bba18cb27d25ae"]
markdown = ["2e50876bcdd74517e7b71f3e7a76102050edec255b3983403f1a63e7c8a41e7a", "56a46ac655704b91e5b7e6326ce43d5ef72411376588afa1dd90e881b83c7e8c"]
markupsafe = ["00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473", "09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", "24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", "62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", "6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", "7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905", "88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735", "8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d", "98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e", "9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d", "9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c", "ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21", "b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2", "b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5", "b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b", "ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", "e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7"]
@@ -427,6 +453,7 @@ py = ["64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", "dc639
pycparser = ["a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3"]
pyjwt = ["5c6eca3c2940464d106b99ba83b00c6add741c9becaec087fb7ccdefea71350e", "8d59a976fb773f3e6a39c85636357c4f0e242707394cadadd9814f5cbaa20e96"]
pymysql = ["3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a", "d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"]
+pyrsistent = ["16692ee739d42cf5e39cef8d27649a8c1fdb7aa99887098f1460057c5eb75c3a"]
pytest = ["3f193df1cfe1d1609d4c583838bea3d532b18d6160fd3f55c9447fdca30848ec", "e246cf173c01169b9617fc07264b7b1316e78d7a650055235d6d897bc80d9660"]
python-dateutil = ["7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", "c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"]
python-editor = ["1bf6e860a8ad52a14c3ee1252d5dc25b2030618ed80c022598f00176adc8367d", "51fda6bcc5ddbbb7063b2af7509e43bd84bfc32a4ff71349ec7847713882327b", "5f98b069316ea1c2ed3f67e7f5df6c0d8f10b689964a4a811ff64f0106819ec8", "c3da2053dbab6b29c94e43c486ff67206eafbe7eb52dbec7390b5e2fb05aac77", "ea87e17f6ec459e780e4221f295411462e0d0810858e055fc514684350a2f522"]
diff --git a/pyproject.toml b/pyproject.toml
index 8544bc6..f26df38 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ gunicorn = "^19.9"
pymysql = "^0.9.3"
cryptography = "^2.6"
markdown = "^3.1"
+jsonschema = "^3.0"
[tool.poetry.dev-dependencies]
pytest = "^3.0"