author     Gertjan van den Burg <gertjanvandenburg@gmail.com>   2020-03-10 12:27:53 +0000
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>   2020-03-10 12:27:53 +0000
commit     7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch)
tree       10aa6710599230c889ec44407a065ee303a79348 /utils
Initial commit
Diffstat (limited to 'utils')
-rw-r--r--  utils/check_checksums.py   85
-rw-r--r--  utils/validate_dataset.py  147
2 files changed, 232 insertions, 0 deletions
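
Based on the argument parsers defined below, the two utilities would be invoked roughly as follows; the checksum file, schema file, and dataset directory names are placeholders, and validate_dataset.py additionally requires the jsonschema package:

    python utils/check_checksums.py -c checksums.json -d datasets -v
    python utils/validate_dataset.py -s schema.json -d datasets -v
    python utils/validate_dataset.py path/to/series.json    # validate a single file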
diff --git a/utils/check_checksums.py b/utils/check_checksums.py
new file mode 100644
index 0000000..01dcd99
--- /dev/null
+++ b/utils/check_checksums.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the datasets by checksum
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import hashlib
+import os
+import json
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-c", "--checksum-file", help="Checksum file (json)", required=True
+    )
+    parser.add_argument(
+        "-d", "--dataset-dir", help="Dataset directory", required=True
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose mode", action="store_true"
+    )
+    return parser.parse_args()
+
+
+def md5sum(filename):
+    with open(filename, "rb") as fp:
+        data = fp.read()
+    return hashlib.md5(data).hexdigest()
+
+
+def load_checksums(checksum_file):
+    with open(checksum_file, "r") as fp:
+        checksums = json.load(fp)
+    assert checksums["kind"] == "md5"
+    return checksums["checksums"]
+
+
+def find_datafiles(dataset_dir):
+    data_files = {}
+
+    datadirs = os.listdir(dataset_dir)
+    for ddir in datadirs:
+        pth = os.path.join(dataset_dir, ddir)
+        files = os.listdir(pth)
+        json_files = [f for f in files if f.endswith(".json")]
+        for jf in json_files:
+            jfpath = os.path.join(pth, jf)
+            if jf in data_files:
+                raise KeyError("Duplicate data file '%s'?" % jfpath)
+            data_files[jf] = jfpath
+
+    return data_files
+
+
+def main():
+    args = parse_args()
+
+    log = lambda *a, **kw: print(*a, **kw) if args.verbose else None
+
+    checksums = load_checksums(args.checksum_file)
+    data_files = find_datafiles(args.dataset_dir)
+
+    for fname in checksums:
+        log("Checking %s" % fname)
+        if fname not in data_files:
+            raise FileNotFoundError("Missing data file: %s" % fname)
+        md5 = md5sum(data_files[fname])
+        if md5 != checksums[fname]:
+            raise ValueError(
+                "Checksums don't match for file: %s" % (data_files[fname])
+            )
+
+    log("All ok.")
+
+
+if __name__ == "__main__":
+    main()
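
For reference, load_checksums() above expects a JSON file whose top-level "kind" is "md5" and whose "checksums" object maps data file names to their MD5 digests. A minimal sketch of producing such a file, assuming the datasets/<name>/<name>.json layout that find_datafiles() scans (the directory and output file names here are illustrative):

    import glob
    import hashlib
    import json
    import os

    # Build a checksum file in the shape load_checksums() reads:
    # {"kind": "md5", "checksums": {"<file>.json": "<md5 hex digest>", ...}}
    entries = {}
    for path in glob.glob(os.path.join("datasets", "*", "*.json")):
        with open(path, "rb") as fp:
            entries[os.path.basename(path)] = hashlib.md5(fp.read()).hexdigest()

    with open("checksums.json", "w") as fp:
        json.dump({"kind": "md5", "checksums": entries}, fp, indent=2)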
diff --git a/utils/validate_dataset.py b/utils/validate_dataset.py
new file mode 100644
index 0000000..5174936
--- /dev/null
+++ b/utils/validate_dataset.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the dataset schema of a given file.
+
+Note that this script requires the ``jsonschema`` package.
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD. See the LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import json
+import jsonschema
+import os
+import sys
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--schema-file",
+        help="Schema file to use",
+        default="./schema.json",
+    )
+    parser.add_argument("-d", "--dataset-dir", help="Dataset directory")
+    parser.add_argument(
+        "datafile", help="JSON file with a TCPD time series", nargs="?"
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose mode", action="store_true"
+    )
+    return parser.parse_args()
+
+
+def load_schema(schema_file):
+    if not os.path.exists(schema_file):
+        raise FileNotFoundError(schema_file)
+    with open(schema_file, "rb") as fp:
+        schema = json.load(fp)
+    return schema
+
+
+def find_datafiles(dataset_dir):
+    data_files = {}
+
+    datadirs = os.listdir(dataset_dir)
+    for ddir in datadirs:
+        pth = os.path.join(dataset_dir, ddir)
+        files = os.listdir(pth)
+        json_files = [f for f in files if f.endswith(".json")]
+        for jf in json_files:
+            jfpath = os.path.join(pth, jf)
+            if jf in data_files:
+                raise KeyError("Duplicate data file '%s'?" % jfpath)
+            data_files[jf] = jfpath
+
+    return data_files
+
+
+def validate_dataset(filename, schema_file=None):
+ """Validate a dataset file against the schema and other requirements
+ """
+ if not os.path.exists(filename):
+ return "File not found."
+
+ with open(filename, "rb") as fp:
+ try:
+ data = json.load(fp)
+ except json.JSONDecodeError as err:
+ return "JSON decoding error: %s" % err.msg
+
+ try:
+ schema = load_schema(schema_file)
+ except FileNotFoundError:
+ return "Schema file not found."
+
+ try:
+ jsonschema.validate(instance=data, schema=schema)
+ except jsonschema.ValidationError as err:
+ return "JSONSchema validation error: %s" % err.message
+
+ if len(data["series"]) != data["n_dim"]:
+ return "Number of dimensions and number of series don't match"
+
+ if "time" in data.keys():
+ if not "format" in data["time"] and "raw" in data["time"]:
+ return "'raw' must be accompanied by format"
+ if "format" in data["time"] and not "raw" in data["time"]:
+ return "Format must be accompanied by 'raw'"
+ if "index" in data["time"]:
+ if not data["time"]["index"][0] == 0:
+ return "Index should start at zero."
+ if not len(data["time"]["index"]) == data["n_obs"]:
+ return "Number of indices must match number of observations"
+ if "raw" in data["time"]:
+ if len(data["time"]["raw"]) != data["n_obs"]:
+ return "Number of time points doesn't match number of observations"
+ if None in data["time"]["raw"]:
+ return "Null is not supported in time axis. Use 'NaN' instead."
+
+ has_missing = False
+ for var in data["series"]:
+ if len(var["raw"]) != data["n_obs"]:
+ return "Number of observations doesn't match for %s" % var["label"]
+ if float("nan") in var["raw"]:
+ return "NaN is not supported in series. Use null instead."
+ has_missing = has_missing or any(map(lambda x: x is None, var["raw"]))
+
+ # this doesn't exist yet, so let's not implement it until we need it.
+ if data["n_dim"] > 1 and has_missing:
+ return "Missing values are not yet supported for multidimensional data"
+
+ return None
+
+
+def main():
+    args = parse_args()
+
+    log = lambda *a, **kw: print(*a, **kw) if args.verbose else None
+
+    if args.dataset_dir:
+        datafiles = find_datafiles(args.dataset_dir)
+        for dset in datafiles:
+            log("Validating %s" % dset)
+            result = validate_dataset(
+                datafiles[dset], schema_file=args.schema_file
+            )
+            if result is not None:
+                print(
+                    "Dataset: %s. Error: %s" % (dset, result), file=sys.stderr
+                )
+                raise SystemExit(1)
+    else:
+        result = validate_dataset(args.datafile, schema_file=args.schema_file)
+        if result is not None:
+            print("Error: %s" % result, file=sys.stderr)
+            raise SystemExit(1)
+    log("Validation passed.")
+
+
+if __name__ == "__main__":
+    main()
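
The explicit checks in validate_dataset() imply roughly the following shape for a data file. The sketch below only covers the fields the script itself inspects ("n_obs", "n_dim", "time", "series"); the values and the label are made up, and the actual schema.json may impose further requirements:

    # Illustrative record that passes the checks coded in validate_dataset();
    # labels and values are invented, and schema.json may demand more fields.
    example = {
        "n_obs": 3,
        "n_dim": 1,
        "time": {"index": [0, 1, 2]},  # starts at 0, one entry per observation
        "series": [
            # null/None marks a missing value; NaN in a series is rejected
            {"label": "V1", "raw": [1.0, None, 2.5]},
        ],
    }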