author     Gertjan van den Burg <gertjanvandenburg@gmail.com>   2020-03-10 12:27:53 +0000
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>   2020-03-10 12:27:53 +0000
commit     7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch)
tree       10aa6710599230c889ec44407a065ee303a79348 /utils
Initial commit
Diffstat (limited to 'utils')
-rw-r--r--  utils/check_checksums.py   85
-rw-r--r--  utils/validate_dataset.py  147
2 files changed, 232 insertions, 0 deletions
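
Based on the argument parsers defined below, the two utilities would be invoked roughly as follows; the checksum file, schema file, and dataset directory names are placeholders, and validate_dataset.py additionally requires the jsonschema package:

    python utils/check_checksums.py -c checksums.json -d datasets -v
    python utils/validate_dataset.py -s schema.json -d datasets -v
    python utils/validate_dataset.py path/to/series.json    # validate a single file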
diff --git a/utils/check_checksums.py b/utils/check_checksums.py
new file mode 100644
index 0000000..01dcd99
--- /dev/null
+++ b/utils/check_checksums.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the datasets by checksum
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import hashlib
+import os
+import json
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-c", "--checksum-file", help="Checksum file (json)", required=True
+    )
+    parser.add_argument(
+        "-d", "--dataset-dir", help="Dataset directory", required=True
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose mode", action="store_true"
+    )
+    return parser.parse_args()
+
+
+def md5sum(filename):
+    with open(filename, "rb") as fp:
+        data = fp.read()
+    return hashlib.md5(data).hexdigest()
+
+
+def load_checksums(checksum_file):
+    with open(checksum_file, "r") as fp:
+        checksums = json.load(fp)
+    assert checksums["kind"] == "md5"
+    return checksums["checksums"]
+
+
+def find_datafiles(dataset_dir):
+    data_files = {}
+
+    datadirs = os.listdir(dataset_dir)
+    for ddir in datadirs:
+        pth = os.path.join(dataset_dir, ddir)
+        files = os.listdir(pth)
+        json_files = [f for f in files if f.endswith(".json")]
+        for jf in json_files:
+            jfpath = os.path.join(pth, jf)
+            if jf in data_files:
+                raise KeyError("Duplicate data file '%s'?" % jfpath)
+            data_files[jf] = jfpath
+
+    return data_files
+
+
+def main():
+    args = parse_args()
+
+    log = lambda *a, **kw: print(*a, **kw) if args.verbose else None
+
+    checksums = load_checksums(args.checksum_file)
+    data_files = find_datafiles(args.dataset_dir)
+
+    for fname in checksums:
+        log("Checking %s" % fname)
+        if fname not in data_files:
+            raise FileNotFoundError("Missing data file: %s" % fname)
+        md5 = md5sum(data_files[fname])
+        if md5 != checksums[fname]:
+            raise ValueError(
+                "Checksums don't match for file: %s" % (data_files[fname])
+            )
+
+    log("All ok.")
+
+
+if __name__ == "__main__":
+    main()
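
For reference, load_checksums() above expects a JSON file whose top-level "kind" is "md5" and whose "checksums" object maps data file names to their MD5 digests. A minimal sketch of producing such a file, assuming the datasets/<name>/<name>.json layout that find_datafiles() scans (the directory and output file names here are illustrative):

    import glob
    import hashlib
    import json
    import os

    # Build a checksum file in the shape load_checksums() reads:
    # {"kind": "md5", "checksums": {"<file>.json": "<md5 hex digest>", ...}}
    entries = {}
    for path in glob.glob(os.path.join("datasets", "*", "*.json")):
        with open(path, "rb") as fp:
            entries[os.path.basename(path)] = hashlib.md5(fp.read()).hexdigest()

    with open("checksums.json", "w") as fp:
        json.dump({"kind": "md5", "checksums": entries}, fp, indent=2)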
diff --git a/utils/validate_dataset.py b/utils/validate_dataset.py
new file mode 100644
index 0000000..5174936
--- /dev/null
+++ b/utils/validate_dataset.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the dataset schema of a given file.
+
+Note that this script requires the ``jsonschema`` package.
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD. See the LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import json
+import jsonschema
+import os
+import sys
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--schema-file",
+        help="Schema file to use",
+        default="./schema.json",
+    )
+    parser.add_argument("-d", "--dataset-dir", help="Dataset directory")
+    parser.add_argument(
+        "datafile", help="JSON file with a TCPD time series", nargs="?"
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose mode", action="store_true"
+    )
+    return parser.parse_args()
+
+
+def load_schema(schema_file):
+    if not os.path.exists(schema_file):
+        raise FileNotFoundError(schema_file)
+    with open(schema_file, "rb") as fp:
+        schema = json.load(fp)
+    return schema
+
+
+def find_datafiles(dataset_dir):
+    data_files = {}
+
+    datadirs = os.listdir(dataset_dir)
+    for ddir in datadirs:
+        pth = os.path.join(dataset_dir, ddir)
+        files = os.listdir(pth)
+        json_files = [f for f in files if f.endswith(".json")]
+        for jf in json_files:
+            jfpath = os.path.join(pth, jf)
+            if jf in data_files:
+                raise KeyError("Duplicate data file '%s'?" % jfpath)
+            data_files[jf] = jfpath
+
+    return data_files
+
+
+def validate_dataset(filename, schema_file=None):
+ """Validate a dataset file against the schema and other requirements
+ """
+ if not os.path.exists(filename):
+ return "File not found."
+
+ with open(filename, "rb") as fp:
+ try:
+ data = json.load(fp)
+ except json.JSONDecodeError as err:
+ return "JSON decoding error: %s" % err.msg
+
+ try:
+ schema = load_schema(schema_file)
+ except FileNotFoundError:
+ return "Schema file not found."
+
+ try:
+ jsonschema.validate(instance=data, schema=schema)
+ except jsonschema.ValidationError as err:
+ return "JSONSchema validation error: %s" % err.message
+
+ if len(data["series"]) != data["n_dim"]:
+ return "Number of dimensions and number of series don't match"
+
+ if "time" in data.keys():
+ if not "format" in data["time"] and "raw" in data["time"]:
+ return "'raw' must be accompanied by format"
+ if "format" in data["time"] and not "raw" in data["time"]:
+ return "Format must be accompanied by 'raw'"
+ if "index" in data["time"]:
+ if not data["time"]["index"][0] == 0:
+ return "Index should start at zero."
+ if not len(data["time"]["index"]) == data["n_obs"]:
+ return "Number of indices must match number of observations"
+ if "raw" in data["time"]:
+ if len(data["time"]["raw"]) != data["n_obs"]:
+ return "Number of time points doesn't match number of observations"
+ if None in data["time"]["raw"]:
+ return "Null is not supported in time axis. Use 'NaN' instead."
+
+ has_missing = False
+ for var in data["series"]:
+ if len(var["raw"]) != data["n_obs"]:
+ return "Number of observations doesn't match for %s" % var["label"]
+ if float("nan") in var["raw"]:
+ return "NaN is not supported in series. Use null instead."
+ has_missing = has_missing or any(map(lambda x: x is None, var["raw"]))
+
+ # this doesn't exist yet, so let's not implement it until we need it.
+ if data["n_dim"] > 1 and has_missing:
+ return "Missing values are not yet supported for multidimensional data"
+
+ return None
+
+
+def main():
+    args = parse_args()
+
+    log = lambda *a, **kw: print(*a, **kw) if args.verbose else None
+
+    if args.dataset_dir:
+        datafiles = find_datafiles(args.dataset_dir)
+        for dset in datafiles:
+            log("Validating %s" % dset)
+            result = validate_dataset(
+                datafiles[dset], schema_file=args.schema_file
+            )
+            if result is not None:
+                print(
+                    "Dataset: %s. Error: %s" % (dset, result), file=sys.stderr
+                )
+                raise SystemExit(1)
+    else:
+        result = validate_dataset(args.datafile, schema_file=args.schema_file)
+        if result is not None:
+            print("Error: %s" % result, file=sys.stderr)
+            raise SystemExit(1)
+    log("Validation passed.")
+
+
+if __name__ == "__main__":
+    main()
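
The explicit checks in validate_dataset() imply roughly the following shape for a data file. The sketch below only covers the fields the script itself inspects ("n_obs", "n_dim", "time", "series"); the values and the label are made up, and the actual schema.json may impose further requirements:

    # Illustrative record that passes the checks coded in validate_dataset();
    # labels and values are invented, and schema.json may demand more fields.
    example = {
        "n_obs": 3,
        "n_dim": 1,
        "time": {"index": [0, 1, 2]},  # starts at 0, one entry per observation
        "series": [
            # null/None marks a missing value; NaN in a series is rejected
            {"label": "V1", "raw": [1.0, None, 2.5]},
        ],
    }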