| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
| commit | 7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch) | |
| tree | 10aa6710599230c889ec44407a065ee303a79348 /utils | |
| download | TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.tar.gz, TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.zip | |
Initial commit
Diffstat (limited to 'utils')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | utils/check_checksums.py | 85 |
| -rw-r--r-- | utils/validate_dataset.py | 147 |
2 files changed, 232 insertions, 0 deletions
diff --git a/utils/check_checksums.py b/utils/check_checksums.py
new file mode 100644
index 0000000..01dcd99
--- /dev/null
+++ b/utils/check_checksums.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the datasets by checksum
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import hashlib
+import os
+import json
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-c", "--checksum-file", help="Checksum file (json)", required=True
+    )
+    parser.add_argument(
+        "-d", "--dataset-dir", help="Dataset directory", required=True
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose mode", action="store_true"
+    )
+    return parser.parse_args()
+
+
+def md5sum(filename):
+    with open(filename, "rb") as fp:
+        data = fp.read()
+    return hashlib.md5(data).hexdigest()
+
+
+def load_checksums(checksum_file):
+    with open(checksum_file, "r") as fp:
+        checksums = json.load(fp)
+    assert checksums["kind"] == "md5"
+    return checksums["checksums"]
+
+
+def find_datafiles(dataset_dir):
+    data_files = {}
+
+    datadirs = os.listdir(dataset_dir)
+    for ddir in datadirs:
+        pth = os.path.join(dataset_dir, ddir)
+        files = os.listdir(pth)
+        json_files = [f for f in files if f.endswith(".json")]
+        for jf in json_files:
+            jfpath = os.path.join(pth, jf)
+            if jf in data_files:
+                raise KeyError("Duplicate data file '%s'?" % jfpath)
+            data_files[jf] = jfpath
+
+    return data_files
+
+
+def main():
+    args = parse_args()
+
+    log = lambda *a, **kw: print(*a, **kw) if args.verbose else None
+
+    checksums = load_checksums(args.checksum_file)
+    data_files = find_datafiles(args.dataset_dir)
+
+    for fname in checksums:
+        log("Checking %s" % fname)
+        if not fname in data_files:
+            raise FileNotFoundError("Missing data file: %s" % fname)
+        md5 = md5sum(data_files[fname])
+        if not md5 == checksums[fname]:
+            raise ValueError(
+                "Checksums don't match for file: %s" % (data_files[fname])
+            )
+
+    log("All ok.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/validate_dataset.py b/utils/validate_dataset.py
new file mode 100644
index 0000000..5174936
--- /dev/null
+++ b/utils/validate_dataset.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the dataset schema of a given file.
+
+Note that this script requires the ``jsonschema`` package.
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD. See the LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import json
+import jsonschema
+import os
+import sys
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--schema-file",
+        help="Schema file to use",
+        default="./schema.json",
+    )
+    parser.add_argument("-d", "--dataset-dir", help="Dataset directory")
+    parser.add_argument(
+        "datafile", help="JSON file with a TCPD time series", nargs="?"
+    )
+    parser.add_argument(
+        "-v", "--verbose", help="Enable verbose mode", action="store_true"
+    )
+    return parser.parse_args()
+
+
+def load_schema(schema_file):
+    if not os.path.exists(schema_file):
+        raise FileNotFoundError(schema_file)
+    with open(schema_file, "rb") as fp:
+        schema = json.load(fp)
+    return schema
+
+
+def find_datafiles(dataset_dir):
+    data_files = {}
+
+    datadirs = os.listdir(dataset_dir)
+    for ddir in datadirs:
+        pth = os.path.join(dataset_dir, ddir)
+        files = os.listdir(pth)
+        json_files = [f for f in files if f.endswith(".json")]
+        for jf in json_files:
+            jfpath = os.path.join(pth, jf)
+            if jf in data_files:
+                raise KeyError("Duplicate data file '%s'?" % jfpath)
+            data_files[jf] = jfpath
+
+    return data_files
+
+
+def validate_dataset(filename, schema_file=None):
+    """Validate a dataset file against the schema and other requirements
+    """
+    if not os.path.exists(filename):
+        return "File not found."
+
+    with open(filename, "rb") as fp:
+        try:
+            data = json.load(fp)
+        except json.JSONDecodeError as err:
+            return "JSON decoding error: %s" % err.msg
+
+    try:
+        schema = load_schema(schema_file)
+    except FileNotFoundError:
+        return "Schema file not found."
+
+    try:
+        jsonschema.validate(instance=data, schema=schema)
+    except jsonschema.ValidationError as err:
+        return "JSONSchema validation error: %s" % err.message
+
+    if len(data["series"]) != data["n_dim"]:
+        return "Number of dimensions and number of series don't match"
+
+    if "time" in data.keys():
+        if not "format" in data["time"] and "raw" in data["time"]:
+            return "'raw' must be accompanied by format"
+        if "format" in data["time"] and not "raw" in data["time"]:
+            return "Format must be accompanied by 'raw'"
+        if "index" in data["time"]:
+            if not data["time"]["index"][0] == 0:
+                return "Index should start at zero."
+            if not len(data["time"]["index"]) == data["n_obs"]:
+                return "Number of indices must match number of observations"
+        if "raw" in data["time"]:
+            if len(data["time"]["raw"]) != data["n_obs"]:
+                return "Number of time points doesn't match number of observations"
+            if None in data["time"]["raw"]:
+                return "Null is not supported in time axis. Use 'NaN' instead."
+
+    has_missing = False
+    for var in data["series"]:
+        if len(var["raw"]) != data["n_obs"]:
+            return "Number of observations doesn't match for %s" % var["label"]
+        if float("nan") in var["raw"]:
+            return "NaN is not supported in series. Use null instead."
+        has_missing = has_missing or any(map(lambda x: x is None, var["raw"]))
+
+    # this doesn't exist yet, so let's not implement it until we need it.
+    if data["n_dim"] > 1 and has_missing:
+        return "Missing values are not yet supported for multidimensional data"
+
+    return None
+
+
+def main():
+    args = parse_args()
+
+    log = lambda *a, **kw: print(*a, **kw) if args.verbose else None
+
+    if args.dataset_dir:
+        datafiles = find_datafiles(args.dataset_dir)
+        for dset in datafiles:
+            log("Validating %s" % dset)
+            result = validate_dataset(
+                datafiles[dset], schema_file=args.schema_file
+            )
+            if not result is None:
+                print(
+                    "Dataset: %s. Error: %s" % (dset, result), file=sys.stderr
+                )
+                raise SystemExit(1)
+    else:
+        result = validate_dataset(args.datafile, schema_file=args.schema_file)
+        if not result is None:
+            print("Error: %s" % result, file=sys.stderr)
+            raise SystemExit(1)
+    log("Validation passed.")
+
+
+if __name__ == "__main__":
+    main()
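For context, `load_checksums` in `check_checksums.py` expects a JSON file of the form `{"kind": "md5", "checksums": {...}}`, keyed by the basename of each dataset file. The sketch below is one hypothetical way to produce such a file with the same MD5 convention; the `datasets/` directory layout, the `checksums.json` filename, and the `build_checksum_file` helper are illustrative assumptions, not part of this commit.

```python
#!/usr/bin/env python
# Hypothetical helper (not part of this commit): write a checksum file in the
# format that load_checksums() expects, i.e.
# {"kind": "md5", "checksums": {"<basename>.json": "<md5 hex digest>"}}.
# The "datasets" directory layout and the "checksums.json" name are assumptions.

import hashlib
import json
import os


def build_checksum_file(dataset_dir, out_file):
    checksums = {}
    # Mirror find_datafiles(): one level of subdirectories, each holding .json files
    for ddir in os.listdir(dataset_dir):
        pth = os.path.join(dataset_dir, ddir)
        if not os.path.isdir(pth):
            continue
        for fname in os.listdir(pth):
            if not fname.endswith(".json"):
                continue
            with open(os.path.join(pth, fname), "rb") as fp:
                checksums[fname] = hashlib.md5(fp.read()).hexdigest()
    with open(out_file, "w") as fp:
        json.dump({"kind": "md5", "checksums": checksums}, fp, indent=2)


if __name__ == "__main__":
    build_checksum_file("datasets", "checksums.json")
```

With such a file in place, the checks would be run along the lines of `python utils/check_checksums.py -c checksums.json -d datasets -v` (paths assumed).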

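Similarly, the structural checks in `validate_dataset.py` imply a time-series layout with `n_dim`, `n_obs`, `series` (each entry carrying `label` and `raw`), and an optional `time` object. The snippet below writes a minimal file that satisfies those checks; it is a sketch inferred from the code above, not from `schema.json` (which is not part of this commit), so the actual schema may require additional fields.

```python
# Illustrative only: a minimal series file inferred from the checks in
# validate_dataset.py; schema.json (not in this commit) may require more fields.

import json

example = {
    "n_dim": 1,  # must equal len(example["series"])
    "n_obs": 3,  # every "raw" array must have this many entries
    "time": {"index": [0, 1, 2]},  # index starts at 0 and has n_obs entries
    "series": [
        # missing observations are encoded as null (None), never NaN
        {"label": "V1", "raw": [1.0, None, 2.0]}
    ],
}

with open("example_series.json", "w") as fp:
    json.dump(example, fp, indent=2)

# Hypothetical invocation, assuming a schema.json in the working directory:
#   python utils/validate_dataset.py -s schema.json example_series.json
```

Note the convention the error messages above encode: missing observations in `series` are written as `null`, while missing entries on the time axis are written as `NaN`.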