diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
| commit | 7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch) | |
| tree | 10aa6710599230c889ec44407a065ee303a79348 /utils/check_checksums.py | |
| download | TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.tar.gz TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.zip | |
Initial commit
Diffstat (limited to 'utils/check_checksums.py')
| -rw-r--r-- | utils/check_checksums.py | 85 |
1 files changed, 85 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Validate the datasets by checksum

The checksum file is a JSON document with a "kind" entry (which must be
"md5") and a "checksums" entry mapping data file names to their expected
MD5 hex digests. The dataset directory is expected to contain one
sub-directory per dataset, each holding one or more ".json" data files.

Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute

"""

import argparse
import hashlib
import os
import json


def parse_args():
    """Parse the command line arguments of the script.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``checksum_file``, ``dataset_dir`` and
        ``verbose``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--checksum-file", help="Checksum file (json)", required=True
    )
    parser.add_argument(
        "-d", "--dataset-dir", help="Dataset directory", required=True
    )
    parser.add_argument(
        "-v", "--verbose", help="Enable verbose mode", action="store_true"
    )
    return parser.parse_args()


def md5sum(filename):
    """Return the MD5 hex digest of the contents of ``filename``.

    The file is hashed in fixed-size chunks so that large data files do not
    have to be held in memory all at once.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as fp:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF)
        for chunk in iter(lambda: fp.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


def load_checksums(checksum_file):
    """Load the expected checksums from a JSON checksum file.

    Parameters
    ----------
    checksum_file : str
        Path to a JSON file with the entries "kind" and "checksums".

    Returns
    -------
    dict
        The value of the "checksums" entry (file name -> MD5 hex digest).

    Raises
    ------
    KeyError
        If the "kind" or "checksums" entry is missing.
    ValueError
        If the checksum kind is anything other than "md5".
    """
    with open(checksum_file, "r", encoding="utf-8") as fp:
        checksums = json.load(fp)

    # Raise explicitly instead of using ``assert``: assertions are stripped
    # when Python runs with -O, which would silently skip this validation.
    kind = checksums["kind"]
    if kind != "md5":
        raise ValueError("Unsupported checksum kind: %r" % (kind,))
    return checksums["checksums"]


def find_datafiles(dataset_dir):
    """Collect the ".json" data files found one level below ``dataset_dir``.

    Parameters
    ----------
    dataset_dir : str
        Directory that holds one sub-directory per dataset.

    Returns
    -------
    dict
        Mapping from the bare file name to the path of the file.

    Raises
    ------
    KeyError
        If the same file name occurs in more than one sub-directory.
    """
    data_files = {}

    # Sorted so that the result (and any duplicate report) is deterministic.
    for ddir in sorted(os.listdir(dataset_dir)):
        pth = os.path.join(dataset_dir, ddir)
        if not os.path.isdir(pth):
            # Stray regular files (README, .DS_Store, ...) are not datasets
            # and cannot be listed; skip them instead of crashing.
            continue
        for jf in sorted(os.listdir(pth)):
            if not jf.endswith(".json"):
                continue
            jfpath = os.path.join(pth, jf)
            if jf in data_files:
                raise KeyError("Duplicate data file '%s'?" % jfpath)
            data_files[jf] = jfpath

    return data_files


def main():
    """Verify every file listed in the checksum file against its MD5 digest.

    Raises
    ------
    FileNotFoundError
        If a file listed in the checksum file is not found in the dataset
        directory.
    ValueError
        If the MD5 digest of a data file differs from the expected one.
    """
    args = parse_args()

    def log(*a, **kw):
        # Only print progress messages in verbose mode.
        if args.verbose:
            print(*a, **kw)

    checksums = load_checksums(args.checksum_file)
    data_files = find_datafiles(args.dataset_dir)

    for fname, expected in checksums.items():
        log("Checking %s" % fname)
        if fname not in data_files:
            raise FileNotFoundError("Missing data file: %s" % fname)
        if md5sum(data_files[fname]) != expected:
            raise ValueError(
                "Checksums don't match for file: %s" % (data_files[fname])
            )

    log("All ok.")


if __name__ == "__main__":
    main()
