author    Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-03-10 12:27:53 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-03-10 12:27:53 +0000
commit    7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch)
tree      10aa6710599230c889ec44407a065ee303a79348 /utils/check_checksums.py
Initial commit
Diffstat (limited to 'utils/check_checksums.py')
-rw-r--r--  utils/check_checksums.py  85
1 file changed, 85 insertions(+), 0 deletions(-)
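
For context before the diff: load_checksums() requires the checksum file to be a JSON document with a top-level "kind" field equal to "md5" and a "checksums" object mapping bare dataset filenames to hex digests, and find_datafiles() expects those files to sit one level down, as <dataset-dir>/<subdir>/<file>.json. A minimal sketch of the expected schema, with a placeholder filename and digest (not real TCPD values):

    import json

    # Hypothetical example of the expected checksum-file schema; the
    # filename and digest below are placeholders for illustration only.
    example = {
        "kind": "md5",
        "checksums": {
            "example_dataset.json": "d41d8cd98f00b204e9800998ecf8427e",
        },
    }

    with open("checksums.json", "w") as fp:
        json.dump(example, fp, indent=2)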
diff --git a/utils/check_checksums.py b/utils/check_checksums.py
new file mode 100644
index 0000000..01dcd99
--- /dev/null
+++ b/utils/check_checksums.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Validate the datasets by checksum
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import hashlib
+import json
+import os
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+        "-c", "--checksum-file", help="Checksum file (JSON)", required=True
+ )
+ parser.add_argument(
+ "-d", "--dataset-dir", help="Dataset directory", required=True
+ )
+ parser.add_argument(
+ "-v", "--verbose", help="Enable verbose mode", action="store_true"
+ )
+ return parser.parse_args()
+
+
+def md5sum(filename):
+ with open(filename, "rb") as fp:
+ data = fp.read()
+ return hashlib.md5(data).hexdigest()
+
+
+def load_checksums(checksum_file):
+ with open(checksum_file, "r") as fp:
+ checksums = json.load(fp)
+ assert checksums["kind"] == "md5"
+ return checksums["checksums"]
+
+
+def find_datafiles(dataset_dir):
+ data_files = {}
+
+ datadirs = os.listdir(dataset_dir)
+ for ddir in datadirs:
+ pth = os.path.join(dataset_dir, ddir)
+ files = os.listdir(pth)
+ json_files = [f for f in files if f.endswith(".json")]
+ for jf in json_files:
+ jfpath = os.path.join(pth, jf)
+ if jf in data_files:
+ raise KeyError("Duplicate data file '%s'?" % jfpath)
+ data_files[jf] = jfpath
+
+ return data_files
+
+
+def main():
+ args = parse_args()
+
+    log = print if args.verbose else (lambda *a, **kw: None)
+
+ checksums = load_checksums(args.checksum_file)
+ data_files = find_datafiles(args.dataset_dir)
+
+ for fname in checksums:
+ log("Checking %s" % fname)
+        if fname not in data_files:
+ raise FileNotFoundError("Missing data file: %s" % fname)
+ md5 = md5sum(data_files[fname])
+        if md5 != checksums[fname]:
+ raise ValueError(
+ "Checksums don't match for file: %s" % (data_files[fname])
+ )
+
+ log("All ok.")
+
+
+if __name__ == "__main__":
+ main()
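
Two usage notes. First, md5sum() above reads each file into memory in one go, which is fine for datasets of this size; for larger files a chunked variant would bound memory use. A minimal sketch (the 64 KiB chunk size is an arbitrary choice, not something the script prescribes):

    import hashlib

    def md5sum_chunked(filename, chunk_size=65536):
        # Feed the file to the hash in fixed-size chunks so memory use
        # stays constant regardless of file size.
        h = hashlib.md5()
        with open(filename, "rb") as fp:
            for chunk in iter(lambda: fp.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

Second, given the flags defined in parse_args(), a typical invocation would look like

    python utils/check_checksums.py -c checksums.json -d datasets -v

where the checksum file name and dataset directory are hypothetical; substitute the repository's actual paths.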