1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Validate the datasets by checksum
Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute
"""
import argparse
import hashlib
import os
import json
def parse_args():
    """Parse the command-line options of the validation script.

    Options:
        -c / --checksum-file: path of the checksum file (required).
        -d / --dataset-dir: path of the dataset directory (required).
        -v / --verbose: boolean flag, off unless given.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    # Each entry is (option flags, keyword arguments for add_argument);
    # they are registered in the order listed.
    option_table = (
        (
            ("-c", "--checksum-file"),
            {"help": "Checksum file (json)", "required": True},
        ),
        (
            ("-d", "--dataset-dir"),
            {"help": "Dataset directory", "required": True},
        ),
        (
            ("-v", "--verbose"),
            {"help": "Enable verbose mode", "action": "store_true"},
        ),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def md5sum(filename, chunk_size=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in binary mode and hashed incrementally, so memory
    use stays bounded by ``chunk_size`` regardless of the file size.

    Args:
        filename: path of the file to hash.
        chunk_size: number of bytes to read per iteration.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as fp:
        # iter() with a sentinel stops as soon as read() returns b"" (EOF).
        for chunk in iter(lambda: fp.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
def load_checksums(checksum_file):
    """Load the filename-to-checksum mapping from a JSON checksum file.

    The file must hold a JSON object with a "kind" entry equal to "md5"
    and a "checksums" entry, which is returned as-is.

    Args:
        checksum_file: path of the JSON checksum file.

    Returns:
        The value stored under the "checksums" key.

    Raises:
        KeyError: if the "kind" or "checksums" key is missing.
        ValueError: if "kind" is anything other than "md5".
    """
    # JSON is UTF-8 by specification; don't depend on the locale default.
    with open(checksum_file, "r", encoding="utf-8") as fp:
        contents = json.load(fp)

    kind = contents["kind"]
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if kind != "md5":
        raise ValueError(
            "Unsupported checksum kind '%s' in %s (expected 'md5')"
            % (kind, checksum_file)
        )
    return contents["checksums"]
def find_datafiles(dataset_dir):
    """Map each JSON file name to its path, one directory level deep.

    Every subdirectory of ``dataset_dir`` is listed and each entry whose
    name ends in ".json" is recorded. Entries of ``dataset_dir`` that are
    not directories (e.g. a stray README) are skipped instead of making
    ``os.listdir`` fail.

    Args:
        dataset_dir: directory whose subdirectories hold the data files.

    Returns:
        dict mapping file name (e.g. "x.json") to its joined path.

    Raises:
        KeyError: if the same file name occurs in more than one
            subdirectory.
    """
    data_files = {}
    # Sorted traversal makes the duplicate error deterministic across
    # platforms; os.listdir order is arbitrary.
    for entry in sorted(os.listdir(dataset_dir)):
        subdir = os.path.join(dataset_dir, entry)
        if not os.path.isdir(subdir):
            continue
        for name in sorted(os.listdir(subdir)):
            if not name.endswith(".json"):
                continue
            path = os.path.join(subdir, name)
            if name in data_files:
                raise KeyError("Duplicate data file '%s'?" % path)
            data_files[name] = path
    return data_files
def main():
    """Check every file listed in the checksum file against its MD5 sum.

    Parses the command line, loads the expected checksums, locates the
    data files, and compares each listed file's MD5 digest with the
    expected value. Progress messages are printed only in verbose mode.

    Raises:
        FileNotFoundError: if a file named in the checksum file was not
            found in the dataset directory.
        ValueError: if a file's MD5 digest differs from the expected one.
    """
    args = parse_args()

    def log(*a, **kw):
        # Print only when --verbose was given.
        if args.verbose:
            print(*a, **kw)

    checksums = load_checksums(args.checksum_file)
    data_files = find_datafiles(args.dataset_dir)

    for fname, expected in checksums.items():
        log("Checking %s" % fname)
        if fname not in data_files:
            raise FileNotFoundError("Missing data file: %s" % fname)
        if md5sum(data_files[fname]) != expected:
            raise ValueError(
                "Checksums don't match for file: %s" % (data_files[fname])
            )

    log("All ok.")
# Run the validation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|