path: root/utils/check_checksums.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Validate the datasets by checksum

Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute
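
Usage (file names here are illustrative):

    python check_checksums.py -c checksums.json -d datasets/ -v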

"""

import argparse
import hashlib
import json
import os


def parse_args():
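    """Parse the command line arguments."""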
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--checksum-file", help="Checksum file (json)", required=True
    )
    parser.add_argument(
        "-d", "--dataset-dir", help="Dataset directory", required=True
    )
    parser.add_argument(
        "-v", "--verbose", help="Enable verbose mode", action="store_true"
    )
    return parser.parse_args()


def md5sum(filename):
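    """Compute the MD5 hex digest of a file.

    The file is read into memory in one go, which is fine for data files
    of modest size.
    """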
    with open(filename, "rb") as fp:
        data = fp.read()
    return hashlib.md5(data).hexdigest()


def load_checksums(checksum_file):
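    """Load the filename-to-digest mapping from the checksum file.

    The checksum file is expected to look roughly like this (values are
    illustrative):

        {
            "kind": "md5",
            "checksums": {
                "some_dataset.json": "d41d8cd98f00b204e9800998ecf8427e"
            }
        }
    """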
    with open(checksum_file, "r") as fp:
        checksums = json.load(fp)
    if checksums.get("kind") != "md5":
        raise ValueError("Expected checksum kind 'md5'")
    return checksums["checksums"]


def find_datafiles(dataset_dir):
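    """Map each JSON data file name to its full path.

    This expects one subdirectory per dataset inside ``dataset_dir``,
    each containing the dataset's JSON file(s).
    """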
    data_files = {}

    datadirs = os.listdir(dataset_dir)
    for ddir in datadirs:
        pth = os.path.join(dataset_dir, ddir)
        # Skip any stray files (e.g. a README) next to the dataset
        # directories; os.listdir below would fail on them.
        if not os.path.isdir(pth):
            continue
        files = os.listdir(pth)
        json_files = [f for f in files if f.endswith(".json")]
        for jf in json_files:
            jfpath = os.path.join(pth, jf)
            if jf in data_files:
                raise KeyError("Duplicate data file '%s'?" % jfpath)
            data_files[jf] = jfpath

    return data_files


def main():
    args = parse_args()

    def log(*print_args, **print_kwargs):
        if args.verbose:
            print(*print_args, **print_kwargs)

    checksums = load_checksums(args.checksum_file)
    data_files = find_datafiles(args.dataset_dir)

    for fname in checksums:
        log("Checking %s" % fname)
        if fname not in data_files:
            raise FileNotFoundError("Missing data file: %s" % fname)
        md5 = md5sum(data_files[fname])
        if md5 != checksums[fname]:
            raise ValueError(
                "Checksums don't match for file: %s" % data_files[fname]
            )

    log("All ok.")


if __name__ == "__main__":
    main()