analysis/scripts/significance.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Code to compute significant differences

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import math
import scipy.stats as stats

from tabulate import tabulate

from rank_common import (
    load_data,
    preprocess_data,
    compute_ranks,
)


def parse_args():
    """Parse the command line options of this script.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``output``, ``input``, ``mode`` and
        ``type``. Only ``type`` is mandatory; options that are not supplied
        are ``None``.
    """
    cli = argparse.ArgumentParser()

    # Where results go and where they come from.
    cli.add_argument("-o", "--output", help="Output basename")
    cli.add_argument(
        "-i",
        "--input",
        help="Input JSON file with results for each method",
    )

    # Which statistical test to run.
    mode_help = (
        "Whether to do a global difference F test or a reference test "
        "to the best performing method"
    )
    cli.add_argument(
        "-m", "--mode", help=mode_help, choices=["global", "reference"]
    )

    # Which kind of table the results are preprocessed for.
    cli.add_argument(
        "--type",
        required=True,
        choices=["best", "default"],
        help="Type of table to make",
    )

    namespace = cli.parse_args()
    return namespace


def global_difference(avg_ranks, n_datasets):
    """Test whether the methods differ at all, using the Friedman test.

    The Friedman chi-square statistic is computed from the average ranks and
    then converted to the F-distributed statistic of Iman and Davenport.

    Parameters
    ----------
    avg_ranks : dict
        Maps each method to its average rank over the datasets.
    n_datasets : int
        Number of datasets the ranks were averaged over.

    Returns
    -------
    Fstat : float
        The F statistic, with ``k - 1`` and ``(k - 1) * (N - 1)`` degrees of
        freedom for ``k`` methods and ``N`` datasets.
    Fprob : float
        Upper-tail probability (p-value) of ``Fstat``.

    Raises
    ------
    ValueError
        If there are fewer than two methods or fewer than two datasets, in
        which case the statistic is undefined.
    """
    N = n_datasets
    k = len(avg_ranks)
    if k < 2 or N < 2:
        raise ValueError(
            "The global test needs at least two methods and two datasets "
            "(got %i methods, %i datasets)" % (k, N)
        )

    avg_sq_sum = sum(pow(float(rank), 2.0) for rank in avg_ranks.values())

    chi2 = (
        12.0 * N / (k * (k + 1)) * (avg_sq_sum - (k * pow(k + 1, 2.0) / 4.0))
    )

    # chi2 reaches its maximum N * (k - 1) when every dataset ranks the
    # methods identically. The F statistic then diverges, so report the
    # limiting values instead of dividing by zero (or by a tiny negative
    # number caused by rounding, which would flip the sign of F).
    denominator = N * (k - 1) - chi2
    if denominator <= 0:
        return float("inf"), 0.0

    Fstat = (N - 1.0) * chi2 / denominator
    # Use the survival function rather than 1 - cdf: for strongly
    # significant results the cdf rounds to 1.0 and the p-value would
    # collapse to exactly zero.
    Fprob = stats.f.sf(Fstat, k - 1, (k - 1) * (N - 1))

    return Fstat, Fprob


def argmin(func, args):
    """Return the element of ``args`` for which ``func`` is smallest.

    On ties the first such element wins. ``None`` is returned when ``args``
    is empty or when no value of ``func`` is strictly below infinity.
    """
    best_value = float("inf")
    best_arg = None
    for candidate in args:
        value = func(candidate)
        if not value < best_value:
            # Not a strict improvement (this also covers NaN values).
            continue
        best_value = value
        best_arg = candidate
    return best_arg


def reference_difference(avg_ranks, n_datasets, significance_level=0.05):
    """Compare every method to the best-ranked one with Holm's procedure.

    The method with the lowest average rank is taken as the reference. For
    every other method a one-sided z-test on the difference in average rank
    is performed, and the resulting p-values are judged with Holm's
    step-down procedure.

    Parameters
    ----------
    avg_ranks : dict
        Maps each method to its average rank over the datasets.
    n_datasets : int
        Number of datasets the ranks were averaged over.
    significance_level : float, optional
        Family-wise significance level used for Holm's procedure.

    Returns
    -------
    ref_method
        Key of ``avg_ranks`` with the lowest average rank (on ties the first
        in sorted order).
    CD : float
        Critical difference in average rank, derived from the first Holm
        threshold that failed to reject (or from the last threshold if every
        hypothesis was rejected).
    txt : list of str
        Lines of a human-readable report, including a table of the p-values,
        thresholds and decisions for every non-reference method.

    Raises
    ------
    ValueError
        If fewer than two methods are given, as there is nothing to compare.
    """
    N = n_datasets
    k = len(avg_ranks)
    if k < 2:
        raise ValueError(
            "The reference test needs at least two methods (got %i)" % k
        )

    methods = sorted(avg_ranks.keys())
    ref_method = argmin(lambda m: avg_ranks[m], methods)
    ref_rank = avg_ranks[ref_method]
    others = [m for m in methods if not m == ref_method]

    # z = (R_ref - R_i) / sqrt(k (k + 1) / (6 N)); the reference has the
    # lowest rank, so the z-scores are non-positive and the lower tail of
    # the normal distribution gives the one-sided p-value.
    constant = math.sqrt(6.0 * N / (k * (k + 1)))
    Z_scores = [(ref_rank - avg_ranks[m]) * constant for m in others]
    P_values = [stats.norm.cdf(z) for z in Z_scores]

    # sort the p-values in ascending order, remembering their position
    sorted_pvals = sorted((p, i) for i, p in enumerate(P_values))

    # Holm's procedure is step-down: hypotheses are tested from the smallest
    # p-value upwards against increasingly lenient thresholds, and as soon
    # as one hypothesis is retained all remaining ones are retained too.
    significant_differences = [False] * (k - 1)
    thresholds = [0] * (k - 1)
    CD_threshold = None
    still_rejecting = True
    for i, (pval, idx) in enumerate(sorted_pvals):
        threshold = significance_level / float(k - (i + 1))
        thresholds[idx] = threshold
        if not still_rejecting:
            continue
        if pval < threshold:
            significant_differences[idx] = True
        else:
            still_rejecting = False
            CD_threshold = threshold

    if CD_threshold is None:
        # Every hypothesis was rejected, so no threshold failed. Use the
        # last (most lenient) threshold: it is the one a smaller rank
        # difference of the last-ranked comparison would be held against.
        CD_threshold = threshold

    # Calculate the critical difference from the threshold at which the
    # procedure stopped rejecting: a rank difference whose p-value equals
    # that threshold is exactly on the boundary of being significant.
    CD = -1 * stats.norm.ppf(CD_threshold) / constant

    txt = [
        "Number of datasets: %i" % N,
        "Number of methods: %i" % k,
        "Reference method: %s" % ref_method,
        "Significance level: %g" % significance_level,
        "",
        "Reference method rank: %.6f" % avg_ranks[ref_method],
        "Holm's procedure:",
    ]

    table = [
        [o, avg_ranks[o], p, t, s]
        for o, p, t, s in zip(
            others, P_values, thresholds, significant_differences
        )
    ]

    txt.append(
        tabulate(
            table,
            headers=["Method", "Rank", "p-Value", "Threshold", "Significant"],
        )
    )

    txt.append("")
    txt.append(
        "Critical difference: %.6f (at threshold = %.6f)" % (CD, CD_threshold)
    )
    txt.append("")

    return ref_method, CD, txt


def main():
    """Run the significance test selected on the command line.

    The results are loaded and preprocessed with the ``rank_common``
    helpers and turned into average ranks. Depending on ``--mode`` either
    the global F test or the reference test is then run; when ``--mode`` is
    omitted no test is run.

    With ``--output`` the global test writes its summary to that path, and
    the reference test writes two LaTeX snippets, ``<output>_ref.tex`` (the
    reference method) and ``<output>_CD.tex`` (the critical difference).
    Without ``--output`` the results are printed to stdout.
    """
    args = parse_args()

    data = load_data(args.input)
    clean, methods = preprocess_data(data, args.type)

    n_datasets = len(clean)

    # Only the average ranks are needed here; the second return value of
    # compute_ranks is not used.
    avg_ranks, _ = compute_ranks(
        clean, keep_methods=methods, higher_better=True
    )

    if args.mode == "global":
        Fstat, Fprob = global_difference(avg_ranks, n_datasets)
        if args.output:
            with open(args.output, "w") as fp:
                fp.write("F = %.1f (p = %g)" % (Fstat, Fprob))
        else:
            print("Fstat = %.2f, Fprob = %.2g" % (Fstat, Fprob))
    elif args.mode == "reference":
        ref_method, CD, txt = reference_difference(avg_ranks, n_datasets)
        if args.output:
            # The trailing "%" is a LaTeX comment character, so the file
            # can be \input without picking up a spurious space from the
            # end of the line.
            outRef = args.output + "_ref.tex"
            with open(outRef, "w") as fp:
                fp.write("%s%%" % ref_method)
            outCD = args.output + "_CD.tex"
            with open(outCD, "w") as fp:
                # Same precision as the value printed to stdout below.
                fp.write("%.2f%%" % CD)
        else:
            print("Reference method = %s, CD = %.2f" % (ref_method, CD))
            print("")
            print("\n".join(txt))


# Run the analysis only when this file is executed as a script, so that the
# functions above can be imported without side effects.
if __name__ == "__main__":
    main()