author    Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-03-12 14:33:57 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-03-12 14:33:57 +0000
commit    7ef8f6e58990fc069cccc71ed6564e8c639ea4fc (patch)
tree      9e7662a34b7d0c1f1c5d9faf6d7d6ea8672f6410 /analysis/scripts/significance.py
initial commit
Diffstat (limited to 'analysis/scripts/significance.py')
-rw-r--r--  analysis/scripts/significance.py  179
1 file changed, 179 insertions, 0 deletions
diff --git a/analysis/scripts/significance.py b/analysis/scripts/significance.py
new file mode 100644
index 00000000..6666306b
--- /dev/null
+++ b/analysis/scripts/significance.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Code to compute significant differences between methods based on their
+average ranks (a global Friedman/F test and Holm's procedure).
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
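+
+Example invocation (file names are illustrative):
+
+    python significance.py -i results.json -m reference --type best -o out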
+
+"""
+
+import argparse
+import math
+import scipy.stats as stats
+
+from tabulate import tabulate
+
+from rank_common import (
+ load_data,
+ preprocess_data,
+ compute_ranks,
+)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-o", "--output", help="Output basename")
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Input JSON file with results for each method",
+        required=True,
+    )
+    parser.add_argument(
+        "-m",
+        "--mode",
+        choices=["global", "reference"],
+        required=True,
+        help="Whether to run a global difference F test or a reference test against the best performing method",
+    )
+ parser.add_argument(
+ "--type",
+ help="Type of table to make",
+ choices=["best", "default"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def global_difference(avg_ranks, n_datasets):
+ N = n_datasets
+ k = len(avg_ranks)
+ avg_sq_sum = sum([pow(float(avg_ranks[m]), 2.0) for m in avg_ranks])
+
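+    # Friedman chi-squared statistic computed from the average ranks:
+    #   chi2 = 12N / (k(k+1)) * (sum_m R_m^2 - k(k+1)^2 / 4)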
+ chi2 = (
+ 12.0 * N / (k * (k + 1)) * (avg_sq_sum - (k * pow(k + 1, 2.0) / 4.0))
+ )
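+    # p-value of the uncorrected Friedman test (computed for reference
+    # only; the corrected statistic below is what gets returned)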
+ chiprob = 1.0 - stats.chi2.cdf(chi2, k - 1)
+
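+    # Iman-Davenport correction of the Friedman statistic, which is less
+    # conservative and follows an F distribution with k - 1 and
+    # (k - 1)(N - 1) degrees of freedom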
+ Fstat = (N - 1.0) * chi2 / (N * (k - 1) - chi2)
+ Fprob = 1.0 - stats.f.cdf(Fstat, k - 1, (k - 1) * (N - 1))
+
+ return Fstat, Fprob
+
+
+def argmin(func, args):
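+    """Return the element of args that minimizes func."""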
+ m, inc = float("inf"), None
+ for a in args:
+ v = func(a)
+ if v < m:
+ m, inc = v, a
+ return inc
+
+
+def reference_difference(avg_ranks, n_datasets, significance_level=0.05):
+ N = n_datasets
+ k = len(avg_ranks)
+
+ methods = sorted(avg_ranks.keys())
+ ranks = [avg_ranks[m] for m in methods]
+ ref_method = argmin(lambda m: avg_ranks[m], methods)
+ ref_idx = methods.index(ref_method)
+    others = [m for m in methods if m != ref_method]
+
+ Z_scores = [0.0] * (k - 1)
+ P_values = [0.0] * (k - 1)
+
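+    # Compare each method against the reference using the normal
+    # approximation for differences in average ranks:
+    #   z_j = (R_ref - R_j) / sqrt(k(k+1) / (6N))
+    # R_ref is the smallest average rank, so z_j <= 0 and the one-sided
+    # p-value is the normal CDF evaluated at z_j.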
+    constant = math.sqrt(6.0 * N / (k * (k + 1)))
+ for j, method in enumerate(others):
+ i = methods.index(method)
+ Z_scores[j] = (ranks[ref_idx] - ranks[i]) * constant
+ P_values[j] = stats.norm.cdf(Z_scores[j])
+
+ # sort the p-values in ascending order
+ sorted_pvals = sorted((p, i) for i, p in enumerate(P_values))
+
+ # Calculate significance differences following Holm's procedure
+ significant_differences = [False] * (k - 1)
+ thresholds = [0] * (k - 1)
+ CD_threshold = None
+ for i in range(k - 1):
+ threshold = significance_level / float(k - (i + 1))
+ pval, idx = sorted_pvals[i]
+ significant_differences[idx] = pval < threshold
+ thresholds[idx] = threshold
+        if pval >= threshold and CD_threshold is None:
+            CD_threshold = threshold
+
+    # If every comparison was rejected, no threshold failed; fall back to
+    # the largest Holm threshold (the significance level itself).
+    if CD_threshold is None:
+        CD_threshold = significance_level
+
+    # Calculate the critical difference from the first threshold that
+    # failed to reject: a p-value below that threshold would have counted
+    # as significant, and one above it would not.
+ CD = -1 * stats.norm.ppf(CD_threshold) / constant
+
+ txt = [
+ "Number of datasets: %i" % N,
+ "Number of methods: %i" % k,
+ "Reference method: %s" % ref_method,
+ "Significance level: %g" % significance_level,
+ "",
+ "Reference method rank: %.6f" % avg_ranks[ref_method],
+ "Holm's procedure:",
+ ]
+
+ table = []
+ for o, p, t, s in zip(
+ others, P_values, thresholds, significant_differences
+ ):
+ table.append([o, avg_ranks[o], p, t, s])
+
+ txt.append(
+ tabulate(
+ table,
+ headers=["Method", "Rank", "p-Value", "Threshold", "Significant"],
+ )
+ )
+
+ txt.append("")
+ txt.append(
+ "Critical difference: %.6f (at threshold = %.6f)" % (CD, CD_threshold)
+ )
+ txt.append("")
+
+ return ref_method, CD, txt
+
+
+def main():
+ args = parse_args()
+
+ data = load_data(args.input)
+ clean, methods = preprocess_data(data, args.type)
+
+ n_datasets = len(clean)
+
+ avg_ranks, all_ranks = compute_ranks(
+ clean, keep_methods=methods, higher_better=True
+ )
+
+ if args.mode == "global":
+ Fstat, Fprob = global_difference(avg_ranks, n_datasets)
+ if args.output:
+ with open(args.output, "w") as fp:
+ fp.write("F = %.1f (p = %g)" % (Fstat, Fprob))
+ else:
+ print("Fstat = %.2f, Fprob = %.2g" % (Fstat, Fprob))
+ elif args.mode == "reference":
+ ref_method, CD, txt = reference_difference(avg_ranks, n_datasets)
+ if args.output:
+            # Write small LaTeX snippets; the trailing "%" comments out the
+            # final newline so \input does not introduce extra space.
+            outRef = args.output + "_ref.tex"
+            with open(outRef, "w") as fp:
+                fp.write(ref_method + "%")
+            outCD = args.output + "_CD.tex"
+            with open(outCD, "w") as fp:
+                fp.write("%.2f" % CD + "%")
+ else:
+ print("Reference method = %s, CD = %.2f" % (ref_method, CD))
+ print("")
+ print("\n".join(txt))
+
+
+if __name__ == "__main__":
+ main()