| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-12 14:33:57 +0000 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-12 14:33:57 +0000 |
| commit | 7ef8f6e58990fc069cccc71ed6564e8c639ea4fc (patch) | |
| tree | 9e7662a34b7d0c1f1c5d9faf6d7d6ea8672f6410 /analysis/scripts | |
| download | TCPDBench-7ef8f6e58990fc069cccc71ed6564e8c639ea4fc.tar.gz TCPDBench-7ef8f6e58990fc069cccc71ed6564e8c639ea4fc.zip | |
initial commit
Diffstat (limited to 'analysis/scripts')
| mode | file | lines added |
|---|---|---|
| -rw-r--r-- | analysis/scripts/aggregate_table_wide.py | 222 |
| -rw-r--r-- | analysis/scripts/latex.py | 140 |
| -rw-r--r-- | analysis/scripts/make_table.py | 441 |
| -rw-r--r-- | analysis/scripts/metrics.py | 129 |
| -rw-r--r-- | analysis/scripts/rank_common.py | 102 |
| -rw-r--r-- | analysis/scripts/rank_plots.py | 151 |
| -rw-r--r-- | analysis/scripts/significance.py | 179 |
| -rw-r--r-- | analysis/scripts/summarize.py | 178 |
8 files changed, 1542 insertions, 0 deletions
diff --git a/analysis/scripts/aggregate_table_wide.py b/analysis/scripts/aggregate_table_wide.py
new file mode 100644
index 00000000..712a6a4a
--- /dev/null
+++ b/analysis/scripts/aggregate_table_wide.py
@@ -0,0 +1,222 @@

```python
# -*- coding: utf-8 -*-

"""
Script to generate the aggregate table (wide version)

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import json
import tabulate

from enum import Enum


class Method(Enum):
    amoc = "amoc"
    binseg = "binseg"
    bocpd = "bocpd"
    bocpdms = "bocpdms"
    cpnp = "cpnp"
    ecp = "ecp"
    kcpa = "kcpa"
    pelt = "pelt"
    prophet = "prophet"
    rbocpdms = "rbocpdms"
    rfpop = "rfpop"
    segneigh = "segneigh"
    wbs = "wbs"


# Methods that support multidimensional datasets
MULTIMETHODS = [
    Method.bocpd,
    Method.bocpdms,
    Method.ecp,
    Method.kcpa,
    Method.rbocpdms,
]


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bcu",
        help="Path to json file with results for: best/cover/uni",
        required=True,
    )
    parser.add_argument(
        "--bcm",
        help="Path to json file with results for: best/cover/multi",
        required=True,
    )
    parser.add_argument(
        "--bfu",
        help="Path to json file with results for: best/f1/uni",
        required=True,
    )
    parser.add_argument(
        "--bfm",
        help="Path to json file with results for: best/f1/multi",
        required=True,
    )
    parser.add_argument(
        "--dcu",
        help="Path to json file with results for: default/cover/uni",
        required=True,
    )
    parser.add_argument(
        "--dcm",
        help="Path to json file with results for: default/cover/multi",
        required=True,
    )
    parser.add_argument(
        "--dfu",
        help="Path to json file with results for: default/f1/uni",
        required=True,
    )
    parser.add_argument(
        "--dfm",
        help="Path to json file with results for: default/f1/multi",
        required=True,
    )
    return parser.parse_args()


def load_json(filename):
    with open(filename, "r") as fp:
        return json.load(fp)


def make_table(
    label,
    uni_default_cover,
    uni_default_f1,
    uni_best_cover,
    uni_best_f1,
    multi_default_cover,
    multi_default_f1,
    multi_best_cover,
    multi_best_f1,
    methods,
):
    """Create part of the aggregate table
    """
    tex = []
    tex.append("%% This table requires booktabs!")

    tex.append("\\begin{tabular}{lrr|rrrr|rr}")
    superheader = (
        " & ".join(
            [
                "",
                "\\multicolumn{4}{c}{Univariate}",
                "\\multicolumn{4}{c}{Multivariate} \\\\",
            ]
        )
        + "\\cmidrule(lr){2-5}\\cmidrule(lr){6-9}"
    )
    tex.append(superheader)
    header = (
        " & ".join(
            [
                "",
                "\\multicolumn{2}{c}{Default}",
                "\\multicolumn{2}{c}{Best}",
                "\\multicolumn{2}{c}{Default}",
                "\\multicolumn{2}{c}{Best} \\\\",
            ]
        )
        + "\\cmidrule(lr){2-3}"
        + "\\cmidrule(lr){4-5}"
        + "\\cmidrule(lr){6-7}"
        + "\\cmidrule(lr){8-9}"
    )
    tex.append(header)
    subheader = (
        " & ".join(
            [
                "",
                "Cover",
                "F1",
                "Cover",
                "F1",
                "Cover",
                "F1",
                "Cover",
                "F1" "\\\\",
            ]
        )
        + "\\cmidrule(r){1-1}"
        + "\\cmidrule(lr){2-5}"
        + "\\cmidrule(l){6-9}"
    )
    tex.append(subheader)

    table = []

    L = max(map(len, methods))
    textsc = lambda m: "\\textsc{%s}%s" % (m, (L - len(m)) * " ")
    table.append(list(map(textsc, methods)))

    all_exps = [
        [uni_default_cover, uni_default_f1, uni_best_cover, uni_best_f1],
        [
            multi_default_cover,
            multi_default_f1,
            multi_best_cover,
            multi_best_f1,
        ],
    ]

    for exps in all_exps:
        for exp in exps:
            row = []
            maxscore = max((exp[m] for m in methods if m in exp))
            for m in methods:
                if not m in exp:
                    row.append(5 * " ")
                    continue
                score = exp[m]
                scorestr = tabulate._format(
                    score, tabulate._float_type, ".3f", ""
                )
                if score == maxscore:
                    row.append("\\textbf{%s}" % scorestr)
                else:
                    row.append(scorestr)
            table.append(row)

    transposed = list(zip(*table))
    for row in transposed:
        tex.append(" & ".join(row) + " \\\\")
    tex.append("\\cmidrule(r){1-1}\\cmidrule(lr){2-5}\\cmidrule(l){6-9}")
    tex.append("\\end{tabular}")

    return tex


def main():
    args = parse_args()

    bcu = load_json(args.bcu)
    bcm = load_json(args.bcm)
    bfu = load_json(args.bfu)
    bfm = load_json(args.bfm)
    dcu = load_json(args.dcu)
    dcm = load_json(args.dcm)
    dfu = load_json(args.dfu)
    dfm = load_json(args.dfm)

    methods = sorted([m.name for m in Method])
    tex = make_table("Wide", dcu, dfu, bcu, bfu, dcm, dfm, bcm, bfm, methods)

    print("\n".join(tex))


if __name__ == "__main__":
    main()
```
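Each of the eight `--bcu`/.../`--dfm` inputs is expected to be a flat mapping from method name to an average score, presumably the output of `make_table.py --type avg --format json` shown further below. A minimal sketch of one such file, written as a Python dict; the file name and all values are invented for illustration:

```python
# Hypothetical contents of e.g. best_cover_uni.json (illustrative scores only).
# make_table() above looks up exp[m] for every method name m and bolds the
# per-row maximum.
example_bcu = {
    "amoc": 0.705,
    "binseg": 0.712,
    "pelt": 0.740,
}
```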
diff --git a/analysis/scripts/latex.py b/analysis/scripts/latex.py
new file mode 100644
index 00000000..107ce20a
--- /dev/null
+++ b/analysis/scripts/latex.py
@@ -0,0 +1,140 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Code for compiling latex from Python.

Based on: https://github.com/GjjvdBurg/labella.py

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import os
import shutil
import subprocess
import tabulate
import tempfile


def compile_latex(fname, tmpdirname, silent=True):
    compiler = "latexmk"
    compiler_args = [
        "--pdf",
        "--outdir=" + tmpdirname,
        "--interaction=nonstopmode",
        fname,
    ]
    command = [compiler] + compiler_args
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (OSError, IOError) as e:
        raise (e)
    except subprocess.CalledProcessError as e:
        print(e.output.decode())
        raise (e)
    else:
        if not silent:
            print(output.decode())


def build_latex_doc(tex, output_name=None, silent=True):
    with tempfile.TemporaryDirectory() as tmpdirname:
        basename = "labella_text"
        fname = os.path.join(tmpdirname, basename + ".tex")
        with open(fname, "w") as fid:
            fid.write(tex)

        compile_latex(fname, tmpdirname, silent=silent)

        pdfname = os.path.join(tmpdirname, basename + ".pdf")
        if output_name:
            shutil.copy2(pdfname, output_name)


def build_latex_table(
    table,
    headers,
    floatfmt="g",
    missingval="",
    bests="default",
    table_spec=None,
):
    """Construct the LaTeX code for a table

    This function creates the LaTeX code for a data table while taking number
    formatting, headers, missing values, and "best value formatting" into
    account.

    The numbers in the table are formatted following the provided float format
    and the missing value indicator using the ``_format`` function from the
    ``tabulate`` package. To indicate a missing value the data row should mark
    this value as ``None``.

    The ``bests`` parameter is used to decide how to highlight the best values
    in each row. It can be either ``'default'``, ``None``, a list of length 1
    where the element is either ``min`` or ``max``, or a list of length ``K``
    with similar elements where ``K`` is the length of the data table. If it
    is ``'default'`` then ``max`` will be considered best for each row. If a
    list of length 1 is supplied then the provided function will be used for
    each row. If ``None``, no highlighting will be done.

    The ``table_spec`` parameter allows the user to specify the table
    specification. This value is not checked. If it is None, the first column
    will get 'l' spec and the remaining columns will get the 'r' spec.

    """
    if bests == "default":
        bests = [max]
    elif bests is None:
        bests = []

    if len(bests) > 1:
        assert len(bests) == len(table)
        assert all((x in [min, max] for x in bests))

    if len(bests) == 0:
        best_funcs = [None for x in range(len(table))]
    elif len(bests) == 1:
        best_funcs = [bests[0] for x in range(len(table))]
    else:
        best_funcs = bests[:]

    _typ = lambda v: tabulate._type(v)
    _fmt = lambda v: tabulate._format(v, _typ(v), floatfmt, missingval)

    list_of_lists, headers = table, headers
    cols = list(zip(*list_of_lists))
    coltypes = list(map(tabulate._column_type, cols))

    cols = [
        [_fmt(v) for v in c]
        for c, ct in zip(cols, coltypes)
    ]

    n_cols = len(cols)

    data_rows = table
    text_rows = list(zip(*cols))

    text = []
    if table_spec is None:
        text.append("\\begin{tabular}{l%s}" % ("r" * (n_cols - 1)))
    else:
        text.append("\\begin{tabular}{%s}" % table_spec)
    text.append(" & ".join(headers) + "\\\\")
    text.append("\\hline")
    for data_row, text_row, best_func in zip(data_rows, text_rows, best_funcs):
        text_row = list(text_row)
        if not best_func is None:
            best_val = best_func([x for x in data_row if isinstance(x, float)])
            best_idx = [i for i, v in enumerate(data_row) if v == best_val]
            for idx in best_idx:
                text_row[idx] = "\\textbf{" + text_row[idx] + "}"
        text.append(" & ".join(text_row) + "\\\\")
    text.append("\\hline")
    text.append("\\end{tabular}")

    return "\n".join(text)
```
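A small usage sketch of `build_latex_table`; the rows and scores are made up, `None` marks a missing entry, and `bests=[max]` bolds the row-wise maximum. Note that the function goes through private `tabulate` helpers (`_format`, `_type`, `_column_type`), so this assumes the tabulate version pinned by the project still exposes them:

```python
# Illustrative only -- the table values are invented.
from latex import build_latex_table

table = [
    ["amoc", 0.71, 0.65],
    ["pelt", None, 0.70],  # None is rendered as missingval
]
headers = ["Method", "Cover", "F1"]
tex = build_latex_table(
    table, headers, floatfmt=".3f", missingval="--", bests=[max]
)
print(tex)
```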
diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py
new file mode 100644
index 00000000..e4747258
--- /dev/null
+++ b/analysis/scripts/make_table.py
@@ -0,0 +1,441 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Script to generate tables from summary files

Metrics, experiments, methods, and datasets are hard-coded as a means of
validation.

For the "best" experiment, the RBOCPDMS method is excluded because it fails
too often. For the other experiments, datasets with incomplete results are
removed.

Author: G.J.J. van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import colorama
import json
import os
import sys
import termcolor

from enum import Enum
from typing import Optional

# from pydantic.dataclasses import dataclass
from dataclasses import dataclass

from latex import build_latex_table

colorama.init()


class Metric(Enum):
    f1 = "f1"
    cover = "cover"


class Experiment(Enum):
    default = "default"
    best = "best"


class Dataset(Enum):
    apple = "apple"
    bank = "bank"
    bee_waggle_6 = "bee_waggle_6"
    bitcoin = "bitcoin"
    brent_spot = "brent_spot"
    businv = "businv"
    centralia = "centralia"
    children_per_woman = "children_per_woman"
    co2_canada = "co2_canada"
    construction = "construction"
    debt_ireland = "debt_ireland"
    gdp_argentina = "gdp_argentina"
    gdp_croatia = "gdp_croatia"
    gdp_iran = "gdp_iran"
    gdp_japan = "gdp_japan"
    global_co2 = "global_co2"
    homeruns = "homeruns"
    iceland_tourism = "iceland_tourism"
    jfk_passengers = "jfk_passengers"
    lga_passengers = "lga_passengers"
    nile = "nile"
    occupancy = "occupancy"
    ozone = "ozone"
    quality_control_1 = "quality_control_1"
    quality_control_2 = "quality_control_2"
    quality_control_3 = "quality_control_3"
    quality_control_4 = "quality_control_4"
    quality_control_5 = "quality_control_5"
    rail_lines = "rail_lines"
    ratner_stock = "ratner_stock"
    robocalls = "robocalls"
    run_log = "run_log"
    scanline_126007 = "scanline_126007"
    scanline_42049 = "scanline_42049"
    seatbelts = "seatbelts"
    shanghai_license = "shanghai_license"
    uk_coal_employ = "uk_coal_employ"
    measles = "measles"
    unemployment_nl = "unemployment_nl"
    us_population = "us_population"
    usd_isk = "usd_isk"
    well_log = "well_log"


class Method(Enum):
    amoc = "amoc"
    binseg = "binseg"
    bocpd = "bocpd"
    bocpdms = "bocpdms"
    cpnp = "cpnp"
    ecp = "ecp"
    kcpa = "kcpa"
    pelt = "pelt"
    prophet = "prophet"
    rbocpdms = "rbocpdms"
    rfpop = "rfpop"
    segneigh = "segneigh"
    wbs = "wbs"


# Methods that support multidimensional datasets
MULTIMETHODS = [
    Method.bocpd,
    Method.bocpdms,
    Method.ecp,
    Method.kcpa,
    Method.rbocpdms,
]

# Multidimensional datasets
MULTIDATASETS = [
    Dataset.apple,
    Dataset.bee_waggle_6,
    Dataset.occupancy,
    Dataset.run_log,
]

# Datasets with missing values
MISSING_DATASETS = [Dataset.uk_coal_employ]

# Methods that handle missing values
MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet]


@dataclass
class Result:
    dataset: Dataset
    experiment: Experiment
    is_multidim: bool
    method: Method
    metric: Metric
    score: Optional[float]
    summary_file: str
    placeholder: Optional[str]


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--summary-dir",
        help="Directory with summary files",
        required=True,
    )
    parser.add_argument(
        "-m",
        "--metric",
        help="Metric to use for the table",
        choices=["f1", "cover"],
        required=True,
    )
    parser.add_argument(
        "-e",
        "--experiment",
        help="Experiment to make table for",
        choices=["best", "default"],
        required=True,
    )
    parser.add_argument(
        "-d",
        "--dim",
        help="Dimensionality",
        choices=["uni", "multi", "combined"],
        required=True,
    )
    parser.add_argument(
        "-f",
        "--format",
        help="Output format",
        choices=["json", "tex"],
        required=True,
    )
    parser.add_argument(
        "-t",
        "--type",
        help="Type of table to make",
        choices=["avg", "full"],
        required=True,
    )
    return parser.parse_args()


def warning(msg):
    termcolor.cprint(msg, "yellow", file=sys.stderr)


def load_summary(filename):
    with open(filename, "r") as fp:
        data = json.load(fp)
    return data


def extract_score(method_results, metric=None, experiment=None):
    """Extract a single numeric score from a list of dictionaries
    """

    if not metric in [Metric.f1, Metric.cover]:
        raise ValueError("Unknown metric: %s" % metric)
    if not experiment in ["default", "best"]:
        raise ValueError("Unknown experiment: %s" % experiment)

    # Collect all values for the chosen metric
    scores = []
    for result in method_results:
        if not result["status"] == "SUCCESS":
            continue
        scores.append(result["scores"][metric.name])

    if len(scores) == 0:
        return None

    # check that we have only one score for the 'default' experiment
    if experiment == "default":
        if len(scores) > 1:
            raise ValueError("Default experiment with more than one score!")
        return scores[0]
    return max(scores)


def collect_results(summary_dir=None, metric=None, experiment=None):
    """Collect the results for the experiment on the specified metric.

    Returns a list of Result objects.
    """
    if not metric in [Metric.f1, Metric.cover]:
        raise ValueError("Unknown metric: %s" % metric)
    if not experiment in ["default", "best"]:
        raise ValueError("Unknown experiment: %s" % experiment)
    if not os.path.isdir(summary_dir):
        raise FileNotFoundError(summary_dir)

    results = []
    for fname in sorted(os.listdir(summary_dir)):
        path = os.path.join(summary_dir, fname)
        summary_data = load_summary(path)

        dataset_name = summary_data["dataset"]
        summary_results = summary_data["results"]

        is_multi = summary_data["dataset_ndim"] > 1

        for method in summary_results:
            # method names are prefixed with the experiment type, so we skip
            # the ones we don't want
            if not method.startswith(experiment + "_"):
                continue

            # extract the metric score for this experiment from the summary
            # results for the method
            score = extract_score(
                summary_results[method], metric=metric, experiment=experiment
            )

            # strip the experiment from the method name
            method_name = method[len(experiment + "_") :]

            # determine the placeholder value if there is no score.
            placeholder = set()
            if score is None:
                if (Dataset(dataset_name) in MISSING_DATASETS) and (
                    not Method(method_name) in MISSING_METHODS
                ):
                    # dataset has missing values and method can't handle it
                    placeholder.add("M")
                else:
                    for result in summary_results[method]:
                        if result["status"] == "FAIL":
                            placeholder.add("F")
                        elif result["status"] == "TIMEOUT":
                            placeholder.add("T")
            placeholder = "/".join(sorted(placeholder))

            # create a Result object
            res = Result(
                dataset=Dataset(dataset_name),
                experiment=Experiment(experiment),
                is_multidim=is_multi,
                method=Method(method_name),
                metric=Metric(metric),
                score=score,
                summary_file=fname,
                placeholder=placeholder or None,
            )
            results.append(res)
    return results


def average_results(results):
    """Average the results

    NOTE: This function filters out some methods/datasets for which we have
    insufficient results.
    """
    experiment = list(set(r.experiment for r in results))[0]
    # determine if we're dealing with multidimensional datasets
    is_multi = all(r.is_multidim for r in results)

    expected_methods = MULTIMETHODS if is_multi else list(Method)

    # keep only expected methods
    results = list(filter(lambda r: r.method in expected_methods, results))

    # remove RBOCPDMS for 'best', because it fails too often
    if experiment == Experiment.best:
        warning(
            "\nWarning: Removing RBOCPDMS (experiment = %s)\n" % experiment
        )
        results = list(filter(lambda r: r.method != Method.rbocpdms, results))
        expected_methods.remove(Method.rbocpdms)

    # remove datasets for which we do not have complete results
    to_remove = []
    for dataset in set(r.dataset for r in results):
        dset_results = filter(lambda r: r.dataset == dataset, results)
        if any(r.score is None for r in dset_results):
            to_remove.append(dataset)
    if to_remove:
        warning("\nWarning: Filtering out datasets: %r\n" % to_remove)
    results = list(filter(lambda r: not r.dataset in to_remove, results))

    # check that we are now complete: for all datasets and all methods in the
    # remaining results, we have a non-None score.
    assert all(r.score is not None for r in results)

    # compute the average per method
    methods = set(r.method for r in results)
    avg = {}
    for method in methods:
        method_scores = [r.score for r in results if r.method == method]
        avg_score = sum(method_scores) / len(method_scores)
        avg[method.name] = avg_score

    return avg


def write_json(results, is_avg=None):
    if not is_avg in [True, False]:
        raise ValueError("is_avg should be either True or False")

    output = {}
    if is_avg:
        output = results
    else:
        datasets = set(r.dataset for r in results)
        methods = set(r.method for r in results)
        for d in datasets:
            output[d.name] = {}
            for m in methods:
                r = next(
                    (r for r in results if r.dataset == d and r.method == m),
                    None,
                )
                # intended to fail if r is None, because that shouldn't happen
                output[d.name][m.name] = r.score
    print(json.dumps(output, indent="\t", sort_keys=True))


def write_latex(results, dim=None, is_avg=None):
    if is_avg:
        raise NotImplementedError(
            "write_latex is not supported for is_avg = True"
        )

    methods = sorted(set(r.method.name for r in results))
    datasets = sorted(set(r.dataset.name for r in results))
    if dim == "combined":
        uni_datasets = [
            d.name for d in list(Dataset) if not d in MULTIDATASETS
        ]
        multi_datasets = [d.name for d in MULTIDATASETS]
        datasets = sorted(uni_datasets) + sorted(multi_datasets)
        first_multi = sorted(multi_datasets)[0]

    textsc = lambda m: "\\textsc{%s}" % m
    verb = lambda m: "\\verb+%s+" % m

    headers = ["Dataset"] + list(map(textsc, methods))

    table = []
    for dataset in datasets:
        row = [verb(dataset)]
        d = Dataset(dataset)

        for method in methods:
            m = Method(method)
            r = next((r for r in results if r.method == m and r.dataset == d))
            row.append(r.placeholder if r.score is None else r.score)

        table.append(row)
    spec = "l" + "c" * len(methods)
    tex = build_latex_table(table, headers, floatfmt=".3f", table_spec=spec)

    if dim == "combined":
        # add a horizontal line for these datasets
        lines = tex.split("\n")
        newlines = []
        for line in lines:
            if line.startswith(verb(first_multi)):
                newlines.append("\\hline")
            newlines.append(line)
        tex = "\n".join(newlines)

    print(tex)


def main():
    args = parse_args()
    if args.type == "avg" and args.dim == "combined":
        raise ValueError("Using 'avg' and 'combined' is not supported.")

    results = collect_results(
        summary_dir=args.summary_dir,
        metric=Metric(args.metric),
        experiment=args.experiment,
    )

    if args.dim == "uni":
        # filter out multi
        results = list(filter(lambda r: not r.is_multidim, results))
    elif args.dim == "multi":
        # filter out uni
        results = list(filter(lambda r: r.is_multidim, results))

    if args.type == "avg":
        results = average_results(results)

    if args.format == "json":
        write_json(results, is_avg=args.type == "avg")
    else:
        write_latex(results, args.dim, is_avg=args.type == "avg")


if __name__ == "__main__":
    main()
```
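For reference, a minimal fragment of a summary file as consumed by `collect_results`, together with the corresponding `extract_score` behaviour. The fragment is invented for illustration (real files are produced by `summarize.py`, shown further below), and `Metric`/`extract_score` refer to the names defined in `make_table.py` above:

```python
# Hypothetical summary fragment (dataset name and scores are made up).
summary = {
    "dataset": "nile",
    "dataset_ndim": 1,
    "results": {
        "default_amoc": [
            {"status": "SUCCESS", "scores": {"f1": 0.50, "cover": 0.80}},
        ],
        "best_amoc": [
            {"status": "SUCCESS", "scores": {"f1": 0.60, "cover": 0.82}},
            {"status": "TIMEOUT"},
        ],
    },
}

# "default" expects exactly one successful score; "best" takes the maximum
# over all successful runs and ignores failed/timed-out ones.
extract_score(summary["results"]["best_amoc"], metric=Metric.cover,
              experiment="best")  # -> 0.82
```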
diff --git a/analysis/scripts/metrics.py b/analysis/scripts/metrics.py
new file mode 100644
index 00000000..932fbb7c
--- /dev/null
+++ b/analysis/scripts/metrics.py
@@ -0,0 +1,129 @@

```python
# -*- coding: utf-8 -*-

"""
Evaluation metrics

Author: G.J.J. van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""


def true_positives(T, X, margin=5):
    """Compute true positives without double counting
    """
    # make a copy so we don't affect the caller
    X = set(list(X))
    TP = set()
    for tau in T:
        close = [(abs(tau - x), x) for x in X if abs(tau - x) <= margin]
        close.sort()
        if not close:
            continue
        dist, xstar = close[0]
        TP.add(tau)
        X.remove(xstar)
    return TP


def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
    """Compute the F-measure based on human annotations.

    annotations : dict from user_id to iterable of CP locations
    predictions : iterable of predicted CP locations
    alpha : value for the F-measure, alpha=0.5 gives the F1-measure
    return_PR : whether to return precision and recall too

    Remember that all CP locations are 0-based!

    """
    # ensure 0 is in all the sets
    Tks = {k + 1: set(annotations[uid]) for k, uid in enumerate(annotations)}
    for Tk in Tks.values():
        Tk.add(0)

    X = set(predictions)
    X.add(0)

    # union of all annotators' change point locations
    Tstar = [tau for Tk in Tks.values() for tau in Tk]
    Tstar = set(Tstar)

    K = len(Tks)

    P = len(true_positives(Tstar, X, margin=margin)) / len(X)

    TPk = {k: true_positives(Tks[k], X, margin=margin) for k in Tks}
    R = 1 / K * sum(len(TPk[k]) / len(Tks[k]) for k in Tks)

    F = P * R / (alpha * R + (1 - alpha) * P)
    if return_PR:
        return F, P, R
    return F


def overlap(A, B):
    """ Return the overlap (i.e. Jaccard index) of two sets """
    return len(A.intersection(B)) / len(A.union(B))


def partition_from_cps(locations, n_obs):
    """ Return a list of sets that give a partition of the set [0, T-1], as
    defined by the change point locations.

    >>> partition_from_cps([], 5)
    [{0, 1, 2, 3, 4}]
    >>> partition_from_cps([3, 5], 8)
    [{0, 1, 2}, {3, 4}, {5, 6, 7}]
    >>> partition_from_cps([1,2,7], 8)
    [{0}, {1}, {2, 3, 4, 5, 6}, {7}]
    >>> partition_from_cps([0, 4], 6)
    [{0, 1, 2, 3}, {4, 5}]
    """
    T = n_obs
    partition = []
    current = set()

    all_cps = iter(sorted(set(locations)))
    cp = next(all_cps, None)
    for i in range(T):
        if i == cp:
            if current:
                partition.append(current)
                current = set()
            cp = next(all_cps, None)
        current.add(i)
    partition.append(current)
    return partition


def cover_single(Sprime, S):
    """Compute the covering of a segmentation S by a segmentation Sprime.

    This follows equation (8) in Arbelaez, 2010.
    """
    T = sum(map(len, Sprime))
    assert T == sum(map(len, S))
    C = 0
    for R in S:
        C += len(R) * max(overlap(R, Rprime) for Rprime in Sprime)
    C /= T
    return C


def covering(annotations, predictions, n_obs):
    """Compute the average segmentation covering against the human annotations.

    annotations : dict from user_id to iterable of CP locations
    predictions : iterable of predicted CP locations
    n_obs : number of observations in the series

    """
    Ak = {
        k + 1: partition_from_cps(annotations[uid], n_obs)
        for k, uid in enumerate(annotations)
    }
    pX = partition_from_cps(predictions, n_obs)

    Cs = [cover_single(pX, Ak[k]) for k in Ak]
    return sum(Cs) / len(Cs)
```
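A toy example of the two metrics, with invented annotations and predictions; the comments give the approximate values these inputs produce:

```python
# Two annotators and one set of predictions; margin=5 is the module default.
from metrics import f_measure, covering

annotations = {"ann1": [10, 20], "ann2": [10]}
predictions = [10, 35]

F, P, R = f_measure(annotations, predictions, margin=5, return_PR=True)
C = covering(annotations, predictions, n_obs=50)
# F ~ 0.74, P ~ 0.67, R ~ 0.83; C is the covering averaged over annotators.
```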
diff --git a/analysis/scripts/rank_common.py b/analysis/scripts/rank_common.py
new file mode 100644
index 00000000..c47cac77
--- /dev/null
+++ b/analysis/scripts/rank_common.py
@@ -0,0 +1,102 @@

```python
# -*- coding: utf-8 -*-

"""Shared code to do with ranks

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import json
import numpy as np
import sys

from scipy.stats import rankdata


def load_data(filename):
    with open(filename, "r") as fp:
        return json.load(fp)


def compute_ranks(results, keep_methods=None, higher_better=True):
    """Compute the ranks

    Parameters
    ----------
    results : dict
        Mapping from dataset name to dict, where each dict in turn is a map
        from method name to a score value.

    keep_methods : list
        Methods to include in the ranks

    higher_better : bool
        Whether a higher or a lower value is considered better

    Returns
    -------
    avg_ranks : dict
        Map from method name to average rank

    all_ranks : dict
        Map from dataset name to dictionary, which is in turn a map from
        method name to rank for that dataset and that method.

    """
    vec_ranks = []
    all_ranks = {}

    for dset in results:
        methods = results[dset].keys()
        methods = sorted(methods)

        methods = [m for m in methods if m in keep_methods]
        assert methods == keep_methods

        if higher_better:
            values = [-results[dset][m] for m in methods]
        else:
            values = [results[dset][m] for m in methods]

        if any(np.isnan(v) for v in values):
            print(
                "Skipping dataset %s because of nans" % dset, file=sys.stderr
            )
            continue

        ranks = rankdata(values, method="average")

        vec_ranks.append(ranks)
        rank_dict = {m: ranks[i] for i, m in enumerate(methods)}

        all_ranks[dset] = rank_dict

    avg_ranks = np.mean(vec_ranks, axis=0)
    avg_ranks = {m: r for m, r in zip(methods, avg_ranks)}
    return avg_ranks, all_ranks


def preprocess_data(data, _type):
    methods = set([m for dset in data.keys() for m in data[dset].keys()])
    methods = sorted(methods)
    if _type == "best":
        print(
            "\nWarning: Filtering out RBOCPDMS due to insufficient results.\n",
            file=sys.stderr,
        )
        methods = [m for m in methods if not m == "rbocpdms"]

    data_w_methods = {}
    for dset in data:
        data_w_methods[dset] = {}
        for method in methods:
            data_w_methods[dset][method] = data[dset][method]

    data_no_missing = {}
    for dset in data_w_methods:
        if any((x is None for x in data_w_methods[dset].values())):
            continue
        data_no_missing[dset] = data_w_methods[dset]
    return data_no_missing, methods
```
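A short sketch of the input and output shapes of `compute_ranks`; the dataset names are real benchmark datasets but the scores are invented:

```python
from rank_common import compute_ranks

results = {
    "nile": {"amoc": 0.80, "pelt": 0.85, "wbs": 0.85},
    "well_log": {"amoc": 0.60, "pelt": 0.75, "wbs": 0.70},
}
avg_ranks, all_ranks = compute_ranks(
    results, keep_methods=["amoc", "pelt", "wbs"], higher_better=True
)
# avg_ranks -> {"amoc": 3.0, "pelt": 1.25, "wbs": 1.75}
# Ties receive the average rank, e.g. the 0.85 tie on "nile" gives 1.5 each.
```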
diff --git a/analysis/scripts/rank_plots.py b/analysis/scripts/rank_plots.py
new file mode 100644
index 00000000..c4cad16f
--- /dev/null
+++ b/analysis/scripts/rank_plots.py
@@ -0,0 +1,151 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Create rank plots from best table json files.

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse

from labella.timeline import TimelineTex
from labella.scale import LinearScale

from rank_common import load_data, compute_ranks, preprocess_data
from significance import reference_difference


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input",
        help="Input JSON file with results for each method",
        required=True,
    )
    parser.add_argument(
        "-o", "--output", help="Output tex file to write to", required=True
    )
    parser.add_argument(
        "-b",
        "--better",
        help="Whether higher or lower is better",
        choices=["min", "max"],
        default="max",
    )
    parser.add_argument(
        "--type",
        help="Type of table to make",
        choices=["best", "default"],
        required=True,
    )
    return parser.parse_args()


def method_name(m):
    m = m.split("_")[-1]
    return "\\textsc{%s}" % m


def make_rank_plot(
    results,
    output_file,
    keep_methods=None,
    higher_better=True,
    return_ranks=False,
):
    methods = keep_methods[:]
    avg_ranks, all_ranks = compute_ranks(
        results, keep_methods=keep_methods, higher_better=higher_better
    )
    plot_data = [
        {"time": rank, "text": method_name(method)}
        for method, rank in avg_ranks.items()
    ]

    color = "#000"

    options = {
        "scale": LinearScale(),
        "direction": "up",
        "domain": [1, len(methods)],
        "layerGap": 20,
        "borderColor": "#000",
        "showBorder": False,
        "labelBgColor": "#fff",
        "linkColor": color,
        "labelTextColor": color,
        "dotColor": color,
        "initialWidth": 600,
        "initialHeight": 75,
        "latex": {"linkThickness": "thin", "reproducible": True},
        "dotRadius": 2,
        "margin": {"left": 0, "bottom": 0, "right": 0, "top": 0},
    }

    tl = TimelineTex(plot_data, options=options)
    texlines = tl.export()

    n_datasets = len(all_ranks)

    ref_method, CD, _ = reference_difference(avg_ranks, n_datasets)

    # we're going to insert the critical difference line after the dots
    # scope, so we first have to figure out where that is.
    lines = texlines.split("\n")
    idx = None
    find_scope = False
    for i, line in enumerate(lines):
        if line.strip() == "% dots":
            find_scope = True
        if find_scope and "\\end{scope}" in line:
            idx = i + 1
            break

    before = lines[:idx]
    after = lines[idx:]

    nodes, _ = tl.compute()
    bestnode = next(
        (n for n in nodes if n.data.text == method_name(ref_method)), None
    )
    # idealPos is the position on the axis
    posBest = bestnode.getRoot().idealPos
    posCD = tl.options["scale"](bestnode.data.time + CD)

    CDlines = [
        "% Critical difference",
        "\\def\\posBest{%.16f}" % posBest,
        "\\def\\posCD{%.16f}" % posCD,
        "\\begin{scope}",
        "\\draw (\\posBest, 30) -- (\\posBest, 20);",
        "\\draw (\\posBest, 25) --node[below] {CD} (\\posCD, 25);",
        "\\draw (\\posCD, 30) -- (\\posCD, 20);",
        "\\end{scope}",
    ]

    all_lines = before + [""] + CDlines + after

    with open(output_file, "w") as fp:
        fp.write("\n".join(all_lines))


def main():
    args = parse_args()

    higher_better = args.better == "max"

    data = load_data(args.input)
    clean, methods = preprocess_data(data, args.type)

    make_rank_plot(
        clean, args.output, keep_methods=methods, higher_better=higher_better
    )


if __name__ == "__main__":
    main()
```
diff --git a/analysis/scripts/significance.py b/analysis/scripts/significance.py
new file mode 100644
index 00000000..6666306b
--- /dev/null
+++ b/analysis/scripts/significance.py
@@ -0,0 +1,179 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Code to compute significant differences

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import math
import scipy.stats as stats

from tabulate import tabulate

from rank_common import (
    load_data,
    preprocess_data,
    compute_ranks,
)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Output basename")
    parser.add_argument(
        "-i", "--input", help="Input JSON file with results for each method"
    )
    parser.add_argument(
        "-m",
        "--mode",
        choices=["global", "reference"],
        help="Whether to do a global difference F test or a reference test to the best performing method",
    )
    parser.add_argument(
        "--type",
        help="Type of table to make",
        choices=["best", "default"],
        required=True,
    )
    return parser.parse_args()


def global_difference(avg_ranks, n_datasets):
    N = n_datasets
    k = len(avg_ranks)
    avg_sq_sum = sum([pow(float(avg_ranks[m]), 2.0) for m in avg_ranks])

    chi2 = (
        12.0 * N / (k * (k + 1)) * (avg_sq_sum - (k * pow(k + 1, 2.0) / 4.0))
    )
    chiprob = 1.0 - stats.chi2.cdf(chi2, k - 1)

    Fstat = (N - 1.0) * chi2 / (N * (k - 1) - chi2)
    Fprob = 1.0 - stats.f.cdf(Fstat, k - 1, (k - 1) * (N - 1))

    return Fstat, Fprob


def argmin(func, args):
    m, inc = float("inf"), None
    for a in args:
        v = func(a)
        if v < m:
            m, inc = v, a
    return inc


def reference_difference(avg_ranks, n_datasets, significance_level=0.05):
    N = n_datasets
    k = len(avg_ranks)

    methods = sorted(avg_ranks.keys())
    ranks = [avg_ranks[m] for m in methods]
    ref_method = argmin(lambda m: avg_ranks[m], methods)
    ref_idx = methods.index(ref_method)
    others = [m for m in methods if not m == ref_method]

    Z_scores = [0.0] * (k - 1)
    P_values = [0.0] * (k - 1)

    constant = math.sqrt(6 * N / (k * (k + 1)))
    for j, method in enumerate(others):
        i = methods.index(method)
        Z_scores[j] = (ranks[ref_idx] - ranks[i]) * constant
        P_values[j] = stats.norm.cdf(Z_scores[j])

    # sort the p-values in ascending order
    sorted_pvals = sorted((p, i) for i, p in enumerate(P_values))

    # Calculate significance differences following Holm's procedure
    significant_differences = [False] * (k - 1)
    thresholds = [0] * (k - 1)
    CD_threshold = None
    for i in range(k - 1):
        threshold = significance_level / float(k - (i + 1))
        pval, idx = sorted_pvals[i]
        significant_differences[idx] = pval < threshold
        thresholds[idx] = threshold
        if pval > threshold and CD_threshold is None:
            CD_threshold = threshold

    # Calculate the critical difference from the first threshold that failed
    # to reject. This works because if the p-value would be below the
    # threshold we would consider it significantly different and above the
    # threshold we would not.
    CD = -1 * stats.norm.ppf(CD_threshold) / constant

    txt = [
        "Number of datasets: %i" % N,
        "Number of methods: %i" % k,
        "Reference method: %s" % ref_method,
        "Significance level: %g" % significance_level,
        "",
        "Reference method rank: %.6f" % avg_ranks[ref_method],
        "Holm's procedure:",
    ]

    table = []
    for o, p, t, s in zip(
        others, P_values, thresholds, significant_differences
    ):
        table.append([o, avg_ranks[o], p, t, s])

    txt.append(
        tabulate(
            table,
            headers=["Method", "Rank", "p-Value", "Threshold", "Significant"],
        )
    )

    txt.append("")
    txt.append(
        "Critical difference: %.6f (at threshold = %.6f)" % (CD, CD_threshold)
    )
    txt.append("")

    return ref_method, CD, txt


def main():
    args = parse_args()

    data = load_data(args.input)
    clean, methods = preprocess_data(data, args.type)

    n_datasets = len(clean)

    avg_ranks, all_ranks = compute_ranks(
        clean, keep_methods=methods, higher_better=True
    )

    if args.mode == "global":
        Fstat, Fprob = global_difference(avg_ranks, n_datasets)
        if args.output:
            with open(args.output, "w") as fp:
                fp.write("F = %.1f (p = %g)" % (Fstat, Fprob))
        else:
            print("Fstat = %.2f, Fprob = %.2g" % (Fstat, Fprob))
    elif args.mode == "reference":
        ref_method, CD, txt = reference_difference(avg_ranks, n_datasets)
        if args.output:
            outRef = args.output + "_ref.tex"
            with open(outRef, "w") as fp:
                fp.write(ref_method + "%")
            outCD = args.output + "_CD.tex"
            with open(outCD, "w") as fp:
                fp.write(("%.2f" % CD) + "%")
        else:
            print("Reference method = %s, CD = %.2f" % (ref_method, CD))
            print("")
            print("\n".join(txt))


if __name__ == "__main__":
    main()
```
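The two tests are driven from the same JSON files as the rank plots; a minimal sketch of the programmatic use (the input file name is hypothetical):

```python
from rank_common import load_data, preprocess_data, compute_ranks
from significance import global_difference, reference_difference

data = load_data("best_cover_uni.json")  # hypothetical path
clean, methods = preprocess_data(data, "best")
avg_ranks, _ = compute_ranks(clean, keep_methods=methods, higher_better=True)

Fstat, Fprob = global_difference(avg_ranks, n_datasets=len(clean))
ref_method, CD, report = reference_difference(avg_ranks, n_datasets=len(clean))
print("\n".join(report))
```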
diff --git a/analysis/scripts/summarize.py b/analysis/scripts/summarize.py
new file mode 100644
index 00000000..426976c5
--- /dev/null
+++ b/analysis/scripts/summarize.py
@@ -0,0 +1,178 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Summarize the results into a single file per dataset.

For each dataset we want::

    {
        "dataset": "<name>",
        "dataset_nobs": N,
        "dataset_ndim": N,
        "annotations": {
            "<user_id>": [...],
            "<user_id>": [...],
        },
        "results": {
            "<method>": [
                {
                    "parameters": {
                        "<param>": value,
                    },
                    "cplocations": [...],
                    "scores": {
                        "<score_1>": value,
                    },
                    "status": <status>
                },
                {
                    "parameters": {
                        "<param>": value,
                    },
                    "cplocations": [...],
                    "scores": {
                        "<score_1>": value,
                    },
                    "status": <status>
                },
            ],
        }
    }

Basic cleanup on the change point locations will also be performed:

    - deduplication
    - removal of invalid indices. Recall that indices are 0-based. We remove
      any indices smaller than 1 and larger than n_obs - 2. The reason that
      we don't allow 0 or n_obs - 1 (both valid endpoints) is that several
      algorithms declare these locations as change points by default and
      they are meaningless.

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import json
import os
import sys

from metrics import f_measure, covering


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--annotation-file",
        help="Path to annotation file",
        required=True,
    )
    parser.add_argument(
        "-d", "--dataset-file", help="Path to dataset file", required=True
    )
    parser.add_argument(
        "-r", "--result-dir", help="Directory of abed results", required=True
    )
    parser.add_argument("-o", "--output-file", help="File to write to")
    return parser.parse_args()


def load_json(filename):
    with open(filename, "r") as fp:
        try:
            data = json.load(fp)
        except json.decoder.JSONDecodeError:
            print("Error parsing json file: %s" % filename, file=sys.stderr)
            return {"error": "parsing error"}
    return data


def load_annotations(filename, dataset):
    with open(filename, "r") as fp:
        data = json.load(fp)
    return data[dataset]


def clean_cps(locations, dataset):
    n_obs = dataset["n_obs"]
    valid = set([x for x in locations if 1 <= x < n_obs - 1])
    return sorted(valid)


def main():
    args = parse_args()

    dataset = load_json(args.dataset_file)
    annotations = load_annotations(args.annotation_file, dataset["name"])

    out = {
        "dataset": dataset["name"],
        "dataset_nobs": dataset["n_obs"],
        "dataset_ndim": dataset["n_dim"],
        "annotations": annotations,
        "results": {},
    }

    data_results = next(
        (d for d in os.listdir(args.result_dir) if d == dataset["name"]), None
    )
    if data_results is None:
        print(
            "Couldn't find the result directory for dataset %s"
            % dataset["name"],
            file=sys.stderr,
        )
        raise SystemExit(1)

    dataset_dir = os.path.join(args.result_dir, data_results)

    for method in os.listdir(dataset_dir):
        method_dir = os.path.join(dataset_dir, method)
        for result_file in os.listdir(method_dir):
            # print("Processing result file: %s" % result_file)
            fname = os.path.join(method_dir, result_file)
            result = load_json(fname)
            if not method in out["results"]:
                out["results"][method] = []

            if result["status"].lower() == "success":
                locations = clean_cps(result["result"]["cplocations"], dataset)

                f1, precision, recall = f_measure(
                    annotations, locations, return_PR=True
                )
                n_obs = dataset["n_obs"]
                cover = covering(annotations, locations, n_obs)
                scores = {
                    "f1": f1,
                    "precision": precision,
                    "recall": recall,
                    "cover": cover,
                }
            else:
                locations = None
                scores = None

            out["results"][method].append(
                {
                    "parameters": result["parameters"],
                    "task_file": result_file,
                    "cplocations": locations,
                    "scores": scores,
                    "status": result["status"],
                }
            )

    if args.output_file:
        with open(args.output_file, "w") as fp:
            json.dump(out, fp, indent="\t")
    else:
        print(json.dumps(out, indent="\t"))


if __name__ == "__main__":
    main()
```
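The cleanup rule described in the docstring, in a nutshell (toy values only):

```python
# 0 and n_obs - 1 are dropped, duplicates collapse, out-of-range indices go.
clean_cps([0, 3, 3, 7, 99, 100], {"n_obs": 100})  # -> [3, 7]
```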
