Diffstat (limited to 'analysis/scripts/make_table.py')
| -rw-r--r-- | analysis/scripts/make_table.py | 441 |
1 file changed, 441 insertions, 0 deletions
diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py
new file mode 100644
index 00000000..e4747258
--- /dev/null
+++ b/analysis/scripts/make_table.py
@@ -0,0 +1,441 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Script to generate tables from summary files
+
+Metrics, experiments, methods, and datasets are hard-coded as a means of
+validation.
+
+For the "best" experiment, the RBOCPDMS method is excluded because it fails
+too often. For the other experiments, datasets with incomplete results are
+removed.
+
+Author: G.J.J. van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+import colorama
+import json
+import os
+import sys
+import termcolor
+
+from enum import Enum
+from typing import Optional
+
+# from pydantic.dataclasses import dataclass
+from dataclasses import dataclass
+
+from latex import build_latex_table
+
+colorama.init()
+
+
+class Metric(Enum):
+    f1 = "f1"
+    cover = "cover"
+
+
+class Experiment(Enum):
+    default = "default"
+    best = "best"
+
+
+class Dataset(Enum):
+    apple = "apple"
+    bank = "bank"
+    bee_waggle_6 = "bee_waggle_6"
+    bitcoin = "bitcoin"
+    brent_spot = "brent_spot"
+    businv = "businv"
+    centralia = "centralia"
+    children_per_woman = "children_per_woman"
+    co2_canada = "co2_canada"
+    construction = "construction"
+    debt_ireland = "debt_ireland"
+    gdp_argentina = "gdp_argentina"
+    gdp_croatia = "gdp_croatia"
+    gdp_iran = "gdp_iran"
+    gdp_japan = "gdp_japan"
+    global_co2 = "global_co2"
+    homeruns = "homeruns"
+    iceland_tourism = "iceland_tourism"
+    jfk_passengers = "jfk_passengers"
+    lga_passengers = "lga_passengers"
+    nile = "nile"
+    occupancy = "occupancy"
+    ozone = "ozone"
+    quality_control_1 = "quality_control_1"
+    quality_control_2 = "quality_control_2"
+    quality_control_3 = "quality_control_3"
+    quality_control_4 = "quality_control_4"
+    quality_control_5 = "quality_control_5"
+    rail_lines = "rail_lines"
+    ratner_stock = "ratner_stock"
+    robocalls = "robocalls"
+    run_log = "run_log"
+    scanline_126007 = "scanline_126007"
+    scanline_42049 = "scanline_42049"
+    seatbelts = "seatbelts"
+    shanghai_license = "shanghai_license"
+    uk_coal_employ = "uk_coal_employ"
+    measles = "measles"
+    unemployment_nl = "unemployment_nl"
+    us_population = "us_population"
+    usd_isk = "usd_isk"
+    well_log = "well_log"
+
+
+class Method(Enum):
+    amoc = "amoc"
+    binseg = "binseg"
+    bocpd = "bocpd"
+    bocpdms = "bocpdms"
+    cpnp = "cpnp"
+    ecp = "ecp"
+    kcpa = "kcpa"
+    pelt = "pelt"
+    prophet = "prophet"
+    rbocpdms = "rbocpdms"
+    rfpop = "rfpop"
+    segneigh = "segneigh"
+    wbs = "wbs"
+
+
+# Methods that support multidimensional datasets
+MULTIMETHODS = [
+    Method.bocpd,
+    Method.bocpdms,
+    Method.ecp,
+    Method.kcpa,
+    Method.rbocpdms,
+]
+
+# Multidimensional datasets
+MULTIDATASETS = [
+    Dataset.apple,
+    Dataset.bee_waggle_6,
+    Dataset.occupancy,
+    Dataset.run_log,
+]
+
+# Datasets with missing values
+MISSING_DATASETS = [Dataset.uk_coal_employ]
+
+# Methods that handle missing values
+MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet]
+
+
+@dataclass
+class Result:
+    dataset: Dataset
+    experiment: Experiment
+    is_multidim: bool
+    method: Method
+    metric: Metric
+    score: Optional[float]
+    summary_file: str
+    placeholder: Optional[str]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--summary-dir",
+        help="Directory with summary files",
+        required=True,
+    )
+    parser.add_argument(
+        "-m",
+        "--metric",
+        help="Metric to use for the table",
+        choices=["f1", "cover"],
+        required=True,
+    )
+    parser.add_argument(
+        "-e",
+        "--experiment",
+        help="Experiment to make table for",
+        choices=["best", "default"],
+        required=True,
+    )
+    parser.add_argument(
+        "-d",
+        "--dim",
+        help="Dimensionality",
+        choices=["uni", "multi", "combined"],
+        required=True,
+    )
+    parser.add_argument(
+        "-f",
+        "--format",
+        help="Output format",
+        choices=["json", "tex"],
+        required=True,
+    )
+    parser.add_argument(
+        "-t",
+        "--type",
+        help="Type of table to make",
+        choices=["avg", "full"],
+        required=True,
+    )
+    return parser.parse_args()
+
+
+def warning(msg):
+    termcolor.cprint(msg, "yellow", file=sys.stderr)
+
+
+def load_summary(filename):
+    with open(filename, "r") as fp:
+        data = json.load(fp)
+    return data
+
+
+def extract_score(method_results, metric=None, experiment=None):
+    """Extract a single numeric score from a list of result dictionaries"""
+    if metric not in [Metric.f1, Metric.cover]:
+        raise ValueError("Unknown metric: %s" % metric)
+    if experiment not in ["default", "best"]:
+        raise ValueError("Unknown experiment: %s" % experiment)
+
+    # Collect all values for the chosen metric
+    scores = []
+    for result in method_results:
+        if result["status"] != "SUCCESS":
+            continue
+        scores.append(result["scores"][metric.name])
+
+    if len(scores) == 0:
+        return None
+
+    # check that we have only one score for the 'default' experiment
+    if experiment == "default":
+        if len(scores) > 1:
+            raise ValueError("Default experiment with more than one score!")
+        return scores[0]
+    return max(scores)
+
+
+def collect_results(summary_dir=None, metric=None, experiment=None):
+    """Collect the results for the experiment on the specified metric.
+
+    Returns a list of Result objects.
+    """
+    if metric not in [Metric.f1, Metric.cover]:
+        raise ValueError("Unknown metric: %s" % metric)
+    if experiment not in ["default", "best"]:
+        raise ValueError("Unknown experiment: %s" % experiment)
+    if not os.path.isdir(summary_dir):
+        raise FileNotFoundError(summary_dir)
+
+    results = []
+    for fname in sorted(os.listdir(summary_dir)):
+        path = os.path.join(summary_dir, fname)
+        summary_data = load_summary(path)
+
+        dataset_name = summary_data["dataset"]
+        summary_results = summary_data["results"]
+
+        is_multi = summary_data["dataset_ndim"] > 1
+
+        for method in summary_results:
+            # method names are prefixed with the experiment type, so we skip
+            # the ones we don't want
+            if not method.startswith(experiment + "_"):
+                continue
+
+            # extract the metric score for this experiment from the summary
+            # results for the method
+            score = extract_score(
+                summary_results[method], metric=metric, experiment=experiment
+            )
+
+            # strip the experiment from the method name
+            method_name = method[len(experiment + "_") :]
+
+            # determine the placeholder value if there is no score
+            placeholder = set()
+            if score is None:
+                if (Dataset(dataset_name) in MISSING_DATASETS) and (
+                    Method(method_name) not in MISSING_METHODS
+                ):
+                    # dataset has missing values and method can't handle it
+                    placeholder.add("M")
+                else:
+                    for result in summary_results[method]:
+                        if result["status"] == "FAIL":
+                            placeholder.add("F")
+                        elif result["status"] == "TIMEOUT":
+                            placeholder.add("T")
+                placeholder = "/".join(sorted(placeholder))
+
+            # create a Result object
+            res = Result(
+                dataset=Dataset(dataset_name),
+                experiment=Experiment(experiment),
+                is_multidim=is_multi,
+                method=Method(method_name),
+                metric=Metric(metric),
+                score=score,
+                summary_file=fname,
+                placeholder=placeholder or None,
+            )
+            results.append(res)
+    return results
+
+
+def average_results(results):
+    """Average the results
+
+    NOTE: This function filters out some methods/datasets for which we have
+    insufficient results.
+    """
+    experiment = list(set(r.experiment for r in results))[0]
+    # determine if we're dealing with multidimensional datasets
+    is_multi = all(r.is_multidim for r in results)
+
+    expected_methods = MULTIMETHODS if is_multi else list(Method)
+
+    # keep only expected methods
+    results = list(filter(lambda r: r.method in expected_methods, results))
+
+    # remove RBOCPDMS for 'best', because it fails too often
+    if experiment == Experiment.best:
+        warning(
+            "\nWarning: Removing RBOCPDMS (experiment = %s)\n" % experiment
+        )
+        results = list(filter(lambda r: r.method != Method.rbocpdms, results))
+        expected_methods.remove(Method.rbocpdms)
+
+    # remove datasets for which we do not have complete results
+    to_remove = []
+    for dataset in set(r.dataset for r in results):
+        dset_results = filter(lambda r: r.dataset == dataset, results)
+        if any(r.score is None for r in dset_results):
+            to_remove.append(dataset)
+    if to_remove:
+        warning("\nWarning: Filtering out datasets: %r\n" % to_remove)
+    results = list(filter(lambda r: r.dataset not in to_remove, results))
+
+    # check that we are now complete: for all datasets and all methods in the
+    # remaining results, we have a non-None score
+    assert all(r.score is not None for r in results)
+
+    # compute the average per method
+    methods = set(r.method for r in results)
+    avg = {}
+    for method in methods:
+        method_scores = [r.score for r in results if r.method == method]
+        avg_score = sum(method_scores) / len(method_scores)
+        avg[method.name] = avg_score
+
+    return avg
+
+
+def write_json(results, is_avg=None):
+    if is_avg not in [True, False]:
+        raise ValueError("is_avg should be either True or False")
+
+    output = {}
+    if is_avg:
+        output = results
+    else:
+        datasets = set(r.dataset for r in results)
+        methods = set(r.method for r in results)
+        for d in datasets:
+            output[d.name] = {}
+            for m in methods:
+                r = next(
+                    (r for r in results if r.dataset == d and r.method == m),
+                    None,
+                )
+                # intended to fail if r is None, because that shouldn't happen
+                output[d.name][m.name] = r.score
+    print(json.dumps(output, indent="\t", sort_keys=True))
+
+
+def write_latex(results, dim=None, is_avg=None):
+    if is_avg:
+        raise NotImplementedError(
+            "write_latex is not supported for is_avg = True"
+        )
+
+    methods = sorted(set(r.method.name for r in results))
+    datasets = sorted(set(r.dataset.name for r in results))
+    if dim == "combined":
+        uni_datasets = [
+            d.name for d in list(Dataset) if d not in MULTIDATASETS
+        ]
+        multi_datasets = [d.name for d in MULTIDATASETS]
+        datasets = sorted(uni_datasets) + sorted(multi_datasets)
+        first_multi = sorted(multi_datasets)[0]
+
+    textsc = lambda m: "\\textsc{%s}" % m
+    verb = lambda m: "\\verb+%s+" % m
+
+    headers = ["Dataset"] + list(map(textsc, methods))
+
+    table = []
+    for dataset in datasets:
+        row = [verb(dataset)]
+        d = Dataset(dataset)
+
+        for method in methods:
+            m = Method(method)
+            r = next((r for r in results if r.method == m and r.dataset == d))
+            row.append(r.placeholder if r.score is None else r.score)
+
+        table.append(row)
+
+    spec = "l" + "c" * len(methods)
+    tex = build_latex_table(table, headers, floatfmt=".3f", table_spec=spec)
+
+    if dim == "combined":
+        # add a horizontal line before the multidimensional datasets
+        lines = tex.split("\n")
+        newlines = []
+        for line in lines:
+            if line.startswith(verb(first_multi)):
+                newlines.append("\\hline")
+            newlines.append(line)
+        tex = "\n".join(newlines)
+
+    print(tex)
+
+
+def main():
+    args = parse_args()
+    if args.type == "avg" and args.dim == "combined":
+        raise ValueError("Using 'avg' and 'combined' is not supported.")
+
+    results = collect_results(
+        summary_dir=args.summary_dir,
+        metric=Metric(args.metric),
+        experiment=args.experiment,
+    )
+
+    if args.dim == "uni":
+        # filter out multidimensional datasets
+        results = list(filter(lambda r: not r.is_multidim, results))
+    elif args.dim == "multi":
+        # filter out univariate datasets
+        results = list(filter(lambda r: r.is_multidim, results))
+
+    if args.type == "avg":
+        results = average_results(results)
+
+    if args.format == "json":
+        write_json(results, is_avg=args.type == "avg")
+    else:
+        write_latex(results, args.dim, is_avg=args.type == "avg")
+
+
+if __name__ == "__main__":
+    main()
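
For reference, a minimal usage sketch (not part of the commit) that drives the functions added above directly from Python rather than via the command line. It assumes the script is importable as make_table from its own directory and that the summary JSON files live in a placeholder directory "summaries/"; adjust both to the actual layout.

# Hypothetical driver, shown for illustration only -- not part of this diff.
# Prints the per-method average F1 score over the univariate datasets for
# the "best" experiment. The summary directory below is a placeholder path.
from make_table import Metric, average_results, collect_results, write_json

results = collect_results(
    summary_dir="summaries/",  # placeholder; point at the real summary files
    metric=Metric.f1,
    experiment="best",
)
results = [r for r in results if not r.is_multidim]  # keep univariate datasets
write_json(average_results(results), is_avg=True)

The equivalent command-line invocation, based on the argparse options defined in the script, would be roughly: python make_table.py -s summaries/ -m f1 -e best -d uni -f json -t avg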
