author     Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-03-12 14:33:57 +0000
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>  2020-03-12 14:33:57 +0000
commit     7ef8f6e58990fc069cccc71ed6564e8c639ea4fc (patch)
tree       9e7662a34b7d0c1f1c5d9faf6d7d6ea8672f6410 /analysis/scripts
initial commit
Diffstat (limited to 'analysis/scripts')
-rw-r--r--  analysis/scripts/aggregate_table_wide.py   222
-rw-r--r--  analysis/scripts/latex.py                  140
-rw-r--r--  analysis/scripts/make_table.py             441
-rw-r--r--  analysis/scripts/metrics.py                129
-rw-r--r--  analysis/scripts/rank_common.py            102
-rw-r--r--  analysis/scripts/rank_plots.py             151
-rw-r--r--  analysis/scripts/significance.py           179
-rw-r--r--  analysis/scripts/summarize.py              178
8 files changed, 1542 insertions(+), 0 deletions(-)
diff --git a/analysis/scripts/aggregate_table_wide.py b/analysis/scripts/aggregate_table_wide.py
new file mode 100644
index 00000000..712a6a4a
--- /dev/null
+++ b/analysis/scripts/aggregate_table_wide.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+
+"""
+Script to generate the aggregate table (wide version)
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+import json
+import tabulate
+
+from enum import Enum
+
+
+class Method(Enum):
+ amoc = "amoc"
+ binseg = "binseg"
+ bocpd = "bocpd"
+ bocpdms = "bocpdms"
+ cpnp = "cpnp"
+ ecp = "ecp"
+ kcpa = "kcpa"
+ pelt = "pelt"
+ prophet = "prophet"
+ rbocpdms = "rbocpdms"
+ rfpop = "rfpop"
+ segneigh = "segneigh"
+ wbs = "wbs"
+
+
+# Methods that support multidimensional datasets
+MULTIMETHODS = [
+ Method.bocpd,
+ Method.bocpdms,
+ Method.ecp,
+ Method.kcpa,
+ Method.rbocpdms,
+]
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--bcu",
+ help="Path to json file with results for: best/cover/uni",
+ required=True,
+ )
+ parser.add_argument(
+ "--bcm",
+ help="Path to json file with results for: best/cover/multi",
+ required=True,
+ )
+ parser.add_argument(
+ "--bfu",
+ help="Path to json file with results for: best/f1/uni",
+ required=True,
+ )
+ parser.add_argument(
+ "--bfm",
+ help="Path to json file with results for: best/f1/multi",
+ required=True,
+ )
+ parser.add_argument(
+ "--dcu",
+ help="Path to json file with results for: default/cover/uni",
+ required=True,
+ )
+ parser.add_argument(
+ "--dcm",
+ help="Path to json file with results for: default/cover/multi",
+ required=True,
+ )
+ parser.add_argument(
+ "--dfu",
+ help="Path to json file with results for: default/f1/uni",
+ required=True,
+ )
+ parser.add_argument(
+ "--dfm",
+ help="Path to json file with results for: default/f1/multi",
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def load_json(filename):
+ with open(filename, "r") as fp:
+ return json.load(fp)
+
+
+def make_table(
+ label,
+ uni_default_cover,
+ uni_default_f1,
+ uni_best_cover,
+ uni_best_f1,
+ multi_default_cover,
+ multi_default_f1,
+ multi_best_cover,
+ multi_best_f1,
+ methods,
+):
+ """Create part of the aggregate table
+ """
+ tex = []
+ tex.append("%% This table requires booktabs!")
+
+ tex.append("\\begin{tabular}{lrr|rrrr|rr}")
+ superheader = (
+ " & ".join(
+ [
+ "",
+ "\\multicolumn{4}{c}{Univariate}",
+ "\\multicolumn{4}{c}{Multivariate} \\\\",
+ ]
+ )
+ + "\\cmidrule(lr){2-5}\\cmidrule(lr){6-9}"
+ )
+ tex.append(superheader)
+ header = (
+ " & ".join(
+ [
+ "",
+ "\\multicolumn{2}{c}{Default}",
+ "\\multicolumn{2}{c}{Best}",
+ "\\multicolumn{2}{c}{Default}",
+ "\\multicolumn{2}{c}{Best} \\\\",
+ ]
+ )
+ + "\\cmidrule(lr){2-3}"
+ + "\\cmidrule(lr){4-5}"
+ + "\\cmidrule(lr){6-7}"
+ + "\\cmidrule(lr){8-9}"
+ )
+ tex.append(header)
+ subheader = (
+ " & ".join(
+ [
+ "",
+ "Cover",
+ "F1",
+ "Cover",
+ "F1",
+ "Cover",
+ "F1",
+ "Cover",
+ "F1" + "\\\\",
+ ]
+ )
+ + "\\cmidrule(r){1-1}"
+ + "\\cmidrule(lr){2-5}"
+ + "\\cmidrule(l){6-9}"
+ )
+ tex.append(subheader)
+
+ table = []
+
+ L = max(map(len, methods))
+ textsc = lambda m: "\\textsc{%s}%s" % (m, (L - len(m)) * " ")
+ table.append(list(map(textsc, methods)))
+
+ all_exps = [
+ [uni_default_cover, uni_default_f1, uni_best_cover, uni_best_f1],
+ [
+ multi_default_cover,
+ multi_default_f1,
+ multi_best_cover,
+ multi_best_f1,
+ ],
+ ]
+
+ for exps in all_exps:
+ for exp in exps:
+ row = []
+ maxscore = max((exp[m] for m in methods if m in exp))
+ for m in methods:
+ if not m in exp:
+ row.append(5 * " ")
+ continue
+ score = exp[m]
+ scorestr = tabulate._format(
+ score, tabulate._float_type, ".3f", ""
+ )
+ if score == maxscore:
+ row.append("\\textbf{%s}" % scorestr)
+ else:
+ row.append(scorestr)
+ table.append(row)
+
+ transposed = list(zip(*table))
+ for row in transposed:
+ tex.append(" & ".join(row) + " \\\\")
+ tex.append("\\cmidrule(r){1-1}\\cmidrule(lr){2-5}\\cmidrule(l){6-9}")
+ tex.append("\\end{tabular}")
+
+ return tex
+
+
+def main():
+ args = parse_args()
+
+ bcu = load_json(args.bcu)
+ bcm = load_json(args.bcm)
+ bfu = load_json(args.bfu)
+ bfm = load_json(args.bfm)
+ dcu = load_json(args.dcu)
+ dcm = load_json(args.dcm)
+ dfu = load_json(args.dfu)
+ dfm = load_json(args.dfm)
+
+ methods = sorted([m.name for m in Method])
+ tex = make_table("Wide", dcu, dfu, bcu, bfu, dcm, dfm, bcm, bfm, methods)
+
+ print("\n".join(tex))
+
+
+if __name__ == "__main__":
+ main()
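Each of the eight inputs is a flat JSON file mapping method names to a single aggregate score, which is what make_table.py (below) emits for --type avg --format json. A minimal sketch of driving make_table() directly with toy scores; it assumes the script is importable from analysis/scripts and a tabulate release that still exposes the private tabulate._format helper used above (pre-0.9):

    from aggregate_table_wide import Method, make_table

    toy = {m.name: 0.5 for m in Method}   # made-up scores, one per method
    toy["bocpd"] = 0.7                    # the per-column maximum gets bolded
    methods = sorted(m.name for m in Method)
    tex = make_table("Wide", toy, toy, toy, toy, toy, toy, toy, toy, methods)
    print("\n".join(tex))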
diff --git a/analysis/scripts/latex.py b/analysis/scripts/latex.py
new file mode 100644
index 00000000..107ce20a
--- /dev/null
+++ b/analysis/scripts/latex.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Code for compiling latex from Python.
+
+Based on: https://github.com/GjjvdBurg/labella.py
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import os
+import shutil
+import subprocess
+import tabulate
+import tempfile
+
+
+def compile_latex(fname, tmpdirname, silent=True):
+ compiler = "latexmk"
+ compiler_args = [
+ "--pdf",
+ "--outdir=" + tmpdirname,
+ "--interaction=nonstopmode",
+ fname,
+ ]
+ command = [compiler] + compiler_args
+ try:
+ output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+ except (OSError, IOError) as e:
+ raise (e)
+ except subprocess.CalledProcessError as e:
+ print(e.output.decode())
+ raise (e)
+ else:
+ if not silent:
+ print(output.decode())
+
+
+def build_latex_doc(tex, output_name=None, silent=True):
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ basename = "labella_text"
+ fname = os.path.join(tmpdirname, basename + ".tex")
+ with open(fname, "w") as fid:
+ fid.write(tex)
+
+ compile_latex(fname, tmpdirname, silent=silent)
+
+ pdfname = os.path.join(tmpdirname, basename + ".pdf")
+ if output_name:
+ shutil.copy2(pdfname, output_name)
+
+
+def build_latex_table(
+ table,
+ headers,
+ floatfmt="g",
+ missingval="",
+ bests="default",
+ table_spec=None,
+):
+ """Construct the LaTeX code for a table
+
+ This function creates the LaTeX code for a data table while taking number
+ formatting, headers, missing values, and "best value formatting" into
+ account.
+
+ The numbers in the table are formatted following the provided float format
+ and the missing value indicator using the ``_format`` function from the
+ ``tabulate`` package. To indicate a missing value the data row should mark
+ this value as ``None``.
+
+ The ``bests`` parameter is used to decide how to highlight the best values
+ in each row. It can be either ``'default'``, ``None``, a list of length 1
+ where the element is either ``min`` or ``max``, or a list of length ``K``
+ with similar elements where ``K`` is the length of the data table. If it is
+ ``'default'`` then ``max`` will be considered best for each row. If a list
+ of length 1 is supplied then the provided function will be used for each
+ row. If ``None``, no highlighting will be done.
+
+ The ``table_spec`` parameter allows the user to specify the table
+ specification. This value is not checked. If it is None, the first column
+ will get 'l' spec and the remaining columns will get the 'r' spec.
+
+ """
+ if bests == "default":
+ bests = [max]
+ elif bests is None:
+ bests = []
+
+ if len(bests) > 1:
+ assert len(bests) == len(table)
+ assert all((x in [min, max] for x in bests))
+
+ if len(bests) == 0:
+ best_funcs = [None for x in range(len(table))]
+ elif len(bests) == 1:
+ best_funcs = [bests[0] for x in range(len(table))]
+ else:
+ best_funcs = bests[:]
+
+ _typ = lambda v: tabulate._type(v)
+ _fmt = lambda v: tabulate._format(v, _typ(v), floatfmt, missingval)
+
+ list_of_lists, headers = table, headers
+ cols = list(zip(*list_of_lists))
+ coltypes = list(map(tabulate._column_type, cols))
+
+ cols = [
+ [_fmt(v) for v in c]
+ for c, ct in zip(cols, coltypes)
+ ]
+
+ n_cols = len(cols)
+
+ data_rows = table
+ text_rows = list(zip(*cols))
+
+ text = []
+ if table_spec is None:
+ text.append("\\begin{tabular}{l%s}" % ("r" * (n_cols - 1)))
+ else:
+ text.append("\\begin{tabular}{%s}" % table_spec)
+ text.append(" & ".join(headers) + "\\\\")
+ text.append("\\hline")
+ for data_row, text_row, best_func in zip(data_rows, text_rows, best_funcs):
+ text_row = list(text_row)
+ if not best_func is None:
+ best_val = best_func([x for x in data_row if isinstance(x, float)])
+ best_idx = [i for i, v in enumerate(data_row) if v == best_val]
+ for idx in best_idx:
+ text_row[idx] = "\\textbf{" + text_row[idx] + "}"
+ text.append(" & ".join(text_row) + "\\\\")
+ text.append("\\hline")
+ text.append("\\end{tabular}")
+
+ return "\n".join(text)
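A minimal usage sketch for build_latex_table with toy data (like the module itself, it assumes a tabulate release that still exposes the private helpers used above, i.e. pre-0.9):

    from latex import build_latex_table

    headers = ["Dataset", "amoc", "pelt"]
    table = [
        ["nile", 0.810, 0.853],
        ["well_log", None, 0.641],  # None is rendered as the missing value
    ]
    # with the default 'bests', max is applied per row and the best score is bolded
    print(build_latex_table(table, headers, floatfmt=".3f", missingval="-"))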
diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py
new file mode 100644
index 00000000..e4747258
--- /dev/null
+++ b/analysis/scripts/make_table.py
@@ -0,0 +1,441 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Script to generate tables from summary files
+
+Metrics, experiments, methods, and datasets are hard-coded as a means of
+validation.
+
+For the "best" experiment, the RBOCPDMS method is excluded because it fails too
+often. When computing averaged tables, datasets with incomplete results are removed.
+
+Author: G.J.J. van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+import colorama
+import json
+import os
+import sys
+import termcolor
+
+from enum import Enum
+from typing import Optional
+
+# from pydantic.dataclasses import dataclass
+from dataclasses import dataclass
+
+from latex import build_latex_table
+
+colorama.init()
+
+
+class Metric(Enum):
+ f1 = "f1"
+ cover = "cover"
+
+
+class Experiment(Enum):
+ default = "default"
+ best = "best"
+
+
+class Dataset(Enum):
+ apple = "apple"
+ bank = "bank"
+ bee_waggle_6 = "bee_waggle_6"
+ bitcoin = "bitcoin"
+ brent_spot = "brent_spot"
+ businv = "businv"
+ centralia = "centralia"
+ children_per_woman = "children_per_woman"
+ co2_canada = "co2_canada"
+ construction = "construction"
+ debt_ireland = "debt_ireland"
+ gdp_argentina = "gdp_argentina"
+ gdp_croatia = "gdp_croatia"
+ gdp_iran = "gdp_iran"
+ gdp_japan = "gdp_japan"
+ global_co2 = "global_co2"
+ homeruns = "homeruns"
+ iceland_tourism = "iceland_tourism"
+ jfk_passengers = "jfk_passengers"
+ lga_passengers = "lga_passengers"
+ nile = "nile"
+ occupancy = "occupancy"
+ ozone = "ozone"
+ quality_control_1 = "quality_control_1"
+ quality_control_2 = "quality_control_2"
+ quality_control_3 = "quality_control_3"
+ quality_control_4 = "quality_control_4"
+ quality_control_5 = "quality_control_5"
+ rail_lines = "rail_lines"
+ ratner_stock = "ratner_stock"
+ robocalls = "robocalls"
+ run_log = "run_log"
+ scanline_126007 = "scanline_126007"
+ scanline_42049 = "scanline_42049"
+ seatbelts = "seatbelts"
+ shanghai_license = "shanghai_license"
+ uk_coal_employ = "uk_coal_employ"
+ measles = "measles"
+ unemployment_nl = "unemployment_nl"
+ us_population = "us_population"
+ usd_isk = "usd_isk"
+ well_log = "well_log"
+
+
+class Method(Enum):
+ amoc = "amoc"
+ binseg = "binseg"
+ bocpd = "bocpd"
+ bocpdms = "bocpdms"
+ cpnp = "cpnp"
+ ecp = "ecp"
+ kcpa = "kcpa"
+ pelt = "pelt"
+ prophet = "prophet"
+ rbocpdms = "rbocpdms"
+ rfpop = "rfpop"
+ segneigh = "segneigh"
+ wbs = "wbs"
+
+
+# Methods that support multidimensional datasets
+MULTIMETHODS = [
+ Method.bocpd,
+ Method.bocpdms,
+ Method.ecp,
+ Method.kcpa,
+ Method.rbocpdms,
+]
+
+# Multidimensional datasets
+MULTIDATASETS = [
+ Dataset.apple,
+ Dataset.bee_waggle_6,
+ Dataset.occupancy,
+ Dataset.run_log,
+]
+
+# Datasets with missing values
+MISSING_DATASETS = [Dataset.uk_coal_employ]
+
+# Methods that handle missing values
+MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet]
+
+
+@dataclass
+class Result:
+ dataset: Dataset
+ experiment: Experiment
+ is_multidim: bool
+ method: Method
+ metric: Metric
+ score: Optional[float]
+ summary_file: str
+ placeholder: Optional[str]
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-s",
+ "--summary-dir",
+ help="Directory with summary files",
+ required=True,
+ )
+ parser.add_argument(
+ "-m",
+ "--metric",
+ help="Metric to use for the table",
+ choices=["f1", "cover"],
+ required=True,
+ )
+ parser.add_argument(
+ "-e",
+ "--experiment",
+ help="Experiment to make table for",
+ choices=["best", "default"],
+ required=True,
+ )
+ parser.add_argument(
+ "-d",
+ "--dim",
+ help="Dimensionality",
+ choices=["uni", "multi", "combined"],
+ required=True,
+ )
+ parser.add_argument(
+ "-f",
+ "--format",
+ help="Output format",
+ choices=["json", "tex"],
+ required=True,
+ )
+ parser.add_argument(
+ "-t",
+ "--type",
+ help="Type of table to make",
+ choices=["avg", "full"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def warning(msg):
+ termcolor.cprint(msg, "yellow", file=sys.stderr)
+
+
+def load_summary(filename):
+ with open(filename, "r") as fp:
+ data = json.load(fp)
+ return data
+
+
+def extract_score(method_results, metric=None, experiment=None):
+ """Extract a single numeric score from a list of dictionaries
+ """
+
+ if not metric in [Metric.f1, Metric.cover]:
+ raise ValueError("Unknown metric: %s" % metric)
+ if not experiment in ["default", "best"]:
+ raise ValueError("Unknown experiment: %s" % experiment)
+
+ # Collect all values for the chosen metric
+ scores = []
+ for result in method_results:
+ if not result["status"] == "SUCCESS":
+ continue
+ scores.append(result["scores"][metric.name])
+
+ if len(scores) == 0:
+ return None
+
+ # check that we have only one score for the 'default' experiment
+ if experiment == "default":
+ if len(scores) > 1:
+ raise ValueError("Default experiment with more than one score!")
+ return scores[0]
+ return max(scores)
+
+
+def collect_results(summary_dir=None, metric=None, experiment=None):
+ """Collect the results for the experiment on the specified metric.
+
+ Returns a list of Result objects.
+ """
+ if not metric in [Metric.f1, Metric.cover]:
+ raise ValueError("Unknown metric: %s" % metric)
+ if not experiment in ["default", "best"]:
+ raise ValueError("Unknown experiment: %s" % experiment)
+ if not os.path.isdir(summary_dir):
+ raise FileNotFoundError(summary_dir)
+
+ results = []
+ for fname in sorted(os.listdir(summary_dir)):
+ path = os.path.join(summary_dir, fname)
+ summary_data = load_summary(path)
+
+ dataset_name = summary_data["dataset"]
+ summary_results = summary_data["results"]
+
+ is_multi = summary_data["dataset_ndim"] > 1
+
+ for method in summary_results:
+ # method names are prefixed with the experiment type, so we skip
+ # the ones we don't want
+ if not method.startswith(experiment + "_"):
+ continue
+
+ # extract the metric score for this experiment from the summary
+ # results for the method
+ score = extract_score(
+ summary_results[method], metric=metric, experiment=experiment
+ )
+
+ # strip the experiment from the method name
+ method_name = method[len(experiment + "_") :]
+
+ # determine the placeholder value if there is no score.
+ placeholder = set()
+ if score is None:
+ if (Dataset(dataset_name) in MISSING_DATASETS) and (
+ not Method(method_name) in MISSING_METHODS
+ ):
+ # dataset has missing values and method can't handle it
+ placeholder.add("M")
+ else:
+ for result in summary_results[method]:
+ if result["status"] == "FAIL":
+ placeholder.add("F")
+ elif result["status"] == "TIMEOUT":
+ placeholder.add("T")
+ placeholder = "/".join(sorted(placeholder))
+
+ # create a Result object
+ res = Result(
+ dataset=Dataset(dataset_name),
+ experiment=Experiment(experiment),
+ is_multidim=is_multi,
+ method=Method(method_name),
+ metric=Metric(metric),
+ score=score,
+ summary_file=fname,
+ placeholder=placeholder or None,
+ )
+ results.append(res)
+ return results
+
+
+def average_results(results):
+ """Average the results
+
+ NOTE: This function filters out some methods/datasets for which we have
+ insufficient results.
+ """
+ experiment = list(set(r.experiment for r in results))[0]
+ # determine if we're dealing with multidimensional datasets
+ is_multi = all(r.is_multidim for r in results)
+
+ expected_methods = list(MULTIMETHODS) if is_multi else list(Method)
+
+ # keep only expected methods
+ results = list(filter(lambda r: r.method in expected_methods, results))
+
+ # remove RBOCPDMS for 'best', because it fails too often
+ if experiment == Experiment.best:
+ warning(
+ "\nWarning: Removing RBOCPDMS (experiment = %s)\n" % experiment
+ )
+ results = list(filter(lambda r: r.method != Method.rbocpdms, results))
+ expected_methods.remove(Method.rbocpdms)
+
+ # remove datasets for which we do not have complete results
+ to_remove = []
+ for dataset in set(r.dataset for r in results):
+ dset_results = filter(lambda r: r.dataset == dataset, results)
+ if any(r.score is None for r in dset_results):
+ to_remove.append(dataset)
+ if to_remove:
+ warning("\nWarning: Filtering out datasets: %r\n" % to_remove)
+ results = list(filter(lambda r: not r.dataset in to_remove, results))
+
+ # check that we are now complete: for all datasets and all methods in the
+ # remaining results, we have a non-None score.
+ assert all(r.score is not None for r in results)
+
+ # compute the average per method
+ methods = set(r.method for r in results)
+ avg = {}
+ for method in methods:
+ method_scores = [r.score for r in results if r.method == method]
+ avg_score = sum(method_scores) / len(method_scores)
+ avg[method.name] = avg_score
+
+ return avg
+
+
+def write_json(results, is_avg=None):
+ if not is_avg in [True, False]:
+ raise ValueError("is_avg should be either True or False")
+
+ output = {}
+ if is_avg:
+ output = results
+ else:
+ datasets = set(r.dataset for r in results)
+ methods = set(r.method for r in results)
+ for d in datasets:
+ output[d.name] = {}
+ for m in methods:
+ r = next(
+ (r for r in results if r.dataset == d and r.method == m),
+ None,
+ )
+ # intended to fail if r is None, because that shouldn't happen
+ output[d.name][m.name] = r.score
+ print(json.dumps(output, indent="\t", sort_keys=True))
+
+
+def write_latex(results, dim=None, is_avg=None):
+ if is_avg:
+ raise NotImplementedError(
+ "write_latex is not supported for is_avg = True"
+ )
+
+ methods = sorted(set(r.method.name for r in results))
+ datasets = sorted(set(r.dataset.name for r in results))
+ if dim == "combined":
+ uni_datasets = [
+ d.name for d in list(Dataset) if not d in MULTIDATASETS
+ ]
+ multi_datasets = [d.name for d in MULTIDATASETS]
+ datasets = sorted(uni_datasets) + sorted(multi_datasets)
+ first_multi = sorted(multi_datasets)[0]
+
+ textsc = lambda m: "\\textsc{%s}" % m
+ verb = lambda m: "\\verb+%s+" % m
+
+ headers = ["Dataset"] + list(map(textsc, methods))
+
+ table = []
+ for dataset in datasets:
+ row = [verb(dataset)]
+ d = Dataset(dataset)
+
+ for method in methods:
+ m = Method(method)
+ r = next((r for r in results if r.method == m and r.dataset == d))
+ row.append(r.placeholder if r.score is None else r.score)
+
+ table.append(row)
+ spec = "l" + "c" * len(methods)
+ tex = build_latex_table(table, headers, floatfmt=".3f", table_spec=spec)
+
+ if dim == "combined":
+ # add a horizontal line for these datasets
+ lines = tex.split("\n")
+ newlines = []
+ for line in lines:
+ if line.startswith(verb(first_multi)):
+ newlines.append("\\hline")
+ newlines.append(line)
+ tex = "\n".join(newlines)
+
+ print(tex)
+
+
+def main():
+ args = parse_args()
+ if args.type == "avg" and args.dim == "combined":
+ raise ValueError("Using 'avg' and 'combined' is not supported.")
+
+ results = collect_results(
+ summary_dir=args.summary_dir,
+ metric=Metric(args.metric),
+ experiment=args.experiment,
+ )
+
+ if args.dim == "uni":
+ # filter out multi
+ results = list(filter(lambda r: not r.is_multidim, results))
+ elif args.dim == "multi":
+ # filter out uni
+ results = list(filter(lambda r: r.is_multidim, results))
+
+ if args.type == "avg":
+ results = average_results(results)
+
+ if args.format == "json":
+ write_json(results, is_avg=args.type == "avg")
+ else:
+ write_latex(results, args.dim, is_avg=args.type == "avg")
+
+
+if __name__ == "__main__":
+ main()
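A sketch of what extract_score operates on: the list of per-run entries stored under a method in a summary file (the full format is documented in summarize.py below). The scores here are made up, and importing make_table assumes colorama, termcolor, and tabulate are installed:

    from make_table import Metric, extract_score

    method_results = [
        {"status": "SUCCESS", "scores": {"f1": 0.42, "cover": 0.61}},
        {"status": "TIMEOUT"},
        {"status": "SUCCESS", "scores": {"f1": 0.55, "cover": 0.70}},
    ]
    # 'best' keeps the maximum over successful runs; 'default' expects exactly one
    print(extract_score(method_results, metric=Metric.f1, experiment="best"))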
diff --git a/analysis/scripts/metrics.py b/analysis/scripts/metrics.py
new file mode 100644
index 00000000..932fbb7c
--- /dev/null
+++ b/analysis/scripts/metrics.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+"""
+Evaluation metrics
+
+Author: G.J.J. van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+def true_positives(T, X, margin=5):
+ """Compute true positives without double counting
+ """
+ # make a copy so we don't affect the caller
+ X = set(list(X))
+ TP = set()
+ for tau in T:
+ close = [(abs(tau - x), x) for x in X if abs(tau - x) <= margin]
+ close.sort()
+ if not close:
+ continue
+ dist, xstar = close[0]
+ TP.add(tau)
+ X.remove(xstar)
+ return TP
+
+
+def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
+ """Compute the F-measure based on human annotations.
+
+ annotations : dict from user_id to iterable of CP locations
+ predictions : iterable of predicted CP locations
+ alpha : value for the F-measure, alpha=0.5 gives the F1-measure
+ return_PR : whether to return precision and recall too
+
+ Remember that all CP locations are 0-based!
+
+ """
+ # ensure 0 is in all the sets
+ Tks = {k + 1: set(annotations[uid]) for k, uid in enumerate(annotations)}
+ for Tk in Tks.values():
+ Tk.add(0)
+
+ X = set(predictions)
+ X.add(0)
+
+ Tstar = set()
+ Tstar.update(*Tks.values())
+
+ K = len(Tks)
+
+ P = len(true_positives(Tstar, X, margin=margin)) / len(X)
+
+ TPk = {k: true_positives(Tks[k], X, margin=margin) for k in Tks}
+ R = 1 / K * sum(len(TPk[k]) / len(Tks[k]) for k in Tks)
+
+ F = P * R / (alpha * R + (1 - alpha) * P)
+ if return_PR:
+ return F, P, R
+ return F
+
+
+def overlap(A, B):
+ """ Return the overlap (i.e. Jaccard index) of two sets """
+ return len(A.intersection(B)) / len(A.union(B))
+
+
+def partition_from_cps(locations, n_obs):
+ """ Return a list of sets that give a partition of the set [0, T-1], as
+ defined by the change point locations.
+
+ >>> partition_from_cps([], 5)
+ [{0, 1, 2, 3, 4}]
+ >>> partition_from_cps([3, 5], 8)
+ [{0, 1, 2}, {3, 4}, {5, 6, 7}]
+ >>> partition_from_cps([1,2,7], 8)
+ [{0}, {1}, {2, 3, 4, 5, 6}, {7}]
+ >>> partition_from_cps([0, 4], 6)
+ [{0, 1, 2, 3}, {4, 5}]
+ """
+ T = n_obs
+ partition = []
+ current = set()
+
+ all_cps = iter(sorted(set(locations)))
+ cp = next(all_cps, None)
+ for i in range(T):
+ if i == cp:
+ if current:
+ partition.append(current)
+ current = set()
+ cp = next(all_cps, None)
+ current.add(i)
+ partition.append(current)
+ return partition
+
+
+def cover_single(Sprime, S):
+ """Compute the covering of a segmentation S by a segmentation Sprime.
+
+ This follows equation (8) in Arbelaez et al., 2010.
+ """
+ T = sum(map(len, Sprime))
+ assert T == sum(map(len, S))
+ C = 0
+ for R in S:
+ C += len(R) * max(overlap(R, Rprime) for Rprime in Sprime)
+ C /= T
+ return C
+
+
+def covering(annotations, predictions, n_obs):
+ """Compute the average segmentation covering against the human annotations.
+
+ annotations : dict from user_id to iterable of CP locations
+ predictions : iterable of predicted Cp locations
+ n_obs : number of observations in the series
+
+ """
+ Ak = {
+ k + 1: partition_from_cps(annotations[uid], n_obs)
+ for k, uid in enumerate(annotations)
+ }
+ pX = partition_from_cps(predictions, n_obs)
+
+ Cs = [cover_single(pX, Ak[k]) for k in Ak]
+ return sum(Cs) / len(Cs)
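A worked example of both metrics on toy input (the annotator ids and change point locations are made up): two annotators on a series of ten observations and a single detected change point:

    from metrics import f_measure, covering

    annotations = {"annotator_1": [3, 7], "annotator_2": [3]}
    predictions = [4]  # within the default margin of 5 of the annotation at 3

    f1, precision, recall = f_measure(annotations, predictions, return_PR=True)
    cover = covering(annotations, predictions, n_obs=10)
    print("F1 = %.3f, P = %.3f, R = %.3f, cover = %.3f" % (f1, precision, recall, cover))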
diff --git a/analysis/scripts/rank_common.py b/analysis/scripts/rank_common.py
new file mode 100644
index 00000000..c47cac77
--- /dev/null
+++ b/analysis/scripts/rank_common.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+"""Shared code to do with ranks
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import json
+import numpy as np
+import sys
+
+from scipy.stats import rankdata
+
+
+def load_data(filename):
+ with open(filename, "r") as fp:
+ return json.load(fp)
+
+
+def compute_ranks(results, keep_methods=None, higher_better=True):
+ """Compute the ranks
+
+ Parameters
+ ----------
+ results : dict
+ Mapping from dataset name to dict, where each dict in turn is a map
+ from method name to a score value.
+
+ keep_methods: list
+ Methods to include in the ranks
+
+ higher_better: bool
+ Whether a higher or a lower value is considered better
+
+ Returns
+ -------
+ avg_ranks : dict
+ Map from method name to average rank
+
+ all_ranks: dict
+ Map from dataset name to dictionary, which is in turn a map from method
+ name to rank for that dataset and that method.
+
+ """
+ vec_ranks = []
+ all_ranks = {}
+
+ for dset in results:
+ methods = results[dset].keys()
+ methods = sorted(methods)
+
+ methods = [m for m in methods if m in keep_methods]
+ assert methods == keep_methods
+
+ if higher_better:
+ values = [-results[dset][m] for m in methods]
+ else:
+ values = [results[dset][m] for m in methods]
+
+ if any(np.isnan(v) for v in values):
+ print(
+ "Skipping dataset %s because of nans" % dset, file=sys.stderr
+ )
+ continue
+
+ ranks = rankdata(values, method="average")
+
+ vec_ranks.append(ranks)
+ rank_dict = {m: ranks[i] for i, m in enumerate(methods)}
+
+ all_ranks[dset] = rank_dict
+
+ avg_ranks = np.mean(vec_ranks, axis=0)
+ avg_ranks = {m: r for m, r in zip(methods, avg_ranks)}
+ return avg_ranks, all_ranks
+
+
+def preprocess_data(data, _type):
+ methods = set([m for dset in data.keys() for m in data[dset].keys()])
+ methods = sorted(methods)
+ if _type == "best":
+ print(
+ "\nWarning: Filtering out RBOCPDMS due to insufficient results.\n",
+ file=sys.stderr,
+ )
+ methods = [m for m in methods if not m == "rbocpdms"]
+
+ data_w_methods = {}
+ for dset in data:
+ data_w_methods[dset] = {}
+ for method in methods:
+ data_w_methods[dset][method] = data[dset][method]
+
+ data_no_missing = {}
+ for dset in data_w_methods:
+ if any((x is None for x in data_w_methods[dset].values())):
+ continue
+ data_no_missing[dset] = data_w_methods[dset]
+ return data_no_missing, methods
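A small sketch of compute_ranks on toy scores for three methods on two datasets (numpy and scipy are required, as above); ties receive their average rank and, because higher_better=True, scores are negated before ranking so that the best score gets rank 1:

    from rank_common import compute_ranks

    results = {
        "nile":     {"amoc": 0.80, "pelt": 0.90, "wbs": 0.85},
        "well_log": {"amoc": 0.60, "pelt": 0.70, "wbs": 0.70},
    }
    avg_ranks, all_ranks = compute_ranks(
        results, keep_methods=["amoc", "pelt", "wbs"], higher_better=True
    )
    print(avg_ranks)              # average ranks: amoc 3.0, pelt 1.25, wbs 1.75
    print(all_ranks["well_log"])  # the tie between pelt and wbs gives both rank 1.5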
diff --git a/analysis/scripts/rank_plots.py b/analysis/scripts/rank_plots.py
new file mode 100644
index 00000000..c4cad16f
--- /dev/null
+++ b/analysis/scripts/rank_plots.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Create rank plots from best table json files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+
+from labella.timeline import TimelineTex
+from labella.scale import LinearScale
+
+from rank_common import load_data, compute_ranks, preprocess_data
+from significance import reference_difference
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-i",
+ "--input",
+ help="Input JSON file with results for each method",
+ required=True,
+ )
+ parser.add_argument(
+ "-o", "--output", help="Output tex file to write to", required=True
+ )
+ parser.add_argument(
+ "-b",
+ "--better",
+ help="Whether higher or lower is better",
+ choices=["min", "max"],
+ default="max",
+ )
+ parser.add_argument(
+ "--type",
+ help="Type of table to make",
+ choices=["best", "default"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def method_name(m):
+ m = m.split("_")[-1]
+ return "\\textsc{%s}" % m
+
+
+def make_rank_plot(
+ results,
+ output_file,
+ keep_methods=None,
+ higher_better=True,
+ return_ranks=False,
+):
+ methods = keep_methods[:]
+ avg_ranks, all_ranks = compute_ranks(
+ results, keep_methods=keep_methods, higher_better=higher_better
+ )
+ plot_data = [
+ {"time": rank, "text": method_name(method)}
+ for method, rank in avg_ranks.items()
+ ]
+
+ color = "#000"
+
+ options = {
+ "scale": LinearScale(),
+ "direction": "up",
+ "domain": [1, len(methods)],
+ "layerGap": 20,
+ "borderColor": "#000",
+ "showBorder": False,
+ "labelBgColor": "#fff",
+ "linkColor": color,
+ "labelTextColor": color,
+ "dotColor": color,
+ "initialWidth": 600,
+ "initialHeight": 75,
+ "latex": {"linkThickness": "thin", "reproducible": True},
+ "dotRadius": 2,
+ "margin": {"left": 0, "bottom": 0, "right": 0, "top": 0},
+ }
+
+ tl = TimelineTex(plot_data, options=options)
+ texlines = tl.export()
+
+ n_datasets = len(all_ranks)
+
+ ref_method, CD, _ = reference_difference(avg_ranks, n_datasets)
+
+ # we're going to insert the critical difference line after the dots
+ # scope, so we first have to figure out where that is.
+ lines = texlines.split("\n")
+ idx = None
+ find_scope = False
+ for i, line in enumerate(lines):
+ if line.strip() == "% dots":
+ find_scope = True
+ if find_scope and "\\end{scope}" in line:
+ idx = i + 1
+ break
+
+ before = lines[:idx]
+ after = lines[idx:]
+
+ nodes, _ = tl.compute()
+ bestnode = next(
+ (n for n in nodes if n.data.text == method_name(ref_method)), None
+ )
+ # idealPos is the position on the axis
+ posBest = bestnode.getRoot().idealPos
+ posCD = tl.options["scale"](bestnode.data.time + CD)
+
+ CDlines = [
+ "% Critical difference",
+ "\\def\\posBest{%.16f}" % posBest,
+ "\\def\\posCD{%.16f}" % posCD,
+ "\\begin{scope}",
+ "\\draw (\\posBest, 30) -- (\\posBest, 20);",
+ "\\draw (\\posBest, 25) --node[below] {CD} (\\posCD, 25);",
+ "\\draw (\\posCD, 30) -- (\\posCD, 20);",
+ "\\end{scope}",
+ ]
+
+ all_lines = before + [""] + CDlines + after
+
+ with open(output_file, "w") as fp:
+ fp.write("\n".join(all_lines))
+
+
+def main():
+ args = parse_args()
+
+ higher_better = args.better == "max"
+
+ data = load_data(args.input)
+ clean, methods = preprocess_data(data, args.type)
+
+ make_rank_plot(
+ clean, args.output, keep_methods=methods, higher_better=higher_better
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/analysis/scripts/significance.py b/analysis/scripts/significance.py
new file mode 100644
index 00000000..6666306b
--- /dev/null
+++ b/analysis/scripts/significance.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Code to compute significant differences
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+import math
+import scipy.stats as stats
+
+from tabulate import tabulate
+
+from rank_common import (
+ load_data,
+ preprocess_data,
+ compute_ranks,
+)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-o", "--output", help="Output basename")
+ parser.add_argument(
+ "-i", "--input", help="Input JSON file with results for each method"
+ )
+ parser.add_argument(
+ "-m",
+ "--mode",
+ choices=["global", "reference"],
+ help="Whether to do a global difference F test or a reference test to the best performing method",
+ )
+ parser.add_argument(
+ "--type",
+ help="Type of table to make",
+ choices=["best", "default"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def global_difference(avg_ranks, n_datasets):
+ N = n_datasets
+ k = len(avg_ranks)
+ avg_sq_sum = sum([pow(float(avg_ranks[m]), 2.0) for m in avg_ranks])
+
+ chi2 = (
+ 12.0 * N / (k * (k + 1)) * (avg_sq_sum - (k * pow(k + 1, 2.0) / 4.0))
+ )
+ chiprob = 1.0 - stats.chi2.cdf(chi2, k - 1)
+
+ Fstat = (N - 1.0) * chi2 / (N * (k - 1) - chi2)
+ Fprob = 1.0 - stats.f.cdf(Fstat, k - 1, (k - 1) * (N - 1))
+
+ return Fstat, Fprob
+
+
+def argmin(func, args):
+ m, inc = float("inf"), None
+ for a in args:
+ v = func(a)
+ if v < m:
+ m, inc = v, a
+ return inc
+
+
+def reference_difference(avg_ranks, n_datasets, significance_level=0.05):
+ N = n_datasets
+ k = len(avg_ranks)
+
+ methods = sorted(avg_ranks.keys())
+ ranks = [avg_ranks[m] for m in methods]
+ ref_method = argmin(lambda m: avg_ranks[m], methods)
+ ref_idx = methods.index(ref_method)
+ others = [m for m in methods if not m == ref_method]
+
+ Z_scores = [0.0] * (k - 1)
+ P_values = [0.0] * (k - 1)
+
+ constant = math.sqrt(6 * N / (k * (k + 1)))
+ for j, method in enumerate(others):
+ i = methods.index(method)
+ Z_scores[j] = (ranks[ref_idx] - ranks[i]) * constant
+ P_values[j] = stats.norm.cdf(Z_scores[j])
+
+ # sort the p-values in ascending order
+ sorted_pvals = sorted((p, i) for i, p in enumerate(P_values))
+
+ # Calculate significant differences following Holm's procedure
+ significant_differences = [False] * (k - 1)
+ thresholds = [0] * (k - 1)
+ CD_threshold = None
+ for i in range(k - 1):
+ threshold = significance_level / float(k - (i + 1))
+ pval, idx = sorted_pvals[i]
+ significant_differences[idx] = pval < threshold
+ thresholds[idx] = threshold
+ if pval > threshold and CD_threshold is None:
+ CD_threshold = threshold
+
+ # Calculate the critical difference from the first threshold that fails to
+ # reject: a p-value below that threshold is considered a significant
+ # difference, and a p-value above it is not.
+ CD = -1 * stats.norm.ppf(CD_threshold) / constant
+
+ txt = [
+ "Number of datasets: %i" % N,
+ "Number of methods: %i" % k,
+ "Reference method: %s" % ref_method,
+ "Significance level: %g" % significance_level,
+ "",
+ "Reference method rank: %.6f" % avg_ranks[ref_method],
+ "Holm's procedure:",
+ ]
+
+ table = []
+ for o, p, t, s in zip(
+ others, P_values, thresholds, significant_differences
+ ):
+ table.append([o, avg_ranks[o], p, t, s])
+
+ txt.append(
+ tabulate(
+ table,
+ headers=["Method", "Rank", "p-Value", "Threshold", "Significant"],
+ )
+ )
+
+ txt.append("")
+ txt.append(
+ "Critical difference: %.6f (at threshold = %.6f)" % (CD, CD_threshold)
+ )
+ txt.append("")
+
+ return ref_method, CD, txt
+
+
+def main():
+ args = parse_args()
+
+ data = load_data(args.input)
+ clean, methods = preprocess_data(data, args.type)
+
+ n_datasets = len(clean)
+
+ avg_ranks, all_ranks = compute_ranks(
+ clean, keep_methods=methods, higher_better=True
+ )
+
+ if args.mode == "global":
+ Fstat, Fprob = global_difference(avg_ranks, n_datasets)
+ if args.output:
+ with open(args.output, "w") as fp:
+ fp.write("F = %.1f (p = %g)" % (Fstat, Fprob))
+ else:
+ print("Fstat = %.2f, Fprob = %.2g" % (Fstat, Fprob))
+ elif args.mode == "reference":
+ ref_method, CD, txt = reference_difference(avg_ranks, n_datasets)
+ if args.output:
+ outRef = args.output + "_ref.tex"
+ with open(outRef, "w") as fp:
+ fp.write(ref_method + "%")
+ outCD = args.output + "_CD.tex"
+ with open(outCD, "w") as fp:
+ fp.write("%.2f" % CD + "%")
+ else:
+ print("Reference method = %s, CD = %.2f" % (ref_method, CD))
+ print("")
+ print("\n".join(txt))
+
+
+if __name__ == "__main__":
+ main()
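A sketch of both tests on made-up average ranks for four methods over 30 datasets (scipy and tabulate are required, as above). The ranks are chosen so that at least one comparison fails to reject, since the critical difference above is derived from the first threshold that fails:

    from significance import global_difference, reference_difference

    avg_ranks = {"amoc": 2.9, "binseg": 2.6, "pelt": 2.0, "wbs": 2.5}

    Fstat, Fprob = global_difference(avg_ranks, n_datasets=30)
    ref_method, CD, report = reference_difference(avg_ranks, n_datasets=30)

    print("F = %.2f (p = %.3g)" % (Fstat, Fprob))
    print("reference = %s, CD = %.3f" % (ref_method, CD))
    print("\n".join(report))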
diff --git a/analysis/scripts/summarize.py b/analysis/scripts/summarize.py
new file mode 100644
index 00000000..426976c5
--- /dev/null
+++ b/analysis/scripts/summarize.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Summarize the results into a single file per dataset.
+
+For each dataset we want::
+
+ {
+ "dataset": "<name>",
+ "dataset_nobs": N,
+ "dataset_ndim": N,
+ "annotations": {
+ "<user_id>": [...],
+ "<user_id>": [...],
+ },
+ "results": {
+ "<method>": [
+ {
+ "parameters": {
+ "<param>": value,
+ },
+ "cplocations": [...],
+ "scores": {
+ "<score_1>": value,
+ },
+ "status": <status>
+ },
+ {
+ "parameters": {
+ "<param>": value,
+ },
+ "cplocations": [...],
+ "scores": {
+ "<score_1>": value,
+ },
+ "status": <status>
+ },
+ ],
+ }
+ }
+
+Basic cleanup on the change point locations will also be performed:
+
+ - deduplication
+ - removal of invalid indices. Recall that indices are 0-based. We remove
+ any indices smaller than 1 and larger than n_obs - 2. The reason that we
+ don't allow 0 or n_obs - 1 (both valid endpoints) is that several
+ algorithms declare these locations as change points by default and they
+ are meaningless.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+import json
+import os
+import sys
+
+from metrics import f_measure, covering
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-a",
+ "--annotation-file",
+ help="Path to annotation file",
+ required=True,
+ )
+ parser.add_argument(
+ "-d", "--dataset-file", help="Path to dataset file", required=True
+ )
+ parser.add_argument(
+ "-r", "--result-dir", help="Directory of abed results", required=True
+ )
+ parser.add_argument("-o", "--output-file", help="File to write to")
+ return parser.parse_args()
+
+
+def load_json(filename):
+ with open(filename, "r") as fp:
+ try:
+ data = json.load(fp)
+ except json.decoder.JSONDecodeError:
+ print("Error parsing json file: %s" % filename, file=sys.stderr)
+ return {"error": "parsing error"}
+ return data
+
+
+def load_annotations(filename, dataset):
+ with open(filename, "r") as fp:
+ data = json.load(fp)
+ return data[dataset]
+
+
+def clean_cps(locations, dataset):
+ n_obs = dataset["n_obs"]
+ valid = set([x for x in locations if 1 <= x < n_obs - 1])
+ return sorted(valid)
+
+
+def main():
+ args = parse_args()
+
+ dataset = load_json(args.dataset_file)
+ annotations = load_annotations(args.annotation_file, dataset["name"])
+
+ out = {
+ "dataset": dataset["name"],
+ "dataset_nobs": dataset["n_obs"],
+ "dataset_ndim": dataset["n_dim"],
+ "annotations": annotations,
+ "results": {},
+ }
+
+ data_results = next(
+ (d for d in os.listdir(args.result_dir) if d == dataset["name"]), None
+ )
+ if data_results is None:
+ print(
+ "Couldn't find the result directory for dataset %s"
+ % dataset["name"],
+ file=sys.stderr,
+ )
+ raise SystemExit(1)
+
+ dataset_dir = os.path.join(args.result_dir, data_results)
+
+ for method in os.listdir(dataset_dir):
+ method_dir = os.path.join(dataset_dir, method)
+ for result_file in os.listdir(method_dir):
+ # print("Processing result file: %s" % result_file)
+ fname = os.path.join(method_dir, result_file)
+ result = load_json(fname)
+ if not method in out["results"]:
+ out["results"][method] = []
+
+ if result["status"].lower() == "success":
+ locations = clean_cps(result["result"]["cplocations"], dataset)
+
+ f1, precision, recall = f_measure(
+ annotations, locations, return_PR=True
+ )
+ n_obs = dataset["n_obs"]
+ cover = covering(annotations, locations, n_obs)
+ scores = {
+ "f1": f1,
+ "precision": precision,
+ "recall": recall,
+ "cover": cover,
+ }
+ else:
+ locations = None
+ scores = None
+
+ out["results"][method].append(
+ {
+ "parameters": result["parameters"],
+ "task_file": result_file,
+ "cplocations": locations,
+ "scores": scores,
+ "status": result['status'],
+ }
+ )
+
+ if args.output_file:
+ with open(args.output_file, "w") as fp:
+ json.dump(out, fp, indent="\t")
+ else:
+ print(json.dumps(out, indent="\t"))
+
+
+if __name__ == "__main__":
+ main()
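A small example of the cleanup rule from the module docstring: duplicates, out-of-range indices, and the trivial end points 0 and n_obs - 1 are all dropped (the locations here are made up):

    from summarize import clean_cps

    dataset = {"n_obs": 10}
    print(clean_cps([0, 2, 2, 5, 9, 11], dataset))  # [2, 5]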