| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-12 14:33:57 +0000 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-12 14:33:57 +0000 |
| commit | 7ef8f6e58990fc069cccc71ed6564e8c639ea4fc (patch) | |
| tree | 9e7662a34b7d0c1f1c5d9faf6d7d6ea8672f6410 /analysis/scripts | |
| download | TCPDBench-7ef8f6e58990fc069cccc71ed6564e8c639ea4fc.tar.gz TCPDBench-7ef8f6e58990fc069cccc71ed6564e8c639ea4fc.zip | |
initial commit
Diffstat (limited to 'analysis/scripts')
| mode | file | lines added |
|---|---|---|
| -rw-r--r-- | analysis/scripts/aggregate_table_wide.py | 222 |
| -rw-r--r-- | analysis/scripts/latex.py | 140 |
| -rw-r--r-- | analysis/scripts/make_table.py | 441 |
| -rw-r--r-- | analysis/scripts/metrics.py | 129 |
| -rw-r--r-- | analysis/scripts/rank_common.py | 102 |
| -rw-r--r-- | analysis/scripts/rank_plots.py | 151 |
| -rw-r--r-- | analysis/scripts/significance.py | 179 |
| -rw-r--r-- | analysis/scripts/summarize.py | 178 |
8 files changed, 1542 insertions, 0 deletions
diff --git a/analysis/scripts/aggregate_table_wide.py b/analysis/scripts/aggregate_table_wide.py
new file mode 100644
index 00000000..712a6a4a
--- /dev/null
+++ b/analysis/scripts/aggregate_table_wide.py
@@ -0,0 +1,222 @@

```python
# -*- coding: utf-8 -*-

"""
Script to generate the aggregate table (wide version)

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import json
import tabulate

from enum import Enum


class Method(Enum):
    amoc = "amoc"
    binseg = "binseg"
    bocpd = "bocpd"
    bocpdms = "bocpdms"
    cpnp = "cpnp"
    ecp = "ecp"
    kcpa = "kcpa"
    pelt = "pelt"
    prophet = "prophet"
    rbocpdms = "rbocpdms"
    rfpop = "rfpop"
    segneigh = "segneigh"
    wbs = "wbs"


# Methods that support multidimensional datasets
MULTIMETHODS = [
    Method.bocpd,
    Method.bocpdms,
    Method.ecp,
    Method.kcpa,
    Method.rbocpdms,
]


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bcu",
        help="Path to json file with results for: best/cover/uni",
        required=True,
    )
    parser.add_argument(
        "--bcm",
        help="Path to json file with results for: best/cover/multi",
        required=True,
    )
    parser.add_argument(
        "--bfu",
        help="Path to json file with results for: best/f1/uni",
        required=True,
    )
    parser.add_argument(
        "--bfm",
        help="Path to json file with results for: best/f1/multi",
        required=True,
    )
    parser.add_argument(
        "--dcu",
        help="Path to json file with results for: default/cover/uni",
        required=True,
    )
    parser.add_argument(
        "--dcm",
        help="Path to json file with results for: default/cover/multi",
        required=True,
    )
    parser.add_argument(
        "--dfu",
        help="Path to json file with results for: default/f1/uni",
        required=True,
    )
    parser.add_argument(
        "--dfm",
        help="Path to json file with results for: default/f1/multi",
        required=True,
    )
    return parser.parse_args()


def load_json(filename):
    with open(filename, "r") as fp:
        return json.load(fp)


def make_table(
    label,
    uni_default_cover,
    uni_default_f1,
    uni_best_cover,
    uni_best_f1,
    multi_default_cover,
    multi_default_f1,
    multi_best_cover,
    multi_best_f1,
    methods,
):
    """Create part of the aggregate table
    """
    tex = []
    tex.append("%% This table requires booktabs!")

    tex.append("\\begin{tabular}{lrr|rrrr|rr}")
    superheader = (
        " & ".join(
            [
                "",
                "\\multicolumn{4}{c}{Univariate}",
                "\\multicolumn{4}{c}{Multivariate} \\\\",
            ]
        )
        + "\\cmidrule(lr){2-5}\\cmidrule(lr){6-9}"
    )
    tex.append(superheader)
    header = (
        " & ".join(
            [
                "",
                "\\multicolumn{2}{c}{Default}",
                "\\multicolumn{2}{c}{Best}",
                "\\multicolumn{2}{c}{Default}",
                "\\multicolumn{2}{c}{Best} \\\\",
            ]
        )
        + "\\cmidrule(lr){2-3}"
        + "\\cmidrule(lr){4-5}"
        + "\\cmidrule(lr){6-7}"
        + "\\cmidrule(lr){8-9}"
    )
    tex.append(header)
    subheader = (
        " & ".join(
            [
                "",
                "Cover",
                "F1",
                "Cover",
                "F1",
                "Cover",
                "F1",
                "Cover",
                "F1" "\\\\",
            ]
        )
        + "\\cmidrule(r){1-1}"
        + "\\cmidrule(lr){2-5}"
        + "\\cmidrule(l){6-9}"
    )
    tex.append(subheader)

    table = []

    L = max(map(len, methods))
    textsc = lambda m: "\\textsc{%s}%s" % (m, (L - len(m)) * " ")
    table.append(list(map(textsc, methods)))

    all_exps = [
        [uni_default_cover, uni_default_f1, uni_best_cover, uni_best_f1],
        [
            multi_default_cover,
            multi_default_f1,
            multi_best_cover,
            multi_best_f1,
        ],
    ]

    for exps in all_exps:
        for exp in exps:
            row = []
            maxscore = max((exp[m] for m in methods if m in exp))
            for m in methods:
                if not m in exp:
                    row.append(5 * " ")
                    continue
                score = exp[m]
                scorestr = tabulate._format(
                    score, tabulate._float_type, ".3f", ""
                )
                if score == maxscore:
                    row.append("\\textbf{%s}" % scorestr)
                else:
                    row.append(scorestr)
            table.append(row)

    transposed = list(zip(*table))
    for row in transposed:
        tex.append(" & ".join(row) + " \\\\")
    tex.append("\\cmidrule(r){1-1}\\cmidrule(lr){2-5}\\cmidrule(l){6-9}")
    tex.append("\\end{tabular}")

    return tex


def main():
    args = parse_args()

    bcu = load_json(args.bcu)
    bcm = load_json(args.bcm)
    bfu = load_json(args.bfu)
    bfm = load_json(args.bfm)
    dcu = load_json(args.dcu)
    dcm = load_json(args.dcm)
    dfu = load_json(args.dfu)
    dfm = load_json(args.dfm)

    methods = sorted([m.name for m in Method])
    tex = make_table("Wide", dcu, dfu, bcu, bfu, dcm, dfm, bcm, bfm, methods)

    print("\n".join(tex))


if __name__ == "__main__":
    main()
```
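Each of the eight `--bcu`/.../`--dfm` inputs is expected to be a flat mapping from method name to an average score, presumably the output of `make_table.py --type avg --format json` shown further below. A minimal sketch of one such file, written as a Python dict; the file name and all values are invented for illustration:

```python
# Hypothetical contents of e.g. best_cover_uni.json (illustrative scores only).
# make_table() above looks up exp[m] for every method name m and bolds the
# per-row maximum.
example_bcu = {
    "amoc": 0.705,
    "binseg": 0.712,
    "pelt": 0.740,
}
```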
diff --git a/analysis/scripts/latex.py b/analysis/scripts/latex.py
new file mode 100644
index 00000000..107ce20a
--- /dev/null
+++ b/analysis/scripts/latex.py
@@ -0,0 +1,140 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Code for compiling latex from Python.

Based on: https://github.com/GjjvdBurg/labella.py

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import os
import shutil
import subprocess
import tabulate
import tempfile


def compile_latex(fname, tmpdirname, silent=True):
    compiler = "latexmk"
    compiler_args = [
        "--pdf",
        "--outdir=" + tmpdirname,
        "--interaction=nonstopmode",
        fname,
    ]
    command = [compiler] + compiler_args
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (OSError, IOError) as e:
        raise (e)
    except subprocess.CalledProcessError as e:
        print(e.output.decode())
        raise (e)
    else:
        if not silent:
            print(output.decode())


def build_latex_doc(tex, output_name=None, silent=True):
    with tempfile.TemporaryDirectory() as tmpdirname:
        basename = "labella_text"
        fname = os.path.join(tmpdirname, basename + ".tex")
        with open(fname, "w") as fid:
            fid.write(tex)

        compile_latex(fname, tmpdirname, silent=silent)

        pdfname = os.path.join(tmpdirname, basename + ".pdf")
        if output_name:
            shutil.copy2(pdfname, output_name)


def build_latex_table(
    table,
    headers,
    floatfmt="g",
    missingval="",
    bests="default",
    table_spec=None,
):
    """Construct the LaTeX code for a table

    This function creates the LaTeX code for a data table while taking number
    formatting, headers, missing values, and "best value formatting" into
    account.

    The numbers in the table are formatted following the provided float format
    and the missing value indicator using the ``_format`` function from the
    ``tabulate`` package. To indicate a missing value the data row should mark
    this value as ``None``.

    The ``bests`` parameter is used to decide how to highlight the best values
    in each row. It can be either ``'default'``, ``None``, a list of length 1
    where the element is either ``min`` or ``max``, or a list of length ``K``
    with similar elements where ``K`` is the length of the data table. If it
    is ``'default'`` then ``max`` will be considered best for each row. If a
    list of length 1 is supplied then the provided function will be used for
    each row. If ``None``, no highlighting will be done.

    The ``table_spec`` parameter allows the user to specify the table
    specification. This value is not checked. If it is None, the first column
    will get 'l' spec and the remaining columns will get the 'r' spec.

    """
    if bests == "default":
        bests = [max]
    elif bests is None:
        bests = []

    if len(bests) > 1:
        assert len(bests) == len(table)
        assert all((x in [min, max] for x in bests))

    if len(bests) == 0:
        best_funcs = [None for x in range(len(table))]
    elif len(bests) == 1:
        best_funcs = [bests[0] for x in range(len(table))]
    else:
        best_funcs = bests[:]

    _typ = lambda v: tabulate._type(v)
    _fmt = lambda v: tabulate._format(v, _typ(v), floatfmt, missingval)

    list_of_lists, headers = table, headers
    cols = list(zip(*list_of_lists))
    coltypes = list(map(tabulate._column_type, cols))

    cols = [
        [_fmt(v) for v in c]
        for c, ct in zip(cols, coltypes)
    ]

    n_cols = len(cols)

    data_rows = table
    text_rows = list(zip(*cols))

    text = []
    if table_spec is None:
        text.append("\\begin{tabular}{l%s}" % ("r" * (n_cols - 1)))
    else:
        text.append("\\begin{tabular}{%s}" % table_spec)
    text.append(" & ".join(headers) + "\\\\")
    text.append("\\hline")
    for data_row, text_row, best_func in zip(data_rows, text_rows, best_funcs):
        text_row = list(text_row)
        if not best_func is None:
            best_val = best_func([x for x in data_row if isinstance(x, float)])
            best_idx = [i for i, v in enumerate(data_row) if v == best_val]
            for idx in best_idx:
                text_row[idx] = "\\textbf{" + text_row[idx] + "}"
        text.append(" & ".join(text_row) + "\\\\")
    text.append("\\hline")
    text.append("\\end{tabular}")

    return "\n".join(text)
```
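A small usage sketch of `build_latex_table`; the rows and scores are made up, `None` marks a missing entry, and `bests=[max]` bolds the row-wise maximum. Note that the function goes through private `tabulate` helpers (`_format`, `_type`, `_column_type`), so this assumes the tabulate version pinned by the project still exposes them:

```python
# Illustrative only -- the table values are invented.
from latex import build_latex_table

table = [
    ["amoc", 0.71, 0.65],
    ["pelt", None, 0.70],  # None is rendered as missingval
]
headers = ["Method", "Cover", "F1"]
tex = build_latex_table(
    table, headers, floatfmt=".3f", missingval="--", bests=[max]
)
print(tex)
```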
diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py
new file mode 100644
index 00000000..e4747258
--- /dev/null
+++ b/analysis/scripts/make_table.py
@@ -0,0 +1,441 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Script to generate tables from summary files

Metrics, experiments, methods, and datasets are hard-coded as a means of
validation.

For the "best" experiment, the RBOCPDMS method is excluded because it fails
too often. For the other experiments, datasets with incomplete results are
removed.

Author: G.J.J. van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import colorama
import json
import os
import sys
import termcolor

from enum import Enum
from typing import Optional

# from pydantic.dataclasses import dataclass
from dataclasses import dataclass

from latex import build_latex_table

colorama.init()


class Metric(Enum):
    f1 = "f1"
    cover = "cover"


class Experiment(Enum):
    default = "default"
    best = "best"


class Dataset(Enum):
    apple = "apple"
    bank = "bank"
    bee_waggle_6 = "bee_waggle_6"
    bitcoin = "bitcoin"
    brent_spot = "brent_spot"
    businv = "businv"
    centralia = "centralia"
    children_per_woman = "children_per_woman"
    co2_canada = "co2_canada"
    construction = "construction"
    debt_ireland = "debt_ireland"
    gdp_argentina = "gdp_argentina"
    gdp_croatia = "gdp_croatia"
    gdp_iran = "gdp_iran"
    gdp_japan = "gdp_japan"
    global_co2 = "global_co2"
    homeruns = "homeruns"
    iceland_tourism = "iceland_tourism"
    jfk_passengers = "jfk_passengers"
    lga_passengers = "lga_passengers"
    nile = "nile"
    occupancy = "occupancy"
    ozone = "ozone"
    quality_control_1 = "quality_control_1"
    quality_control_2 = "quality_control_2"
    quality_control_3 = "quality_control_3"
    quality_control_4 = "quality_control_4"
    quality_control_5 = "quality_control_5"
    rail_lines = "rail_lines"
    ratner_stock = "ratner_stock"
    robocalls = "robocalls"
    run_log = "run_log"
    scanline_126007 = "scanline_126007"
    scanline_42049 = "scanline_42049"
    seatbelts = "seatbelts"
    shanghai_license = "shanghai_license"
    uk_coal_employ = "uk_coal_employ"
    measles = "measles"
    unemployment_nl = "unemployment_nl"
    us_population = "us_population"
    usd_isk = "usd_isk"
    well_log = "well_log"


class Method(Enum):
    amoc = "amoc"
    binseg = "binseg"
    bocpd = "bocpd"
    bocpdms = "bocpdms"
    cpnp = "cpnp"
    ecp = "ecp"
    kcpa = "kcpa"
    pelt = "pelt"
    prophet = "prophet"
    rbocpdms = "rbocpdms"
    rfpop = "rfpop"
    segneigh = "segneigh"
    wbs = "wbs"


# Methods that support multidimensional datasets
MULTIMETHODS = [
    Method.bocpd,
    Method.bocpdms,
    Method.ecp,
    Method.kcpa,
    Method.rbocpdms,
]

# Multidimensional datasets
MULTIDATASETS = [
    Dataset.apple,
    Dataset.bee_waggle_6,
    Dataset.occupancy,
    Dataset.run_log,
]

# Datasets with missing values
MISSING_DATASETS = [Dataset.uk_coal_employ]

# Methods that handle missing values
MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet]


@dataclass
class Result:
    dataset: Dataset
    experiment: Experiment
    is_multidim: bool
    method: Method
    metric: Metric
    score: Optional[float]
    summary_file: str
    placeholder: Optional[str]


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--summary-dir",
        help="Directory with summary files",
        required=True,
    )
    parser.add_argument(
        "-m",
        "--metric",
        help="Metric to use for the table",
        choices=["f1", "cover"],
        required=True,
    )
    parser.add_argument(
        "-e",
        "--experiment",
        help="Experiment to make table for",
        choices=["best", "default"],
        required=True,
    )
    parser.add_argument(
        "-d",
        "--dim",
        help="Dimensionality",
        choices=["uni", "multi", "combined"],
        required=True,
    )
    parser.add_argument(
        "-f",
        "--format",
        help="Output format",
        choices=["json", "tex"],
        required=True,
    )
    parser.add_argument(
        "-t",
        "--type",
        help="Type of table to make",
        choices=["avg", "full"],
        required=True,
    )
    return parser.parse_args()


def warning(msg):
    termcolor.cprint(msg, "yellow", file=sys.stderr)


def load_summary(filename):
    with open(filename, "r") as fp:
        data = json.load(fp)
    return data


def extract_score(method_results, metric=None, experiment=None):
    """Extract a single numeric score from a list of dictionaries
    """

    if not metric in [Metric.f1, Metric.cover]:
        raise ValueError("Unknown metric: %s" % metric)
    if not experiment in ["default", "best"]:
        raise ValueError("Unknown experiment: %s" % experiment)

    # Collect all values for the chosen metric
    scores = []
    for result in method_results:
        if not result["status"] == "SUCCESS":
            continue
        scores.append(result["scores"][metric.name])

    if len(scores) == 0:
        return None

    # check that we have only one score for the 'default' experiment
    if experiment == "default":
        if len(scores) > 1:
            raise ValueError("Default experiment with more than one score!")
        return scores[0]
    return max(scores)


def collect_results(summary_dir=None, metric=None, experiment=None):
    """Collect the results for the experiment on the specified metric.

    Returns a list of Result objects.
    """
    if not metric in [Metric.f1, Metric.cover]:
        raise ValueError("Unknown metric: %s" % metric)
    if not experiment in ["default", "best"]:
        raise ValueError("Unknown experiment: %s" % experiment)
    if not os.path.isdir(summary_dir):
        raise FileNotFoundError(summary_dir)

    results = []
    for fname in sorted(os.listdir(summary_dir)):
        path = os.path.join(summary_dir, fname)
        summary_data = load_summary(path)

        dataset_name = summary_data["dataset"]
        summary_results = summary_data["results"]

        is_multi = summary_data["dataset_ndim"] > 1

        for method in summary_results:
            # method names are prefixed with the experiment type, so we skip
            # the ones we don't want
            if not method.startswith(experiment + "_"):
                continue

            # extract the metric score for this experiment from the summary
            # results for the method
            score = extract_score(
                summary_results[method], metric=metric, experiment=experiment
            )

            # strip the experiment from the method name
            method_name = method[len(experiment + "_") :]

            # determine the placeholder value if there is no score.
            placeholder = set()
            if score is None:
                if (Dataset(dataset_name) in MISSING_DATASETS) and (
                    not Method(method_name) in MISSING_METHODS
                ):
                    # dataset has missing values and method can't handle it
                    placeholder.add("M")
                else:
                    for result in summary_results[method]:
                        if result["status"] == "FAIL":
                            placeholder.add("F")
                        elif result["status"] == "TIMEOUT":
                            placeholder.add("T")
            placeholder = "/".join(sorted(placeholder))

            # create a Result object
            res = Result(
                dataset=Dataset(dataset_name),
                experiment=Experiment(experiment),
                is_multidim=is_multi,
                method=Method(method_name),
                metric=Metric(metric),
                score=score,
                summary_file=fname,
                placeholder=placeholder or None,
            )
            results.append(res)
    return results


def average_results(results):
    """Average the results

    NOTE: This function filters out some methods/datasets for which we have
    insufficient results.
    """
    experiment = list(set(r.experiment for r in results))[0]
    # determine if we're dealing with multidimensional datasets
    is_multi = all(r.is_multidim for r in results)

    expected_methods = MULTIMETHODS if is_multi else list(Method)

    # keep only expected methods
    results = list(filter(lambda r: r.method in expected_methods, results))

    # remove RBOCPDMS for 'best', because it fails too often
    if experiment == Experiment.best:
        warning(
            "\nWarning: Removing RBOCPDMS (experiment = %s)\n" % experiment
        )
        results = list(filter(lambda r: r.method != Method.rbocpdms, results))
        expected_methods.remove(Method.rbocpdms)

    # remove datasets for which we do not have complete results
    to_remove = []
    for dataset in set(r.dataset for r in results):
        dset_results = filter(lambda r: r.dataset == dataset, results)
        if any(r.score is None for r in dset_results):
            to_remove.append(dataset)
    if to_remove:
        warning("\nWarning: Filtering out datasets: %r\n" % to_remove)
    results = list(filter(lambda r: not r.dataset in to_remove, results))

    # check that we are now complete: for all datasets and all methods in the
    # remaining results, we have a non-None score.
    assert all(r.score is not None for r in results)

    # compute the average per method
    methods = set(r.method for r in results)
    avg = {}
    for method in methods:
        method_scores = [r.score for r in results if r.method == method]
        avg_score = sum(method_scores) / len(method_scores)
        avg[method.name] = avg_score

    return avg


def write_json(results, is_avg=None):
    if not is_avg in [True, False]:
        raise ValueError("is_avg should be either True or False")

    output = {}
    if is_avg:
        output = results
    else:
        datasets = set(r.dataset for r in results)
        methods = set(r.method for r in results)
        for d in datasets:
            output[d.name] = {}
            for m in methods:
                r = next(
                    (r for r in results if r.dataset == d and r.method == m),
                    None,
                )
                # intended to fail if r is None, because that shouldn't happen
                output[d.name][m.name] = r.score
    print(json.dumps(output, indent="\t", sort_keys=True))


def write_latex(results, dim=None, is_avg=None):
    if is_avg:
        raise NotImplementedError(
            "write_latex is not supported for is_avg = True"
        )

    methods = sorted(set(r.method.name for r in results))
    datasets = sorted(set(r.dataset.name for r in results))
    if dim == "combined":
        uni_datasets = [
            d.name for d in list(Dataset) if not d in MULTIDATASETS
        ]
        multi_datasets = [d.name for d in MULTIDATASETS]
        datasets = sorted(uni_datasets) + sorted(multi_datasets)
        first_multi = sorted(multi_datasets)[0]

    textsc = lambda m: "\\textsc{%s}" % m
    verb = lambda m: "\\verb+%s+" % m

    headers = ["Dataset"] + list(map(textsc, methods))

    table = []
    for dataset in datasets:
        row = [verb(dataset)]
        d = Dataset(dataset)

        for method in methods:
            m = Method(method)
            r = next((r for r in results if r.method == m and r.dataset == d))
            row.append(r.placeholder if r.score is None else r.score)

        table.append(row)
    spec = "l" + "c" * len(methods)
    tex = build_latex_table(table, headers, floatfmt=".3f", table_spec=spec)

    if dim == "combined":
        # add a horizontal line for these datasets
        lines = tex.split("\n")
        newlines = []
        for line in lines:
            if line.startswith(verb(first_multi)):
                newlines.append("\\hline")
            newlines.append(line)
        tex = "\n".join(newlines)

    print(tex)


def main():
    args = parse_args()
    if args.type == "avg" and args.dim == "combined":
        raise ValueError("Using 'avg' and 'combined' is not supported.")

    results = collect_results(
        summary_dir=args.summary_dir,
        metric=Metric(args.metric),
        experiment=args.experiment,
    )

    if args.dim == "uni":
        # filter out multi
        results = list(filter(lambda r: not r.is_multidim, results))
    elif args.dim == "multi":
        # filter out uni
        results = list(filter(lambda r: r.is_multidim, results))

    if args.type == "avg":
        results = average_results(results)

    if args.format == "json":
        write_json(results, is_avg=args.type == "avg")
    else:
        write_latex(results, args.dim, is_avg=args.type == "avg")


if __name__ == "__main__":
    main()
```
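For reference, a minimal fragment of a summary file as consumed by `collect_results`, together with the corresponding `extract_score` behaviour. The fragment is invented for illustration (real files are produced by `summarize.py`, shown further below), and `Metric`/`extract_score` refer to the names defined in `make_table.py` above:

```python
# Hypothetical summary fragment (dataset name and scores are made up).
summary = {
    "dataset": "nile",
    "dataset_ndim": 1,
    "results": {
        "default_amoc": [
            {"status": "SUCCESS", "scores": {"f1": 0.50, "cover": 0.80}},
        ],
        "best_amoc": [
            {"status": "SUCCESS", "scores": {"f1": 0.60, "cover": 0.82}},
            {"status": "TIMEOUT"},
        ],
    },
}

# "default" expects exactly one successful score; "best" takes the maximum
# over all successful runs and ignores failed/timed-out ones.
extract_score(summary["results"]["best_amoc"], metric=Metric.cover,
              experiment="best")  # -> 0.82
```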
diff --git a/analysis/scripts/metrics.py b/analysis/scripts/metrics.py
new file mode 100644
index 00000000..932fbb7c
--- /dev/null
+++ b/analysis/scripts/metrics.py
@@ -0,0 +1,129 @@

```python
# -*- coding: utf-8 -*-

"""
Evaluation metrics

Author: G.J.J. van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""


def true_positives(T, X, margin=5):
    """Compute true positives without double counting
    """
    # make a copy so we don't affect the caller
    X = set(list(X))
    TP = set()
    for tau in T:
        close = [(abs(tau - x), x) for x in X if abs(tau - x) <= margin]
        close.sort()
        if not close:
            continue
        dist, xstar = close[0]
        TP.add(tau)
        X.remove(xstar)
    return TP


def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
    """Compute the F-measure based on human annotations.

    annotations : dict from user_id to iterable of CP locations
    predictions : iterable of predicted CP locations
    alpha : value for the F-measure, alpha=0.5 gives the F1-measure
    return_PR : whether to return precision and recall too

    Remember that all CP locations are 0-based!

    """
    # ensure 0 is in all the sets
    Tks = {k + 1: set(annotations[uid]) for k, uid in enumerate(annotations)}
    for Tk in Tks.values():
        Tk.add(0)

    X = set(predictions)
    X.add(0)

    # union of all annotators' change point locations
    Tstar = [tau for Tk in Tks.values() for tau in Tk]
    Tstar = set(Tstar)

    K = len(Tks)

    P = len(true_positives(Tstar, X, margin=margin)) / len(X)

    TPk = {k: true_positives(Tks[k], X, margin=margin) for k in Tks}
    R = 1 / K * sum(len(TPk[k]) / len(Tks[k]) for k in Tks)

    F = P * R / (alpha * R + (1 - alpha) * P)
    if return_PR:
        return F, P, R
    return F


def overlap(A, B):
    """ Return the overlap (i.e. Jaccard index) of two sets """
    return len(A.intersection(B)) / len(A.union(B))


def partition_from_cps(locations, n_obs):
    """ Return a list of sets that give a partition of the set [0, T-1], as
    defined by the change point locations.

    >>> partition_from_cps([], 5)
    [{0, 1, 2, 3, 4}]
    >>> partition_from_cps([3, 5], 8)
    [{0, 1, 2}, {3, 4}, {5, 6, 7}]
    >>> partition_from_cps([1,2,7], 8)
    [{0}, {1}, {2, 3, 4, 5, 6}, {7}]
    >>> partition_from_cps([0, 4], 6)
    [{0, 1, 2, 3}, {4, 5}]
    """
    T = n_obs
    partition = []
    current = set()

    all_cps = iter(sorted(set(locations)))
    cp = next(all_cps, None)
    for i in range(T):
        if i == cp:
            if current:
                partition.append(current)
                current = set()
            cp = next(all_cps, None)
        current.add(i)
    partition.append(current)
    return partition


def cover_single(Sprime, S):
    """Compute the covering of a segmentation S by a segmentation Sprime.

    This follows equation (8) in Arbelaez, 2010.
    """
    T = sum(map(len, Sprime))
    assert T == sum(map(len, S))
    C = 0
    for R in S:
        C += len(R) * max(overlap(R, Rprime) for Rprime in Sprime)
    C /= T
    return C


def covering(annotations, predictions, n_obs):
    """Compute the average segmentation covering against the human annotations.

    annotations : dict from user_id to iterable of CP locations
    predictions : iterable of predicted CP locations
    n_obs : number of observations in the series

    """
    Ak = {
        k + 1: partition_from_cps(annotations[uid], n_obs)
        for k, uid in enumerate(annotations)
    }
    pX = partition_from_cps(predictions, n_obs)

    Cs = [cover_single(pX, Ak[k]) for k in Ak]
    return sum(Cs) / len(Cs)
```
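A toy example of the two metrics, with invented annotations and predictions; the comments give the approximate values these inputs produce:

```python
# Two annotators and one set of predictions; margin=5 is the module default.
from metrics import f_measure, covering

annotations = {"ann1": [10, 20], "ann2": [10]}
predictions = [10, 35]

F, P, R = f_measure(annotations, predictions, margin=5, return_PR=True)
C = covering(annotations, predictions, n_obs=50)
# F ~ 0.74, P ~ 0.67, R ~ 0.83; C is the covering averaged over annotators.
```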
diff --git a/analysis/scripts/rank_common.py b/analysis/scripts/rank_common.py
new file mode 100644
index 00000000..c47cac77
--- /dev/null
+++ b/analysis/scripts/rank_common.py
@@ -0,0 +1,102 @@

```python
# -*- coding: utf-8 -*-

"""Shared code to do with ranks

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import json
import numpy as np
import sys

from scipy.stats import rankdata


def load_data(filename):
    with open(filename, "r") as fp:
        return json.load(fp)


def compute_ranks(results, keep_methods=None, higher_better=True):
    """Compute the ranks

    Parameters
    ----------
    results : dict
        Mapping from dataset name to dict, where each dict in turn is a map
        from method name to a score value.

    keep_methods : list
        Methods to include in the ranks

    higher_better : bool
        Whether a higher or a lower value is considered better

    Returns
    -------
    avg_ranks : dict
        Map from method name to average rank

    all_ranks : dict
        Map from dataset name to dictionary, which is in turn a map from
        method name to rank for that dataset and that method.

    """
    vec_ranks = []
    all_ranks = {}

    for dset in results:
        methods = results[dset].keys()
        methods = sorted(methods)

        methods = [m for m in methods if m in keep_methods]
        assert methods == keep_methods

        if higher_better:
            values = [-results[dset][m] for m in methods]
        else:
            values = [results[dset][m] for m in methods]

        if any(np.isnan(v) for v in values):
            print(
                "Skipping dataset %s because of nans" % dset, file=sys.stderr
            )
            continue

        ranks = rankdata(values, method="average")

        vec_ranks.append(ranks)
        rank_dict = {m: ranks[i] for i, m in enumerate(methods)}

        all_ranks[dset] = rank_dict

    avg_ranks = np.mean(vec_ranks, axis=0)
    avg_ranks = {m: r for m, r in zip(methods, avg_ranks)}
    return avg_ranks, all_ranks


def preprocess_data(data, _type):
    methods = set([m for dset in data.keys() for m in data[dset].keys()])
    methods = sorted(methods)
    if _type == "best":
        print(
            "\nWarning: Filtering out RBOCPDMS due to insufficient results.\n",
            file=sys.stderr,
        )
        methods = [m for m in methods if not m == "rbocpdms"]

    data_w_methods = {}
    for dset in data:
        data_w_methods[dset] = {}
        for method in methods:
            data_w_methods[dset][method] = data[dset][method]

    data_no_missing = {}
    for dset in data_w_methods:
        if any((x is None for x in data_w_methods[dset].values())):
            continue
        data_no_missing[dset] = data_w_methods[dset]
    return data_no_missing, methods
```
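A short sketch of the input and output shapes of `compute_ranks`; the dataset names are real benchmark datasets but the scores are invented:

```python
from rank_common import compute_ranks

results = {
    "nile": {"amoc": 0.80, "pelt": 0.85, "wbs": 0.85},
    "well_log": {"amoc": 0.60, "pelt": 0.75, "wbs": 0.70},
}
avg_ranks, all_ranks = compute_ranks(
    results, keep_methods=["amoc", "pelt", "wbs"], higher_better=True
)
# avg_ranks -> {"amoc": 3.0, "pelt": 1.25, "wbs": 1.75}
# Ties receive the average rank, e.g. the 0.85 tie on "nile" gives 1.5 each.
```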
diff --git a/analysis/scripts/rank_plots.py b/analysis/scripts/rank_plots.py
new file mode 100644
index 00000000..c4cad16f
--- /dev/null
+++ b/analysis/scripts/rank_plots.py
@@ -0,0 +1,151 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Create rank plots from best table json files.

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse

from labella.timeline import TimelineTex
from labella.scale import LinearScale

from rank_common import load_data, compute_ranks, preprocess_data
from significance import reference_difference


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input",
        help="Input JSON file with results for each method",
        required=True,
    )
    parser.add_argument(
        "-o", "--output", help="Output tex file to write to", required=True
    )
    parser.add_argument(
        "-b",
        "--better",
        help="Whether higher or lower is better",
        choices=["min", "max"],
        default="max",
    )
    parser.add_argument(
        "--type",
        help="Type of table to make",
        choices=["best", "default"],
        required=True,
    )
    return parser.parse_args()


def method_name(m):
    m = m.split("_")[-1]
    return "\\textsc{%s}" % m


def make_rank_plot(
    results,
    output_file,
    keep_methods=None,
    higher_better=True,
    return_ranks=False,
):
    methods = keep_methods[:]
    avg_ranks, all_ranks = compute_ranks(
        results, keep_methods=keep_methods, higher_better=higher_better
    )
    plot_data = [
        {"time": rank, "text": method_name(method)}
        for method, rank in avg_ranks.items()
    ]

    color = "#000"

    options = {
        "scale": LinearScale(),
        "direction": "up",
        "domain": [1, len(methods)],
        "layerGap": 20,
        "borderColor": "#000",
        "showBorder": False,
        "labelBgColor": "#fff",
        "linkColor": color,
        "labelTextColor": color,
        "dotColor": color,
        "initialWidth": 600,
        "initialHeight": 75,
        "latex": {"linkThickness": "thin", "reproducible": True},
        "dotRadius": 2,
        "margin": {"left": 0, "bottom": 0, "right": 0, "top": 0},
    }

    tl = TimelineTex(plot_data, options=options)
    texlines = tl.export()

    n_datasets = len(all_ranks)

    ref_method, CD, _ = reference_difference(avg_ranks, n_datasets)

    # we're going to insert the critical difference line after the dots
    # scope, so we first have to figure out where that is.
    lines = texlines.split("\n")
    idx = None
    find_scope = False
    for i, line in enumerate(lines):
        if line.strip() == "% dots":
            find_scope = True
        if find_scope and "\\end{scope}" in line:
            idx = i + 1
            break

    before = lines[:idx]
    after = lines[idx:]

    nodes, _ = tl.compute()
    bestnode = next(
        (n for n in nodes if n.data.text == method_name(ref_method)), None
    )
    # idealPos is the position on the axis
    posBest = bestnode.getRoot().idealPos
    posCD = tl.options["scale"](bestnode.data.time + CD)

    CDlines = [
        "% Critical difference",
        "\\def\\posBest{%.16f}" % posBest,
        "\\def\\posCD{%.16f}" % posCD,
        "\\begin{scope}",
        "\\draw (\\posBest, 30) -- (\\posBest, 20);",
        "\\draw (\\posBest, 25) --node[below] {CD} (\\posCD, 25);",
        "\\draw (\\posCD, 30) -- (\\posCD, 20);",
        "\\end{scope}",
    ]

    all_lines = before + [""] + CDlines + after

    with open(output_file, "w") as fp:
        fp.write("\n".join(all_lines))


def main():
    args = parse_args()

    higher_better = args.better == "max"

    data = load_data(args.input)
    clean, methods = preprocess_data(data, args.type)

    make_rank_plot(
        clean, args.output, keep_methods=methods, higher_better=higher_better
    )


if __name__ == "__main__":
    main()
```
diff --git a/analysis/scripts/significance.py b/analysis/scripts/significance.py
new file mode 100644
index 00000000..6666306b
--- /dev/null
+++ b/analysis/scripts/significance.py
@@ -0,0 +1,179 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Code to compute significant differences

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import math
import scipy.stats as stats

from tabulate import tabulate

from rank_common import (
    load_data,
    preprocess_data,
    compute_ranks,
)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", help="Output basename")
    parser.add_argument(
        "-i", "--input", help="Input JSON file with results for each method"
    )
    parser.add_argument(
        "-m",
        "--mode",
        choices=["global", "reference"],
        help="Whether to do a global difference F test or a reference test to the best performing method",
    )
    parser.add_argument(
        "--type",
        help="Type of table to make",
        choices=["best", "default"],
        required=True,
    )
    return parser.parse_args()


def global_difference(avg_ranks, n_datasets):
    N = n_datasets
    k = len(avg_ranks)
    avg_sq_sum = sum([pow(float(avg_ranks[m]), 2.0) for m in avg_ranks])

    chi2 = (
        12.0 * N / (k * (k + 1)) * (avg_sq_sum - (k * pow(k + 1, 2.0) / 4.0))
    )
    chiprob = 1.0 - stats.chi2.cdf(chi2, k - 1)

    Fstat = (N - 1.0) * chi2 / (N * (k - 1) - chi2)
    Fprob = 1.0 - stats.f.cdf(Fstat, k - 1, (k - 1) * (N - 1))

    return Fstat, Fprob


def argmin(func, args):
    m, inc = float("inf"), None
    for a in args:
        v = func(a)
        if v < m:
            m, inc = v, a
    return inc


def reference_difference(avg_ranks, n_datasets, significance_level=0.05):
    N = n_datasets
    k = len(avg_ranks)

    methods = sorted(avg_ranks.keys())
    ranks = [avg_ranks[m] for m in methods]
    ref_method = argmin(lambda m: avg_ranks[m], methods)
    ref_idx = methods.index(ref_method)
    others = [m for m in methods if not m == ref_method]

    Z_scores = [0.0] * (k - 1)
    P_values = [0.0] * (k - 1)

    constant = math.sqrt(6 * N / (k * (k + 1)))
    for j, method in enumerate(others):
        i = methods.index(method)
        Z_scores[j] = (ranks[ref_idx] - ranks[i]) * constant
        P_values[j] = stats.norm.cdf(Z_scores[j])

    # sort the p-values in ascending order
    sorted_pvals = sorted((p, i) for i, p in enumerate(P_values))

    # Calculate significance differences following Holm's procedure
    significant_differences = [False] * (k - 1)
    thresholds = [0] * (k - 1)
    CD_threshold = None
    for i in range(k - 1):
        threshold = significance_level / float(k - (i + 1))
        pval, idx = sorted_pvals[i]
        significant_differences[idx] = pval < threshold
        thresholds[idx] = threshold
        if pval > threshold and CD_threshold is None:
            CD_threshold = threshold

    # Calculate the critical difference from the first threshold that failed
    # to reject. This works because if the p-value would be below the
    # threshold we would consider it significantly different and above the
    # threshold we would not.
    CD = -1 * stats.norm.ppf(CD_threshold) / constant

    txt = [
        "Number of datasets: %i" % N,
        "Number of methods: %i" % k,
        "Reference method: %s" % ref_method,
        "Significance level: %g" % significance_level,
        "",
        "Reference method rank: %.6f" % avg_ranks[ref_method],
        "Holm's procedure:",
    ]

    table = []
    for o, p, t, s in zip(
        others, P_values, thresholds, significant_differences
    ):
        table.append([o, avg_ranks[o], p, t, s])

    txt.append(
        tabulate(
            table,
            headers=["Method", "Rank", "p-Value", "Threshold", "Significant"],
        )
    )

    txt.append("")
    txt.append(
        "Critical difference: %.6f (at threshold = %.6f)" % (CD, CD_threshold)
    )
    txt.append("")

    return ref_method, CD, txt


def main():
    args = parse_args()

    data = load_data(args.input)
    clean, methods = preprocess_data(data, args.type)

    n_datasets = len(clean)

    avg_ranks, all_ranks = compute_ranks(
        clean, keep_methods=methods, higher_better=True
    )

    if args.mode == "global":
        Fstat, Fprob = global_difference(avg_ranks, n_datasets)
        if args.output:
            with open(args.output, "w") as fp:
                fp.write("F = %.1f (p = %g)" % (Fstat, Fprob))
        else:
            print("Fstat = %.2f, Fprob = %.2g" % (Fstat, Fprob))
    elif args.mode == "reference":
        ref_method, CD, txt = reference_difference(avg_ranks, n_datasets)
        if args.output:
            outRef = args.output + "_ref.tex"
            with open(outRef, "w") as fp:
                fp.write(ref_method + "%")
            outCD = args.output + "_CD.tex"
            with open(outCD, "w") as fp:
                fp.write(("%.2f" % CD) + "%")
        else:
            print("Reference method = %s, CD = %.2f" % (ref_method, CD))
            print("")
            print("\n".join(txt))


if __name__ == "__main__":
    main()
```
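The two tests are driven from the same JSON files as the rank plots; a minimal sketch of the programmatic use (the input file name is hypothetical):

```python
from rank_common import load_data, preprocess_data, compute_ranks
from significance import global_difference, reference_difference

data = load_data("best_cover_uni.json")  # hypothetical path
clean, methods = preprocess_data(data, "best")
avg_ranks, _ = compute_ranks(clean, keep_methods=methods, higher_better=True)

Fstat, Fprob = global_difference(avg_ranks, n_datasets=len(clean))
ref_method, CD, report = reference_difference(avg_ranks, n_datasets=len(clean))
print("\n".join(report))
```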
diff --git a/analysis/scripts/summarize.py b/analysis/scripts/summarize.py
new file mode 100644
index 00000000..426976c5
--- /dev/null
+++ b/analysis/scripts/summarize.py
@@ -0,0 +1,178 @@

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Summarize the results into a single file per dataset.

For each dataset we want::

    {
        "dataset": "<name>",
        "dataset_nobs": N,
        "dataset_ndim": N,
        "annotations": {
            "<user_id>": [...],
            "<user_id>": [...],
        },
        "results": {
            "<method>": [
                {
                    "parameters": {
                        "<param>": value,
                    },
                    "cplocations": [...],
                    "scores": {
                        "<score_1>": value,
                    },
                    "status": <status>
                },
                {
                    "parameters": {
                        "<param>": value,
                    },
                    "cplocations": [...],
                    "scores": {
                        "<score_1>": value,
                    },
                    "status": <status>
                },
            ],
        }
    }

Basic cleanup on the change point locations will also be performed:

    - deduplication
    - removal of invalid indices. Recall that indices are 0-based. We remove
      any indices smaller than 1 and larger than n_obs - 2. The reason that
      we don't allow 0 or n_obs - 1 (both valid endpoints) is that several
      algorithms declare these locations as change points by default and
      they are meaningless.

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""

import argparse
import json
import os
import sys

from metrics import f_measure, covering


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--annotation-file",
        help="Path to annotation file",
        required=True,
    )
    parser.add_argument(
        "-d", "--dataset-file", help="Path to dataset file", required=True
    )
    parser.add_argument(
        "-r", "--result-dir", help="Directory of abed results", required=True
    )
    parser.add_argument("-o", "--output-file", help="File to write to")
    return parser.parse_args()


def load_json(filename):
    with open(filename, "r") as fp:
        try:
            data = json.load(fp)
        except json.decoder.JSONDecodeError:
            print("Error parsing json file: %s" % filename, file=sys.stderr)
            return {"error": "parsing error"}
    return data


def load_annotations(filename, dataset):
    with open(filename, "r") as fp:
        data = json.load(fp)
    return data[dataset]


def clean_cps(locations, dataset):
    n_obs = dataset["n_obs"]
    valid = set([x for x in locations if 1 <= x < n_obs - 1])
    return sorted(valid)


def main():
    args = parse_args()

    dataset = load_json(args.dataset_file)
    annotations = load_annotations(args.annotation_file, dataset["name"])

    out = {
        "dataset": dataset["name"],
        "dataset_nobs": dataset["n_obs"],
        "dataset_ndim": dataset["n_dim"],
        "annotations": annotations,
        "results": {},
    }

    data_results = next(
        (d for d in os.listdir(args.result_dir) if d == dataset["name"]), None
    )
    if data_results is None:
        print(
            "Couldn't find the result directory for dataset %s"
            % dataset["name"],
            file=sys.stderr,
        )
        raise SystemExit(1)

    dataset_dir = os.path.join(args.result_dir, data_results)

    for method in os.listdir(dataset_dir):
        method_dir = os.path.join(dataset_dir, method)
        for result_file in os.listdir(method_dir):
            # print("Processing result file: %s" % result_file)
            fname = os.path.join(method_dir, result_file)
            result = load_json(fname)
            if not method in out["results"]:
                out["results"][method] = []

            if result["status"].lower() == "success":
                locations = clean_cps(result["result"]["cplocations"], dataset)

                f1, precision, recall = f_measure(
                    annotations, locations, return_PR=True
                )
                n_obs = dataset["n_obs"]
                cover = covering(annotations, locations, n_obs)
                scores = {
                    "f1": f1,
                    "precision": precision,
                    "recall": recall,
                    "cover": cover,
                }
            else:
                locations = None
                scores = None

            out["results"][method].append(
                {
                    "parameters": result["parameters"],
                    "task_file": result_file,
                    "cplocations": locations,
                    "scores": scores,
                    "status": result["status"],
                }
            )

    if args.output_file:
        with open(args.output_file, "w") as fp:
            json.dump(out, fp, indent="\t")
    else:
        print(json.dumps(out, indent="\t"))


if __name__ == "__main__":
    main()
```
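The cleanup rule described in the docstring, in a nutshell (toy values only):

```python
# 0 and n_obs - 1 are dropped, duplicates collapse, out-of-range indices go.
clean_cps([0, 3, 3, 7, 99, 100], {"n_obs": 100})  # -> [3, 7]
```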
