Diffstat (limited to 'analysis/scripts')
-rw-r--r--  analysis/scripts/aggregate_table_wide.py      2
-rw-r--r--  analysis/scripts/descriptive_annotations.py  80
-rw-r--r--  analysis/scripts/descriptive_length.py       73
-rw-r--r--  analysis/scripts/make_table.py               15
-rw-r--r--  analysis/scripts/metrics.py                  51
-rw-r--r--  analysis/scripts/rank_common.py              15
6 files changed, 231 insertions, 5 deletions
diff --git a/analysis/scripts/aggregate_table_wide.py b/analysis/scripts/aggregate_table_wide.py
index 712a6a4a..52676e1c 100644
--- a/analysis/scripts/aggregate_table_wide.py
+++ b/analysis/scripts/aggregate_table_wide.py
@@ -30,6 +30,7 @@ class Method(Enum):
rfpop = "rfpop"
segneigh = "segneigh"
wbs = "wbs"
+ zero = "zero"
# Methods that support multidimensional datasets
@@ -39,6 +40,7 @@ MULTIMETHODS = [
Method.ecp,
Method.kcpa,
Method.rbocpdms,
+    Method.zero,
]
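Judging by the empty-prediction doctests added to metrics.py below, and by zero's inclusion among the multi-dimensional and missing-value methods, the new zero method appears to be a no-change baseline. A minimal sketch of such a detector (the function name and return convention are assumptions, not the benchmark's actual detector interface):

    def zero_detector(series):
        # Hypothetical sketch, not the benchmark's actual interface: a
        # no-op baseline that predicts no change points at all. f_measure
        # adds index 0 to every prediction set itself, and covering scores
        # the single whole-series segment, so an empty list is enough to
        # encode the "no changes" hypothesis.
        return []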
diff --git a/analysis/scripts/descriptive_annotations.py b/analysis/scripts/descriptive_annotations.py
new file mode 100644
index 00000000..2afdc422
--- /dev/null
+++ b/analysis/scripts/descriptive_annotations.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+"""Extract descriptive statistics for the time series
+
+This script is used to extract descriptive statistics about the number of
+annotations from the summary files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+import argparse
+import json
+import os
+import statistics
+
+N_DATASETS = 42
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-s",
+ "--summary-dir",
+ help="Directory with summary files",
+ required=True,
+ )
+ parser.add_argument(
+ "-t",
+ "--type",
+ help="Type of statistic to compute",
+ choices=["min", "max", "mean", "std"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def load_unique_annotations(summary_dir):
+ files = os.listdir(summary_dir)
+ assert len(files) == N_DATASETS
+
+ n_uniq_anno = []
+ for f in sorted(files):
+ path = os.path.join(summary_dir, f)
+ with open(path, "r") as fp:
+ data = json.load(fp)
+
+ all_anno = set()
+ for annotations in data["annotations"].values():
+ for cp in annotations:
+ all_anno.add(cp)
+ n_uniq_anno.append(len(all_anno))
+ return n_uniq_anno
+
+
+def main():
+ args = parse_args()
+ if args.type == "max":
+ func = max
+ elif args.type == "mean":
+ func = statistics.mean
+ elif args.type == "std":
+ func = statistics.stdev
+ elif args.type == "min":
+ func = min
+ else:
+ raise ValueError("Unknown type")
+
+    n_uniq_anno = load_unique_annotations(args.summary_dir)
+    # The trailing '%' is presumably a LaTeX comment character: it stops
+    # LaTeX from inserting a space when the output is \input into the paper.
+    if args.type in ["min", "max"]:
+        print("%i%%" % func(n_uniq_anno))
+    else:
+        print("%.1f%%" % func(n_uniq_anno))
+
+
+if __name__ == "__main__":
+ main()
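To make the counting in load_unique_annotations concrete, here is a minimal sketch with an inline summary structure (annotator names and values are illustrative):

    data = {"annotations": {"annotator_1": [10, 20], "annotator_2": [11, 20]}}
    all_anno = set()
    for annotations in data["annotations"].values():
        all_anno.update(annotations)  # same effect as the inner add() loop
    print(len(all_anno))  # 3 unique positions: {10, 11, 20}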
diff --git a/analysis/scripts/descriptive_length.py b/analysis/scripts/descriptive_length.py
new file mode 100644
index 00000000..e8504b92
--- /dev/null
+++ b/analysis/scripts/descriptive_length.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+"""Extract descriptive statistics for the time series
+
+This script is used to extract descriptive statistics regarding features of the
+time series from the summary files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+import argparse
+import json
+import os
+import statistics
+
+N_DATASETS = 42
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-s",
+ "--summary-dir",
+ help="Directory with summary files",
+ required=True,
+ )
+ parser.add_argument(
+ "-t",
+ "--type",
+ help="Type of statistic to compute",
+ choices=["min", "max", "mean"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def load_summary_nobs(summary_dir):
+ files = os.listdir(summary_dir)
+ assert len(files) == N_DATASETS
+
+ all_nobs = []
+ for f in sorted(files):
+ path = os.path.join(summary_dir, f)
+ with open(path, "r") as fp:
+ data = json.load(fp)
+ all_nobs.append(data["dataset_nobs"])
+ return all_nobs
+
+
+def main():
+ args = parse_args()
+ if args.type == "min":
+ func = min
+ elif args.type == "mean":
+ func = statistics.mean
+ elif args.type == "max":
+ func = max
+ else:
+ raise ValueError("Unknown type")
+
+    all_nobs = load_summary_nobs(args.summary_dir)
+    # As in descriptive_annotations.py, the trailing '%' presumably acts as
+    # a LaTeX comment character when the output is \input into the paper.
+    if args.type in ["min", "max"]:
+        print("%i%%" % func(all_nobs))
+    else:
+        print("%.1f%%" % func(all_nobs))
+
+
+if __name__ == "__main__":
+ main()
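Both descriptive scripts read the same summary files; only the "annotations" and "dataset_nobs" fields are accessed above. A minimal sketch of the assumed file shape (all values illustrative):

    summary = {
        "dataset_nobs": 365,  # read by descriptive_length.py
        "annotations": {      # read by descriptive_annotations.py
            "annotator_1": [42, 180],
            "annotator_2": [45, 180, 300],
        },
    }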
diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py
index c326775e..8070eecd 100644
--- a/analysis/scripts/make_table.py
+++ b/analysis/scripts/make_table.py
@@ -103,6 +103,7 @@ class Method(Enum):
rfpop = "rfpop"
segneigh = "segneigh"
wbs = "wbs"
+ zero = "zero"
# Methods that support multidimensional datasets
@@ -112,6 +113,7 @@ MULTIMETHODS = [
Method.ecp,
Method.kcpa,
Method.rbocpdms,
+ Method.zero,
]
# Multidimensional datasets
@@ -126,7 +128,13 @@ MULTIDATASETS = [
MISSING_DATASETS = [Dataset.uk_coal_employ]
# Methods that handle missing values
-MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet]
+MISSING_METHODS = [
+ Method.bocpdms,
+ Method.ecp,
+ Method.kcpa,
+ Method.prophet,
+ Method.zero,
+]
@dataclass
@@ -323,7 +331,10 @@ def average_results(results):
if any(r.score is None for r in dset_results):
to_remove.append(dataset)
if to_remove:
- warning("\nWarning: Filtering out datasets: %r due to incomplete results for some detectors.\n" % to_remove)
+ warning(
+ "\nWarning: Filtering out datasets: %r due to incomplete results for some detectors.\n"
+ % to_remove
+ )
results = list(filter(lambda r: not r.dataset in to_remove, results))
# check that we are now complete: for all datasets and all methods in the
diff --git a/analysis/scripts/metrics.py b/analysis/scripts/metrics.py
index 932fbb7c..a504227b 100644
--- a/analysis/scripts/metrics.py
+++ b/analysis/scripts/metrics.py
@@ -12,6 +12,17 @@ License: See the LICENSE file.
def true_positives(T, X, margin=5):
"""Compute true positives without double counting
+
+ >>> true_positives({1, 10, 20, 23}, {3, 8, 20})
+ {1, 10, 20}
+ >>> true_positives({1, 10, 20, 23}, {1, 3, 8, 20})
+ {1, 10, 20}
+ >>> true_positives({1, 10, 20, 23}, {1, 3, 5, 8, 20})
+ {1, 10, 20}
+ >>> true_positives(set(), {1, 2, 3})
+ set()
+ >>> true_positives({1, 2, 3}, set())
+ set()
"""
# make a copy so we don't affect the caller
X = set(list(X))
@@ -37,6 +48,12 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
Remember that all CP locations are 0-based!
+ >>> f_measure({1: [10, 20], 2: [11, 20], 3: [10], 4: [0, 5]}, [10, 20])
+ 1.0
+ >>> f_measure({1: [], 2: [10], 3: [50]}, [10])
+ 0.9090909090909091
+ >>> f_measure({1: [], 2: [10], 3: [50]}, [])
+ 0.8
"""
# ensure 0 is in all the sets
Tks = {k + 1: set(annotations[uid]) for k, uid in enumerate(annotations)}
@@ -46,8 +63,10 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
X = set(predictions)
X.add(0)
- Tstar = [tau for tau in Tk for Tk in Tks.values()]
- Tstar = set(Tstar)
+ Tstar = set()
+ for Tk in Tks.values():
+ for tau in Tk:
+ Tstar.add(tau)
K = len(Tks)
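The replaced comprehension nested its loops in the wrong order, so Tk was read before being bound (a NameError, or stale data if a Tk already existed in the enclosing scope). The explicit loops above fix this; an equivalent one-liner, if preferred:

    Tstar = {tau for Tk in Tks.values() for tau in Tk}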
@@ -63,7 +82,17 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
def overlap(A, B):
- """ Return the overlap (i.e. Jaccard index) of two sets """
+ """ Return the overlap (i.e. Jaccard index) of two sets
+
+ >>> overlap({1, 2, 3}, set())
+ 0.0
+ >>> overlap({1, 2, 3}, {2, 5})
+ 0.25
+ >>> overlap(set(), {1, 2, 3})
+ 0.0
+ >>> overlap({1, 2, 3}, {1, 2, 3})
+ 1.0
+ """
return len(A.intersection(B)) / len(A.union(B))
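One edge case the doctests do not cover: with both sets empty the ratio is 0/0, so overlap(set(), set()) raises ZeroDivisionError. This cannot happen inside covering, whose partition segments are non-empty by construction, but a guarded variant would look like this (hypothetical helper, not part of the benchmark code):

    def overlap_safe(A, B):
        # Guard the empty/empty case, which the plain version cannot
        # handle; returning 0.0 for it is a choice, not a standard.
        union = A.union(B)
        return len(A.intersection(B)) / len(union) if union else 0.0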
@@ -101,6 +130,15 @@ def cover_single(Sprime, S):
"""Compute the covering of a segmentation S by a segmentation Sprime.
    This follows equation (8) in Arbelaez, 2010.
+
+ >>> cover_single([{1, 2, 3}, {4, 5}, {6}], [{1, 2, 3}, {4, 5, 6}])
+ 0.8333333333333334
+ >>> cover_single([{1, 2, 3, 4}, {5, 6}], [{1, 2, 3, 4, 5, 6}])
+ 0.6666666666666666
+ >>> cover_single([{1, 2}, {3, 4}, {5, 6}], [{1, 2, 3}, {4, 5, 6}])
+ 0.6666666666666666
+ >>> cover_single([{1, 2, 3, 4, 5, 6}], [{1}, {2}, {3}, {4, 5, 6}])
+ 0.3333333333333333
"""
T = sum(map(len, Sprime))
assert T == sum(map(len, S))
@@ -118,6 +156,13 @@ def covering(annotations, predictions, n_obs):
predictions : iterable of predicted Cp locations
n_obs : number of observations in the series
+ >>> covering({1: [10, 20], 2: [10], 3: [0, 5]}, [10, 20], 45)
+ 0.7962962962962963
+ >>> covering({1: [], 2: [10], 3: [40]}, [10], 45)
+ 0.7954144620811286
+ >>> covering({1: [], 2: [10], 3: [40]}, [], 45)
+ 0.8189300411522634
+
"""
Ak = {
k + 1: partition_from_cps(annotations[uid], n_obs)
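The doctests with empty prediction sets double as reference values for the new zero method: on the last annotation set above, the no-change baseline scores F1 = 0.8 and covering ≈ 0.819. A quick check, assuming metrics.py is importable (e.g. run from analysis/scripts):

    from metrics import f_measure, covering

    annotations = {1: [], 2: [10], 3: [40]}
    print(f_measure(annotations, []))     # 0.8
    print(covering(annotations, [], 45))  # 0.8189300411522634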
diff --git a/analysis/scripts/rank_common.py b/analysis/scripts/rank_common.py
index b1d5af77..d12dc5bc 100644
--- a/analysis/scripts/rank_common.py
+++ b/analysis/scripts/rank_common.py
@@ -89,12 +89,27 @@ def warning(msg):
def preprocess_data(data, _type):
methods = set([m for dset in data.keys() for m in data[dset].keys()])
methods = sorted(methods)
+
+ # filter out rbocpdms on "best" (uni or multi)
if _type == "best":
warning(
"\nWarning: Filtering out RBOCPDMS due to insufficient results.\n"
)
methods = [m for m in methods if not m == "rbocpdms"]
+ # filter out methods that have no results on any dataset
+ methods_no_result = set()
+ for m in methods:
+        # .get() treats a dataset without an entry for m as having no result
+        if all(data[d].get(m) is None for d in data):
+ methods_no_result.add(m)
+    if methods_no_result:
+        warning(
+            "\nWarning: Filtering out %r due to no results on any series\n"
+            % methods_no_result
+        )
+    methods = [m for m in methods if m not in methods_no_result]
+
data_w_methods = {}
for dset in data:
data_w_methods[dset] = {}
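For intuition, a minimal example of the new filtering step (dataset names and scores are made up; pelt and rbocpdms are methods from the benchmark):

    data = {
        "dataset_a": {"pelt": 0.61, "rbocpdms": None},
        "dataset_b": {"pelt": 0.47, "rbocpdms": None},
    }
    # "rbocpdms" is None on every dataset here, so the loop above would
    # drop it; "pelt" has at least one result and is kept.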