Diffstat (limited to 'analysis/scripts')
-rw-r--r--  analysis/scripts/aggregate_table_wide.py      2
-rw-r--r--  analysis/scripts/descriptive_annotations.py  80
-rw-r--r--  analysis/scripts/descriptive_length.py       73
-rw-r--r--  analysis/scripts/make_table.py               15
-rw-r--r--  analysis/scripts/metrics.py                  51
-rw-r--r--  analysis/scripts/rank_common.py              15
6 files changed, 231 insertions, 5 deletions
diff --git a/analysis/scripts/aggregate_table_wide.py b/analysis/scripts/aggregate_table_wide.py
index 712a6a4a..52676e1c 100644
--- a/analysis/scripts/aggregate_table_wide.py
+++ b/analysis/scripts/aggregate_table_wide.py
@@ -30,6 +30,7 @@ class Method(Enum):
rfpop = "rfpop"
segneigh = "segneigh"
wbs = "wbs"
+ zero = "zero"
# Methods that support multidimensional datasets
@@ -39,6 +40,7 @@ MULTIMETHODS = [
Method.ecp,
Method.kcpa,
Method.rbocpdms,
+    Method.zero,
]
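Judging by the empty-prediction doctests added to metrics.py below, and by zero's inclusion among the multi-dimensional and missing-value methods, the new zero method appears to be a no-change baseline. A minimal sketch of such a detector (the function name and return convention are assumptions, not the benchmark's actual detector interface):

    def zero_detector(series):
        # Hypothetical sketch, not the benchmark's actual interface: a
        # no-op baseline that predicts no change points at all. f_measure
        # adds index 0 to every prediction set itself, and covering scores
        # the single whole-series segment, so an empty list is enough to
        # encode the "no changes" hypothesis.
        return []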
diff --git a/analysis/scripts/descriptive_annotations.py b/analysis/scripts/descriptive_annotations.py
new file mode 100644
index 00000000..2afdc422
--- /dev/null
+++ b/analysis/scripts/descriptive_annotations.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+"""Extract descriptive statistics for the time series
+
+This script is used to extract descriptive statistics about the number of
+annotations from the summary files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+import argparse
+import json
+import os
+import statistics
+
+N_DATASETS = 42
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-s",
+ "--summary-dir",
+ help="Directory with summary files",
+ required=True,
+ )
+ parser.add_argument(
+ "-t",
+ "--type",
+ help="Type of statistic to compute",
+ choices=["min", "max", "mean", "std"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def load_unique_annotations(summary_dir):
+ files = os.listdir(summary_dir)
+ assert len(files) == N_DATASETS
+
+ n_uniq_anno = []
+ for f in sorted(files):
+ path = os.path.join(summary_dir, f)
+ with open(path, "r") as fp:
+ data = json.load(fp)
+
+ all_anno = set()
+ for annotations in data["annotations"].values():
+ for cp in annotations:
+ all_anno.add(cp)
+ n_uniq_anno.append(len(all_anno))
+ return n_uniq_anno
+
+
+def main():
+ args = parse_args()
+ if args.type == "max":
+ func = max
+ elif args.type == "mean":
+ func = statistics.mean
+ elif args.type == "std":
+ func = statistics.stdev
+ elif args.type == "min":
+ func = min
+ else:
+ raise ValueError("Unknown type")
+
+    n_uniq_anno = load_unique_annotations(args.summary_dir)
+    # The trailing '%' is presumably a LaTeX comment character: it stops
+    # LaTeX from inserting a space when the output is \input into the paper.
+    if args.type in ["min", "max"]:
+        print("%i%%" % func(n_uniq_anno))
+    else:
+        print("%.1f%%" % func(n_uniq_anno))
+
+
+if __name__ == "__main__":
+ main()
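To make the counting in load_unique_annotations concrete, here is a minimal sketch with an inline summary structure (annotator names and values are illustrative):

    data = {"annotations": {"annotator_1": [10, 20], "annotator_2": [11, 20]}}
    all_anno = set()
    for annotations in data["annotations"].values():
        all_anno.update(annotations)  # same effect as the inner add() loop
    print(len(all_anno))  # 3 unique positions: {10, 11, 20}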
diff --git a/analysis/scripts/descriptive_length.py b/analysis/scripts/descriptive_length.py
new file mode 100644
index 00000000..e8504b92
--- /dev/null
+++ b/analysis/scripts/descriptive_length.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+"""Extract descriptive statistics for the time series
+
+This script is used to extract descriptive statistics regarding features of the
+time series from the summary files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+import argparse
+import json
+import os
+import statistics
+
+N_DATASETS = 42
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-s",
+ "--summary-dir",
+ help="Directory with summary files",
+ required=True,
+ )
+ parser.add_argument(
+ "-t",
+ "--type",
+ help="Type of statistic to compute",
+ choices=["min", "max", "mean"],
+ required=True,
+ )
+ return parser.parse_args()
+
+
+def load_summary_nobs(summary_dir):
+ files = os.listdir(summary_dir)
+ assert len(files) == N_DATASETS
+
+ all_nobs = []
+ for f in sorted(files):
+ path = os.path.join(summary_dir, f)
+ with open(path, "r") as fp:
+ data = json.load(fp)
+ all_nobs.append(data["dataset_nobs"])
+ return all_nobs
+
+
+def main():
+ args = parse_args()
+ if args.type == "min":
+ func = min
+ elif args.type == "mean":
+ func = statistics.mean
+ elif args.type == "max":
+ func = max
+ else:
+ raise ValueError("Unknown type")
+
+    all_nobs = load_summary_nobs(args.summary_dir)
+    # As in descriptive_annotations.py, the trailing '%' presumably acts as
+    # a LaTeX comment character when the output is \input into the paper.
+    if args.type in ["min", "max"]:
+        print("%i%%" % func(all_nobs))
+    else:
+        print("%.1f%%" % func(all_nobs))
+
+
+if __name__ == "__main__":
+ main()
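Both descriptive scripts read the same summary files; only the "annotations" and "dataset_nobs" fields are accessed above. A minimal sketch of the assumed file shape (all values illustrative):

    summary = {
        "dataset_nobs": 365,  # read by descriptive_length.py
        "annotations": {      # read by descriptive_annotations.py
            "annotator_1": [42, 180],
            "annotator_2": [45, 180, 300],
        },
    }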
diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py
index c326775e..8070eecd 100644
--- a/analysis/scripts/make_table.py
+++ b/analysis/scripts/make_table.py
@@ -103,6 +103,7 @@ class Method(Enum):
rfpop = "rfpop"
segneigh = "segneigh"
wbs = "wbs"
+ zero = "zero"
# Methods that support multidimensional datasets
@@ -112,6 +113,7 @@ MULTIMETHODS = [
Method.ecp,
Method.kcpa,
Method.rbocpdms,
+ Method.zero,
]
# Multidimensional datasets
@@ -126,7 +128,13 @@ MULTIDATASETS = [
MISSING_DATASETS = [Dataset.uk_coal_employ]
# Methods that handle missing values
-MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet]
+MISSING_METHODS = [
+ Method.bocpdms,
+ Method.ecp,
+ Method.kcpa,
+ Method.prophet,
+ Method.zero,
+]
@dataclass
@@ -323,7 +331,10 @@ def average_results(results):
if any(r.score is None for r in dset_results):
to_remove.append(dataset)
if to_remove:
- warning("\nWarning: Filtering out datasets: %r due to incomplete results for some detectors.\n" % to_remove)
+ warning(
+ "\nWarning: Filtering out datasets: %r due to incomplete results for some detectors.\n"
+ % to_remove
+ )
results = list(filter(lambda r: not r.dataset in to_remove, results))
# check that we are now complete: for all datasets and all methods in the
diff --git a/analysis/scripts/metrics.py b/analysis/scripts/metrics.py
index 932fbb7c..a504227b 100644
--- a/analysis/scripts/metrics.py
+++ b/analysis/scripts/metrics.py
@@ -12,6 +12,17 @@ License: See the LICENSE file.
def true_positives(T, X, margin=5):
"""Compute true positives without double counting
+
+ >>> true_positives({1, 10, 20, 23}, {3, 8, 20})
+ {1, 10, 20}
+ >>> true_positives({1, 10, 20, 23}, {1, 3, 8, 20})
+ {1, 10, 20}
+ >>> true_positives({1, 10, 20, 23}, {1, 3, 5, 8, 20})
+ {1, 10, 20}
+ >>> true_positives(set(), {1, 2, 3})
+ set()
+ >>> true_positives({1, 2, 3}, set())
+ set()
"""
# make a copy so we don't affect the caller
X = set(list(X))
@@ -37,6 +48,12 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
Remember that all CP locations are 0-based!
+ >>> f_measure({1: [10, 20], 2: [11, 20], 3: [10], 4: [0, 5]}, [10, 20])
+ 1.0
+ >>> f_measure({1: [], 2: [10], 3: [50]}, [10])
+ 0.9090909090909091
+ >>> f_measure({1: [], 2: [10], 3: [50]}, [])
+ 0.8
"""
# ensure 0 is in all the sets
Tks = {k + 1: set(annotations[uid]) for k, uid in enumerate(annotations)}
@@ -46,8 +63,10 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
X = set(predictions)
X.add(0)
- Tstar = [tau for tau in Tk for Tk in Tks.values()]
- Tstar = set(Tstar)
+ Tstar = set()
+ for Tk in Tks.values():
+ for tau in Tk:
+ Tstar.add(tau)
K = len(Tks)
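The replaced comprehension nested its loops in the wrong order, so Tk was read before being bound (a NameError, or stale data if a Tk already existed in the enclosing scope). The explicit loops above fix this; an equivalent one-liner, if preferred:

    Tstar = {tau for Tk in Tks.values() for tau in Tk}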
@@ -63,7 +82,17 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False):
def overlap(A, B):
- """ Return the overlap (i.e. Jaccard index) of two sets """
+ """ Return the overlap (i.e. Jaccard index) of two sets
+
+ >>> overlap({1, 2, 3}, set())
+ 0.0
+ >>> overlap({1, 2, 3}, {2, 5})
+ 0.25
+ >>> overlap(set(), {1, 2, 3})
+ 0.0
+ >>> overlap({1, 2, 3}, {1, 2, 3})
+ 1.0
+ """
return len(A.intersection(B)) / len(A.union(B))
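One edge case the doctests do not cover: with both sets empty the ratio is 0/0, so overlap(set(), set()) raises ZeroDivisionError. This cannot happen inside covering, whose partition segments are non-empty by construction, but a guarded variant would look like this (hypothetical helper, not part of the benchmark code):

    def overlap_safe(A, B):
        # Guard the empty/empty case, which the plain version cannot
        # handle; returning 0.0 for it is a choice, not a standard.
        union = A.union(B)
        return len(A.intersection(B)) / len(union) if union else 0.0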
@@ -101,6 +130,15 @@ def cover_single(Sprime, S):
"""Compute the covering of a segmentation S by a segmentation Sprime.
    This follows equation (8) in Arbelaez, 2010.
+
+ >>> cover_single([{1, 2, 3}, {4, 5}, {6}], [{1, 2, 3}, {4, 5, 6}])
+ 0.8333333333333334
+ >>> cover_single([{1, 2, 3, 4}, {5, 6}], [{1, 2, 3, 4, 5, 6}])
+ 0.6666666666666666
+ >>> cover_single([{1, 2}, {3, 4}, {5, 6}], [{1, 2, 3}, {4, 5, 6}])
+ 0.6666666666666666
+ >>> cover_single([{1, 2, 3, 4, 5, 6}], [{1}, {2}, {3}, {4, 5, 6}])
+ 0.3333333333333333
"""
T = sum(map(len, Sprime))
assert T == sum(map(len, S))
@@ -118,6 +156,13 @@ def covering(annotations, predictions, n_obs):
predictions : iterable of predicted Cp locations
n_obs : number of observations in the series
+ >>> covering({1: [10, 20], 2: [10], 3: [0, 5]}, [10, 20], 45)
+ 0.7962962962962963
+ >>> covering({1: [], 2: [10], 3: [40]}, [10], 45)
+ 0.7954144620811286
+ >>> covering({1: [], 2: [10], 3: [40]}, [], 45)
+ 0.8189300411522634
+
"""
Ak = {
k + 1: partition_from_cps(annotations[uid], n_obs)
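The doctests with empty prediction sets double as reference values for the new zero method: on the last annotation set above, the no-change baseline scores F1 = 0.8 and covering ≈ 0.819. A quick check, assuming metrics.py is importable (e.g. run from analysis/scripts):

    from metrics import f_measure, covering

    annotations = {1: [], 2: [10], 3: [40]}
    print(f_measure(annotations, []))     # 0.8
    print(covering(annotations, [], 45))  # 0.8189300411522634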
diff --git a/analysis/scripts/rank_common.py b/analysis/scripts/rank_common.py
index b1d5af77..d12dc5bc 100644
--- a/analysis/scripts/rank_common.py
+++ b/analysis/scripts/rank_common.py
@@ -89,12 +89,27 @@ def warning(msg):
def preprocess_data(data, _type):
methods = set([m for dset in data.keys() for m in data[dset].keys()])
methods = sorted(methods)
+
+ # filter out rbocpdms on "best" (uni or multi)
if _type == "best":
warning(
"\nWarning: Filtering out RBOCPDMS due to insufficient results.\n"
)
methods = [m for m in methods if not m == "rbocpdms"]
+ # filter out methods that have no results on any dataset
+ methods_no_result = set()
+ for m in methods:
+        # .get() treats a dataset without an entry for m as having no result
+        if all(data[d].get(m) is None for d in data):
+ methods_no_result.add(m)
+    if methods_no_result:
+        warning(
+            "\nWarning: Filtering out %r due to no results on any series\n"
+            % methods_no_result
+        )
+    methods = [m for m in methods if m not in methods_no_result]
+
data_w_methods = {}
for dset in data:
data_w_methods[dset] = {}
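For intuition, a minimal example of the new filtering step (dataset names and scores are made up; pelt and rbocpdms are methods from the benchmark):

    data = {
        "dataset_a": {"pelt": 0.61, "rbocpdms": None},
        "dataset_b": {"pelt": 0.47, "rbocpdms": None},
    }
    # "rbocpdms" is None on every dataset here, so the loop above would
    # drop it; "pelt" has at least one result and is kept.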