diff options
Diffstat (limited to 'analysis/scripts')
| -rw-r--r-- | analysis/scripts/aggregate_table_wide.py | 2 | ||||
| -rw-r--r-- | analysis/scripts/descriptive_annotations.py | 80 | ||||
| -rw-r--r-- | analysis/scripts/descriptive_length.py | 73 | ||||
| -rw-r--r-- | analysis/scripts/make_table.py | 15 | ||||
| -rw-r--r-- | analysis/scripts/metrics.py | 51 | ||||
| -rw-r--r-- | analysis/scripts/rank_common.py | 15 |
6 files changed, 231 insertions, 5 deletions
diff --git a/analysis/scripts/aggregate_table_wide.py b/analysis/scripts/aggregate_table_wide.py index 712a6a4a..52676e1c 100644 --- a/analysis/scripts/aggregate_table_wide.py +++ b/analysis/scripts/aggregate_table_wide.py @@ -30,6 +30,7 @@ class Method(Enum): rfpop = "rfpop" segneigh = "segneigh" wbs = "wbs" + zero = "zero" # Methods that support multidimensional datasets @@ -39,6 +40,7 @@ MULTIMETHODS = [ Method.ecp, Method.kcpa, Method.rbocpdms, + Method.zero ] diff --git a/analysis/scripts/descriptive_annotations.py b/analysis/scripts/descriptive_annotations.py new file mode 100644 index 00000000..2afdc422 --- /dev/null +++ b/analysis/scripts/descriptive_annotations.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +"""Extract descriptive statistics for the time series + +This script is used to extract descriptive statistics about the number of +annotations from the summary files. + +Author: Gertjan van den Burg +Copyright (c) 2020 - The Alan Turing Institute +License: See the LICENSE file. + +""" + + +import argparse +import json +import os +import statistics + +N_DATASETS = 42 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-s", + "--summary-dir", + help="Directory with summary files", + required=True, + ) + parser.add_argument( + "-t", + "--type", + help="Type of statistic to compute", + choices=["min", "max", "mean", "std"], + required=True, + ) + return parser.parse_args() + + +def load_unique_annotations(summary_dir): + files = os.listdir(summary_dir) + assert len(files) == N_DATASETS + + n_uniq_anno = [] + for f in sorted(files): + path = os.path.join(summary_dir, f) + with open(path, "r") as fp: + data = json.load(fp) + + all_anno = set() + for annotations in data["annotations"].values(): + for cp in annotations: + all_anno.add(cp) + n_uniq_anno.append(len(all_anno)) + return n_uniq_anno + + +def main(): + args = parse_args() + if args.type == "max": + func = max + elif args.type == "mean": + func = statistics.mean + elif args.type == "std": + func = statistics.stdev + elif args.type == "min": + func = min + else: + raise ValueError("Unknown type") + + n_uniq_anno = load_unique_annotations(args.summary_dir) + if args.type in ["min", "max"]: + print("%i%%" % func(n_uniq_anno)) + else: + print("%.1f%%" % func(n_uniq_anno)) + + +if __name__ == "__main__": + main() diff --git a/analysis/scripts/descriptive_length.py b/analysis/scripts/descriptive_length.py new file mode 100644 index 00000000..e8504b92 --- /dev/null +++ b/analysis/scripts/descriptive_length.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +"""Extract descriptive statistics for the time series + +This script is used to extract descriptive statistics regarding features of the +time series from the summary files. + +Author: Gertjan van den Burg +Copyright (c) 2020 - The Alan Turing Institute +License: See the LICENSE file. + +""" + + +import argparse +import json +import os +import statistics + +N_DATASETS = 42 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-s", + "--summary-dir", + help="Directory with summary files", + required=True, + ) + parser.add_argument( + "-t", + "--type", + help="Type of statistic to compute", + choices=["min", "max", "mean"], + required=True, + ) + return parser.parse_args() + + +def load_summary_nobs(summary_dir): + files = os.listdir(summary_dir) + assert len(files) == N_DATASETS + + all_nobs = [] + for f in sorted(files): + path = os.path.join(summary_dir, f) + with open(path, "r") as fp: + data = json.load(fp) + all_nobs.append(data["dataset_nobs"]) + return all_nobs + + +def main(): + args = parse_args() + if args.type == "min": + func = min + elif args.type == "mean": + func = statistics.mean + elif args.type == "max": + func = max + else: + raise ValueError("Unknown type") + + all_nobs = load_summary_nobs(args.summary_dir) + if args.type in ["min", "max"]: + print("%i%%" % func(all_nobs)) + else: + print("%.1f%%" % func(all_nobs)) + + +if __name__ == "__main__": + main() diff --git a/analysis/scripts/make_table.py b/analysis/scripts/make_table.py index c326775e..8070eecd 100644 --- a/analysis/scripts/make_table.py +++ b/analysis/scripts/make_table.py @@ -103,6 +103,7 @@ class Method(Enum): rfpop = "rfpop" segneigh = "segneigh" wbs = "wbs" + zero = "zero" # Methods that support multidimensional datasets @@ -112,6 +113,7 @@ MULTIMETHODS = [ Method.ecp, Method.kcpa, Method.rbocpdms, + Method.zero, ] # Multidimensional datasets @@ -126,7 +128,13 @@ MULTIDATASETS = [ MISSING_DATASETS = [Dataset.uk_coal_employ] # Methods that handle missing values -MISSING_METHODS = [Method.bocpdms, Method.ecp, Method.kcpa, Method.prophet] +MISSING_METHODS = [ + Method.bocpdms, + Method.ecp, + Method.kcpa, + Method.prophet, + Method.zero, +] @dataclass @@ -323,7 +331,10 @@ def average_results(results): if any(r.score is None for r in dset_results): to_remove.append(dataset) if to_remove: - warning("\nWarning: Filtering out datasets: %r due to incomplete results for some detectors.\n" % to_remove) + warning( + "\nWarning: Filtering out datasets: %r due to incomplete results for some detectors.\n" + % to_remove + ) results = list(filter(lambda r: not r.dataset in to_remove, results)) # check that we are now complete: for all datasets and all methods in the diff --git a/analysis/scripts/metrics.py b/analysis/scripts/metrics.py index 932fbb7c..a504227b 100644 --- a/analysis/scripts/metrics.py +++ b/analysis/scripts/metrics.py @@ -12,6 +12,17 @@ License: See the LICENSE file. def true_positives(T, X, margin=5): """Compute true positives without double counting + + >>> true_positives({1, 10, 20, 23}, {3, 8, 20}) + {1, 10, 20} + >>> true_positives({1, 10, 20, 23}, {1, 3, 8, 20}) + {1, 10, 20} + >>> true_positives({1, 10, 20, 23}, {1, 3, 5, 8, 20}) + {1, 10, 20} + >>> true_positives(set(), {1, 2, 3}) + set() + >>> true_positives({1, 2, 3}, set()) + set() """ # make a copy so we don't affect the caller X = set(list(X)) @@ -37,6 +48,12 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False): Remember that all CP locations are 0-based! + >>> f_measure({1: [10, 20], 2: [11, 20], 3: [10], 4: [0, 5]}, [10, 20]) + 1.0 + >>> f_measure({1: [], 2: [10], 3: [50]}, [10]) + 0.9090909090909091 + >>> f_measure({1: [], 2: [10], 3: [50]}, []) + 0.8 """ # ensure 0 is in all the sets Tks = {k + 1: set(annotations[uid]) for k, uid in enumerate(annotations)} @@ -46,8 +63,10 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False): X = set(predictions) X.add(0) - Tstar = [tau for tau in Tk for Tk in Tks.values()] - Tstar = set(Tstar) + Tstar = set() + for Tk in Tks.values(): + for tau in Tk: + Tstar.add(tau) K = len(Tks) @@ -63,7 +82,17 @@ def f_measure(annotations, predictions, margin=5, alpha=0.5, return_PR=False): def overlap(A, B): - """ Return the overlap (i.e. Jaccard index) of two sets """ + """ Return the overlap (i.e. Jaccard index) of two sets + + >>> overlap({1, 2, 3}, set()) + 0.0 + >>> overlap({1, 2, 3}, {2, 5}) + 0.25 + >>> overlap(set(), {1, 2, 3}) + 0.0 + >>> overlap({1, 2, 3}, {1, 2, 3}) + 1.0 + """ return len(A.intersection(B)) / len(A.union(B)) @@ -101,6 +130,15 @@ def cover_single(Sprime, S): """Compute the covering of a segmentation S by a segmentation Sprime. This follows equation (8) in Arbaleaz, 2010. + + >>> cover_single([{1, 2, 3}, {4, 5}, {6}], [{1, 2, 3}, {4, 5, 6}]) + 0.8333333333333334 + >>> cover_single([{1, 2, 3, 4}, {5, 6}], [{1, 2, 3, 4, 5, 6}]) + 0.6666666666666666 + >>> cover_single([{1, 2}, {3, 4}, {5, 6}], [{1, 2, 3}, {4, 5, 6}]) + 0.6666666666666666 + >>> cover_single([{1, 2, 3, 4, 5, 6}], [{1}, {2}, {3}, {4, 5, 6}]) + 0.3333333333333333 """ T = sum(map(len, Sprime)) assert T == sum(map(len, S)) @@ -118,6 +156,13 @@ def covering(annotations, predictions, n_obs): predictions : iterable of predicted Cp locations n_obs : number of observations in the series + >>> covering({1: [10, 20], 2: [10], 3: [0, 5]}, [10, 20], 45) + 0.7962962962962963 + >>> covering({1: [], 2: [10], 3: [40]}, [10], 45) + 0.7954144620811286 + >>> covering({1: [], 2: [10], 3: [40]}, [], 45) + 0.8189300411522634 + """ Ak = { k + 1: partition_from_cps(annotations[uid], n_obs) diff --git a/analysis/scripts/rank_common.py b/analysis/scripts/rank_common.py index b1d5af77..d12dc5bc 100644 --- a/analysis/scripts/rank_common.py +++ b/analysis/scripts/rank_common.py @@ -89,12 +89,27 @@ def warning(msg): def preprocess_data(data, _type): methods = set([m for dset in data.keys() for m in data[dset].keys()]) methods = sorted(methods) + + # filter out rbocpdms on "best" (uni or multi) if _type == "best": warning( "\nWarning: Filtering out RBOCPDMS due to insufficient results.\n" ) methods = [m for m in methods if not m == "rbocpdms"] + # filter out methods that have no results on any dataset + methods_no_result = set() + for m in methods: + if all(data[d][m] is None for d in data): + methods_no_result.add(m) + if methods_no_result: + print( + "\nWarning: Filtering out %r due to no results on any series\n" + % methods_no_result, + file=sys.stderr, + ) + methods = [m for m in methods if not m in methods_no_result] + data_w_methods = {} for dset in data: data_w_methods[dset] = {} |
