Add descriptive statistics code and results

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-05-25 15:32:17 +0100
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-05-25 15:32:17 +0100
commit: 08d3ea5916864f6f4143e6c1f622f2dd87d21d27 (patch)
tree: 72aa9ab50a8c87fbe4dda0998a644aedd5c426aa /analysis/scripts
parent: Correct calculation of F measure (diff)
download: TCPDBench-08d3ea5916864f6f4143e6c1f622f2dd87d21d27.tar.gz
TCPDBench-08d3ea5916864f6f4143e6c1f622f2dd87d21d27.zip
2 files changed, 153 insertions, 0 deletions
diff --git a/analysis/scripts/descriptive_annotations.py b/analysis/scripts/descriptive_annotations.py
new file mode 100644
index 00000000..2afdc422
--- /dev/null
+++ b/analysis/scripts/descriptive_annotations.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+"""Extract descriptive statistics for the time series
+
+This script is used to extract descriptive statistics about the number of 
+annotations from the summary files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+import argparse
+import json
+import os
+import statistics
+
+N_DATASETS = 42  # number of entries the summary directory must contain
+
+
+def parse_args():
+    """Parse the command line arguments.
+
+    Both options are required: -s/--summary-dir names the directory with
+    the summary files and -t/--type selects the statistic to compute.
+    """
+    cli = argparse.ArgumentParser()
+    summary_help = "Directory with summary files"
+    stat_help = "Type of statistic to compute"
+    stat_choices = ["min", "max", "mean", "std"]
+    # argparse itself rejects any --type value outside stat_choices
+    cli.add_argument("-s", "--summary-dir", help=summary_help, required=True)
+    cli.add_argument(
+        "-t", "--type", help=stat_help, choices=stat_choices, required=True
+    )
+    return cli.parse_args()
+
+
+def load_unique_annotations(summary_dir):
+    """Return the number of unique annotations for each summary file.
+
+    Raises ValueError unless summary_dir holds exactly N_DATASETS entries.
+    """
+    files = sorted(os.listdir(summary_dir))
+    if len(files) != N_DATASETS:  # explicit check: asserts vanish under -O
+        raise ValueError("Expected %i summary files" % N_DATASETS)
+    n_uniq_anno = []
+    for f in files:
+        # JSON text is UTF-8 (RFC 8259); don't depend on the locale encoding
+        with open(os.path.join(summary_dir, f), encoding="utf-8") as fp:
+            annotations = json.load(fp)["annotations"]
+        # distinct values across all annotation lists of this file
+        n_uniq_anno.append(len(set().union(*annotations.values())))
+    return n_uniq_anno
+
+
+def main():
+    """Print the requested statistic of the unique annotation counts.
+
+    Extremes are printed as integers, the mean and standard deviation with
+    one decimal. Every value is followed by a literal percent sign.
+    """
+    args = parse_args()
+    reducers = {
+        "min": min,
+        "max": max,
+        "mean": statistics.mean,
+        "std": statistics.stdev,
+    }
+    if args.type not in reducers:
+        raise ValueError("Unknown type")
+    counts = load_unique_annotations(args.summary_dir)
+    template = "%i%%" if args.type in ("min", "max") else "%.1f%%"
+    print(template % reducers[args.type](counts))
+
+
+if __name__ == "__main__":  # skip when the module is merely imported
+    main()
diff --git a/analysis/scripts/descriptive_length.py b/analysis/scripts/descriptive_length.py
new file mode 100644
index 00000000..e8504b92
--- /dev/null
+++ b/analysis/scripts/descriptive_length.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+"""Extract descriptive statistics for the time series
+
+This script is used to extract descriptive statistics regarding features of the 
+time series from the summary files.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+
+import argparse
+import json
+import os
+import statistics
+
+N_DATASETS = 42  # number of entries the summary directory must contain
+
+
+def parse_args():
+    """Parse the command line arguments.
+
+    Both options are required: -s/--summary-dir names the directory with
+    the summary files and -t/--type selects the statistic to compute.
+    """
+    cli = argparse.ArgumentParser()
+    summary_help = "Directory with summary files"
+    stat_help = "Type of statistic to compute"
+    stat_choices = ["min", "max", "mean"]
+    # argparse itself rejects any --type value outside stat_choices
+    cli.add_argument("-s", "--summary-dir", help=summary_help, required=True)
+    cli.add_argument(
+        "-t", "--type", help=stat_help, choices=stat_choices, required=True
+    )
+    return cli.parse_args()
+
+
+def load_summary_nobs(summary_dir):
+    """Return the "dataset_nobs" value of every summary file, by file name."""
+    files = sorted(os.listdir(summary_dir))
+    if len(files) != N_DATASETS:  # explicit check: asserts vanish under -O
+        raise ValueError("Expected %i summary files" % N_DATASETS)
+    all_nobs = []
+    for f in files:
+        # JSON text is UTF-8 (RFC 8259); don't depend on the locale encoding
+        with open(os.path.join(summary_dir, f), encoding="utf-8") as fp:
+            all_nobs.append(json.load(fp)["dataset_nobs"])
+    return all_nobs
+
+
+def main():
+    """Print the requested statistic of the "dataset_nobs" values.
+
+    Integers for min/max, one decimal for the mean, then a percent sign.
+    """
+    args = parse_args()
+    reducers = {
+        "min": min,
+        "max": max,
+        "mean": statistics.mean,
+    }
+    if args.type not in reducers:
+        raise ValueError("Unknown type")
+    nobs_values = load_summary_nobs(args.summary_dir)
+    template = "%i%%" if args.type in ("min", "max") else "%.1f%%"
+    print(template % reducers[args.type](nobs_values))
+
+
+if __name__ == "__main__":  # skip when the module is merely imported
+    main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-05-25 15:32:17 +0100
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2020-05-25 15:32:17 +0100
commit	08d3ea5916864f6f4143e6c1f622f2dd87d21d27 (patch)
tree	72aa9ab50a8c87fbe4dda0998a644aedd5c426aa /analysis/scripts
parent	Correct calculation of F measure (diff)
download	TCPDBench-08d3ea5916864f6f4143e6c1f622f2dd87d21d27.tar.gz TCPDBench-08d3ea5916864f6f4143e6c1f622f2dd87d21d27.zip