aboutsummaryrefslogtreecommitdiff
path: root/analysis/scripts/summarize.py
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-03-12 14:33:57 +0000
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-03-12 14:33:57 +0000
commit 7ef8f6e58990fc069cccc71ed6564e8c639ea4fc (patch)
tree 9e7662a34b7d0c1f1c5d9faf6d7d6ea8672f6410 /analysis/scripts/summarize.py
download TCPDBench-7ef8f6e58990fc069cccc71ed6564e8c639ea4fc.tar.gz
TCPDBench-7ef8f6e58990fc069cccc71ed6564e8c639ea4fc.zip
initial commit
Diffstat (limited to 'analysis/scripts/summarize.py')
-rw-r--r-- analysis/scripts/summarize.py 178
1 files changed, 178 insertions, 0 deletions
diff --git a/analysis/scripts/summarize.py b/analysis/scripts/summarize.py
new file mode 100644
index 00000000..426976c5
--- /dev/null
+++ b/analysis/scripts/summarize.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Summarize the results into a single file per dataset.
+
+For each dataset we want::
+
+    {
+        "dataset": "<name>",
+        "dataset_nobs": N,
+        "dataset_ndim": N,
+        "annotations": {
+            "<user_id>": [...],
+            "<user_id>": [...],
+        },
+        "results": {
+            "<method>": [
+                {
+                    "parameters": {
+                        "<param>": value,
+                    },
+                    "task_file": "<result file name>",
+                    "cplocations": [...],
+                    "scores": {
+                        "<score_1>": value,
+                    },
+                    "status": <status>
+                },
+                {
+                    "parameters": {
+                        "<param>": value,
+                    },
+                    "task_file": "<result file name>",
+                    "cplocations": [...],
+                    "scores": {
+                        "<score_1>": value,
+                    },
+                    "status": <status>
+                },
+            ],
+        }
+    }
+
+Basic cleanup on the change point locations will also be performed:
+
+ - deduplication
+ - removal of invalid indices. Recall that indices are 0-based. We remove
+ any indices smaller than 1 and larger than n_obs - 2. The reason that we
+ don't allow 0 or n_obs - 1 (both valid endpoints) is that several
+ algorithms declare these locations as change points by default and they
+ are meaningless.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import argparse
+import json
+import os
+import sys
+
+from metrics import f_measure, covering
+
+
+def parse_args():
+    """Parse the command line options of the summarize script.
+
+    The annotation file, dataset file and result directory are mandatory;
+    the output file is optional.
+
+    Returns the namespace built by argparse, exposing ``annotation_file``,
+    ``dataset_file``, ``result_dir`` and ``output_file``.
+    """
+    # (short flag, long flag, help text) for every mandatory option, in the
+    # order they are registered with the parser.
+    required_options = (
+        ("-a", "--annotation-file", "Path to annotation file"),
+        ("-d", "--dataset-file", "Path to dataset file"),
+        ("-r", "--result-dir", "Directory of abed results"),
+    )
+
+    cli = argparse.ArgumentParser()
+    for short_flag, long_flag, description in required_options:
+        cli.add_argument(
+            short_flag, long_flag, help=description, required=True
+        )
+    cli.add_argument("-o", "--output-file", help="File to write to")
+    return cli.parse_args()
+
+
+def load_json(filename):
+    """Read ``filename`` and return its parsed JSON content.
+
+    When the content cannot be decoded as JSON, a message is written to
+    stderr and the placeholder ``{"error": "parsing error"}`` is returned
+    instead of raising.
+    """
+    with open(filename, "r") as handle:
+        try:
+            return json.load(handle)
+        except json.JSONDecodeError:
+            message = "Error parsing json file: %s" % filename
+            print(message, file=sys.stderr)
+            return {"error": "parsing error"}
+
+
+def load_annotations(filename, dataset):
+    """Return the annotations stored under ``dataset`` in ``filename``.
+
+    The file is parsed as JSON and indexed with the dataset name; a missing
+    name raises ``KeyError``.
+    """
+    with open(filename, "r") as handle:
+        per_dataset = json.loads(handle.read())
+    return per_dataset[dataset]
+
+
+def clean_cps(locations, dataset):
+    """Deduplicate, filter and sort detected change point locations.
+
+    Only locations ``x`` with ``1 <= x < n_obs - 1`` are kept, where
+    ``n_obs`` is read from ``dataset["n_obs"]``; the first and last index of
+    the series are therefore dropped. Returns the remaining locations as a
+    sorted list without duplicates.
+    """
+    n_obs = dataset["n_obs"]
+    interior = {cp for cp in locations if 1 <= cp and cp < n_obs - 1}
+    return sorted(interior)
+
+
+def main():
+    """Build the summary of all results for one dataset and emit it as JSON.
+
+    The dataset description and its annotations are loaded first. Result
+    files are then read from ``<result_dir>/<dataset name>/<method>/``. For
+    every run whose status is "success" (case-insensitive) the change point
+    locations are cleaned with ``clean_cps`` and scored against the
+    annotations (F1, precision, recall and covering); other runs are
+    recorded with ``None`` for both locations and scores.
+
+    The summary is written to the ``--output-file`` path when one is given
+    and printed to stdout otherwise.
+
+    Raises ``SystemExit(1)`` when the result directory has no entry for the
+    dataset.
+    """
+    args = parse_args()
+
+    dataset = load_json(args.dataset_file)
+    annotations = load_annotations(args.annotation_file, dataset["name"])
+
+    out = {
+        "dataset": dataset["name"],
+        "dataset_nobs": dataset["n_obs"],
+        "dataset_ndim": dataset["n_dim"],
+        "annotations": annotations,
+        "results": {},
+    }
+
+    if dataset["name"] not in os.listdir(args.result_dir):
+        print(
+            "Couldn't find the result directory for dataset %s"
+            % dataset["name"],
+            file=sys.stderr,
+        )
+        raise SystemExit(1)
+
+    dataset_dir = os.path.join(args.result_dir, dataset["name"])
+    n_obs = dataset["n_obs"]
+
+    # os.listdir returns entries in arbitrary order; sorting makes the
+    # generated summary reproducible across runs and file systems.
+    for method in sorted(os.listdir(dataset_dir)):
+        method_dir = os.path.join(dataset_dir, method)
+        if not os.path.isdir(method_dir):
+            # Stray files next to the method directories hold no results.
+            continue
+
+        for result_file in sorted(os.listdir(method_dir)):
+            fname = os.path.join(method_dir, result_file)
+            result = load_json(fname)
+            if "status" not in result:
+                # load_json hands back a placeholder dict without "status"
+                # when a file cannot be parsed. There is nothing to
+                # summarize for such a file, so skip it instead of failing
+                # with a KeyError below.
+                print(
+                    "Skipping result file without status: %s" % fname,
+                    file=sys.stderr,
+                )
+                continue
+
+            if result["status"].lower() == "success":
+                locations = clean_cps(result["result"]["cplocations"], dataset)
+
+                f1, precision, recall = f_measure(
+                    annotations, locations, return_PR=True
+                )
+                cover = covering(annotations, locations, n_obs)
+                scores = {
+                    "f1": f1,
+                    "precision": precision,
+                    "recall": recall,
+                    "cover": cover,
+                }
+            else:
+                locations = None
+                scores = None
+
+            out["results"].setdefault(method, []).append(
+                {
+                    "parameters": result["parameters"],
+                    "task_file": result_file,
+                    "cplocations": locations,
+                    "scores": scores,
+                    "status": result["status"],
+                }
+            )
+
+    if args.output_file:
+        with open(args.output_file, "w") as fp:
+            json.dump(out, fp, indent="\t")
+    else:
+        print(json.dumps(out, indent="\t"))
+
+
+# Entry point: build the summary only when this file is run as a script.
+if __name__ == "__main__":
+ main()