From 08d3ea5916864f6f4143e6c1f622f2dd87d21d27 Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 25 May 2020 15:32:17 +0100 Subject: Add descriptive statistics code and results --- analysis/output/constants/SeriesLengthMax.tex | 1 + analysis/output/constants/SeriesLengthMean.tex | 1 + analysis/output/constants/SeriesLengthMin.tex | 1 + analysis/output/constants/UniqueAnnotationsMax.tex | 1 + .../output/constants/UniqueAnnotationsMean.tex | 1 + analysis/output/constants/UniqueAnnotationsMin.tex | 1 + analysis/output/constants/UniqueAnnotationsStd.tex | 1 + analysis/scripts/descriptive_annotations.py | 80 ++++++++++++++++++++++ analysis/scripts/descriptive_length.py | 73 ++++++++++++++++++++ 9 files changed, 160 insertions(+) create mode 100644 analysis/output/constants/SeriesLengthMax.tex create mode 100644 analysis/output/constants/SeriesLengthMean.tex create mode 100644 analysis/output/constants/SeriesLengthMin.tex create mode 100644 analysis/output/constants/UniqueAnnotationsMax.tex create mode 100644 analysis/output/constants/UniqueAnnotationsMean.tex create mode 100644 analysis/output/constants/UniqueAnnotationsMin.tex create mode 100644 analysis/output/constants/UniqueAnnotationsStd.tex create mode 100644 analysis/scripts/descriptive_annotations.py create mode 100644 analysis/scripts/descriptive_length.py (limited to 'analysis') diff --git a/analysis/output/constants/SeriesLengthMax.tex b/analysis/output/constants/SeriesLengthMax.tex new file mode 100644 index 00000000..a9000c23 --- /dev/null +++ b/analysis/output/constants/SeriesLengthMax.tex @@ -0,0 +1 @@ +991% diff --git a/analysis/output/constants/SeriesLengthMean.tex b/analysis/output/constants/SeriesLengthMean.tex new file mode 100644 index 00000000..9d3449e5 --- /dev/null +++ b/analysis/output/constants/SeriesLengthMean.tex @@ -0,0 +1 @@ +327.7% diff --git a/analysis/output/constants/SeriesLengthMin.tex b/analysis/output/constants/SeriesLengthMin.tex new file mode 100644 index 
00000000..8826b25d --- /dev/null +++ b/analysis/output/constants/SeriesLengthMin.tex @@ -0,0 +1 @@ +15% diff --git a/analysis/output/constants/UniqueAnnotationsMax.tex b/analysis/output/constants/UniqueAnnotationsMax.tex new file mode 100644 index 00000000..2f8b1fd1 --- /dev/null +++ b/analysis/output/constants/UniqueAnnotationsMax.tex @@ -0,0 +1 @@ +26% diff --git a/analysis/output/constants/UniqueAnnotationsMean.tex b/analysis/output/constants/UniqueAnnotationsMean.tex new file mode 100644 index 00000000..18a14a6b --- /dev/null +++ b/analysis/output/constants/UniqueAnnotationsMean.tex @@ -0,0 +1 @@ +7.4% diff --git a/analysis/output/constants/UniqueAnnotationsMin.tex b/analysis/output/constants/UniqueAnnotationsMin.tex new file mode 100644 index 00000000..635c47ac --- /dev/null +++ b/analysis/output/constants/UniqueAnnotationsMin.tex @@ -0,0 +1 @@ +0% diff --git a/analysis/output/constants/UniqueAnnotationsStd.tex b/analysis/output/constants/UniqueAnnotationsStd.tex new file mode 100644 index 00000000..0a119a8a --- /dev/null +++ b/analysis/output/constants/UniqueAnnotationsStd.tex @@ -0,0 +1 @@ +7.0% diff --git a/analysis/scripts/descriptive_annotations.py b/analysis/scripts/descriptive_annotations.py new file mode 100644 index 00000000..2afdc422 --- /dev/null +++ b/analysis/scripts/descriptive_annotations.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +"""Extract descriptive statistics for the time series + +This script is used to extract descriptive statistics about the number of +annotations from the summary files. + +Author: Gertjan van den Burg +Copyright (c) 2020 - The Alan Turing Institute +License: See the LICENSE file. 
# -*- coding: utf-8 -*-
# analysis/scripts/descriptive_annotations.py

"""Extract descriptive statistics for the time series

This script is used to extract descriptive statistics about the number of
annotations from the summary files.

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""


import argparse
import json
import os
import statistics

# Number of summary files the summary directory is required to contain.
N_DATASETS = 42

# Maps every accepted ``--type`` value to the function that computes it.
_STATISTICS = {
    "min": min,
    "max": max,
    "mean": statistics.mean,
    "std": statistics.stdev,
}


def parse_args(argv=None):
    """Parse the command line options of the script.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. When ``None`` (the default) argparse reads
        ``sys.argv[1:]``, which is the original behaviour.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``summary_dir`` and ``type``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--summary-dir",
        help="Directory with summary files",
        required=True,
    )
    parser.add_argument(
        "-t",
        "--type",
        help="Type of statistic to compute",
        choices=["min", "max", "mean", "std"],
        required=True,
    )
    return parser.parse_args(argv)


def load_unique_annotations(summary_dir):
    """Count the unique annotations in every summary file.

    Every entry of ``summary_dir`` is read as a JSON file. The values of its
    ``"annotations"`` object are merged into one set (presumably one list of
    change points per annotator -- confirm against the summary format) and
    the size of that set is recorded.

    Parameters
    ----------
    summary_dir : str
        Directory that holds exactly ``N_DATASETS`` summary files.

    Returns
    -------
    list of int
        Number of unique annotations per file, ordered by sorted file name.

    Raises
    ------
    ValueError
        If the directory does not contain exactly ``N_DATASETS`` entries.
    """
    files = os.listdir(summary_dir)
    # Explicit check instead of ``assert``: assertions are removed when
    # Python runs with -O, which would silently skip this validation.
    if len(files) != N_DATASETS:
        raise ValueError(
            "Expected %i summary files in %r, found %i"
            % (N_DATASETS, summary_dir, len(files))
        )

    n_uniq_anno = []
    for name in sorted(files):
        path = os.path.join(summary_dir, name)
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(path, "r", encoding="utf-8") as fp:
            data = json.load(fp)

        all_anno = set().union(*data["annotations"].values())
        n_uniq_anno.append(len(all_anno))
    return n_uniq_anno


def main(argv=None):
    """Compute the requested statistic and print it to standard output.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments, forwarded to :func:`parse_args`.

    Raises
    ------
    ValueError
        If the requested statistic type is unknown.
    """
    args = parse_args(argv)
    func = _STATISTICS.get(args.type)
    if func is None:
        raise ValueError("Unknown type")

    n_uniq_anno = load_unique_annotations(args.summary_dir)
    # The trailing "%" matches the committed .tex constants (e.g. "26%");
    # presumably it acts as a LaTeX comment that swallows the newline.
    if args.type in ["min", "max"]:
        print("%i%%" % func(n_uniq_anno))
    else:
        print("%.1f%%" % func(n_uniq_anno))


if __name__ == "__main__":
    main()
# -*- coding: utf-8 -*-
# analysis/scripts/descriptive_length.py

"""Extract descriptive statistics for the time series

This script is used to extract descriptive statistics regarding features of the
time series from the summary files.

Author: Gertjan van den Burg
Copyright (c) 2020 - The Alan Turing Institute
License: See the LICENSE file.

"""


import argparse
import json
import os
import statistics

# Number of summary files the summary directory is required to contain.
N_DATASETS = 42

# Maps every accepted ``--type`` value to the function that computes it.
_STATISTICS = {
    "min": min,
    "max": max,
    "mean": statistics.mean,
}


def parse_args(argv=None):
    """Parse the command line options of the script.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. When ``None`` (the default) argparse reads
        ``sys.argv[1:]``, which is the original behaviour.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``summary_dir`` and ``type``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--summary-dir",
        help="Directory with summary files",
        required=True,
    )
    parser.add_argument(
        "-t",
        "--type",
        help="Type of statistic to compute",
        choices=["min", "max", "mean"],
        required=True,
    )
    return parser.parse_args(argv)


def load_summary_nobs(summary_dir):
    """Collect the ``"dataset_nobs"`` value of every summary file.

    Every entry of ``summary_dir`` is read as a JSON file and its
    ``"dataset_nobs"`` field is recorded (presumably the number of
    observations in the series -- confirm against the summary format).

    Parameters
    ----------
    summary_dir : str
        Directory that holds exactly ``N_DATASETS`` summary files.

    Returns
    -------
    list
        The ``"dataset_nobs"`` values, ordered by sorted file name.

    Raises
    ------
    ValueError
        If the directory does not contain exactly ``N_DATASETS`` entries.
    """
    files = os.listdir(summary_dir)
    # Explicit check instead of ``assert``: assertions are removed when
    # Python runs with -O, which would silently skip this validation.
    if len(files) != N_DATASETS:
        raise ValueError(
            "Expected %i summary files in %r, found %i"
            % (N_DATASETS, summary_dir, len(files))
        )

    all_nobs = []
    for name in sorted(files):
        path = os.path.join(summary_dir, name)
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(path, "r", encoding="utf-8") as fp:
            data = json.load(fp)
        all_nobs.append(data["dataset_nobs"])
    return all_nobs


def main(argv=None):
    """Compute the requested statistic and print it to standard output.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments, forwarded to :func:`parse_args`.

    Raises
    ------
    ValueError
        If the requested statistic type is unknown.
    """
    args = parse_args(argv)
    func = _STATISTICS.get(args.type)
    if func is None:
        raise ValueError("Unknown type")

    all_nobs = load_summary_nobs(args.summary_dir)
    # The trailing "%" matches the committed .tex constants (e.g. "991%");
    # presumably it acts as a LaTeX comment that swallows the newline.
    if args.type in ["min", "max"]:
        print("%i%%" % func(all_nobs))
    else:
        print("%.1f%%" % func(all_nobs))


if __name__ == "__main__":
    main()