diff options
Diffstat (limited to 'datasets/businv/convert.py')
| -rw-r--r-- | datasets/businv/convert.py | 76 |
1 files changed, 76 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Dataset conversion script

Converts a whitespace-separated "Total Business" table (one row per year,
followed by twelve monthly values) into a JSON time-series document.

Author: G.J.J. van den Burg

"""

import argparse
import json


def parse_args(argv=None):
    """Parse the command line.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. When ``None`` (the default) argparse falls back
        to ``sys.argv[1:]``, which is the behaviour of the plain script.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``input_file`` and ``output_file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="File to convert")
    parser.add_argument("output_file", help="File to write to")
    return parser.parse_args(argv)


def _read_stripped_lines(path):
    """Return all lines of the file at ``path`` with outer whitespace removed."""
    with open(path, "r", encoding="utf-8") as fp:
        return [line.strip() for line in fp]


def _select_data_rows(lines):
    """Validate the file layout and return the rows that follow the header.

    The first three lines are header data, the last of which must read
    "Total Business". The fourth line is skipped and the fifth line must be
    the row for the year 1992.

    Raises
    ------
    ValueError
        If the file does not have the expected layout. Explicit exceptions
        are used instead of ``assert`` because assertions are removed when
        Python runs with ``-O``.
    """
    header = lines[:3]
    if not header or header[-1] != "Total Business":
        raise ValueError(
            "Unexpected header: the last header line should be "
            "'Total Business'"
        )

    rows = lines[4:]
    if not rows or not rows[0].startswith("1992"):
        raise ValueError("Unexpected data: the first data row should be 1992")
    return rows


def _parse_rows(rows):
    """Map "YYYY-MM" keys to the integer values found in ``rows``.

    Parsing stops at the first empty row. Within a row, a "." marks the
    first month without data and ends the processing of that row only.

    Raises
    ------
    ValueError
        If a row does not consist of a year followed by twelve fields, or if
        a field is neither "." nor an integer.
    """
    by_month = {}
    for row in rows:
        # stop on first empty line (rows are already stripped)
        if not row:
            break
        parts = row.split()
        if len(parts) != 13:  # year + 12 months
            raise ValueError(
                f"Expected a year and 12 monthly values, got: {row!r}"
            )
        year, *months = parts
        for midx, value in enumerate(months, start=1):
            if value == ".":
                break
            by_month[f"{year}-{midx:02}"] = int(value)
    return by_month


def _build_dataset(by_month):
    """Assemble the output document from the "YYYY-MM" -> value mapping."""
    # zero-padded "YYYY-MM" keys sort chronologically as plain strings
    time = sorted(by_month)
    values = [by_month[t] for t in time]
    series = [{"label": "Business Inventory", "type": "int", "raw": values}]

    return {
        "name": "businv",
        "longname": "Business Inventory",
        "n_obs": len(time),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": "%Y-%m",
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }


def main(argv=None):
    """Convert the input table to JSON and write it to the output file.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments (input file, output file). When ``None`` the
        arguments are taken from ``sys.argv``.

    Raises
    ------
    ValueError
        If the input file does not have the expected layout.
    """
    args = parse_args(argv)

    lines = _read_stripped_lines(args.input_file)
    rows = _select_data_rows(lines)
    data = _build_dataset(_parse_rows(rows))

    with open(args.output_file, "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent="\t")


if __name__ == "__main__":
    main()
