aboutsummaryrefslogtreecommitdiff
path: root/datasets/businv/convert.py
blob: 68aea1080e7e23bf181351e172f9657fabf3aa1e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Convert the Business Inventory ("businv") text table to a JSON dataset file

Author: G.J.J. van den Burg

"""

import argparse
import json


def parse_args():
    """Read the two positional command-line arguments.

    Returns the ``argparse`` namespace with the attributes ``input_file``
    (path of the file to convert) and ``output_file`` (path to write to).
    """
    positionals = (
        ("input_file", "File to convert"),
        ("output_file", "File to write to"),
    )
    cli = argparse.ArgumentParser()
    for dest, description in positionals:
        cli.add_argument(dest, help=description)
    namespace = cli.parse_args()
    return namespace


def _parse_monthly_table(rows):
    """Collect the monthly values from the rows of the year-by-month table.

    Each row is expected to hold a year followed by twelve whitespace
    separated monthly values. Reading stops at the first empty row. Within a
    row, a "." marks a month without a value; that month and the remaining
    months of the row are skipped.

    Returns a dict that maps "YYYY-MM" keys to integer values.

    Raises ValueError when a row does not have exactly 13 fields or when a
    value cannot be converted to an integer.
    """
    by_month = {}
    for row in rows:
        # stop on first empty line
        if not row.strip():
            break
        parts = row.split()
        if len(parts) != 13:  # year + 12 months
            raise ValueError(
                "expected a year followed by 12 monthly values, "
                f"got {len(parts)} fields: {row!r}"
            )
        year, *months = parts
        for midx, value in enumerate(months, start=1):
            if value == ".":
                break
            by_month[f"{year}-{midx:02}"] = int(value)
    return by_month


def main():
    """Convert the input text table to the JSON dataset format.

    Reads the file named by the ``input_file`` argument, checks that it has
    the expected layout, and writes a JSON document with the monthly series
    to the file named by the ``output_file`` argument.

    Raises ValueError when the input does not have the expected layout. An
    explicit exception is used instead of ``assert`` because assertions are
    removed when Python runs with ``-O``, which would let a malformed file
    be converted silently.
    """
    args = parse_args()

    with open(args.input_file, "r", encoding="utf-8") as fp:
        lines = [line.strip() for line in fp]

    # Three header lines, one skipped line and at least one data row.
    if len(lines) < 5:
        raise ValueError(
            f"input file is too short: expected at least 5 lines, "
            f"got {len(lines)}"
        )

    # header data should be first three lines
    header = lines[:3]
    if header[-1] != "Total Business":
        raise ValueError(
            f"unexpected header line 3: {header[-1]!r} "
            "(expected 'Total Business')"
        )

    # The fourth line is skipped as well.
    # NOTE(review): presumably the column-header (or a blank) line -- confirm
    # against the data file.
    rows = lines[4:]
    if not rows[0].startswith("1992"):
        raise ValueError(
            f"unexpected first data row: {rows[0]!r} "
            "(expected it to start with '1992')"
        )

    by_month = _parse_monthly_table(rows)

    name = "businv"
    longname = "Business Inventory"
    # Keys are "YYYY-MM" with a zero-padded month, so sorting the strings
    # orders the months within each year chronologically.
    time = sorted(by_month)
    time_fmt = "%Y-%m"
    values = [by_month[t] for t in time]

    series = [{"label": "Business Inventory", "type": "int", "raw": values}]

    data = {
        "name": name,
        "longname": longname,
        "n_obs": len(time),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }

    with open(args.output_file, "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent="\t")


# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()