aboutsummaryrefslogtreecommitdiff
path: root/datasets/construction/convert.py
blob: 0abde23d12f93f73841461c0c751d4b7bf1a2703 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Dataset conversion script

Author: G.J.J. van den Burg

"""

import argparse
import json
import xlrd

# Three-letter English month abbreviation -> 1-based month number.
MONTHS = {
    abbr: number
    for number, abbr in enumerate(
        (
            "Jan",
            "Feb",
            "Mar",
            "Apr",
            "May",
            "Jun",
            "Jul",
            "Aug",
            "Sep",
            "Oct",
            "Nov",
            "Dec",
        ),
        start=1,
    )
}


def format_date(datestr):
    """Convert a ``mmm-yy`` date label into a ``YYYY-MM`` string.

    The input is expected to look like ``"Jan-93"``. A single extraneous
    character after the two-digit year (e.g. ``"Jan-93r"``) is ignored.
    Two-digit years starting with ``9`` are placed in the 1900s, all other
    years in the 2000s.

    Raises:
        ValueError: if ``datestr`` is not exactly two ``-``-separated
            parts, if the year part is not 2 or 3 characters long, or if
            the two year characters are not an integer.
        KeyError: if the month abbreviation is not a key of ``MONTHS``.
    """
    parts = datestr.split("-")
    if len(parts) != 2:
        raise ValueError(
            f"expected a date of the form 'mmm-yy', got {datestr!r}"
        )
    mmm, yyx = parts
    midx = MONTHS[mmm]

    if len(yyx) not in (2, 3):
        raise ValueError(
            "expected a two-digit year with at most one trailing "
            f"character, got {yyx!r} in {datestr!r}"
        )
    # Drop the optional extraneous trailing character.
    yy = yyx[:2]

    # Only two digits of the year are available: 90-99 are read as 199x
    # and everything else as 20xx.
    century = 1900 if yy.startswith("9") else 2000
    return f"{century + int(yy)}-{midx:02}"


def parse_args():
    """Parse the command line and return the resulting namespace.

    Two positional arguments are defined: ``input_file`` and
    ``output_file``.
    """
    positionals = (
        ("input_file", "File to convert"),
        ("output_file", "File to write to"),
    )
    cli = argparse.ArgumentParser()
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args()


def main():
    """Convert the construction spending workbook to a JSON dataset file.

    Reads the first sheet of the workbook named by the ``input_file``
    argument. Row index 3 must hold the header, with "Date" in its first
    cell. From row index 4 onward, column 0 is taken as the date label
    and column 1 as the value, until the first empty date cell or the
    end of the sheet. The result is written as JSON to ``output_file``.

    Raises:
        ValueError: if the first header cell is not "Date", or if a date
            label cannot be parsed by ``format_date``.
    """
    args = parse_args()

    wb = xlrd.open_workbook(args.input_file)
    ws = wb.sheet_by_index(0)
    header = ws.row(3)
    # Explicit check instead of ``assert`` so it still runs under ``-O``.
    if header[0].value != "Date":
        raise ValueError(
            f"expected 'Date' in the first header cell, got {header[0].value!r}"
        )

    by_month = {}
    # Bounding the scan by ``nrows`` lets a sheet whose data runs to the
    # very last row finish cleanly instead of raising IndexError.
    for ridx in range(4, ws.nrows):
        row = ws.row(ridx)

        # stop if date cell is empty
        if row[0].ctype == xlrd.XL_CELL_EMPTY:
            break

        date_value = row[0].value
        construct_value = row[1].value

        date = format_date(date_value)
        construct = int(construct_value)

        by_month[date] = construct

    name = "construction"
    longname = "US Construction Spending"
    # "YYYY-MM" keys sort chronologically as plain strings.
    time = sorted(by_month)
    time_fmt = "%Y-%m"
    values = [by_month[t] for t in time]

    series = [
        {
            "label": "Total Private Construction Spending",
            "type": "int",
            "raw": values,
        }
    ]

    data = {
        "name": name,
        "longname": longname,
        "n_obs": len(time),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }

    with open(args.output_file, "w") as fp:
        json.dump(data, fp, indent="\t")


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()