diff options
Diffstat (limited to 'datasets/construction/convert.py')
| -rw-r--r-- | datasets/construction/convert.py | 114 |
1 files changed, 114 insertions, 0 deletions
diff --git a/datasets/construction/convert.py b/datasets/construction/convert.py new file mode 100644 index 0000000..0abde23 --- /dev/null +++ b/datasets/construction/convert.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Dataset conversion script + +Author: G.J.J. van den Burg + +""" + +import argparse +import json +import xlrd + +MONTHS = { + "Jan": 1, + "Feb": 2, + "Mar": 3, + "Apr": 4, + "May": 5, + "Jun": 6, + "Jul": 7, + "Aug": 8, + "Sep": 9, + "Oct": 10, + "Nov": 11, + "Dec": 12, +} + + +def format_date(datestr): + """ expects: mmm-yyx with x an extraneous character or empty """ + mmm, yyx = datestr.split("-") + midx = MONTHS[mmm] + if len(yyx) == 3: + yy = yyx[:2] + elif len(yyx) == 2: + yy = yyx + else: + raise ValueError + + # this will break in 71 years + if yy.startswith("9"): + yyyy = 1900 + int(yy) + else: + yyyy = 2000 + int(yy) + return f"{yyyy}-{midx:02}" + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("input_file", help="File to convert") + parser.add_argument("output_file", help="File to write to") + return parser.parse_args() + + +def main(): + args = parse_args() + + wb = xlrd.open_workbook(args.input_file) + ws = wb.sheet_by_index(0) + header = ws.row(3) + assert header[0].value == "Date" + + by_month = {} + ridx = 4 + while True: + # stop if date cell is empty + if ws.row(ridx)[0].ctype == xlrd.XL_CELL_EMPTY: + break + + date_value = ws.row(ridx)[0].value + construct_value = ws.row(ridx)[1].value + + date = format_date(date_value) + construct = int(construct_value) + + by_month[date] = construct + ridx += 1 + + name = "construction" + longname = "US Construction Spending" + time = sorted(by_month.keys()) + time_fmt = "%Y-%m" + values = [by_month[t] for t in time] + + series = [ + { + "label": "Total Private Construction Spending", + "type": "int", + "raw": values, + } + ] + + data = { + "name": name, + "longname": longname, + "n_obs": len(time), + "n_dim": len(series), + "time": { + "type": "string", + "format": time_fmt, + "index": list(range(len(time))), + "raw": time, + }, + "series": series, + } + + with open(args.output_file, "w") as fp: + json.dump(data, fp, indent="\t") + + +if __name__ == "__main__": + main() |
