diff options
Diffstat (limited to 'datasets/shanghai_license/convert.py')
| -rw-r--r-- | datasets/shanghai_license/convert.py | 89 |
1 files changed, 89 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Dataset conversion script

Reads the raw CSV file for the Shanghai license dataset and writes it out
as a JSON document holding a time index and a single integer series.

Author: Gertjan van den Burg

"""

import json
import argparse


# Three-letter month abbreviations as they appear in the first CSV column,
# mapped to their month number.
MONTHS = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


def reformat_time(mmmyy):
    """ From MMM-YY to %Y-%m

    For example "Jan-08" becomes "2008-01". The two-digit year is offset by
    2000, so every label is interpreted as a year in the 2000s.

    Raises ValueError when the label is not of the form "MMM-YY" or when the
    month abbreviation is not one of the keys of MONTHS.
    """
    mmm, yy = mmmyy.split("-")
    try:
        m = MONTHS[mmm]
    except KeyError:
        # Fail with a clear message instead of letting a None month reach
        # the string formatting below and surface as a TypeError.
        raise ValueError("Unknown month abbreviation: %r" % mmm) from None
    Y = int(yy) + 2000
    return "%i-%02i" % (Y, m)


def parse_args():
    """ Parse the command line: input file and output file (positional) """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="File to convert")
    parser.add_argument("output_file", help="File to write to")
    return parser.parse_args()


def _split_jan08(time, values):
    """ Split the "2008-01" observation into two monthly entries, in place

    The value at "2008-01" is halved and a new "2008-02" entry carrying the
    same halved value is inserted directly after it. Raises ValueError when
    "2008-01" does not occur in ``time``.
    """
    jan08idx = time.index("2008-01")
    # Floor division keeps both halves integers, matching the "int" type
    # declared for the series; true division would emit floats. An odd
    # count is rounded down.
    values[jan08idx] //= 2
    time.insert(jan08idx + 1, "2008-02")
    values.insert(jan08idx + 1, values[jan08idx])


def _build_dataset(time, values):
    """ Assemble the output dictionary from the time labels and values """
    name = "shanghai_license"
    longname = "Shanghai License"
    time_fmt = "%Y-%m"
    series = [{"label": "No. of Applicants", "type": "int", "raw": values}]

    return {
        "name": name,
        "longname": longname,
        "n_obs": len(time),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }


def main():
    """ Convert the CSV file given on the command line to JSON """
    # Imported here rather than at module level so that the pure helpers in
    # this module stay importable when clevercsv is not installed.
    import clevercsv

    args = parse_args()

    with open(args.input_file, "r", newline="", encoding="ascii") as fp:
        reader = clevercsv.reader(
            fp, delimiter=",", quotechar="", escapechar=""
        )
        rows = list(reader)

    # Discard the first row (presumably the column header).
    rows.pop(0)

    # First column holds the MMM-YY label, last column the integer value.
    time = [reformat_time(r[0]) for r in rows]
    values = [int(r[-1]) for r in rows]

    # Manually split Jan-08 into two, see readme for details.
    _split_jan08(time, values)

    data = _build_dataset(time, values)

    with open(args.output_file, "w") as fp:
        json.dump(data, fp, indent="\t")


if __name__ == "__main__":
    main()
