aboutsummaryrefslogtreecommitdiff
path: root/datasets/jfk_passengers/convert.py
blob: 3c8ae1a71744f3e6b52458f437ba66451b4d11d5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Dataset conversion script

Author: G.J.J. van den Burg

"""

import json
import argparse
import clevercsv


def month2index(month):
    """Return the zero-padded month number for a three-letter month name.

    For example ``"Jan"`` maps to ``"01"`` and ``"Dec"`` to ``"12"``. The
    lookup is case-sensitive; any other value raises ``KeyError``.
    """
    abbreviations = (
        "Jan",
        "Feb",
        "Mar",
        "Apr",
        "May",
        "Jun",
        "Jul",
        "Aug",
        "Sep",
        "Oct",
        "Nov",
        "Dec",
    )
    # Position in the calendar year (1-based) rendered as two digits.
    numbers = {
        abbr: f"{position:02d}"
        for position, abbr in enumerate(abbreviations, start=1)
    }
    return numbers[month]


def parse_args():
    """Parse the command line.

    Returns the ``argparse`` namespace with two positional arguments:
    ``input_file`` and ``output_file``.
    """
    cli = argparse.ArgumentParser()
    positionals = (
        ("input_file", "File to convert"),
        ("output_file", "File to write to"),
    )
    for dest, description in positionals:
        cli.add_argument(dest, help=description)
    return cli.parse_args()


def main():
    """Convert the passengers CSV into a JSON time-series document.

    Reads the CSV file named by the ``input_file`` argument, keeps the rows
    whose "Airport Code" column equals "JFK", and writes a JSON document to
    ``output_file``. The document holds one integer series taken from the
    "Total Passengers" column, indexed by "YYYY-MM" time strings built from
    the "Year" and "Month" columns.

    Raises:
        KeyError: if a JFK row has a month name ``month2index`` does not
            know, or a required column is missing.
        ValueError: if a JFK row's "Total Passengers" is not an integer.
    """
    args = parse_args()

    with open(args.input_file, "r", newline="", encoding="ascii") as fp:
        reader = clevercsv.DictReader(
            fp, delimiter=",", quotechar="", escapechar=""
        )
        rows = list(reader)

    # Filter before converting: only JFK rows are parsed, so an unparseable
    # row belonging to another airport no longer aborts the conversion.
    # Zero-padded "YYYY-MM" strings sort lexicographically in date order.
    pairs = sorted(
        (
            f"{row['Year']}-{month2index(row['Month'])}",
            int(row["Total Passengers"]),
        )
        for row in rows
        if row["Airport Code"] == "JFK"
    )

    name = "jfk_passengers"
    longname = "JFK Passengers"
    time_fmt = "%Y-%m"
    timestamps = [stamp for stamp, _ in pairs]
    values = [count for _, count in pairs]

    series = [{"label": "Number of Passengers", "type": "int", "raw": values}]

    data = {
        "name": name,
        "longname": longname,
        "n_obs": len(timestamps),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(timestamps))),
            "raw": timestamps,
        },
        "series": series,
    }

    # Explicit encoding so the output does not depend on the platform
    # default; json.dump escapes non-ASCII by default, so bytes are unchanged.
    with open(args.output_file, "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent="\t")


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()