aboutsummaryrefslogtreecommitdiff
path: root/build_tcpd.py
blob: ebf7bb824460b79349ff15ad4ecaa3b4797e0f2c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Collect and verify all time series that are not packaged in the repository.

Author: Gertjan van den Burg
License: See LICENSE file.
Copyright: 2019, The Alan Turing Institute

"""

import argparse
import platform
import os

# Directory (relative to the current working directory) that contains one
# sub-directory per dataset.
DATASET_DIR = "./datasets"

# (dataset name, collection script) pairs, in processing order. Every
# collection script is named ``get_<dataset name>.py``.
TARGETS = [
    (dataset, "get_%s.py" % dataset)
    for dataset in (
        "apple",
        "bee_waggle_6",
        "bitcoin",
        "iceland_tourism",
        "measles",
        "occupancy",
        "ratner_stock",
        "robocalls",
        "scanline_126007",
        "scanline_42049",
    )
]


def parse_args(argv=None):
    """Parse the command line arguments of this script.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. When omitted (the default), ``argparse`` falls
        back to ``sys.argv[1:]``, which is the behaviour callers without
        arguments have always had. Passing an explicit list makes the parser
        usable programmatically.

    Returns
    -------
    argparse.Namespace
        Namespace with the attributes ``verbose`` (bool), ``output_dir``
        (str or None) and ``action`` (``"collect"`` or ``"clean"``,
        defaulting to ``"collect"`` when no action is given).

    Notes
    -----
    NOTE(review): ``output_dir`` is parsed here, but ``main`` in this file
    never reads it -- confirm whether the option is meant to have an effect.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-v", "--verbose", help="Enable logging", action="store_true"
    )
    parser.add_argument(
        "-o", "--output-dir", help="Output directory to store all time series"
    )
    parser.add_argument(
        "action",
        help="Action to perform",
        choices=["collect", "clean"],
        default="collect",
        # "?" makes the positional optional so that ``default`` applies.
        nargs="?",
    )
    return parser.parse_args(argv)


def load_dataset_script(module_name, path):
    """Load the dataset collection script as a module

    This is not a *super* clean way to do this, but it maintains the modularity
    of the dataset, where each dataset can be downloaded individually as well
    as through this script.

    Parameters
    ----------
    module_name : str
        Name to give to the loaded module.
    path : str
        Location of the Python source file to execute.

    Returns
    -------
    module
        The module object produced by executing the script.

    Raises
    ------
    ImportError
        If no import spec (or loader) can be determined for ``path``, for
        instance because the file name has no recognised source suffix.
    """
    version = platform.python_version_tuple()
    if version[0] == "2":
        import imp

        module = imp.load_source(module_name, path)
    elif version[0] == "3" and version[1] in ["3", "4"]:
        # importlib.util.module_from_spec only exists from Python 3.5 on.
        from importlib.machinery import SourceFileLoader

        module = SourceFileLoader(module_name, path).load_module()
    else:
        import importlib.util

        spec = importlib.util.spec_from_file_location(module_name, path)
        # spec_from_file_location returns None when it cannot pick a loader;
        # fail with a clear error instead of an AttributeError further down.
        if spec is None or spec.loader is None:
            raise ImportError(
                "Cannot load dataset script %r as module %r"
                % (path, module_name)
            )
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    return module


def run_dataset_func(name, script, funcname):
    """Call one function of a dataset's collection script.

    The file ``script`` is looked up inside the dataset's own directory
    (``DATASET_DIR/name``) and loaded as the module ``tcpd.<name>``. The
    attribute ``funcname`` of that module is then called with ``output_dir``
    set to the dataset directory. The result of that call is discarded.
    """
    dataset_dir = os.path.join(DATASET_DIR, name)
    script_path = os.path.join(dataset_dir, script)
    dataset_module = load_dataset_script("tcpd.%s" % name, script_path)
    getattr(dataset_module, funcname)(output_dir=dataset_dir)


def collect_dataset(name, script):
    """Run the ``collect`` function of the collection script of ``name``."""
    return run_dataset_func(name, script, funcname="collect")


def clean_dataset(name, script):
    """Run the ``clean`` function of the collection script of ``name``."""
    return run_dataset_func(name, script, funcname="clean")


def main():
    """Apply the action chosen on the command line to every dataset.

    The datasets in ``TARGETS`` are processed in order; with ``--verbose`` a
    progress line is printed for each of them.
    """
    args = parse_args()

    def log(*parts, **options):
        # Progress messages are only printed when --verbose was given.
        if args.verbose:
            print(*parts, **options)

    handlers = {"collect": collect_dataset, "clean": clean_dataset}
    handler = handlers.get(args.action)
    if handler is None:
        raise ValueError("Unknown action: %s" % args.action)

    for dataset, getter in TARGETS:
        announcement = "Running %s action for dataset: %s ... " % (
            args.action,
            dataset,
        )
        # No newline yet: "ok" is appended to the same line on success.
        log(announcement, end="", flush=True)
        handler(dataset, getter)
        log("ok", flush=True)


# Only run the build when executed as a script, not when imported.
if __name__ == "__main__":
    main()