execs/R/cpdbench_ecp.R


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

#' ---
#' title: Wrapper for ecp package in TCPDBench
#' author: G.J.J. van den Burg
#' date: 2019-09-29
#' license: See LICENSE file.
#' copyright: 2019, The Alan Turing Institute
#' ---

library(argparse)
library(ecp)

load.utils <- function() {
    # get the name of the current script so we can load utils.R (yay, R!)
    cmd.args <- commandArgs(trailingOnly=F)
    file.arg <- "--file="
    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
    this.dir <- dirname(this.script)
    utils.script <- file.path(this.dir, 'utils.R')
    source(utils.script)
}

parse.args <- function() {
    parser <- ArgumentParser(description='Wrapper for ecp package')
    parser$add_argument('-i', 
                        '--input',
                        help='path to the input data file',
                        required=TRUE
    )
    parser$add_argument('-o',
                        '--output',
                        help='path to the output file'
    )
    parser$add_argument('-a',
                        '--algorithm',
                        help='algorithm to use',
                        choices=c('e.agglo', 'e.divisive', 'kcpa'),
                        required=TRUE
                        )
    parser$add_argument('--alpha',
                        type='double',
                        help='alpha parameter for agglo and divisive')
    parser$add_argument('--minsize',
                        help='minsize argument for e.divisive',
                        type='integer', default=30)
    parser$add_argument('-R', '--runs',
                        help='number of random permutations to use',
                        type='integer', default=199)
    parser$add_argument('--siglvl',
                        type='double',
                        help='Significance level to use for tests')
    # No examples are provided in the ecp package documentation about 
    # reasonable values for C, so we use 1 as default.
    parser$add_argument('-C', '--cost',
                        type='double',
                        help='cost to use in the kcpa algorithm',
                        default=1)
    parser$add_argument('-L', '--maxcp',
                        help='maximum number of cps in kcpa algorithm',
                        choices=c('max', 'default')
    )
    return(parser$parse_args())
}

main <- function() {
    args <- parse.args()

    # load the dataset
    data <- load.dataset(args$input)

    # copy defaults from the ecp package
    defaults <- list()
    if (args$algorithm == 'e.divisive') {
        defaults$k <- 'null'
    }
    if (args$algorithm == 'kcpa') {
        # Again, we don't want to limit the number of change points a priori, 
        # so set the maximum to the length of the series.
        if (args$maxcp == 'max')
            defaults$L <- data$original$n_obs
        else
            defaults$L <- 5 # following binseg and segneigh default
    }
    params <- make.param.list(args, defaults)

    start.time <- Sys.time()
    result <- tryCatch({
        if (args$algorithm == 'e.agglo') {
            out <- e.agglo(data$mat, alpha=params$alpha)
            locs <- out$estimates
        } else if (args$algorithm == 'e.divisive') {
            out <- e.divisive(data$mat, sig.lvl=params$siglvl, R=params$runs,
                              min.size=params$minsize, alpha=params$alpha)
            locs <- out$estimates
        } else {
            # kcpa
            out <- kcpa(data$mat, params$L, params$cost)
            locs <- out
        }
        list(locations=locs, error=NULL)
    }, error=function(e) {
        return(list(locations=NULL, error=e$message))
    })

    stop.time <- Sys.time()
    runtime <- difftime(stop.time, start.time, units='secs')

    if (!is.null(result$error))
        exit.with.error(data$original, args, params, result$error)

    # convert to 0-based indices
    locations <- as.list(result$locations - 1)

    exit.success(data$original, args, params, locations, runtime)
}

load.utils()
main()