Diffstat (limited to 'execs')
 -rw-r--r--   execs/R/cpdbench_changepoint.R     | 153
 -rw-r--r--   execs/R/cpdbench_changepointnp.R   | 102
 -rw-r--r--   execs/R/cpdbench_ecp.R             | 117
 -rw-r--r--   execs/R/cpdbench_ocp.R             | 116
 -rw-r--r--   execs/R/cpdbench_prophet.R         | 185
 -rw-r--r--   execs/R/cpdbench_rfpop.R           | 102
 -rw-r--r--   execs/R/cpdbench_wbs.R             | 102
 -rw-r--r--   execs/R/utils.R                    | 138
 m---------   execs/python/bocpdms               |   0
 -rw-r--r--   execs/python/cpdbench_bocpdms.py   | 199
 -rw-r--r--   execs/python/cpdbench_rbocpdms.py  | 220
 -rw-r--r--   execs/python/cpdbench_utils.py     | 152
 m---------   execs/python/rbocpdms              |   0
13 files changed, 1586 insertions(+), 0 deletions(-)
diff --git a/execs/R/cpdbench_changepoint.R b/execs/R/cpdbench_changepoint.R
new file mode 100644
index 00000000..ed1f6391
--- /dev/null
+++ b/execs/R/cpdbench_changepoint.R
@@ -0,0 +1,153 @@
+#' ---
+#' title: Wrapper for changepoint package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-09-28
+#' license: See LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(changepoint)
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for changepoint package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-f', '--func',
+                        choices=c('mean', 'var', 'meanvar'),
+                        help='Function to call in the changepoint package',
+                        required=TRUE)
+    parser$add_argument('-p', '--penalty',
+                        choices=c('None', 'SIC', 'BIC', 'MBIC', 'AIC',
+                                  'Hannan-Quinn', 'Asymptotic'),
+                        help='Choice of penalty in the cpt function',
+                        default='MBIC')
+    parser$add_argument('-m', '--method',
+                        choices=c('AMOC', 'PELT', 'SegNeigh', 'BinSeg'),
+                        help="Choice of method in the cpt function",
+                        default='AMOC')
+    parser$add_argument('-t', '--test-statistic',
+                        choices=c('Normal', 'CUSUM', 'CSS', 'Gamma',
+                                  'Exponential', 'Poisson'),
+                        help="Test statistic to use",
+                        default='Normal')
+    parser$add_argument('-Q', '--max-cp',
+                        help='Maximum number of change points',
+                        choices=c('max', 'default'),
+                        default='max')
+    return(parser$parse_args())
+}
+
+main <- function()
+{
+    args <- parse.args()
+
+    # load the data
+    data <- load.dataset(args$input)
+
+    n.obs <- data$original$n_obs
+
+    # get the parameter list
+    defaults <- list()
+    # we set this to the maximum because we have no a priori knowledge of the
+    # maximum number of change points we expect.
+    if (args$method == 'BinSeg' || args$method == 'SegNeigh') {
+        if (args$max_cp == 'max')
+            defaults$Q <- n.obs/2 + 1
+        else
+            defaults$Q <- 5
+    }
+    if (args$penalty == "Asymptotic")
+        defaults$pen.value <- 0.05
+    else
+        defaults$pen.value <- 0 # not used for other penalties
+    params <- make.param.list(args, defaults)
+
+    if (args$func == "mean") {
+        cpt.func <- cpt.mean
+    } else if (args$func == "var") {
+        cpt.func <- cpt.var
+    } else if (args$func == "meanvar") {
+        cpt.func <- cpt.meanvar
+    }
+
+    if (data$original$n_dim > 1) {
+        # changepoint package can't handle multidimensional data
+        exit.error.multidim(data$original, args, params)
+    }
+
+    vec <- as.vector(data$mat)
+    start.time <- Sys.time()
+
+    # call the appropriate function with the specified parameters
+    result <- tryCatch({
+        locs <- cpt.func(vec,
+                         penalty=params$penalty,
+                         pen.value=params$pen.value,
+                         method=params$method,
+                         test.stat=params$test_statistic,
+                         Q=params$Q,
+                         class=FALSE)
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units="secs")
+
+    if (!is.null(result$error)) {
+        exit.with.error(data$original, args, params, result$error)
+    }
+
+    # convert indices to 0-based indices.
+    if (params$method == 'AMOC') {
+        locations <- c(result$locations[1]) - 1
+        names(locations) <- NULL
+        locations <- as.list(locations)
+    } else {
+        if (is.list(result$locations)) {
+            result$locations <- result$locations$cpts
+        }
+        locations <- as.list(result$locations - 1)
+    }
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
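For a concrete sense of what the tryCatch block above gets back, here is a minimal run of the same changepoint call outside the wrapper. The data and the reported locations are illustrative; with class=FALSE the cpt functions return a plain vector of 1-based segment end points (including the final index), which is why the wrapper subtracts 1.

    library(changepoint)
    set.seed(1)
    x <- c(rnorm(100, mean=0), rnorm(100, mean=3))  # one true change at t = 100
    cpt.mean(x, penalty="MBIC", method="PELT", class=FALSE)
    # e.g. returns c(100, 200): the detected change point and the series length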
diff --git a/execs/R/cpdbench_changepointnp.R b/execs/R/cpdbench_changepointnp.R
new file mode 100644
index 00000000..823e58a4
--- /dev/null
+++ b/execs/R/cpdbench_changepointnp.R
@@ -0,0 +1,102 @@
+#' ---
+#' title: Wrapper for changepoint.np package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-09-30
+#' license: See LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(changepoint.np)
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for changepoint.np package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-p', '--penalty',
+                        choices=c('None', 'SIC', 'BIC', 'MBIC', 'AIC',
+                                  'Hannan-Quinn', 'Asymptotic'),
+                        help='Choice of penalty in the cpt function',
+                        default='MBIC')
+    parser$add_argument('-q', '--nquantiles',
+                        type='integer',
+                        help='Number of quantiles to use',
+                        default=10)
+    return(parser$parse_args())
+}
+
+main <- function() {
+    args <- parse.args()
+
+    # load the data
+    data <- load.dataset(args$input)
+
+    # get the parameter list
+    defaults <- list(method='PELT',
+                     test.stat='empirical_distribution',
+                     minseglen=1)
+    params <- make.param.list(args, defaults)
+
+    if (data$original$n_dim > 1) {
+        # changepoint.np package can't handle multidimensional data
+        exit.error.multidim(data$original, args, params)
+    }
+
+    vec <- as.vector(data$mat)
+    start.time <- Sys.time()
+
+    result <- tryCatch({
+        locs <- cpt.np(vec,
+                       penalty=params$penalty,
+                       method=params$method,
+                       test.stat=params$test.stat,
+                       minseglen=params$minseglen,
+                       nquantiles=params$nquantiles,
+                       class=FALSE)
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units="secs")
+
+    if (!is.null(result$error)) {
+        exit.with.error(data$original, args, params, result$error)
+    }
+
+    # convert indices to 0-based indices
+    locations <- as.list(result$locations - 1)
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
diff --git a/execs/R/cpdbench_ecp.R b/execs/R/cpdbench_ecp.R
new file mode 100644
index 00000000..84131c20
--- /dev/null
+++ b/execs/R/cpdbench_ecp.R
@@ -0,0 +1,117 @@
+#' ---
+#' title: Wrapper for ecp package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-09-29
+#' license: See LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(ecp)
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for ecp package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-a', '--algorithm',
+                        help='algorithm to use',
+                        choices=c('e.agglo', 'e.divisive', 'kcpa'),
+                        required=TRUE)
+    parser$add_argument('--alpha',
                        type='double',
+                        help='alpha parameter for agglo and divisive')
+    parser$add_argument('--minsize',
+                        help='minsize argument for e.divisive',
+                        type='integer', default=30)
+    parser$add_argument('-R', '--runs',
+                        help='number of random permutations to use',
+                        type='integer', default=199)
+    parser$add_argument('--siglvl',
+                        type='double',
+                        help='Significance level to use for tests')
+    # No examples are provided in the ecp package documentation about
+    # reasonable values for C, so we use 1 as default.
+    parser$add_argument('-C', '--cost',
+                        type='double',
+                        help='cost to use in the kcpa algorithm',
+                        default=1)
+    parser$add_argument('-L', '--maxcp',
+                        help='maximum number of cps in kcpa algorithm',
+                        choices=c('max', 'default'))
+    return(parser$parse_args())
+}
+
+main <- function() {
+    args <- parse.args()
+
+    # load the dataset
+    data <- load.dataset(args$input)
+
+    # copy defaults from the ecp package
+    defaults <- list()
+    if (args$algorithm == 'e.divisive') {
+        defaults$k <- 'null'
+    }
+    if (args$algorithm == 'kcpa') {
+        # Again, we don't want to limit the number of change points a priori,
+        # so set the maximum to the length of the series.
+        if (args$maxcp == 'max')
+            defaults$L <- data$original$n_obs
+        else
+            defaults$L <- 5 # following binseg and segneigh default
+    }
+    params <- make.param.list(args, defaults)
+
+    start.time <- Sys.time()
+    result <- tryCatch({
+        if (args$algorithm == 'e.agglo') {
+            out <- e.agglo(data$mat, alpha=params$alpha)
+            locs <- out$estimates
+        } else if (args$algorithm == 'e.divisive') {
+            out <- e.divisive(data$mat, sig.lvl=params$siglvl, R=params$runs,
+                              min.size=params$minsize, alpha=params$alpha)
+            locs <- out$estimates
+        } else {
+            # kcpa
+            out <- kcpa(data$mat, params$L, params$cost)
+            locs <- out
+        }
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units='secs')
+
+    if (!is.null(result$error))
+        exit.with.error(data$original, args, params, result$error)
+
+    # convert to 0-based indices
+    locations <- as.list(result$locations - 1)
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
diff --git a/execs/R/cpdbench_ocp.R b/execs/R/cpdbench_ocp.R
new file mode 100644
index 00000000..308e5bac
--- /dev/null
+++ b/execs/R/cpdbench_ocp.R
@@ -0,0 +1,116 @@
+#' ---
+#' title: Wrapper for ocp package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-10-05
+#' license: See the LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(ocp)
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for ocp package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-l', '--lambda',
+                        help='lambda parameter for constant hazard function',
+                        type='integer',
+                        default=100)
+    parser$add_argument('--prior-a',
+                        help='Prior alpha for student-t',
+                        type='double',
+                        default=1)
+    parser$add_argument('--prior-b',
+                        help='Prior beta for student-t',
+                        type='double',
+                        default=1)
+    parser$add_argument('--prior-k',
+                        help='Prior kappa for student-t',
+                        type='double',
+                        default=1)
+
+    return(parser$parse_args())
+}
+
+main <- function()
+{
+    args <- parse.args()
+    data <- load.dataset(args$input)
+
+    # set the defaults that we don't change
+    defaults <- list(missPts="none",
+                     cpthreshold=0.5, # unused by us
+                     truncRlim=10^(-4),
+                     minRlength=1,
+                     maxRlength=10^4, # bigger than any of our datasets
+                     minsep=1,
+                     maxsep=10^4 # bigger than any of our datasets
+                     )
+    defaults$multivariate = data$original$n_dim > 1
+
+    # combine defaults and cmd args
+    params <- make.param.list(args, defaults)
+
+    # define our hazard function with the lambda in the parameters
+    hazard_func <- function(x, lambda) {
+        const_hazard(x, lambda=params$lambda)
+    }
+
+    # we only use the gaussian model since the data is scaled
+    model.params <- list(list(m=0, k=params$prior_k, a=params$prior_a,
+                              b=params$prior_b))
+
+    start.time <- Sys.time()
+    result <- tryCatch({
+        fit <- onlineCPD(data$mat, oCPD=NULL, missPts=params$missPts,
+                         hazard_func=hazard_func,
+                         probModel=list("gaussian"),
+                         init_params=model.params,
+                         multivariate=params$multivariate,
+                         cpthreshold=params$cpthreshold,
+                         truncRlim=params$truncRlim,
+                         minRlength=params$minRlength,
+                         maxRlength=params$maxRlength,
+                         minsep=params$minsep,
+                         maxsep=params$maxsep)
+        locs <- as.vector(fit$changepoint_lists$maxCPs[[1]])
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units="secs")
+
+    if (!is.null(result$error))
+        exit.with.error(data$original, args, params, result$error)
+
+    # convert indices to 0-based indices
+    locations <- as.list(result$locations - 1)
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
diff --git a/execs/R/cpdbench_prophet.R b/execs/R/cpdbench_prophet.R
new file mode 100644
index 00000000..511217f5
--- /dev/null
+++ b/execs/R/cpdbench_prophet.R
@@ -0,0 +1,185 @@
+#' ---
+#' title: Wrapper for the Prophet package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-09-30
+#' license: See the LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(prophet)
+library(lubridate)
+
+NO.DATETIME <- c('scanline_126007', 'scanline_42049', 'well_log',
+                 'quality_control_1', 'quality_control_2',
+                 'quality_control_3', 'quality_control_4',
+                 'quality_control_5')
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for Prophet package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-N', '--Nmax',
+                        help='maximum number of changepoints',
+                        choices=c('default', 'max'))
+    return(parser$parse_args())
+}
+
+frac.to.dt <- function(raw) {
+    out <- c()
+    for (i in 1:length(raw)) {
+        replaced <- gsub('-', '.', raw[i])
+        number <- as.double(replaced)
+        year <- floor(number)
+        remainder <- number - year
+        begin <- as_datetime(paste(year, '-01-01', sep=''))
+        end <- as_datetime(paste(year + 1, '-01-01', sep=''))
+        offset <- remainder * (end - begin)
+        dt <- begin + offset
+        # you'd think there'd be a well-documented easy-to-find function for
+        # this
+        datepart <- date(dt)
+        timepart <- sprintf("%02d:%02d:%02d", hour(dt), minute(dt),
+                            round(second(dt)))
+        iso <- paste(datepart, timepart, sep=' ')
+        out <- c(out, iso)
+    }
+    return(out)
+}
+
+preprocess.data <- function(data)
+{
+    if ("format" %in% names(data$original$time)) {
+        if (data$original$time$format == "%Y-%m-%d %H:%M:%S") {
+            tidx <- data$original$time$raw
+        } else if (data$original$time$format == "%Y-%m-%d") {
+            tidx <- data$original$time$raw
+        } else if (data$original$time$format == "%Y-%m") {
+            tidx <- paste(data$original$time$raw, '-01', sep='')
+        } else if (data$original$time$format == "%Y") {
+            tidx <- paste(data$original$time$raw, '-01-01', sep='')
+        } else if (data$original$time$format == "%Y-%F") {
+            tidx <- frac.to.dt(data$original$time$raw)
+        } else {
+            stop(cat("Unknown time format: ", data$original$time$format, '\n'))
+        }
+    } else {
+        if (data$original$name == 'bank') {
+            # bank is daily data
+            dt <- as_date("2019-01-01")
+            tidx <- c(dt)
+            for (i in 2:data$original$n_obs) {
+                dt <- dt + ddays(1)
+                tidx <- c(tidx, dt)
+            }
+        } else if (data$original$name == 'bee_waggle') {
+            # bee_waggle is seconds data (I believe)
+            dt <- as_date("2019-01-01 00:00:00")
+            tidx <- c(dt)
+            for (i in 2:data$original$n_obs) {
+                dt <- dt + dseconds(1)
+                tidx <- c(tidx, dt)
+            }
+        } else if (data$original$name %in% NO.DATETIME) {
+            # these datasets have no corresponding time axis, so we disable
+            # periodicity in prophet for fairness.
+            # We'll make it "daily", because prophet needs a datetime format
+            dt <- as_date("2019-01-01")
+            tidx <- c(dt)
+            for (i in 2:data$original$n_obs) {
+                dt <- dt + ddays(1)
+                tidx <- c(tidx, dt)
+            }
+        } else {
+            stop(cat("Unhandled time series: ", data$original$name, '\n'))
+        }
+    }
+
+    raw <- as.vector(data$mat)
+
+    df <- data.frame(ds=tidx, y=raw)
+
+    return(df)
+}
+
+main <- function() {
+    args <- parse.args()
+    data <- load.dataset(args$input)
+
+    defaults <- list()
+    # we want to allow change points throughout the entire range of the series
+    defaults$changepoint.range <- 1
+    # threshold used in add_changepoints_to_plot
+    defaults$threshold <- 0.01
+    defaults$yearly.seasonality <- 'auto'
+    defaults$weekly.seasonality <- 'auto'
+    defaults$daily.seasonality <- 'auto'
+
+    if (args$Nmax == 'default')
+        args$Nmax <- 25
+    else
+        args$Nmax <- data$original$n_obs - 1
+
+    if (data$original$name %in% NO.DATETIME) {
+        defaults$yearly.seasonality <- FALSE
+        defaults$weekly.seasonality <- FALSE
+        defaults$daily.seasonality <- FALSE
+    }
+
+    params <- make.param.list(args, defaults)
+
+    if (data$original$n_dim > 1) {
+        # package doesn't handle multidimensional data
+        exit.error.multidim(data$original, args, params)
+    }
+
+    df <- preprocess.data(data)
+
+    start.time <- Sys.time()
+    result <- tryCatch({
+        model <- prophet(df, changepoint.range=params$changepoint.range,
+                         n.changepoints=params$Nmax,
+                         yearly.seasonality=params$yearly.seasonality,
+                         weekly.seasonality=params$weekly.seasonality,
+                         daily.seasonality=params$daily.seasonality)
+        threshold <- params$threshold
+        cpt <- model$changepoints[abs(model$params$delta) >= threshold]
+        cpt <- as.character(as.POSIXct(cpt))
+        locs <- match(cpt, as.character(df$ds))
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units='secs')
+
+    if (!is.null(result$error))
+        exit.with.error(data$original, args, params, result$error)
+
+    # convert to 0-based indices
+    locations <- as.list(result$locations - 1)
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
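To see what frac.to.dt does with the fractional-year ("%Y-%F") format, a quick sketch with an illustrative value, assuming the script's functions are loaded; the function rewrites the dash as a decimal point and spreads the fraction over the year:

    # "1946-25" is read as 1946.25; 0.25 of the 365-day span from
    # 1946-01-01 to 1947-01-01 is 91.25 days, so this yields roughly
    # "1946-04-02 06:00:00"
    frac.to.dt(c("1946-25"))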
diff --git a/execs/R/cpdbench_rfpop.R b/execs/R/cpdbench_rfpop.R
new file mode 100644
index 00000000..33ced873
--- /dev/null
+++ b/execs/R/cpdbench_rfpop.R
@@ -0,0 +1,102 @@
+#' ---
+#' title: Wrapper for robust-fpop package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-09-30
+#' license: See the LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(robseg)
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for robseg package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-l', '--loss',
+                        help='loss function to use',
+                        choices=c('L1', 'L2', 'Huber', 'Outlier'),
+                        required=TRUE)
+    return(parser$parse_args())
+}
+
+main <- function() {
+    args <- parse.args()
+    data <- load.dataset(args$input)
+
+    # copy the defaults from the robust-fpop repo and the JASA paper.
+    defaults <- list()
+    if (args$loss == 'Outlier') {
+        defaults$lambda <- 2 * log(data$original$n_obs)
+        defaults$lthreshold <- 3
+    } else if (args$loss == 'Huber') {
+        defaults$lambda <- 1.4 * log(data$original$n_obs)
+        defaults$lthreshold <- 1.345
+    } else if (args$loss == 'L1') {
+        defaults$lambda <- log(data$original$n_obs)
+    } else if (args$loss == 'L2') {
+        defaults$lambda <- log(data$original$n_obs)
+    }
+    params <- make.param.list(args, defaults)
+
+    if (data$original$n_dim > 1) {
+        # robseg package can't handle multidimensional data
+        exit.error.multidim(data$original, args, params)
+    }
+
+    vec <- as.vector(data$mat)
+
+    start.time <- Sys.time()
+
+    # estimate the standard deviation as in the README of the robseg package.
+    est.std <- mad(diff(vec)/sqrt(2))
+    # and normalise the data with this. Note that this means that we don't
+    # need to scale lambda and the threshold by the estimated standard
+    # deviation.
+    x <- vec / est.std
+
+    result <- tryCatch({
+        out <- Rob_seg.std(x=x,
+                           loss=params$loss,
+                           lambda=params$lambda,
+                           lthreshold=params$lthreshold)
+        locs <- out$t.est
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units='secs')
+
+    if (!is.null(result$error)) {
+        exit.with.error(data$original, args, params, result$error)
+    }
+
+    # convert indices to 0-based
+    locations <- as.list(result$locations - 1)
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
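The diff(vec)/sqrt(2) line rests on a standard argument: if the series is piecewise constant plus i.i.d. noise with standard deviation sigma, then away from change points the first differences have standard deviation sigma * sqrt(2), and using the MAD rather than the sample standard deviation keeps the few jump-contaminated differences from inflating the estimate. A quick check on pure noise:

    # both numbers should be close to the true sigma = 1
    x <- rnorm(1e5)
    mad(diff(x) / sqrt(2))
    sd(x)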
diff --git a/execs/R/cpdbench_wbs.R b/execs/R/cpdbench_wbs.R
new file mode 100644
index 00000000..e858b4df
--- /dev/null
+++ b/execs/R/cpdbench_wbs.R
@@ -0,0 +1,102 @@
+#' ---
+#' title: Wrapper for wbs package in TCPDBench
+#' author: G.J.J. van den Burg
+#' date: 2019-09-28
+#' license: See the LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(argparse)
+library(wbs)
+
+load.utils <- function() {
+    # get the name of the current script so we can load utils.R (yay, R!)
+    cmd.args <- commandArgs(trailingOnly=F)
+    file.arg <- "--file="
+    this.script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+    this.dir <- dirname(this.script)
+    utils.script <- file.path(this.dir, 'utils.R')
+    source(utils.script)
+}
+
+parse.args <- function() {
+    parser <- ArgumentParser(description='Wrapper for wbs package')
+    parser$add_argument('-i', '--input',
+                        help='path to the input data file',
+                        required=TRUE)
+    parser$add_argument('-o', '--output',
+                        help='path to the output file')
+    parser$add_argument('-K', '--Kmax', choices=c('default', 'max'),
+                        help='the maximum number of changepoints',
+                        default='max')
+    parser$add_argument('-p', '--penalty', choices=c('SSIC', 'BIC', 'MBIC'),
+                        help='The penalty to use in WBS')
+    parser$add_argument("-g", "--integrated", choices=c("true", "false"),
+                        help="Whether to use integrated WBS or not")
+    return(parser$parse_args())
+}
+
+main <- function() {
+    args <- parse.args()
+
+    # load the data
+    data <- load.dataset(args$input)
+
+    # copy defaults from the wbs package and set params
+    defaults <- list(M=5000, rand.intervals=T)
+    if (args$Kmax == 'default')
+        args$Kmax <- 50
+    else
+        args$Kmax <- data$original$n_obs
+
+    if (args$integrated == "true")
+        args$integrated = TRUE
+    else
+        args$integrated = FALSE
+    params <- make.param.list(args, defaults)
+
+    if (data$original$n_dim > 1) {
+        # wbs package doesn't handle multidimensional data
+        exit.error.multidim(data$original, args, params)
+    }
+
+    vec <- as.vector(data$mat)
+    start.time <- Sys.time()
+
+    # We use the SSIC penalty as this is used in the WBS paper and is the
+    # default in the WBS package (for plot.wbs, for instance).
+
+    result <- tryCatch({
+        out <- wbs(vec, M=params$M, rand.intervals=params$rand.intervals,
+                   integrated=params$integrated)
+        cpt <- changepoints(out, Kmax=params$Kmax)
+        if (params$penalty == "SSIC")
+            locs <- cpt$cpt.ic$ssic.penalty
+        else if (params$penalty == "BIC")
+            locs <- cpt$cpt.ic$bic.penalty
+        else if (params$penalty == "MBIC")
+            locs <- cpt$cpt.ic$mbic.penalty
+        locs <- sort(locs)
+        list(locations=locs, error=NULL)
+    }, error=function(e) {
+        return(list(locations=NULL, error=e$message))
+    })
+    stop.time <- Sys.time()
+    runtime <- difftime(stop.time, start.time, units='secs')
+
+    if (!is.null(result$error)) {
+        exit.with.error(data$original, args, params, result$error)
+    }
+
+    # convert to 0-based indices.
+    locations <- as.list(result$locations - 1)
+
+    exit.success(data$original, args, params, locations, runtime)
+}
+
+load.utils()
+main()
diff --git a/execs/R/utils.R b/execs/R/utils.R
new file mode 100644
index 00000000..504b5373
--- /dev/null
+++ b/execs/R/utils.R
@@ -0,0 +1,138 @@
+#' ---
+#' title: Utilities shared between R code
+#' author: G.J.J. van den Burg
+#' date: 2019-09-29
+#' license: See the LICENSE file.
+#' copyright: 2019, The Alan Turing Institute
+#' ---
+
+library(RJSONIO)
+
+printf <- function(...) invisible(cat(sprintf(...)))
+
+load.dataset <- function(filename)
+{
+    data <- fromJSON(filename)
+
+    # reformat the data to a data frame with a time index and the data values
+    tidx <- data$time$index
+    exp <- 0:(data$n_obs - 1)
+    if (all(tidx == exp) && length(tidx) == length(exp)) {
+        tidx <- NULL
+    } else {
+        tidx <- data$time$index
+    }
+
+    mat <- NULL
+
+    for (j in 1:data$n_dim) {
+        s <- data$series[[j]]
+        v <- NULL
+        for (i in 1:data$n_obs) {
+            val <- s$raw[[i]]
+            if (is.null(val)) {
+                v <- c(v, NA)
+            } else {
+                v <- c(v, val)
+            }
+        }
+        mat <- cbind(mat, v)
+    }
+
+    # We normalize to avoid issues with numerical precision.
+    mat <- scale(mat)
+
+    out <- list(original=data,
+                time=tidx,
+                mat=mat)
+    return(out)
+}
+
+prepare.result <- function(data, data.filename, status, error,
+                           params, locations, runtime) {
+    out <- list(error=NULL)
+    cmd.args <- commandArgs(trailingOnly=F)
+
+    # the full command used
+    out$command <- paste(cmd.args, collapse=' ')
+
+    # get the name of the current script
+    file.arg <- "--file="
+    out$script <- sub(file.arg, "", cmd.args[grep(file.arg, cmd.args)])
+
+    # hash of the script
+    script.hash <- tools::md5sum(out$script)
+    names(script.hash) <- NULL
+    out$script_md5 <- script.hash
+
+    # hostname of the machine
+    hostname <- Sys.info()['nodename']
+    names(hostname) <- NULL
+    out$hostname <- hostname
+
+    # dataset name
+    out$dataset <- data$name
+
+    # dataset hash
+    data.hash <- tools::md5sum(data.filename)
+    names(data.hash) <- NULL
+    out$dataset_md5 <- data.hash
+
+    # status of running the script
+    out$status <- status
+
+    # error (if any)
+    if (!is.null(error))
+        out$error <- error
+
+    # parameters used
+    out$parameters <- params
+
+    # result
+    out$result <- list(cplocations=locations, runtime=runtime)
+
+    return(out)
+}
+
+make.param.list <- function(args, defaults)
+{
+    params <- defaults
+
+    args.copy <- args
+    args.copy['input'] <- NULL
+    args.copy['output'] <- NULL
+
+    params <- modifyList(params, args.copy)
+    return(params)
+}
+
+dump.output <- function(out, filename) {
+    json.out <- toJSON(out, pretty=T)
+    if (!is.null(filename))
+        write(json.out, filename)
+    else
+        cat(json.out, '\n')
+}
+
+exit.error.multidim <- function(data, args, params) {
+    status = 'SKIP'
+    error = 'This method has no support for multidimensional data.'
+    out <- prepare.result(data, args$input, status, error, params, NULL, NA)
+    dump.output(out, args$output)
+    quit(save='no')
+}
+
+exit.with.error <- function(data, args, params, error) {
+    status = 'FAIL'
+    out <- prepare.result(data, args$input, status, error, params, NULL, NULL)
+    dump.output(out, args$output)
+    quit(save='no')
+}
+
+exit.success <- function(data, args, params, locations, runtime) {
+    status = 'SUCCESS'
+    error = NULL
+    out <- prepare.result(data, args$input, status, error, params, locations,
+                          runtime)
+    dump.output(out, args$output)
+}
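For reference, the file written by dump.output has roughly this shape; the field set is exactly what prepare.result fills in above, but all values here are illustrative:

    {
        "command": "cpdbench_changepoint.R -i demo.json -f mean",
        "script": "execs/R/cpdbench_changepoint.R",
        "script_md5": "<md5 of the script file>",
        "hostname": "examplehost",
        "dataset": "demo",
        "dataset_md5": "<md5 of the dataset file>",
        "status": "SUCCESS",
        "error": null,
        "parameters": {"penalty": "MBIC", "method": "PELT"},
        "result": {"cplocations": [99], "runtime": 0.42}
    }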
diff --git a/execs/python/bocpdms b/execs/python/bocpdms
new file mode 160000
+Subproject 26d3cf0bbb688dc99cb9b3ee335b97c1d09b586
diff --git a/execs/python/cpdbench_bocpdms.py b/execs/python/cpdbench_bocpdms.py
new file mode 100644
index 00000000..1f95faf8
--- /dev/null
+++ b/execs/python/cpdbench_bocpdms.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Wrapper for BOCPDMS in CPDBench.
+
+Author: G.J.J. van den Burg
+Date: 2019-10-02
+License: See the LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import numpy as np
+import time
+
+from bocpdms import CpModel, BVARNIG, Detector
+from multiprocessing import Process, Manager
+
+from cpdbench_utils import (
+    load_dataset,
+    make_param_dict,
+    exit_with_error,
+    exit_with_timeout,
+    exit_success,
+)
+
+# Ensure overflow errors are raised
+# np.seterr(over="raise")
+
+TIMEOUT = 60 * 30  # 30 minutes
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Wrapper for BOCPDMS")
+    parser.add_argument(
+        "-i", "--input", help="path to the input data file", required=True
+    )
+    parser.add_argument("-o", "--output", help="path to the output file")
+    parser.add_argument(
+        "--intensity",
+        help="parameter for the hazard function",
+        type=float,
+        required=True,
+    )
+    parser.add_argument(
+        "--prior-a", help="initial value of a", type=float, required=True
+    )
+    parser.add_argument(
+        "--prior-b", help="initial value of b", type=float, required=True
+    )
+    parser.add_argument(
+        "--threshold", help="threshold to apply", type=int, default=0
+    )
+    parser.add_argument("--use-timeout", action="store_true")
+
+    return parser.parse_args()
+
+
+def wrapper(args, return_dict, **kwargs):
+    detector = run_bocpdms(*args, **kwargs)
+    return_dict["detector"] = detector
+
+
+def wrap_with_timeout(args, kwargs, limit):
+    manager = Manager()
+    return_dict = manager.dict()
+
+    p = Process(target=wrapper, args=(args, return_dict), kwargs=kwargs)
+    p.start()
+    p.join(limit)
+    if p.is_alive():
+        p.terminate()
+        return None, "timeout"
+    if "detector" in return_dict:
+        return return_dict["detector"], "success"
+    return None, "fail"
+
+
+def run_bocpdms(mat, params):
+    """Set up and run BOCPDMS"""
+
+    AR_models = []
+    for lag in range(params["lower_AR"], params["upper_AR"] + 1):
+        AR_models.append(
+            BVARNIG(
+                prior_a=params["prior_a"],
+                prior_b=params["prior_b"],
+                S1=params["S1"],
+                S2=params["S2"],
+                prior_mean_scale=params["prior_mean_scale"],
+                prior_var_scale=params["prior_var_scale"],
+                intercept_grouping=params["intercept_grouping"],
+                nbh_sequence=[0] * lag,
+                restriction_sequence=[0] * lag,
+                hyperparameter_optimization="online",
+            )
+        )
+
+    cp_model = CpModel(params["intensity"])
+
+    model_universe = np.array(AR_models)
+    model_prior = np.array([1 / len(AR_models) for m in AR_models])
+
+    detector = Detector(
+        data=mat,
+        model_universe=model_universe,
+        model_prior=model_prior,
+        cp_model=cp_model,
+        S1=params["S1"],
+        S2=params["S2"],
+        T=mat.shape[0],
+        store_rl=True,
+        store_mrl=True,
+        trim_type="keep_K",
+        threshold=params["threshold"],
+        save_performance_indicators=True,
+        generalized_bayes_rld="kullback_leibler",
+        # alpha_param_learning="individual",  # not sure if used
+        # alpha_param=0.01,  # not sure if used
+        # alpha_param_opt_t=30,  # not sure if used
+        # alpha_rld_learning=True,  # not sure if used
+        loss_der_rld_learning="squared_loss",
+        loss_param_learning="squared_loss",
+    )
+    detector.run()
+
+    return detector
+
+
+def main():
+    args = parse_args()
+
+    data, mat = load_dataset(args.input)
+
+    # setting S1 as dimensionality follows the 30portfolio_ICML18.py script.
+    defaults = {
+        "S1": mat.shape[1],
+        "S2": 1,
+        "intercept_grouping": None,
+        "prior_mean_scale": 0,  # data is standardized
+        "prior_var_scale": 1,  # data is standardized
+    }
+
+    # pick the lag lengths based on the paragraph below the proof of Theorem
+    # 1, using C = 1, as in ``30portfolio_ICML18.py``.
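+    # For example, with T = 1000: T / log(T) is about 144.8, and
+    # 144.8 ** 0.25 is about 3.47, so Lmax = int(3.47 + 1) = 4 lags.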
+    T = mat.shape[0]
+    Lmin = 1
+    Lmax = int(pow(T / np.log(T), 0.25) + 1)
+    defaults["lower_AR"] = Lmin
+    defaults["upper_AR"] = Lmax
+
+    parameters = make_param_dict(args, defaults)
+
+    start_time = time.time()
+
+    error = None
+    status = "fail"  # if not overwritten, it must have failed
+    try:
+        if args.use_timeout:
+            detector, status = wrap_with_timeout(
+                (mat, parameters), {}, TIMEOUT
+            )
+        else:
+            detector = run_bocpdms(mat, parameters)
+            status = "success"
+    except Exception as err:
+        error = repr(err)
+
+    stop_time = time.time()
+    runtime = stop_time - start_time
+
+    if status == "timeout":
+        exit_with_timeout(data, args, parameters, runtime, __file__)
+
+    if error is not None or status == "fail":
+        exit_with_error(data, args, parameters, error, __file__)
+
+    # According to the Nile unit test, the MAP change points are in
+    # detector.CPs[-2], with time indices in the first of the two-element
+    # vectors.
+    locations = [x[0] for x in detector.CPs[-2]]
+
+    # Based on the fact that time_range in plot_raw_TS of the EvaluationTool
+    # starts from 1 and the fact that CP_loc that same function is ensured to
+    # be in time_range, we assert that the change point locations are 1-based.
+    # We want 0-based, so subtract 1 from each point.
+    locations = [loc - 1 for loc in locations]
+
+    # convert to Python ints
+    locations = [int(loc) for loc in locations]
+
+    exit_success(data, args, parameters, locations, runtime, __file__)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/execs/python/cpdbench_rbocpdms.py b/execs/python/cpdbench_rbocpdms.py
new file mode 100644
index 00000000..53c5e6c2
--- /dev/null
+++ b/execs/python/cpdbench_rbocpdms.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Wrapper for RBOCPDMS in CPDBench.
+
+Author: G.J.J. van den Burg
+Date: 2019-10-03
+License: See the LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+import argparse
+import numpy as np
+import time
+
+from rbocpdms import CpModel, BVARNIGDPD, Detector
+from multiprocessing import Process, Manager
+
+from cpdbench_utils import (
+    load_dataset,
+    make_param_dict,
+    exit_with_error,
+    exit_with_timeout,
+    exit_success,
+)
+
+TIMEOUT = 60 * 30  # 30 minutes
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Wrapper for RBOCPDMS")
+    parser.add_argument(
+        "-i", "--input", help="path to the input data file", required=True
+    )
+    parser.add_argument("-o", "--output", help="path to the output file")
+    parser.add_argument(
+        "--intensity", help="parameter for the hazard function", type=float
+    )
+    parser.add_argument(
+        "--prior-a", help="initial value of a", type=float, required=True
+    )
+    parser.add_argument(
+        "--prior-b", help="initial value of b", type=float, required=True
+    )
+    parser.add_argument(
+        "--threshold", help="threshold to apply", type=int, default=0
+    )
+    parser.add_argument(
+        "--alpha-param", help="alpha parameter", type=float, default=0.5
+    )
+    parser.add_argument(
+        "--alpha-rld", help="alpha rld parameter", type=float, default=0.0
+    )
+    parser.add_argument("--use-timeout", action="store_true")
+    parser.add_argument(
+        "--timeout", type=int, help="timeout in minutes", default=30
+    )
+    return parser.parse_args()
+
+
+def wrapper(args, return_dict, **kwargs):
+    detector = run_rbocpdms(*args, **kwargs)
+    return_dict["detector"] = detector
+
+
+def wrap_with_timeout(args, kwargs, limit):
+    manager = Manager()
+    return_dict = manager.dict()
+
+    p = Process(target=wrapper, args=(args, return_dict), kwargs=kwargs)
+    p.start()
+    p.join(limit)
+    if p.is_alive():
+        p.terminate()
+        status = "timeout"
+        return None, status
+    if "detector" in return_dict:
+        status = "success"
+        return return_dict["detector"], status
+    status = "fail"
+    return None, status
+
+
+def run_rbocpdms(mat, params):
+    """Set up and run RBOCPDMS"""
+    S1 = params["S1"]
+    S2 = params["S2"]
+
+    # we use "DPD" from the well log example, as that seems to be the robust
+    # version.
+    model_universe = [
+        BVARNIGDPD(
+            prior_a=params["prior_a"],
+            prior_b=params["prior_b"],
+            S1=S1,
+            S2=S2,
+            alpha_param=params["alpha_param"],
+            prior_mean_beta=params["prior_mean_beta"],
+            prior_var_beta=params["prior_var_beta"],
+            prior_mean_scale=params["prior_mean_scale"],
+            prior_var_scale=params["prior_var_scale"],
+            general_nbh_sequence=[[[]]] * S1 * S2,
+            general_nbh_restriction_sequence=[[0]],
+            general_nbh_coupling="weak coupling",
+            hyperparameter_optimization="online",
+            VB_window_size=params["VB_window_size"],
+            full_opt_thinning=params["full_opt_thinning"],
+            SGD_batch_size=params["SGD_batch_size"],
+            anchor_batch_size_SCSG=params["anchor_batch_size_SCSG"],
+            anchor_batch_size_SVRG=None,
+            first_full_opt=params["first_full_opt"],
+        )
+    ]
+
+    model_universe = np.array(model_universe)
+    model_prior = np.array([1 / len(model_universe)] * len(model_universe))
+
+    cp_model = CpModel(params["intensity"])
+
+    detector = Detector(
+        data=mat,
+        model_universe=model_universe,
+        model_prior=model_prior,
+        cp_model=cp_model,
+        S1=params["S1"],
+        S2=params["S2"],
+        T=mat.shape[0],
+        store_rl=True,
+        store_mrl=True,
+        trim_type="keep_K",
+        threshold=params["threshold"],
+        save_performance_indicators=True,
+        generalized_bayes_rld=params["rld_DPD"],
+        alpha_param_learning="individual",
+        alpha_param=params["alpha_param"],
+        alpha_param_opt_t=100,
+        alpha_rld=params["alpha_rld"],
+        alpha_rld_learning=True,
+        loss_der_rld_learning=params["loss_der_rld_learning"],
+    )
+    detector.run()
+
+    return detector
+
+
+def main():
+    args = parse_args()
+
+    data, mat = load_dataset(args.input)
+
+    # setting S1 as dimensionality follows the 30portfolio_ICML18.py script.
+    # other settings mostly taken from the well log example
+    defaults = {
+        "S1": mat.shape[1],
+        "S2": 1,
+        "SGD_batch_size": 10,
+        "VB_window_size": 360,
+        "anchor_batch_size_SCSG": 25,
+        "first_full_opt": 10,
+        "full_opt_thinning": 20,
+        "intercept_grouping": None,
+        "loss_der_rld_learning": "absolute_loss",
+        "prior_mean_beta": None,
+        "prior_mean_scale": 0,  # data has been standardized
+        "prior_var_beta": None,
+        "prior_var_scale": 1.0,  # data has been standardized
+        "rld_DPD": "power_divergence",  # this ensures doubly robust
+    }
+
+    parameters = make_param_dict(args, defaults)
+
+    start_time = time.time()
+
+    error = None
+    status = "fail"  # if not overwritten, it must have failed
+    try:
+        if args.use_timeout:
+            detector, status = wrap_with_timeout(
+                (mat, parameters), {}, TIMEOUT
+            )
+        elif args.timeout:
+            detector, status = wrap_with_timeout(
+                (mat, parameters), {}, args.timeout * 60
+            )
+        else:
+            detector = run_rbocpdms(mat, parameters)
+            status = "success"
+    except Exception as err:
+        error = repr(err)
+
+    stop_time = time.time()
+    runtime = stop_time - start_time
+
+    if status == "timeout":
+        exit_with_timeout(data, args, parameters, runtime, __file__)
+
+    if error is not None or status == "fail":
+        exit_with_error(data, args, parameters, error, __file__)
+
+    # According to the Nile unit test, the MAP change points are in
+    # detector.CPs[-2], with time indices in the first of the two-element
+    # vectors.
+    locations = [x[0] for x in detector.CPs[-2]]
+
+    # Based on the fact that time_range in plot_raw_TS of the EvaluationTool
+    # starts from 1 and the fact that CP_loc that same function is ensured to
+    # be in time_range, we assert that the change point locations are 1-based.
+    # We want 0-based, so subtract 1 from each point.
+    locations = [loc - 1 for loc in locations]
+
+    # convert to Python ints
+    locations = [int(loc) for loc in locations]
+
+    exit_success(data, args, parameters, locations, runtime, __file__)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/execs/python/cpdbench_utils.py b/execs/python/cpdbench_utils.py
new file mode 100644
index 00000000..cb074c69
--- /dev/null
+++ b/execs/python/cpdbench_utils.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Utility functions for CPDBench.
+
+Author: Gertjan van den Burg
+Copyright (c) 2020 - The Alan Turing Institute
+License: See the LICENSE file.
+
+"""
+
+import copy
+import hashlib
+import json
+import numpy as np
+import socket
+import sys
+
+
+def md5sum(filename):
+    blocksize = 65536
+    hasher = hashlib.md5()
+    with open(filename, "rb") as fp:
+        buf = fp.read(blocksize)
+        while len(buf) > 0:
+            hasher.update(buf)
+            buf = fp.read(blocksize)
+    return hasher.hexdigest()
+
+
+def load_dataset(filename):
+    with open(filename, "r") as fp:
+        data = json.load(fp)
+
+    if data["time"]["index"] != list(range(0, data["n_obs"])):
+        raise NotImplementedError(
+            "Time series with non-consecutive time axis are not yet supported."
+        )
+
+    mat = np.zeros((data["n_obs"], data["n_dim"]))
+    for j, series in enumerate(data["series"]):
+        mat[:, j] = series["raw"]
+
+    # We normalize to avoid numerical errors.
+    mat = (mat - np.nanmean(mat)) / np.sqrt(np.nanvar(mat))
+
+    return data, mat
+
+
+def prepare_result(
+    data,
+    data_filename,
+    status,
+    error,
+    params,
+    locations,
+    runtime,
+    script_filename,
+):
+    out = {}
+
+    # record the command that was used
+    out["command"] = " ".join(sys.argv)
+
+    # save the script and the hash of the script as very rough versioning
+    out["script"] = script_filename
+    out["script_md5"] = md5sum(script_filename)
+
+    # record the hostname
+    out["hostname"] = socket.gethostname()
+
+    # record the dataset name and hash of the dataset
+    out["dataset"] = data["name"]
+    out["dataset_md5"] = md5sum(data_filename)
+
+    # record the status of the detection and any potential error message
+    out["status"] = status
+    out["error"] = error
+
+    # save the parameters that were used
+    out["parameters"] = params
+
+    # save the detection results
+    out["result"] = {"cplocations": locations, "runtime": runtime}
+
+    return out
+
+
+def dump_output(output, filename=None):
+    """Save result to output file or write to stdout"""
+    if filename is None:
+        print(json.dumps(output, sort_keys=True, indent="\t"))
+    else:
+        with open(filename, "w") as fp:
+            json.dump(output, fp, sort_keys=True, indent="\t")
+
+
+def make_param_dict(args, defaults):
+    params = copy.deepcopy(vars(args))
+    del params["input"]
+    if "output" in params:
+        del params["output"]
+    params.update(defaults)
+    return params
+
+
+def exit_with_error(data, args, parameters, error, script_filename):
+    status = "FAIL"
+    out = prepare_result(
+        data,
+        args.input,
+        status,
+        error,
+        parameters,
+        None,
+        None,
+        script_filename,
+    )
+    dump_output(out, args.output)
+    raise SystemExit
+
+
+def exit_with_timeout(data, args, parameters, runtime, script_filename):
+    status = "TIMEOUT"
+    out = prepare_result(
+        data,
+        args.input,
+        status,
+        None,
+        parameters,
+        None,
+        runtime,
+        script_filename,
+    )
+    dump_output(out, args.output)
+    raise SystemExit
+
+
+def exit_success(data, args, parameters, locations, runtime, script_filename):
+    status = "SUCCESS"
+    error = None
+    out = prepare_result(
+        data,
+        args.input,
+        status,
+        error,
+        parameters,
+        locations,
+        runtime,
+        script_filename,
+    )
+    dump_output(out, args.output)
diff --git a/execs/python/rbocpdms b/execs/python/rbocpdms
new file mode 160000
+Subproject 1ffd72aa94fa43c2cb588bdab478c401ebfe1ca
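Taken together, utils.R and cpdbench_utils.py fix the contract every wrapper in this commit satisfies: load the dataset, merge CLI arguments with method defaults, time the detector, and emit the JSON result. A minimal sketch of what a new R wrapper would look like, assuming utils.R is sourced as in the scripts above; the detector stub and its alpha parameter are hypothetical placeholders:

    library(argparse)

    main <- function() {
        parser <- ArgumentParser(description='Sketch of a new wrapper')
        parser$add_argument('-i', '--input', required=TRUE)
        parser$add_argument('-o', '--output')
        parser$add_argument('--alpha', type='double', default=0.5)
        args <- parser$parse_args()

        data <- load.dataset(args$input)          # JSON -> list(original, time, mat)
        params <- make.param.list(args, list())   # merge CLI args with defaults

        start.time <- Sys.time()
        result <- tryCatch({
            list(locations=c(1), error=NULL)      # replace with a real detector
        }, error=function(e) list(locations=NULL, error=e$message))
        runtime <- difftime(Sys.time(), start.time, units='secs')

        if (!is.null(result$error))
            exit.with.error(data$original, args, params, result$error)
        exit.success(data$original, args, params,
                     as.list(result$locations - 1), runtime)  # 0-based indices
    }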
