/**
* @file gensvm_gridsearch.c
* @author G.J.J. van den Burg
* @date 2014-01-07
* @brief Functions for finding the optimal parameters for the dataset
*
* @details
* The GenSVM algorithm takes a number of parameters. The functions in
* this file are used to find the optimal parameters.
*
* @copyright
Copyright 2016, G.J.J. van den Burg.
This file is part of GenSVM.
GenSVM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
GenSVM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
 along with GenSVM. If not, see <http://www.gnu.org/licenses/>.
*/
#include "gensvm_gridsearch.h"
extern FILE *GENSVM_OUTPUT_FILE;
/**
 * @brief Initialize a GenQueue from a GenGrid instance
*
* @details
 * A GenGrid instance describes the grid to search over. This function
* creates all tasks that need to be performed and adds these to
* a GenQueue. Each task contains a pointer to the train and test datasets
* which are supplied. Note that the tasks are created in a specific order of
* the parameters, to ensure that the GenModel::V of a previous parameter
* set provides the best possible initial estimate of GenModel::V for the next
* parameter set.
*
 * @param[in] 	grid 		GenGrid struct describing the grid search
* @param[in] queue pointer to a GenQueue that will be used to
* add the tasks to
* @param[in] train_data GenData of the training set
* @param[in] test_data GenData of the test set
*
*/
void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue,
struct GenData *train_data, struct GenData *test_data)
{
long i, j, k;
long N, cnt = 0;
struct GenTask *task = NULL;
queue->i = 0;
N = grid->Np;
N *= grid->Nl;
N *= grid->Nk;
N *= grid->Ne;
N *= grid->Nw;
	// these parameter counts may be zero, depending on the kernel
N *= grid->Ng > 0 ? grid->Ng : 1;
N *= grid->Nc > 0 ? grid->Nc : 1;
N *= grid->Nd > 0 ? grid->Nd : 1;
queue->tasks = Calloc(struct GenTask *, N);
queue->N = N;
// initialize all tasks
	for (i=0; i<N; i++) {
		task = gensvm_init_task();
		task->ID = i;
task->train_data = train_data;
task->test_data = test_data;
task->folds = grid->folds;
task->kerneltype = grid->kerneltype;
task->kernelparam = Calloc(double, grid->Ng +
grid->Nc + grid->Nd);
queue->tasks[i] = task;
}
	// These loops mimic a large nested for loop. The advantage is that
// Nd, Nc and Ng which are on the outside of the nested for loop can
// now be zero, without large modification (see below). Whether this
// is indeed better than the nested for loop has not been tested.
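	// Illustration: with Np = 2, Nl = 2 and all other counts equal to 1,
	// the p loop below assigns ps[0], ps[1], ps[0], ps[1] to tasks 0..3
	// with cnt = 1. Then cnt becomes 2, and the lambda loop assigns
	// lambdas[0] to tasks 0 and 1, and lambdas[1] to tasks 2 and 3.
	// Consecutive tasks thus differ in as few parameters as possible,
	// which helps the warm starts of GenModel::V.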
cnt = 1;
i = 0;
	while (i < N)
		for (j=0; j<grid->Np; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->p = grid->ps[j];
				i++;
			}
cnt *= grid->Np;
i = 0;
	while (i < N)
		for (j=0; j<grid->Nl; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->lambda =
					grid->lambdas[j];
				i++;
			}
cnt *= grid->Nl;
i = 0;
	while (i < N)
		for (j=0; j<grid->Nk; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->kappa = grid->kappas[j];
				i++;
			}
cnt *= grid->Nk;
i = 0;
	while (i < N)
		for (j=0; j<grid->Nw; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->weight_idx =
					grid->weight_idxs[j];
				i++;
			}
cnt *= grid->Nw;
i = 0;
	while (i < N)
		for (j=0; j<grid->Ne; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->epsilon =
					grid->epsilons[j];
				i++;
			}
cnt *= grid->Ne;
i = 0;
while (i < N && grid->Ng > 0)
		for (j=0; j<grid->Ng; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->kernelparam[0] =
					grid->gammas[j];
				i++;
			}
cnt *= grid->Ng > 0 ? grid->Ng : 1;
i = 0;
while (i < N && grid->Nc > 0)
		for (j=0; j<grid->Nc; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->kernelparam[1] =
					grid->coefs[j];
				i++;
			}
cnt *= grid->Nc > 0 ? grid->Nc : 1;
i = 0;
while (i < N && grid->Nd > 0)
		for (j=0; j<grid->Nd; j++)
			for (k=0; k<cnt; k++) {
				queue->tasks[i]->kernelparam[2] =
					grid->degrees[j];
				i++;
			}
}
/**
 * @brief Comparison function for doubles
*
 * @param[in] 	elem1 	pointer to the first double
 * @param[in] 	elem2 	pointer to the second double
 * @returns 		-1, 0, or 1 if the first double is smaller than,
 * 			equal to, or larger than the second, respectively
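 *
 * Example (illustrative), sorting an array in ascending order with the
 * standard qsort() from stdlib.h:
 * @code
 * double vals[] = {3.0, 1.0, 2.0};
 * qsort(vals, 3, sizeof(double), doublesort);
 * // vals is now {1.0, 2.0, 3.0}
 * @endcode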
*/
int doublesort(const void *elem1, const void *elem2)
{
const double t1 = (*(double *) elem1);
const double t2 = (*(double *) elem2);
	// three-way comparison, as required by qsort()
	return (t1 > t2) - (t1 < t2);
}
/**
* @brief Calculate the percentile of an array of doubles
*
* @details
* The percentile of performance is used to find the top performing
* configurations. Since no standard definition of the percentile exists, we
* use the method used in MATLAB and Octave. Since calculating the percentile
* requires a sorted list of the values, a local copy is made first.
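 *
 * For example (illustrative): for the sorted values {1, 2, 3, 4} and
 * p = 50, the method computes 0.50 * 4 + 0.5 = 2.5, so the result
 * interpolates halfway between the 2nd and 3rd values: 0.5*2 + 0.5*3 = 2.5.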
*
* @param[in] values array of doubles
* @param[in] N length of the array
* @param[in] p percentile to calculate ( 0 <= p <= 100.0 ).
* @returns the p-th percentile of the values
*/
double prctile(double *values, long N, double p)
{
if (N == 1)
return values[0];
long i;
double pi, pr, boundary;
double *local = Malloc(double, N);
	for (i=0; i<N; i++)
		local[i] = values[i];
	qsort(local, N, sizeof(double), doublesort);
	// MATLAB/Octave definition of the percentile: linear interpolation
	// between the two nearest order statistics
	p /= 100.0;
	p = p*N + 0.5;
	pi = maximum(minimum(floor(p), N-1), 1);
	pr = maximum(minimum(p - pi, 1), 0);
	boundary = (1 - pr)*local[((long) pi) - 1] + pr*local[((long) pi)];
	free(local);
	return boundary;
}
/**
 * @brief Create a GenQueue of the best performing tasks
 *
 * @details
 * Given a GenQueue of already trained GenTask structs, this function
 * creates a new GenQueue containing only those tasks that perform at or
 * above the 95th percentile of the performance of all tasks.
 *
 * @param[in] 	q 	GenQueue of trained tasks
 * @returns 		GenQueue of the best performing tasks
 */
struct GenQueue *create_top_queue(struct GenQueue *q)
{
	long i, k, N = 0;
	double boundary, *perf = NULL;
	struct GenQueue *nq = gensvm_init_queue();
	// find the performance boundary at the 95th percentile
	perf = Malloc(double, q->N);
	for (i=0; i<q->N; i++) {
perf[i] = q->tasks[i]->performance;
}
boundary = prctile(perf, q->N, 95.0);
free(perf);
note("boundary determined at: %f\n", boundary);
// find the number of tasks that perform at or above the boundary
	for (i=0; i<q->N; i++) {
if (q->tasks[i]->performance >= boundary)
N++;
}
// create a new queue with the best tasks
nq->tasks = Malloc(struct GenTask *, N);
k = 0;
	for (i=0; i<q->N; i++) {
if (q->tasks[i]->performance >= boundary)
nq->tasks[k++] = q->tasks[i];
}
nq->N = N;
nq->i = 0;
return nq;
}
/**
* @brief Run repeats of the GenTask structs in GenQueue to find the best
* configuration
*
* @details
* The best performing tasks in the supplied GenQueue are found by taking those
 * GenTask structs that have a performance greater than or equal to the 95th
 * percentile of the performance of all tasks. These tasks are then gathered
 * in a new GenQueue. For each of the tasks in this new GenQueue the cross
 * validation run is repeated a number of times.
*
* For each of the GenTask configurations that are repeated the mean performance,
* standard deviation of the performance and the mean computation time are
* reported.
*
* Finally, the overall best tasks are written to the specified output. These
* tasks are selected to have both the highest mean performance, as well as the
* smallest standard deviation in their performance. This is done as follows.
 * First the 99th percentile of task performance and the 1st percentile of
 * performance standard deviation are calculated. If a task exists whose
 * mean performance over the repeats is at or above the former, and whose
 * standard deviation is at or below the latter, this task is considered the
 * best and is written to the output. If no such task exists, the 98th
 * percentile of performance and the 2nd percentile of standard deviation
 * are considered next. This is repeated until an interval is found that
 * contains one or more tasks, at which point the loop stops.
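 *
 * For instance, if no task combines a mean at or above the 99th percentile
 * with a standard deviation at or below the 1st percentile, but one task
 * reaches the 98th and 2nd percentiles respectively, the search stops there
 * and that task is reported.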
*
* @param[in] q GenQueue of GenTask structs which have already been
* run and have a GenTask::performance value
* @param[in] repeats Number of times to repeat the best
* configurations for consistency
* @param[in] traintype type of training to do (CV or TT)
*
*/
void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype)
{
bool breakout;
long i, f, r, N, *cv_idx = NULL;
double p, pi, pr, pt,
*time = NULL,
*std = NULL,
*mean = NULL,
*perf = NULL;
struct GenQueue *nq = NULL;
struct GenData **train_folds = NULL,
**test_folds = NULL;
struct GenModel *model = gensvm_init_model();
struct GenTask *task = NULL;
struct timespec loop_s, loop_e;
nq = create_top_queue(q);
N = nq->N;
note("Number of items: %li\n", nq->N);
std = Calloc(double, N);
mean = Calloc(double, N);
time = Calloc(double, N);
perf = Calloc(double, N*repeats);
task = get_next_task(nq);
model->n = 0;
model->m = task->train_data->m;
model->K = task->train_data->K;
gensvm_allocate_model(model);
gensvm_init_V(NULL, model, task->train_data);
cv_idx = Calloc(long, task->train_data->n);
i = 0;
while (task) {
make_model_from_task(task, model);
time[i] = 0.0;
note("(%02li/%02li:%03li)\t", i+1, N, task->ID);
		for (r=0; r<repeats; r++) {
			Memset(cv_idx, long, task->train_data->n);
gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx);
train_folds = Malloc(struct GenData *, task->folds);
test_folds = Malloc(struct GenData *, task->folds);
			for (f=0; f<task->folds; f++) {
train_folds[f] = gensvm_init_data();
test_folds[f] = gensvm_init_data();
gensvm_get_tt_split(task->train_data, train_folds[f],
test_folds[f], cv_idx, f);
gensvm_kernel_preprocess(model, train_folds[f]);
gensvm_kernel_postprocess(model, train_folds[f],
test_folds[f]);
}
Timer(loop_s);
p = gensvm_cross_validation(model, train_folds, test_folds,
task->folds, task->train_data->n);
Timer(loop_e);
time[i] += gensvm_elapsed_time(&loop_s, &loop_e);
matrix_set(perf, repeats, i, r, p);
mean[i] += p/((double) repeats);
note("%3.3f\t", p);
			// reset V, since reusing the previous estimate would
			// make the repeats dependent and defeat the purpose
			// of the consistency check
gensvm_init_V(NULL, model, task->train_data);
			for (f=0; f<task->folds; f++) {
gensvm_free_data(train_folds[f]);
gensvm_free_data(test_folds[f]);
}
free(train_folds);
train_folds = NULL;
free(test_folds);
test_folds = NULL;
}
		for (r=0; r<repeats; r++) {
			std[i] += pow(matrix_get(perf, repeats, i, r)
					- mean[i], 2.0);
		}
		if (r > 1) {
std[i] /= ((double) repeats) - 1.0;
std[i] = sqrt(std[i]);
} else {
std[i] = 0.0;
}
note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i],
time[i]);
task = get_next_task(nq);
i++;
}
// find the best overall configurations: those with high average
// performance and low deviation in the performance
note("\nBest overall configuration(s):\n");
note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t"
"mean_perf\tstd_perf\ttime_perf\n");
p = 0.0;
breakout = false;
while (breakout == false) {
pi = prctile(mean, N, (100.0-p));
pr = prctile(std, N, p);
pt = prctile(time, N, p);
		for (i=0; i<N; i++)
			if (pi <= mean[i] && std[i] <= pr && time[i] <= pt) {
				note("%li\t%i\t%g\t\t%f\t%f\t%f\t"
						"%3.3f\t\t%3.3f\t\t%3.3f\n",
						nq->tasks[i]->ID,
nq->tasks[i]->weight_idx,
nq->tasks[i]->epsilon,
nq->tasks[i]->p,
nq->tasks[i]->kappa,
nq->tasks[i]->lambda,
mean[i],
std[i],
time[i]);
breakout = true;
}
p += 1.0;
}
free(cv_idx);
// make sure no double free occurs with the copied kernelparam
model->kernelparam = NULL;
gensvm_free_model(model);
free(nq->tasks);
free(nq);
free(perf);
free(std);
free(mean);
free(time);
}
/**
* @brief Check if the kernel parameters change between tasks
*
* @details
* In the current strategy for training the kernel matrix is decomposed once,
* and tasks with the same kernel settings are performed sequentially. When a
* task needs to be done with different kernel parameters, the kernel matrix
* needs to be recalculated. This function is used to check whether this is
* the case.
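 *
 * For example (illustrative), two consecutive tasks that both use an RBF
 * kernel with the same gamma value can reuse the existing kernel matrix,
 * whereas a change in gamma requires a new decomposition.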
*
* @param[in] newtask the next task
* @param[in] oldtask the old task
* @return whether the kernel needs to be reevaluated
*/
bool kernel_changed(struct GenTask *newtask, struct GenTask *oldtask)
{
int i;
if (oldtask == NULL)
return true;
if (newtask->kerneltype != oldtask->kerneltype) {
return true;
} else if (newtask->kerneltype == K_POLY) {
for (i=0; i<3; i++)
if (newtask->kernelparam[i] != oldtask->kernelparam[i])
return true;
return false;
} else if (newtask->kerneltype == K_RBF) {
if (newtask->kernelparam[0] != oldtask->kernelparam[0])
return true;
return false;
} else if (newtask->kerneltype == K_SIGMOID) {
for (i=0; i<2; i++)
if (newtask->kernelparam[i] != oldtask->kernelparam[i])
return true;
return false;
}
return false;
}
/**
* @brief Run the grid search for a GenQueue
*
* @details
 * Given a GenQueue of GenTask structs to be trained, a grid search is launched to
* find the optimal parameter configuration. As is also done within
* cross_validation(), the optimal weights of one parameter set are used as
* initial estimates for GenModel::V in the next parameter set. Note that to
* optimally exploit this feature of the optimization algorithm, the order in
 * which tasks are considered is important. This order is determined in
 * gensvm_fill_queue().
*
* The performance found by cross validation is stored in the GenTask struct.
*
* @param[in,out] q GenQueue with GenTask instances to run
*/
void start_training(struct GenQueue *q)
{
int f, folds;
double perf, current_max = 0;
struct GenTask *task = get_next_task(q);
struct GenTask *prevtask = NULL;
struct GenModel *model = gensvm_init_model();
struct timespec main_s, main_e, loop_s, loop_e;
folds = task->folds;
model->n = 0;
model->m = task->train_data->m;
model->K = task->train_data->K;
gensvm_allocate_model(model);
gensvm_init_V(NULL, model, task->train_data);
long *cv_idx = Calloc(long, task->train_data->n);
gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx);
struct GenData **train_folds = Malloc(struct GenData *, task->folds);
struct GenData **test_folds = Malloc(struct GenData *, task->folds);
	for (f=0; f<folds; f++) {
		train_folds[f] = gensvm_init_data();
		test_folds[f] = gensvm_init_data();
		gensvm_get_tt_split(task->train_data, train_folds[f],
test_folds[f], cv_idx, f);
}
Timer(main_s);
while (task) {
make_model_from_task(task, model);
if (kernel_changed(task, prevtask)) {
note("Computing kernel");
			for (f=0; f<folds; f++) {
				if (train_folds[f]->Z != train_folds[f]->RAW)
free(train_folds[f]->Z);
if (test_folds[f]->Z != test_folds[f]->RAW)
free(test_folds[f]->Z);
gensvm_kernel_preprocess(model,
train_folds[f]);
gensvm_kernel_postprocess(model,
train_folds[f], test_folds[f]);
}
note(".\n");
}
print_progress_string(task, q->N);
Timer(loop_s);
perf = gensvm_cross_validation(model, train_folds, test_folds,
folds, task->train_data->n);
Timer(loop_e);
current_max = maximum(current_max, perf);
note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf,
gensvm_elapsed_time(&loop_s, &loop_e),
current_max);
q->tasks[task->ID]->performance = perf;
prevtask = task;
task = get_next_task(q);
}
Timer(main_e);
note("\nTotal elapsed training time: %8.8f seconds\n",
gensvm_elapsed_time(&main_s, &main_e));
// make sure no double free occurs with the copied kernelparam
model->kernelparam = NULL;
gensvm_free_model(model);
	for (f=0; f<folds; f++) {
		gensvm_free_data(train_folds[f]);
		gensvm_free_data(test_folds[f]);
	}
	free(train_folds);
	free(test_folds);
	free(cv_idx);
}
/**
 * @brief Run cross validation on a GenModel with pre-split folds
 *
 * @details
 * Given a GenModel with the parameter configuration to evaluate, this
 * function trains the model on each of the training folds and computes the
 * prediction performance on the corresponding test fold. The performance
 * over all folds, weighted by the fold sizes, is returned.
 *
 * @param[in,out] 	model 		GenModel with the parameters to use
 * @param[in] 		train_folds 	array of training datasets
 * @param[in] 		test_folds 	array of test datasets
 * @param[in] 		folds 		number of folds
 * @param[in] 		n_total 	total number of observations
 * @returns 				average performance over the folds
 */
double gensvm_cross_validation(struct GenModel *model,
		struct GenData **train_folds, struct GenData **test_folds,
		long folds, long n_total)
{
	FILE *fid = NULL;
	long f;
	long *predy = NULL;
	double performance, total_perf = 0;
	for (f=0; f<folds; f++) {
		// reallocate the model to the dimensions of this fold
		gensvm_reallocate_model(model, train_folds[f]->n,
				train_folds[f]->r);
// initialize object weights
gensvm_initialize_weights(train_folds[f], model);
		// train the model (suppressing output)
fid = GENSVM_OUTPUT_FILE;
GENSVM_OUTPUT_FILE = NULL;
gensvm_optimize(model, train_folds[f]);
GENSVM_OUTPUT_FILE = fid;
// calculate prediction performance on test set
predy = Calloc(long, test_folds[f]->n);
gensvm_predict_labels(test_folds[f], model, predy);
performance = gensvm_prediction_perf(test_folds[f], predy);
total_perf += performance * test_folds[f]->n;
free(predy);
}
total_perf /= ((double) n_total);
return total_perf;
}
/**
* @brief Copy parameters from GenTask to GenModel
*
* @details
* A GenTask struct only contains the parameters of the GenModel to be estimated.
* This function is used to copy these parameters.
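 *
 * Note that GenTask::kernelparam is copied as a pointer (a shallow copy).
 * This is why callers set GenModel::kernelparam to NULL before freeing the
 * model, to avoid a double free of the task's kernel parameters.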
*
* @param[in] task GenTask instance with parameters
* @param[in,out] model GenModel to which the parameters are copied
*/
void make_model_from_task(struct GenTask *task, struct GenModel *model)
{
// copy basic model parameters
model->weight_idx = task->weight_idx;
model->epsilon = task->epsilon;
model->p = task->p;
model->kappa = task->kappa;
model->lambda = task->lambda;
// copy kernel parameters
model->kerneltype = task->kerneltype;
model->kernelparam = task->kernelparam;
}
/**
* @brief Print the description of the current task on screen
*
* @details
* To track the progress of the grid search the parameters of the current task
* are written to the output specified in GENSVM_OUTPUT_FILE. Since the
* parameters differ with the specified kernel, this function writes a
* parameter string depending on which kernel is used.
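 *
 * For example (illustrative), a task with an RBF kernel could produce a
 * line such as:
 *
 *     (001/100)  g = 1.000  eps = 1e-06  w = 1  k = 0.00  l = 0.000010  p = 1.00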
*
* @param[in] task the GenTask specified
* @param[in] N total number of tasks
*
*/
void print_progress_string(struct GenTask *task, long N)
{
char buffer[MAX_LINE_LENGTH];
sprintf(buffer, "(%03li/%03li)\t", task->ID+1, N);
if (task->kerneltype == K_POLY)
sprintf(buffer + strlen(buffer), "d = %2.2f\t",
task->kernelparam[2]);
if (task->kerneltype == K_POLY || task->kerneltype == K_SIGMOID)
sprintf(buffer + strlen(buffer), "c = %2.2f\t",
task->kernelparam[1]);
if (task->kerneltype == K_POLY || task->kerneltype == K_SIGMOID ||
task->kerneltype == K_RBF)
sprintf(buffer + strlen(buffer), "g = %3.3f\t",
task->kernelparam[0]);
sprintf(buffer + strlen(buffer), "eps = %g\tw = %i\tk = %2.2f\t"
"l = %f\tp = %2.2f\t", task->epsilon,
task->weight_idx, task->kappa, task->lambda, task->p);
note(buffer);
}