From 6d68250648154fe4ae7d4f0f9553291aaf51a172 Mon Sep 17 00:00:00 2001
From: Gertjan van den Burg
Date: Wed, 4 Mar 2015 19:34:04 +0100
Subject: rewrite of consistency_repeats to work with kernels

---
 src/GenSVMgrid.c           |   5 +-
 src/GenSVMtraintest.c      |   6 +
 src/gensvm_init.c          |   2 +-
 src/gensvm_train.c         |   6 +-
 src/gensvm_train_dataset.c | 424 +++++++++++++++------------------------------
 5 files changed, 151 insertions(+), 292 deletions(-)

diff --git a/src/GenSVMgrid.c b/src/GenSVMgrid.c
index 184f9a2..b4d3524 100644
--- a/src/GenSVMgrid.c
+++ b/src/GenSVMgrid.c
@@ -103,10 +103,7 @@ int main(int argc, char **argv)
 	srand(time(NULL));
 
 	note("Starting training\n");
-	if (training->traintype == TT)
-		start_training_tt(q);
-	else
-		start_training(q);
+	start_training(q);
 	note("Training finished\n");
 
 	if (training->repeats > 0) {
diff --git a/src/GenSVMtraintest.c b/src/GenSVMtraintest.c
index e43d09f..e15b340 100644
--- a/src/GenSVMtraintest.c
+++ b/src/GenSVMtraintest.c
@@ -161,6 +161,12 @@ int main(int argc, char **argv)
 	gensvm_free_model(model);
 	gensvm_free_data(traindata);
 	gensvm_free_data(testdata);
+	free(training_inputfile);
+	free(testing_inputfile);
+	free(model_inputfile);
+	free(model_outputfile);
+	free(prediction_outputfile);
+	free(predy);
 
 	return 0;
diff --git a/src/gensvm_init.c b/src/gensvm_init.c
index 6228d9a..89002f8 100644
--- a/src/gensvm_init.c
+++ b/src/gensvm_init.c
@@ -301,7 +301,7 @@ void gensvm_free_data(struct GenData *data)
 
 	if (data->Z == data->RAW) {
 		free(data->Z);
-	}else {
+	} else {
 		free(data->Z);
 		free(data->RAW);
 	}
diff --git a/src/gensvm_train.c b/src/gensvm_train.c
index c264ffa..776e546 100644
--- a/src/gensvm_train.c
+++ b/src/gensvm_train.c
@@ -141,9 +141,9 @@ double gensvm_get_loss(struct GenModel *model, struct GenData *data,
 		double *ZV)
 {
 	long i, j;
-	long n = data->n;
-	long K = data->K;
-	long m = data->m;
+	long n = model->n;
+	long K = model->K;
+	long m = model->m;
 
 	double value, rowvalue, loss = 0.0;
diff --git a/src/gensvm_train_dataset.c b/src/gensvm_train_dataset.c
index f0c4e93..62c69d7 100644
--- a/src/gensvm_train_dataset.c
+++ b/src/gensvm_train_dataset.c
@@ -205,10 +205,7 @@ int tasksort(const void *elem1, const void *elem2)
 }
 
 /**
- * @brief Comparison function for doubles
- *
- * @details
- * Similar to tasksort() only now for two doubles.
+ * @brief Comparison function for doubles
  *
  * @param[in]	elem1	number 1
  * @param[in]	elem2	number 2
@@ -258,6 +255,41 @@ double prctile(double *values, long N, double p)
 	return boundary;
 }
 
+struct Queue *create_top_queue(struct Queue *q)
+{
+	long i, k, N = 0;
+	double boundary, *perf;
+	struct Queue *nq = Malloc(struct Queue, 1);
+
+	// find the 95th percentile of performance
+	perf = Calloc(double, q->N);
+	for (i=0; i<q->N; i++) {
+		perf[i] = q->tasks[i]->performance;
+	}
+	boundary = prctile(perf, q->N, 95.0);
+	free(perf);
+	note("boundary determined at: %f\n", boundary);
+
+	// find the number of tasks that perform at or above the boundary
+	for (i=0; i<q->N; i++) {
+		if (q->tasks[i]->performance >= boundary)
+			N++;
+	}
+
+	// create a new queue with the best tasks
+	nq->tasks = Malloc(struct Task *, N);
+	k = 0;
+	for (i=0; i<q->N; i++) {
+		if (q->tasks[i]->performance >= boundary)
+			nq->tasks[k++] = q->tasks[i];
+	}
+	nq->N = N;
+	nq->i = 0;
+
+	return nq;
+}
+
+
 /**
  * @brief Run repeats of the Task structs in Queue to find the best
  *	configuration
@@ -294,89 +326,87 @@ double prctile(double *values, long N, double p)
  */
 void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
 {
-	long i, r, N;
-	double p, pi, pr, pt, boundary, *time, *std, *mean, *perf;
-	struct Queue *nq = Malloc(struct Queue, 1);
+	long i, f, r, N, *cv_idx;
+	double p, pi, pr, pt, *time, *std, *mean, *perf;
+	struct Queue *nq;
+	struct GenData **train_folds, **test_folds;
 	struct GenModel *model = gensvm_init_model();
-	struct Task *task;
+	struct Task *task = NULL;
 	clock_t loop_s, loop_e;
 
-	// calculate the performance percentile (Matlab style)
-	qsort(q->tasks, q->N, sizeof(struct Task *), tasksort);
-	p = 0.95*q->N + 0.5;
-	pi = maximum(minimum(floor(p), q->N-1), 1);
-	pr = maximum(minimum(p - pi, 1), 0);
-	boundary = (1 - pr)*q->tasks[((long) pi)-1]->performance;
-	boundary += pr*q->tasks[((long) pi)]->performance;
-	note("boundary determined at: %f\n", boundary);
-
-	// find the number of tasks that perform at least as good as the 95th
-	// percentile
-	N = 0;
-	for (i=0; i<q->N; i++)
-		if (q->tasks[i]->performance >= boundary)
-			N++;
-	note("Number of items: %li\n", N);
+	nq = create_top_queue(q);
+	N = nq->N;
+
+	note("Number of items: %li\n", nq->N);
 	std = Calloc(double, N);
 	mean = Calloc(double, N);
 	time = Calloc(double, N);
 	perf = Calloc(double, N*repeats);
 
-	// create a new task queue with the tasks which perform well
-	nq->tasks = Malloc(struct Task *, N);
-	for (i=q->N-1; i>q->N-N-1; i--)
-		nq->tasks[q->N-i-1] = q->tasks[i];
-	nq->N = N;
-	nq->i = 0;
+	task = get_next_task(nq);
 
-	// for each task run the consistency repeats
-	for (i=0; i<N; i++) {
-		task = get_next_task(nq);
+	model->n = 0;
+	model->m = task->train_data->m;
+	model->K = task->train_data->K;
+	gensvm_allocate_model(model);
+	gensvm_seed_model_V(NULL, model, task->train_data);
 
-		if (i == 0) {
-			model->n = 0;
-			model->m = task->train_data->m;
-			model->K = task->train_data->K;
-			gensvm_allocate_model(model);
-			gensvm_seed_model_V(NULL, model, task->train_data);
-		}
+	cv_idx = Calloc(long, task->train_data->n);
+
+	train_folds = Malloc(struct GenData *, task->folds);
+	test_folds = Malloc(struct GenData *, task->folds);
+
+	i = 0;
+	while (task) {
+		make_model_from_task(task, model);
 		time[i] = 0.0;
 		note("(%02li/%02li:%03li)\t", i+1, N, task->ID);
 		for (r=0; r<repeats; r++) {
-			if (traintype == CV) {
-				loop_s = clock();
-				p = cross_validation(model, task->train_data,
-						task->folds);
-				loop_e = clock();
-				time[i] += elapsed_time(loop_s, loop_e);
-				matrix_set(perf, repeats, i, r, p);
-				mean[i] += p/((double) repeats);
-			} else {
-				note("Only cv is implemented\n");
-				exit(1);
-			}
+			Memset(cv_idx, long, task->train_data->n);
+			gensvm_make_cv_split(task->train_data->n,
+					task->folds, cv_idx);
+			train_folds = Malloc(struct GenData *, task->folds);
+			test_folds = Malloc(struct GenData *, task->folds);
+			for (f=0; f<task->folds; f++) {
+				train_folds[f] = gensvm_init_data();
+				test_folds[f] = gensvm_init_data();
+				gensvm_get_tt_split(task->train_data,
+						train_folds[f],
+						test_folds[f], cv_idx, f);
+				gensvm_kernel_preprocess(model,
+						train_folds[f]);
+				gensvm_kernel_postprocess(model,
+						train_folds[f],
+						test_folds[f]);
+			}
+
+			loop_s = clock();
+			p = gensvm_cross_validation(model, train_folds,
+					test_folds, task->folds,
+					task->train_data->n);
+			loop_e = clock();
+			time[i] += elapsed_time(loop_s, loop_e);
+			matrix_set(perf, repeats, i, r, p);
+			mean[i] += p/((double) repeats);
 			note("%3.3f\t", p);
 			// this is done because if we reuse the V it's not a
 			// consistency check
 			gensvm_seed_model_V(NULL, model, task->train_data);
+			for (f=0; f<task->folds; f++) {
+				gensvm_free_data(train_folds[f]);
+				gensvm_free_data(test_folds[f]);
+			}
+			free(train_folds);
+			free(test_folds);
 		}
 		for (r=0; r<repeats; r++)
 			std[i] += pow(matrix_get(perf, repeats, i, r)
					- mean[i], 2.0);
 		if (repeats > 1) {
 			std[i] /= ((double) repeats) - 1.0;
 			std[i] = sqrt(std[i]);
-		} else
+		} else {
 			std[i] = 0.0;
-		note("(m = %3.3f, s = %3.3f, t = %3.3f)\n",
-				mean[i], std[i], time[i]);
+		}
+		note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i],
+				time[i]);
+		task = get_next_task(nq);
+		i++;
 	}
 
 	// find the best overall configurations: those with high average
@@ -412,6 +442,7 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
 		p += 1.0;
 	}
 
+	free(cv_idx);
 	// make sure no double free occurs with the copied kernelparam
 	model->kernelparam = NULL;
 	gensvm_free_model(model);
@@ -424,142 +455,24 @@
 }
 
 /**
- * @brief Run cross validation with a seed model
- *
- * @details
- * This is an implementation of cross validation which uses the optimal
- * parameters GenModel::V of a previous fold as initial conditions for
- * GenModel::V of the next fold. An initial seed for V can be given through the
- * seed_model parameter. If seed_model is NULL, random starting values are
- * used.
- *
- * @param[in]	model	GenModel with the configuration to train
- * @param[in]	seed_model	GenModel with a seed for GenModel::V
- * @param[in]	data	GenData with the dataset
- * @param[in]	folds	number of cross validation folds
- * @returns	performance (hitrate) of the configuration on
- *		cross validation
- */
-double cross_validation(struct GenModel *model, struct GenData *data,
-		long folds)
-{
-	return 0.0;
-}
-/*
-double cross_validation(struct GenModel *model, struct GenData *data,
-		long folds)
-{
-	FILE *fid;
-
-	long f, *predy;
-	double performance, total_perf = 0;
-	struct GenData *train_data, *test_data;
-
-	long *cv_idx = Calloc(long, data->n);
-
-	train_data = gensvm_init_data();
-	test_data = gensvm_init_data();
-
-	// create splits
-	gensvm_make_cv_split(data->n, folds, cv_idx);
-
-	for (f=0; f<folds; f++) {
-		gensvm_get_tt_split(data, train_data, test_data, cv_idx, f);
-
-		gensvm_reallocate_model(model, train_data->n, train_data->m);
-
-		gensvm_initialize_weights(train_data, model);
-
-		// train the model (without output)
-		fid = GENSVM_OUTPUT_FILE;
-		GENSVM_OUTPUT_FILE = NULL;
-		gensvm_optimize(model, train_data);
-		GENSVM_OUTPUT_FILE = fid;
-
-		// calculate prediction performance on test set
-		predy = Calloc(long, test_data->n);
-		gensvm_predict_labels(test_data, train_data, model, predy);
-		performance = gensvm_prediction_perf(test_data, predy);
-		total_perf += performance * test_data->n;
-
-		free(predy);
-		free(train_data->y);
-		free(train_data->Z);
-		free(test_data->y);
-		free(test_data->Z);
-	}
-
-	free(train_data);
-	free(test_data);
-
-	total_perf /= ((double) data->n);
-
-	return total_perf;
-}
-*/
 /**
- * @brief Run the grid search for a cross validation dataset
+ * @brief Check if the kernel parameters change between tasks
  *
  * @details
- * Given a Queue of Task struct to be trained, a grid search is launched to
- * find the optimal parameter configuration. As is also done within
- * cross_validation(), the optimal weights of one parameter set are used as
- * initial estimates for GenModel::V in the next parameter set. Note that to
- * optimally exploit this feature of the optimization algorithm, the order in
- * which tasks are considered is important. This is considered in
- * make_queue().
- *
- * The performance found by cross validation is stored in the Task struct.
- *
- * @param[in,out]	q	Queue with Task instances to run
+ * In the current strategy for training, the kernel matrix is decomposed once,
+ * and tasks with the same kernel settings are performed sequentially. When a
+ * task needs to be done with different kernel parameters, the kernel matrix
+ * needs to be recalculated. This function is used to check whether this is
+ * the case.
+ *
+ * @param[in]	newtask	the next task
+ * @param[in]	oldtask	the old task
+ * @return	whether the kernel needs to be reevaluated
  */
-void start_training_cv(struct Queue *q)
-{
-	double perf, current_max = 0;
-	struct Task *task = get_next_task(q);
-	struct GenModel *model = gensvm_init_model();
-	clock_t main_s, main_e, loop_s, loop_e;
-
-	model->n = 0;
-	model->m = task->train_data->m;
-	model->K = task->train_data->K;
-	gensvm_allocate_model(model);
-	gensvm_seed_model_V(NULL, model, task->train_data);
-
-	main_s = clock();
-	while (task) {
-		print_progress_string(task, q->N);
-		make_model_from_task(task, model);
-
-		loop_s = clock();
-		perf = cross_validation(model, task->train_data, task->folds);
-		loop_e = clock();
-		current_max = maximum(current_max, perf);
-
-		note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf,
-				elapsed_time(loop_s, loop_e),
-				current_max);
-
-		q->tasks[task->ID]->performance = perf;
-		task = get_next_task(q);
-	}
-	main_e = clock();
-
-	note("\nTotal elapsed time: %8.8f seconds\n",
-			elapsed_time(main_s, main_e));
-
-	gensvm_free_model(model);
-}
-
-
 bool kernel_changed(struct Task *newtask, struct Task *oldtask)
 {
+	int i;
 	if (oldtask == NULL)
 		return true;
-	int i;
 	if (newtask->kerneltype != oldtask->kerneltype) {
 		return true;
 	} else if (newtask->kerneltype == K_POLY) {
@@ -580,6 +493,22 @@ bool kernel_changed(struct Task *newtask, struct Task *oldtask)
 	return false;
 }
 
+/**
+ * @brief Run the grid search for a Queue
+ *
+ * @details
+ * Given a Queue of Task struct to be trained, a grid search is launched to
+ * find the optimal parameter configuration. As is also done within
+ * cross_validation(), the optimal weights of one parameter set are used as
+ * initial estimates for GenModel::V in the next parameter set. Note that to
+ * optimally exploit this feature of the optimization algorithm, the order in
+ * which tasks are considered is important. This is considered in
+ * make_queue().
+ *
+ * The performance found by cross validation is stored in the Task struct.
+ *
+ * @param[in,out]	q	Queue with Task instances to run
+ */
 void start_training(struct Queue *q)
 {
@@ -620,6 +549,10 @@ void start_training(struct Queue *q)
 		if (kernel_changed(task, prevtask)) {
 			note("*");
 			for (f=0; f<task->folds; f++) {
+				if (train_folds[f]->Z != train_folds[f]->RAW)
+					free(train_folds[f]->Z);
+				if (test_folds[f]->Z != test_folds[f]->RAW)
+					free(test_folds[f]->Z);
 				gensvm_kernel_preprocess(model,
 						train_folds[f]);
 				gensvm_kernel_postprocess(model,
						train_folds[f], test_folds[f]);
@@ -658,7 +591,23 @@ void start_training(struct Queue *q)
 	free(cv_idx);
 }
 
-
+/**
+ * @brief Run cross validation with a given set of train/test folds
+ *
+ * @details
+ * This cross validation function uses predefined train/test splits. Also, the
+ * optimal parameters GenModel::V of a previous fold are used as initial
+ * conditions for GenModel::V of the next fold.
+ *
+ * @param[in]	model	GenModel with the configuration to train
+ * @param[in]	train_folds	array of training datasets
+ * @param[in]	test_folds	array of test datasets
+ * @param[in]	folds	number of folds
+ * @param[in]	n_total	number of objects in the union of the train
+ *			datasets
+ * @return	performance (hitrate) of the configuration on
+ *		cross validation
+ */
 double gensvm_cross_validation(struct GenModel *model,
 		struct GenData **train_folds, struct GenData **test_folds,
 		int folds, long n_total)
@@ -698,99 +647,6 @@ double gensvm_cross_validation(struct GenModel *model,
 }
 
-/**
- * @brief Run the grid search for a train/test dataset
- *
- * @details
- * This function is similar to start_training_cv(), except that the
- * pre-determined training set is used only once, and the pre-determined test
- * set is used for validation.
- *
- * @todo
- * It would probably be better to train the model on the training set using
- * cross validation and only use the test set when comparing with other
- * methods. The way it is now, you're finding out which parameters predict
- * _this_ test set best, which is not what you want. This function should
- * therefore not be used and is considered deprecated, to be removed in the
- * future.
- *
- * @param[in]	q	Queue with Task structs to run
- *
- */
-void start_training_tt(struct Queue *q)
-{
-	return;
-}
-/*
-void start_training_tt(struct Queue *q)
-{
-	FILE *fid;
-
-	long c = 0;
-	long *predy;
-	double total_perf, current_max = 0;
-
-	struct Task *task = get_next_task(q);
-	struct GenModel *seed_model = gensvm_init_model();
-
-	clock_t main_s, main_e;
-	clock_t loop_s, loop_e;
-
-	seed_model->m = task->train_data->m;
-	seed_model->K = task->train_data->K;
-	gensvm_allocate_model(seed_model);
-	gensvm_seed_model_V(NULL, seed_model, task->train_data);
-
-	main_s = clock();
-	while (task) {
-		total_perf = 0;
-		note("(%li/%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t",
-				c+1, q->N, task->weight_idx, task->epsilon,
-				task->p, task->kappa, task->lambda);
-		loop_s = clock();
-		struct GenModel *model = gensvm_init_model();
-		make_model_from_task(task, model);
-
-		model->n = task->train_data->n;
-		model->m = task->train_data->m;
-		model->K = task->train_data->K;
-
-		gensvm_allocate_model(model);
-		gensvm_initialize_weights(task->train_data, model);
-		gensvm_seed_model_V(seed_model, model, task->train_data);
-
-		fid = GENSVM_OUTPUT_FILE;
-		GENSVM_OUTPUT_FILE = NULL;
-		gensvm_optimize(model, task->train_data);
-		GENSVM_OUTPUT_FILE = fid;
-
-		predy = Calloc(long, task->test_data->n);
-		gensvm_predict_labels(task->test_data, task->train_data,
-				model, predy);
-		if (task->test_data->y != NULL)
-			total_perf = gensvm_prediction_perf(task->test_data,
-					predy);
-		gensvm_seed_model_V(model, seed_model, task->train_data);
-
-		gensvm_free_model(model);
-		free(predy);
-		note(".");
-		loop_e = clock();
-		current_max = maximum(current_max, total_perf);
-		note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", total_perf,
-				elapsed_time(loop_s, loop_e), current_max);
-		q->tasks[task->ID]->performance = total_perf;
-		task = get_next_task(q);
-	}
-	main_e = clock();
-
-	note("\nTotal elapsed time: %8.8f seconds\n",
-			elapsed_time(main_s, main_e));
-	free(task);
-	gensvm_free_model(seed_model);
-}
-*/
 
 /**
  * @brief Free the Queue struct
 *
--
cgit v1.2.3
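
A note on the selection step in this commit: create_top_queue() keeps only the tasks whose grid-search performance lies at or above the 95th percentile, and the percentile itself comes from prctile(), which uses the Matlab-style interpolation that this commit removes from the inline code in consistency_repeats(). The sketch below restates that rule as a standalone program; percentile_95() is a local stand-in for the library's prctile(), the sample values are made up, and like the library code it assumes more than one input value (link with -lm).

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* sort helper for qsort() on doubles */
static int cmp_double(const void *a, const void *b)
{
	double x = *(const double *)a, y = *(const double *)b;
	return (x > y) - (x < y);
}

/* Matlab-style 95th percentile, mirroring the removed inline code:
 * p = 0.95*N + 0.5, clipped, then linear interpolation between the
 * two neighboring order statistics. Works on a sorted copy. */
static double percentile_95(const double *values, long N)
{
	double *v = malloc(N * sizeof(double));
	double p, pi, pr, boundary;
	long i;

	for (i = 0; i < N; i++)
		v[i] = values[i];
	qsort(v, N, sizeof(double), cmp_double);

	p = 0.95 * N + 0.5;
	pi = fmax(fmin(floor(p), N - 1), 1);
	pr = fmax(fmin(p - pi, 1), 0);
	boundary = (1 - pr) * v[(long) pi - 1] + pr * v[(long) pi];

	free(v);
	return boundary;
}

int main(void)
{
	double perf[] = {0.61, 0.72, 0.70, 0.88, 0.93, 0.87, 0.90, 0.95};
	printf("boundary: %f\n", percentile_95(perf, 8));
	return 0;
}

With eight values, p = 0.95*8 + 0.5 = 8.1 clips to the top order statistic, so the boundary is simply the maximum (0.95) and only the best task survives the cut.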
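
gensvm_cross_validation() reports a single hitrate over the predefined folds. As in the removed cross_validation() above, each fold's performance is weighted by the size of its test split (total_perf += performance * test_data->n, divided by the total object count at the end), so the result equals the overall fraction of correctly predicted objects. A self-contained sketch of just this aggregation, with hypothetical fold sizes and per-fold hitrates:

#include <stdio.h>

int main(void)
{
	/* hypothetical test-split sizes and per-fold hitrates */
	long fold_n[] = {20, 20, 20, 20, 21};
	double hitrate[] = {0.90, 0.85, 0.95, 0.80, 0.90};
	long f, folds = 5, n_total = 0;
	double total_perf = 0.0;

	/* weight each fold's hitrate by its test-split size */
	for (f = 0; f < folds; f++) {
		total_perf += hitrate[f] * fold_n[f];
		n_total += fold_n[f];
	}
	total_perf /= (double) n_total;

	printf("cross-validation hitrate: %f\n", total_perf);
	return 0;
}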
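
For each configuration that survives the percentile cut, consistency_repeats() reduces the repeated cross-validation scores to a mean and a sample standard deviation, dividing by repeats - 1 and guarding the repeats == 1 case before taking the square root. The same computation in isolation, with hypothetical repeat scores (link with -lm):

#include <stdio.h>
#include <math.h>

int main(void)
{
	/* hypothetical repeated cross-validation scores for one task */
	double perf[] = {0.91, 0.89, 0.93, 0.90, 0.92};
	long r, repeats = 5;
	double mean = 0.0, std = 0.0;

	/* mean of the repeats */
	for (r = 0; r < repeats; r++)
		mean += perf[r] / ((double) repeats);

	/* sample standard deviation, as in consistency_repeats() */
	for (r = 0; r < repeats; r++)
		std += pow(perf[r] - mean, 2.0);
	if (repeats > 1) {
		std /= ((double) repeats) - 1.0;
		std = sqrt(std);
	} else {
		std = 0.0;
	}

	printf("m = %3.3f, s = %3.3f\n", mean, std);
	return 0;
}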