/** * @file gensvm_consistency.c * @author G.J.J. van den Burg * @date 2016-10-24 * @brief Functions for running consistency repeats * * @details * When running the grid search, the user may optionally choose to run * repetitions of the best performing configurations to check if they perform * well consistently. These are called consistency repeats, and the code * needed for them is defined here. * * @copyright Copyright 2016, G.J.J. van den Burg. This file is part of GenSVM. GenSVM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. GenSVM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GenSVM. If not, see . */ #include "gensvm_consistency.h" /** * @brief Create GenQueue of tasks with performance above a given percentile * * @details * This function constructs a GenQueue of the GenTask instances in the * provided GenQueue which have a performance at or above the given percentile * of the performance of all elements in the provided GenQueue. This can be * used to determine which hyperparameter configurations belong to the top x-% * of all tasks in terms of performance. * * @sa * gensvm_consistency_repeats(), gensvm_percentile() * * @note * This function assumes that for each task in the given GenQueue, the * GenTask::perf element has been set. * * @param[in] q a complete GenQueue struct * @param[in] percentile the desired percentile * * @return a GenQueue struct with GenTasks which are at * or above the desired percentile of performance * */ struct GenQueue *gensvm_top_queue(struct GenQueue *q, double percentile) { long i, k, N = 0; double boundary, *perf = Calloc(double, q->N); struct GenQueue *nq = gensvm_init_queue(); // find the desired percentile of performance for (i=0; iN; i++) { perf[i] = q->tasks[i]->performance; } boundary = gensvm_percentile(perf, q->N, percentile); note("Boundary of the %g-th percentile determined at: %f\n", percentile, boundary); // find the number of tasks that perform at or above the boundary for (i=0; iN; i++) { if (q->tasks[i]->performance >= boundary) N++; } // create a new queue with the best tasks nq->tasks = Malloc(struct GenTask *, N); k = 0; for (i=0; iN; i++) { if (q->tasks[i]->performance >= boundary) nq->tasks[k++] = gensvm_copy_task(q->tasks[i]); } nq->N = N; nq->i = 0; free(perf); return nq; } /** * @brief Run repeats of the GenTask structs in GenQueue to find the best * configuration * * @details * The best performing tasks in the supplied GenQueue are found by taking * those GenTask structs that have a performance greater or equal to the given * percentile of the performance of all tasks. These tasks are then gathered * in a new GenQueue. For each of the tasks in this new GenQueue the cross * validation run is repeated a number of times. * * For each of the GenTask configurations that are repeated the mean * performance, standard deviation of the performance and the mean computation * time are reported. * * Finally, the overall best tasks are written to the specified output. These * tasks are selected to have both the highest mean performance, as well as * the smallest standard deviation in their performance. This is done as * follows. First the 99th percentile of task performance and the 1st * percentile of standard deviation is calculated. If a task exists for which * the mean performance of the repeats and the standard deviation equals these * values respectively, this task is found to be the best and is written to * the output. If no such task exists, the 98th percentile of performance and * the 2nd percentile of standard deviation is considered. This is repeated * until an interval is found which contains tasks. If one or more tasks are * found, this loop stops. * * @param[in] q GenQueue of GenTask structs which have already been * run and have a GenTask::performance value * @param[in] repeats Number of times to repeat the best * configurations for consistency * @param[in] percentile percentile of performance to determine which * tasks to repeat * * @return ID of the best task * */ int gensvm_consistency_repeats(struct GenQueue *q, long repeats, double percentile) { bool breakout; long i, f, r, N, *cv_idx = NULL; double p, pi, pr, pt, *time = NULL, *std = NULL, *mean = NULL, *perf = NULL; struct GenQueue *nq = NULL; struct GenData **train_folds = NULL, **test_folds = NULL; struct GenModel *model = gensvm_init_model(); struct GenTask *task = NULL; struct timespec loop_s, loop_e; nq = gensvm_top_queue(q, percentile); N = nq->N; note("Number of items to check: %li\n", nq->N); std = Calloc(double, N); mean = Calloc(double, N); time = Calloc(double, N); perf = Calloc(double, N*repeats); task = get_next_task(nq); model->n = 0; model->m = task->train_data->m; model->K = task->train_data->K; gensvm_allocate_model(model); gensvm_init_V(NULL, model, task->train_data); cv_idx = Calloc(long, task->train_data->n); train_folds = Malloc(struct GenData *, task->folds); test_folds = Malloc(struct GenData *, task->folds); i = 0; while (task) { gensvm_task_to_model(task, model); time[i] = 0.0; note("(%02li/%02li:%03li)\t", i+1, N, task->ID); for (r=0; rtrain_data->n); gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx); train_folds = Malloc(struct GenData *, task->folds); test_folds = Malloc(struct GenData *, task->folds); for (f=0; ffolds; f++) { train_folds[f] = gensvm_init_data(); test_folds[f] = gensvm_init_data(); gensvm_get_tt_split(task->train_data, train_folds[f], test_folds[f], cv_idx, f); gensvm_kernel_preprocess(model, train_folds[f]); gensvm_kernel_postprocess(model, train_folds[f], test_folds[f]); } Timer(loop_s); p = gensvm_cross_validation(model, train_folds, test_folds, task->folds, task->train_data->n); Timer(loop_e); time[i] += gensvm_elapsed_time(&loop_s, &loop_e); matrix_set(perf, repeats, i, r, p); mean[i] += p/((double) repeats); note("%3.3f\t", p); // this is done because if we reuse the V it's not a // consistency check gensvm_init_V(NULL, model, task->train_data); for (f=0; ffolds; f++) { gensvm_free_data(train_folds[f]); gensvm_free_data(test_folds[f]); } free(train_folds); train_folds = NULL; free(test_folds); test_folds = NULL; } for (r=0; r 1) { std[i] /= ((double) repeats) - 1.0; std[i] = sqrt(std[i]); } else { std[i] = 0.0; } note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], time[i]); task = get_next_task(nq); i++; } // find the best overall configurations: those with high average // performance and low deviation in the performance note("\nBest overall configuration(s):\n"); note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t" "mean_perf\tstd_perf\ttime_perf\n"); p = 0.0; breakout = false; int best_id = -1; while (!breakout) { pi = gensvm_percentile(mean, N, (100.0-p)); pr = gensvm_percentile(std, N, p); pt = gensvm_percentile(time, N, p); for (i=0; itasks[i]->ID, nq->tasks[i]->weight_idx, nq->tasks[i]->epsilon, nq->tasks[i]->p, nq->tasks[i]->kappa, nq->tasks[i]->lambda, mean[i], std[i], time[i]); breakout = true; if (best_id == -1) best_id = nq->tasks[i]->ID; } } p += 1.0; } free(cv_idx); gensvm_free_model(model); gensvm_free_queue(nq); free(perf); free(std); free(mean); free(time); return best_id; } /** * @brief Comparison function for doubl * * @param[in] elem1 number 1 * @param[in] elem2 number 2 * @returns comparison of number 1 larger than number 2 */ int gensvm_dsort(const void *elem1, const void *elem2) { const double t1 = (*(double *) elem1); const double t2 = (*(double *) elem2); return t1 > t2; } /** * @brief Calculate the percentile of an array of doubles * * @details * The percentile of performance is used to find the top performing * configurations. Since no standard definition of the percentile exists, we * use the method used in MATLAB and Octave. Since calculating the percentile * requires a sorted list of the values, a local copy is made first. * * @param[in] values array of doubles * @param[in] N length of the array * @param[in] p percentile to calculate ( 0 <= p <= 100.0 ). * @returns the p-th percentile of the values */ double gensvm_percentile(double *values, long N, double p) { if (N == 1) return values[0]; long i; double pi, pr, boundary; double *local = Malloc(double, N); for (i=0; i