| author | Gertjan van den Burg <burg@ese.eur.nl> | 2016-11-03 15:55:03 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <burg@ese.eur.nl> | 2016-11-03 15:55:03 +0100 |
| commit | c3edde20d385614f0016b74e03575344b7c5081a (patch) | |
| tree | 314d386874ea60dccf8e111fa856bac06c9f656a /src/gensvm_gridsearch.c | |
| parent | update copyright information (diff) | |
| download | gensvm-c3edde20d385614f0016b74e03575344b7c5081a.tar.gz gensvm-c3edde20d385614f0016b74e03575344b7c5081a.zip | |
prepare for gridsearch unit testing
Diffstat (limited to 'src/gensvm_gridsearch.c')
| -rw-r--r-- | src/gensvm_gridsearch.c | 454 |
1 file changed, 77 insertions(+), 377 deletions(-)
```diff
diff --git a/src/gensvm_gridsearch.c b/src/gensvm_gridsearch.c
index 6e48320..7a81c51 100644
--- a/src/gensvm_gridsearch.c
+++ b/src/gensvm_gridsearch.c
@@ -91,342 +91,97 @@ void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue,
 	// is indeed better than the nested for loop has not been tested.
 	cnt = 1;
 	i = 0;
-	while (i < N )
-		for (j=0; j<grid->Np; j++)
+	while (i < N) {
+		for (j=0; j<grid->Np; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->p = grid->ps[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Np;
 	i = 0;
-	while (i < N )
-		for (j=0; j<grid->Nl; j++)
+	while (i < N) {
+		for (j=0; j<grid->Nl; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->lambda = grid->lambdas[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Nl;
 	i = 0;
-	while (i < N )
-		for (j=0; j<grid->Nk; j++)
+	while (i < N) {
+		for (j=0; j<grid->Nk; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->kappa = grid->kappas[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Nk;
 	i = 0;
-	while (i < N )
-		for (j=0; j<grid->Nw; j++)
+	while (i < N) {
+		for (j=0; j<grid->Nw; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->weight_idx = grid->weight_idxs[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Nw;
 	i = 0;
-	while (i < N )
-		for (j=0; j<grid->Ne; j++)
+	while (i < N) {
+		for (j=0; j<grid->Ne; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->epsilon = grid->epsilons[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Ne;
 	i = 0;
-	while (i < N && grid->Ng > 0)
-		for (j=0; j<grid->Ng; j++)
+	while (i < N && grid->Ng > 0) {
+		for (j=0; j<grid->Ng; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->kernelparam[0] = grid->gammas[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Ng > 0 ? grid->Ng : 1;
 	i = 0;
-	while (i < N && grid->Nc > 0)
-		for (j=0; j<grid->Nc; j++)
+	while (i < N && grid->Nc > 0) {
+		for (j=0; j<grid->Nc; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->kernelparam[1] = grid->coefs[j];
 				i++;
 			}
+		}
+	}
 	cnt *= grid->Nc > 0 ? grid->Nc : 1;
 	i = 0;
-	while (i < N && grid->Nd > 0)
-		for (j=0; j<grid->Nd; j++)
+	while (i < N && grid->Nd > 0) {
+		for (j=0; j<grid->Nd; j++) {
 			for (k=0; k<cnt; k++) {
 				queue->tasks[i]->kernelparam[2] = grid->degrees[j];
 				i++;
 			}
-}
-
-/**
- * @brief Comparison function for doubl
- *
- * @param[in] 	elem1 	number 1
- * @param[in] 	elem2 	number 2
- * @returns 	comparison of number 1 larger than number 2
- */
-int doublesort(const void *elem1, const void *elem2)
-{
-	const double t1 = (*(double *) elem1);
-	const double t2 = (*(double *) elem2);
-	return t1 > t2;
-}
```
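Two things in this first chunk are worth flagging before the unit tests land. The brace cleanup in `gensvm_fill_queue` is purely cosmetic; the cyclic fill still enumerates the full Cartesian product of the grid parameters, with each value repeated `cnt` times per pass. The removed `doublesort`, however, hides a real bug: `return t1 > t2` can only yield 0 or 1, never a negative value, which violates the three-way contract `qsort` expects from its comparator. A conforming version would look like the sketch below (the name `cmp_double` is ours, not part of the commit):

```c
#include <stdlib.h>

/* A qsort comparator must return a negative, zero, or positive value.
 * The (a > b) - (a < b) idiom does this without the overflow risk of
 * subtraction. Sketch for reference only, not part of the commit. */
static int cmp_double(const void *elem1, const void *elem2)
{
	const double t1 = *(const double *) elem1;
	const double t2 = *(const double *) elem2;
	return (t1 > t2) - (t1 < t2);
}
```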
```diff
-
-/**
- * @brief Calculate the percentile of an array of doubles
- *
- * @details
- * The percentile of performance is used to find the top performing
- * configurations. Since no standard definition of the percentile exists, we
- * use the method used in MATLAB and Octave. Since calculating the percentile
- * requires a sorted list of the values, a local copy is made first.
- *
- * @param[in] 	values 	array of doubles
- * @param[in] 	N 	length of the array
- * @param[in] 	p 	percentile to calculate ( 0 <= p <= 100.0 ).
- * @returns 	the p-th percentile of the values
- */
-double prctile(double *values, long N, double p)
-{
-	if (N == 1)
-		return values[0];
-
-	long i;
-	double pi, pr, boundary;
-	double *local = Malloc(double, N);
-	for (i=0; i<N; i++)
-		local[i] = values[i];
-
-	qsort(local, N, sizeof(double), doublesort);
-	p /= 100.0;
-	p = p*N + 0.5;
-	pi = maximum(minimum(floor(p), N-1), 1);
-	pr = maximum(minimum(p - pi, 1), 0);
-	boundary = (1 - pr)*local[((long) pi)-1] + pr*local[((long) pi)];
-
-	free(local);
-
-	return boundary;
-}
```
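The removed `prctile` implements the MATLAB/Octave percentile: sort a copy of the data, map `p` to the fractional position `p/100 * N + 0.5`, and interpolate linearly between the two neighbouring order statistics, clamping at both ends. A standalone restatement without the GenSVM `Malloc`/`maximum`/`minimum` macros (`percentile` is our name, and it assumes the `cmp_double` comparator sketched above):

```c
#include <math.h>
#include <stdlib.h>
#include <string.h>

/* Standalone sketch of the removed prctile(): MATLAB/Octave-style
 * percentile with linear interpolation. Not part of the commit. */
static double percentile(const double *values, long N, double p)
{
	if (N == 1)
		return values[0];

	double *local = malloc(N * sizeof(double));
	memcpy(local, values, N * sizeof(double));
	qsort(local, N, sizeof(double), cmp_double);

	// map p in [0, 100] to a fractional position in the sorted copy
	double pos = (p / 100.0) * N + 0.5;
	double lo = fmax(fmin(floor(pos), (double) (N - 1)), 1.0);
	double frac = fmax(fmin(pos - lo, 1.0), 0.0);
	double result = (1 - frac) * local[(long) lo - 1]
		+ frac * local[(long) lo];

	free(local);
	return result;
}
```

For `{1, 2, 3, 4}`, `p = 50` gives position 2.5 and interpolates to 2.5, while `p = 95` gives position 4.3, which clamps to the last element and returns 4.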
```diff
-
-struct GenQueue *create_top_queue(struct GenQueue *q)
-{
-	long i, k, N = 0;
-	double boundary, *perf = NULL;
-	struct GenQueue *nq = Malloc(struct GenQueue, 1);
-
-	// find the 95th percentile of performance
-	perf = Calloc(double, q->N);
-	for (i=0; i<q->N; i++) {
-		perf[i] = q->tasks[i]->performance;
-	}
-	boundary = prctile(perf, q->N, 95.0);
-	free(perf);
-	note("boundary determined at: %f\n", boundary);
-
-	// find the number of tasks that perform at or above the boundary
-	for (i=0; i<q->N; i++) {
-		if (q->tasks[i]->performance >= boundary)
-			N++;
-	}
-
-	// create a new queue with the best tasks
-	nq->tasks = Malloc(struct GenTask *, N);
-	k = 0;
-	for (i=0; i<q->N; i++) {
-		if (q->tasks[i]->performance >= boundary)
-			nq->tasks[k++] = q->tasks[i];
-	}
-	nq->N = N;
-	nq->i = 0;
-
-	return nq;
-}
-
-
-/**
- * @brief Run repeats of the GenTask structs in GenQueue to find the best
- * configuration
- *
- * @details
- * The best performing tasks in the supplied GenQueue are found by taking those
- * GenTask structs that have a performance greater or equal to the 95% percentile
- * of the performance of all tasks. These tasks are then gathered in a new
- * GenQueue. For each of the tasks in this new GenQueue the cross validation run is
- * repeated a number of times.
- *
- * For each of the GenTask configurations that are repeated the mean performance,
- * standard deviation of the performance and the mean computation time are
- * reported.
- *
- * Finally, the overall best tasks are written to the specified output. These
- * tasks are selected to have both the highest mean performance, as well as the
- * smallest standard deviation in their performance. This is done as follows.
- * First the 99th percentile of task performance and the 1st percentile of
- * standard deviation is calculated. If a task exists for which the mean
- * performance of the repeats and the standard deviation equals these values
- * respectively, this task is found to be the best and is written to the
- * output. If no such task exists, the 98th percentile of performance and the
- * 2nd percentile of standard deviation is considered. This is repeated until
- * an interval is found which contains tasks. If one or more tasks are found,
- * this loop stops.
- *
- * @param[in] 	q 		GenQueue of GenTask structs which have already been
- * 			run and have a GenTask::performance value
- * @param[in] 	repeats 	Number of times to repeat the best
- * 			configurations for consistency
- * @param[in] 	traintype 	type of training to do (CV or TT)
- *
- */
-void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype)
-{
-	bool breakout;
-	long i, f, r, N, *cv_idx = NULL;
-	double p, pi, pr, pt,
-		*time = NULL,
-		*std = NULL,
-		*mean = NULL,
-		*perf = NULL;
-	struct GenQueue *nq = NULL;
-	struct GenData **train_folds = NULL,
-		**test_folds = NULL;
-	struct GenModel *model = gensvm_init_model();
-	struct GenTask *task = NULL;
-	struct timespec loop_s, loop_e;
-
-	nq = create_top_queue(q);
-	N = nq->N;
-
-	note("Number of items: %li\n", nq->N);
-	std = Calloc(double, N);
-	mean = Calloc(double, N);
-	time = Calloc(double, N);
-	perf = Calloc(double, N*repeats);
-
-	task = get_next_task(nq);
-
-	model->n = 0;
-	model->m = task->train_data->m;
-	model->K = task->train_data->K;
-	gensvm_allocate_model(model);
-	gensvm_init_V(NULL, model, task->train_data);
-
-	cv_idx = Calloc(long, task->train_data->n);
-
-	train_folds = Malloc(struct GenData *, task->folds);
-	test_folds = Malloc(struct GenData *, task->folds);
-
-	i = 0;
-	while (task) {
-		make_model_from_task(task, model);
-
-		time[i] = 0.0;
-		note("(%02li/%02li:%03li)\t", i+1, N, task->ID);
-		for (r=0; r<repeats; r++) {
-			Memset(cv_idx, long, task->train_data->n);
-			gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx);
-			train_folds = Malloc(struct GenData *, task->folds);
-			test_folds = Malloc(struct GenData *, task->folds);
-			for (f=0; f<task->folds; f++) {
-				train_folds[f] = gensvm_init_data();
-				test_folds[f] = gensvm_init_data();
-				gensvm_get_tt_split(task->train_data, train_folds[f],
-						test_folds[f], cv_idx, f);
-				gensvm_kernel_preprocess(model, train_folds[f]);
-				gensvm_kernel_postprocess(model, train_folds[f],
-						test_folds[f]);
-			}
-
-			Timer(loop_s);
-			p = gensvm_cross_validation(model, train_folds, test_folds,
-					task->folds, task->train_data->n);
-			Timer(loop_e);
-			time[i] += gensvm_elapsed_time(&loop_s, &loop_e);
-			matrix_set(perf, repeats, i, r, p);
-			mean[i] += p/((double) repeats);
-			note("%3.3f\t", p);
-			// this is done because if we reuse the V it's not a
-			// consistency check
-			gensvm_init_V(NULL, model, task->train_data);
-			for (f=0; f<task->folds; f++) {
-				gensvm_free_data(train_folds[f]);
-				gensvm_free_data(test_folds[f]);
-			}
-			free(train_folds);
-			train_folds = NULL;
-
-			free(test_folds);
-			test_folds = NULL;
-		}
-		for (r=0; r<repeats; r++) {
-			std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i],
-					2.0);
 		}
-		if (r > 1) {
-			std[i] /= ((double) repeats) - 1.0;
-			std[i] = sqrt(std[i]);
-		} else {
-			std[i] = 0.0;
-		}
-		note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i],
-				time[i]);
-		task = get_next_task(nq);
-		i++;
-	}
-
-	// find the best overall configurations: those with high average
-	// performance and low deviation in the performance
-	note("\nBest overall configuration(s):\n");
-	note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t"
-			"mean_perf\tstd_perf\ttime_perf\n");
-	p = 0.0;
-	breakout = false;
-	while (breakout == false) {
-		pi = prctile(mean, N, (100.0-p));
-		pr = prctile(std, N, p);
-		pt = prctile(time, N, p);
-		for (i=0; i<N; i++)
-			if ((pi - mean[i] < 0.0001) &&
-					(std[i] - pr < 0.0001) &&
-					(time[i] - pt < 0.0001)) {
-				note("(%li)\tw = %li\te = %f\tp = %f\t"
-						"k = %f\tl = %f\t"
-						"mean: %3.3f\tstd: %3.3f\t"
-						"time: %3.3f\n",
-						nq->tasks[i]->ID,
-						nq->tasks[i]->weight_idx,
-						nq->tasks[i]->epsilon,
-						nq->tasks[i]->p,
-						nq->tasks[i]->kappa,
-						nq->tasks[i]->lambda,
-						mean[i],
-						std[i],
-						time[i]);
-				breakout = true;
-			}
-		p += 1.0;
 	}
-
-	free(cv_idx);
-	// make sure no double free occurs with the copied kernelparam
-	model->kernelparam = NULL;
-	gensvm_free_model(model);
-	free(nq->tasks);
-	free(nq);
-	free(perf);
-	free(std);
-	free(mean);
-	free(time);
 }
 
 /**
```
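The selection loop at the end of the removed `consistency_repeats` deserves a note for the test suite: it widens a percentile window one point at a time until some configuration clears all three thresholds at once (near-top mean performance, near-bottom spread, near-bottom runtime). A compact restatement of that rule, using the hypothetical `percentile()` helper sketched above; `select_best` is our name, and unlike the original, which printed every match, it returns the first:

```c
/* Sketch of the removed selection rule: relax the percentile window one
 * point per iteration until a task falls inside it on all three
 * criteria. tol mirrors the original 0.0001 slack. */
static long select_best(const double *mean, const double *std,
		const double *time, long N)
{
	const double tol = 1e-4;
	for (double p = 0.0; p <= 100.0; p += 1.0) {
		double lo_mean = percentile(mean, N, 100.0 - p);
		double hi_std = percentile(std, N, p);
		double hi_time = percentile(time, N, p);
		for (long i = 0; i < N; i++) {
			if (lo_mean - mean[i] < tol && std[i] - hi_std < tol
					&& time[i] - hi_time < tol)
				return i; // first task inside the window
		}
	}
	return -1; // unreachable: p = 100 admits every task
}
```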
```diff
@@ -443,7 +198,7 @@ void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype)
  * @param[in] 	oldtask 	the old task
  * @return 	whether the kernel needs to be reevaluated
  */
-bool kernel_changed(struct GenTask *newtask, struct GenTask *oldtask)
+bool gensvm_kernel_changed(struct GenTask *newtask, struct GenTask *oldtask)
 {
 	int i;
 	if (oldtask == NULL)
@@ -469,6 +224,39 @@ bool kernel_changed(struct GenTask *newtask, struct GenTask *oldtask)
 }
 
 /**
+ * @brief Compute the kernels for the folds of the train and test datasets
+ *
+ * @details
+ * When the kernel parameters change in a kernel grid search, the kernel
+ * pre- and post-processing has to be done for the new kernel parameters. This
+ * is done here for each of the folds. Each of the training folds is
+ * preprocessed, and each of the test folds is postprocessed.
+ *
+ * @param[in] 		folds 		number of cross validation folds
+ * @param[in] 		model 		GenModel with new kernel parameters
+ * @param[in,out] 	train_folds 	array of train datasets
+ * @param[in,out] 	test_folds 	array of test datasets
+ *
+ */
+void gensvm_kernel_folds(int folds, struct GenModel *model,
+		struct GenData **train_folds, struct GenData **test_folds)
+{
+	int f;
+
+	note("Computing kernel");
+	for (f=0; f<folds; f++) {
+		if (train_folds[f]->Z != train_folds[f]->RAW)
+			free(train_folds[f]->Z);
+		if (test_folds[f]->Z != test_folds[f]->RAW)
+			free(test_folds[f]->Z);
+		gensvm_kernel_preprocess(model, train_folds[f]);
+		gensvm_kernel_postprocess(model, train_folds[f],
+				test_folds[f]);
+	}
+	note(".\n");
+}
+
+/**
  * @brief Run the grid search for a GenQueue
  *
  * @details
```
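The new `gensvm_kernel_folds` pulls the fold pre- and post-processing out of the training loop so it can be tested in isolation. Note how it pairs with `gensvm_kernel_changed`: because `gensvm_fill_queue` gives the kernel parameters the largest repeat counts, consecutive tasks share kernel settings for long stretches, and the loop only recomputes the fold kernels when they actually change. Simplified from the hunk that follows, not a verbatim excerpt:

```c
/* Caching pattern in the refactored training loop (simplified). */
while (task) {
	gensvm_task_to_model(task, model);
	if (gensvm_kernel_changed(task, prevtask)) {
		// consecutive tasks usually share kernel parameters,
		// so this branch is skipped for most of the queue
		gensvm_kernel_folds(task->folds, model, train_folds,
				test_folds);
	}
	// ... cross validation and progress reporting ...
	prevtask = task;
	task = get_next_task(q);
}
```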
```diff
@@ -484,11 +272,10 @@ bool kernel_changed(struct GenTask *newtask, struct GenTask *oldtask)
  *
  * @param[in,out] 	q 	GenQueue with GenTask instances to run
  */
-
-void start_training(struct GenQueue *q)
+void gensvm_train_queue(struct GenQueue *q)
 {
 	int f, folds;
-	double perf, current_max = 0;
+	double perf, duration, current_max = 0;
 	struct GenTask *task = get_next_task(q);
 	struct GenTask *prevtask = NULL;
 	struct GenModel *model = gensvm_init_model();
@@ -516,31 +303,21 @@
 	Timer(main_s);
 	while (task) {
-		make_model_from_task(task, model);
-		if (kernel_changed(task, prevtask)) {
-			note("Computing kernel");
-			for (f=0; f<folds; f++) {
-				if (train_folds[f]->Z != train_folds[f]->RAW)
-					free(train_folds[f]->Z);
-				if (test_folds[f]->Z != test_folds[f]->RAW)
-					free(test_folds[f]->Z);
-				gensvm_kernel_preprocess(model,
-						train_folds[f]);
-				gensvm_kernel_postprocess(model,
-						train_folds[f], test_folds[f]);
-			}
-			note(".\n");
+		gensvm_task_to_model(task, model);
+		if (gensvm_kernel_changed(task, prevtask)) {
+			gensvm_kernel_folds(task->folds, model, train_folds,
+					test_folds);
 		}
-		print_progress_string(task, q->N);
 
 		Timer(loop_s);
 		perf = gensvm_cross_validation(model, train_folds, test_folds,
 				folds, task->train_data->n);
 		Timer(loop_e);
+		current_max = maximum(current_max, perf);
+		duration = gensvm_elapsed_time(&loop_s, &loop_e);
 
-		note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf,
-				gensvm_elapsed_time(&loop_s, &loop_e),
+		gensvm_gridsearch_progress(task, q->N, perf, duration, current_max);
 
 		q->tasks[task->ID]->performance = perf;
@@ -565,86 +342,6 @@
 }
 
 /**
- * @brief Run cross validation with a given set of train/test folds
- *
- * @details
- * This cross validation function uses predefined train/test splits. Also, the
- * the optimal parameters GenModel::V of a previous fold as initial conditions
- * for GenModel::V of the next fold.
- *
- * @param[in] 	model 		GenModel with the configuration to train
- * @param[in] 	train_folds 	array of training datasets
- * @param[in] 	test_folds 	array of test datasets
- * @param[in] 	folds 		number of folds
- * @param[in] 	n_total 	number of objects in the union of the train
- * 				datasets
- * @return 	performance (hitrate) of the configuration on
- * 		cross validation
- */
-double gensvm_cross_validation(struct GenModel *model,
-		struct GenData **train_folds, struct GenData **test_folds,
-		int folds, long n_total)
-{
-	FILE *fid = NULL;
-
-	int f;
-	long *predy = NULL;
-	double performance, total_perf = 0;
-
-	for (f=0; f<folds; f++) {
-		// reallocate model in case dimensions differ with data
-		gensvm_reallocate_model(model, train_folds[f]->n,
-				train_folds[f]->r);
-
-		// initialize object weights
-		gensvm_initialize_weights(train_folds[f], model);
-
-		// train the model (surpressing output)
-		fid = GENSVM_OUTPUT_FILE;
-		GENSVM_OUTPUT_FILE = NULL;
-		gensvm_optimize(model, train_folds[f]);
-		GENSVM_OUTPUT_FILE = fid;
-
-		// calculate prediction performance on test set
-		predy = Calloc(long, test_folds[f]->n);
-		gensvm_predict_labels(test_folds[f], model, predy);
-		performance = gensvm_prediction_perf(test_folds[f], predy);
-		total_perf += performance * test_folds[f]->n;
-
-		free(predy);
-	}
-
-	total_perf /= ((double) n_total);
-
-	return total_perf;
-}
-
-
-/**
- * @brief Copy parameters from GenTask to GenModel
- *
- * @details
- * A GenTask struct only contains the parameters of the GenModel to be estimated.
- * This function is used to copy these parameters.
- *
- * @param[in] 	task 	GenTask instance with parameters
- * @param[in,out] 	model 	GenModel to which the parameters are copied
- */
-void make_model_from_task(struct GenTask *task, struct GenModel *model)
-{
-	// copy basic model parameters
-	model->weight_idx = task->weight_idx;
-	model->epsilon = task->epsilon;
-	model->p = task->p;
-	model->kappa = task->kappa;
-	model->lambda = task->lambda;
-
-	// copy kernel parameters
-	model->kerneltype = task->kerneltype;
-	model->kernelparam = task->kernelparam;
-}
-
-/**
  * @brief Print the description of the current task on screen
  *
  * @details
@@ -657,7 +354,8 @@ void make_model_from_task(struct GenTask *task, struct GenModel *model)
  * @param[in] 	N 	total number of tasks
  *
  */
-void print_progress_string(struct GenTask *task, long N)
+void gensvm_gridsearch_progress(struct GenTask *task, long N, double perf,
+		double duration, double current_max)
 {
 	char buffer[MAX_LINE_LENGTH];
 	sprintf(buffer, "(%03li/%03li)\t", task->ID+1, N);
@@ -675,4 +373,6 @@
 			"l = %f\tp = %2.2f\t", task->epsilon, task->weight_idx,
 			task->kappa, task->lambda, task->p);
 	note(buffer);
+	note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, duration,
+			current_max);
 }
```
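With `start_training`, `kernel_changed`, and `print_progress_string` renamed into the `gensvm_` namespace, the gridsearch internals now have stable entry points that a test file can call directly, which matches the commit message. A minimal sketch of such a test, assuming a `gensvm_gridsearch.h` header for the renamed functions and the `gensvm_init_task`/`gensvm_free_task` helpers from the task module, and relying only on the NULL guard visible in the `gensvm_kernel_changed` hunk above:

```c
#include <assert.h>
#include "gensvm_gridsearch.h"	/* assumed header for the renamed functions */

/* Hypothetical unit test; not part of this commit. */
void test_gensvm_kernel_changed(void)
{
	struct GenTask *task = gensvm_init_task();

	// per the NULL guard in the hunk above: without a previous task,
	// the kernel must always be (re)computed
	assert(gensvm_kernel_changed(task, NULL) == true);

	gensvm_free_task(task);
}
```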
