diff options
| author | Gertjan van den Burg <burg@ese.eur.nl> | 2013-10-18 15:48:59 +0200 |
|---|---|---|
| committer | Gertjan van den Burg <burg@ese.eur.nl> | 2013-10-18 15:48:59 +0200 |
| commit | 6d064658f8ae7ca0f42fef6dcc7f896144e9637b (patch) | |
| tree | a41e8793f71f637b68f862220ae5566f4537073d /src/msvmmaj_train_dataset.c | |
| parent | allow seeding of V and added documentation (diff) | |
| download | gensvm-6d064658f8ae7ca0f42fef6dcc7f896144e9637b.tar.gz gensvm-6d064658f8ae7ca0f42fef6dcc7f896144e9637b.zip | |
restart using git
Diffstat (limited to 'src/msvmmaj_train_dataset.c')
| -rw-r--r-- | src/msvmmaj_train_dataset.c | 402 |
1 files changed, 402 insertions, 0 deletions
diff --git a/src/msvmmaj_train_dataset.c b/src/msvmmaj_train_dataset.c new file mode 100644 index 0000000..2da8bee --- /dev/null +++ b/src/msvmmaj_train_dataset.c @@ -0,0 +1,402 @@ +#include <math.h> +#include <time.h> + +#include "crossval.h" +#include "libMSVMMaj.h" +#include "matrix.h" +#include "msvmmaj_train.h" +#include "msvmmaj_train_dataset.h" +#include "msvmmaj_pred.h" +#include "MSVMMaj.h" +#include "util.h" +#include "timer.h" + +extern FILE *MSVMMAJ_OUTPUT_FILE; + +void make_queue(struct Training *training, struct Queue *queue, + struct MajData *train_data, struct MajData *test_data) +{ + long i, j, k, l, m; + long N, cnt = 0; + struct Task *task; + queue->i = 0; + + N = training->Np; + N *= training->Nl; + N *= training->Nk; + N *= training->Ne; + N *= training->Nw; + + queue->tasks = Malloc(struct Task *, N); + queue->N = N; + + for (i=0; i<training->Ne; i++) + for (j=0; j<training->Nw; j++) + for (k=0; k<training->Nk; k++) + for (l=0; l<training->Nl; l++) + for (m=0; m<training->Np; m++) { + task = Malloc(struct Task, 1); + task->epsilon = training->epsilons[i]; + task->weight_idx = training->weight_idxs[j]; + task->kappa = training->kappas[k]; + task->lambda = training->lambdas[l]; + task->p = training->ps[m]; + task->train_data = train_data; + task->test_data = test_data; + task->folds = training->folds; + task->ID = cnt; + queue->tasks[cnt] = task; + cnt++; + } +} + +struct Task *get_next_task(struct Queue *q) +{ + long i = q->i; + if (i < q->N) { + q->i++; + return q->tasks[i]; + } + return NULL; +} + +int tasksort(const void *elem1, const void *elem2) +{ + const struct Task *t1 = (*(struct Task **) elem1); + const struct Task *t2 = (*(struct Task **) elem2); + return (t1->performance > t2->performance); +} + +int doublesort(const void *elem1, const void *elem2) +{ + const double t1 = (*(double *) elem1); + const double t2 = (*(double *) elem2); + return t1 > t2; +} + + +double prctile(double *values, long N, double p) +{ + long i; + double pi, pr, boundary; + double *local = Malloc(double, N); + for (i=0; i<N; i++) + local[i] = values[i]; + + qsort(local, N, sizeof(double), doublesort); + p = p*N + 0.5; + pi = maximum(minimum(floor(p), N-1), 1); + pr = maximum(minimum(p - pi, 1), 0); + boundary = (1 - pr)*local[((long) pi)-1] + pr*local[((long) pi)]; + + free(local); + + return boundary; +} + +void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) +{ + long i, r, N; + double p, pi, pr, boundary, time, *std, *mean, *perf; + struct Queue *nq = Malloc(struct Queue, 1); + struct MajModel *model = Malloc(struct MajModel, 1); + struct Task *task = Malloc(struct Task, 1); + clock_t loop_s, loop_e; + + // calculate the percentile (Matlab style) + qsort(q->tasks, q->N, sizeof(struct Task *), tasksort); + p = 0.95*q->N + 0.5; + pi = maximum(minimum(floor(p), q->N-1), 1); + pr = maximum(minimum(p - pi, 1), 0); + boundary = (1 - pr)*q->tasks[((long) pi)-1]->performance; + boundary += pr*q->tasks[((long) pi)]->performance; + note("boundary determined at: %f\n", boundary); + + N = 0; + for (i=0; i<q->N; i++) + if (q->tasks[i]->performance >= boundary) + N++; + note("Number of items: %li\n", N); + std = Calloc(double, N); + mean = Calloc(double, N); + perf = Calloc(double, N*repeats); + + nq->tasks = Malloc(struct Task *, N); + for (i=q->N-1; i>q->N-N-1; i--) + nq->tasks[q->N-i-1] = q->tasks[i]; + nq->N = N; + nq->i = 0; + + for (i=0; i<N; i++) { + task = get_next_task(nq); + make_model_from_task(task, model); + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + + time = 0; + note("(%02li/%02li:%03li)\t", i+1, N, task->ID); + for (r=0; r<repeats; r++) { + if (traintype == CV) { + loop_s = clock(); + p = cross_validation(model, NULL, task->train_data, task->folds); + loop_e = clock(); + time += elapsed_time(loop_s, loop_e); + matrix_set(perf, repeats, i, r, p); + mean[i] += p/((double) repeats); + } else { + note("Only cv is implemented\n"); + exit(1); + } + note("%3.3f\t", p); + } + for (r=0; r<repeats; r++) { + std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i], 2); + } + std[i] /= ((double) repeats) - 1.0; + std[i] = sqrt(std[i]); + note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], time); + } + + note("\nBest overall configuration(s):\n"); + note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\tmean_perf\tstd_perf\n"); + p = 0.0; + bool breakout = false; + while (breakout == false) { + pi = prctile(mean, N, (100.0-p)/100.0); + pr = prctile(std, N, p/100.0); + for (i=0; i<N; i++) + if ((pi - mean[i] < 0.0001) && (std[i] - pr < 0.0001)) { + note("(%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t" + "mean: %3.3f\tstd: %3.3f\n", + nq->tasks[i]->ID, + nq->tasks[i]->weight_idx, + nq->tasks[i]->epsilon, nq->tasks[i]->p, + nq->tasks[i]->kappa, nq->tasks[i]->lambda, + mean[i], std[i]); + breakout = true; + } + p += 1.0; + } + + free(task); + free(model); + free(perf); + free(std); + free(mean); +} + +double cross_validation(struct MajModel *model, struct MajModel *seed_model, + struct MajData *data, long folds) +{ + FILE *fid; + + bool fs = false; + long f, *predy; + double total_perf = 0; + struct MajModel *fold_model; + struct MajData *train_data, *test_data; + + long *cv_idx = Calloc(long, model->n); + double *performance = Calloc(double, folds); + + if (seed_model == NULL) { + seed_model = Malloc(struct MajModel, 1); + seed_model->n = 0; // we never use anything other than V + seed_model->m = model->m; + seed_model->K = model->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + fs = true; + } + + train_data = Malloc(struct MajData, 1); + test_data = Malloc(struct MajData, 1); + + msvmmaj_make_cv_split(model->n, folds, cv_idx); + for (f=0; f<folds; f++) { + msvmmaj_get_tt_split(data, train_data, test_data, cv_idx, f); + + fold_model = Malloc(struct MajModel, 1); + copy_model(model, fold_model); + + fold_model->n = train_data->n; + fold_model->m = train_data->m; + fold_model->K = train_data->K; + + msvmmaj_allocate_model(fold_model); + msvmmaj_initialize_weights(train_data, fold_model); + msvmmaj_seed_model_V(seed_model, fold_model); + + fid = MSVMMAJ_OUTPUT_FILE; + MSVMMAJ_OUTPUT_FILE = NULL; + msvmmaj_optimize(fold_model, train_data); + MSVMMAJ_OUTPUT_FILE = fid; + + predy = Calloc(long, test_data->n); + msvmmaj_predict_labels(test_data, fold_model, predy); + performance[f] = msvmmaj_prediction_perf(test_data, predy); + total_perf += performance[f]/((double) folds); + + msvmmaj_seed_model_V(fold_model, seed_model); + + free(predy); + free(train_data->y); + free(train_data->Z); + free(test_data->y); + free(test_data->Z); + + msvmmaj_free_model(fold_model); + } + + if (fs) + msvmmaj_free_model(seed_model); + free(train_data); + free(test_data); + free(performance); + free(cv_idx); + + return total_perf; + +} + +void start_training_cv(struct Queue *q) +{ + double perf, current_max = 0; + struct Task *task = get_next_task(q); + struct MajModel *seed_model = Malloc(struct MajModel, 1); + struct MajModel *model = Malloc(struct MajModel, 1); + clock_t main_s, main_e, loop_s, loop_e; + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + msvmmaj_allocate_model(model); + + seed_model->n = 0; + seed_model->m = task->train_data->m; + seed_model->K = task->train_data->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + + main_s = clock(); + while (task) { + note("(%03li/%03li)\tw = %li\te = %f\tp = %f\tk = %f\t l = %f\t", + task->ID+1, q->N, task->weight_idx, task->epsilon, + task->p, task->kappa, task->lambda); + make_model_from_task(task, model); + + loop_s = clock(); + perf = cross_validation(model, seed_model, task->train_data, task->folds); + loop_e = clock(); + current_max = maximum(current_max, perf); + + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, + elapsed_time(loop_s, loop_e), + current_max); + + q->tasks[task->ID]->performance = perf; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed time: %8.8f seconds\n", + elapsed_time(main_s, main_e)); + + free(task); + msvmmaj_free_model(seed_model); +} + +void start_training_tt(struct Queue *q) +{ + FILE *fid; + + long c = 0; + long *predy; + double total_perf, current_max = 0; + + struct Task *task = get_next_task(q); + struct MajModel *seed_model = Malloc(struct MajModel, 1); + + clock_t main_s, main_e; + clock_t loop_s, loop_e; + + seed_model->m = task->train_data->m; + seed_model->K = task->train_data->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + + main_s = clock(); + while (task) { + total_perf = 0; + note("(%li/%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t", + c+1, q->N, task->weight_idx, task->epsilon, + task->p, task->kappa, task->lambda); + loop_s = clock(); + struct MajModel *model = Malloc(struct MajModel, 1); + make_model_from_task(task, model); + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + + msvmmaj_allocate_model(model); + msvmmaj_initialize_weights(task->train_data, model); + msvmmaj_seed_model_V(seed_model, model); + + fid = MSVMMAJ_OUTPUT_FILE; + MSVMMAJ_OUTPUT_FILE = NULL; + msvmmaj_optimize(model, task->train_data); + MSVMMAJ_OUTPUT_FILE = fid; + + predy = Calloc(long, task->test_data->n); + msvmmaj_predict_labels(task->test_data, model, predy); + if (task->test_data->y != NULL) + total_perf = msvmmaj_prediction_perf(task->test_data, predy); + msvmmaj_seed_model_V(model, seed_model); + + msvmmaj_free_model(model); + free(predy); + note("."); + loop_e = clock(); + current_max = maximum(current_max, total_perf); + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", total_perf, + elapsed_time(loop_s, loop_e), current_max); + q->tasks[task->ID]->performance = total_perf; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed time: %8.8f seconds\n", + elapsed_time(main_s, main_e)); + free(task); + msvmmaj_free_model(seed_model); +} + +void free_queue(struct Queue *q) +{ + long i; + for (i=0; i<q->N; i++) + free(q->tasks[i]); + free(q->tasks); + free(q); +} + +void make_model_from_task(struct Task *task, struct MajModel *model) +{ + model->weight_idx = task->weight_idx; + model->epsilon = task->epsilon; + model->p = task->p; + model->kappa = task->kappa; + model->lambda = task->lambda; +} + +void copy_model(struct MajModel *from, struct MajModel *to) +{ + to->weight_idx = from->weight_idx; + to->epsilon = from->epsilon; + to->p = from->p; + to->kappa = from->kappa; + to->lambda = from->lambda; +} |
