From 6d064658f8ae7ca0f42fef6dcc7f896144e9637b Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 18 Oct 2013 15:48:59 +0200 Subject: restart using git --- src/msvmmaj_train_dataset.c | 402 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 src/msvmmaj_train_dataset.c (limited to 'src/msvmmaj_train_dataset.c') diff --git a/src/msvmmaj_train_dataset.c b/src/msvmmaj_train_dataset.c new file mode 100644 index 0000000..2da8bee --- /dev/null +++ b/src/msvmmaj_train_dataset.c @@ -0,0 +1,402 @@ +#include +#include + +#include "crossval.h" +#include "libMSVMMaj.h" +#include "matrix.h" +#include "msvmmaj_train.h" +#include "msvmmaj_train_dataset.h" +#include "msvmmaj_pred.h" +#include "MSVMMaj.h" +#include "util.h" +#include "timer.h" + +extern FILE *MSVMMAJ_OUTPUT_FILE; + +void make_queue(struct Training *training, struct Queue *queue, + struct MajData *train_data, struct MajData *test_data) +{ + long i, j, k, l, m; + long N, cnt = 0; + struct Task *task; + queue->i = 0; + + N = training->Np; + N *= training->Nl; + N *= training->Nk; + N *= training->Ne; + N *= training->Nw; + + queue->tasks = Malloc(struct Task *, N); + queue->N = N; + + for (i=0; iNe; i++) + for (j=0; jNw; j++) + for (k=0; kNk; k++) + for (l=0; lNl; l++) + for (m=0; mNp; m++) { + task = Malloc(struct Task, 1); + task->epsilon = training->epsilons[i]; + task->weight_idx = training->weight_idxs[j]; + task->kappa = training->kappas[k]; + task->lambda = training->lambdas[l]; + task->p = training->ps[m]; + task->train_data = train_data; + task->test_data = test_data; + task->folds = training->folds; + task->ID = cnt; + queue->tasks[cnt] = task; + cnt++; + } +} + +struct Task *get_next_task(struct Queue *q) +{ + long i = q->i; + if (i < q->N) { + q->i++; + return q->tasks[i]; + } + return NULL; +} + +int tasksort(const void *elem1, const void *elem2) +{ + const struct Task *t1 = (*(struct Task **) elem1); + const struct Task *t2 = (*(struct Task **) elem2); + return (t1->performance > t2->performance); +} + +int doublesort(const void *elem1, const void *elem2) +{ + const double t1 = (*(double *) elem1); + const double t2 = (*(double *) elem2); + return t1 > t2; +} + + +double prctile(double *values, long N, double p) +{ + long i; + double pi, pr, boundary; + double *local = Malloc(double, N); + for (i=0; itasks, q->N, sizeof(struct Task *), tasksort); + p = 0.95*q->N + 0.5; + pi = maximum(minimum(floor(p), q->N-1), 1); + pr = maximum(minimum(p - pi, 1), 0); + boundary = (1 - pr)*q->tasks[((long) pi)-1]->performance; + boundary += pr*q->tasks[((long) pi)]->performance; + note("boundary determined at: %f\n", boundary); + + N = 0; + for (i=0; iN; i++) + if (q->tasks[i]->performance >= boundary) + N++; + note("Number of items: %li\n", N); + std = Calloc(double, N); + mean = Calloc(double, N); + perf = Calloc(double, N*repeats); + + nq->tasks = Malloc(struct Task *, N); + for (i=q->N-1; i>q->N-N-1; i--) + nq->tasks[q->N-i-1] = q->tasks[i]; + nq->N = N; + nq->i = 0; + + for (i=0; in = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + + time = 0; + note("(%02li/%02li:%03li)\t", i+1, N, task->ID); + for (r=0; rtrain_data, task->folds); + loop_e = clock(); + time += elapsed_time(loop_s, loop_e); + matrix_set(perf, repeats, i, r, p); + mean[i] += p/((double) repeats); + } else { + note("Only cv is implemented\n"); + exit(1); + } + note("%3.3f\t", p); + } + for (r=0; rtasks[i]->ID, + nq->tasks[i]->weight_idx, + nq->tasks[i]->epsilon, nq->tasks[i]->p, + nq->tasks[i]->kappa, nq->tasks[i]->lambda, + mean[i], std[i]); + breakout = true; + } + p += 1.0; + } + + free(task); + free(model); + free(perf); + free(std); + free(mean); +} + +double cross_validation(struct MajModel *model, struct MajModel *seed_model, + struct MajData *data, long folds) +{ + FILE *fid; + + bool fs = false; + long f, *predy; + double total_perf = 0; + struct MajModel *fold_model; + struct MajData *train_data, *test_data; + + long *cv_idx = Calloc(long, model->n); + double *performance = Calloc(double, folds); + + if (seed_model == NULL) { + seed_model = Malloc(struct MajModel, 1); + seed_model->n = 0; // we never use anything other than V + seed_model->m = model->m; + seed_model->K = model->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + fs = true; + } + + train_data = Malloc(struct MajData, 1); + test_data = Malloc(struct MajData, 1); + + msvmmaj_make_cv_split(model->n, folds, cv_idx); + for (f=0; fn = train_data->n; + fold_model->m = train_data->m; + fold_model->K = train_data->K; + + msvmmaj_allocate_model(fold_model); + msvmmaj_initialize_weights(train_data, fold_model); + msvmmaj_seed_model_V(seed_model, fold_model); + + fid = MSVMMAJ_OUTPUT_FILE; + MSVMMAJ_OUTPUT_FILE = NULL; + msvmmaj_optimize(fold_model, train_data); + MSVMMAJ_OUTPUT_FILE = fid; + + predy = Calloc(long, test_data->n); + msvmmaj_predict_labels(test_data, fold_model, predy); + performance[f] = msvmmaj_prediction_perf(test_data, predy); + total_perf += performance[f]/((double) folds); + + msvmmaj_seed_model_V(fold_model, seed_model); + + free(predy); + free(train_data->y); + free(train_data->Z); + free(test_data->y); + free(test_data->Z); + + msvmmaj_free_model(fold_model); + } + + if (fs) + msvmmaj_free_model(seed_model); + free(train_data); + free(test_data); + free(performance); + free(cv_idx); + + return total_perf; + +} + +void start_training_cv(struct Queue *q) +{ + double perf, current_max = 0; + struct Task *task = get_next_task(q); + struct MajModel *seed_model = Malloc(struct MajModel, 1); + struct MajModel *model = Malloc(struct MajModel, 1); + clock_t main_s, main_e, loop_s, loop_e; + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + msvmmaj_allocate_model(model); + + seed_model->n = 0; + seed_model->m = task->train_data->m; + seed_model->K = task->train_data->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + + main_s = clock(); + while (task) { + note("(%03li/%03li)\tw = %li\te = %f\tp = %f\tk = %f\t l = %f\t", + task->ID+1, q->N, task->weight_idx, task->epsilon, + task->p, task->kappa, task->lambda); + make_model_from_task(task, model); + + loop_s = clock(); + perf = cross_validation(model, seed_model, task->train_data, task->folds); + loop_e = clock(); + current_max = maximum(current_max, perf); + + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, + elapsed_time(loop_s, loop_e), + current_max); + + q->tasks[task->ID]->performance = perf; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed time: %8.8f seconds\n", + elapsed_time(main_s, main_e)); + + free(task); + msvmmaj_free_model(seed_model); +} + +void start_training_tt(struct Queue *q) +{ + FILE *fid; + + long c = 0; + long *predy; + double total_perf, current_max = 0; + + struct Task *task = get_next_task(q); + struct MajModel *seed_model = Malloc(struct MajModel, 1); + + clock_t main_s, main_e; + clock_t loop_s, loop_e; + + seed_model->m = task->train_data->m; + seed_model->K = task->train_data->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + + main_s = clock(); + while (task) { + total_perf = 0; + note("(%li/%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t", + c+1, q->N, task->weight_idx, task->epsilon, + task->p, task->kappa, task->lambda); + loop_s = clock(); + struct MajModel *model = Malloc(struct MajModel, 1); + make_model_from_task(task, model); + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + + msvmmaj_allocate_model(model); + msvmmaj_initialize_weights(task->train_data, model); + msvmmaj_seed_model_V(seed_model, model); + + fid = MSVMMAJ_OUTPUT_FILE; + MSVMMAJ_OUTPUT_FILE = NULL; + msvmmaj_optimize(model, task->train_data); + MSVMMAJ_OUTPUT_FILE = fid; + + predy = Calloc(long, task->test_data->n); + msvmmaj_predict_labels(task->test_data, model, predy); + if (task->test_data->y != NULL) + total_perf = msvmmaj_prediction_perf(task->test_data, predy); + msvmmaj_seed_model_V(model, seed_model); + + msvmmaj_free_model(model); + free(predy); + note("."); + loop_e = clock(); + current_max = maximum(current_max, total_perf); + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", total_perf, + elapsed_time(loop_s, loop_e), current_max); + q->tasks[task->ID]->performance = total_perf; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed time: %8.8f seconds\n", + elapsed_time(main_s, main_e)); + free(task); + msvmmaj_free_model(seed_model); +} + +void free_queue(struct Queue *q) +{ + long i; + for (i=0; iN; i++) + free(q->tasks[i]); + free(q->tasks); + free(q); +} + +void make_model_from_task(struct Task *task, struct MajModel *model) +{ + model->weight_idx = task->weight_idx; + model->epsilon = task->epsilon; + model->p = task->p; + model->kappa = task->kappa; + model->lambda = task->lambda; +} + +void copy_model(struct MajModel *from, struct MajModel *to) +{ + to->weight_idx = from->weight_idx; + to->epsilon = from->epsilon; + to->p = from->p; + to->kappa = from->kappa; + to->lambda = from->lambda; +} -- cgit v1.2.3