/*
 * Hyperparameter search helpers for MajModel: task-queue construction over a
 * parameter grid, k-fold cross validation, and the cross-validation and
 * train/test evaluation loops that fill in each task's performance.
 *
 * NOTE(review): this copy of the file looks damaged. Line breaks have been
 * collapsed, and text appears to be missing wherever a '<' was directly
 * followed by a non-space character (the names of the first two #include
 * directives, most loop bounds, the end of prctile and the start of the
 * function that follows it). Restore from the original before building.
 * In this collapsed layout the single "//" comment inside cross_validation
 * also runs to the end of its physical line.
 */
#include #include #include "crossval.h" #include "libMSVMMaj.h" #include "matrix.h" #include "msvmmaj_train.h" #include "msvmmaj_train_dataset.h" #include "msvmmaj_pred.h" #include "MSVMMaj.h" #include "util.h" #include "timer.h" extern FILE *MSVMMAJ_OUTPUT_FILE; /* make_queue: fills 'queue' with one heap-allocated Task per combination of training->epsilons, weight_idxs, kappas, lambdas and ps. queue->N is set to the product Np, Nl, Nk, Ne, Nw and queue->i is reset to 0. Every task points at the same train_data and test_data, takes training->folds, and gets an ID equal to its index in queue->tasks. The five loop bounds are garbled in this copy; presumably i < training->Ne and so on -- TODO confirm. */ void make_queue(struct Training *training, struct Queue *queue, struct MajData *train_data, struct MajData *test_data) { long i, j, k, l, m; long N, cnt = 0; struct Task *task; queue->i = 0; N = training->Np; N *= training->Nl; N *= training->Nk; N *= training->Ne; N *= training->Nw; queue->tasks = Malloc(struct Task *, N); queue->N = N; for (i=0; iNe; i++) for (j=0; jNw; j++) for (k=0; kNk; k++) for (l=0; lNl; l++) for (m=0; mNp; m++) { task = Malloc(struct Task, 1); task->epsilon = training->epsilons[i]; task->weight_idx = training->weight_idxs[j]; task->kappa = training->kappas[k]; task->lambda = training->lambdas[l]; task->p = training->ps[m]; task->train_data = train_data; task->test_data = test_data; task->folds = training->folds; task->ID = cnt; queue->tasks[cnt] = task; cnt++; } } /* get_next_task: returns q->tasks[q->i] and advances q->i; returns NULL once q->i has reached q->N. */ struct Task *get_next_task(struct Queue *q) { long i = q->i; if (i < q->N) { q->i++; return q->tasks[i]; } return NULL; } /* tasksort: qsort comparator for an array of pointers to struct Task, comparing on the performance field. NOTE(review): the result is only ever 1 or 0, never negative, while qsort expects a negative/zero/positive three-way result; the conventional form is (a > b) - (a < b). */ int tasksort(const void *elem1, const void *elem2) { const struct Task *t1 = (*(struct Task **) elem1); const struct Task *t2 = (*(struct Task **) elem2); return (t1->performance > t2->performance); } /* doublesort: qsort comparator for an array of doubles. NOTE(review): like tasksort it returns only 1 or 0 and never a negative value. */ int doublesort(const void *elem1, const void *elem2) { const double t1 = (*(double *) elem1); const double t2 = (*(double *) elem2); return t1 > t2; } /* prctile: starts by allocating a scratch buffer 'local' of N doubles. NOTE(review): everything after "for (i=0; i" is missing in this copy, so the rest of this function, and the signature and declarations of the function that follows it, cannot be read here. */ double prctile(double *values, long N, double p) { long i; double pi, pr, boundary; double *local = Malloc(double, N); for (i=0; itasks, q->N, sizeof(struct Task *), tasksort); /* Fragment of a later function (presumably the routine that re-runs the best tasks 'repeats' times -- TODO confirm against the original). The garbled text just before this point ends in a call that sorts q->tasks with tasksort. Below, 'boundary' is interpolated between the performances at sorted positions pi-1 and pi, where pi and pr are the clamped integer and fractional parts of 0.95*q->N + 0.5. */ p = 0.95*q->N + 0.5; pi = maximum(minimum(floor(p), q->N-1), 1); pr = maximum(minimum(p - pi, 1), 0); boundary = (1 - pr)*q->tasks[((long) pi)-1]->performance; boundary += pr*q->tasks[((long) pi)]->performance; note("boundary determined at: %f\n", boundary); /* Count the tasks whose performance reaches the boundary (loop bound garbled; presumably i < q->N). */ N = 0; for (i=0; iN; i++)
if (q->tasks[i]->performance >= boundary) N++; note("Number of items: %li\n", N); /* Result buffers: one mean and one std slot per selected task, and N*repeats entries in perf. nq then takes N tasks from the tail of the sorted q->tasks, walking backwards so that nq->tasks[0] is q->tasks[q->N-1]. */ std = Calloc(double, N); mean = Calloc(double, N); perf = Calloc(double, N*repeats); nq->tasks = Malloc(struct Task *, N); for (i=q->N-1; i>q->N-N-1; i--) nq->tasks[q->N-i-1] = q->tasks[i]; nq->N = N; nq->i = 0; /* NOTE(review): the heads of the loops below are garbled in this copy. What is visible: per task the model dimensions are taken from task->train_data, each repeat is timed with clock(), its score p is stored with matrix_set(perf, repeats, i, r, p) and accumulated into mean[i] as p/repeats; any non-cv mode logs a message and calls exit(1). */ for (i=0; in = task->train_data->n; model->m = task->train_data->m; model->K = task->train_data->K; time = 0; note("(%02li/%02li:%03li)\t", i+1, N, task->ID); for (r=0; rtrain_data, task->folds); loop_e = clock(); time += elapsed_time(loop_s, loop_e); matrix_set(perf, repeats, i, r, p); mean[i] += p/((double) repeats); } else { note("Only cv is implemented\n"); exit(1); } note("%3.3f\t", p); } for (r=0; rtasks[i]->ID, nq->tasks[i]->weight_idx, nq->tasks[i]->epsilon, nq->tasks[i]->p, nq->tasks[i]->kappa, nq->tasks[i]->lambda, mean[i], std[i]); breakout = true; } p += 1.0; } free(task); free(model); free(perf); free(std); free(mean); } /* cross_validation: evaluates the settings in 'model' over 'folds' folds of 'data' and returns the sum of the per-fold msvmmaj_prediction_perf values, each divided by folds. cv_idx (model->n longs) is filled by msvmmaj_make_cv_split. If seed_model is NULL a temporary one is allocated with m and K taken from model, passed to msvmmaj_seed_model_V(NULL, ...), and released with msvmmaj_free_model before returning. In each fold msvmmaj_seed_model_V(seed_model, fold_model) is called before msvmmaj_optimize and msvmmaj_seed_model_V(fold_model, seed_model) after prediction, presumably carrying V forward as a warm start -- TODO confirm. MSVMMAJ_OUTPUT_FILE is set to NULL around msvmmaj_optimize and restored right after. The y and Z members of train_data and test_data are freed at the end of every fold. NOTE(review): the head of the fold loop, where the split and fold_model are set up, is garbled in this copy. */ double cross_validation(struct MajModel *model, struct MajModel *seed_model, struct MajData *data, long folds) { FILE *fid; bool fs = false; long f, *predy; double total_perf = 0; struct MajModel *fold_model; struct MajData *train_data, *test_data; long *cv_idx = Calloc(long, model->n); double *performance = Calloc(double, folds); if (seed_model == NULL) { seed_model = Malloc(struct MajModel, 1); seed_model->n = 0; // we never use anything other than V seed_model->m = model->m; seed_model->K = model->K; msvmmaj_allocate_model(seed_model); msvmmaj_seed_model_V(NULL, seed_model); fs = true; } train_data = Malloc(struct MajData, 1); test_data = Malloc(struct MajData, 1); msvmmaj_make_cv_split(model->n, folds, cv_idx); for (f=0; fn = train_data->n; fold_model->m = train_data->m; fold_model->K = train_data->K; msvmmaj_allocate_model(fold_model); msvmmaj_initialize_weights(train_data, fold_model); msvmmaj_seed_model_V(seed_model, fold_model); fid = MSVMMAJ_OUTPUT_FILE; MSVMMAJ_OUTPUT_FILE = NULL; msvmmaj_optimize(fold_model, train_data);
MSVMMAJ_OUTPUT_FILE = fid; predy = Calloc(long, test_data->n); msvmmaj_predict_labels(test_data, fold_model, predy); performance[f] = msvmmaj_prediction_perf(test_data, predy); total_perf += performance[f]/((double) folds); msvmmaj_seed_model_V(fold_model, seed_model); free(predy); free(train_data->y); free(train_data->Z); free(test_data->y); free(test_data->Z); msvmmaj_free_model(fold_model); } if (fs) msvmmaj_free_model(seed_model); free(train_data); free(test_data); free(performance); free(cv_idx); return total_perf; } /* start_training_cv: runs cross_validation for every task still pending in q, logs the score, elapsed time and running best, and stores the score in q->tasks[task->ID]->performance. One 'model' and one 'seed_model' are allocated up front from the first task's train_data dimensions and reused for all tasks; only the hyperparameters are refreshed per task through make_model_from_task. NOTE(review): (1) 'task' is dereferenced before the while (task) check, so an empty queue would dereference NULL; (2) 'model' is allocated here but never released, only seed_model is passed to msvmmaj_free_model; (3) the loop only ends when task is NULL, so the final free(task) is always free(NULL). */ void start_training_cv(struct Queue *q) { double perf, current_max = 0; struct Task *task = get_next_task(q); struct MajModel *seed_model = Malloc(struct MajModel, 1); struct MajModel *model = Malloc(struct MajModel, 1); clock_t main_s, main_e, loop_s, loop_e; model->n = task->train_data->n; model->m = task->train_data->m; model->K = task->train_data->K; msvmmaj_allocate_model(model); seed_model->n = 0; seed_model->m = task->train_data->m; seed_model->K = task->train_data->K; msvmmaj_allocate_model(seed_model); msvmmaj_seed_model_V(NULL, seed_model); main_s = clock(); while (task) { note("(%03li/%03li)\tw = %li\te = %f\tp = %f\tk = %f\t l = %f\t", task->ID+1, q->N, task->weight_idx, task->epsilon, task->p, task->kappa, task->lambda); make_model_from_task(task, model); loop_s = clock(); perf = cross_validation(model, seed_model, task->train_data, task->folds); loop_e = clock(); current_max = maximum(current_max, perf); note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, elapsed_time(loop_s, loop_e), current_max); q->tasks[task->ID]->performance = perf; task = get_next_task(q); } main_e = clock(); note("\nTotal elapsed time: %8.8f seconds\n", elapsed_time(main_s, main_e)); free(task); msvmmaj_free_model(seed_model); } /* start_training_tt: train/test counterpart of start_training_cv. For every pending task a fresh model is allocated, optimized on task->train_data with MSVMMAJ_OUTPUT_FILE set to NULL, and used to predict task->test_data; the score from msvmmaj_prediction_perf is stored in q->tasks[task->ID]->performance and stays 0 when task->test_data->y is NULL. msvmmaj_seed_model_V is called on seed_model and model before and after optimizing, as in cross_validation. NOTE(review): (1) 'task' is dereferenced before the while (task) check; (2) seed_model->n is never assigned before msvmmaj_allocate_model, whereas start_training_cv and cross_validation set it to 0; (3) 'c' is never incremented, so the logged progress counter c+1 is always 1. */ void start_training_tt(struct Queue *q) { FILE *fid; long c = 0; long *predy; double total_perf, current_max = 0; struct Task *task = get_next_task(q); struct MajModel *seed_model = Malloc(struct MajModel, 1); clock_t main_s, main_e;
clock_t loop_s, loop_e; seed_model->m = task->train_data->m; seed_model->K = task->train_data->K; msvmmaj_allocate_model(seed_model); msvmmaj_seed_model_V(NULL, seed_model); main_s = clock(); while (task) { total_perf = 0; note("(%li/%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t", c+1, q->N, task->weight_idx, task->epsilon, task->p, task->kappa, task->lambda); loop_s = clock(); struct MajModel *model = Malloc(struct MajModel, 1); make_model_from_task(task, model); model->n = task->train_data->n; model->m = task->train_data->m; model->K = task->train_data->K; msvmmaj_allocate_model(model); msvmmaj_initialize_weights(task->train_data, model); msvmmaj_seed_model_V(seed_model, model); fid = MSVMMAJ_OUTPUT_FILE; MSVMMAJ_OUTPUT_FILE = NULL; msvmmaj_optimize(model, task->train_data); MSVMMAJ_OUTPUT_FILE = fid; predy = Calloc(long, task->test_data->n); msvmmaj_predict_labels(task->test_data, model, predy); if (task->test_data->y != NULL) total_perf = msvmmaj_prediction_perf(task->test_data, predy); msvmmaj_seed_model_V(model, seed_model); msvmmaj_free_model(model); free(predy); note("."); loop_e = clock(); current_max = maximum(current_max, total_perf); note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", total_perf, elapsed_time(loop_s, loop_e), current_max); q->tasks[task->ID]->performance = total_perf; task = get_next_task(q); } main_e = clock(); note("\nTotal elapsed time: %8.8f seconds\n", elapsed_time(main_s, main_e)); free(task); msvmmaj_free_model(seed_model); } /* free_queue: frees every q->tasks[i], then the tasks array, then q itself. The train_data and test_data that the tasks point at are not freed here. The loop bound is garbled in this copy; presumably i < q->N -- TODO confirm. */ void free_queue(struct Queue *q) { long i; for (i=0; iN; i++) free(q->tasks[i]); free(q->tasks); free(q); } /* make_model_from_task: copies the five hyperparameters weight_idx, epsilon, p, kappa and lambda from task into model; no other model field is written. */ void make_model_from_task(struct Task *task, struct MajModel *model) { model->weight_idx = task->weight_idx; model->epsilon = task->epsilon; model->p = task->p; model->kappa = task->kappa; model->lambda = task->lambda; } /* copy_model: copies the same five hyperparameters (weight_idx, epsilon, p, kappa, lambda) from 'from' to 'to'; n, m, K and any allocated members are left untouched. */ void copy_model(struct MajModel *from, struct MajModel *to) { to->weight_idx = from->weight_idx; to->epsilon = from->epsilon; to->p = from->p; to->kappa = from->kappa; to->lambda =
from->lambda; }