Diffstat (limited to 'src/msvmmaj_train_dataset.c')
-rw-r--r--	src/msvmmaj_train_dataset.c	406
1 file changed, 358 insertions(+), 48 deletions(-)
diff --git a/src/msvmmaj_train_dataset.c b/src/msvmmaj_train_dataset.c
index 2da8bee..4f5f4d9 100644
--- a/src/msvmmaj_train_dataset.c
+++ b/src/msvmmaj_train_dataset.c
@@ -1,22 +1,53 @@
+/**
+ * @file msvmmaj_train_dataset.c
+ * @author Gertjan van den Burg
+ * @date January, 2014
+ * @brief Functions for finding the optimal parameters for the dataset
+ *
+ * @details
+ * The MSVMMaj algorithm takes a number of parameters. The functions in
+ * this file are used to find the optimal parameters.
+ */
+
#include <math.h>
#include <time.h>
#include "crossval.h"
#include "libMSVMMaj.h"
-#include "matrix.h"
+#include "msvmmaj.h"
+#include "msvmmaj_init.h"
+#include "msvmmaj_matrix.h"
#include "msvmmaj_train.h"
#include "msvmmaj_train_dataset.h"
#include "msvmmaj_pred.h"
-#include "MSVMMaj.h"
#include "util.h"
#include "timer.h"
extern FILE *MSVMMAJ_OUTPUT_FILE;
+/**
+ * @brief Initialize a Queue from a Training instance
+ *
+ * @details
+ * A Training instance describes the grid to search over. This function
+ * creates all tasks that need to be performed and adds these to
+ * a Queue. Each task contains a pointer to the train and test datasets
+ * which are supplied. Note that the tasks are created in a specific order of
+ * the parameters, to ensure that the MajModel::V of a previous parameter
+ * set provides the best possible initial estimate of MajModel::V for the next
+ * parameter set.
+ *
+ * @param[in] training Training struct describing the grid search
+ * @param[in] queue pointer to a Queue that will be used to
+ * add the tasks to
+ * @param[in] train_data MajData of the training set
+ * @param[in] test_data MajData of the test set
+ *
+ */
void make_queue(struct Training *training, struct Queue *queue,
struct MajData *train_data, struct MajData *test_data)
{
- long i, j, k, l, m;
+ long i, j, k;
long N, cnt = 0;
struct Task *task;
queue->i = 0;
@@ -26,30 +57,122 @@ void make_queue(struct Training *training, struct Queue *queue,
N *= training->Nk;
N *= training->Ne;
N *= training->Nw;
+	// these parameter counts can be zero, so only multiply them in if positive
+ N *= training->Ng > 0 ? training->Ng : 1;
+ N *= training->Nc > 0 ? training->Nc : 1;
+ N *= training->Nd > 0 ? training->Nd : 1;
queue->tasks = Malloc(struct Task *, N);
queue->N = N;
- for (i=0; i<training->Ne; i++)
+ // initialize all tasks
+ for (i=0; i<N; i++) {
+ task = Malloc(struct Task, 1);
+ task->ID = i;
+ task->train_data = train_data;
+ task->test_data = test_data;
+ task->folds = training->folds;
+ task->kerneltype = training->kerneltype;
+ task->kernel_param = Calloc(double, training->Ng +
+ training->Nc + training->Nd);
+ queue->tasks[i] = task;
+ }
+
+	// These loops mimic a large nested for loop. The advantage is that
+	// Nd, Nc and Ng, which would be on the outside of the nested for
+	// loop, can now be zero without large modifications (see below).
+	// Whether this is indeed better than the nested for loop has not
+	// been tested.
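+	// As a hypothetical illustration: with Np = 2, Nl = 3 and all
+	// other counts equal to 1, we get N = 6 and the loops below fill
+	// the tasks as
+	//   task index: 0 1 2 3 4 5
+	//   p index:    0 1 0 1 0 1  (repeats with period Np)
+	//   l index:    0 0 1 1 2 2  (held for cnt = Np consecutive tasks)
+	// so consecutive tasks differ in as few parameters as possible.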
+ cnt = 1;
+ i = 0;
+	while (i < N)
+ for (j=0; j<training->Np; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->p = training->ps[j];
+ i++;
+ }
+
+ cnt *= training->Np;
+ i = 0;
+	while (i < N)
+ for (j=0; j<training->Nl; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->lambda =
+ training->lambdas[j];
+ i++;
+ }
+
+ cnt *= training->Nl;
+ i = 0;
+	while (i < N)
+ for (j=0; j<training->Nk; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->kappa = training->kappas[j];
+ i++;
+ }
+
+ cnt *= training->Nk;
+ i = 0;
+	while (i < N)
for (j=0; j<training->Nw; j++)
- for (k=0; k<training->Nk; k++)
- for (l=0; l<training->Nl; l++)
- for (m=0; m<training->Np; m++) {
- task = Malloc(struct Task, 1);
- task->epsilon = training->epsilons[i];
- task->weight_idx = training->weight_idxs[j];
- task->kappa = training->kappas[k];
- task->lambda = training->lambdas[l];
- task->p = training->ps[m];
- task->train_data = train_data;
- task->test_data = test_data;
- task->folds = training->folds;
- task->ID = cnt;
- queue->tasks[cnt] = task;
- cnt++;
- }
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->weight_idx =
+ training->weight_idxs[j];
+ i++;
+ }
+
+ cnt *= training->Nw;
+ i = 0;
+	while (i < N)
+ for (j=0; j<training->Ne; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->epsilon =
+ training->epsilons[j];
+ i++;
+ }
+
+ cnt *= training->Ne;
+ i = 0;
+ while (i < N && training->Ng > 0)
+ for (j=0; j<training->Ng; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->kernel_param[0] =
+ training->gammas[j];
+ i++;
+ }
+
+ cnt *= training->Ng > 0 ? training->Ng : 1;
+ i = 0;
+ while (i < N && training->Nc > 0)
+ for (j=0; j<training->Nc; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->kernel_param[1] =
+ training->coefs[j];
+ i++;
+ }
+
+	cnt *= training->Nc > 0 ? training->Nc : 1;
+ i = 0;
+ while (i < N && training->Nd > 0)
+ for (j=0; j<training->Nd; j++)
+ for (k=0; k<cnt; k++) {
+ queue->tasks[i]->kernel_param[2] =
+ training->degrees[j];
+ i++;
+ }
}
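
The stacked loops above effectively enumerate the grid as a mixed-radix
number: the parameter indices of task i are the digits of i with radices
(Np, Nl, Nk, Nw, Ne, Ng, Nc, Nd), least significant first. A minimal
sketch of the same decoding with div/mod (the helper name is hypothetical
and assumes all counts are at least 1):

	void decode_task(long i, struct Training *t,
			long *ip, long *il, long *ik)
	{
		// least significant digit varies fastest, as in make_queue()
		*ip = i % t->Np; i /= t->Np;
		*il = i % t->Nl; i /= t->Nl;
		*ik = i % t->Nk; i /= t->Nk;
		// ... and so on for Nw, Ne, Ng, Nc and Nd
	}
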
+/**
+ * @brief Get new Task from Queue
+ *
+ * @details
+ * Return a pointer to the next Task in the Queue. If no Task instances are
+ * left, NULL is returned. The internal counter Queue::i is used for finding
+ * the next Task.
+ *
+ * @param[in] q Queue instance
+ * @returns pointer to next Task
+ *
+ */
struct Task *get_next_task(struct Queue *q)
{
long i = q->i;
@@ -60,6 +183,19 @@ struct Task *get_next_task(struct Queue *q)
return NULL;
}
+/**
+ * @brief Comparison function for Tasks based on performance
+ *
+ * @details
+ * To be able to sort Task structs on the performance of their specific
+ * set of parameters, this comparison function is implemented. With
+ * qsort() this sorts the tasks in ascending order of performance, so
+ * that the best performing tasks end up at the end of the array.
+ *
+ * @param[in] elem1 Task 1
+ * @param[in] elem2 Task 2
+ * @returns 		1 if the performance of Task 1 is larger than
+ * 			that of Task 2, 0 otherwise
+ */
int tasksort(const void *elem1, const void *elem2)
{
const struct Task *t1 = (*(struct Task **) elem1);
@@ -67,6 +203,16 @@ int tasksort(const void *elem1, const void *elem2)
return (t1->performance > t2->performance);
}
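
Note that qsort() expects a comparator returning a negative, zero, or
positive value. Because tasksort() returns only 0 or 1 it reports
t1 < t2 as equality, which may leave the array incompletely sorted on
some implementations. A stricter three-way variant (a sketch, not part
of this commit) would be:

	int tasksort_strict(const void *elem1, const void *elem2)
	{
		const struct Task *t1 = (*(struct Task **) elem1);
		const struct Task *t2 = (*(struct Task **) elem2);
		if (t1->performance < t2->performance)
			return -1;
		return (t1->performance > t2->performance);
	}
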
+/**
+ * @brief Comparison function for doubles
+ *
+ * @details
+ * Similar to tasksort(), only now for two doubles.
+ *
+ * @param[in] elem1 number 1
+ * @param[in] elem2 number 2
+ * @returns 		1 if number 1 is larger than number 2, 0 otherwise
+ */
int doublesort(const void *elem1, const void *elem2)
{
const double t1 = (*(double *) elem1);
@@ -74,7 +220,20 @@ int doublesort(const void *elem1, const void *elem2)
return t1 > t2;
}
-
+/**
+ * @brief Calculate the percentile of an array of doubles
+ *
+ * @details
+ * The percentile of performance is used to find the top performing
+ * configurations. Since no standard definition of the percentile exists, we
+ * use the method used in MATLAB and Octave. Since calculating the percentile
+ * requires a sorted list of the values, a local copy is made first.
+ *
+ * @param[in] values array of doubles
+ * @param[in] N length of the array
+ * @param[in] 	p 	percentile to calculate (0 <= p <= 1.0).
+ * @returns the p-th percentile of the values
+ */
double prctile(double *values, long N, double p)
{
long i;
@@ -94,16 +253,50 @@ double prctile(double *values, long N, double p)
return boundary;
}
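
The interpolation rule, which also appears inline in consistency_repeats()
below, amounts to the following standalone sketch (assuming the input
array is already sorted ascending; the helper name is hypothetical):

	#include <math.h>

	static double prctile_sorted(const double *v, long N, double p)
	{
		double x = p*N + 0.5;	/* 1-based fractional rank */
		double lo = fmax(fmin(floor(x), (double) N - 1.0), 1.0);
		/* clamp the weight so p = 0 and p = 1 give min and max */
		double w = fmax(fmin(x - lo, 1.0), 0.0);
		return (1.0 - w)*v[(long) lo - 1] + w*v[(long) lo];
	}
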
+/**
+ * @brief Run repeats of the Task structs in Queue to find the best
+ * configuration
+ *
+ * @details
+ * The best performing tasks in the supplied Queue are found by taking those
+ * Task structs that have a performance greater than or equal to the 95th
+ * percentile of the performance of all tasks. These tasks are then
+ * gathered in a new Queue. For each of the tasks in this new Queue the
+ * cross validation run is repeated a number of times.
+ *
+ * For each of the Task configurations that are repeated the mean performance,
+ * standard deviation of the performance and the mean computation time are
+ * reported.
+ *
+ * Finally, the overall best tasks are written to the specified output. These
+ * tasks are selected to have both the highest mean performance, as well as the
+ * smallest standard deviation in their performance. This is done as follows.
+ * First the 99th percentile of task performance and the 1st percentile of
+ * standard deviation are calculated. If a task exists whose mean
+ * performance over the repeats and standard deviation match these values
+ * respectively, this task is considered the best and is written to the
+ * output. If no such task exists, the 98th percentile of performance and
+ * the 2nd percentile of standard deviation are considered. This widening
+ * is repeated until an interval is found that contains one or more tasks,
+ * at which point the loop stops.
+ *
+ * @param[in] q Queue of Task structs which have already been
+ * run and have a Task::performance value
+ * @param[in] repeats Number of times to repeat the best
+ * configurations for consistency
+ * @param[in] traintype type of training to do (CV or TT)
+ *
+ */
void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
{
long i, r, N;
double p, pi, pr, boundary, time, *std, *mean, *perf;
struct Queue *nq = Malloc(struct Queue, 1);
- struct MajModel *model = Malloc(struct MajModel, 1);
+ struct MajModel *model = msvmmaj_init_model();
struct Task *task = Malloc(struct Task, 1);
clock_t loop_s, loop_e;
- // calculate the percentile (Matlab style)
+ // calculate the performance percentile (Matlab style)
qsort(q->tasks, q->N, sizeof(struct Task *), tasksort);
p = 0.95*q->N + 0.5;
pi = maximum(minimum(floor(p), q->N-1), 1);
@@ -111,7 +304,9 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
boundary = (1 - pr)*q->tasks[((long) pi)-1]->performance;
boundary += pr*q->tasks[((long) pi)]->performance;
note("boundary determined at: %f\n", boundary);
-
+
+	// find the number of tasks that perform at least as well as the
+	// 95th percentile
N = 0;
for (i=0; i<q->N; i++)
if (q->tasks[i]->performance >= boundary)
@@ -121,12 +316,14 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
mean = Calloc(double, N);
perf = Calloc(double, N*repeats);
+ // create a new task queue with the tasks which perform well
nq->tasks = Malloc(struct Task *, N);
for (i=q->N-1; i>q->N-N-1; i--)
nq->tasks[q->N-i-1] = q->tasks[i];
nq->N = N;
nq->i = 0;
+ // for each task run the consistency repeats
for (i=0; i<N; i++) {
task = get_next_task(nq);
make_model_from_task(task, model);
@@ -140,7 +337,8 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
for (r=0; r<repeats; r++) {
if (traintype == CV) {
loop_s = clock();
- p = cross_validation(model, NULL, task->train_data, task->folds);
+ p = cross_validation(model, NULL,
+ task->train_data, task->folds);
loop_e = clock();
time += elapsed_time(loop_s, loop_e);
matrix_set(perf, repeats, i, r, p);
@@ -152,15 +350,24 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
note("%3.3f\t", p);
}
for (r=0; r<repeats; r++) {
- std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i], 2);
+			std[i] += pow(matrix_get(perf, repeats, i, r)
+					- mean[i], 2.0);
}
std[i] /= ((double) repeats) - 1.0;
std[i] = sqrt(std[i]);
- note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], time);
+ note("(m = %3.3f, s = %3.3f, t = %3.3f)\n",
+ mean[i], std[i], time);
}
+ // find the best overall configurations: those with high average
+ // performance and low deviation in the performance
note("\nBest overall configuration(s):\n");
- note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\tmean_perf\tstd_perf\n");
+ note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t"
+ "mean_perf\tstd_perf\n");
p = 0.0;
bool breakout = false;
while (breakout == false) {
@@ -168,13 +375,17 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
pr = prctile(std, N, p/100.0);
for (i=0; i<N; i++)
if ((pi - mean[i] < 0.0001) && (std[i] - pr < 0.0001)) {
- note("(%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t"
+ note("(%li)\tw = %li\te = %f\tp = %f\t"
+ "k = %f\tl = %f\t"
"mean: %3.3f\tstd: %3.3f\n",
nq->tasks[i]->ID,
nq->tasks[i]->weight_idx,
- nq->tasks[i]->epsilon, nq->tasks[i]->p,
- nq->tasks[i]->kappa, nq->tasks[i]->lambda,
- mean[i], std[i]);
+ nq->tasks[i]->epsilon,
+ nq->tasks[i]->p,
+ nq->tasks[i]->kappa,
+ nq->tasks[i]->lambda,
+ mean[i],
+ std[i]);
breakout = true;
}
p += 1.0;
@@ -187,6 +398,30 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype)
free(mean);
}
+/**
+ * @brief Run cross validation with a seed model
+ *
+ * @details
+ * This is an implementation of cross validation which uses the optimal
+ * parameters MajModel::V of a previous fold as initial conditions for
+ * MajModel::V of the next fold. An initial seed for V can be given through the
+ * seed_model parameter. If seed_model is NULL, random starting values are
+ * used.
+ *
+ * @todo
+ * The seed model shouldn't have to be allocated completely, since only V is
+ * used.
+ * @todo
+ * There must be some inefficiencies here because the fold model is allocated
+ * at every fold. This would be detrimental with large datasets.
+ *
+ * @param[in] model MajModel with the configuration to train
+ * @param[in] seed_model MajModel with a seed for MajModel::V
+ * @param[in] data MajData with the dataset
+ * @param[in] folds number of cross validation folds
+ * @returns performance (hitrate) of the configuration on
+ * cross validation
+ */
double cross_validation(struct MajModel *model, struct MajModel *seed_model,
struct MajData *data, long folds)
{
@@ -202,7 +437,7 @@ double cross_validation(struct MajModel *model, struct MajModel *seed_model,
double *performance = Calloc(double, folds);
if (seed_model == NULL) {
- seed_model = Malloc(struct MajModel, 1);
+ seed_model = msvmmaj_init_model();
seed_model->n = 0; // we never use anything other than V
seed_model->m = model->m;
seed_model->K = model->K;
@@ -211,34 +446,40 @@ double cross_validation(struct MajModel *model, struct MajModel *seed_model,
fs = true;
}
- train_data = Malloc(struct MajData, 1);
- test_data = Malloc(struct MajData, 1);
-
+ train_data = msvmmaj_init_data();
+ test_data = msvmmaj_init_data();
+ // create splits
msvmmaj_make_cv_split(model->n, folds, cv_idx);
+
for (f=0; f<folds; f++) {
msvmmaj_get_tt_split(data, train_data, test_data, cv_idx, f);
-
- fold_model = Malloc(struct MajModel, 1);
+ // initialize a model for this fold and copy the model
+ // parameters
+ fold_model = msvmmaj_init_model();
copy_model(model, fold_model);
fold_model->n = train_data->n;
fold_model->m = train_data->m;
fold_model->K = train_data->K;
-
+
+ // allocate, initialize and seed the fold model
msvmmaj_allocate_model(fold_model);
msvmmaj_initialize_weights(train_data, fold_model);
msvmmaj_seed_model_V(seed_model, fold_model);
-
+
+ // train the model (without output)
fid = MSVMMAJ_OUTPUT_FILE;
MSVMMAJ_OUTPUT_FILE = NULL;
msvmmaj_optimize(fold_model, train_data);
MSVMMAJ_OUTPUT_FILE = fid;
+ // calculate predictive performance on test set
predy = Calloc(long, test_data->n);
msvmmaj_predict_labels(test_data, fold_model, predy);
performance[f] = msvmmaj_prediction_perf(test_data, predy);
total_perf += performance[f]/((double) folds);
+ // seed the seed model with the fold model
msvmmaj_seed_model_V(fold_model, seed_model);
free(predy);
@@ -250,6 +491,7 @@ double cross_validation(struct MajModel *model, struct MajModel *seed_model,
msvmmaj_free_model(fold_model);
}
+ // if a seed model was allocated before, free it.
if (fs)
msvmmaj_free_model(seed_model);
free(train_data);
@@ -261,12 +503,28 @@ double cross_validation(struct MajModel *model, struct MajModel *seed_model,
}
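
A minimal usage sketch (the variable names are hypothetical, and the
model is assumed to have its dimensions and hyperparameters set):

	struct MajModel *model = msvmmaj_init_model();
	/* ... set model->n, model->m, model->K and the parameters ... */
	double hitrate = cross_validation(model, NULL, data, 10);
	/* with seed_model == NULL, a temporary seed model is allocated
	 * and freed internally */
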
+/**
+ * @brief Run the grid search for a cross validation dataset
+ *
+ * @details
+ * Given a Queue of Task structs to be trained, a grid search is launched to
+ * find the optimal parameter configuration. As is also done within
+ * cross_validation(), the optimal weights of one parameter set are used as
+ * initial estimates for MajModel::V in the next parameter set. Note that to
+ * optimally exploit this feature of the optimization algorithm, the order
+ * in which tasks are considered is important. This ordering is taken care
+ * of in make_queue().
+ *
+ * The performance found by cross validation is stored in the Task struct.
+ *
+ * @param[in,out] q Queue with Task instances to run
+ */
void start_training_cv(struct Queue *q)
{
double perf, current_max = 0;
struct Task *task = get_next_task(q);
- struct MajModel *seed_model = Malloc(struct MajModel, 1);
- struct MajModel *model = Malloc(struct MajModel, 1);
+ struct MajModel *seed_model = msvmmaj_init_model();
+ struct MajModel *model = msvmmaj_init_model();
clock_t main_s, main_e, loop_s, loop_e;
model->n = task->train_data->n;
@@ -282,13 +540,16 @@ void start_training_cv(struct Queue *q)
main_s = clock();
while (task) {
- note("(%03li/%03li)\tw = %li\te = %f\tp = %f\tk = %f\t l = %f\t",
- task->ID+1, q->N, task->weight_idx, task->epsilon,
+ note("(%03li/%03li)\tw = %li\te = %f\tp = %f\tk = %f\t "
+ "l = %f\t",
+ task->ID+1, q->N, task->weight_idx,
+ task->epsilon,
task->p, task->kappa, task->lambda);
make_model_from_task(task, model);
loop_s = clock();
- perf = cross_validation(model, seed_model, task->train_data, task->folds);
+ perf = cross_validation(model, seed_model, task->train_data,
+ task->folds);
loop_e = clock();
current_max = maximum(current_max, perf);
@@ -308,6 +569,23 @@ void start_training_cv(struct Queue *q)
msvmmaj_free_model(seed_model);
}
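
Putting the pieces together, a grid search driver could look roughly as
follows (a sketch; it assumes the Training struct and the datasets have
been filled in elsewhere, and the repeat count of 5 is arbitrary):

	struct Queue *q = Malloc(struct Queue, 1);
	make_queue(training, q, train_data, test_data);
	start_training_cv(q);		/* fills Task::performance */
	consistency_repeats(q, 5, CV);	/* re-run the top configurations */
	free_queue(q);
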
+/**
+ * @brief Run the grid search for a train/test dataset
+ *
+ * @details
+ * This function is similar to start_training_cv(), except that the
+ * pre-determined training set is used only once, and the pre-determined test
+ * set is used for validation.
+ *
+ * @todo
+ * It would probably be better to train the model on the training set using
+ * cross validation and only use the test set when comparing with other
+ * methods. The way it is now, you're finding out which parameters predict
+ * _this_ test set best, which is not what you want.
+ *
+ * @param[in] q Queue with Task structs to run
+ *
+ */
void start_training_tt(struct Queue *q)
{
FILE *fid;
@@ -317,7 +595,7 @@ void start_training_tt(struct Queue *q)
double total_perf, current_max = 0;
struct Task *task = get_next_task(q);
- struct MajModel *seed_model = Malloc(struct MajModel, 1);
+ struct MajModel *seed_model = msvmmaj_init_model();
clock_t main_s, main_e;
clock_t loop_s, loop_e;
@@ -334,7 +612,7 @@ void start_training_tt(struct Queue *q)
c+1, q->N, task->weight_idx, task->epsilon,
task->p, task->kappa, task->lambda);
loop_s = clock();
- struct MajModel *model = Malloc(struct MajModel, 1);
+ struct MajModel *model = msvmmaj_init_model();
make_model_from_task(task, model);
model->n = task->train_data->n;
@@ -374,15 +652,37 @@ void start_training_tt(struct Queue *q)
msvmmaj_free_model(seed_model);
}
+/**
+ * @brief Free the Queue struct
+ *
+ * @details
+ * Freeing the allocated memory of the Queue means freeing every Task struct
+ * and then freeing the Queue.
+ *
+ * @param[in] q Queue to be freed
+ *
+ */
void free_queue(struct Queue *q)
{
long i;
- for (i=0; i<q->N; i++)
+ for (i=0; i<q->N; i++) {
+ free(q->tasks[i]->kernel_param);
free(q->tasks[i]);
+ }
free(q->tasks);
free(q);
}
+/**
+ * @brief Copy parameters from Task to MajModel
+ *
+ * @details
+ * A Task struct holds the parameters with which a MajModel is to be
+ * estimated. This function copies these parameters to a MajModel instance.
+ *
+ * @param[in] task Task instance with parameters
+ * @param[in,out] model MajModel to which the parameters are copied
+ */
void make_model_from_task(struct Task *task, struct MajModel *model)
{
model->weight_idx = task->weight_idx;
@@ -392,6 +692,16 @@ void make_model_from_task(struct Task *task, struct MajModel *model)
model->lambda = task->lambda;
}
+/**
+ * @brief Copy model parameters between two MajModel structs
+ *
+ * @details
+ * The parameters copied are MajModel::weight_idx, MajModel::epsilon,
+ * MajModel::p, MajModel::kappa, and MajModel::lambda.
+ *
+ * @param[in] from MajModel to copy parameters from
+ * @param[in,out] to MajModel to copy parameters to
+ */
void copy_model(struct MajModel *from, struct MajModel *to)
{
to->weight_idx = from->weight_idx;