diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/crossval.c | 135 | ||||
| -rw-r--r-- | src/libMSVMMaj.c | 307 | ||||
| -rw-r--r-- | src/msvmmaj_init.c | 185 | ||||
| -rw-r--r-- | src/msvmmaj_io.c | 294 | ||||
| -rw-r--r-- | src/msvmmaj_kernel.c | 195 | ||||
| -rw-r--r-- | src/msvmmaj_lapack.c | 129 | ||||
| -rw-r--r-- | src/msvmmaj_matrix.c | 153 | ||||
| -rw-r--r-- | src/msvmmaj_pred.c | 116 | ||||
| -rw-r--r-- | src/msvmmaj_train.c | 520 | ||||
| -rw-r--r-- | src/msvmmaj_train_dataset.c | 719 | ||||
| -rw-r--r-- | src/predMSVMMaj.c | 174 | ||||
| -rw-r--r-- | src/strutil.c | 176 | ||||
| -rw-r--r-- | src/timer.c | 73 | ||||
| -rw-r--r-- | src/trainMSVMMaj.c | 250 | ||||
| -rw-r--r-- | src/trainMSVMMajdataset.c | 322 | ||||
| -rw-r--r-- | src/util.c | 128 |
16 files changed, 3876 insertions, 0 deletions
diff --git a/src/crossval.c b/src/crossval.c new file mode 100644 index 0000000..10e3051 --- /dev/null +++ b/src/crossval.c @@ -0,0 +1,135 @@ +/** + * @file crossval.c + * @author Gertjan van den Burg + * @date January 7, 2014 + * @brief Functions for cross validation + * + * @details + * This file contains functions for performing cross validation. The funtion + * msvmmaj_make_cv_split() creates a cross validation vector for non-stratified + * cross validation. The function msvmmaj_get_tt_split() creates a train and + * test dataset from a given dataset and a pre-determined CV partition vector. + * See individual function documentation for details. + * + */ + +#include "crossval.h" +#include "msvmmaj.h" +#include "msvmmaj_matrix.h" + +/** + * @brief Create a cross validation split vector + * + * @details + * A pre-allocated vector of length N is created which can be used to define + * cross validation splits. The folds are contain between + * @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$ + * instances. An instance is mapped to a partition randomly until all folds + * contain @f$ N \% folds @f$ instances. The zero fold then contains + * @f$ N / folds + N \% folds @f$ instances. These remaining @f$ N \% folds @f$ + * instances are then distributed over the first @f$ N \% folds @f$ folds. + * + * @param[in] N number of instances + * @param[in] folds number of folds + * @param[in,out] cv_idx array of size N which contains the fold index + * for each observation on exit + * + */ +void msvmmaj_make_cv_split(long N, long folds, long *cv_idx) +{ + long i, j, idx; + + long big_folds = N%folds; + long small_fold_size = N/folds; + + j = 0; + for (i=0; i<small_fold_size*folds; i++) + while (1) { + idx = rand()%N; + if (cv_idx[idx] == 0) { + cv_idx[idx] = j; + j++; + j%=folds; + break; + } + } + j = 0; + i = 0; + while (i < big_folds) { + if (cv_idx[j] == 0) { + cv_idx[j] = i++; + } + j++; + } +} + + +/** + * @brief Create train and test datasets for a CV split + * + * @details + * Given a MajData structure for the full dataset, a previously created + * cross validation split vector and a fold index, a training and test dataset + * are created. + * + * @param[in] full_data a MajData structure for the entire + * dataset + * @param[in,out] train_data an initialized MajData structure which + * on exit contains the training dataset + * @param[in,out] test_data an initialized MajData structure which + * on exit contains the test dataset + * @param[in] cv_idx a vector of cv partitions created by + * msvmmaj_make_cv_split() + * @param[in] fold_idx index of the fold which becomes the + * test dataset + */ +void msvmmaj_get_tt_split(struct MajData *full_data, struct MajData *train_data, + struct MajData *test_data, long *cv_idx, long fold_idx) +{ + long i, j, k, l, test_n, train_n; + + long n = full_data->n; + long m = full_data->m; + long K = full_data->K; + + test_n = 0; + for (i=0; i<n; i++) + if (cv_idx[i] == fold_idx) + test_n++; + train_n = n - test_n; + + test_data->n = test_n; + train_data->n = train_n; + + train_data->K = K; + test_data->K = K; + + train_data->m = m; + test_data->m = m; + + train_data->y = Calloc(long, train_n); + test_data->y = Calloc(long, test_n); + + train_data->Z = Calloc(double, train_n*(m+1)); + test_data->Z = Calloc(double, test_n*(m+1)); + + k = 0; + l = 0; + for (i=0; i<n; i++) { + if (cv_idx[i] == fold_idx) { + test_data->y[k] = full_data->y[i]; + for (j=0; j<m+1; j++) + matrix_set(test_data->Z, m+1, k, j, + matrix_get(full_data->Z, m+1, + i, j)); + k++; + } else { + train_data->y[l] = full_data->y[i]; + for (j=0; j<m+1; j++) + matrix_set(train_data->Z, m+1, l, j, + matrix_get(full_data->Z, m+1, + i, j)); + l++; + } + } +} diff --git a/src/libMSVMMaj.c b/src/libMSVMMaj.c new file mode 100644 index 0000000..a0bef97 --- /dev/null +++ b/src/libMSVMMaj.c @@ -0,0 +1,307 @@ +/** + * @file libMSVMMaj.c + * @author Gertjan van den Burg + * @date August 8, 2013 + * @brief Main functions for the MSVMMaj algorithm + * + * @details + * The functions in this file are all functions needed + * to calculate the optimal separation boundaries for + * a multiclass classification problem, using the + * MSVMMaj algorithm. + * + */ + +#include <cblas.h> +#include <math.h> + +#include "libMSVMMaj.h" +#include "msvmmaj.h" +#include "msvmmaj_matrix.h" + +inline double rnd() { return (double) rand()/0x7FFFFFFF; } + +/** + * @brief Generate matrix of simplex vertex coordinates + * + * @details + * Generate the simplex matrix. Each row of the created + * matrix contains the coordinate vector of a single + * vertex of the K-simplex in K-1 dimensions. The simplex + * generated is a special simplex with edges of length 1. + * The simplex matrix U must already have been allocated. + * + * @param[in] K number of classes + * @param[in,out] U simplex matrix of size K * (K-1) + */ +void msvmmaj_simplex_gen(long K, double *U) +{ + long i, j; + for (i=0; i<K; i++) { + for (j=0; j<K-1; j++) { + if (i <= j) { + matrix_set(U, K-1, i, j, -1.0/sqrt(2.0*(j+1)*(j+2))); + } else if (i == j+1) { + matrix_set(U, K-1, i, j, sqrt((j+1)/(2.0*(j+2)))); + } else { + matrix_set(U, K-1, i, j, 0.0); + } + } + } +} + +/** + * @brief Generate the category matrix + * + * @details + * Generate the category matrix R. The category matrix has 1's everywhere + * except at the column corresponding to the label of instance i, there the + * element is 0. + * + * @param[in,out] model corresponding MajModel + * @param[in] dataset corresponding MajData + * + */ +void msvmmaj_category_matrix(struct MajModel *model, struct MajData *dataset) +{ + long i, j; + long n = model->n; + long K = model->K; + + for (i=0; i<n; i++) { + for (j=0; j<K; j++) { + if (dataset->y[i] != j+1) { + matrix_set(model->R, K, i, j, 1.0); + } + } + } +} + +/** + * @brief Generate the simplex difference matrix + * + * @details + * The simplex difference matrix is a 3D matrix which is constructed + * as follows. For each instance i, the difference vectors between the row of + * the simplex matrix corresponding to the class label of instance i and the + * other rows of the simplex matrix are calculated. These difference vectors + * are stored in a matrix, which is one horizontal slice of the 3D matrix. + * + * @param[in,out] model the corresponding MajModel + * @param[in] data the corresponding MajData + * + */ +void msvmmaj_simplex_diff(struct MajModel *model, struct MajData *data) +{ + long i, j, k; + double value; + + long n = model->n; + long K = model->K; + + for (i=0; i<n; i++) { + for (j=0; j<K-1; j++) { + for (k=0; k<K; k++) { + value = matrix_get(model->U, K-1, data->y[i]-1, j); + value -= matrix_get(model->U, K-1, k, j); + matrix3_set(model->UU, K-1, K, i, j, k, value); + } + } + } +} + +/** + * @brief Calculate the scalar errors + * + * @details + * Calculate the scalar errors q based on the current estimate of V, and + * store these in Q. It is assumed that the memory for Q has already been + * allocated. In addition, the matrix ZV is calculated here. It is assigned + * to a pre-allocated block of memory, which is passed to this function. + * + * @param[in,out] model the corresponding MajModel + * @param[in] data the corresponding MajData + * @param[in,out] ZV a pointer to a memory block for ZV. On exit + * this block is updated with the new ZV matrix + * calculated with MajModel::V. + * + */ +void msvmmaj_calculate_errors(struct MajModel *model, struct MajData *data, double *ZV) +{ + long i, j, k; + double a, value; + + long n = model->n; + long m = model->m; + long K = model->K; + + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + K-1, + m+1, + 1.0, + data->Z, + m+1, + model->V, + K-1, + 0.0, + ZV, + K-1); + + Memset(model->Q, double, n*K); + for (i=0; i<n; i++) { + for (j=0; j<K-1; j++) { + a = matrix_get(ZV, K-1, i, j); + for (k=0; k<K; k++) { + value = a * matrix3_get(model->UU, K-1, K, i, j, k); + matrix_add(model->Q, K, i, k, value); + } + } + } +} + +/** + * @brief Calculate the Huber hinge errors + * + * @details + * For each of the scalar errors in Q the Huber hinge errors are + * calculated. The Huber hinge is here defined as + * @f[ + * h(q) = + * \begin{dcases} + * 1 - q - \frac{\kappa + 1}{2} & \text{if } q \leq -\kappa \\ + * \frac{1}{2(\kappa + 1)} ( 1 - q)^2 & \text{if } q \in (-\kappa, 1] \\ + * 0 & \text{if } q > 1 + * \end{dcases} + * @f] + * + * @param[in,out] model the corresponding MajModel + */ +void msvmmaj_calculate_huber(struct MajModel *model) +{ + long i, j; + double q, value; + + for (i=0; i<model->n; i++) { + for (j=0; j<model->K; j++) { + q = matrix_get(model->Q, model->K, i, j); + value = 0.0; + if (q <= -model->kappa) { + value = 1.0 - q - (model->kappa+1.0)/2.0; + } else if (q <= 1.0) { + value = 1.0/(2.0*model->kappa+2.0)*pow(1.0 - q, 2.0); + } + matrix_set(model->H, model->K, i, j, value); + } + } +} + +/** + * @brief seed the matrix V from an existing model or using rand + * + * @details + * The matrix V must be seeded before the main_loop() can start. + * This can be done by either seeding it with random numbers or + * using the solution from a previous model on the same dataset + * as initial seed. The latter option usually allows for a + * significant improvement in the number of iterations necessary + * because the seeded model V is closer to the optimal V. + * + * @param[in] from_model MajModel from which to copy V + * @param[in,out] to_model MajModel to which V will be copied + */ +void msvmmaj_seed_model_V(struct MajModel *from_model, struct MajModel *to_model) +{ + long i, j; + long m = to_model->m; + long K = to_model->K; + double value; + + if (from_model == NULL) { + for (i=0; i<m+1; i++) + for (j=0; j<K-1; j++) + matrix_set(to_model->V, K-1, i, j, -1.0+2.0*rnd()); + } else { + for (i=0; i<m+1; i++) + for (j=0; j<K-1; j++) { + value = matrix_get(from_model->V, K-1, i, j); + matrix_set(to_model->V, K-1, i, j, value); + } + } +} + +/** + * @brief Use step doubling + * + * @details + * Step doubling can be used to speed up the Majorization algorithm. Instead + * of using the value at the minimimum of the majorization function, the value + * ``opposite'' the majorization point is used. This can essentially cut the + * number of iterations necessary to reach the minimum in half. + * + * @param[in] model MajModel containing the augmented parameters + */ +void msvmmaj_step_doubling(struct MajModel *model) +{ + long i, j; + + long m = model->m; + long K = model->K; + + for (i=0; i<m+1; i++) { + for (j=0; j<K-1; j++) { + matrix_mul(model->V, K-1, i, j, 2.0); + matrix_add(model->V, K-1, i, j, + -matrix_get(model->Vbar, K-1, i, j)); + } + } +} + +/** + * @brief Initialize instance weights + * + * @details + * Instance weights can for instance be used to add additional weights to + * instances of certain classes. Two default weight possibilities are + * implemented here. The first is unit weights, where each instance gets + * weight 1. + * + * The second are group size correction weights, which are calculated as + * @f[ + * \rho_i = \frac{n}{Kn_k} , + * @f] + * where @f$ n_k @f$ is the number of instances in group @f$ k @f$ and + * @f$ y_i = k @f$. + * + * @param[in] data MajData with the dataset + * @param[in,out] model MajModel with the weight specification. On + * exit MajModel::rho contains the instance + * weights. + */ +void msvmmaj_initialize_weights(struct MajData *data, struct MajModel *model) +{ + long *groups; + long i; + + long n = model->n; + long K = model->K; + + if (model->weight_idx == 1) { + for (i=0; i<n; i++) + model->rho[i] = 1.0; + } + else if (model->weight_idx == 2) { + groups = Calloc(long, K); + for (i=0; i<n; i++) + groups[data->y[i]-1]++; + for (i=0; i<n; i++) + model->rho[i] = ((double) n)/((double) (groups[data->y[i]-1]*K)); + } else { + fprintf(stderr, "Unknown weight specification.\n"); + exit(1); + } +} + diff --git a/src/msvmmaj_init.c b/src/msvmmaj_init.c new file mode 100644 index 0000000..b4384be --- /dev/null +++ b/src/msvmmaj_init.c @@ -0,0 +1,185 @@ +/** + * @file msvmmaj_init.c + * @author Gertjan van den Burg + * @date January 7, 2014 + * @brief Functions for initializing model and data structures + * + * @details + * This file contains functions for initializing a MajModel instance + * and a MajData instance. In addition, default values for these + * structures are defined here (and only here). Functions for allocating + * memory for the model structure and freeing of the model and data structures + * are also included. + * + */ + +#include <math.h> + +#include "msvmmaj.h" +#include "msvmmaj_init.h" + +/** + * @brief Initialize a MajModel structure + * + * @details + * A MajModel structure is initialized and the default value for the + * parameters are set. A pointer to the initialized model is returned. + * + * @returns initialized MajModel + */ +struct MajModel *msvmmaj_init_model() +{ + struct MajModel *model = Malloc(struct MajModel, 1); + + // set default values + model->p = 1.0; + model->lambda = pow(2, -8.0); + model->epsilon = 1e-6; + model->kappa = 0.0; + model->weight_idx = 1; + model->kerneltype = K_LINEAR; + model->use_cholesky = false; + + return model; +} + +/** + * @brief Initialize a MajData structure + * + * @details + * A MajData structure is initialized and default values are set. + * A pointer to the initialized data is returned. + * + * @returns initialized MajData + * + */ +struct MajData *msvmmaj_init_data() +{ + struct MajData *data = Malloc(struct MajData, 1); + + // set default values + data->kerneltype = K_LINEAR; + data->use_cholesky = false; + + return data; +} + +/** + * @brief Allocate memory for a MajModel + * + * @details + * This function can be used to allocate the memory needed for a MajModel. All + * arrays in the model are specified and initialized to 0. + * + * @param[in] model MajModel to allocate + * + */ +void msvmmaj_allocate_model(struct MajModel *model) +{ + long n = model->n; + long m = model->m; + long K = model->K; + + model->W = Calloc(double, m*(K-1)); + if (model->W == NULL) { + fprintf(stderr, "Failed to allocate memory for W.\n"); + exit(1); + } + + model->t = Calloc(double, K-1); + if (model->t == NULL) { + fprintf(stderr, "Failed to allocate memory for t.\n"); + exit(1); + } + + model->V = Calloc(double, (m+1)*(K-1)); + if (model->V == NULL) { + fprintf(stderr, "Failed to allocate memory for V.\n"); + exit(1); + } + + model->Vbar = Calloc(double, (m+1)*(K-1)); + if (model->Vbar == NULL) { + fprintf(stderr, "Failed to allocate memory for Vbar.\n"); + exit(1); + } + + model->U = Calloc(double, K*(K-1)); + if (model->U == NULL) { + fprintf(stderr, "Failed to allocate memory for U.\n"); + exit(1); + } + + model->UU = Calloc(double, n*K*(K-1)); + if (model->UU == NULL) { + fprintf(stderr, "Failed to allocate memory for UU.\n"); + exit(1); + } + + model->Q = Calloc(double, n*K); + if (model->Q == NULL) { + fprintf(stderr, "Failed to allocate memory for Q.\n"); + exit(1); + } + + model->H = Calloc(double, n*K); + if (model->H == NULL) { + fprintf(stderr, "Failed to allocate memory for H.\n"); + exit(1); + } + + model->R = Calloc(double, n*K); + if (model->R == NULL) { + fprintf(stderr, "Failed to allocate memory for R.\n"); + exit(1); + } + + model->rho = Calloc(double, n); + if (model->rho == NULL) { + fprintf(stderr, "Failed to allocate memory for rho.\n"); + exit(1); + } +} + +/** + * @brief Free allocated MajModel struct + * + * @details + * Simply free a previously allocated MajModel by freeing all its component + * arrays. Note that the model struct itself is also freed here. + * + * @param[in] model MajModel to free + * + */ +void msvmmaj_free_model(struct MajModel *model) +{ + free(model->W); + free(model->t); + free(model->V); + free(model->Vbar); + free(model->U); + free(model->UU); + free(model->Q); + free(model->H); + free(model->rho); + free(model->R); + + free(model); +} + +/** + * @brief Free allocated MajData struct + * + * @details + * Simply free a previously allocated MajData struct by freeing all its + * components. Note that the data struct itself is also freed here. + * + * @param[in] data MajData struct to free + * + */ +void msvmmaj_free_data(struct MajData *data) +{ + free(data->Z); + free(data->y); + free(data); +} diff --git a/src/msvmmaj_io.c b/src/msvmmaj_io.c new file mode 100644 index 0000000..fc7cc56 --- /dev/null +++ b/src/msvmmaj_io.c @@ -0,0 +1,294 @@ +/** + * @file msvmmaj_io.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Functions for input and output of data and model files + * + * @details + * This file contains functions for reading and writing model files, and data + * files. + * + */ + +#include "msvmmaj.h" +#include "msvmmaj_io.h" +#include "msvmmaj_matrix.h" +#include "strutil.h" +#include "timer.h" + +/** + * @brief Read data from file + * + * @details + * Read the data from the data_file. The data matrix X is augmented + * with a column of ones, to get the matrix Z. The data is expected + * to follow a specific format, which is specified in the @ref spec_data_file. + * The class labels are corrected internally to correspond to the interval + * [1 .. K], where K is the total number of classes. + * + * @todo + * Make sure that this function allows datasets without class labels for + * testing. + * + * @param[in,out] dataset initialized MajData struct + * @param[in] data_file filename of the data file. + */ +void msvmmaj_read_data(struct MajData *dataset, char *data_file) +{ + FILE *fid; + long i, j; + long n, m; // dimensions of data + long nr = 0; // used to check consistency of data + double value; + long K = 0; + long min_y = 1000000; + + char buf[MAX_LINE_LENGTH]; + + if ((fid = fopen(data_file, "r")) == NULL) { + fprintf(stderr, "\nERROR: datafile %s could not be opened.\n", + data_file); + exit(0); + } + + // Read data dimensions + nr += fscanf(fid, "%ld", &n); + nr += fscanf(fid, "%ld", &m); + + // Allocate memory + dataset->Z = Malloc(double, n*(m+1)); + + // Read first line of data + for (j=1; j<m+1; j++) { + nr += fscanf(fid, "%lf", &value); + matrix_set(dataset->Z, n, 0, j, value); + } + + // Check if there is a label at the end of the line + if (fgets(buf, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "ERROR: No label found on first line.\n"); + exit(1); + } + if (sscanf(buf, "%lf", &value) > 0) { + dataset->y = Malloc(long, n); + dataset->y[0] = value; + } else if (dataset->y != NULL) { + free(dataset->y); + dataset->y = NULL; + } + + // Read the rest of the file + for (i=1; i<n; i++) { + for (j=1; j<m+1; j++) { + nr += fscanf(fid, "%lf", &value); + matrix_set(dataset->Z, m+1, i, j, value); + } + if (dataset->y != NULL) { + nr += fscanf(fid, "%lf", &value); + dataset->y[i] = (long) value; + K = maximum(K, value); + min_y = minimum(min_y, value); + } + } + fclose(fid); + + // Correct labels: must be in [1, K] + if (min_y == 0) { + for (i=0; i<n; i++) + dataset->y[i]++; + K++; + } else if (min_y < 0 ) { + fprintf(stderr, "ERROR: wrong class labels in %s, minimum " + "value is: %ld\n", + data_file, min_y); + exit(0); + } + + if (nr < n * m) { + fprintf(stderr, "ERROR: not enough data found in %s\n", + data_file); + exit(0); + } + + // Set the column of ones + for (i=0; i<n; i++) + matrix_set(dataset->Z, m+1, i, 0, 1.0); + + dataset->n = n; + dataset->m = m; + dataset->K = K; +} + + +/** + * @brief Read model from file + * + * @details + * Read a MajModel from a model file. The MajModel struct must have been + * initalized elswhere. The model file is expected to follow the @ref + * spec_model_file. The easiest way to generate a model file is through + * msvmmaj_write_model(), which can for instance be used in trainMSVMMaj.c. + * + * @param[in,out] model initialized MajModel + * @param[in] model_filename filename of the model file + * + */ +void msvmmaj_read_model(struct MajModel *model, char *model_filename) +{ + long i, j, nr = 0; + FILE *fid; + char buffer[MAX_LINE_LENGTH]; + char data_filename[MAX_LINE_LENGTH]; + double value = 0; + + fid = fopen(model_filename, "r"); + if (fid == NULL) { + fprintf(stderr, "Error opening model file %s\n", + model_filename); + exit(1); + } + // skip the first four lines + for (i=0; i<4; i++) + next_line(fid, model_filename); + + // read all model variables + model->p = get_fmt_double(fid, model_filename, "p = %lf"); + model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf"); + model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf"); + model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf"); + model->weight_idx = (int) get_fmt_long(fid, model_filename, + "weight_idx = %li"); + + // skip to data section + for (i=0; i<2; i++) + next_line(fid, model_filename); + + // read filename of data file + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading model file %s\n", + model_filename); + exit(1); + } + sscanf(buffer, "filename = %s\n", data_filename); + model->data_file = data_filename; + + // read all data variables + model->n = get_fmt_long(fid, model_filename, "n = %li\n"); + model->m = get_fmt_long(fid, model_filename, "m = %li\n"); + model->K = get_fmt_long(fid, model_filename, "K = %li\n"); + + // skip to output + for (i=0; i<2; i++) + next_line(fid, model_filename); + + // read the matrix V and check for consistency + model->V = Malloc(double, (model->m+1)*(model->K-1)); + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + nr += fscanf(fid, "%lf ", &value); + matrix_set(model->V, model->K-1, i, j, value); + } + } + if (nr != (model->m+1)*(model->K-1)) { + fprintf(stderr, "Error reading model file %s. " + "Not enough elements of V found.\n", + model_filename); + exit(1); + } +} + +/** + * @brief Write model to file + * + * @details + * Write a MajModel to a file. The current time is specified in the file in + * UTC + offset. The model file further corresponds to the @ref + * spec_model_file. + * + * @param[in] model MajModel which contains an estimate for + * MajModel::V + * @param[in] output_filename the output file to write the model to + * + */ +void msvmmaj_write_model(struct MajModel *model, char *output_filename) +{ + FILE *fid; + long i, j; + char timestr[MAX_LINE_LENGTH]; + + // open output file + fid = fopen(output_filename, "w"); + if (fid == NULL) { + fprintf(stderr, "Error opening output file %s", + output_filename); + exit(1); + } + get_time_string(timestr); + + // Write output to file + fprintf(fid, "Output file for MSVMMaj (version %1.1f)\n", VERSION); + fprintf(fid, "Generated on: %s\n\n", timestr); + fprintf(fid, "Model:\n"); + fprintf(fid, "p = %15.16f\n", model->p); + fprintf(fid, "lambda = %15.16f\n", model->lambda); + fprintf(fid, "kappa = %15.16f\n", model->kappa); + fprintf(fid, "epsilon = %g\n", model->epsilon); + fprintf(fid, "weight_idx = %i\n", model->weight_idx); + fprintf(fid, "\n"); + fprintf(fid, "Data:\n"); + fprintf(fid, "filename = %s\n", model->data_file); + fprintf(fid, "n = %li\n", model->n); + fprintf(fid, "m = %li\n", model->m); + fprintf(fid, "K = %li\n", model->K); + fprintf(fid, "\n"); + fprintf(fid, "Output:\n"); + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + fprintf(fid, "%+15.16f ", + matrix_get(model->V, + model->K-1, i, j)); + } + fprintf(fid, "\n"); + } + + fclose(fid); +} + +/** + * @brief Write predictions to file + * + * @details + * Write the given predictions to an output file, such that the resulting file + * corresponds to the @ref spec_data_file. + * + * @param[in] data MajData with the original instances + * @param[in] predy predictions of the class labels of the + * instances in the given MajData. Note that the + * order of the instances is assumed to be the + * same. + * @param[in] output_filename the file to which the predictions are written + * + */ +void msvmmaj_write_predictions(struct MajData *data, long *predy, + char *output_filename) +{ + long i, j; + FILE *fid; + + fid = fopen(output_filename, "w"); + if (fid == NULL) { + fprintf(stderr, "Error opening output file %s", + output_filename); + exit(1); + } + + for (i=0; i<data->n; i++) { + for (j=0; j<data->m; j++) + fprintf(fid, "%f ", + matrix_get(data->Z, + data->m+1, i, j+1)); + fprintf(fid, "%li\n", predy[i]); + } + + fclose(fid); +} diff --git a/src/msvmmaj_kernel.c b/src/msvmmaj_kernel.c new file mode 100644 index 0000000..6238fc1 --- /dev/null +++ b/src/msvmmaj_kernel.c @@ -0,0 +1,195 @@ +/** + * @file msvmmaj_kernel.c + * @author Gertjan van den Burg + * @date October 18, 2013 + * @brief Defines main functions for use of kernels in MSVMMaj. + * + * @details + * Functions for constructing different kernels using user-supplied + * parameters. Also contains the functions for decomposing the + * kernel matrix using several decomposition methods. + * + */ +#include <math.h> + +#include "msvmmaj.h" +#include "msvmmaj_kernel.h" +#include "msvmmaj_lapack.h" +#include "msvmmaj_matrix.h" +#include "util.h" + +/** + * @brief Create the kernel matrix + * + * Create a kernel matrix based on the specified kerneltype. Kernel parameters + * are assumed to be specified in the model. + * + * @param[in] model MajModel specifying the parameters + * @param[in] data MajData specifying the data. + * + */ +void msvmmaj_make_kernel(struct MajModel *model, struct MajData *data) +{ + if (model->kerneltype == K_LINEAR) + return; + + long i, j; + long n = model->n; + double value; + double *x1, *x2; + double *K = Calloc(double, n*n*sizeof(double)); + + for (i=0; i<n; i++) { + for (j=i; j<n; j++) { + x1 = &data->Z[i*(data->m+1)+1]; + x2 = &data->Z[j*(data->m+1)+1]; + if (model->kerneltype == K_POLY) + value = msvmmaj_compute_poly(x1, x2, + model->kernelparam, data->m); + else if (model->kerneltype == K_RBF) + value = msvmmaj_compute_rbf(x1, x2, + model->kernelparam, data->m); + else if (model->kerneltype == K_SIGMOID) + value = msvmmaj_compute_rbf(x1, x2, + model->kernelparam, data->m); + else { + fprintf(stderr, "Unknown kernel type in " + "msvmmaj_make_kernel\n"); + exit(1); + } + matrix_set(K, n, i, j, value); + matrix_set(K, n, j, i, value); + } + } + + // get cholesky if necessary. + if (model->use_cholesky == true) { + int status = dpotrf('L', n, K, n); + if (status != 0) { + fprintf(stderr, "Error (%i) computing Cholesky " + "decomposition of kernel matrix.\n", + status); + exit(0); + } + note("Got Cholesky.\n"); + } + + // copy kernel/cholesky to data + data->Z = realloc(data->Z, n*(n+1)*(sizeof(double))); + for (i=0; i<n; i++) { + for (j=0; j<n; j++) + matrix_set(data->Z, n+1, i, j+1, + matrix_get(K, n, i, j)); + matrix_set(data->Z, n+1, i, 0, 1.0); + } + data->m = n; + + // let data know what it's made of + data->kerneltype = model->kerneltype; + free(data->kernelparam); + switch (model->kerneltype) { + case K_LINEAR: + break; + case K_POLY: + data->kernelparam = Calloc(double, 3); + data->kernelparam[0] = model->kernelparam[0]; + data->kernelparam[1] = model->kernelparam[1]; + data->kernelparam[2] = model->kernelparam[2]; + break; + case K_RBF: + data->kernelparam = Calloc(double, 1); + data->kernelparam[0] = model->kernelparam[0]; + break; + case K_SIGMOID: + data->kernelparam = Calloc(double, 2); + data->kernelparam[0] = model->kernelparam[0]; + data->kernelparam[1] = model->kernelparam[1]; + } + data->use_cholesky = model->use_cholesky; + model->m = n; + free(K); +} + +/** + * @brief Compute the RBF kernel between two vectors + * + * @details + * The RBF kernel is computed between two vectors. This kernel is defined as + * @f[ + * k(x_1, x_2) = \exp( -\gamma \| x_1 - x_2 \|^2 ) + * @f] + * where @f$ \gamma @f$ is a kernel parameter specified. + * + * @param[in] x1 first vector + * @param[in] x2 second vector + * @param[in] kernelparam array of kernel parameters (gamma is first + * element) + * @param[in] n length of the vectors x1 and x2 + * @returns kernel evaluation + */ +double msvmmaj_compute_rbf(double *x1, double *x2, double *kernelparam, long n) +{ + long i; + double value = 0.0; + + for (i=0; i<n; i++) + value += (x1[i] - x2[i]) * (x1[i] - x2[i]); + value *= -kernelparam[0]; + return exp(value); +} + +/** + * @brief Compute the polynomial kernel between two vectors + * + * @details + * The polynomial kernel is computed between two vectors. This kernel is + * defined as + * @f[ + * k(x_1, x_2) = ( \gamma \langle x_1, x_2 \rangle + c)^d + * @f] + * where @f$ \gamma @f$, @f$ c @f$ and @f$ d @f$ are kernel parameters. + * + * @param[in] x1 first vector + * @param[in] x2 second vector + * @param[in] kernelparam array of kernel parameters (gamma, c, d) + * @param[in] n length of the vectors x1 and x2 + * @returns kernel evaluation + */ +double msvmmaj_compute_poly(double *x1, double *x2, double *kernelparam, long n) +{ + long i; + double value = 0.0; + for (i=0; i<n; i++) + value += x1[i]*x2[i]; + value *= kernelparam[0]; + value += kernelparam[1]; + return pow(value, ((int) kernelparam[2])); +} + +/** + * @brief Compute the sigmoid kernel between two vectors + * + * @details + * The sigmoid kernel is computed between two vectors. This kernel is defined + * as + * @f[ + * k(x_1, x_2) = \tanh( \gamma \langle x_1 , x_2 \rangle + c) + * @f] + * where @f$ \gamma @f$ and @f$ c @f$ are kernel parameters. + * + * @param[in] x1 first vector + * @param[in] x2 second vector + * @param[in] kernelparam array of kernel parameters (gamma, c) + * @param[in] n length of the vectors x1 and x2 + * @returns kernel evaluation + */ +double msvmmaj_compute_sigmoid(double *x1, double *x2, double *kernelparam, long n) +{ + long i; + double value = 0.0; + for (i=0; i<n; i++) + value += x1[i]*x2[i]; + value *= kernelparam[0]; + value += kernelparam[1]; + return tanh(value); +} diff --git a/src/msvmmaj_lapack.c b/src/msvmmaj_lapack.c new file mode 100644 index 0000000..9ca8dab --- /dev/null +++ b/src/msvmmaj_lapack.c @@ -0,0 +1,129 @@ +/** + * @file msvmmaj_lapack.c + * @author Gertjan van den Burg + * @date August 9, 2013 + * @brief Utility functions for interacting with LAPACK + * + * @details + * Functions in this file are auxiliary functions which make it easier + * to use LAPACK functions from liblapack. + */ + +#include "msvmmaj_lapack.h" + +/** + * @brief Solve AX = B where A is symmetric positive definite. + * + * @details + * Solve a linear system of equations AX = B where A is symmetric positive + * definite. This function uses the externel LAPACK routine dposv. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in] NRHS number of columns of B + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the upper or lower factor of the + * Cholesky factorization of A. + * @param[in] LDA leading dimension of A + * @param[in,out] B double precision array of size (LDB, NRHS). On + * exit contains the N-by-NRHS solution matrix X. + * @param[in] LDB the leading dimension of B + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had + * an illegal value + * - >0: if i, the leading minor of A + * was not positive definite + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/dc/de9/group__double_p_osolve.html + */ +int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, + int LDB) +{ + extern void dposv_(char *UPLO, int *Np, int *NRHSp, double *A, + int *LDAp, double *B, int *LDBp, int *INFOp); + int INFO; + dposv_(&UPLO, &N, &NRHS, A, &LDA, B, &LDB, &INFO); + return INFO; +} + +/** + * @brief Solve a system of equations AX = B where A is symmetric. + * + * @details + * Solve a linear system of equations AX = B where A is symmetric. This + * function uses the external LAPACK routine dsysv. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in] NRHS number of columns of B + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the block diagonal matrix D and + * the multipliers used to obtain the factor U or + * L from the factorization A = U*D*U**T or + * A = L*D*L**T. + * @param[in] LDA leading dimension of A + * @param[in] IPIV integer array containing the details of D + * @param[in,out] B double precision array of size (LDB, NRHS). On + * exit contains the N-by-NRHS matrix X + * @param[in] LDB leading dimension of B + * @param[out] WORK double precision array of size max(1,LWORK). On + * exit, WORK(1) contains the optimal LWORK + * @param[in] LWORK the length of WORK, can be used for determining + * the optimal blocksize for dsystrf. + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had an + * illegal value + * - >0: if i, D(i, i) is exactly zero, + * no solution can be computed. + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d6/d0e/group__double_s_ysolve.html + */ +int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, + double *B, int LDB, double *WORK, int LWORK) +{ + extern void dsysv_(char *UPLO, int *Np, int *NRHSp, double *A, + int *LDAp, int *IPIV, double *B, int *LDBp, + double *WORK, int *LWORK, int *INFOp); + int INFO; + dsysv_(&UPLO, &N, &NRHS, A, &LDA, IPIV, B, &LDB, WORK, &LWORK, &INFO); + return INFO; +} + +/** + * @brief Compute the Cholesky factorization of a real symmetric positive + * definite matrix. + * + * @details + * This function uses the external LAPACK routine dpotrf. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the factor U or L of the Cholesky + * factorization + * @param[in] LDA leading dimension of A + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had an + * illegal value + * - >0: if i, the leading minor of + * order i is not positive + * definite + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html + */ +int dpotrf(char UPLO, int N, double *A, int LDA) +{ + extern void dpotrf_(char *UPLO, int *N, double *A, int *LDA, int *INFOp); + int INFO; + dpotrf_(&UPLO, &N, A, &LDA, &INFO); + return INFO; +} diff --git a/src/msvmmaj_matrix.c b/src/msvmmaj_matrix.c new file mode 100644 index 0000000..ffa0c21 --- /dev/null +++ b/src/msvmmaj_matrix.c @@ -0,0 +1,153 @@ +/** + * @file msvmmaj_matrix.c + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Functions facilitating matrix access + * + * @details + * The functions contained in this file are used when + * accessing or writing to matrices. Seperate functions + * exist of adding and multiplying existing matrix + * elements, to ensure this is done in place. + * + */ + +#include "msvmmaj_matrix.h" +#include "util.h" + +/** + * @brief Set element of matrix + * + * @details + * Row-Major order is used to set a matrix element. Since matrices + * of type double are most common in MSVMMaj, this function only + * deals with that type. + * + * @param[in] M matrix to set element of + * @param[in] cols number of columns of M + * @param[in] i row index of element to write to + * @param[in] j column index of element to write to + * @param[out] val value to write to specified element of M + */ +void matrix_set(double *M, long cols, long i, long j, double val) +{ + M[i*cols+j] = val; +} + +/** + * @brief Retrieve value from matrix + * + * @details + * Return a value from a matrix using row-major order. + * + * @param[in] M matrix to retrieve value from + * @param[in] cols number of columns of M + * @param[in] i row index (starting from 0) + * @param[in] j column index (starting from 0) + * @returns matrix element at (i, j) + */ +double matrix_get(double *M, long cols, long i, long j) +{ + return M[i*cols+j]; +} + +/** + * @brief Add value to matrix element + * + * @details + * This function is added to efficiently add values to matrix + * elements, without having to use get and set methods. + * + * @param[in] M matrix + * @param[in] cols number of columns of M + * @param[in] i row index (starting from 0) + * @param[in] j column index (starting from 0) + * @param[in] val value to add to matrix element (i, j) + */ +void matrix_add(double *M, long cols, long i, long j, double val) +{ + M[i*cols+j] += val; +} + +/** + * @brief Multiply matrix element by value + * + * @details + * This function is added to efficiently multiply a matrix element + * by a certain value, without having to use get and set methods. + * + * @param[in] M matrix + * @param[in] cols number of columns of M + * @param[in] i row index (starting from 0) + * @param[in] j column index (starting from 0) + * @param[in] val value to multiply matrix element (i, j) with + */ +void matrix_mul(double *M, long cols, long i, long j, double val) +{ + M[i*cols+j] *= val; +} + +/** + * @brief Set element of 3D matrix + * + * @details + * Set an element of a 3D matrix using row-major order. + * + * @param[in] M matrix + * @param[in] N2 second dimension of M + * @param[in] N3 third dimension of M + * @param[in] i index along first dimension + * @param[in] j index along second dimension + * @param[in] k index along third dimension + * @param[in] val value to set element (i, j, k) to + * + * See: + * http://en.wikipedia.org/wiki/Row-major_order + */ +void matrix3_set(double *M, long N2, long N3, long i, long j, + long k, double val) +{ + M[k+N3*(j+N2*i)] = val; +} + +/** + * @brief Get element of 3D matrix + * + * @details + * Retrieve an element from a 3D matrix. + * + * @param[in] M matrix + * @param[in] N2 second dimension of M + * @param[in] N3 third dimension of M + * @param[in] i index along first dimension + * @param[in] j index along second dimension + * @param[in] k index along third dimension + * @returns value at the (i, j, k) element of M + */ +double matrix3_get(double *M, long N2, long N3, long i, long j, + long k) +{ + return M[k+N3*(j+N2*i)]; +} + +/** + * @brief print a matrix + * + * @details + * Debug function to print a matrix + * + * @param[in] M matrix + * @param[in] rows number of rows of M + * @param[in] cols number of columns of M + */ +void print_matrix(double *M, long rows, long cols) +{ + long i, j; + + for (i=0; i<rows; i++) { + for (j=0; j<cols; j++) + note("%8.8f ", matrix_get(M, cols, i, j)); + note("\n"); + } + note("\n"); +} diff --git a/src/msvmmaj_pred.c b/src/msvmmaj_pred.c new file mode 100644 index 0000000..98b6e0a --- /dev/null +++ b/src/msvmmaj_pred.c @@ -0,0 +1,116 @@ +/** + * @file msvmmaj_pred.c + * @author Gertjan van den Burg + * @date August 9, 2013 + * @brief Main functions for predicting class labels.. + * + * @details + * This file contains functions for predicting the class labels of instances + * and a function for calculating the predictive performance (hitrate) of + * a prediction given true class labels. + * + */ + +#include <cblas.h> + +#include "libMSVMMaj.h" +#include "msvmmaj.h" +#include "msvmmaj_matrix.h" +#include "msvmmaj_pred.h" + +/** + * @brief Predict class labels of data given and output in predy + * + * @details + * The labels are predicted by mapping each instance in data to the + * simplex space using the matrix V in the given model. Next, for each + * instance the nearest simplex vertex is determined using an Euclidean + * norm. The nearest simplex vertex determines the predicted class label, + * which is recorded in predy. + * + * @param[in] data MajData to predict labels for + * @param[in] model MajModel with optimized V + * @param[out] predy pre-allocated vector to record predictions in + */ +void msvmmaj_predict_labels(struct MajData *data, struct MajModel *model, long *predy) +{ + long i, j, k, label; + double norm, min_dist; + + long n = data->n; // note that model->n is the size of the training sample. + long m = data->m; + long K = model->K; //data->K does not necessarily equal the original K. + + double *S = Calloc(double, K-1); + double *ZV = Calloc(double, n*(K-1)); + double *U = Calloc(double, K*(K-1)); + + // Get the simplex matrix + msvmmaj_simplex_gen(K, U); + + // Generate the simplex-space vectors + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + K-1, + m+1, + 1.0, + data->Z, + m+1, + model->V, + K-1, + 0.0, + ZV, + K-1); + + // Calculate the distance to each of the vertices of the simplex. + // The closest vertex defines the class label. + for (i=0; i<n; i++) { + label = 0; + min_dist = 1000000000.0; + for (j=0; j<K; j++) { + for (k=0; k<K-1; k++) { + S[k] = matrix_get(ZV, K-1, i, k) - matrix_get(U, K-1, j, k); + } + norm = cblas_dnrm2(K, S, 1); + if (norm < min_dist) { + label = j+1; // labels start counting from 1 + min_dist = norm; + } + } + predy[i] = label; + } + + free(ZV); + free(U); + free(S); +} + +/** + * @brief Calculate the predictive performance (percentage correct) + * + * @details + * The predictive performance is calculated by simply counting the number + * of correctly classified samples and dividing by the total number of + * samples, multiplying by 100. + * + * @param[in] data the MajData dataset with known labels + * @param[in] predy the predicted class labels + * + * @returns percentage correctly classified. + */ +double msvmmaj_prediction_perf(struct MajData *data, long *predy) +{ + long i, correct = 0; + double performance; + + for (i=0; i<data->n; i++) + if (data->y[i] == predy[i]) + correct++; + + performance = ((double) correct)/((double) data->n)* 100.0; + + return performance; +} diff --git a/src/msvmmaj_train.c b/src/msvmmaj_train.c new file mode 100644 index 0000000..97ee6a1 --- /dev/null +++ b/src/msvmmaj_train.c @@ -0,0 +1,520 @@ +/** + * @file msvmmaj_train.c + * @author Gertjan van den Burg + * @date August 9, 2013 + * @brief Main functions for training the MSVMMaj solution. + * + * @details + * Contains update and loss functions used to actually find + * the optimal V. + * + */ + +#include <math.h> +#include <cblas.h> + +#include "libMSVMMaj.h" +#include "msvmmaj.h" +#include "msvmmaj_lapack.h" +#include "msvmmaj_matrix.h" +#include "msvmmaj_train.h" +#include "util.h" + +/** + * Maximum number of iterations of the algorithm. + */ +#define MAX_ITER 1000000 + +/** + * @brief The main training loop for MSVMMaj + * + * @details + * This function is the main training function. This function + * handles the optimization of the model with the given model parameters, with + * the data given. On return the matrix MajModel::V contains the optimal + * weight matrix. + * + * In this function, step doubling is used in the majorization algorithm after + * a burn-in of 50 iterations. If the training is finished, MajModel::t and + * MajModel::W are extracted from MajModel::V. + * + * @param[in,out] model the MajModel to be trained. Contains optimal + * V on exit. + * @param[in] data the MajData to train the model with. + */ +void msvmmaj_optimize(struct MajModel *model, struct MajData *data) +{ + long i, j, it = 0; + double L, Lbar; + + long n = model->n; + long m = model->m; + long K = model->K; + + double *B = Calloc(double, n*(K-1)); + double *ZV = Calloc(double, n*(K-1)); + double *ZAZ = Calloc(double, (m+1)*(m+1)); + double *ZAZV = Calloc(double, (m+1)*(K-1)); + double *ZAZVT = Calloc(double, (m+1)*(K-1)); + + note("Starting main loop.\n"); + note("Dataset:\n"); + note("\tn = %i\n", n); + note("\tm = %i\n", m); + note("\tK = %i\n", K); + note("Parameters:\n"); + note("\tkappa = %f\n", model->kappa); + note("\tp = %f\n", model->p); + note("\tlambda = %15.16f\n", model->lambda); + note("\tepsilon = %g\n", model->epsilon); + note("\n"); + + msvmmaj_simplex_gen(model->K, model->U); + msvmmaj_simplex_diff(model, data); + msvmmaj_category_matrix(model, data); + + L = msvmmaj_get_loss(model, data, ZV); + Lbar = L + 2.0*model->epsilon*L; + + while ((it < MAX_ITER) && (Lbar - L)/L > model->epsilon) + { + // ensure V contains newest V and Vbar contains V from previous + msvmmaj_get_update(model, data, B, ZAZ, ZAZV, ZAZVT); + if (it > 50) + msvmmaj_step_doubling(model); + + Lbar = L; + L = msvmmaj_get_loss(model, data, ZV); + + if (it%50 == 0) + note("iter = %li, L = %15.16f, Lbar = %15.16f, " + "reldiff = %15.16f\n", it, L, Lbar, (Lbar - L)/L); + it++; + } + + note("optimization finished, iter = %li, error = %8.8f\n", it-1, + (Lbar - L)/L); + model->training_error = (Lbar - L)/L; + + for (i=0; i<K-1; i++) + model->t[i] = matrix_get(model->V, K-1, 0, i); + for (i=1; i<m+1; i++) + for (j=0; j<K-1; j++) + matrix_set(model->W, K-1, i-1, j, + matrix_get(model->V, K-1, i, j)); + free(B); + free(ZV); + free(ZAZ); + free(ZAZV); + free(ZAZVT); +} + +/** + * @brief Calculate the current value of the loss function + * + * @details + * The current loss function value is calculated based on the matrix V in the + * given model. Note that the matrix ZV is passed explicitly to avoid having + * to reallocate memory at every step. + * + * @param[in] model MajModel structure which holds the current + * estimate V + * @param[in] data MajData structure + * @param[in,out] ZV pre-allocated matrix ZV which is updated on + * output + * @returns the current value of the loss function + */ +double msvmmaj_get_loss(struct MajModel *model, struct MajData *data, + double *ZV) +{ + long i, j; + long n = data->n; + long K = data->K; + long m = data->m; + + double value, rowvalue, loss = 0.0; + + msvmmaj_calculate_errors(model, data, ZV); + msvmmaj_calculate_huber(model); + + for (i=0; i<n; i++) { + rowvalue = 0; + value = 0; + for (j=0; j<K; j++) { + value = matrix_get(model->H, K, i, j); + value = pow(value, model->p); + value *= matrix_get(model->R, K, i, j); + rowvalue += value; + } + rowvalue = pow(rowvalue, 1.0/(model->p)); + rowvalue *= model->rho[i]; + loss += rowvalue; + } + loss /= ((double) n); + + value = 0; + for (i=1; i<m+1; i++) { + for (j=0; j<K-1; j++) { + value += pow(matrix_get(model->V, K-1, i, j), 2.0); + } + } + loss += model->lambda * value; + + return loss; +} + +/** + * @brief Perform a single step of the majorization algorithm to update V + * + * @details + * This function contains the main update calculations of the algorithm. These + * calculations are necessary to find a new update V. The calculations exist of + * recalculating the majorization coefficients for all instances and all + * classes, and solving a linear system to find V. + * + * Because the function msvmmaj_get_update() is always called after a call to + * msvmmaj_get_loss() with the same MajModel::V, it is unnecessary to calculate + * the updated errors MajModel::Q and MajModel::H here too. This saves on + * computation time. + * + * In calculating the majorization coefficients we calculate the elements of a + * diagonal matrix A with elements + * @f[ + * A_{i, i} = \frac{1}{n} \rho_i \sum_{j \neq k} \left[ + * \varepsilon_i a_{ijk}^{(p)} + (1 - \varepsilon_i) \omega_i + * a_{ijk}^{(p)} \right], + * @f] + * where @f$ k = y_i @f$. + * Since this matrix is only used to calculate the matrix @f$ Z' A Z @f$, it is + * efficient to update a matrix ZAZ through consecutive rank 1 updates with + * a single element of A and the corresponding row of Z. The BLAS function + * dsyr is used for this. + * + * The B matrix is has rows + * @f[ + * \boldsymbol{\beta}_i' = \frac{1}{n} \rho_i \sum_{j \neq k} \left[ + * \varepsilon_i \left( b_{ijk}^{(1)} - a_{ijk}^{(1)} + * \overline{q}_i^{(kj)} \right) + (1 - \varepsilon_i) + * \omega_i \left( b_{ijk}^{(p)} - a_{ijk}^{(p)} + * \overline{q}_i^{(kj)} \right) \right] + * \boldsymbol{\delta}_{kj}' + * @f] + * This is also split into two cases, one for which @f$ \varepsilon_i = 1 @f$, + * and one for when it is 0. The 3D simplex difference matrix is used here, in + * the form of the @f$ \boldsymbol{\delta}_{kj}' @f$. + * + * Finally, the following system is solved + * @f[ + * (\textbf{Z}'\textbf{AZ} + \lambda \textbf{J})\textbf{V} = + * (\textbf{Z}'\textbf{AZ}\overline{\textbf{V}} + \textbf{Z}' + * \textbf{B}) + * @f] + * solving this system is done through dposv(). + * + * @param [in,out] model model to be updated + * @param [in] data data used in model + * @param [in] B pre-allocated matrix used for linear coefficients + * @param [in] ZAZ pre-allocated matrix used in system + * @param [in] ZAZV pre-allocated matrix used in system solving + * @param [in] ZAZVT pre-allocated matrix used in system solving + */ +void msvmmaj_get_update(struct MajModel *model, struct MajData *data, double *B, + double *ZAZ, double *ZAZV, double *ZAZVT) +{ + int status, class; + long i, j, k; + double Avalue, Bvalue; + double omega, value, a, b, q, h, r; + + long n = model->n; + long m = model->m; + long K = model->K; + + double kappa = model->kappa; + double p = model->p; + double *rho = model->rho; + + // constants which are used often throughout + const double a2g2 = 0.25*p*(2.0*p - 1.0)*pow((kappa+1.0)/2.0,p-2.0); + const double in = 1.0/((double) n); + + // clear matrices + Memset(B, double, n*(K-1)); + Memset(ZAZ, double, (m+1)*(m+1)); + + b = 0; + for (i=0; i<n; i++) { + value = 0; + omega = 0; + for (j=0; j<K; j++) { + h = matrix_get(model->H, K, i, j); + r = matrix_get(model->R, K, i, j); + value += (h*r > 0) ? 1 : 0; + omega += pow(h, p)*r; + } + class = (value <= 1.0) ? 1 : 0; + omega = (1.0/p)*pow(omega, 1.0/p - 1.0); + + Avalue = 0; + if (class == 1) { + for (j=0; j<K; j++) { + q = matrix_get(model->Q, K, i, j); + if (q <= -kappa) { + a = 0.25/(0.5 - kappa/2.0 - q); + b = 0.5; + } else if (q <= 1.0) { + a = 1.0/(2.0*kappa + 2.0); + b = (1.0 - q)*a; + } else { + a = -0.25/(0.5 - kappa/2.0 - q); + b = 0; + } + for (k=0; k<K-1; k++) { + Bvalue = in*rho[i]*b*matrix3_get( + model->UU, K-1, K, i, k, j); + matrix_add(B, K-1, i, k, Bvalue); + } + Avalue += a*matrix_get(model->R, K, i, j); + } + } else { + if (2.0 - p < 0.0001) { + for (j=0; j<K; j++) { + q = matrix_get(model->Q, K, i, j); + if (q <= -kappa) { + b = 0.5 - kappa/2.0 - q; + } else if ( q <= 1.0) { + b = pow(1.0 - q, 3.0)/( + 2.0*pow(kappa + 1.0, + 2.0)); + } else { + b = 0; + } + for (k=0; k<K-1; k++) { + Bvalue = in*rho[i]*omega*b* + matrix3_get( + model->UU, + K-1, + K, + i, + k, + j); + matrix_add( + B, + K-1, + i, + k, + Bvalue); + } + } + Avalue = 1.5*(K - 1.0); + } else { + for (j=0; j<K; j++) { + q = matrix_get(model->Q, K, i, j); + if (q <= (p + kappa - 1.0)/(p - 2.0)) { + a = 0.25*pow(p, 2.0)*pow( + 0.5 - kappa/2.0 - q, + p - 2.0); + } else if (q <= 1.0) { + a = a2g2; + } else { + a = 0.25*pow(p, 2.0)*pow( + (p/(p - 2.0))* + (0.5 - kappa/2.0 - q), + p - 2.0); + b = a*(2.0*q + kappa - 1.0)/ + (p - 2.0) + + 0.5*p*pow( + p/(p - 2.0)* + (0.5 - kappa/ + 2.0 - q), + p - 1.0); + } + if (q <= -kappa) { + b = 0.5*p*pow( + 0.5 - kappa/2.0 - q, + p - 1.0); + } else if ( q <= 1.0) { + b = p*pow(1.0 - q, + 2.0*p - 1.0)/ + pow(2*kappa+2.0, p); + } + for (k=0; k<K-1; k++) { + Bvalue = in*rho[i]*omega*b* + matrix3_get( + model->UU, + K-1, + K, + i, + k, + j); + matrix_add( + B, + K-1, + i, + k, + Bvalue); + } + Avalue += a*matrix_get(model->R, + K, i, j); + } + } + Avalue *= omega; + } + Avalue *= in * rho[i]; + + // Now we calculate the matrix ZAZ. Since this is + // guaranteed to be symmetric, we only calculate the + // upper part of the matrix, and then copy this over + // to the lower part after all calculations are done. + // Note that the use of dsym is faster than dspr, even + // though dspr uses less memory. + cblas_dsyr( + CblasRowMajor, + CblasUpper, + m+1, + Avalue, + &data->Z[i*(m+1)], + 1, + ZAZ, + m+1); + } + // Copy upper to lower (necessary because we need to switch + // to Col-Major order for LAPACK). + /* + for (i=0; i<m+1; i++) + for (j=0; j<m+1; j++) + matrix_set(ZAZ, m+1, j, i, matrix_get(ZAZ, m+1, i, j)); + */ + + // Calculate the right hand side of the system we + // want to solve. + cblas_dsymm( + CblasRowMajor, + CblasLeft, + CblasUpper, + m+1, + K-1, + 1.0, + ZAZ, + m+1, + model->V, + K-1, + 0.0, + ZAZV, + K-1); + cblas_dgemm( + CblasRowMajor, + CblasTrans, + CblasNoTrans, + m+1, + K-1, + n, + 1.0, + data->Z, + m+1, + B, + K-1, + 1.0, + ZAZV, + K-1); + /* + * Add lambda to all diagonal elements except the + * first one. + */ + i = 0; + for (j=0; j<m; j++) + ZAZ[i+=m+1 + 1] += model->lambda; + + // For the LAPACK call we need to switch to Column- + // Major order. This is unnecessary for the matrix + // ZAZ because it is symmetric. The matrix ZAZV + // must be converted however. + for (i=0; i<m+1; i++) + for (j=0; j<K-1; j++) + ZAZVT[j*(m+1)+i] = ZAZV[i*(K-1)+j]; + + // We use the lower ('L') part of the matrix ZAZ, + // because we have used the upper part in the BLAS + // calls above in Row-major order, and Lapack uses + // column major order. + status = dposv( + 'L', + m+1, + K-1, + ZAZ, + m+1, + ZAZVT, + m+1); + + if (status != 0) { + // This step should not be necessary, as the matrix + // ZAZ is positive semi-definite by definition. It + // is included for safety. + fprintf(stderr, "Received nonzero status from dposv: %i\n", + status); + int *IPIV = malloc((m+1)*sizeof(int)); + double *WORK = malloc(1*sizeof(double)); + status = dsysv( + 'L', + m+1, + K-1, + ZAZ, + m+1, + IPIV, + ZAZVT, + m+1, + WORK, + -1); + WORK = (double *)realloc(WORK, WORK[0]*sizeof(double)); + status = dsysv( + 'L', + m+1, + K-1, + ZAZ, + m+1, + IPIV, + ZAZVT, + m+1, + WORK, + sizeof(WORK)/sizeof(double)); + if (status != 0) + fprintf(stderr, "Received nonzero status from " + "dsysv: %i\n", status); + } + + // Return to Row-major order. The matrix ZAZVT contains the + // solution after the dposv/dsysv call. + for (i=0; i<m+1; i++) + for (j=0; j<K-1; j++) + ZAZV[i*(K-1)+j] = ZAZVT[j*(m+1)+i]; + + // Store the previous V in Vbar, assign the new V + // (which is stored in ZAZVT) to the model, and give ZAZVT the + // address of Vbar. This should ensure that we keep + // re-using assigned memory instead of reallocating at every + // update. + /* See this answer: http://stackoverflow.com/q/13246615/ + * For now we'll just do it by value until the rest is figured out. + ptr = model->Vbar; + model->Vbar = model->V; + model->V = ZAZVT; + ZAZVT = ptr; + */ + + for (i=0; i<m+1; i++) { + for (j=0; j<K-1; j++) { + matrix_set( + model->Vbar, + K-1, + i, + j, + matrix_get(model->V, K-1, i, j)); + matrix_set( + model->V, + K-1, + i, + j, + matrix_get(ZAZV, K-1, i, j)); + } + } +} diff --git a/src/msvmmaj_train_dataset.c b/src/msvmmaj_train_dataset.c new file mode 100644 index 0000000..2f6d77d --- /dev/null +++ b/src/msvmmaj_train_dataset.c @@ -0,0 +1,719 @@ +/** + * @file msvmmaj_train_dataset.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Functions for finding the optimal parameters for the dataset + * + * @details + * The MSVMMaj algorithm takes a number of parameters. The functions in + * this file are used to find the optimal parameters. + */ + +#include <math.h> +#include <time.h> + +#include "crossval.h" +#include "libMSVMMaj.h" +#include "msvmmaj.h" +#include "msvmmaj_init.h" +#include "msvmmaj_matrix.h" +#include "msvmmaj_train.h" +#include "msvmmaj_train_dataset.h" +#include "msvmmaj_pred.h" +#include "util.h" +#include "timer.h" + +extern FILE *MSVMMAJ_OUTPUT_FILE; + +/** + * @brief Initialize a Queue from a Training instance + * + * @details + * A Training instance describes the grid to search over. This funtion + * creates all tasks that need to be performed and adds these to + * a Queue. Each task contains a pointer to the train and test datasets + * which are supplied. Note that the tasks are created in a specific order of + * the parameters, to ensure that the MajModel::V of a previous parameter + * set provides the best possible initial estimate of MajModel::V for the next + * parameter set. + * + * @param[in] training Training struct describing the grid search + * @param[in] queue pointer to a Queue that will be used to + * add the tasks to + * @param[in] train_data MajData of the training set + * @param[in] test_data MajData of the test set + * + */ +void make_queue(struct Training *training, struct Queue *queue, + struct MajData *train_data, struct MajData *test_data) +{ + long i, j, k; + long N, cnt = 0; + struct Task *task; + queue->i = 0; + + N = training->Np; + N *= training->Nl; + N *= training->Nk; + N *= training->Ne; + N *= training->Nw; + // these parameters are not necessarily non-zero + N *= training->Ng > 0 ? training->Ng : 1; + N *= training->Nc > 0 ? training->Nc : 1; + N *= training->Nd > 0 ? training->Nd : 1; + + queue->tasks = Malloc(struct Task *, N); + queue->N = N; + + // initialize all tasks + for (i=0; i<N; i++) { + task = Malloc(struct Task, 1); + task->ID = i; + task->train_data = train_data; + task->test_data = test_data; + task->folds = training->folds; + task->kerneltype = training->kerneltype; + task->kernel_param = Calloc(double, training->Ng + + training->Nc + training->Nd); + queue->tasks[i] = task; + } + + // These loops mimick a large nested for loop. The advantage is that + // Nd, Nc and Ng which are on the outside of the nested for loop can + // now be zero, without large modification (see below). Whether this + // is indeed better than the nested for loop has not been tested. + cnt = 1; + i = 0; + while (i < N ) + for (j=0; j<training->Np; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->p = training->ps[j]; + i++; + } + + cnt *= training->Np; + i = 0; + while (i < N ) + for (j=0; j<training->Nl; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->lambda = + training->lambdas[j]; + i++; + } + + cnt *= training->Nl; + i = 0; + while (i < N ) + for (j=0; j<training->Nk; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->kappa = training->kappas[j]; + i++; + } + + cnt *= training->Nk; + i = 0; + while (i < N ) + for (j=0; j<training->Nw; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->weight_idx = + training->weight_idxs[j]; + i++; + } + + cnt *= training->Nw; + i = 0; + while (i < N ) + for (j=0; j<training->Ne; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->epsilon = + training->epsilons[j]; + i++; + } + + cnt *= training->Ne; + i = 0; + while (i < N && training->Ng > 0) + for (j=0; j<training->Ng; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->kernel_param[0] = + training->gammas[j]; + i++; + } + + cnt *= training->Ng > 0 ? training->Ng : 1; + i = 0; + while (i < N && training->Nc > 0) + for (j=0; j<training->Nc; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->kernel_param[1] = + training->coefs[j]; + i++; + } + + cnt *= training->Nc > 0 ? training->Ng : 1; + i = 0; + while (i < N && training->Nd > 0) + for (j=0; j<training->Nd; j++) + for (k=0; k<cnt; k++) { + queue->tasks[i]->kernel_param[2] = + training->degrees[j]; + i++; + } +} + +/** + * @brief Get new Task from Queue + * + * @details + * Return a pointer to the next Task in the Queue. If no Task instances are + * left, NULL is returned. The internal counter Queue::i is used for finding + * the next Task. + * + * @param[in] q Queue instance + * @returns pointer to next Task + * + */ +struct Task *get_next_task(struct Queue *q) +{ + long i = q->i; + if (i < q->N) { + q->i++; + return q->tasks[i]; + } + return NULL; +} + +/** + * @brief Comparison function for Tasks based on performance + * + * @details + * To be able to sort Task structures on the performance of their specific + * set of parameters, this comparison function is implemented. Task structs + * are sorted with highest performance first. + * + * @param[in] elem1 Task 1 + * @param[in] elem2 Task 2 + * @returns result of inequality of Task 1 performance over + * Task 2 performance + */ +int tasksort(const void *elem1, const void *elem2) +{ + const struct Task *t1 = (*(struct Task **) elem1); + const struct Task *t2 = (*(struct Task **) elem2); + return (t1->performance > t2->performance); +} + +/** + * @brief Comparison function for doubles + * + * @details + * Similar to tasksort() only now for two doubles. + * + * @param[in] elem1 number 1 + * @param[in] elem2 number 2 + * @returns comparison of number 1 larger than number 2 + */ +int doublesort(const void *elem1, const void *elem2) +{ + const double t1 = (*(double *) elem1); + const double t2 = (*(double *) elem2); + return t1 > t2; +} + +/** + * @brief Calculate the percentile of an array of doubles + * + * @details + * The percentile of performance is used to find the top performing + * configurations. Since no standard definition of the percentile exists, we + * use the method used in MATLAB and Octave. Since calculating the percentile + * requires a sorted list of the values, a local copy is made first. + * + * @param[in] values array of doubles + * @param[in] N length of the array + * @param[in] p percentile to calculate ( 0 <= p <= 1.0 ). + * @returns the p-th percentile of the values + */ +double prctile(double *values, long N, double p) +{ + long i; + double pi, pr, boundary; + double *local = Malloc(double, N); + for (i=0; i<N; i++) + local[i] = values[i]; + + qsort(local, N, sizeof(double), doublesort); + p = p*N + 0.5; + pi = maximum(minimum(floor(p), N-1), 1); + pr = maximum(minimum(p - pi, 1), 0); + boundary = (1 - pr)*local[((long) pi)-1] + pr*local[((long) pi)]; + + free(local); + + return boundary; +} + +/** + * @brief Run repeats of the Task structs in Queue to find the best + * configuration + * + * @details + * The best performing tasks in the supplied Queue are found by taking those + * Task structs that have a performance greater or equal to the 95% percentile + * of the performance of all tasks. These tasks are then gathered in a new + * Queue. For each of the tasks in this new Queue the cross validation run is + * repeated a number of times. + * + * For each of the Task configurations that are repeated the mean performance, + * standard deviation of the performance and the mean computation time are + * reported. + * + * Finally, the overall best tasks are written to the specified output. These + * tasks are selected to have both the highest mean performance, as well as the + * smallest standard deviation in their performance. This is done as follows. + * First the 99th percentile of task performance and the 1st percentile of + * standard deviation is calculated. If a task exists for which the mean + * performance of the repeats and the standard deviation equals these values + * respectively, this task is found to be the best and is written to the + * output. If no such task exists, the 98th percentile of performance and the + * 2nd percentile of standard deviation is considered. This is repeated until + * an interval is found which contains tasks. If one or more tasks are found, + * this loop stops. + * + * @param[in] q Queue of Task structs which have already been + * run and have a Task::performance value + * @param[in] repeats Number of times to repeat the best + * configurations for consistency + * @param[in] traintype type of training to do (CV or TT) + * + */ +void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) +{ + long i, r, N; + double p, pi, pr, pt, boundary, *time, *std, *mean, *perf; + struct Queue *nq = Malloc(struct Queue, 1); + struct MajModel *model = msvmmaj_init_model(); + struct Task *task = Malloc(struct Task, 1); + clock_t loop_s, loop_e; + + // calculate the performance percentile (Matlab style) + qsort(q->tasks, q->N, sizeof(struct Task *), tasksort); + p = 0.95*q->N + 0.5; + pi = maximum(minimum(floor(p), q->N-1), 1); + pr = maximum(minimum(p - pi, 1), 0); + boundary = (1 - pr)*q->tasks[((long) pi)-1]->performance; + boundary += pr*q->tasks[((long) pi)]->performance; + note("boundary determined at: %f\n", boundary); + + // find the number of tasks that perform at least as good as the 95th + // percentile + N = 0; + for (i=0; i<q->N; i++) + if (q->tasks[i]->performance >= boundary) + N++; + note("Number of items: %li\n", N); + std = Calloc(double, N); + mean = Calloc(double, N); + time = Calloc(double, N); + perf = Calloc(double, N*repeats); + + // create a new task queue with the tasks which perform well + nq->tasks = Malloc(struct Task *, N); + for (i=q->N-1; i>q->N-N-1; i--) + nq->tasks[q->N-i-1] = q->tasks[i]; + nq->N = N; + nq->i = 0; + + // for each task run the consistency repeats + for (i=0; i<N; i++) { + task = get_next_task(nq); + make_model_from_task(task, model); + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + + time[i] = 0.0; + note("(%02li/%02li:%03li)\t", i+1, N, task->ID); + for (r=0; r<repeats; r++) { + if (traintype == CV) { + loop_s = clock(); + p = cross_validation(model, NULL, + task->train_data, task->folds); + loop_e = clock(); + time[i] += elapsed_time(loop_s, loop_e); + matrix_set(perf, repeats, i, r, p); + mean[i] += p/((double) repeats); + } else { + note("Only cv is implemented\n"); + exit(1); + } + note("%3.3f\t", p); + } + for (r=0; r<repeats; r++) { + std[i] += pow(matrix_get( + perf, + repeats, + i, + r) - mean[i], + 2.0); + } + std[i] /= ((double) repeats) - 1.0; + std[i] = sqrt(std[i]); + note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", + mean[i], std[i], time[i]); + } + + // find the best overall configurations: those with high average + // performance and low deviation in the performance + note("\nBest overall configuration(s):\n"); + note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t" + "mean_perf\tstd_perf\ttime_perf\n"); + p = 0.0; + bool breakout = false; + while (breakout == false) { + pi = prctile(mean, N, (100.0-p)/100.0); + pr = prctile(std, N, p/100.0); + pt = prctile(time, N, p/100.0); + for (i=0; i<N; i++) + if ((pi - mean[i] < 0.0001) && + (std[i] - pr < 0.0001) && + (time[i] - pt < 0.0001)) { + note("(%li)\tw = %li\te = %f\tp = %f\t" + "k = %f\tl = %f\t" + "mean: %3.3f\tstd: %3.3f\t" + "time: %3.3f\n", + nq->tasks[i]->ID, + nq->tasks[i]->weight_idx, + nq->tasks[i]->epsilon, + nq->tasks[i]->p, + nq->tasks[i]->kappa, + nq->tasks[i]->lambda, + mean[i], + std[i], + time[i]); + breakout = true; + } + p += 1.0; + } + + free(task); + free(model); + free(perf); + free(std); + free(mean); + free(time); +} + +/** + * @brief Run cross validation with a seed model + * + * @details + * This is an implementation of cross validation which uses the optimal + * parameters MajModel::V of a previous fold as initial conditions for + * MajModel::V of the next fold. An initial seed for V can be given through the + * seed_model parameter. If seed_model is NULL, random starting values are + * used. + * + * @todo + * The seed model shouldn't have to be allocated completely, since only V is + * used. + * @todo + * There must be some inefficiencies here because the fold model is allocated + * at every fold. This would be detrimental with large datasets. + * + * @param[in] model MajModel with the configuration to train + * @param[in] seed_model MajModel with a seed for MajModel::V + * @param[in] data MajData with the dataset + * @param[in] folds number of cross validation folds + * @returns performance (hitrate) of the configuration on + * cross validation + */ +double cross_validation(struct MajModel *model, struct MajModel *seed_model, + struct MajData *data, long folds) +{ + FILE *fid; + + bool fs = false; + long f, *predy; + double total_perf = 0; + struct MajModel *fold_model; + struct MajData *train_data, *test_data; + + long *cv_idx = Calloc(long, model->n); + double *performance = Calloc(double, folds); + + if (seed_model == NULL) { + seed_model = msvmmaj_init_model(); + seed_model->n = 0; // we never use anything other than V + seed_model->m = model->m; + seed_model->K = model->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + fs = true; + } + + train_data = msvmmaj_init_data(); + test_data = msvmmaj_init_data(); + // create splits + msvmmaj_make_cv_split(model->n, folds, cv_idx); + + for (f=0; f<folds; f++) { + msvmmaj_get_tt_split(data, train_data, test_data, cv_idx, f); + // initialize a model for this fold and copy the model + // parameters + fold_model = msvmmaj_init_model(); + copy_model(model, fold_model); + + fold_model->n = train_data->n; + fold_model->m = train_data->m; + fold_model->K = train_data->K; + + // allocate, initialize and seed the fold model + msvmmaj_allocate_model(fold_model); + msvmmaj_initialize_weights(train_data, fold_model); + msvmmaj_seed_model_V(seed_model, fold_model); + + // train the model (without output) + fid = MSVMMAJ_OUTPUT_FILE; + MSVMMAJ_OUTPUT_FILE = NULL; + msvmmaj_optimize(fold_model, train_data); + MSVMMAJ_OUTPUT_FILE = fid; + + // calculate predictive performance on test set + predy = Calloc(long, test_data->n); + msvmmaj_predict_labels(test_data, fold_model, predy); + performance[f] = msvmmaj_prediction_perf(test_data, predy); + total_perf += performance[f]/((double) folds); + + // seed the seed model with the fold model + msvmmaj_seed_model_V(fold_model, seed_model); + + free(predy); + free(train_data->y); + free(train_data->Z); + free(test_data->y); + free(test_data->Z); + + msvmmaj_free_model(fold_model); + } + + // if a seed model was allocated before, free it. + if (fs) + msvmmaj_free_model(seed_model); + free(train_data); + free(test_data); + free(performance); + free(cv_idx); + + return total_perf; + +} + +/** + * @brief Run the grid search for a cross validation dataset + * + * @details + * Given a Queue of Task struct to be trained, a grid search is launched to + * find the optimal parameter configuration. As is also done within + * cross_validation(), the optimal weights of one parameter set are used as + * initial estimates for MajModel::V in the next parameter set. Note that to + * optimally exploit this feature of the optimization algorithm, the order in + * which tasks are considered is important. This is considered in + * make_queue(). + * + * The performance found by cross validation is stored in the Task struct. + * + * @param[in,out] q Queue with Task instances to run + */ +void start_training_cv(struct Queue *q) +{ + double perf, current_max = 0; + struct Task *task = get_next_task(q); + struct MajModel *seed_model = msvmmaj_init_model(); + struct MajModel *model = msvmmaj_init_model(); + clock_t main_s, main_e, loop_s, loop_e; + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + msvmmaj_allocate_model(model); + + seed_model->n = 0; + seed_model->m = task->train_data->m; + seed_model->K = task->train_data->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + + main_s = clock(); + while (task) { + note("(%03li/%03li)\tw = %li\te = %f\tp = %f\tk = %f\t " + "l = %f\t", + task->ID+1, q->N, task->weight_idx, + task->epsilon, + task->p, task->kappa, task->lambda); + make_model_from_task(task, model); + + loop_s = clock(); + perf = cross_validation(model, seed_model, task->train_data, + task->folds); + loop_e = clock(); + current_max = maximum(current_max, perf); + + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, + elapsed_time(loop_s, loop_e), + current_max); + + q->tasks[task->ID]->performance = perf; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed time: %8.8f seconds\n", + elapsed_time(main_s, main_e)); + + free(task); + msvmmaj_free_model(seed_model); +} + +/** + * @brief Run the grid search for a train/test dataset + * + * @details + * This function is similar to start_training_cv(), except that the + * pre-determined training set is used only once, and the pre-determined test + * set is used for validation. + * + * @todo + * It would probably be better to train the model on the training set using + * cross validation and only use the test set when comparing with other + * methods. The way it is now, you're finding out which parameters predict + * _this_ test set best, which is not what you want. + * + * @param[in] q Queue with Task structs to run + * + */ +void start_training_tt(struct Queue *q) +{ + FILE *fid; + + long c = 0; + long *predy; + double total_perf, current_max = 0; + + struct Task *task = get_next_task(q); + struct MajModel *seed_model = msvmmaj_init_model(); + + clock_t main_s, main_e; + clock_t loop_s, loop_e; + + seed_model->m = task->train_data->m; + seed_model->K = task->train_data->K; + msvmmaj_allocate_model(seed_model); + msvmmaj_seed_model_V(NULL, seed_model); + + main_s = clock(); + while (task) { + total_perf = 0; + note("(%li/%li)\tw = %li\te = %f\tp = %f\tk = %f\tl = %f\t", + c+1, q->N, task->weight_idx, task->epsilon, + task->p, task->kappa, task->lambda); + loop_s = clock(); + struct MajModel *model = msvmmaj_init_model(); + make_model_from_task(task, model); + + model->n = task->train_data->n; + model->m = task->train_data->m; + model->K = task->train_data->K; + + msvmmaj_allocate_model(model); + msvmmaj_initialize_weights(task->train_data, model); + msvmmaj_seed_model_V(seed_model, model); + + fid = MSVMMAJ_OUTPUT_FILE; + MSVMMAJ_OUTPUT_FILE = NULL; + msvmmaj_optimize(model, task->train_data); + MSVMMAJ_OUTPUT_FILE = fid; + + predy = Calloc(long, task->test_data->n); + msvmmaj_predict_labels(task->test_data, model, predy); + if (task->test_data->y != NULL) + total_perf = msvmmaj_prediction_perf(task->test_data, predy); + msvmmaj_seed_model_V(model, seed_model); + + msvmmaj_free_model(model); + free(predy); + note("."); + loop_e = clock(); + current_max = maximum(current_max, total_perf); + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", total_perf, + elapsed_time(loop_s, loop_e), current_max); + q->tasks[task->ID]->performance = total_perf; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed time: %8.8f seconds\n", + elapsed_time(main_s, main_e)); + free(task); + msvmmaj_free_model(seed_model); +} + +/** + * @brief Free the Queue struct + * + * @details + * Freeing the allocated memory of the Queue means freeing every Task struct + * and then freeing the Queue. + * + * @param[in] q Queue to be freed + * + */ +void free_queue(struct Queue *q) +{ + long i; + for (i=0; i<q->N; i++) { + free(q->tasks[i]->kernel_param); + free(q->tasks[i]); + } + free(q->tasks); + free(q); +} + +/** + * @brief Copy parameters from Task to MajModel + * + * @details + * A Task struct only contains the parameters of the MajModel to be estimated. + * This function is used to copy these parameters. + * + * @param[in] task Task instance with parameters + * @param[in,out] model MajModel to which the parameters are copied + */ +void make_model_from_task(struct Task *task, struct MajModel *model) +{ + model->weight_idx = task->weight_idx; + model->epsilon = task->epsilon; + model->p = task->p; + model->kappa = task->kappa; + model->lambda = task->lambda; +} + +/** + * @brief Copy model parameters between two MajModel structs + * + * @details + * The parameters copied are MajModel::weight_idx, MajModel::epsilon, + * MajModel::p, MajModel::kappa, and MajModel::lambda. + * + * @param[in] from MajModel to copy parameters from + * @param[in,out] to MajModel to copy parameters to + */ +void copy_model(struct MajModel *from, struct MajModel *to) +{ + to->weight_idx = from->weight_idx; + to->epsilon = from->epsilon; + to->p = from->p; + to->kappa = from->kappa; + to->lambda = from->lambda; +} diff --git a/src/predMSVMMaj.c b/src/predMSVMMaj.c new file mode 100644 index 0000000..e67e430 --- /dev/null +++ b/src/predMSVMMaj.c @@ -0,0 +1,174 @@ +/** + * @file predMSVMMaj.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Command line interface for predicting class labels + * + * @details + * This is a command line program for predicting the class labels or + * determining the predictive performance of a pre-determined model on a given + * test dataset. The predictive performance can be written to the screen or + * the predicted class labels can be written to a specified output file. This + * is done using msvmmaj_write_predictions(). + * + * The specified model file must follow the specification given in + * msvmmaj_write_model(). + * + * For usage information, see the program help function. + * + */ + +#include "msvmmaj.h" +#include "msvmmaj_init.h" +#include "msvmmaj_io.h" +#include "msvmmaj_pred.h" +#include "util.h" + +#define MINARGS 3 + +extern FILE *MSVMMAJ_OUTPUT_FILE; + +// function declarations +void exit_with_help(); +void parse_command_line(int argc, char **argv, + char *input_filename, char *output_filename, + char *model_filename); + +/** + * @brief Help function + */ +void exit_with_help() +{ + printf("This is MSVMMaj, version %1.1f\n\n", VERSION); + printf("Usage: predMSVMMaj [options] test_data_file model_file\n"); + printf("Options:\n"); + printf("-o output_file : write output to file\n"); + printf("-q : quiet mode (no output)\n"); + exit(0); +} + +/** + * @brief Main interface function for predMSVMMaj + * + * @details + * Main interface for the command line program. A given model file is read and + * a test dataset is initialized from the given data. The predictive + * performance (hitrate) of the model on the test set is printed to the output + * stream (default = stdout). If an output file is specified the predictions + * are written to the file. + * + * @todo + * Ensure that the program can read model files without class labels + * specified. In that case no prediction accuracy is printed to the screen. + * + * @param[in] argc number of command line arguments + * @param[in] argv array of command line arguments + * + */ +int main(int argc, char **argv) +{ + long *predy; + double performance; + + char input_filename[MAX_LINE_LENGTH]; + char model_filename[MAX_LINE_LENGTH]; + char output_filename[MAX_LINE_LENGTH];; + + if (argc < MINARGS || msvmmaj_check_argv(argc, argv, "-help") + || msvmmaj_check_argv_eq(argc, argv, "-h") ) + exit_with_help(); + parse_command_line(argc, argv, input_filename, output_filename, + model_filename); + + // read the data and model + struct MajModel *model = msvmmaj_init_model(); + struct MajData *data = msvmmaj_init_data(); + msvmmaj_read_data(data, input_filename); + msvmmaj_read_model(model, model_filename); + + // check if the number of attributes in data equals that in model + if (data->m != model->m) { + fprintf(stderr, "Error: number of attributes in data (%li) " + "does not equal the number of attributes in " + "model (%li)\n", data->m, model->m); + exit(1); + } else if (data->K != model->K) { + fprintf(stderr, "Error: number of classes in data (%li) " + "does not equal the number of classes in " + "model (%li)\n", data->K, model->K); + exit(1); + } + + // predict labels and performance if test data has labels + predy = Calloc(long, data->n); + msvmmaj_predict_labels(data, model, predy); + if (data->y != NULL) { + performance = msvmmaj_prediction_perf(data, predy); + note("Predictive performance: %3.2f%%\n", performance); + } + + // if output file is specified, write predictions to it + if (msvmmaj_check_argv_eq(argc, argv, "-o")) { + msvmmaj_write_predictions(data, predy, output_filename); + note("Predictions written to: %s\n", output_filename); + } + + // free the model, data, and predictions + msvmmaj_free_model(model); + msvmmaj_free_data(data); + free(predy); + + return 0; +} + +/** + * @brief Parse command line arguments + * + * @details + * Read the data filename and model filename from the command line arguments. + * If specified, also read the output filename. If the quiet flag is given, + * set the global output stream to NULL. On error, exit_with_help(). + * + * @param[in] argc number of command line arguments + * @param[in] argv array of command line arguments + * @param[in] input_filename pre-allocated array for the input + * filename + * @param[in] output_filename pre-allocated array for the output + * filename + * @param[in] model_filename pre-allocated array for the model + * filename + * + */ +void parse_command_line(int argc, char **argv, char *input_filename, + char *output_filename, char *model_filename) +{ + int i; + + MSVMMAJ_OUTPUT_FILE = stdout; + + for (i=1; i<argc; i++) { + if (argv[i][0] != '-') break; + if (++i >= argc) + exit_with_help(); + switch (argv[i-1][1]) { + case 'o': + strcpy(output_filename, argv[i]); + break; + case 'q': + MSVMMAJ_OUTPUT_FILE = NULL; + i--; + break; + default: + fprintf(stderr, "Unknown option: -%c\n", + argv[i-1][1]); + exit_with_help(); + } + } + + if (i >= argc) + exit_with_help(); + + strcpy(input_filename, argv[i]); + i++; + strcpy(model_filename, argv[i]); +} diff --git a/src/strutil.c b/src/strutil.c new file mode 100644 index 0000000..ca4181f --- /dev/null +++ b/src/strutil.c @@ -0,0 +1,176 @@ +/** + * @file strutil.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Utility functions for dealing with strings + * + * @details + * This file contains functions for reading files, reading strings from a + * format and checking start and ends of strings. + */ + +#include "strutil.h" + +/** + * @brief Check if a string starts with a prefix + * + * @param[in] str string + * @param[in] pre prefix + * @returns boolean, true if string starts with prefix, false + * otherwise + */ +bool str_startswith(const char *str, const char *pre) +{ + size_t lenpre = strlen(pre), + lenstr = strlen(str); + return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0; +} + +/** + * @brief Check if a string ends with a suffix + * + * @param[in] str string + * @param[in] suf suffix + * @returns boolean, true if string ends with suffix, false + * otherwise + */ +bool str_endswith(const char *str, const char *suf) +{ + size_t lensuf = strlen(suf), + lenstr = strlen(str); + return lenstr < lensuf ? false : strncmp(str + lenstr - lensuf, suf, + lensuf) == 0; +} + +/** + * @brief Move to next line in file + * + * @param[in] fid File opened for reading + * @param[in] filename name of the file pointed to by fid + */ +void next_line(FILE *fid, char *filename) +{ + char buffer[MAX_LINE_LENGTH]; + get_line(fid, filename, buffer); +} + +/** + * @brief Read line to buffer + * + * @param[in] fid File opened for reading + * @param[in] filename name of the file + * @param[in,out] buffer allocated buffer to read to + */ +void get_line(FILE *fid, char *filename, char *buffer) +{ + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading file %s\n", filename); + exit(1); + } +} + +/** + * @brief Read a double from file following a format + * + * @param[in] fid File opened for reading + * @param[in] filename Name of the file + * @param[in] fmt Format containing a float format + * @returns value read (if any) + */ +double get_fmt_double(FILE *fid, char *filename, const char *fmt) +{ + char buffer[MAX_LINE_LENGTH]; + double value; + + get_line(fid, filename, buffer); + sscanf(buffer, fmt, &value); + return value; +} + +/** + * @brief Read a long integer from file following a format + * + * @param[in] fid File opened for reading + * @param[in] filename Name of the file + * @param[in] fmt Format containing a long integer format + * @returns value read (if any) + */ +long get_fmt_long(FILE *fid, char *filename, const char *fmt) +{ + char buffer[MAX_LINE_LENGTH]; + long value; + + get_line(fid, filename, buffer); + sscanf(buffer, fmt, &value); + return value; +} + +/** + * @brief Read all doubles in a given buffer + * + * @details + * This function is used to read a line of doubles from a buffer. All the + * doubles found are stored in a pre-allocated array. + * + * @param[in] buffer a string buffer + * @param[in] offset an offset of the string to start looking for + * doubles + * @param[in] all_doubles pre-allocated array of doubles (should be large + * enough) + * @returns number of doubles read + */ +long all_doubles_str(char *buffer, long offset, double *all_doubles) +{ + double value; + long i = 0; + char *start, *end; + + start = buffer + offset; + while (true) { + value = strtod(start, &end); + if (start != end) { + all_doubles[i] = value; + i++; + } else + break; + start = end; + end = NULL; + } + + return i; +} + +/** + * @brief Read all longs in a given buffer + * + * @details + * This function is used to read a line of longs from a buffer. All the + * longs found are stored in a pre-allocated array. + * + * @param[in] buffer a string buffer + * @param[in] offset an offset of the string to start looking for + * longs + * @param[in] all_longs pre-allocated array of longs (should be large + * enough) + * @returns number of longs read + */ +long all_longs_str(char *buffer, long offset, long *all_longs) +{ + long value; + long i = 0; + char *start, *end; + + start = buffer + offset; + while (true) { + value = strtol(start, &end, 10); + if (start != end) { + all_longs[i] = value; + i++; + } else + break; + start = end; + end = NULL; + } + + return i; +} diff --git a/src/timer.c b/src/timer.c new file mode 100644 index 0000000..254f3da --- /dev/null +++ b/src/timer.c @@ -0,0 +1,73 @@ +/** + * @file timer.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Utility functions relating to time + * + * @details + * This file contains a simple function for calculating the time in seconds + * elapsed between two clock() calls. It also contains a function for + * generating a string of the current time, used in writing output files. + */ + +#include <time.h> + +#include "timer.h" + +/** + * @brief Calculate the time between two clocks + * + * @param[in] s_time starting time + * @param[in] e_time end time + * @returns time elapsed in seconds + */ +double elapsed_time(clock_t s_time, clock_t e_time) +{ + return ((double) (e_time - s_time))/((double) CLOCKS_PER_SEC); +} + +/** + * @brief Get time string with UTC offset + * + * @details + * Create a string for the current system time. Include an offset of UTC for + * consistency. The format of the generated string is "DDD MMM D HH:MM:SS + * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)". + * + * @param[in,out] buffer allocated string buffer, on exit contains + * formatted string + * + */ +void get_time_string(char *buffer) +{ + int diff, hours, minutes; + char timestr[MAX_LINE_LENGTH]; + time_t current_time, lt, gt; + struct tm *lclt; + + // get current time (in epoch) + current_time = time(NULL); + if (current_time == ((time_t)-1)) { + fprintf(stderr, "Failed to compute the current time.\n"); + return; + } + + // convert time to local time and create a string + lclt = localtime(¤t_time); + strftime(timestr, MAX_LINE_LENGTH, "%c", lclt); + if (timestr == NULL) { + fprintf(stderr, "Failed to convert time to string.\n"); + return; + } + + // calculate the UTC offset including DST + lt = mktime(localtime(¤t_time)); + gt = mktime(gmtime(¤t_time)); + diff = -difftime(gt, lt); + hours = (diff/3600); + minutes = (diff%3600)/60; + if (lclt->tm_isdst == 1) + hours++; + + sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes); +} diff --git a/src/trainMSVMMaj.c b/src/trainMSVMMaj.c new file mode 100644 index 0000000..9f71325 --- /dev/null +++ b/src/trainMSVMMaj.c @@ -0,0 +1,250 @@ +/** + * @file trainMSVMMaj.c + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Command line interface for training a single model with MSVMMaj + * + * @details + * This is a command line program for training a single model on a given + * dataset. To run a grid search over a number of parameter configurations, + * see trainMSVMMajdataset.c. + * + */ + +#include <time.h> +#include <math.h> + +#include "msvmmaj_kernel.h" +#include "libMSVMMaj.h" +#include "msvmmaj.h" +#include "msvmmaj_io.h" +#include "msvmmaj_init.h" +#include "msvmmaj_train.h" +#include "util.h" + +#define MINARGS 2 + +extern FILE *MSVMMAJ_OUTPUT_FILE; + +// function declarations +void exit_with_help(); +void parse_command_line(int argc, char **argv, struct MajModel *model, + char *input_filename, char *output_filename, char *model_filename); + +/** + * @brief Help function + */ +void exit_with_help() +{ + printf("This is MSVMMaj, version %1.1f\n\n", VERSION); + printf("Usage: trainMSVMMaj [options] training_data_file\n"); + printf("Options:\n"); + printf("-c coef : coefficient for the polynomial and sigmoid kernel\n"); + printf("-d degree : degree for the polynomial kernel\n"); + printf("-e epsilon : set the value of the stopping criterion\n"); + printf("-g gamma : parameter for the rbf, polynomial or sigmoid " + "kernel\n"); + printf("-h | -help : print this help.\n"); + printf("-k kappa : set the value of kappa used in the Huber hinge\n"); + printf("-l lambda : set the value of lambda (lambda > 0)\n"); + printf("-m model_file : use previous model as seed for W and t\n"); + printf("-o output_file : write output to file\n"); + printf("-p p-value : set the value of p in the lp norm " + "(1.0 <= p <= 2.0)\n"); + printf("-q : quiet mode (no output)\n"); + printf("-r rho : choose the weigth specification (1 = unit, 2 = " + "group)\n"); + printf("-t type: kerneltype (LINEAR=0, POLY=1, RBF=2, SIGMOID=3)\n"); + printf("-u use_cholesky: use cholesky decomposition when using " + "kernels (0 = false, 1 = true). Default 0.\n"); + + exit(0); +} + +/** + * @brief Main interface function for trainMSVMMaj + * + * @details + * Main interface for the command line program. A given dataset file is read + * and a MSVMMaj model is trained on this data. By default the progress of the + * computations are written to stdout. See for full options of the program the + * help function. + * + * @param[in] argc number of command line arguments + * @param[in] argv array of command line arguments + * + */ +int main(int argc, char **argv) +{ + char input_filename[MAX_LINE_LENGTH]; + char model_filename[MAX_LINE_LENGTH]; + char output_filename[MAX_LINE_LENGTH]; + + struct MajModel *model = msvmmaj_init_model(); + struct MajData *data = msvmmaj_init_data(); + + if (argc < MINARGS || msvmmaj_check_argv(argc, argv, "-help") + || msvmmaj_check_argv_eq(argc, argv, "-h") ) + exit_with_help(); + parse_command_line(argc, argv, model, input_filename, + output_filename, model_filename); + + // read data file + msvmmaj_read_data(data, input_filename); + + // copy dataset parameters to model + model->n = data->n; + model->m = data->m; + model->K = data->K; + model->data_file = input_filename; + + // initialize kernel (if necessary) + msvmmaj_make_kernel(model, data); + + // allocate model and initialize weights + msvmmaj_allocate_model(model); + msvmmaj_initialize_weights(data, model); + + // seed the random number generator (only place in programs is in + // command line interfaces) + srand(time(NULL)); + + if (msvmmaj_check_argv_eq(argc, argv, "-m")) { + struct MajModel *seed_model = msvmmaj_init_model(); + msvmmaj_read_model(seed_model, model_filename); + msvmmaj_seed_model_V(seed_model, model); + msvmmaj_free_model(seed_model); + } else { + msvmmaj_seed_model_V(NULL, model); + } + + // start training + msvmmaj_optimize(model, data); + + // write_model to file + if (msvmmaj_check_argv_eq(argc, argv, "-o")) { + msvmmaj_write_model(model, output_filename); + note("Output written to %s\n", output_filename); + } + + // free model and data + msvmmaj_free_model(model); + msvmmaj_free_data(data); + + return 0; +} + +/** + * @brief Parse command line arguments + * + * @details + * Process the command line arguments for the model parameters, and record + * them in the specified MajModel. An input filename for the dataset is read + * and if specified an output filename and a model filename for the seed + * model. + * + * @param[in] argc number of command line arguments + * @param[in] argv array of command line arguments + * @param[in] model initialized model + * @param[in] input_filename pre-allocated buffer for the input + * filename + * @param[in] output_filename pre-allocated buffer for the output + * filename + * @param[in] model_filename pre-allocated buffer for the model + * filename + * + */ +void parse_command_line(int argc, char **argv, struct MajModel *model, + char *input_filename, char *output_filename, char *model_filename) +{ + int i, tmp; + double gamma = 1.0, + degree = 2.0, + coef = 0.0; + + MSVMMAJ_OUTPUT_FILE = stdout; + + // parse options + for (i=1; i<argc; i++) { + if (argv[i][0] != '-') break; + if (++i>=argc) { + exit_with_help(); + } + switch (argv[i-1][1]) { + case 'c': + coef = atof(argv[i]); + break; + case 'd': + degree = atof(argv[i]); + break; + case 'e': + model->epsilon = atof(argv[i]); + break; + case 'g': + gamma = atof(argv[i]); + break; + case 'k': + model->kappa = atof(argv[i]); + break; + case 'l': + model->lambda = atof(argv[i]); + break; + case 'm': + strcpy(model_filename, argv[i]); + break; + case 'o': + strcpy(output_filename, argv[i]); + break; + case 'p': + model->p = atof(argv[i]); + break; + case 'r': + model->weight_idx = atoi(argv[i]); + break; + case 't': + model->kerneltype = atoi(argv[i]); + break; + case 'u': + tmp = atoi(argv[i]); + if (!(tmp == 1 || tmp == 0)) + fprintf(stderr, "Unknown value %i for" + " use_cholesky", tmp); + model->use_cholesky = (tmp == 1) ? true : false; + break; + case 'q': + MSVMMAJ_OUTPUT_FILE = NULL; + i--; + break; + default: + fprintf(stderr, "Unknown option: -%c\n", + argv[i-1][1]); + exit_with_help(); + } + } + + // read input filename + if (i >= argc) + exit_with_help(); + + strcpy(input_filename, argv[i]); + + // set kernel parameters + switch (model->kerneltype) { + case K_LINEAR: + break; + case K_POLY: + model->kernelparam = Calloc(double, 3); + model->kernelparam[0] = gamma; + model->kernelparam[1] = coef; + model->kernelparam[2] = degree; + break; + case K_RBF: + model->kernelparam = Calloc(double, 1); + model->kernelparam[0] = gamma; + break; + case K_SIGMOID: + model->kernelparam = Calloc(double, 1); + model->kernelparam[0] = gamma; + model->kernelparam[1] = coef; + } +} diff --git a/src/trainMSVMMajdataset.c b/src/trainMSVMMajdataset.c new file mode 100644 index 0000000..e9f8b8e --- /dev/null +++ b/src/trainMSVMMajdataset.c @@ -0,0 +1,322 @@ +/** + * @file trainMSVMMajdataset.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Command line interface for the grid search program + * + * @details + * This is a command line interface to the parameter grid search functionality + * of the algorithm. The grid search is specified in a separate file, thereby + * reducing the number of command line arguments. See + * read_training_from_file() for documentation on the training file. + * + * The program runs a grid search as specified in the training file. If + * desired the grid search can incorporate consistency checks to find the + * configuration among the best configurations which scores consistently high. + * All output is written to stdout, unless the quiet mode is specified. + * + * For further usage information, see the program help function. + * + */ + +#include <time.h> + +#include "crossval.h" +#include "msvmmaj.h" +#include "msvmmaj_io.h" +#include "msvmmaj_init.h" +#include "msvmmaj_pred.h" +#include "msvmmaj_train.h" +#include "msvmmaj_train_dataset.h" +#include "strutil.h" +#include "util.h" + +#define MINARGS 2 + +extern FILE *MSVMMAJ_OUTPUT_FILE; + +// function declarations +void exit_with_help(); +void parse_command_line(int argc, char **argv, char *input_filename); +void read_training_from_file(char *input_filename, struct Training *training); + +/** + * @brief Help function + */ +void exit_with_help() +{ + printf("This is MSVMMaj, version %1.1f\n\n", VERSION); + printf("Usage: trainMSVMMajdataset [options] training_file\n"); + printf("Options:\n"); + printf("-h | -help : print this help.\n"); + printf("-q : quiet mode (no output)\n"); + + exit(0); +} + +/** + * @brief Main interface function for trainMSVMMajdataset + * + * @details + * Main interface for the command line program. A given training file which + * specifies a grid search over a single dataset is read. From this, a Queue + * is created containing all Task instances that need to be performed in the + * search. Depending on the type of dataset, either cross validation or + * train/test split training is performed for all tasks. If specified, + * consistency repeats are done at the end of the grid search. Note that + * currently no output is produced other than what is written to stdout. + * + * @param[in] argc number of command line arguments + * @param[in] argv array of command line arguments + * + */ +int main(int argc, char **argv) +{ + char input_filename[MAX_LINE_LENGTH]; + + struct Training *training = Malloc(struct Training, 1); + struct MajData *train_data = Malloc(struct MajData, 1); + struct MajData *test_data = Malloc(struct MajData, 1); + + if (argc < MINARGS || msvmmaj_check_argv(argc, argv, "-help") + || msvmmaj_check_argv_eq(argc, argv, "-h") ) + exit_with_help(); + parse_command_line(argc, argv, input_filename); + + training->repeats = 0; + note("Reading training file\n"); + read_training_from_file(input_filename, training); + + note("Reading data from %s\n", training->train_data_file); + msvmmaj_read_data(train_data, training->train_data_file); + if (training->traintype == TT) { + note("Reading data from %s\n", training->test_data_file); + msvmmaj_read_data(test_data, training->test_data_file); + } + + note("Creating queue\n"); + struct Queue *q = Malloc(struct Queue, 1); + make_queue(training, q, train_data, test_data); + + srand(time(NULL)); + + note("Starting training\n"); + if (training->traintype == TT) + start_training_tt(q); + else + start_training_cv(q); + note("Training finished\n"); + + if (training->repeats > 0) { + consistency_repeats(q, training->repeats, training->traintype); + } + + free_queue(q); + free(training); + msvmmaj_free_data(train_data); + msvmmaj_free_data(test_data); + + note("Done.\n"); + return 0; +} + +/** + * @brief Parse command line arguments + * + * @details + * Few arguments can be supplied to the command line. Only quiet mode can be + * specified, or help can be requested. The filename of the training file is + * read from the arguments. Parsing of the training file is done separately in + * read_training_from_file(). + * + * @param[in] argc number of command line arguments + * @param[in] argv array of command line arguments + * @param[in] input_filename pre-allocated buffer for the training + * filename. + * + */ +void parse_command_line(int argc, char **argv, char *input_filename) +{ + int i; + + MSVMMAJ_OUTPUT_FILE = stdout; + + for (i=1; i<argc; i++) { + if (argv[i][0] != '-') break; + if (++i>=argc) + exit_with_help(); + switch (argv[i-1][1]) { + case 'q': + MSVMMAJ_OUTPUT_FILE = NULL; + i--; + break; + default: + fprintf(stderr, "Unknown option: -%c\n", + argv[i-1][1]); + exit_with_help(); + } + } + + if (i >= argc) + exit_with_help(); + + strcpy(input_filename, argv[i]); +} + +/** + * @brief Read the Training struct from file + * + * @details + * Read the Training struct from a file. The training file follows a specific + * format specified in @ref spec_training_file. + * + * Commonly used string functions in this function are all_doubles_str() and + * all_longs_str(). + * + * @param[in] input_filename filename of the training file + * @param[in] training Training structure to place the parsed + * parameter grid. + * + */ +void read_training_from_file(char *input_filename, struct Training *training) +{ + long i, nr = 0; + FILE *fid; + char buffer[MAX_LINE_LENGTH]; + char train_filename[MAX_LINE_LENGTH]; + char test_filename[MAX_LINE_LENGTH]; + double *params = Calloc(double, MAX_LINE_LENGTH); + long *lparams = Calloc(long, MAX_LINE_LENGTH); + + fid = fopen(input_filename, "r"); + if (fid == NULL) { + fprintf(stderr, "Error opening training file %s\n", + input_filename); + exit(1); + } + training->traintype = CV; + while ( fgets(buffer, MAX_LINE_LENGTH, fid) != NULL ) { + Memset(params, double, MAX_LINE_LENGTH); + Memset(lparams, long, MAX_LINE_LENGTH); + if (str_startswith(buffer, "train:")) { + sscanf(buffer, "train: %s\n", train_filename); + training->train_data_file = Calloc(char, + MAX_LINE_LENGTH); + strcpy(training->train_data_file, train_filename); + } else if (str_startswith(buffer, "test:")) { + sscanf(buffer, "test: %s\n", test_filename); + training->test_data_file = Calloc(char, + MAX_LINE_LENGTH); + strcpy(training->test_data_file, test_filename); + training->traintype = TT; + } else if (str_startswith(buffer, "p:")) { + nr = all_doubles_str(buffer, 2, params); + training->ps = Calloc(double, nr); + for (i=0; i<nr; i++) + training->ps[i] = params[i]; + training->Np = nr; + } else if (str_startswith(buffer, "lambda:")) { + nr = all_doubles_str(buffer, 7, params); + training->lambdas = Calloc(double, nr); + for (i=0; i<nr; i++) + training->lambdas[i] = params[i]; + training->Nl = nr; + } else if (str_startswith(buffer, "kappa:")) { + nr = all_doubles_str(buffer, 6, params); + training->kappas = Calloc(double, nr); + for (i=0; i<nr; i++) + training->kappas[i] = params[i]; + training->Nk = nr; + } else if (str_startswith(buffer, "epsilon:")) { + nr = all_doubles_str(buffer, 8, params); + training->epsilons = Calloc(double, nr); + for (i=0; i<nr; i++) + training->epsilons[i] = params[i]; + training->Ne = nr; + } else if (str_startswith(buffer, "weight:")) { + nr = all_longs_str(buffer, 7, lparams); + training->weight_idxs = Calloc(int, nr); + for (i=0; i<nr; i++) + training->weight_idxs[i] = lparams[i]; + training->Nw = nr; + } else if (str_startswith(buffer, "folds:")) { + nr = all_longs_str(buffer, 6, lparams); + training->folds = lparams[0]; + if (nr > 1) + fprintf(stderr, "Field \"folds\" only takes " + "one value. Additional " + "fields are ignored.\n"); + } else if (str_startswith(buffer, "repeats:")) { + nr = all_longs_str(buffer, 8, lparams); + training->repeats = lparams[0]; + if (nr > 1) + fprintf(stderr, "Field \"repeats\" only " + "takes one value. Additional " + "fields are ignored.\n"); + } else if (str_startswith(buffer, "kernel:")) { + nr = all_longs_str(buffer, 7, lparams); + if (nr > 1) + fprintf(stderr, "Field \"kernel\" only takes " + "one value. Additional " + "fields are ignored.\n"); + switch (lparams[0]) { + case 0: + training->kerneltype = K_LINEAR; + break; + case 1: + training->kerneltype = K_POLY; + break; + case 2: + training->kerneltype = K_RBF; + break; + case 3: + training->kerneltype = K_SIGMOID; + break; + } + } else if (str_startswith(buffer, "gamma:")) { + nr = all_doubles_str(buffer, 6, params); + if (training->kerneltype == K_LINEAR) { + fprintf(stderr, "Field \"gamma\" ignored, " + "linear kernel is used.\n"); + training->Ng = 0; + break; + } + training->gammas = Calloc(double, nr); + for (i=0; i<nr; i++) + training->gammas[i] = params[i]; + training->Ng = nr; + } else if (str_startswith(buffer, "coef:")) { + nr = all_doubles_str(buffer, 5, params); + if (training->kerneltype == K_LINEAR || + training->kerneltype == K_RBF) { + fprintf(stderr, "Field \"coef\" ignored with" + "specified kernel.\n"); + training->Nc = 0; + break; + } + training->coefs = Calloc(double, nr); + for (i=0; i<nr; i++) + training->coefs[i] = params[i]; + training->Nc = nr; + } else if (str_startswith(buffer, "degree:")) { + nr = all_doubles_str(buffer, 7, params); + if (training->kerneltype != K_POLY) { + fprintf(stderr, "Field \"degree\" ignored " + "with specified kernel.\n"); + training->Nd = 0; + break; + } + training->degrees = Calloc(double, nr); + for (i=0; i<nr; i++) + training->degrees[i] = params[i]; + training->Nd = nr; + } else { + fprintf(stderr, "Cannot find any parameters on line: " + "%s\n", buffer); + } + } + + free(params); + free(lparams); + fclose(fid); +} diff --git a/src/util.c b/src/util.c new file mode 100644 index 0000000..e76a074 --- /dev/null +++ b/src/util.c @@ -0,0 +1,128 @@ +/** + * @file util.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Utility functions + * + * @details + * This file contains several utility functions for coordinating input and + * output of data and model files. It also contains string functions. + * + */ +#include <stdarg.h> + +#include "util.h" + +FILE *MSVMMAJ_OUTPUT_FILE; ///< The #MSVMMAJ_OUTPUT_FILE specifies the + ///< output stream to which all output is + ///< written. This is done through the + ///< internal (!) + ///< function msvmmaj_print_string(). The + ///< advantage of using a global output + ///< stream variable is that the output can + ///< temporarily be suppressed by importing + ///< this variable through @c extern and + ///< (temporarily) setting it to NULL. + +/** + * @brief Check if any command line arguments contain string + * + * @details + * Check if any of a given array of command line arguments contains a given + * string. If the string is found, the index of the string in argv is + * returned. If the string is not found, 0 is returned. + * + * This function is copied from MSVMpack/libMSVM.c. + * + * @param[in] argc number of command line arguments + * @param[in] argv command line arguments + * @param[in] str string to find in the arguments + * @returns index of the string in the arguments if found, 0 + * otherwise + */ +int msvmmaj_check_argv(int argc, char **argv, char *str) +{ + int i; + int arg_str = 0; + for (i=1; i<argc; i++) + if (strstr(argv[i], str) != NULL) { + arg_str = i; + break; + } + + return arg_str; +} + +/** + * @brief Check if a command line argument equals a string + * + * @details + * Check if any of the command line arguments is exactly equal to a given + * string. If so, return the index of the corresponding command line argument. + * If not, return 0. + * + * This function is copied from MSVMpack/libMSVM.c + * + * @param[in] argc number of command line arguments + * @param[in] argv command line arguments + * @param[in] str string to find in the arguments + * @returns index of the command line argument that corresponds to + * the string, 0 if none matches. + */ +int msvmmaj_check_argv_eq(int argc, char **argv, char *str) +{ + int i; + int arg_str = 0; + for (i=1; i<argc; i++) + if (strcmp(argv[i], str) == 0) { + arg_str = i; + break; + } + + return arg_str; +} + + +/** + * @brief Print a given string to the specified output stream + * + * @details + * This function is used to print a given string to the output stream + * specified by #MSVMMAJ_OUTPUT_FILE. The stream is flushed after the string + * is written to the stream. If #MSVMMAJ_OUTPUT_FILE is NULL, nothing is + * written. Note that this function is only used by note(), it should never be + * used directly. + * + * @param[in] s string to write to the stream + * + */ +static void msvmmaj_print_string(const char *s) +{ + if (MSVMMAJ_OUTPUT_FILE != NULL) { + fputs(s, MSVMMAJ_OUTPUT_FILE); + fflush(MSVMMAJ_OUTPUT_FILE); + } +} + +/** + * @brief Parse a formatted string and write to the output stream + * + * @details + * This function is a replacement of fprintf(), such that the output stream + * does not have to be specified at each function call. The functionality is + * exactly the same however. Writing the formatted string to the output stream + * is handled by msvmmaj_print_string(). + * + * @param[in] fmt String format + * @param[in] ... variable argument list for the string format + * + */ +void note(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*msvmmaj_print_string)(buf); +} |
