diff options
Diffstat (limited to 'src/gensvm_cv_util.c')
| -rw-r--r-- | src/gensvm_cv_util.c | 141 |
1 files changed, 141 insertions, 0 deletions
diff --git a/src/gensvm_cv_util.c b/src/gensvm_cv_util.c new file mode 100644 index 0000000..d9cde09 --- /dev/null +++ b/src/gensvm_cv_util.c @@ -0,0 +1,141 @@ +/** + * @file gensvm_cv_util.c + * @author Gertjan van den Burg + * @date January 7, 2014 + * @brief Functions for cross validation + * + * @details + * This file contains functions for performing cross validation. The funtion + * gensvm_make_cv_split() creates a cross validation vector for non-stratified + * cross validation. The function gensvm_get_tt_split() creates a train and + * test dataset from a given dataset and a pre-determined CV partition vector. + * See individual function documentation for details. + * + */ + +#include "gensvm_cv_util.h" + +/** + * @brief Create a cross validation split vector + * + * @details + * A pre-allocated vector of length N is created which can be used to define + * cross validation splits. The folds are contain between + * @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$ + * instances. An instance is mapped to a partition randomly until all folds + * contain @f$ N \% folds @f$ instances. The zero fold then contains + * @f$ N / folds + N \% folds @f$ instances. These remaining @f$ N \% folds @f$ + * instances are then distributed over the first @f$ N \% folds @f$ folds. + * + * @param[in] N number of instances + * @param[in] folds number of folds + * @param[in,out] cv_idx array of size N which contains the fold index + * for each observation on exit + * + */ +void gensvm_make_cv_split(long N, long folds, long *cv_idx) +{ + long i, j, idx; + + for (i=0; i<N; i++) + cv_idx[i] = 0; + + long big_folds = N%folds; + long small_fold_size = N/folds; + + j = 0; + for (i=0; i<small_fold_size*folds; i++) + while (1) { + idx = rand()%N; + if (cv_idx[idx] == 0) { + cv_idx[idx] = j; + j++; + j%=folds; + break; + } + } + j = 0; + i = 0; + while (i < big_folds) { + if (cv_idx[j] == 0) { + cv_idx[j] = i++; + } + j++; + } +} + + +/** + * @brief Create train and test datasets for a CV split + * + * @details + * Given a GenData structure for the full dataset, a previously created + * cross validation split vector and a fold index, a training and test dataset + * are created. + * + * @param[in] full_data a GenData structure for the entire + * dataset + * @param[in,out] train_data an initialized GenData structure which + * on exit contains the training dataset + * @param[in,out] test_data an initialized GenData structure which + * on exit contains the test dataset + * @param[in] cv_idx a vector of cv partitions created by + * gensvm_make_cv_split() + * @param[in] fold_idx index of the fold which becomes the + * test dataset + */ +void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, + struct GenData *test_data, long *cv_idx, long fold_idx) +{ + long i, j, k, l, test_n, train_n; + + long n = full_data->n; + long m = full_data->m; + long K = full_data->K; + + double value; + + test_n = 0; + for (i=0; i<n; i++) + if (cv_idx[i] == fold_idx) + test_n++; + train_n = n - test_n; + + test_data->n = test_n; + train_data->n = train_n; + + train_data->K = K; + test_data->K = K; + + train_data->m = m; + test_data->m = m; + + train_data->y = Calloc(long, train_n); + test_data->y = Calloc(long, test_n); + + train_data->RAW = Calloc(double, train_n*(m+1)); + test_data->RAW = Calloc(double, test_n*(m+1)); + + k = 0; + l = 0; + for (i=0; i<n; i++) { + if (cv_idx[i] == fold_idx) { + test_data->y[k] = full_data->y[i]; + for (j=0; j<m+1; j++) { + value = matrix_get(full_data->RAW, m+1, i, j); + matrix_set(test_data->RAW, m+1, k, j, value); + } + k++; + } else { + train_data->y[l] = full_data->y[i]; + for (j=0; j<m+1; j++) { + value = matrix_get(full_data->RAW, m+1, i, j); + matrix_set(train_data->RAW, m+1, l, j, value); + } + l++; + } + } + + train_data->Z = train_data->RAW; + test_data->Z = test_data->RAW; +} |
