/**
* @file gensvm_cv_util.c
* @author G.J.J. van den Burg
* @date 2014-01-07
* @brief Functions for cross validation
*
* @details
* This file contains functions for performing cross validation. The funtion
* gensvm_make_cv_split() creates a cross validation vector for non-stratified
* cross validation. The function gensvm_get_tt_split() creates a train and
* test dataset from a given dataset and a pre-determined CV partition vector.
* See individual function documentation for details.
*
* @copyright
Copyright 2016, G.J.J. van den Burg.
This file is part of GenSVM.
GenSVM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
GenSVM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GenSVM. If not, see .
*/
#include "gensvm_cv_util.h"
/**
* @brief Create a cross validation split vector
*
* @details
* A pre-allocated vector of length N is created which can be used to define
* cross validation splits. The folds are contain between
* @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$
* instances. An instance is mapped to a partition randomly until all folds
* contain @f$ N \% folds @f$ instances. The zero fold then contains
* @f$ N / folds + N \% folds @f$ instances. These remaining @f$ N \% folds @f$
* instances are then distributed over the first @f$ N \% folds @f$ folds.
*
* @param[in] N number of instances
* @param[in] folds number of folds
* @param[in,out] cv_idx array of size N which contains the fold index
* for each observation on exit
*
*/
void gensvm_make_cv_split(long N, long folds, long *cv_idx)
{
long i, j, idx;
for (i=0; iZ == NULL)
gensvm_get_tt_split_sparse(full_data, train_data, test_data,
cv_idx, fold_idx);
else
gensvm_get_tt_split_dense(full_data, train_data, test_data,
cv_idx, fold_idx);
}
/**
* @brief Create train and test datasets for a CV split with dense data
*
* @details
* Given a GenData structure for the full dataset, a previously created
* cross validation split vector and a fold index, a training and test dataset
* are created. It is assumed here that the data is stored as a dense matrix,
* and that the train and test data should also be stored as a dense matrix.
*
* @sa
* gensvm_get_tt_split_sparse(), gensvm_get_tt_split()
*
* @param[in] full_data a GenData structure for the entire
* dataset
* @param[in,out] train_data an initialized GenData structure which
* on exit contains the training dataset
* @param[in,out] test_data an initialized GenData structure which
* on exit contains the test dataset
* @param[in] cv_idx a vector of cv partitions created by
* gensvm_make_cv_split()
* @param[in] fold_idx index of the fold which becomes the
* test dataset
*/
void gensvm_get_tt_split_dense(struct GenData *full_data,
struct GenData *train_data, struct GenData *test_data,
long *cv_idx, long fold_idx)
{
long i, j, k, l, test_n, train_n;
long n = full_data->n;
long m = full_data->m;
long K = full_data->K;
double value;
test_n = 0;
for (i=0; in = test_n;
train_data->n = train_n;
train_data->K = K;
test_data->K = K;
train_data->m = m;
test_data->m = m;
train_data->y = Calloc(long, train_n);
test_data->y = Calloc(long, test_n);
train_data->RAW = Calloc(double, train_n*(m+1));
test_data->RAW = Calloc(double, test_n*(m+1));
k = 0;
l = 0;
for (i=0; iy[k] = full_data->y[i];
for (j=0; jRAW, m+1, i, j);
matrix_set(test_data->RAW, m+1, k, j, value);
}
k++;
} else {
train_data->y[l] = full_data->y[i];
for (j=0; jRAW, m+1, i, j);
matrix_set(train_data->RAW, m+1, l, j, value);
}
l++;
}
}
train_data->Z = train_data->RAW;
test_data->Z = test_data->RAW;
}
/**
* @brief Create train and test dataset for a CV split with sparse data
*
* @details
* Given a GenData structure for the full dataset, a previously created
* cross validation split vector and a fold index, a training and test dataset
* are created. It is assumed here that the data is stored as a sparse matrix,
* and that the train and test data should also be stored as a sparse matrix.
*
* @sa
* gensvm_get_tt_split_dense(), gensvm_get_tt_split()
*
* @param[in] full_data a GenData structure for the entire
* dataset
* @param[in,out] train_data an initialized GenData structure which
* on exit contains the training dataset
* @param[in,out] test_data an initialized GenData structure which
* on exit contains the test dataset
* @param[in] cv_idx a vector of cv partitions created by
* gensvm_make_cv_split()
* @param[in] fold_idx index of the fold which becomes the
* test dataset
*/
void gensvm_get_tt_split_sparse(struct GenData *full_data,
struct GenData *train_data, struct GenData *test_data,
long *cv_idx, long fold_idx)
{
long i, j, test_n, train_n, train_nnz, test_nnz, row_nnz, jj,
jj_start, jj_end,
tr_nnz_idx = 0,
tr_row_idx = 0,
te_nnz_idx = 0,
te_row_idx = 0;
double value;
// determine number of instances in test and train
test_n = 0;
for (i=0; in; i++)
if (cv_idx[i] == fold_idx)
test_n++;
train_n = full_data->n - test_n;
// set n, m, K variables
train_data->n = train_n;
train_data->m = full_data->m;
train_data->K = full_data->K;
test_data->n = test_n;
test_data->m = full_data->m;
test_data->K = full_data->K;
// allocate outcome
train_data->y = Calloc(long, train_n);
test_data->y = Calloc(long, test_n);
// compute train nnz and test nnz
train_nnz = 0;
test_nnz = 0;
for (i=0; in; i++) {
row_nnz = full_data->spZ->ia[i+1] - full_data->spZ->ia[i];
if (cv_idx[i] == fold_idx) {
test_nnz += row_nnz;
} else {
train_nnz += row_nnz;
}
}
// allocate the train GenSparse
train_data->spZ = gensvm_init_sparse();
test_data->spZ = gensvm_init_sparse();
// set GenSparse variables for train
train_data->spZ->nnz = train_nnz;
train_data->spZ->n_row = train_n;
train_data->spZ->n_col = full_data->m+1;
train_data->spZ->values = Calloc(double, train_nnz);
train_data->spZ->ia = Calloc(long, train_n+1);
train_data->spZ->ja = Calloc(long, train_nnz);
// set GenSparse variables for test
test_data->spZ->nnz = test_nnz;
test_data->spZ->n_row = test_n;
test_data->spZ->n_col = full_data->m+1;
test_data->spZ->values = Calloc(double, test_nnz);
test_data->spZ->ia = Calloc(long, test_n+1);
test_data->spZ->ja = Calloc(long, test_nnz);
tr_nnz_idx = 0;
tr_row_idx = 0;
te_nnz_idx = 0;
te_row_idx = 0;
test_data->spZ->ia[0] = 0;
train_data->spZ->ia[0] = 0;
for (i=0; in; i++) {
jj_start = full_data->spZ->ia[i];
jj_end = full_data->spZ->ia[i+1];
for (jj=jj_start; jjspZ->ja[jj];
value = full_data->spZ->values[jj];
if (cv_idx[i] == fold_idx) {
test_data->spZ->values[te_nnz_idx] = value;
test_data->spZ->ja[te_nnz_idx] = j;
te_nnz_idx++;
} else {
train_data->spZ->values[tr_nnz_idx] = value;
train_data->spZ->ja[tr_nnz_idx] = j;
tr_nnz_idx++;
}
}
if (cv_idx[i] == fold_idx) {
test_data->y[te_row_idx] = full_data->y[i];
test_data->spZ->ia[te_row_idx+1] = te_nnz_idx;
te_row_idx++;
} else {
train_data->y[tr_row_idx] = full_data->y[i];
train_data->spZ->ia[tr_row_idx+1] = tr_nnz_idx;
tr_row_idx++;
}
}
}