aboutsummaryrefslogtreecommitdiff
path: root/src/gensvm_cv_util.c
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2016-05-16 18:47:09 +0200
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2016-05-16 18:47:09 +0200
commit044dc5a93c33d7aa4c9c98a626890c16446a56fc (patch)
tree23cc17a595d36a35ad9cb50e3ab18c2956b5f65c /src/gensvm_cv_util.c
parentMove includes to header (diff)
downloadgensvm-044dc5a93c33d7aa4c9c98a626890c16446a56fc.tar.gz
gensvm-044dc5a93c33d7aa4c9c98a626890c16446a56fc.zip
major refactor of the code
Diffstat (limited to 'src/gensvm_cv_util.c')
-rw-r--r--src/gensvm_cv_util.c141
1 files changed, 141 insertions, 0 deletions
diff --git a/src/gensvm_cv_util.c b/src/gensvm_cv_util.c
new file mode 100644
index 0000000..d9cde09
--- /dev/null
+++ b/src/gensvm_cv_util.c
@@ -0,0 +1,141 @@
+/**
+ * @file gensvm_cv_util.c
+ * @author Gertjan van den Burg
+ * @date January 7, 2014
+ * @brief Functions for cross validation
+ *
+ * @details
+ * This file contains functions for performing cross validation. The function
+ * gensvm_make_cv_split() creates a cross validation vector for non-stratified
+ * cross validation. The function gensvm_get_tt_split() creates a train and
+ * test dataset from a given dataset and a pre-determined CV partition vector.
+ * See individual function documentation for details.
+ *
+ */
+
+#include "gensvm_cv_util.h"
+
/**
 * @brief Create a cross validation split vector
 *
 * @details
 * Fills the pre-allocated vector cv_idx of length N with a fold index for
 * every instance (non-stratified cross validation). On exit every fold holds
 * either @f$ \lfloor N / folds \rfloor @f$ or @f$ \lceil N / folds \rceil @f$
 * instances.
 *
 * The vector is first set to zero. Next,
 * @f$ folds \cdot \lfloor N / folds \rfloor @f$ times, a random instance that
 * still holds the value 0 is drawn and given the next fold index in
 * round-robin order. The value 0 is both the "not yet assigned" marker and
 * the index of the first fold, so a draw made for fold 0 leaves the vector
 * unchanged. After this phase folds 1 to folds-1 each hold
 * @f$ \lfloor N / folds \rfloor @f$ instances and the remaining
 * @f$ \lfloor N / folds \rfloor + N \% folds @f$ instances hold 0. Finally,
 * scanning from the start of the vector, the first @f$ N \% folds @f$
 * instances that still hold 0 receive the fold indices
 * @f$ 0, 1, \ldots, N \% folds - 1 @f$, so that the first @f$ N \% folds @f$
 * folds hold one instance more than the others.
 *
 * If folds is zero no split can be computed (it would require a division by
 * zero); in that case every entry of cv_idx is left at 0 and the function
 * returns.
 *
 * Random numbers are taken from rand(); this function does not call srand().
 *
 * @note Indices are drawn as rand() % N, which can only produce values up to
 * RAND_MAX. The function therefore assumes N <= RAND_MAX + 1; for larger N
 * the instances beyond RAND_MAX are never drawn and the drawing loop may not
 * terminate.
 *
 * @param[in] 		N 	number of instances
 * @param[in] 		folds 	number of folds
 * @param[in,out] 	cv_idx 	array of size N which contains the fold index
 * 				for each observation on exit
 *
 */
void gensvm_make_cv_split(long N, long folds, long *cv_idx)
{
	for (long i = 0; i < N; i++) {
		cv_idx[i] = 0;
	}

	// N % folds and N / folds below are undefined for folds == 0
	if (folds == 0) {
		return;
	}

	long big_folds = N % folds;
	long small_fold_size = N / folds;

	// Random phase: hand out fold indices round-robin to random instances
	// that are still marked 0. Draws for fold 0 write 0 and thus leave the
	// instance available for a later draw.
	long fold = 0;
	for (long i = 0; i < small_fold_size*folds; i++) {
		long idx;
		do {
			idx = rand() % N;
		} while (cv_idx[idx] != 0);
		cv_idx[idx] = fold;
		fold = (fold + 1) % folds;
	}

	// Remainder phase: the first big_folds instances still marked 0 get
	// fold indices 0 .. big_folds-1. Enough zero entries always remain, so
	// the scan stops before the end of the vector.
	long extra = 0;
	for (long pos = 0; extra < big_folds; pos++) {
		if (cv_idx[pos] == 0) {
			cv_idx[pos] = extra++;
		}
	}
}
+
+
+/**
+ * @brief Create train and test datasets for a CV split
+ *
+ * @details
+ * Given a GenData structure for the full dataset, a previously created
+ * cross validation split vector and a fold index, a training and test dataset
+ * are created.
+ *
+ * @param[in] full_data a GenData structure for the entire
+ * dataset
+ * @param[in,out] train_data an initialized GenData structure which
+ * on exit contains the training dataset
+ * @param[in,out] test_data an initialized GenData structure which
+ * on exit contains the test dataset
+ * @param[in] cv_idx a vector of cv partitions created by
+ * gensvm_make_cv_split()
+ * @param[in] fold_idx index of the fold which becomes the
+ * test dataset
+ */
+void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data,
+ struct GenData *test_data, long *cv_idx, long fold_idx)
+{
+ long i, j, k, l, test_n, train_n;
+
+ long n = full_data->n;
+ long m = full_data->m;
+ long K = full_data->K;
+
+ double value;
+
+ test_n = 0;
+ for (i=0; i<n; i++)
+ if (cv_idx[i] == fold_idx)
+ test_n++;
+ train_n = n - test_n;
+
+ test_data->n = test_n;
+ train_data->n = train_n;
+
+ train_data->K = K;
+ test_data->K = K;
+
+ train_data->m = m;
+ test_data->m = m;
+
+ train_data->y = Calloc(long, train_n);
+ test_data->y = Calloc(long, test_n);
+
+ train_data->RAW = Calloc(double, train_n*(m+1));
+ test_data->RAW = Calloc(double, test_n*(m+1));
+
+ k = 0;
+ l = 0;
+ for (i=0; i<n; i++) {
+ if (cv_idx[i] == fold_idx) {
+ test_data->y[k] = full_data->y[i];
+ for (j=0; j<m+1; j++) {
+ value = matrix_get(full_data->RAW, m+1, i, j);
+ matrix_set(test_data->RAW, m+1, k, j, value);
+ }
+ k++;
+ } else {
+ train_data->y[l] = full_data->y[i];
+ for (j=0; j<m+1; j++) {
+ value = matrix_get(full_data->RAW, m+1, i, j);
+ matrix_set(train_data->RAW, m+1, l, j, value);
+ }
+ l++;
+ }
+ }
+
+ train_data->Z = train_data->RAW;
+ test_data->Z = test_data->RAW;
+}