major refactor of the code

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2016-05-16 18:47:09 +0200
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2016-05-16 18:47:09 +0200
commit: 044dc5a93c33d7aa4c9c98a626890c16446a56fc (patch)
tree: 23cc17a595d36a35ad9cb50e3ab18c2956b5f65c /src/gensvm_cv_util.c
parent: Move includes to header (diff)
download: gensvm-044dc5a93c33d7aa4c9c98a626890c16446a56fc.tar.gz
gensvm-044dc5a93c33d7aa4c9c98a626890c16446a56fc.zip
1 files changed, 141 insertions, 0 deletions
diff --git a/src/gensvm_cv_util.c b/src/gensvm_cv_util.c
new file mode 100644
index 0000000..d9cde09
--- /dev/null
+++ b/src/gensvm_cv_util.c
@@ -0,0 +1,141 @@
+/**
+ * @file gensvm_cv_util.c
+ * @author Gertjan van den Burg
+ * @date January 7, 2014
+ * @brief Functions for cross validation
+ *
+ * @details
+ * This file contains functions for performing cross validation. The function
+ * gensvm_make_cv_split() creates a cross validation vector for non-stratified
+ * cross validation. The function gensvm_get_tt_split() creates a train and
+ * test dataset from a given dataset and a pre-determined CV partition vector.
+ * See individual function documentation for details.
+ *
+ */
+
+#include "gensvm_cv_util.h"
+
+/**
+ * @brief Create a cross validation split vector
+ *
+ * @details
+ * Fills the pre-allocated vector cv_idx of length N with a fold index in
+ * [0, folds) for every instance. Each fold receives either
+ * @f$ \lfloor N / folds \rfloor @f$ or @f$ \lceil N / folds \rceil @f$
+ * instances. Randomly drawn instances are first dealt round-robin to the
+ * folds; since 0 doubles as the "unassigned" marker, fold 0 temporarily holds
+ * its own share plus the @f$ N \% folds @f$ leftovers, which are then spread
+ * over the first @f$ N \% folds @f$ folds. Indices are drawn with rand().
+ * If folds is not positive, every instance is placed in fold 0.
+ *
+ * @param[in] 		N 	number of instances
+ * @param[in] 		folds 	number of folds
+ * @param[in,out] 	cv_idx 	array of size N which contains the fold index
+ * 				for each observation on exit
+ */
+void gensvm_make_cv_split(long N, long folds, long *cv_idx)
+{
+	long i, j, idx;
+
+	for (i=0; i<N; i++)
+		cv_idx[i] = 0;
+
+	// a fold count of zero would divide by zero below; negative is meaningless
+	if (folds <= 0)
+		return;
+
+	long big_folds = N%folds;
+	long small_fold_size = N/folds;
+
+	// draws made while j == 0 write 0 and thus leave the instance unassigned
+	j = 0;
+	for (i=0; i<small_fold_size*folds; i++) {
+		do {
+			idx = rand()%N;
+		} while (cv_idx[idx] != 0);
+		cv_idx[idx] = j;
+		j = (j+1)%folds;
+	}
+
+	// move one leftover instance from fold 0 into each of the folds
+	// 1..big_folds-1; the first leftover stays in fold 0 (i starts at 0)
+	for (i=0, j=0; i<big_folds; j++)
+		if (cv_idx[j] == 0)
+			cv_idx[j] = i++;
+}
+
+
+/**
+ * @brief Create train and test datasets for a CV split
+ *
+ * @details
+ * Instances of the full dataset whose entry in cv_idx equals fold_idx are
+ * copied to the test dataset, all other instances to the training dataset.
+ * The dimensions n, m and K of both output structures are set, and their y
+ * and RAW arrays are newly allocated with Calloc; the y and RAW pointers held
+ * on entry are overwritten without being freed. The relative order of the
+ * instances is preserved. On exit Z aliases RAW in both output structures.
+ *
+ * @param[in] 		full_data 	a GenData structure for the entire
+ * 					dataset
+ * @param[in,out] 	train_data 	an initialized GenData structure which
+ * 					on exit contains the training dataset
+ * @param[in,out] 	test_data 	an initialized GenData structure which
+ * 					on exit contains the test dataset
+ * @param[in] 		cv_idx 		a vector of cv partitions created by
+ * 					gensvm_make_cv_split()
+ * @param[in] 		fold_idx 	index of the fold which becomes the
+ * 					test dataset
+ */
+void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data,
+		struct GenData *test_data, long *cv_idx, long fold_idx)
+{
+	long obs, col, row;
+	long n_test = 0, n_train;
+	long test_row = 0, train_row = 0;
+	long n_obs = full_data->n;
+	long dims = full_data->m;
+	long n_class = full_data->K;
+	double value;
+	struct GenData *dest;
+
+	// count the instances that belong to the held-out fold
+	for (obs=0; obs<n_obs; obs++) {
+		if (cv_idx[obs] == fold_idx)
+			n_test++;
+	}
+	n_train = n_obs - n_test;
+
+	// both parts inherit the dimensions of the full dataset
+	test_data->n = n_test;
+	train_data->n = n_train;
+	train_data->K = n_class;
+	test_data->K = n_class;
+	train_data->m = dims;
+	test_data->m = dims;
+
+	train_data->y = Calloc(long, n_train);
+	test_data->y = Calloc(long, n_test);
+	train_data->RAW = Calloc(double, n_train*(dims+1));
+	test_data->RAW = Calloc(double, n_test*(dims+1));
+
+	for (obs=0; obs<n_obs; obs++) {
+		// pick the destination dataset and the next free row in it
+		if (cv_idx[obs] == fold_idx) {
+			dest = test_data;
+			row = test_row++;
+		} else {
+			dest = train_data;
+			row = train_row++;
+		}
+		// copy the label and the dims+1 entries of the RAW row
+		dest->y[row] = full_data->y[obs];
+		for (col=0; col<dims+1; col++) {
+			value = matrix_get(full_data->RAW, dims+1, obs, col);
+			matrix_set(dest->RAW, dims+1, row, col, value);
+		}
+	}
+
+	train_data->Z = train_data->RAW;
+	test_data->Z = test_data->RAW;
+}
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2016-05-16 18:47:09 +0200
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2016-05-16 18:47:09 +0200
commit	044dc5a93c33d7aa4c9c98a626890c16446a56fc (patch)
tree	23cc17a595d36a35ad9cb50e3ab18c2956b5f65c /src/gensvm_cv_util.c
parent	Move includes to header (diff)
download	gensvm-044dc5a93c33d7aa4c9c98a626890c16446a56fc.tar.gz gensvm-044dc5a93c33d7aa4c9c98a626890c16446a56fc.zip