aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Gertjan van den Burg <burg@ese.eur.nl>, 2016-10-17 12:09:36 +0200
committer: Gertjan van den Burg <burg@ese.eur.nl>, 2016-10-17 12:09:36 +0200
commit: 6897caced16c862c151b26d7cd19df3a72788154 (patch)
tree: 928fac44666bc3b1ead9eb1584272ae1963c65e6
parent: Create debug function for printing GenSparse structs (diff)
downloadgensvm-6897caced16c862c151b26d7cd19df3a72788154.tar.gz
gensvm-6897caced16c862c151b26d7cd19df3a72788154.zip
Add functionality for cv_util for sparse matrices
-rw-r--r-- include/gensvm_cv_util.h | 6
-rw-r--r-- src/gensvm_cv_util.c | 172
-rw-r--r-- tests/src/test_gensvm_cv_util.c | 171
3 files changed, 342 insertions, 7 deletions
diff --git a/include/gensvm_cv_util.h b/include/gensvm_cv_util.h
index ada727d..ed88479 100644
--- a/include/gensvm_cv_util.h
+++ b/include/gensvm_cv_util.h
@@ -18,5 +18,11 @@
void gensvm_make_cv_split(long N, long folds, long *cv_idx);
void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data,
struct GenData *test_data, long *cv_idx, long fold_idx);
+void gensvm_get_tt_split_dense(struct GenData *full_data,
+ struct GenData *train_data, struct GenData *test_data,
+ long *cv_idx, long fold_idx);
+void gensvm_get_tt_split_sparse(struct GenData *full_data,
+ struct GenData *train_data, struct GenData *test_data,
+ long *cv_idx, long fold_idx);
#endif
diff --git a/src/gensvm_cv_util.c b/src/gensvm_cv_util.c
index d9cde09..d94fb98 100644
--- a/src/gensvm_cv_util.c
+++ b/src/gensvm_cv_util.c
@@ -64,14 +64,51 @@ void gensvm_make_cv_split(long N, long folds, long *cv_idx)
}
}
+/**
+ * @brief Dispatch a CV train/test split to the sparse or dense implementation
+ *
+ * @details
+ * If full_data->Z is NULL the data is taken to be stored in sparse format and
+ * gensvm_get_tt_split_sparse() is called; otherwise the dense version,
+ * gensvm_get_tt_split_dense(), is called. All arguments are passed unchanged.
+ *
+ * @sa
+ * gensvm_get_tt_split_dense(), gensvm_get_tt_split_sparse()
+ *
+ * @param[in] full_data a GenData structure for the entire
+ * dataset
+ * @param[in,out] train_data an initialized GenData structure which
+ * on exit contains the training dataset
+ * @param[in,out] test_data an initialized GenData structure which
+ * on exit contains the test dataset
+ * @param[in] cv_idx a vector of cv partitions created by
+ * gensvm_make_cv_split()
+ * @param[in] fold_idx index of the fold which becomes the
+ * test dataset
+ */
+void gensvm_get_tt_split(struct GenData *full_data,
+ struct GenData *train_data, struct GenData *test_data,
+ long *cv_idx, long fold_idx)
+{
+ if (full_data->Z == NULL) // no dense matrix set: use the sparse version
+ gensvm_get_tt_split_sparse(full_data, train_data, test_data,
+ cv_idx, fold_idx);
+ else
+ gensvm_get_tt_split_dense(full_data, train_data, test_data,
+ cv_idx, fold_idx);
+}
/**
- * @brief Create train and test datasets for a CV split
+ * @brief Create train and test datasets for a CV split with dense data
*
* @details
* Given a GenData structure for the full dataset, a previously created
* cross validation split vector and a fold index, a training and test dataset
- * are created.
+ * are created. It is assumed here that the data is stored as a dense matrix,
+ * and that the train and test data should also be stored as a dense matrix.
+ *
+ * @sa
+ * gensvm_get_tt_split_sparse(), gensvm_get_tt_split()
*
* @param[in] full_data a GenData structure for the entire
* dataset
@@ -84,8 +121,9 @@ void gensvm_make_cv_split(long N, long folds, long *cv_idx)
* @param[in] fold_idx index of the fold which becomes the
* test dataset
*/
-void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data,
- struct GenData *test_data, long *cv_idx, long fold_idx)
+void gensvm_get_tt_split_dense(struct GenData *full_data,
+ struct GenData *train_data, struct GenData *test_data,
+ long *cv_idx, long fold_idx)
{
long i, j, k, l, test_n, train_n;
@@ -139,3 +177,129 @@ void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data,
train_data->Z = train_data->RAW;
test_data->Z = test_data->RAW;
}
+
+
+/**
+ * @brief Create train and test datasets for a CV split with sparse data
+ *
+ * @details
+ * The data is assumed to be stored as a sparse matrix in full_data->spZ. Rows
+ * i with cv_idx[i] == fold_idx are copied to the test set, all other rows to
+ * the training set, in their original order; both outputs are sparse as well.
+ * Only the n, m, K, y and spZ fields of train_data and test_data are written.
+ *
+ * @sa
+ * gensvm_get_tt_split_dense(), gensvm_get_tt_split()
+ *
+ * @param[in] full_data a GenData structure for the entire
+ * dataset
+ * @param[in,out] train_data an initialized GenData structure which
+ * on exit contains the training dataset
+ * @param[in,out] test_data an initialized GenData structure which
+ * on exit contains the test dataset
+ * @param[in] cv_idx a vector of cv partitions created by
+ * gensvm_make_cv_split()
+ * @param[in] fold_idx index of the fold which becomes the
+ * test dataset
+ */
+void gensvm_get_tt_split_sparse(struct GenData *full_data,
+ struct GenData *train_data, struct GenData *test_data,
+ long *cv_idx, long fold_idx)
+{
+ long i, j, test_n, train_n, train_nnz, test_nnz, row_nnz, jj,
+ jj_start, jj_end,
+ tr_nnz_idx = 0, // next free position in the train values/ja arrays
+ tr_row_idx = 0, // next free row in the train y/ia arrays
+ te_nnz_idx = 0, // next free position in the test values/ja arrays
+ te_row_idx = 0; // next free row in the test y/ia arrays
+
+ double value;
+
+ // count the test instances; every remaining instance is a train instance
+ test_n = 0;
+ for (i=0; i<full_data->n; i++)
+ if (cv_idx[i] == fold_idx)
+ test_n++;
+ train_n = full_data->n - test_n;
+
+ // set n, m, K variables (m and K are copied from the full dataset)
+ train_data->n = train_n;
+ train_data->m = full_data->m;
+ train_data->K = full_data->K;
+ test_data->n = test_n;
+ test_data->m = full_data->m;
+ test_data->K = full_data->K;
+
+ // allocate the outcome vectors
+ train_data->y = Calloc(long, train_n);
+ test_data->y = Calloc(long, test_n);
+
+ // compute train nnz and test nnz; row i stores ia[i+1] - ia[i] entries
+ train_nnz = 0;
+ test_nnz = 0;
+ for (i=0; i<full_data->n; i++) {
+ row_nnz = full_data->spZ->ia[i+1] - full_data->spZ->ia[i];
+ if (cv_idx[i] == fold_idx) {
+ test_nnz += row_nnz;
+ } else {
+ train_nnz += row_nnz;
+ }
+ }
+
+ // allocate the train and test GenSparse structs
+ train_data->spZ = gensvm_init_sparse();
+ test_data->spZ = gensvm_init_sparse();
+
+ // set GenSparse variables for train (ia has one entry more than rows)
+ train_data->spZ->nnz = train_nnz;
+ train_data->spZ->n_row = train_n;
+ train_data->spZ->n_col = full_data->m+1;
+ train_data->spZ->values = Calloc(double, train_nnz);
+ train_data->spZ->ia = Calloc(int, train_n+1);
+ train_data->spZ->ja = Calloc(int, train_nnz);
+
+ // set GenSparse variables for test (ia has one entry more than rows)
+ test_data->spZ->nnz = test_nnz;
+ test_data->spZ->n_row = test_n;
+ test_data->spZ->n_col = full_data->m+1;
+ test_data->spZ->values = Calloc(double, test_nnz);
+ test_data->spZ->ia = Calloc(int, test_n+1);
+ test_data->spZ->ja = Calloc(int, test_nnz);
+
+ tr_nnz_idx = 0; // NOTE(review): these four counters are already zeroed at declaration
+ tr_row_idx = 0;
+ te_nnz_idx = 0;
+ te_row_idx = 0;
+
+ test_data->spZ->ia[0] = 0;
+ train_data->spZ->ia[0] = 0;
+ for (i=0; i<full_data->n; i++) {
+ jj_start = full_data->spZ->ia[i]; // first stored entry of row i
+ jj_end = full_data->spZ->ia[i+1]; // one past the last stored entry of row i
+
+ for (jj=jj_start; jj<jj_end; jj++) {
+ j = full_data->spZ->ja[jj];
+ value = full_data->spZ->values[jj];
+
+ if (cv_idx[i] == fold_idx) {
+ test_data->spZ->values[te_nnz_idx] = value;
+ test_data->spZ->ja[te_nnz_idx] = j;
+ te_nnz_idx++;
+ } else {
+ train_data->spZ->values[tr_nnz_idx] = value;
+ train_data->spZ->ja[tr_nnz_idx] = j;
+ tr_nnz_idx++;
+ }
+ }
+
+ if (cv_idx[i] == fold_idx) {
+ test_data->y[te_row_idx] = full_data->y[i];
+ test_data->spZ->ia[te_row_idx+1] = te_nnz_idx; // entries copied so far mark the end of this row
+ te_row_idx++;
+ } else {
+ train_data->y[tr_row_idx] = full_data->y[i];
+ train_data->spZ->ia[tr_row_idx+1] = tr_nnz_idx; // entries copied so far mark the end of this row
+ tr_row_idx++;
+ }
+ }
+}
diff --git a/tests/src/test_gensvm_cv_util.c b/tests/src/test_gensvm_cv_util.c
index 5cbf174..2d6ecb7 100644
--- a/tests/src/test_gensvm_cv_util.c
+++ b/tests/src/test_gensvm_cv_util.c
@@ -90,8 +90,7 @@ char *test_make_cv_split_2()
return NULL;
}
-
-char *test_get_tt_split()
+char *test_get_tt_split_dense()
{
struct GenData *full = gensvm_init_data();
full->K = 3;
@@ -132,6 +131,7 @@ char *test_get_tt_split()
matrix_set(full->RAW, full->m+1, 8, 2, 9.0);
matrix_set(full->RAW, full->m+1, 9, 1, 10.0);
matrix_set(full->RAW, full->m+1, 9, 2, 10.0);
+ full->Z = full->RAW;
long *cv_idx = Calloc(long, full->n);
cv_idx[0] = 1;
@@ -243,13 +243,178 @@ char *test_get_tt_split()
return NULL;
}
+char *test_get_tt_split_sparse()
+{
+ struct GenData *full = gensvm_init_data();
+ full->K = 3;
+ full->n = 10;
+ full->m = 2;
+ full->r = 2;
+
+ full->y = Calloc(long, full->n);
+ full->y[0] = 1;
+ full->y[1] = 2;
+ full->y[2] = 3;
+ full->y[3] = 1;
+ full->y[4] = 2;
+ full->y[5] = 3;
+ full->y[6] = 1;
+ full->y[7] = 2;
+ full->y[8] = 3;
+ full->y[9] = 1;
+
+ full->RAW = Calloc(double, full->n * (full->m+1)); // only columns 1 and 2 are set below
+ matrix_set(full->RAW, full->m+1, 0, 1, 1.0);
+ matrix_set(full->RAW, full->m+1, 0, 2, 1.0);
+ matrix_set(full->RAW, full->m+1, 1, 1, 2.0);
+ matrix_set(full->RAW, full->m+1, 1, 2, 2.0);
+ matrix_set(full->RAW, full->m+1, 2, 1, 3.0);
+ matrix_set(full->RAW, full->m+1, 2, 2, 3.0);
+ matrix_set(full->RAW, full->m+1, 3, 1, 4.0);
+ matrix_set(full->RAW, full->m+1, 3, 2, 4.0);
+ matrix_set(full->RAW, full->m+1, 4, 1, 5.0);
+ matrix_set(full->RAW, full->m+1, 4, 2, 5.0);
+ matrix_set(full->RAW, full->m+1, 5, 1, 6.0);
+ matrix_set(full->RAW, full->m+1, 5, 2, 6.0);
+ matrix_set(full->RAW, full->m+1, 6, 1, 7.0);
+ matrix_set(full->RAW, full->m+1, 6, 2, 7.0);
+ matrix_set(full->RAW, full->m+1, 7, 1, 8.0);
+ matrix_set(full->RAW, full->m+1, 7, 2, 8.0);
+ matrix_set(full->RAW, full->m+1, 8, 1, 9.0);
+ matrix_set(full->RAW, full->m+1, 8, 2, 9.0);
+ matrix_set(full->RAW, full->m+1, 9, 1, 10.0);
+ matrix_set(full->RAW, full->m+1, 9, 2, 10.0);
+ full->Z = full->RAW;
+
+ // convert RAW to a sparse matrix, then drop the dense copy so that Z == NULL
+ full->spZ = gensvm_dense_to_sparse(full->RAW, full->n, full->m+1);
+ free(full->RAW);
+ full->RAW = NULL;
+ full->Z = NULL;
+
+ long *cv_idx = Calloc(long, full->n); // fold label of every row
+ cv_idx[0] = 1;
+ cv_idx[1] = 0;
+ cv_idx[2] = 1;
+ cv_idx[3] = 0;
+ cv_idx[4] = 1;
+ cv_idx[5] = 2;
+ cv_idx[6] = 3;
+ cv_idx[7] = 2;
+ cv_idx[8] = 3;
+ cv_idx[9] = 2;
+
+ struct GenData *train = gensvm_init_data();
+ struct GenData *test = gensvm_init_data();
+
+ // start test code //
+ gensvm_get_tt_split(full, train, test, cv_idx, 0); // fold 0 (rows 1 and 3) becomes the test set
+
+ mu_assert(train->n == 8, "train_n incorrect.");
+ mu_assert(test->n == 2, "test_n incorrect.");
+
+ mu_assert(train->m == 2, "train_m incorrect.");
+ mu_assert(test->m == 2, "test_m incorrect.");
+
+ mu_assert(train->K == 3, "train_K incorrect.");
+ mu_assert(test->K == 3, "test_K incorrect.");
+
+ mu_assert(train->y[0] == 1, "train y incorrect.");
+ mu_assert(train->y[1] == 3, "train y incorrect.");
+ mu_assert(train->y[2] == 2, "train y incorrect.");
+ mu_assert(train->y[3] == 3, "train y incorrect.");
+ mu_assert(train->y[4] == 1, "train y incorrect.");
+ mu_assert(train->y[5] == 2, "train y incorrect.");
+ mu_assert(train->y[6] == 3, "train y incorrect.");
+ mu_assert(train->y[7] == 1, "train y incorrect.");
+
+ mu_assert(test->y[0] == 2, "test y incorrect.");
+ mu_assert(test->y[1] == 1, "test y incorrect.");
+
+ // check the train GenSparse struct (8 rows, 2 stored values expected per row)
+ mu_assert(train->spZ->nnz == 16, "train nnz incorrect");
+ mu_assert(train->spZ->n_row == 8, "train n_row incorrect");
+ mu_assert(train->spZ->n_col == 3, "train n_col incorrect");
+
+ mu_assert(train->spZ->values[0] == 1.0, "Wrong train value at 0");
+ mu_assert(train->spZ->values[1] == 1.0, "Wrong train value at 1");
+ mu_assert(train->spZ->values[2] == 3.0, "Wrong train value at 2");
+ mu_assert(train->spZ->values[3] == 3.0, "Wrong train value at 3");
+ mu_assert(train->spZ->values[4] == 5.0, "Wrong train value at 4");
+ mu_assert(train->spZ->values[5] == 5.0, "Wrong train value at 5");
+ mu_assert(train->spZ->values[6] == 6.0, "Wrong train value at 6");
+ mu_assert(train->spZ->values[7] == 6.0, "Wrong train value at 7");
+ mu_assert(train->spZ->values[8] == 7.0, "Wrong train value at 8");
+ mu_assert(train->spZ->values[9] == 7.0, "Wrong train value at 9");
+ mu_assert(train->spZ->values[10] == 8.0, "Wrong train value at 10");
+ mu_assert(train->spZ->values[11] == 8.0, "Wrong train value at 11");
+ mu_assert(train->spZ->values[12] == 9.0, "Wrong train value at 12");
+ mu_assert(train->spZ->values[13] == 9.0, "Wrong train value at 13");
+ mu_assert(train->spZ->values[14] == 10.0, "Wrong train value at 14");
+ mu_assert(train->spZ->values[15] == 10.0, "Wrong train value at 15");
+
+ mu_assert(train->spZ->ia[0] == 0, "Wrong train ia at 0");
+ mu_assert(train->spZ->ia[1] == 2, "Wrong train ia at 1");
+ mu_assert(train->spZ->ia[2] == 4, "Wrong train ia at 2");
+ mu_assert(train->spZ->ia[3] == 6, "Wrong train ia at 3");
+ mu_assert(train->spZ->ia[4] == 8, "Wrong train ia at 4");
+ mu_assert(train->spZ->ia[5] == 10, "Wrong train ia at 5");
+ mu_assert(train->spZ->ia[6] == 12, "Wrong train ia at 6");
+ mu_assert(train->spZ->ia[7] == 14, "Wrong train ia at 7");
+ mu_assert(train->spZ->ia[8] == 16, "Wrong train ia at 8");
+
+ mu_assert(train->spZ->ja[0] == 1, "Wrong train ja at 0");
+ mu_assert(train->spZ->ja[1] == 2, "Wrong train ja at 1");
+ mu_assert(train->spZ->ja[2] == 1, "Wrong train ja at 2");
+ mu_assert(train->spZ->ja[3] == 2, "Wrong train ja at 3");
+ mu_assert(train->spZ->ja[4] == 1, "Wrong train ja at 4");
+ mu_assert(train->spZ->ja[5] == 2, "Wrong train ja at 5");
+ mu_assert(train->spZ->ja[6] == 1, "Wrong train ja at 6");
+ mu_assert(train->spZ->ja[7] == 2, "Wrong train ja at 7");
+ mu_assert(train->spZ->ja[8] == 1, "Wrong train ja at 8");
+ mu_assert(train->spZ->ja[9] == 2, "Wrong train ja at 9");
+ mu_assert(train->spZ->ja[10] == 1, "Wrong train ja at 10");
+ mu_assert(train->spZ->ja[11] == 2, "Wrong train ja at 11");
+ mu_assert(train->spZ->ja[12] == 1, "Wrong train ja at 12");
+ mu_assert(train->spZ->ja[13] == 2, "Wrong train ja at 13");
+ mu_assert(train->spZ->ja[14] == 1, "Wrong train ja at 14");
+ mu_assert(train->spZ->ja[15] == 2, "Wrong train ja at 15");
+
+ // check the test GenSparse struct (2 rows, 2 stored values expected per row)
+ mu_assert(test->spZ->nnz == 4, "test nnz incorrect");
+ mu_assert(test->spZ->n_row == 2, "test n_row incorrect");
+ mu_assert(test->spZ->n_col == 3, "test n_col incorrect");
+
+ mu_assert(test->spZ->values[0] == 2.0, "Wrong test value at 0");
+ mu_assert(test->spZ->values[1] == 2.0, "Wrong test value at 1");
+ mu_assert(test->spZ->values[2] == 4.0, "Wrong test value at 2");
+ mu_assert(test->spZ->values[3] == 4.0, "Wrong test value at 3");
+
+ mu_assert(test->spZ->ia[0] == 0, "Wrong test ia at 0");
+ mu_assert(test->spZ->ia[1] == 2, "Wrong test ia at 1");
+ mu_assert(test->spZ->ia[2] == 4, "Wrong test ia at 2");
+
+ mu_assert(test->spZ->ja[0] == 1, "Wrong test ja at 0");
+ mu_assert(test->spZ->ja[1] == 2, "Wrong test ja at 1");
+ mu_assert(test->spZ->ja[2] == 1, "Wrong test ja at 2");
+ mu_assert(test->spZ->ja[3] == 2, "Wrong test ja at 3");
+
+ // end test code //
+ gensvm_free_data(full);
+ gensvm_free_data(train);
+ gensvm_free_data(test);
+ free(cv_idx);
+
+ return NULL;
+}
char *all_tests()
{
mu_suite_start();
mu_run_test(test_make_cv_split_1);
mu_run_test(test_make_cv_split_2);
- mu_run_test(test_get_tt_split);
+ mu_run_test(test_get_tt_split_dense); // renamed from test_get_tt_split
+ mu_run_test(test_get_tt_split_sparse); // same split checked on sparse input
return NULL;
}