diff options
| author | Gertjan van den Burg <burg@ese.eur.nl> | 2016-10-17 12:09:36 +0200 |
|---|---|---|
| committer | Gertjan van den Burg <burg@ese.eur.nl> | 2016-10-17 12:09:36 +0200 |
| commit | 6897caced16c862c151b26d7cd19df3a72788154 (patch) | |
| tree | 928fac44666bc3b1ead9eb1584272ae1963c65e6 | |
| parent | Create debug function for printing GenSparse structs (diff) | |
| download | gensvm-6897caced16c862c151b26d7cd19df3a72788154.tar.gz gensvm-6897caced16c862c151b26d7cd19df3a72788154.zip | |
Add functionality for cv_util for sparse matrices
| -rw-r--r-- | include/gensvm_cv_util.h | 6 | ||||
| -rw-r--r-- | src/gensvm_cv_util.c | 172 | ||||
| -rw-r--r-- | tests/src/test_gensvm_cv_util.c | 171 |
3 files changed, 342 insertions, 7 deletions
diff --git a/include/gensvm_cv_util.h b/include/gensvm_cv_util.h index ada727d..ed88479 100644 --- a/include/gensvm_cv_util.h +++ b/include/gensvm_cv_util.h @@ -18,5 +18,11 @@ void gensvm_make_cv_split(long N, long folds, long *cv_idx); void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, struct GenData *test_data, long *cv_idx, long fold_idx); +void gensvm_get_tt_split_dense(struct GenData *full_data, + struct GenData *train_data, struct GenData *test_data, + long *cv_idx, long fold_idx); +void gensvm_get_tt_split_sparse(struct GenData *full_data, + struct GenData *train_data, struct GenData *test_data, + long *cv_idx, long fold_idx); #endif diff --git a/src/gensvm_cv_util.c b/src/gensvm_cv_util.c index d9cde09..d94fb98 100644 --- a/src/gensvm_cv_util.c +++ b/src/gensvm_cv_util.c @@ -64,14 +64,51 @@ void gensvm_make_cv_split(long N, long folds, long *cv_idx) } } +/** + * @brief Wrapper around sparse/dense versions of this function + * + * @details + * This function tests if the data in the full_data structure is stored in a + * dense matrix format or not, and calls gensvm_get_tt_split_dense() or + * gensvm_get_tt_split_sparse() accordingly. + * + * @sa + * gensvm_get_tt_split_dense(), gensvm_get_tt_split_sparse() + * + * @param[in] full_data a GenData structure for the entire + * dataset + * @param[in,out] train_data an initialized GenData structure which + * on exit contains the training dataset + * @param[in,out] test_data an initialized GenData structure which + * on exit contains the test dataset + * @param[in] cv_idx a vector of cv partitions created by + * gensvm_make_cv_split() + * @param[in] fold_idx index of the fold which becomes the + * test dataset + */ +void gensvm_get_tt_split(struct GenData *full_data, + struct GenData *train_data, struct GenData *test_data, + long *cv_idx, long fold_idx) +{ + if (full_data->Z == NULL) + gensvm_get_tt_split_sparse(full_data, train_data, test_data, + cv_idx, fold_idx); + else + gensvm_get_tt_split_dense(full_data, train_data, test_data, + cv_idx, fold_idx); +} /** - * @brief Create train and test datasets for a CV split + * @brief Create train and test datasets for a CV split with dense data * * @details * Given a GenData structure for the full dataset, a previously created * cross validation split vector and a fold index, a training and test dataset - * are created. + * are created. It is assumed here that the data is stored as a dense matrix, + * and that the train and test data should also be stored as a dense matrix. + * + * @sa + * gensvm_get_tt_split_sparse(), gensvm_get_tt_split() * * @param[in] full_data a GenData structure for the entire * dataset @@ -84,8 +121,9 @@ void gensvm_make_cv_split(long N, long folds, long *cv_idx) * @param[in] fold_idx index of the fold which becomes the * test dataset */ -void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, - struct GenData *test_data, long *cv_idx, long fold_idx) +void gensvm_get_tt_split_dense(struct GenData *full_data, + struct GenData *train_data, struct GenData *test_data, + long *cv_idx, long fold_idx) { long i, j, k, l, test_n, train_n; @@ -139,3 +177,129 @@ void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, train_data->Z = train_data->RAW; test_data->Z = test_data->RAW; } + + +/** + * @brief Create train and test dataset for a CV split with sparse data + * + * @details + * Given a GenData structure for the full dataset, a previously created + * cross validation split vector and a fold index, a training and test dataset + * are created. It is assumed here that the data is stored as a sparse matrix, + * and that the train and test data should also be stored as a sparse matrix. + * + * @sa + * gensvm_get_tt_split_dense(), gensvm_get_tt_split() + * + * @param[in] full_data a GenData structure for the entire + * dataset + * @param[in,out] train_data an initialized GenData structure which + * on exit contains the training dataset + * @param[in,out] test_data an initialized GenData structure which + * on exit contains the test dataset + * @param[in] cv_idx a vector of cv partitions created by + * gensvm_make_cv_split() + * @param[in] fold_idx index of the fold which becomes the + * test dataset + */ +void gensvm_get_tt_split_sparse(struct GenData *full_data, + struct GenData *train_data, struct GenData *test_data, + long *cv_idx, long fold_idx) +{ + long i, j, test_n, train_n, train_nnz, test_nnz, row_nnz, jj, + jj_start, jj_end, + tr_nnz_idx = 0, + tr_row_idx = 0, + te_nnz_idx = 0, + te_row_idx = 0; + + double value; + + // determine number of instances in test and train + test_n = 0; + for (i=0; i<full_data->n; i++) + if (cv_idx[i] == fold_idx) + test_n++; + train_n = full_data->n - test_n; + + // set n, m, K variables + train_data->n = train_n; + train_data->m = full_data->m; + train_data->K = full_data->K; + test_data->n = test_n; + test_data->m = full_data->m; + test_data->K = full_data->K; + + // allocate outcome + train_data->y = Calloc(long, train_n); + test_data->y = Calloc(long, test_n); + + // compute train nnz and test nnz + train_nnz = 0; + test_nnz = 0; + for (i=0; i<full_data->n; i++) { + row_nnz = full_data->spZ->ia[i+1] - full_data->spZ->ia[i]; + if (cv_idx[i] == fold_idx) { + test_nnz += row_nnz; + } else { + train_nnz += row_nnz; + } + } + + // allocate the train GenSparse + train_data->spZ = gensvm_init_sparse(); + test_data->spZ = gensvm_init_sparse(); + + // set GenSparse variables for train + train_data->spZ->nnz = train_nnz; + train_data->spZ->n_row = train_n; + train_data->spZ->n_col = full_data->m+1; + train_data->spZ->values = Calloc(double, train_nnz); + train_data->spZ->ia = Calloc(int, train_n+1); + train_data->spZ->ja = Calloc(int, train_nnz); + + // set GenSparse variables for test + test_data->spZ->nnz = test_nnz; + test_data->spZ->n_row = test_n; + test_data->spZ->n_col = full_data->m+1; + test_data->spZ->values = Calloc(double, test_nnz); + test_data->spZ->ia = Calloc(int, test_n+1); + test_data->spZ->ja = Calloc(int, test_nnz); + + tr_nnz_idx = 0; + tr_row_idx = 0; + te_nnz_idx = 0; + te_row_idx = 0; + + test_data->spZ->ia[0] = 0; + train_data->spZ->ia[0] = 0; + for (i=0; i<full_data->n; i++) { + jj_start = full_data->spZ->ia[i]; + jj_end = full_data->spZ->ia[i+1]; + + for (jj=jj_start; jj<jj_end; jj++) { + j = full_data->spZ->ja[jj]; + value = full_data->spZ->values[jj]; + + if (cv_idx[i] == fold_idx) { + test_data->spZ->values[te_nnz_idx] = value; + test_data->spZ->ja[te_nnz_idx] = j; + te_nnz_idx++; + } else { + train_data->spZ->values[tr_nnz_idx] = value; + train_data->spZ->ja[tr_nnz_idx] = j; + tr_nnz_idx++; + } + } + + if (cv_idx[i] == fold_idx) { + test_data->y[te_row_idx] = full_data->y[i]; + test_data->spZ->ia[te_row_idx+1] = te_nnz_idx; + te_row_idx++; + } else { + train_data->y[tr_row_idx] = full_data->y[i]; + train_data->spZ->ia[tr_row_idx+1] = tr_nnz_idx; + tr_row_idx++; + } + } +} diff --git a/tests/src/test_gensvm_cv_util.c b/tests/src/test_gensvm_cv_util.c index 5cbf174..2d6ecb7 100644 --- a/tests/src/test_gensvm_cv_util.c +++ b/tests/src/test_gensvm_cv_util.c @@ -90,8 +90,7 @@ char *test_make_cv_split_2() return NULL; } - -char *test_get_tt_split() +char *test_get_tt_split_dense() { struct GenData *full = gensvm_init_data(); full->K = 3; @@ -132,6 +131,7 @@ char *test_get_tt_split() matrix_set(full->RAW, full->m+1, 8, 2, 9.0); matrix_set(full->RAW, full->m+1, 9, 1, 10.0); matrix_set(full->RAW, full->m+1, 9, 2, 10.0); + full->Z = full->RAW; long *cv_idx = Calloc(long, full->n); cv_idx[0] = 1; @@ -243,13 +243,178 @@ char *test_get_tt_split() return NULL; } +char *test_get_tt_split_sparse() +{ + struct GenData *full = gensvm_init_data(); + full->K = 3; + full->n = 10; + full->m = 2; + full->r = 2; + + full->y = Calloc(long, full->n); + full->y[0] = 1; + full->y[1] = 2; + full->y[2] = 3; + full->y[3] = 1; + full->y[4] = 2; + full->y[5] = 3; + full->y[6] = 1; + full->y[7] = 2; + full->y[8] = 3; + full->y[9] = 1; + + full->RAW = Calloc(double, full->n * (full->m+1)); + matrix_set(full->RAW, full->m+1, 0, 1, 1.0); + matrix_set(full->RAW, full->m+1, 0, 2, 1.0); + matrix_set(full->RAW, full->m+1, 1, 1, 2.0); + matrix_set(full->RAW, full->m+1, 1, 2, 2.0); + matrix_set(full->RAW, full->m+1, 2, 1, 3.0); + matrix_set(full->RAW, full->m+1, 2, 2, 3.0); + matrix_set(full->RAW, full->m+1, 3, 1, 4.0); + matrix_set(full->RAW, full->m+1, 3, 2, 4.0); + matrix_set(full->RAW, full->m+1, 4, 1, 5.0); + matrix_set(full->RAW, full->m+1, 4, 2, 5.0); + matrix_set(full->RAW, full->m+1, 5, 1, 6.0); + matrix_set(full->RAW, full->m+1, 5, 2, 6.0); + matrix_set(full->RAW, full->m+1, 6, 1, 7.0); + matrix_set(full->RAW, full->m+1, 6, 2, 7.0); + matrix_set(full->RAW, full->m+1, 7, 1, 8.0); + matrix_set(full->RAW, full->m+1, 7, 2, 8.0); + matrix_set(full->RAW, full->m+1, 8, 1, 9.0); + matrix_set(full->RAW, full->m+1, 8, 2, 9.0); + matrix_set(full->RAW, full->m+1, 9, 1, 10.0); + matrix_set(full->RAW, full->m+1, 9, 2, 10.0); + full->Z = full->RAW; + + // convert Z to a sparse matrix to test the sparse functions + full->spZ = gensvm_dense_to_sparse(full->RAW, full->n, full->m+1); + free(full->RAW); + full->RAW = NULL; + full->Z = NULL; + + long *cv_idx = Calloc(long, full->n); + cv_idx[0] = 1; + cv_idx[1] = 0; + cv_idx[2] = 1; + cv_idx[3] = 0; + cv_idx[4] = 1; + cv_idx[5] = 2; + cv_idx[6] = 3; + cv_idx[7] = 2; + cv_idx[8] = 3; + cv_idx[9] = 2; + + struct GenData *train = gensvm_init_data(); + struct GenData *test = gensvm_init_data(); + + // start test code // + gensvm_get_tt_split(full, train, test, cv_idx, 0); + + mu_assert(train->n == 8, "train_n incorrect."); + mu_assert(test->n == 2, "test_n incorrect."); + + mu_assert(train->m == 2, "train_m incorrect."); + mu_assert(test->m == 2, "test_m incorrect."); + + mu_assert(train->K == 3, "train_K incorrect."); + mu_assert(test->K == 3, "test_K incorrect."); + + mu_assert(train->y[0] == 1, "train y incorrect."); + mu_assert(train->y[1] == 3, "train y incorrect."); + mu_assert(train->y[2] == 2, "train y incorrect."); + mu_assert(train->y[3] == 3, "train y incorrect."); + mu_assert(train->y[4] == 1, "train y incorrect."); + mu_assert(train->y[5] == 2, "train y incorrect."); + mu_assert(train->y[6] == 3, "train y incorrect."); + mu_assert(train->y[7] == 1, "train y incorrect."); + + mu_assert(test->y[0] == 2, "test y incorrect."); + mu_assert(test->y[1] == 1, "test y incorrect."); + + // check the train GenSparse struct + mu_assert(train->spZ->nnz == 16, "train nnz incorrect"); + mu_assert(train->spZ->n_row == 8, "train n_row incorrect"); + mu_assert(train->spZ->n_col == 3, "train n_col incorrect"); + + mu_assert(train->spZ->values[0] == 1.0, "Wrong train value at 0"); + mu_assert(train->spZ->values[1] == 1.0, "Wrong train value at 1"); + mu_assert(train->spZ->values[2] == 3.0, "Wrong train value at 2"); + mu_assert(train->spZ->values[3] == 3.0, "Wrong train value at 3"); + mu_assert(train->spZ->values[4] == 5.0, "Wrong train value at 4"); + mu_assert(train->spZ->values[5] == 5.0, "Wrong train value at 5"); + mu_assert(train->spZ->values[6] == 6.0, "Wrong train value at 6"); + mu_assert(train->spZ->values[7] == 6.0, "Wrong train value at 7"); + mu_assert(train->spZ->values[8] == 7.0, "Wrong train value at 8"); + mu_assert(train->spZ->values[9] == 7.0, "Wrong train value at 9"); + mu_assert(train->spZ->values[10] == 8.0, "Wrong train value at 10"); + mu_assert(train->spZ->values[11] == 8.0, "Wrong train value at 11"); + mu_assert(train->spZ->values[12] == 9.0, "Wrong train value at 12"); + mu_assert(train->spZ->values[13] == 9.0, "Wrong train value at 13"); + mu_assert(train->spZ->values[14] == 10.0, "Wrong train value at 14"); + mu_assert(train->spZ->values[15] == 10.0, "Wrong train value at 15"); + + mu_assert(train->spZ->ia[0] == 0, "Wrong train ia at 0"); + mu_assert(train->spZ->ia[1] == 2, "Wrong train ia at 1"); + mu_assert(train->spZ->ia[2] == 4, "Wrong train ia at 2"); + mu_assert(train->spZ->ia[3] == 6, "Wrong train ia at 3"); + mu_assert(train->spZ->ia[4] == 8, "Wrong train ia at 4"); + mu_assert(train->spZ->ia[5] == 10, "Wrong train ia at 5"); + mu_assert(train->spZ->ia[6] == 12, "Wrong train ia at 6"); + mu_assert(train->spZ->ia[7] == 14, "Wrong train ia at 7"); + mu_assert(train->spZ->ia[8] == 16, "Wrong train ia at 8"); + + mu_assert(train->spZ->ja[0] == 1, "Wrong train ja at 0"); + mu_assert(train->spZ->ja[1] == 2, "Wrong train ja at 1"); + mu_assert(train->spZ->ja[2] == 1, "Wrong train ja at 2"); + mu_assert(train->spZ->ja[3] == 2, "Wrong train ja at 3"); + mu_assert(train->spZ->ja[4] == 1, "Wrong train ja at 4"); + mu_assert(train->spZ->ja[5] == 2, "Wrong train ja at 5"); + mu_assert(train->spZ->ja[6] == 1, "Wrong train ja at 6"); + mu_assert(train->spZ->ja[7] == 2, "Wrong train ja at 7"); + mu_assert(train->spZ->ja[8] == 1, "Wrong train ja at 8"); + mu_assert(train->spZ->ja[9] == 2, "Wrong train ja at 9"); + mu_assert(train->spZ->ja[10] == 1, "Wrong train ja at 10"); + mu_assert(train->spZ->ja[11] == 2, "Wrong train ja at 11"); + mu_assert(train->spZ->ja[12] == 1, "Wrong train ja at 12"); + mu_assert(train->spZ->ja[13] == 2, "Wrong train ja at 13"); + mu_assert(train->spZ->ja[14] == 1, "Wrong train ja at 14"); + mu_assert(train->spZ->ja[15] == 2, "Wrong train ja at 15"); + + // check the test GenSparse struct + mu_assert(test->spZ->nnz == 4, "test nnz incorrect"); + mu_assert(test->spZ->n_row == 2, "test n_row incorrect"); + mu_assert(test->spZ->n_col == 3, "test n_col incorrect"); + + mu_assert(test->spZ->values[0] == 2.0, "Wrong test value at 0"); + mu_assert(test->spZ->values[1] == 2.0, "Wrong test value at 1"); + mu_assert(test->spZ->values[2] == 4.0, "Wrong test value at 2"); + mu_assert(test->spZ->values[3] == 4.0, "Wrong test value at 3"); + + mu_assert(test->spZ->ia[0] == 0, "Wrong test ia at 0"); + mu_assert(test->spZ->ia[1] == 2, "Wrong test ia at 1"); + mu_assert(test->spZ->ia[2] == 4, "Wrong test ia at 2"); + + mu_assert(test->spZ->ja[0] == 1, "Wrong test ja at 0"); + mu_assert(test->spZ->ja[1] == 2, "Wrong test ja at 1"); + mu_assert(test->spZ->ja[2] == 1, "Wrong test ja at 2"); + mu_assert(test->spZ->ja[3] == 2, "Wrong test ja at 3"); + + // end test code // + gensvm_free_data(full); + gensvm_free_data(train); + gensvm_free_data(test); + free(cv_idx); + + return NULL; +} char *all_tests() { mu_suite_start(); mu_run_test(test_make_cv_split_1); mu_run_test(test_make_cv_split_2); - mu_run_test(test_get_tt_split); + mu_run_test(test_get_tt_split_dense); + mu_run_test(test_get_tt_split_sparse); return NULL; } |
