From 044dc5a93c33d7aa4c9c98a626890c16446a56fc Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 16 May 2016 18:47:09 +0200 Subject: major refactor of the code --- .gitignore | 6 + Makefile | 65 ++-- include/gensvm_base.h | 116 ++++++ include/gensvm_cmdarg.h | 21 ++ include/gensvm_copy.h | 18 + include/gensvm_crossval.h | 23 -- include/gensvm_cv_util.h | 22 ++ include/gensvm_debug.h | 20 ++ include/gensvm_grid.h | 78 ++++ include/gensvm_gridsearch.h | 37 ++ include/gensvm_init.h | 21 +- include/gensvm_io.h | 7 +- include/gensvm_kernel.h | 12 +- include/gensvm_lapack.h | 9 - include/gensvm_matrix.h | 23 +- include/gensvm_memory.h | 2 + include/gensvm_optimize.h | 36 ++ include/gensvm_pred.h | 6 +- include/gensvm_print.h | 22 ++ include/gensvm_queue.h | 40 +++ include/gensvm_simplex.h | 18 + include/gensvm_strutil.h | 6 +- include/gensvm_sv.h | 4 + include/gensvm_task.h | 53 +++ include/gensvm_timer.h | 6 +- include/gensvm_train.h | 29 -- include/gensvm_train_dataset.h | 143 -------- include/gensvm_util.h | 26 -- include/globals.h | 27 +- include/libGenSVM.h | 4 - src/GenSVMgrid.c | 215 ++++------- src/GenSVMpred.c | 180 ---------- src/GenSVMtrain.c | 245 ------------- src/GenSVMtraintest.c | 15 +- src/gensvm_base.c | 211 +++++++++++ src/gensvm_cmdarg.c | 71 ++++ src/gensvm_copy.c | 49 +++ src/gensvm_crossval.c | 144 -------- src/gensvm_cv_util.c | 141 ++++++++ src/gensvm_debug.c | 34 ++ src/gensvm_grid.c | 62 ++++ src/gensvm_gridsearch.c | 652 +++++++++++++++++++++++++++++++++ src/gensvm_init.c | 251 ++++--------- src/gensvm_io.c | 58 ++- src/gensvm_kernel.c | 51 ++- src/gensvm_lapack.c | 122 ------- src/gensvm_matrix.c | 38 -- src/gensvm_memory.c | 7 +- src/gensvm_optimize.c | 794 +++++++++++++++++++++++++++++++++++++++++ src/gensvm_pred.c | 10 +- src/gensvm_print.c | 88 +++++ src/gensvm_queue.c | 71 ++++ src/gensvm_simplex.c | 45 +++ src/gensvm_strutil.c | 3 +- src/gensvm_sv.c | 3 +- src/gensvm_task.c | 55 +++ src/gensvm_timer.c | 55 +-- src/gensvm_train.c | 538 ---------------------------- src/gensvm_train_dataset.c | 766 --------------------------------------- src/gensvm_util.c | 149 -------- src/libGenSVM.c | 304 ---------------- 61 files changed, 3105 insertions(+), 3222 deletions(-) create mode 100644 include/gensvm_base.h create mode 100644 include/gensvm_cmdarg.h create mode 100644 include/gensvm_copy.h delete mode 100644 include/gensvm_crossval.h create mode 100644 include/gensvm_cv_util.h create mode 100644 include/gensvm_debug.h create mode 100644 include/gensvm_grid.h create mode 100644 include/gensvm_gridsearch.h create mode 100644 include/gensvm_optimize.h create mode 100644 include/gensvm_print.h create mode 100644 include/gensvm_queue.h create mode 100644 include/gensvm_simplex.h create mode 100644 include/gensvm_task.h delete mode 100644 include/gensvm_train.h delete mode 100644 include/gensvm_train_dataset.h delete mode 100644 include/gensvm_util.h delete mode 100644 src/GenSVMpred.c delete mode 100644 src/GenSVMtrain.c create mode 100644 src/gensvm_base.c create mode 100644 src/gensvm_cmdarg.c create mode 100644 src/gensvm_copy.c delete mode 100644 src/gensvm_crossval.c create mode 100644 src/gensvm_cv_util.c create mode 100644 src/gensvm_debug.c create mode 100644 src/gensvm_grid.c create mode 100644 src/gensvm_gridsearch.c delete mode 100644 src/gensvm_matrix.c create mode 100644 src/gensvm_optimize.c create mode 100644 src/gensvm_print.c create mode 100644 src/gensvm_queue.c create mode 100644 src/gensvm_simplex.c create mode 100644 src/gensvm_task.c delete mode 100644 src/gensvm_train.c delete mode 100644 src/gensvm_train_dataset.c delete mode 100644 src/gensvm_util.c diff --git a/.gitignore b/.gitignore index 98ab832..aed01b8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,9 @@ *.swp GenSVM_train GenSVM_grid +doc/* +training/* +codegrep* +log* +gensvm +gensvm_grid diff --git a/Makefile b/Makefile index 91cb916..d281bb4 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ DOXY=doxygen DOCDIR=doc DOXYFILE=$(DOCDIR)/Doxyfile -EXECS=GenSVM_train GenSVM_grid gensvm +EXECS=gensvm gensvm_grid .PHONY: all clean doc test @@ -29,58 +29,55 @@ test: lib/libgensvm.a $(MAKE) -C tests all lib/libgensvm.a: \ - src/libGenSVM.o \ - src/gensvm_crossval.o \ + src/gensvm_base.o \ + src/gensvm_cmdarg.o \ + src/gensvm_copy.o \ + src/gensvm_cv_util.o \ + src/gensvm_grid.o \ + src/gensvm_gridsearch.o \ src/gensvm_init.o \ src/gensvm_io.o \ src/gensvm_kernel.o \ - src/gensvm_lapack.o \ - src/gensvm_matrix.o \ src/gensvm_memory.o \ + src/gensvm_optimize.o \ src/gensvm_pred.o \ + src/gensvm_print.o \ + src/gensvm_queue.o \ + src/gensvm_simplex.o \ src/gensvm_strutil.o \ src/gensvm_sv.o \ - src/gensvm_train.o \ - src/gensvm_train_dataset.o \ - src/gensvm_timer.o \ - src/gensvm_util.o + src/gensvm_task.o \ + src/gensvm_timer.o @ar rcs lib/libgensvm.a \ - src/libGenSVM.o \ - src/gensvm_crossval.o \ + src/gensvm_base.o \ + src/gensvm_cmdarg.o \ + src/gensvm_copy.o \ + src/gensvm_cv_util.o \ + src/gensvm_grid.o \ + src/gensvm_gridsearch.o \ src/gensvm_init.o \ src/gensvm_io.o \ - src/gensvm_matrix.o \ - src/gensvm_memory.o \ src/gensvm_kernel.o \ - src/gensvm_lapack.o \ + src/gensvm_memory.o \ + src/gensvm_optimize.o \ src/gensvm_pred.o \ + src/gensvm_print.o \ + src/gensvm_queue.o \ + src/gensvm_simplex.o \ src/gensvm_strutil.o \ src/gensvm_sv.o \ - src/gensvm_train.o \ - src/gensvm_train_dataset.o \ - src/gensvm_timer.o \ - src/gensvm_util.o + src/gensvm_task.o \ + src/gensvm_timer.o @echo libgensvm.a... gensvm: src/GenSVMtraintest.c lib/libgensvm.a @$(CC) -o $@ $< $(CFLAGS) $(INCLUDE) $(LIB) -lgensvm $(LDFLAGS) - @echo gensvm... - -GenSVM_train: src/GenSVMtrain.c lib/libgensvm.a - @$(CC) -o GenSVM_train src/GenSVMtrain.c $(CFLAGS) $(INCLUDE) $(LIB)\ - -lgensvm $(LDFLAGS) - @echo GenSVM_train... + @echo gensvm ... -GenSVM_grid: src/GenSVMgrid.c lib/libgensvm.a - @$(CC) -o GenSVM_grid src/GenSVMgrid.c $(CFLAGS) $(INCLUDE) $(LIB) \ - -lgensvm $(LDFLAGS) - @echo GenSVM_grid... - -GenSVM_pred: src/GenSVMpred.c lib/libgensvm.a - @$(CC) -o GenSVM_pred src/GenSVMpred.c $(CFLAGS) $(INCLUDE) $(LIB) \ - -lgensvm $(LDFLAGS) - @echo GenSVM_pred... +gensvm_grid: src/GenSVMgrid.c lib/libgensvm.a + @$(CC) -o $@ $< $(CFLAGS) $(INCLUDE) $(LIB) -lgensvm $(LDFLAGS) + @echo gensvm_grid ... src/%.o: src/%.c @$(CC) $(CFLAGS) $(INCLUDE) $(LDFLAGS) -c $< -o $@ - @echo $<... + @echo $< ... diff --git a/include/gensvm_base.h b/include/gensvm_base.h new file mode 100644 index 0000000..b1f4a6b --- /dev/null +++ b/include/gensvm_base.h @@ -0,0 +1,116 @@ +/** + * @file gensvm_base.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Definitions for GenData and GenModel structures + * + * @details + * Contains documentation and declarations of GenModel and GenData. + * + */ + +#ifndef GENSVM_BASE_H +#define GENSVM_BASE_H + +// includes +#include "globals.h" + +// type declarations + +/** + * @brief A structure to represent the data. + * + * @param K number of classes + * @param n number of instances + * @param m number of predictors + * @param *y pointer to vector of class labels + * @param *Z pointer to augmented data matrix + * @param *RAW pointer to augmented raw data matrix + * @param *J pointer to regularization vector + * @param kerneltype kerneltype used in GenData::Z + * @param *kernelparam kernel parameters used in GenData::Z + * + */ +struct GenData { + long K; + ///< number of classes + long n; + ///< number of instances + long m; + ///< number of predictors (width of RAW) + long r; + ///< number of eigenvalues (width of Z) + long *y; + ///< array of class labels, 1..K + double *Z; + ///< augmented data matrix (either equal to RAW or to the eigenvectors + ///< of the kernel matrix) + double *RAW; + ///< augmented raw data matrix + double *Sigma; + KernelType kerneltype; + double *kernelparam; +}; + +/** + * @brief A structure to represent a single GenSVM model. + * + */ +struct GenModel { + int weight_idx; + ///< which weights to use (1 = unit, 2 = group) + long K; + ///< number of classes in the dataset + long n; + ///< number of instances in the dataset + long m; + ///< number of predictor variables in the dataset + double epsilon; + ///< stopping criterion for the IM algorithm. + double p; + ///< parameter for the L-p norm in the loss function + double kappa; + ///< parameter for the Huber hinge function + double lambda; + ///< regularization parameter in the loss function + double *W; + ///< weight matrix + double *t; + ///< translation vector + double *V; + ///< augmented weight matrix + double *Vbar; + ///< augmented weight matrix from the previous iteration of the IM + ///< algorithm + double *U; + ///< simplex matrix + double *UU; + ///< 3D simplex difference matrix + double *Q; + ///< error matrix + double *H; + ///< Huber weighted error matrix + double *R; + ///< 0-1 auixiliary matrix, this matrix is n x K, with for row i a 0 on + ///< column y[i]-1, and 1 everywhere else. + double *rho; + ///< vector of instance weights + double training_error; + ///< loss function value after training has finished + char *data_file; + ///< filename of the data + KernelType kerneltype; + ///< type of kernel used in the model + double *kernelparam; + ///< array of kernel parameters, size depends on kernel type +}; + +// function declarations +struct GenModel *gensvm_init_model(); +void gensvm_allocate_model(struct GenModel *model); +void gensvm_reallocate_model(struct GenModel *model, long n, long m); +void gensvm_free_model(struct GenModel *model); +struct GenData *gensvm_init_data(); +void gensvm_free_data(struct GenData *data); + +#endif diff --git a/include/gensvm_cmdarg.h b/include/gensvm_cmdarg.h new file mode 100644 index 0000000..ac33be8 --- /dev/null +++ b/include/gensvm_cmdarg.h @@ -0,0 +1,21 @@ +/** + * @file gensvm_cmdarg.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_cmdarg.c + * + * @details + * Function declarations for dealing with command line arguments. + * + */ + +#ifndef GENSVM_CMDARG_H +#define GENSVM_CMDARG_H + +#include "globals.h" + +// function declarations +int gensvm_check_argv(int argc, char **argv, char *str); +int gensvm_check_argv_eq(int argc, char **argv, char *str); + +#endif diff --git a/include/gensvm_copy.h b/include/gensvm_copy.h new file mode 100644 index 0000000..bdf6eec --- /dev/null +++ b/include/gensvm_copy.h @@ -0,0 +1,18 @@ +/** + * @file gensvm_copy.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_copy.c + * + */ + +#ifndef GENSVM_COPY_H +#define GENSVM_COPY_H + +// includes +#include "gensvm_base.h" + +// function declarations +void gensvm_copy_model(struct GenModel *from, struct GenModel *to); + +#endif diff --git a/include/gensvm_crossval.h b/include/gensvm_crossval.h deleted file mode 100644 index 3ac5fa9..0000000 --- a/include/gensvm_crossval.h +++ /dev/null @@ -1,23 +0,0 @@ -/** - * @file crossval.h - * @author Gertjan van den Burg - * @date January, 2014 - * @brief Header file for crossval.c - * - * @details - * Contains function declarations for functions needed for performing cross - * validation on GenData structures. - * - */ - -#ifndef GENSVM_CROSSVAL_H -#define GENSVM_CROSSVAL_H - -// forward delaration -struct GenData; - -void gensvm_make_cv_split(long N, long folds, long *cv_idx); -void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, - struct GenData *test_data, long *cv_idx, long fold_idx); - -#endif diff --git a/include/gensvm_cv_util.h b/include/gensvm_cv_util.h new file mode 100644 index 0000000..ada727d --- /dev/null +++ b/include/gensvm_cv_util.h @@ -0,0 +1,22 @@ +/** + * @file gensvm_cv_util.h + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Header file for gensvm_cv_util.c + * + * @details + * Contains function declarations for functions needed for performing cross + * validation on GenData structures. + * + */ + +#ifndef GENSVM_CV_UTIL_H +#define GENSVM_CV_UTIL_H + +#include "gensvm_base.h" + +void gensvm_make_cv_split(long N, long folds, long *cv_idx); +void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, + struct GenData *test_data, long *cv_idx, long fold_idx); + +#endif diff --git a/include/gensvm_debug.h b/include/gensvm_debug.h new file mode 100644 index 0000000..1cab4ca --- /dev/null +++ b/include/gensvm_debug.h @@ -0,0 +1,20 @@ + +/** + * @file gensvm_debug.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header for useful debug functions + * + * @details + * Contains defines useful for debugging. + * + */ + +#ifndef GENSVM_DEBUG_H +#define GENSVM_DEBUG_H + +#include "gensvm_print.h" + +void print_matrix(double *M, long rows, long cols); + +#endif diff --git a/include/gensvm_grid.h b/include/gensvm_grid.h new file mode 100644 index 0000000..d335d7c --- /dev/null +++ b/include/gensvm_grid.h @@ -0,0 +1,78 @@ +/** + * @file gensvm_grid.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Structs necessary for the grid search + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for this queue and a single + * task in a queue, as well as a structure for the complete training + * scheme. Function declarations are also included. + * + */ + +#ifndef GENSVM_GRID_H +#define GENSVM_GRID_H + +#include "globals.h" + +/** + * @brief Structure for describing the entire grid search + * + * @param traintype type of training to use + * @param kerneltype type of kernel to use throughout training + * @param repeats number of repeats to be done after the grid + * search to find the parameter set with the + * most consistent high performance + * @param folds number of folds in cross validation + * @param Np size of the array of p values + * @param Nl size of the array of lambda values + * @param Nk size of the array of kappa values + * @param Ne size of the array of epsilon values + * @param Nw size of the array of weight_idx values + * @param Ng size of the array of gamma values + * @param Nc size of the array of coef values + * @param Nd size of the array of degree values + * @param *weight_idxs array of weight_idxs + * @param *ps array of p values + * @param *lambdas array of lambda values + * @param *kappas array of kappa values + * @param *epsilons array of epsilon values + * @param *gammas array of gamma values + * @param *coefs array of coef values + * @param *degrees array of degree values + * @param *train_data_file filename of train data file + * @param *test_data_file filename of test data file + * + */ +struct GenGrid { + TrainType traintype; + KernelType kerneltype; + long repeats; + long folds; + long Np; + long Nl; + long Nk; + long Ne; + long Nw; + long Ng; + long Nc; + long Nd; + int *weight_idxs; + double *ps; + double *lambdas; + double *kappas; + double *epsilons; + double *gammas; + double *coefs; + double *degrees; + char *train_data_file; + char *test_data_file; +}; + +// function declarations +struct GenGrid *gensvm_init_grid(); +void gensvm_free_grid(struct GenGrid *grid); + +#endif diff --git a/include/gensvm_gridsearch.h b/include/gensvm_gridsearch.h new file mode 100644 index 0000000..dcd9b93 --- /dev/null +++ b/include/gensvm_gridsearch.h @@ -0,0 +1,37 @@ +/** + * @file gensvm_gridsearch.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Header file for gensvm_gridsearch.c + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for this queue and a single + * task in a queue, as well as a structure for the complete training + * scheme. Function declarations are also included. + * + */ + +#ifndef GENSVM_GRIDSEARCH_H +#define GENSVM_GRIDSEARCH_H + +// includes +#include "gensvm_cv_util.h" +#include "gensvm_init.h" +#include "gensvm_grid.h" +#include "gensvm_optimize.h" +#include "gensvm_pred.h" +#include "gensvm_queue.h" +#include "gensvm_timer.h" + +// function declarations +void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue, + struct GenData *train_data, struct GenData *test_data); +void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype); +void make_model_from_task(struct GenTask *task, struct GenModel *model); +void print_progress_string(struct GenTask *task, long N); +void start_training(struct GenQueue *q); +double gensvm_cross_validation(struct GenModel *model, + struct GenData **train_folds, struct GenData **test_folds, + int folds, long n_total); +#endif diff --git a/include/gensvm_init.h b/include/gensvm_init.h index 980366b..3f4a1cb 100644 --- a/include/gensvm_init.h +++ b/include/gensvm_init.h @@ -1,28 +1,21 @@ /** * @file gensvm_init.h * @author Gertjan van den Burg - * @date January, 2014 + * @date May, 2016 * @brief Header file for gensvm_init.c * * @details - * Contains function declarations for the initialization functions for - * GenModel and GenData structures. + * Contains function declarations for the initialization functions for the + * model weights and model V matrix. */ #ifndef GENSVM_INIT_H #define GENSVM_INIT_H -// include -#include "globals.h" -#include "gensvm.h" +#include "gensvm_base.h" -struct GenModel *gensvm_init_model(); - -struct GenData *gensvm_init_data(); - -void gensvm_allocate_model(struct GenModel *model); -void gensvm_reallocate_model(struct GenModel *model, long n, long m); -void gensvm_free_model(struct GenModel *model); -void gensvm_free_data(struct GenData *data); +void gensvm_init_V(struct GenModel *from_model, struct GenModel *to_model, + struct GenData *data); +void gensvm_initialize_weights(struct GenData *data, struct GenModel *model); #endif diff --git a/include/gensvm_io.h b/include/gensvm_io.h index 4581c5f..9b0d973 100644 --- a/include/gensvm_io.h +++ b/include/gensvm_io.h @@ -12,9 +12,9 @@ #ifndef GENSVM_IO_H #define GENSVM_IO_H -// forward declarations -struct GenData; -struct GenModel; +// includes +#include "gensvm_base.h" +#include "gensvm_strutil.h" // function declarations void gensvm_read_data(struct GenData *dataset, char *data_file); @@ -24,5 +24,6 @@ void gensvm_write_model(struct GenModel *model, char *output_filename); void gensvm_write_predictions(struct GenData *data, long *predy, char *output_filename); +void gensvm_time_string(char *buffer); #endif diff --git a/include/gensvm_kernel.h b/include/gensvm_kernel.h index 45b7e62..a1fac20 100644 --- a/include/gensvm_kernel.h +++ b/include/gensvm_kernel.h @@ -14,12 +14,10 @@ #ifndef GENSVM_KERNEL_H #define GENSVM_KERNEL_H -// forward declarations -struct GenData; -struct GenModel; +// includes +#include "gensvm_base.h" // function declarations - void gensvm_kernel_preprocess(struct GenModel *model, struct GenData *data); void gensvm_kernel_postprocess(struct GenModel *model, struct GenData *traindata, struct GenData *testdata); @@ -36,5 +34,9 @@ void gensvm_make_testfactor(struct GenData *testdata, double gensvm_dot_rbf(double *x1, double *x2, double *kernelparam, long n); double gensvm_dot_poly(double *x1, double *x2, double *kernelparam, long n); double gensvm_dot_sigmoid(double *x1, double *x2, double *kernelparam, long n); - +int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, + double VL, double VU, int IL, int IU, double ABSTOL, + int *M, double *W, double *Z, int LDZ, double *WORK, int LWORK, + int *IWORK, int *IFAIL); +double dlamch(char CMACH); #endif diff --git a/include/gensvm_lapack.h b/include/gensvm_lapack.h index c4e58e8..843169b 100644 --- a/include/gensvm_lapack.h +++ b/include/gensvm_lapack.h @@ -12,13 +12,4 @@ #ifndef GENSVM_LAPACK_H #define GENSVM_LAPACK_H -int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, - int LDB); -int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, - double *B, int LDB, double *WORK, int LWORK); -int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, - double VL, double VU, int IL, int IU, double ABSTOL, - int *M, double *W, double *Z, int LDZ, double *WORK, int LWORK, - int *IWORK, int *IFAIL); -double dlamch(char CMACH); #endif diff --git a/include/gensvm_matrix.h b/include/gensvm_matrix.h index 5c88f0b..9982b78 100644 --- a/include/gensvm_matrix.h +++ b/include/gensvm_matrix.h @@ -2,34 +2,15 @@ * @file gensvm_matrix.h * @author Gertjan van den Burg * @date August, 2013 - * @brief Header file for gensvm_matrix.c + * @brief Header with defines for matrix access * * @details - * Contains function declarations for functions useful for dealing with matrices. + * Contains defines useful for dealing with matrices. * */ #ifndef GENSVM_MATRIX_H #define GENSVM_MATRIX_H -// Set a matrix element (RowMajor) -#define matrix_set(M, cols, i, j, val) M[(i)*(cols)+j] = val - -// Get a matrix element (RowMajor) -#define matrix_get(M, cols, i, j) M[(i)*(cols)+j] - -// Add to a matrix element (RowMajor) -#define matrix_add(M, cols, i, j, val) M[(i)*(cols)+j] += val - -// Multiply a matrix element (RowMajor) -#define matrix_mul(M, cols, i, j, val) M[(i)*(cols)+j] *= val - -// Set a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) -#define matrix3_set(M, N2, N3, i, j, k, val) M[k+(N3)*(j+(N2)*(i))] = val - -// Get a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) -#define matrix3_get(M, N2, N3, i, j, k) M[k+(N3)*(j+(N2)*(i))] - -void print_matrix(double *M, long rows, long cols); #endif diff --git a/include/gensvm_memory.h b/include/gensvm_memory.h index bc4aae9..08d6f2d 100644 --- a/include/gensvm_memory.h +++ b/include/gensvm_memory.h @@ -9,6 +9,8 @@ #ifndef GENSVM_MEMORY_H #define GENSVM_MEMORY_H +#include + #define Calloc(type, size) \ mycalloc(__FILE__, __LINE__, size, sizeof(type)) #define Malloc(type, size) \ diff --git a/include/gensvm_optimize.h b/include/gensvm_optimize.h new file mode 100644 index 0000000..7a23bdb --- /dev/null +++ b/include/gensvm_optimize.h @@ -0,0 +1,36 @@ +/** + * @file gensvm_train.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Header file for gensvm_train.c + * + * @details + * Contains function declarations for functions used to train a single + * GenModel. + * + */ + +#ifndef GENSVM_TRAIN_H +#define GENSVM_TRAIN_H + +#include "gensvm_sv.h" +#include "gensvm_print.h" +#include "gensvm_simplex.h" + +// function declarations +void gensvm_optimize(struct GenModel *model, struct GenData *data); +double gensvm_get_loss(struct GenModel *model, struct GenData *data, + double *ZV); +void gensvm_get_update(struct GenModel *model, struct GenData *data, + double *B, double *ZAZ, double *ZAZV, double *ZAZVT); +void gensvm_category_matrix(struct GenModel *model, struct GenData *data); +void gensvm_simplex_diff(struct GenModel *model, struct GenData *dataset); +void gensvm_calculate_errors(struct GenModel *model, struct GenData *data, + double *ZV); +void gensvm_calculate_huber(struct GenModel *model); +void gensvm_step_doubling(struct GenModel *model); +int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, int LDB); +int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, double *B, + int LDB, double *WORK, int LWORK); + +#endif diff --git a/include/gensvm_pred.h b/include/gensvm_pred.h index 97af01f..56e16e8 100644 --- a/include/gensvm_pred.h +++ b/include/gensvm_pred.h @@ -12,9 +12,9 @@ #ifndef GENSVM_PRED_H #define GENSVM_PRED_H -// forward declarations -struct GenData; -struct GenModel; +// includes +#include "gensvm_kernel.h" +#include "gensvm_simplex.h" // function declarations void gensvm_predict_labels(struct GenData *testdata, diff --git a/include/gensvm_print.h b/include/gensvm_print.h new file mode 100644 index 0000000..fff7af5 --- /dev/null +++ b/include/gensvm_print.h @@ -0,0 +1,22 @@ +/** + * @file gensvm_print.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_print.c + * + * @details + * Function declarations for printing to stdout and stderr. + * + */ + +#ifndef GENSVM_PRINT_H +#define GENSVM_PRINT_H + +// includes +#include "globals.h" + +// function declarations +void note(const char *fmt,...); +void err(const char *fmt,...); + +#endif diff --git a/include/gensvm_queue.h b/include/gensvm_queue.h new file mode 100644 index 0000000..e8d26d6 --- /dev/null +++ b/include/gensvm_queue.h @@ -0,0 +1,40 @@ +/** + * @file gensvm_queue.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Header file for gensvm_queue.c + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for this queue. Function declarations + * for initializing and freeing the queue are also included. + * + */ + +#ifndef GENSVM_QUEUE_H +#define GENSVM_QUEUE_H + +#include "gensvm_task.h" + +/** + * @brief Simple task queue. + * + * This struct is basically just an array of pointers to Task instances, + * with a length and an index of the current task. + * + * @param **tasks array of pointers to Task structs + * @param N size of task array + * @param i index used for keeping track of the queue + */ +struct GenQueue { + struct GenTask **tasks; + long N; + long i; +}; + +// function declarations +struct GenQueue *gensvm_init_queue(); +void gensvm_free_queue(struct GenQueue *q); +struct GenTask *get_next_task(struct GenQueue *q); + +#endif diff --git a/include/gensvm_simplex.h b/include/gensvm_simplex.h new file mode 100644 index 0000000..9bb40b1 --- /dev/null +++ b/include/gensvm_simplex.h @@ -0,0 +1,18 @@ +/** + * @file gensvm_simplex.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_simplex.c + * + */ + +#ifndef GENSVM_SIMPLEX_H +#define GENSVM_SIMPLEX_H + +// includes +#include "globals.h" + +// forward declarations +void gensvm_simplex(long K, double *U); + +#endif diff --git a/include/gensvm_strutil.h b/include/gensvm_strutil.h index c51422f..efaa5ec 100644 --- a/include/gensvm_strutil.h +++ b/include/gensvm_strutil.h @@ -1,8 +1,8 @@ /** - * @file strutil.h + * @file gensvm_strutil.h * @author Gertjan van den Burg * @date August, 2013 - * @brief Header file for strutil.c + * @brief Header file for gensvm_strutil.c * * @details * Function declarations for useful string functions used in parsing @@ -13,7 +13,7 @@ #ifndef GENSVM_STRUTIL_H #define GENSVM_STRUTIL_H -#include "types.h" +#include "globals.h" bool str_startswith(const char *str, const char *pre); bool str_endswith(const char *str, const char *suf); diff --git a/include/gensvm_sv.h b/include/gensvm_sv.h index 2c7cf57..8347b95 100644 --- a/include/gensvm_sv.h +++ b/include/gensvm_sv.h @@ -12,6 +12,10 @@ #ifndef GENSVM_SV_H #define GENSVM_SV_H +// includes +#include "gensvm_base.h" + +// function declarations long gensvm_num_sv(struct GenModel *model, struct GenData *data); #endif diff --git a/include/gensvm_task.h b/include/gensvm_task.h new file mode 100644 index 0000000..98c8f26 --- /dev/null +++ b/include/gensvm_task.h @@ -0,0 +1,53 @@ +/** + * @file gensvm_task.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Struct for a single task in the queue + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for the tasks in the queue. + * Initialization and free functions are also included. + * + */ + +#ifndef GENSVM_TASK_H +#define GENSVM_TASK_H + +#include "gensvm_base.h" + +/** + * @brief A structure for a single task in the queue. + * + * @param folds number of folds in cross validation + * @param ID numeric id of the task in the queue + * @param weight_idx parameter for the GenModel + * @param p parameter for the GenModel + * @param kappa parameter for the GenModel + * @param lambda parameter for the GenModel + * @param epsilon parameter for the GenModel + * @param kerneltype parameter for the GenModel + * @param *kernelparam parameters for the GenModel + * @param *train_data pointer to the training data + * @param *test_data pointer to the test data (if any) + * @param performance performance after cross validation + */ +struct GenTask { + KernelType kerneltype; + int weight_idx; + long folds; + long ID; + double p; + double kappa; + double lambda; + double epsilon; + double *kernelparam; + struct GenData *train_data; + struct GenData *test_data; + double performance; +}; + +struct GenTask *gensvm_init_task(); +void gensvm_free_task(struct GenTask *task); + +#endif diff --git a/include/gensvm_timer.h b/include/gensvm_timer.h index 29c45cd..11e61e5 100644 --- a/include/gensvm_timer.h +++ b/include/gensvm_timer.h @@ -12,8 +12,10 @@ #ifndef GENSVM_TIMER_H #define GENSVM_TIMER_H -double elapsed_time(clock_t s_time, clock_t e_time); +// includes +#include "globals.h" -void get_time_string(char *buffer); +// function declarations +double gensvm_elapsed_time(clock_t s_time, clock_t e_time); #endif diff --git a/include/gensvm_train.h b/include/gensvm_train.h deleted file mode 100644 index 466b8e2..0000000 --- a/include/gensvm_train.h +++ /dev/null @@ -1,29 +0,0 @@ -/** - * @file gensvm_train.h - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Header file for gensvm_train.c - * - * @details - * Contains function declarations for functions used to train a single - * GenModel. - * - */ - -#ifndef GENSVM_TRAIN_H -#define GENSVM_TRAIN_H - -//forward declarations -struct GenData; -struct GenModel; - -// function declarations -void gensvm_optimize(struct GenModel *model, struct GenData *data); - -double gensvm_get_loss(struct GenModel *model, struct GenData *data, - double *ZV); - -void gensvm_get_update(struct GenModel *model, struct GenData *data, - double *B, double *ZAZ, double *ZAZV, double *ZAZVT); - -#endif diff --git a/include/gensvm_train_dataset.h b/include/gensvm_train_dataset.h deleted file mode 100644 index 9a3fe86..0000000 --- a/include/gensvm_train_dataset.h +++ /dev/null @@ -1,143 +0,0 @@ -/** - * @file gensvm_train_dataset.h - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Structs and functions necessary for the grid search - * - * @details - * The grid search for the optimal parameters is done through a queue. - * This file contains struct definitions for this queue and a single - * task in a queue, as well as a structure for the complete training - * scheme. Function declarations are also included. - * - */ - -#ifndef GENSVM_TRAIN_DATASET_H -#define GENSVM_TRAIN_DATASET_H - -#include "types.h" - -// forward declarations -struct GenData; -struct GenModel; - -/** - * @brief A structure for a single task in the queue. - * - * @param folds number of folds in cross validation - * @param ID numeric id of the task in the queue - * @param weight_idx parameter for the GenModel - * @param p parameter for the GenModel - * @param kappa parameter for the GenModel - * @param lambda parameter for the GenModel - * @param epsilon parameter for the GenModel - * @param kerneltype parameter for the GenModel - * @param *kernelparam parameters for the GenModel - * @param *train_data pointer to the training data - * @param *test_data pointer to the test data (if any) - * @param performance performance after cross validation - */ -struct Task { - KernelType kerneltype; - int weight_idx; - long folds; - long ID; - double p; - double kappa; - double lambda; - double epsilon; - double *kernelparam; - struct GenData *train_data; - struct GenData *test_data; - double performance; -}; - -/** - * @brief Simple task queue. - * - * This struct is basically just an array of pointers to Task instances, - * with a length and an index of the current task. - * - * @param **tasks array of pointers to Task structs - * @param N size of task array - * @param i index used for keeping track of the queue - */ -struct Queue { - struct Task **tasks; - long N; - long i; -}; - -/** - * @brief Structure for describing the entire grid search - * - * @param traintype type of training to use - * @param kerneltype type of kernel to use throughout training - * @param repeats number of repeats to be done after the grid - * search to find the parameter set with the - * most consistent high performance - * @param folds number of folds in cross validation - * @param Np size of the array of p values - * @param Nl size of the array of lambda values - * @param Nk size of the array of kappa values - * @param Ne size of the array of epsilon values - * @param Nw size of the array of weight_idx values - * @param Ng size of the array of gamma values - * @param Nc size of the array of coef values - * @param Nd size of the array of degree values - * @param *weight_idxs array of weight_idxs - * @param *ps array of p values - * @param *lambdas array of lambda values - * @param *kappas array of kappa values - * @param *epsilons array of epsilon values - * @param *gammas array of gamma values - * @param *coefs array of coef values - * @param *degrees array of degree values - * @param *train_data_file filename of train data file - * @param *test_data_file filename of test data file - * - */ -struct Training { - TrainType traintype; - KernelType kerneltype; - long repeats; - long folds; - long Np; - long Nl; - long Nk; - long Ne; - long Nw; - long Ng; - long Nc; - long Nd; - int *weight_idxs; - double *ps; - double *lambdas; - double *kappas; - double *epsilons; - double *gammas; - double *coefs; - double *degrees; - char *train_data_file; - char *test_data_file; -}; - -void make_queue(struct Training *training, struct Queue *queue, - struct GenData *train_data, struct GenData *test_data); - -struct Task *get_next_task(struct Queue *q); -void free_queue(struct Queue *q); - -void consistency_repeats(struct Queue *q, long repeats, TrainType traintype); - -void make_model_from_task(struct Task *task, struct GenModel *model); -void copy_model(struct GenModel *from, struct GenModel *to); - -void print_progress_string(struct Task *task, long N); - -// new -void start_training(struct Queue *q); -double gensvm_cross_validation(struct GenModel *model, - struct GenData **train_folds, struct GenData **test_folds, - int folds, long n_total); -#endif diff --git a/include/gensvm_util.h b/include/gensvm_util.h deleted file mode 100644 index 5ea2198..0000000 --- a/include/gensvm_util.h +++ /dev/null @@ -1,26 +0,0 @@ -/** - * @file util.h - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Header file for util.c - * - * @details - * Function declarations for utility functions of the program. - * - */ - -#ifndef GENSVM_UTIL_H -#define GENSVM_UTIL_H - -// forward declarations -struct GenData; -struct GenModel; - -// function declarations -int gensvm_check_argv(int argc, char **argv, char *str); -int gensvm_check_argv_eq(int argc, char **argv, char *str); - -void note(const char *fmt,...); -void err(const char *fmt,...); - -#endif diff --git a/include/globals.h b/include/globals.h index becde35..7fad7b1 100644 --- a/include/globals.h +++ b/include/globals.h @@ -18,12 +18,17 @@ #ifndef GENSVM_GLOBALS_H #define GENSVM_GLOBALS_H +#include "gensvm_memory.h" +#include "types.h" + +#include #include #include #include #include - -#include "gensvm_memory.h" +#include +#include +#include #define MAX_LINE_LENGTH 1024 @@ -33,4 +38,22 @@ #define minimum(a, b) (a) < (b) ? (a) : (b) #endif +// Set a matrix element (RowMajor) +#define matrix_set(M, cols, i, j, val) M[(i)*(cols)+j] = val + +// Get a matrix element (RowMajor) +#define matrix_get(M, cols, i, j) M[(i)*(cols)+j] + +// Add to a matrix element (RowMajor) +#define matrix_add(M, cols, i, j, val) M[(i)*(cols)+j] += val + +// Multiply a matrix element (RowMajor) +#define matrix_mul(M, cols, i, j, val) M[(i)*(cols)+j] *= val + +// Set a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) +#define matrix3_set(M, N2, N3, i, j, k, val) M[k+(N3)*(j+(N2)*(i))] = val + +// Get a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) +#define matrix3_get(M, N2, N3, i, j, k) M[k+(N3)*(j+(N2)*(i))] + #endif diff --git a/include/libGenSVM.h b/include/libGenSVM.h index 9e2d4c2..146fc67 100644 --- a/include/libGenSVM.h +++ b/include/libGenSVM.h @@ -23,7 +23,6 @@ struct GenData; struct GenModel; // function declarations -void gensvm_simplex_gen(long K, double *U); void gensvm_category_matrix(struct GenModel *model, struct GenData *data); void gensvm_simplex_diff(struct GenModel *model, struct GenData *dataset); @@ -33,8 +32,5 @@ void gensvm_calculate_huber(struct GenModel *model); void gensvm_step_doubling(struct GenModel *model); -void gensvm_seed_model_V(struct GenModel *from_model, - struct GenModel *to_model, struct GenData *data); -void gensvm_initialize_weights(struct GenData *data, struct GenModel *model); #endif diff --git a/src/GenSVMgrid.c b/src/GenSVMgrid.c index 89b85a7..a6c749e 100644 --- a/src/GenSVMgrid.c +++ b/src/GenSVMgrid.c @@ -8,9 +8,9 @@ * This is a command line interface to the parameter grid search functionality * of the algorithm. The grid search is specified in a separate file, thereby * reducing the number of command line arguments. See - * read_training_from_file() for documentation on the training file. + * read_grid_from_file() for documentation on the grid file. * - * The program runs a grid search as specified in the training file. If + * The program runs a grid search as specified in the grid file. If * desired the grid search can incorporate consistency checks to find the * configuration among the best configurations which scores consistently high. * All output is written to stdout, unless the quiet mode is specified. @@ -19,18 +19,9 @@ * */ -#include - -#include "globals.h" -#include "gensvm.h" -#include "gensvm_crossval.h" +#include "gensvm_cmdarg.h" #include "gensvm_io.h" -#include "gensvm_init.h" -#include "gensvm_pred.h" -#include "gensvm_strutil.h" -#include "gensvm_train.h" -#include "gensvm_train_dataset.h" -#include "gensvm_util.h" +#include "gensvm_gridsearch.h" #define MINARGS 2 @@ -39,10 +30,7 @@ extern FILE *GENSVM_OUTPUT_FILE; // function declarations void exit_with_help(); void parse_command_line(int argc, char **argv, char *input_filename); -void read_training_from_file(char *input_filename, struct Training *training); -struct Training *gensvm_init_training(); -struct Queue *gensvm_init_queue(); -void gensvm_free_training(struct Training *training); +void read_grid_from_file(char *input_filename, struct GenGrid *grid); /** * @brief Help function @@ -50,7 +38,7 @@ void gensvm_free_training(struct Training *training); void exit_with_help() { printf("This is GenSVM, version %1.1f\n\n", VERSION); - printf("Usage: trainGenSVMdataset [options] training_file\n"); + printf("Usage: trainGenSVMdataset [options] grid_file\n"); printf("Options:\n"); printf("-h | -help : print this help.\n"); printf("-q : quiet mode (no output)\n"); @@ -62,7 +50,7 @@ void exit_with_help() * @brief Main interface function for trainGenSVMdataset * * @details - * Main interface for the command line program. A given training file which + * Main interface for the command line program. A given grid file which * specifies a grid search over a single dataset is read. From this, a Queue * is created containing all Task instances that need to be performed in the * search. Depending on the type of dataset, either cross validation or @@ -78,28 +66,28 @@ int main(int argc, char **argv) { char input_filename[MAX_LINE_LENGTH]; - struct Training *training = gensvm_init_training(); + struct GenGrid *grid = gensvm_init_grid(); struct GenData *train_data = gensvm_init_data(); struct GenData *test_data = gensvm_init_data(); - struct Queue *q = gensvm_init_queue(); + struct GenQueue *q = gensvm_init_queue(); if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help") || gensvm_check_argv_eq(argc, argv, "-h") ) exit_with_help(); parse_command_line(argc, argv, input_filename); - note("Reading training file\n"); - read_training_from_file(input_filename, training); + note("Reading grid file\n"); + read_grid_from_file(input_filename, grid); - note("Reading data from %s\n", training->train_data_file); - gensvm_read_data(train_data, training->train_data_file); - if (training->traintype == TT) { - note("Reading data from %s\n", training->test_data_file); - gensvm_read_data(test_data, training->test_data_file); + note("Reading data from %s\n", grid->train_data_file); + gensvm_read_data(train_data, grid->train_data_file); + if (grid->traintype == TT) { + note("Reading data from %s\n", grid->test_data_file); + gensvm_read_data(test_data, grid->test_data_file); } note("Creating queue\n"); - make_queue(training, q, train_data, test_data); + gensvm_fill_queue(grid, q, train_data, test_data); srand(time(NULL)); @@ -107,12 +95,12 @@ int main(int argc, char **argv) start_training(q); note("Training finished\n"); - if (training->repeats > 0) { - consistency_repeats(q, training->repeats, training->traintype); + if (grid->repeats > 0) { + consistency_repeats(q, grid->repeats, grid->traintype); } - free_queue(q); - gensvm_free_training(training); + gensvm_free_queue(q); + gensvm_free_grid(grid); gensvm_free_data(train_data); gensvm_free_data(test_data); @@ -125,13 +113,13 @@ int main(int argc, char **argv) * * @details * Few arguments can be supplied to the command line. Only quiet mode can be - * specified, or help can be requested. The filename of the training file is - * read from the arguments. Parsing of the training file is done separately in - * read_training_from_file(). + * specified, or help can be requested. The filename of the grid file is + * read from the arguments. Parsing of the grid file is done separately in + * read_grid_from_file(). * * @param[in] argc number of command line arguments * @param[in] argv array of command line arguments - * @param[in] input_filename pre-allocated buffer for the training + * @param[in] input_filename pre-allocated buffer for the grid * filename. * */ @@ -181,21 +169,21 @@ KernelType parse_kernel_str(char *kernel_line) } /** - * @brief Read the Training struct from file + * @brief Read the GenGrid struct from file * * @details - * Read the Training struct from a file. The training file follows a specific - * format specified in @ref spec_training_file. + * Read the GenGrid struct from a file. The grid file follows a specific + * format specified in @ref spec_grid_file. * * Commonly used string functions in this function are all_doubles_str() and * all_longs_str(). * - * @param[in] input_filename filename of the training file - * @param[in] training Training structure to place the parsed + * @param[in] input_filename filename of the grid file + * @param[in] grid GenGrid structure to place the parsed * parameter grid. * */ -void read_training_from_file(char *input_filename, struct Training *training) +void read_grid_from_file(char *input_filename, struct GenGrid *grid) { long i, nr = 0; FILE *fid; @@ -207,108 +195,108 @@ void read_training_from_file(char *input_filename, struct Training *training) fid = fopen(input_filename, "r"); if (fid == NULL) { - fprintf(stderr, "Error opening training file %s\n", + fprintf(stderr, "Error opening grid file %s\n", input_filename); exit(1); } - training->traintype = CV; + grid->traintype = CV; while ( fgets(buffer, MAX_LINE_LENGTH, fid) != NULL ) { Memset(params, double, MAX_LINE_LENGTH); Memset(lparams, long, MAX_LINE_LENGTH); if (str_startswith(buffer, "train:")) { sscanf(buffer, "train: %s\n", train_filename); - training->train_data_file = Calloc(char, + grid->train_data_file = Calloc(char, MAX_LINE_LENGTH); - strcpy(training->train_data_file, train_filename); + strcpy(grid->train_data_file, train_filename); } else if (str_startswith(buffer, "test:")) { sscanf(buffer, "test: %s\n", test_filename); - training->test_data_file = Calloc(char, + grid->test_data_file = Calloc(char, MAX_LINE_LENGTH); - strcpy(training->test_data_file, test_filename); - training->traintype = TT; + strcpy(grid->test_data_file, test_filename); + grid->traintype = TT; } else if (str_startswith(buffer, "p:")) { nr = all_doubles_str(buffer, 2, params); - training->ps = Calloc(double, nr); + grid->ps = Calloc(double, nr); for (i=0; ips[i] = params[i]; - training->Np = nr; + grid->ps[i] = params[i]; + grid->Np = nr; } else if (str_startswith(buffer, "lambda:")) { nr = all_doubles_str(buffer, 7, params); - training->lambdas = Calloc(double, nr); + grid->lambdas = Calloc(double, nr); for (i=0; ilambdas[i] = params[i]; - training->Nl = nr; + grid->lambdas[i] = params[i]; + grid->Nl = nr; } else if (str_startswith(buffer, "kappa:")) { nr = all_doubles_str(buffer, 6, params); - training->kappas = Calloc(double, nr); + grid->kappas = Calloc(double, nr); for (i=0; ikappas[i] = params[i]; - training->Nk = nr; + grid->kappas[i] = params[i]; + grid->Nk = nr; } else if (str_startswith(buffer, "epsilon:")) { nr = all_doubles_str(buffer, 8, params); - training->epsilons = Calloc(double, nr); + grid->epsilons = Calloc(double, nr); for (i=0; iepsilons[i] = params[i]; - training->Ne = nr; + grid->epsilons[i] = params[i]; + grid->Ne = nr; } else if (str_startswith(buffer, "weight:")) { nr = all_longs_str(buffer, 7, lparams); - training->weight_idxs = Calloc(int, nr); + grid->weight_idxs = Calloc(int, nr); for (i=0; iweight_idxs[i] = lparams[i]; - training->Nw = nr; + grid->weight_idxs[i] = lparams[i]; + grid->Nw = nr; } else if (str_startswith(buffer, "folds:")) { nr = all_longs_str(buffer, 6, lparams); - training->folds = lparams[0]; + grid->folds = lparams[0]; if (nr > 1) fprintf(stderr, "Field \"folds\" only takes " "one value. Additional " "fields are ignored.\n"); } else if (str_startswith(buffer, "repeats:")) { nr = all_longs_str(buffer, 8, lparams); - training->repeats = lparams[0]; + grid->repeats = lparams[0]; if (nr > 1) fprintf(stderr, "Field \"repeats\" only " "takes one value. Additional " "fields are ignored.\n"); } else if (str_startswith(buffer, "kernel:")) { - training->kerneltype = parse_kernel_str(buffer); + grid->kerneltype = parse_kernel_str(buffer); } else if (str_startswith(buffer, "gamma:")) { nr = all_doubles_str(buffer, 6, params); - if (training->kerneltype == K_LINEAR) { + if (grid->kerneltype == K_LINEAR) { fprintf(stderr, "Field \"gamma\" ignored, " "linear kernel is used.\n"); - training->Ng = 0; + grid->Ng = 0; break; } - training->gammas = Calloc(double, nr); + grid->gammas = Calloc(double, nr); for (i=0; igammas[i] = params[i]; - training->Ng = nr; + grid->gammas[i] = params[i]; + grid->Ng = nr; } else if (str_startswith(buffer, "coef:")) { nr = all_doubles_str(buffer, 5, params); - if (training->kerneltype == K_LINEAR || - training->kerneltype == K_RBF) { + if (grid->kerneltype == K_LINEAR || + grid->kerneltype == K_RBF) { fprintf(stderr, "Field \"coef\" ignored with " "specified kernel.\n"); - training->Nc = 0; + grid->Nc = 0; break; } - training->coefs = Calloc(double, nr); + grid->coefs = Calloc(double, nr); for (i=0; icoefs[i] = params[i]; - training->Nc = nr; + grid->coefs[i] = params[i]; + grid->Nc = nr; } else if (str_startswith(buffer, "degree:")) { nr = all_doubles_str(buffer, 7, params); - if (training->kerneltype != K_POLY) { + if (grid->kerneltype != K_POLY) { fprintf(stderr, "Field \"degree\" ignored " "with specified kernel.\n"); - training->Nd = 0; + grid->Nd = 0; break; } - training->degrees = Calloc(double, nr); + grid->degrees = Calloc(double, nr); for (i=0; idegrees[i] = params[i]; - training->Nd = nr; + grid->degrees[i] = params[i]; + grid->Nd = nr; } else { fprintf(stderr, "Cannot find any parameters on line: " "%s\n", buffer); @@ -319,62 +307,3 @@ void read_training_from_file(char *input_filename, struct Training *training) free(lparams); fclose(fid); } - -struct Training *gensvm_init_training() -{ - struct Training *training = Malloc(struct Training, 1); - - // initialize to defaults - training->traintype = CV; - training->kerneltype = K_LINEAR; - training->repeats = 0; - training->folds = 10; - training->Np = 0; - training->Nl = 0; - training->Nk = 0; - training->Ne = 0; - training->Nw = 0; - training->Ng = 0; - training->Nc = 0; - training->Nd = 0; - - // set arrays to NULL - training->weight_idxs = NULL; - training->ps = NULL; - training->lambdas = NULL; - training->kappas = NULL; - training->epsilons = NULL; - training->gammas = NULL; - training->coefs = NULL; - training->degrees = NULL; - training->train_data_file = NULL; - training->test_data_file = NULL; - - return training; -} - -struct Queue *gensvm_init_queue() -{ - struct Queue *q = Malloc(struct Queue, 1); - - q->tasks = NULL; - q->N = 0; - q->i = 0; - - return q; -} - -void gensvm_free_training(struct Training *training) -{ - free(training->weight_idxs); - free(training->ps); - free(training->lambdas); - free(training->kappas); - free(training->epsilons); - free(training->gammas); - free(training->coefs); - free(training->degrees); - free(training->train_data_file); - free(training->test_data_file); - free(training); -} diff --git a/src/GenSVMpred.c b/src/GenSVMpred.c deleted file mode 100644 index 57680b1..0000000 --- a/src/GenSVMpred.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * 20140317: - * THIS FUNCTION IS DEPRECATED, SINCE IT DOES NOT WORK WITH KERNELS. - * - */ - -/** - * @file GenSVM_pred.c - * @author Gertjan van den Burg - * @date January, 2014 - * @brief Command line interface for predicting class labels - * - * @details - * This is a command line program for predicting the class labels or - * determining the predictive performance of a pre-determined model on a given - * test dataset. The predictive performance can be written to the screen or - * the predicted class labels can be written to a specified output file. This - * is done using gensvm_write_predictions(). - * - * The specified model file must follow the specification given in - * gensvm_write_model(). - * - * For usage information, see the program help function. - * - */ - -#include "gensvm.h" -#include "gensvm_init.h" -#include "gensvm_io.h" -#include "gensvm_pred.h" -#include "gensvm_util.h" - -#define MINARGS 3 - -extern FILE *GENSVM_OUTPUT_FILE; - -// function declarations -void exit_with_help(); -void parse_command_line(int argc, char **argv, - char *input_filename, char *output_filename, - char *model_filename); - -/** - * @brief Help function - */ -void exit_with_help() -{ - printf("This is GenSVM, version %1.1f\n\n", VERSION); - printf("Usage: predGenSVM [options] test_data_file model_file\n"); - printf("Options:\n"); - printf("-o output_file : write output to file\n"); - printf("-q : quiet mode (no output)\n"); - exit(0); -} - -/** - * @brief Main interface function for predGenSVM - * - * @details - * Main interface for the command line program. A given model file is read and - * a test dataset is initialized from the given data. The predictive - * performance (hitrate) of the model on the test set is printed to the output - * stream (default = stdout). If an output file is specified the predictions - * are written to the file. - * - * @todo - * Ensure that the program can read model files without class labels - * specified. In that case no prediction accuracy is printed to the screen. - * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * - */ -int main(int argc, char **argv) -{ - long *predy; - double performance; - - char input_filename[MAX_LINE_LENGTH]; - char model_filename[MAX_LINE_LENGTH]; - char output_filename[MAX_LINE_LENGTH];; - - if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help") - || gensvm_check_argv_eq(argc, argv, "-h") ) - exit_with_help(); - parse_command_line(argc, argv, input_filename, output_filename, - model_filename); - - // read the data and model - struct GenModel *model = gensvm_init_model(); - struct GenData *data = gensvm_init_data(); - gensvm_read_data(data, input_filename); - gensvm_read_model(model, model_filename); - - // check if the number of attributes in data equals that in model - if (data->m != model->m) { - fprintf(stderr, "Error: number of attributes in data (%li) " - "does not equal the number of attributes in " - "model (%li)\n", data->m, model->m); - exit(1); - } else if (data->K != model->K) { - fprintf(stderr, "Error: number of classes in data (%li) " - "does not equal the number of classes in " - "model (%li)\n", data->K, model->K); - exit(1); - } - - // predict labels and performance if test data has labels - predy = Calloc(long, data->n); - gensvm_predict_labels(data, model, predy); - if (data->y != NULL) { - performance = gensvm_prediction_perf(data, predy); - note("Predictive performance: %3.2f%%\n", performance); - } - - // if output file is specified, write predictions to it - if (gensvm_check_argv_eq(argc, argv, "-o")) { - gensvm_write_predictions(data, predy, output_filename); - note("Predictions written to: %s\n", output_filename); - } - - // free the model, data, and predictions - gensvm_free_model(model); - gensvm_free_data(data); - free(predy); - - return 0; -} - -/** - * @brief Parse command line arguments - * - * @details - * Read the data filename and model filename from the command line arguments. - * If specified, also read the output filename. If the quiet flag is given, - * set the global output stream to NULL. On error, exit_with_help(). - * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * @param[in] input_filename pre-allocated array for the input - * filename - * @param[in] output_filename pre-allocated array for the output - * filename - * @param[in] model_filename pre-allocated array for the model - * filename - * - */ -void parse_command_line(int argc, char **argv, char *input_filename, - char *output_filename, char *model_filename) -{ - int i; - - GENSVM_OUTPUT_FILE = stdout; - - for (i=1; i= argc) - exit_with_help(); - switch (argv[i-1][1]) { - case 'o': - strcpy(output_filename, argv[i]); - break; - case 'q': - GENSVM_OUTPUT_FILE = NULL; - i--; - break; - default: - fprintf(stderr, "Unknown option: -%c\n", - argv[i-1][1]); - exit_with_help(); - } - } - - if (i >= argc) - exit_with_help(); - - strcpy(input_filename, argv[i]); - i++; - strcpy(model_filename, argv[i]); -} diff --git a/src/GenSVMtrain.c b/src/GenSVMtrain.c deleted file mode 100644 index 3bb9c09..0000000 --- a/src/GenSVMtrain.c +++ /dev/null @@ -1,245 +0,0 @@ -/** - * @file GenSVM_train.c - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Command line interface for training a single model with GenSVM - * - * @details - * This is a command line program for training a single model on a given - * dataset. To run a grid search over a number of parameter configurations, - * see trainGenSVMdataset.c. - * - */ - -#include -#include - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_io.h" -#include "gensvm_init.h" -#include "gensvm_kernel.h" -#include "gensvm_train.h" -#include "gensvm_util.h" - -#define MINARGS 2 - -extern FILE *GENSVM_OUTPUT_FILE; - -// function declarations -void exit_with_help(); -void parse_command_line(int argc, char **argv, struct GenModel *model, - char *input_filename, char *output_filename, char *model_filename); - -/** - * @brief Help function - */ -void exit_with_help() -{ - printf("This is GenSVM, version %1.1f\n\n", VERSION); - printf("Usage: trainGenSVM [options] training_data_file\n"); - printf("Options:\n"); - printf("-c coef : coefficient for the polynomial and sigmoid kernel\n"); - printf("-d degree : degree for the polynomial kernel\n"); - printf("-e epsilon : set the value of the stopping criterion\n"); - printf("-g gamma : parameter for the rbf, polynomial or sigmoid " - "kernel\n"); - printf("-h | -help : print this help.\n"); - printf("-k kappa : set the value of kappa used in the Huber hinge\n"); - printf("-l lambda : set the value of lambda (lambda > 0)\n"); - printf("-m model_file : use previous model as seed for W and t\n"); - printf("-o output_file : write output to file\n"); - printf("-p p-value : set the value of p in the lp norm " - "(1.0 <= p <= 2.0)\n"); - printf("-q : quiet mode (no output)\n"); - printf("-r rho : choose the weigth specification (1 = unit, 2 = " - "group)\n"); - printf("-t type: kerneltype (LINEAR=0, POLY=1, RBF=2, SIGMOID=3)\n"); - - exit(0); -} - -/** - * @brief Main interface function for trainGenSVM - * - * @details - * Main interface for the command line program. A given dataset file is read - * and a GenSVM model is trained on this data. By default the progress of the - * computations are written to stdout. See for full options of the program the - * help function. - * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * - */ -int main(int argc, char **argv) -{ - char input_filename[MAX_LINE_LENGTH]; - char model_filename[MAX_LINE_LENGTH]; - char output_filename[MAX_LINE_LENGTH]; - - struct GenModel *model = gensvm_init_model(); - struct GenData *data = gensvm_init_data(); - - if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help") - || gensvm_check_argv_eq(argc, argv, "-h") ) - exit_with_help(); - parse_command_line(argc, argv, model, input_filename, - output_filename, model_filename); - - // read data file - gensvm_read_data(data, input_filename); - - // copy dataset parameters to model - model->n = data->n; - model->m = data->m; - model->K = data->K; - model->data_file = input_filename; - - // allocate model - gensvm_allocate_model(model); - - // initialize kernel (if necessary) - //gensvm_make_kernel(model, data); - - // reallocate model and initialize weights - gensvm_reallocate_model(model, data->n, data->m); - gensvm_initialize_weights(data, model); - - // seed the random number generator (only place in programs is in - // command line interfaces) - srand(time(NULL)); - - if (gensvm_check_argv_eq(argc, argv, "-m")) { - struct GenModel *seed_model = gensvm_init_model(); - gensvm_read_model(seed_model, model_filename); - gensvm_seed_model_V(seed_model, model, data); - gensvm_free_model(seed_model); - } else { - gensvm_seed_model_V(NULL, model, data); - } - - // start training - gensvm_optimize(model, data); - - // write_model to file - if (gensvm_check_argv_eq(argc, argv, "-o")) { - gensvm_write_model(model, output_filename); - note("Output written to %s\n", output_filename); - } - - // free model and data - gensvm_free_model(model); - gensvm_free_data(data); - - return 0; -} - -/** - * @brief Parse command line arguments - * - * @details - * Process the command line arguments for the model parameters, and record - * them in the specified GenModel. An input filename for the dataset is read - * and if specified an output filename and a model filename for the seed - * model. - * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * @param[in] model initialized model - * @param[in] input_filename pre-allocated buffer for the input - * filename - * @param[in] output_filename pre-allocated buffer for the output - * filename - * @param[in] model_filename pre-allocated buffer for the model - * filename - * - */ -void parse_command_line(int argc, char **argv, struct GenModel *model, - char *input_filename, char *output_filename, char *model_filename) -{ - int i; - double gamma = 1.0, - degree = 2.0, - coef = 0.0; - - GENSVM_OUTPUT_FILE = stdout; - - // parse options - for (i=1; i=argc) { - exit_with_help(); - } - switch (argv[i-1][1]) { - case 'c': - coef = atof(argv[i]); - break; - case 'd': - degree = atof(argv[i]); - break; - case 'e': - model->epsilon = atof(argv[i]); - break; - case 'g': - gamma = atof(argv[i]); - break; - case 'k': - model->kappa = atof(argv[i]); - break; - case 'l': - model->lambda = atof(argv[i]); - break; - case 'm': - strcpy(model_filename, argv[i]); - break; - case 'o': - strcpy(output_filename, argv[i]); - break; - case 'p': - model->p = atof(argv[i]); - break; - case 'r': - model->weight_idx = atoi(argv[i]); - break; - case 't': - model->kerneltype = atoi(argv[i]); - break; - case 'q': - GENSVM_OUTPUT_FILE = NULL; - i--; - break; - default: - fprintf(stderr, "Unknown option: -%c\n", - argv[i-1][1]); - exit_with_help(); - } - } - - // read input filename - if (i >= argc) - exit_with_help(); - - strcpy(input_filename, argv[i]); - - // set kernel parameters - switch (model->kerneltype) { - case K_LINEAR: - break; - case K_POLY: - model->kernelparam = Calloc(double, 3); - model->kernelparam[0] = gamma; - model->kernelparam[1] = coef; - model->kernelparam[2] = degree; - break; - case K_RBF: - model->kernelparam = Calloc(double, 1); - model->kernelparam[0] = gamma; - break; - case K_SIGMOID: - model->kernelparam = Calloc(double, 1); - model->kernelparam[0] = gamma; - model->kernelparam[1] = coef; - } -} diff --git a/src/GenSVMtraintest.c b/src/GenSVMtraintest.c index 0199725..47cc900 100644 --- a/src/GenSVMtraintest.c +++ b/src/GenSVMtraintest.c @@ -10,17 +10,11 @@ * */ -#include - -#include "globals.h" -#include "gensvm.h" +#include "gensvm_cmdarg.h" #include "gensvm_io.h" #include "gensvm_init.h" -#include "gensvm_kernel.h" -#include "gensvm_train.h" +#include "gensvm_optimize.h" #include "gensvm_pred.h" -#include "libGenSVM.h" -#include "gensvm_util.h" #define MINARGS 2 @@ -118,10 +112,10 @@ int main(int argc, char **argv) if (gensvm_check_argv_eq(argc, argv, "-s")) { struct GenModel *seed_model = gensvm_init_model(); gensvm_read_model(seed_model, model_inputfile); - gensvm_seed_model_V(seed_model, model, traindata); + gensvm_init_V(seed_model, model, traindata); gensvm_free_model(seed_model); } else { - gensvm_seed_model_V(NULL, model, traindata); + gensvm_init_V(NULL, model, traindata); } // start training @@ -274,4 +268,3 @@ void parse_command_line(int argc, char **argv, struct GenModel *model, model->kernelparam[1] = coef; } } - diff --git a/src/gensvm_base.c b/src/gensvm_base.c new file mode 100644 index 0000000..eddef5c --- /dev/null +++ b/src/gensvm_base.c @@ -0,0 +1,211 @@ +/** + * @file gensvm_base.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for initializing GenModel and GenData structures + * + * @details + * This file contains functions for initializing, freeing, allocating, and + * reallocating a GenModel instance. It also contains functions for + * initializing and freeing a GenData structure. In addition, default values + * for these structures are defined here (and only here). + * + */ + +#include "gensvm_base.h" + +/** + * @brief Initialize a GenData structure + * + * @details + * A GenData structure is initialized and default values are set. + * A pointer to the initialized data is returned. + * + * @returns initialized GenData + * + */ +struct GenData *gensvm_init_data() +{ + struct GenData *data = Malloc(struct GenData, 1); + data->Sigma = NULL; + data->y = NULL; + data->Z = NULL; + data->RAW = NULL; + + // set default values + data->kerneltype = K_LINEAR; + data->kernelparam = NULL; + + return data; +} + +/** + * @brief Free allocated GenData struct + * + * @details + * Simply free a previously allocated GenData struct by freeing all its + * components. Note that the data struct itself is also freed here. + * + * @param[in] data GenData struct to free + * + */ +void gensvm_free_data(struct GenData *data) +{ + if (data == NULL) + return; + + if (data->Z == data->RAW) { + free(data->Z); + } else { + free(data->Z); + free(data->RAW); + } + free(data->kernelparam); + free(data->y); + free(data->Sigma); + free(data); +} + +/** + * @brief Initialize a GenModel structure + * + * @details + * A GenModel structure is initialized and the default value for the + * parameters are set. A pointer to the initialized model is returned. + * + * @returns initialized GenModel + */ +struct GenModel *gensvm_init_model() +{ + struct GenModel *model = Malloc(struct GenModel, 1); + + // set default values + model->p = 1.0; + model->lambda = pow(2, -8.0); + model->epsilon = 1e-6; + model->kappa = 0.0; + model->weight_idx = 1; + model->kerneltype = K_LINEAR; + model->kernelparam = NULL; + + model->W = NULL; + model->t = NULL; + model->V = NULL; + model->Vbar = NULL; + model->U = NULL; + model->UU = NULL; + model->Q = NULL; + model->H = NULL; + model->R = NULL; + model->rho = NULL; + model->data_file = NULL; + + return model; +} + +/** + * @brief Allocate memory for a GenModel + * + * @details + * This function can be used to allocate the memory needed for a GenModel. All + * arrays in the model are specified and initialized to 0. + * + * @param[in] model GenModel to allocate + * + */ +void gensvm_allocate_model(struct GenModel *model) +{ + long n = model->n; + long m = model->m; + long K = model->K; + + model->W = Calloc(double, m*(K-1)); + model->t = Calloc(double, K-1); + model->V = Calloc(double, (m+1)*(K-1)); + model->Vbar = Calloc(double, (m+1)*(K-1)); + model->U = Calloc(double, K*(K-1)); + model->UU = Calloc(double, n*K*(K-1)); + model->Q = Calloc(double, n*K); + model->H = Calloc(double, n*K); + model->R = Calloc(double, n*K); + model->rho = Calloc(double, n); +} + +/** + * @brief Reallocate memory for GenModel + * + * @details + * This function can be used to reallocate existing memory for a GenModel, + * upon a change in the model dimensions. This is used in combination with + * kernels. + * + * @param[in] model GenModel to reallocate + * @param[in] n new value of GenModel->n + * @param[in] m new value of GenModel->m + * + */ +void gensvm_reallocate_model(struct GenModel *model, long n, long m) +{ + long K = model->K; + + if (model->n == n && model->m == m) + return; + if (model->n != n) { + model->UU = Realloc(model->UU, double, n*K*(K-1)); + Memset(model->UU, double, n*K*(K-1)); + + model->Q = Realloc(model->Q, double, n*K); + Memset(model->Q, double, n*K); + + model->H = Realloc(model->H, double, n*K); + Memset(model->H, double, n*K); + + model->R = Realloc(model->R, double, n*K); + Memset(model->R, double, n*K); + + model->rho = Realloc(model->rho, double, n); + Memset(model->rho, double, n); + + model->n = n; + } + if (model->m != m) { + model->W = Realloc(model->W, double, m*(K-1)); + Memset(model->W, double, m*(K-1)); + + model->V = Realloc(model->V, double, (m+1)*(K-1)); + Memset(model->V, double, (m+1)*(K-1)); + + model->Vbar = Realloc(model->Vbar, double, (m+1)*(K-1)); + Memset(model->Vbar, double, (m+1)*(K-1)); + + model->m = m; + } +} + +/** + * @brief Free allocated GenModel struct + * + * @details + * Simply free a previously allocated GenModel by freeing all its component + * arrays. Note that the model struct itself is also freed here. + * + * @param[in] model GenModel to free + * + */ +void gensvm_free_model(struct GenModel *model) +{ + free(model->W); + free(model->t); + free(model->V); + free(model->Vbar); + free(model->U); + free(model->UU); + free(model->Q); + free(model->H); + free(model->rho); + free(model->R); + free(model->kernelparam); + + free(model); +} + diff --git a/src/gensvm_cmdarg.c b/src/gensvm_cmdarg.c new file mode 100644 index 0000000..8f796bb --- /dev/null +++ b/src/gensvm_cmdarg.c @@ -0,0 +1,71 @@ +/** + * @file gensvm_cmdarg.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for dealing with command line arguments + * + * @details + * This file contains several utility functions for coordinating input and + * output of data and model files. + * + */ + +#include "gensvm_cmdarg.h" + +/** + * @brief Check if any command line arguments contain string + * + * @details + * Check if any of a given array of command line arguments contains a given + * string. If the string is found, the index of the string in argv is + * returned. If the string is not found, 0 is returned. + * + * This function is copied from MSVMpack/libMSVM.c. + * + * @param[in] argc number of command line arguments + * @param[in] argv command line arguments + * @param[in] str string to find in the arguments + * @returns index of the string in the arguments if found, 0 + * otherwise + */ +int gensvm_check_argv(int argc, char **argv, char *str) +{ + int i; + int arg_str = 0; + for (i=1; iweight_idx = from->weight_idx; + to->epsilon = from->epsilon; + to->p = from->p; + to->kappa = from->kappa; + to->lambda = from->lambda; + + to->kerneltype = from->kerneltype; + switch (to->kerneltype) { + case K_LINEAR: + break; + case K_POLY: + to->kernelparam = Malloc(double, 3); + to->kernelparam[0] = from->kernelparam[0]; + to->kernelparam[1] = from->kernelparam[1]; + to->kernelparam[2] = from->kernelparam[2]; + break; + case K_RBF: + to->kernelparam = Malloc(double, 1); + to->kernelparam[0] = from->kernelparam[0]; + break; + case K_SIGMOID: + to->kernelparam = Malloc(double, 2); + to->kernelparam[0] = from->kernelparam[0]; + to->kernelparam[1] = from->kernelparam[1]; + break; + } +} diff --git a/src/gensvm_crossval.c b/src/gensvm_crossval.c deleted file mode 100644 index 8f09cb5..0000000 --- a/src/gensvm_crossval.c +++ /dev/null @@ -1,144 +0,0 @@ -/** - * @file crossval.c - * @author Gertjan van den Burg - * @date January 7, 2014 - * @brief Functions for cross validation - * - * @details - * This file contains functions for performing cross validation. The funtion - * gensvm_make_cv_split() creates a cross validation vector for non-stratified - * cross validation. The function gensvm_get_tt_split() creates a train and - * test dataset from a given dataset and a pre-determined CV partition vector. - * See individual function documentation for details. - * - */ - -#include "globals.h" -#include "gensvm.h" -#include "gensvm_crossval.h" -#include "gensvm_matrix.h" - -/** - * @brief Create a cross validation split vector - * - * @details - * A pre-allocated vector of length N is created which can be used to define - * cross validation splits. The folds are contain between - * @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$ - * instances. An instance is mapped to a partition randomly until all folds - * contain @f$ N \% folds @f$ instances. The zero fold then contains - * @f$ N / folds + N \% folds @f$ instances. These remaining @f$ N \% folds @f$ - * instances are then distributed over the first @f$ N \% folds @f$ folds. - * - * @param[in] N number of instances - * @param[in] folds number of folds - * @param[in,out] cv_idx array of size N which contains the fold index - * for each observation on exit - * - */ -void gensvm_make_cv_split(long N, long folds, long *cv_idx) -{ - long i, j, idx; - - for (i=0; in; - long m = full_data->m; - long K = full_data->K; - - double value; - - test_n = 0; - for (i=0; in = test_n; - train_data->n = train_n; - - train_data->K = K; - test_data->K = K; - - train_data->m = m; - test_data->m = m; - - train_data->y = Calloc(long, train_n); - test_data->y = Calloc(long, test_n); - - train_data->RAW = Calloc(double, train_n*(m+1)); - test_data->RAW = Calloc(double, test_n*(m+1)); - - k = 0; - l = 0; - for (i=0; iy[k] = full_data->y[i]; - for (j=0; jRAW, m+1, i, j); - matrix_set(test_data->RAW, m+1, k, j, value); - } - k++; - } else { - train_data->y[l] = full_data->y[i]; - for (j=0; jRAW, m+1, i, j); - matrix_set(train_data->RAW, m+1, l, j, value); - } - l++; - } - } - - train_data->Z = train_data->RAW; - test_data->Z = test_data->RAW; -} diff --git a/src/gensvm_cv_util.c b/src/gensvm_cv_util.c new file mode 100644 index 0000000..d9cde09 --- /dev/null +++ b/src/gensvm_cv_util.c @@ -0,0 +1,141 @@ +/** + * @file gensvm_cv_util.c + * @author Gertjan van den Burg + * @date January 7, 2014 + * @brief Functions for cross validation + * + * @details + * This file contains functions for performing cross validation. The funtion + * gensvm_make_cv_split() creates a cross validation vector for non-stratified + * cross validation. The function gensvm_get_tt_split() creates a train and + * test dataset from a given dataset and a pre-determined CV partition vector. + * See individual function documentation for details. + * + */ + +#include "gensvm_cv_util.h" + +/** + * @brief Create a cross validation split vector + * + * @details + * A pre-allocated vector of length N is created which can be used to define + * cross validation splits. The folds are contain between + * @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$ + * instances. An instance is mapped to a partition randomly until all folds + * contain @f$ N \% folds @f$ instances. The zero fold then contains + * @f$ N / folds + N \% folds @f$ instances. These remaining @f$ N \% folds @f$ + * instances are then distributed over the first @f$ N \% folds @f$ folds. + * + * @param[in] N number of instances + * @param[in] folds number of folds + * @param[in,out] cv_idx array of size N which contains the fold index + * for each observation on exit + * + */ +void gensvm_make_cv_split(long N, long folds, long *cv_idx) +{ + long i, j, idx; + + for (i=0; in; + long m = full_data->m; + long K = full_data->K; + + double value; + + test_n = 0; + for (i=0; in = test_n; + train_data->n = train_n; + + train_data->K = K; + test_data->K = K; + + train_data->m = m; + test_data->m = m; + + train_data->y = Calloc(long, train_n); + test_data->y = Calloc(long, test_n); + + train_data->RAW = Calloc(double, train_n*(m+1)); + test_data->RAW = Calloc(double, test_n*(m+1)); + + k = 0; + l = 0; + for (i=0; iy[k] = full_data->y[i]; + for (j=0; jRAW, m+1, i, j); + matrix_set(test_data->RAW, m+1, k, j, value); + } + k++; + } else { + train_data->y[l] = full_data->y[i]; + for (j=0; jRAW, m+1, i, j); + matrix_set(train_data->RAW, m+1, l, j, value); + } + l++; + } + } + + train_data->Z = train_data->RAW; + test_data->Z = test_data->RAW; +} diff --git a/src/gensvm_debug.c b/src/gensvm_debug.c new file mode 100644 index 0000000..d94711a --- /dev/null +++ b/src/gensvm_debug.c @@ -0,0 +1,34 @@ +/** + * @file gensvm_debug.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions facilitating debugging + * + * @details + * Defines functions useful for debugging matrices. + * + */ + +#include "gensvm_debug.h" + +/** + * @brief print a matrix + * + * @details + * Debug function to print a matrix + * + * @param[in] M matrix + * @param[in] rows number of rows of M + * @param[in] cols number of columns of M + */ +void print_matrix(double *M, long rows, long cols) +{ + long i, j; + + for (i=0; itraintype = CV; + grid->kerneltype = K_LINEAR; + grid->repeats = 0; + grid->folds = 10; + grid->Np = 0; + grid->Nl = 0; + grid->Nk = 0; + grid->Ne = 0; + grid->Nw = 0; + grid->Ng = 0; + grid->Nc = 0; + grid->Nd = 0; + + // set arrays to NULL + grid->weight_idxs = NULL; + grid->ps = NULL; + grid->lambdas = NULL; + grid->kappas = NULL; + grid->epsilons = NULL; + grid->gammas = NULL; + grid->coefs = NULL; + grid->degrees = NULL; + grid->train_data_file = NULL; + grid->test_data_file = NULL; + + return grid; +} + +void gensvm_free_grid(struct GenGrid *grid) +{ + free(grid->weight_idxs); + free(grid->ps); + free(grid->lambdas); + free(grid->kappas); + free(grid->epsilons); + free(grid->gammas); + free(grid->coefs); + free(grid->degrees); + free(grid->train_data_file); + free(grid->test_data_file); + free(grid); +} diff --git a/src/gensvm_gridsearch.c b/src/gensvm_gridsearch.c new file mode 100644 index 0000000..deee033 --- /dev/null +++ b/src/gensvm_gridsearch.c @@ -0,0 +1,652 @@ +/** + * @file gensvm_train_dataset.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Functions for finding the optimal parameters for the dataset + * + * @details + * The GenSVM algorithm takes a number of parameters. The functions in + * this file are used to find the optimal parameters. + */ + +#include "gensvm_gridsearch.h" + +extern FILE *GENSVM_OUTPUT_FILE; + +/** + * @brief Initialize a GenQueue from a Training instance + * + * @details + * A Training instance describes the grid to search over. This funtion + * creates all tasks that need to be performed and adds these to + * a GenQueue. Each task contains a pointer to the train and test datasets + * which are supplied. Note that the tasks are created in a specific order of + * the parameters, to ensure that the GenModel::V of a previous parameter + * set provides the best possible initial estimate of GenModel::V for the next + * parameter set. + * + * @param[in] grid Training struct describing the grid search + * @param[in] queue pointer to a GenQueue that will be used to + * add the tasks to + * @param[in] train_data GenData of the training set + * @param[in] test_data GenData of the test set + * + */ +void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue, + struct GenData *train_data, struct GenData *test_data) +{ + long i, j, k; + long N, cnt = 0; + struct GenTask *task; + queue->i = 0; + + N = grid->Np; + N *= grid->Nl; + N *= grid->Nk; + N *= grid->Ne; + N *= grid->Nw; + // these parameters are not necessarily non-zero + N *= grid->Ng > 0 ? grid->Ng : 1; + N *= grid->Nc > 0 ? grid->Nc : 1; + N *= grid->Nd > 0 ? grid->Nd : 1; + + queue->tasks = Calloc(struct GenTask *, N); + queue->N = N; + + // initialize all tasks + for (i=0; iID = i; + task->train_data = train_data; + task->test_data = test_data; + task->folds = grid->folds; + task->kerneltype = grid->kerneltype; + task->kernelparam = Calloc(double, grid->Ng + + grid->Nc + grid->Nd); + queue->tasks[i] = task; + } + + // These loops mimick a large nested for loop. The advantage is that + // Nd, Nc and Ng which are on the outside of the nested for loop can + // now be zero, without large modification (see below). Whether this + // is indeed better than the nested for loop has not been tested. + cnt = 1; + i = 0; + while (i < N ) + for (j=0; jNp; j++) + for (k=0; ktasks[i]->p = grid->ps[j]; + i++; + } + + cnt *= grid->Np; + i = 0; + while (i < N ) + for (j=0; jNl; j++) + for (k=0; ktasks[i]->lambda = + grid->lambdas[j]; + i++; + } + + cnt *= grid->Nl; + i = 0; + while (i < N ) + for (j=0; jNk; j++) + for (k=0; ktasks[i]->kappa = grid->kappas[j]; + i++; + } + + cnt *= grid->Nk; + i = 0; + while (i < N ) + for (j=0; jNw; j++) + for (k=0; ktasks[i]->weight_idx = + grid->weight_idxs[j]; + i++; + } + + cnt *= grid->Nw; + i = 0; + while (i < N ) + for (j=0; jNe; j++) + for (k=0; ktasks[i]->epsilon = + grid->epsilons[j]; + i++; + } + + cnt *= grid->Ne; + i = 0; + while (i < N && grid->Ng > 0) + for (j=0; jNg; j++) + for (k=0; ktasks[i]->kernelparam[0] = + grid->gammas[j]; + i++; + } + + cnt *= grid->Ng > 0 ? grid->Ng : 1; + i = 0; + while (i < N && grid->Nc > 0) + for (j=0; jNc; j++) + for (k=0; ktasks[i]->kernelparam[1] = + grid->coefs[j]; + i++; + } + + cnt *= grid->Nc > 0 ? grid->Nc : 1; + i = 0; + while (i < N && grid->Nd > 0) + for (j=0; jNd; j++) + for (k=0; ktasks[i]->kernelparam[2] = + grid->degrees[j]; + i++; + } +} + +/** + * @brief Comparison function for doubl + * + * @param[in] elem1 number 1 + * @param[in] elem2 number 2 + * @returns comparison of number 1 larger than number 2 + */ +int doublesort(const void *elem1, const void *elem2) +{ + const double t1 = (*(double *) elem1); + const double t2 = (*(double *) elem2); + return t1 > t2; +} + +/** + * @brief Calculate the percentile of an array of doubles + * + * @details + * The percentile of performance is used to find the top performing + * configurations. Since no standard definition of the percentile exists, we + * use the method used in MATLAB and Octave. Since calculating the percentile + * requires a sorted list of the values, a local copy is made first. + * + * @param[in] values array of doubles + * @param[in] N length of the array + * @param[in] p percentile to calculate ( 0 <= p <= 100.0 ). + * @returns the p-th percentile of the values + */ +double prctile(double *values, long N, double p) +{ + if (N == 1) + return values[0]; + + long i; + double pi, pr, boundary; + double *local = Malloc(double, N); + for (i=0; iN); + for (i=0; iN; i++) { + perf[i] = q->tasks[i]->performance; + } + boundary = prctile(perf, q->N, 95.0); + free(perf); + note("boundary determined at: %f\n", boundary); + + // find the number of tasks that perform at or above the boundary + for (i=0; iN; i++) { + if (q->tasks[i]->performance >= boundary) + N++; + } + + // create a new queue with the best tasks + nq->tasks = Malloc(struct GenTask *, N); + k = 0; + for (i=0; iN; i++) { + if (q->tasks[i]->performance >= boundary) + nq->tasks[k++] = q->tasks[i]; + } + nq->N = N; + nq->i = 0; + + return nq; +} + + +/** + * @brief Run repeats of the GenTask structs in GenQueue to find the best + * configuration + * + * @details + * The best performing tasks in the supplied GenQueue are found by taking those + * GenTask structs that have a performance greater or equal to the 95% percentile + * of the performance of all tasks. These tasks are then gathered in a new + * GenQueue. For each of the tasks in this new GenQueue the cross validation run is + * repeated a number of times. + * + * For each of the GenTask configurations that are repeated the mean performance, + * standard deviation of the performance and the mean computation time are + * reported. + * + * Finally, the overall best tasks are written to the specified output. These + * tasks are selected to have both the highest mean performance, as well as the + * smallest standard deviation in their performance. This is done as follows. + * First the 99th percentile of task performance and the 1st percentile of + * standard deviation is calculated. If a task exists for which the mean + * performance of the repeats and the standard deviation equals these values + * respectively, this task is found to be the best and is written to the + * output. If no such task exists, the 98th percentile of performance and the + * 2nd percentile of standard deviation is considered. This is repeated until + * an interval is found which contains tasks. If one or more tasks are found, + * this loop stops. + * + * @param[in] q GenQueue of GenTask structs which have already been + * run and have a GenTask::performance value + * @param[in] repeats Number of times to repeat the best + * configurations for consistency + * @param[in] traintype type of training to do (CV or TT) + * + */ +void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype) +{ + long i, f, r, N, *cv_idx; + double p, pi, pr, pt, *time, *std, *mean, *perf; + struct GenQueue *nq; + struct GenData **train_folds, **test_folds; + struct GenModel *model = gensvm_init_model(); + struct GenTask *task = NULL; + clock_t loop_s, loop_e; + + nq = create_top_queue(q); + N = nq->N; + + note("Number of items: %li\n", nq->N); + std = Calloc(double, N); + mean = Calloc(double, N); + time = Calloc(double, N); + perf = Calloc(double, N*repeats); + + task = get_next_task(nq); + + model->n = 0; + model->m = task->train_data->m; + model->K = task->train_data->K; + gensvm_allocate_model(model); + gensvm_init_V(NULL, model, task->train_data); + + cv_idx = Calloc(long, task->train_data->n); + + train_folds = Malloc(struct GenData *, task->folds); + test_folds = Malloc(struct GenData *, task->folds); + + i = 0; + while (task) { + make_model_from_task(task, model); + + time[i] = 0.0; + note("(%02li/%02li:%03li)\t", i+1, N, task->ID); + for (r=0; rtrain_data->n); + gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx); + train_folds = Malloc(struct GenData *, task->folds); + test_folds = Malloc(struct GenData *, task->folds); + for (f=0; ffolds; f++) { + train_folds[f] = gensvm_init_data(); + test_folds[f] = gensvm_init_data(); + gensvm_get_tt_split(task->train_data, train_folds[f], + test_folds[f], cv_idx, f); + gensvm_kernel_preprocess(model, train_folds[f]); + gensvm_kernel_postprocess(model, train_folds[f], + test_folds[f]); + } + + loop_s = clock(); + p = gensvm_cross_validation(model, train_folds, test_folds, + task->folds, task->train_data->n); + loop_e = clock(); + time[i] += gensvm_elapsed_time(loop_s, loop_e); + matrix_set(perf, repeats, i, r, p); + mean[i] += p/((double) repeats); + note("%3.3f\t", p); + // this is done because if we reuse the V it's not a + // consistency check + gensvm_init_V(NULL, model, task->train_data); + for (f=0; ffolds; f++) { + gensvm_free_data(train_folds[f]); + gensvm_free_data(test_folds[f]); + } + free(train_folds); + free(test_folds); + } + for (r=0; r 1) { + std[i] /= ((double) repeats) - 1.0; + std[i] = sqrt(std[i]); + } else { + std[i] = 0.0; + } + note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], + time[i]); + task = get_next_task(nq); + i++; + } + + // find the best overall configurations: those with high average + // performance and low deviation in the performance + note("\nBest overall configuration(s):\n"); + note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t" + "mean_perf\tstd_perf\ttime_perf\n"); + p = 0.0; + bool breakout = false; + while (breakout == false) { + pi = prctile(mean, N, (100.0-p)); + pr = prctile(std, N, p); + pt = prctile(time, N, p); + for (i=0; itasks[i]->ID, + nq->tasks[i]->weight_idx, + nq->tasks[i]->epsilon, + nq->tasks[i]->p, + nq->tasks[i]->kappa, + nq->tasks[i]->lambda, + mean[i], + std[i], + time[i]); + breakout = true; + } + p += 1.0; + } + + free(cv_idx); + // make sure no double free occurs with the copied kernelparam + model->kernelparam = NULL; + gensvm_free_model(model); + free(nq->tasks); + free(nq); + free(perf); + free(std); + free(mean); + free(time); +} + +/** + * @brief Check if the kernel parameters change between tasks + * + * @details + * In the current strategy for training the kernel matrix is decomposed once, + * and tasks with the same kernel settings are performed sequentially. When a + * task needs to be done with different kernel parameters, the kernel matrix + * needs to be recalculated. This function is used to check whether this is + * the case. + * + * @param[in] newtask the next task + * @param[in] oldtask the old task + * @return whether the kernel needs to be reevaluated + */ +bool kernel_changed(struct GenTask *newtask, struct GenTask *oldtask) +{ + int i; + if (oldtask == NULL) + return true; + if (newtask->kerneltype != oldtask->kerneltype) { + return true; + } else if (newtask->kerneltype == K_POLY) { + for (i=0; i<3; i++) + if (newtask->kernelparam[i] != oldtask->kernelparam[i]) + return true; + return false; + } else if (newtask->kerneltype == K_RBF) { + if (newtask->kernelparam[0] != oldtask->kernelparam[0]) + return true; + return false; + } else if (newtask->kerneltype == K_SIGMOID) { + for (i=0; i<2; i++) + if (newtask->kernelparam[i] != oldtask->kernelparam[i]) + return true; + return false; + } + return false; +} + +/** + * @brief Run the grid search for a GenQueue + * + * @details + * Given a GenQueue of GenTask struct to be trained, a grid search is launched to + * find the optimal parameter configuration. As is also done within + * cross_validation(), the optimal weights of one parameter set are used as + * initial estimates for GenModel::V in the next parameter set. Note that to + * optimally exploit this feature of the optimization algorithm, the order in + * which tasks are considered is important. This is considered in + * make_queue(). + * + * The performance found by cross validation is stored in the GenTask struct. + * + * @param[in,out] q GenQueue with GenTask instances to run + */ + +void start_training(struct GenQueue *q) +{ + int f, folds; + double perf, current_max = 0; + struct GenTask *task = get_next_task(q); + struct GenTask *prevtask = NULL; + struct GenModel *model = gensvm_init_model(); + clock_t main_s, main_e, loop_s, loop_e; + + // in principle this can change between tasks, but this shouldn't be + // the case TODO + folds = task->folds; + + model->n = 0; + model->m = task->train_data->m; + model->K = task->train_data->K; + gensvm_allocate_model(model); + gensvm_init_V(NULL, model, task->train_data); + + long *cv_idx = Calloc(long, task->train_data->n); + gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx); + + struct GenData **train_folds = Malloc(struct GenData *, task->folds); + struct GenData **test_folds = Malloc(struct GenData *, task->folds); + for (f=0; ftrain_data, train_folds[f], + test_folds[f], cv_idx, f); + } + + main_s = clock(); + while (task) { + make_model_from_task(task, model); + if (kernel_changed(task, prevtask)) { + note("Computing kernel"); + for (f=0; fZ != train_folds[f]->RAW) + free(train_folds[f]->Z); + if (test_folds[f]->Z != test_folds[f]->RAW) + free(test_folds[f]->Z); + gensvm_kernel_preprocess(model, + train_folds[f]); + gensvm_kernel_postprocess(model, + train_folds[f], test_folds[f]); + } + note(".\n"); + } + print_progress_string(task, q->N); + + loop_s = clock(); + perf = gensvm_cross_validation(model, train_folds, test_folds, + folds, task->train_data->n); + loop_e = clock(); + current_max = maximum(current_max, perf); + + note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, + gensvm_elapsed_time(loop_s, loop_e), + current_max); + + q->tasks[task->ID]->performance = perf; + prevtask = task; + task = get_next_task(q); + } + main_e = clock(); + + note("\nTotal elapsed training time: %8.8f seconds\n", + gensvm_elapsed_time(main_s, main_e)); + + // make sure no double free occurs with the copied kernelparam + model->kernelparam = NULL; + gensvm_free_model(model); + for (f=0; fn, + train_folds[f]->r); + + // initialize object weights + gensvm_initialize_weights(train_folds[f], model); + + // train the model (surpressing output) + fid = GENSVM_OUTPUT_FILE; + GENSVM_OUTPUT_FILE = NULL; + gensvm_optimize(model, train_folds[f]); + GENSVM_OUTPUT_FILE = fid; + + // calculate prediction performance on test set + predy = Calloc(long, test_folds[f]->n); + gensvm_predict_labels(test_folds[f], model, predy); + performance = gensvm_prediction_perf(test_folds[f], predy); + total_perf += performance * test_folds[f]->n; + + free(predy); + } + + total_perf /= ((double) n_total); + + return total_perf; +} + + +/** + * @brief Copy parameters from GenTask to GenModel + * + * @details + * A GenTask struct only contains the parameters of the GenModel to be estimated. + * This function is used to copy these parameters. + * + * @param[in] task GenTask instance with parameters + * @param[in,out] model GenModel to which the parameters are copied + */ +void make_model_from_task(struct GenTask *task, struct GenModel *model) +{ + // copy basic model parameters + model->weight_idx = task->weight_idx; + model->epsilon = task->epsilon; + model->p = task->p; + model->kappa = task->kappa; + model->lambda = task->lambda; + + // copy kernel parameters + model->kerneltype = task->kerneltype; + model->kernelparam = task->kernelparam; +} + +/** + * @brief Print the description of the current task on screen + * + * @details + * To track the progress of the grid search the parameters of the current task + * are written to the output specified in GENSVM_OUTPUT_FILE. Since the + * parameters differ with the specified kernel, this function writes a + * parameter string depending on which kernel is used. + * + * @param[in] task the GenTask specified + * @param[in] N total number of tasks + * + */ +void print_progress_string(struct GenTask *task, long N) +{ + char buffer[MAX_LINE_LENGTH]; + sprintf(buffer, "(%03li/%03li)\t", task->ID+1, N); + if (task->kerneltype == K_POLY) + sprintf(buffer + strlen(buffer), "d = %2.2f\t", + task->kernelparam[2]); + if (task->kerneltype == K_POLY || task->kerneltype == K_SIGMOID) + sprintf(buffer + strlen(buffer), "c = %2.2f\t", + task->kernelparam[1]); + if (task->kerneltype == K_POLY || task->kerneltype == K_SIGMOID || + task->kerneltype == K_RBF) + sprintf(buffer + strlen(buffer), "g = %3.3f\t", + task->kernelparam[0]); + sprintf(buffer + strlen(buffer), "eps = %g\tw = %i\tk = %2.2f\t" + "l = %f\tp = %2.2f\t", task->epsilon, + task->weight_idx, task->kappa, task->lambda, task->p); + note(buffer); +} diff --git a/src/gensvm_init.c b/src/gensvm_init.c index 947c60a..28f34ba 100644 --- a/src/gensvm_init.c +++ b/src/gensvm_init.c @@ -13,201 +13,100 @@ * */ -#include - #include "gensvm_init.h" -/** - * @brief Initialize a GenModel structure - * - * @details - * A GenModel structure is initialized and the default value for the - * parameters are set. A pointer to the initialized model is returned. - * - * @returns initialized GenModel - */ -struct GenModel *gensvm_init_model() -{ - struct GenModel *model = Malloc(struct GenModel, 1); - - // set default values - model->p = 1.0; - model->lambda = pow(2, -8.0); - model->epsilon = 1e-6; - model->kappa = 0.0; - model->weight_idx = 1; - model->kerneltype = K_LINEAR; - model->kernelparam = NULL; - - model->W = NULL; - model->t = NULL; - model->V = NULL; - model->Vbar = NULL; - model->U = NULL; - model->UU = NULL; - model->Q = NULL; - model->H = NULL; - model->R = NULL; - model->rho = NULL; - model->data_file = NULL; - - return model; -} +inline double rnd() { return (double) rand()/0x7FFFFFFF; } /** - * @brief Initialize a GenData structure + * @brief seed the matrix V from an existing model or using rand * * @details - * A GenData structure is initialized and default values are set. - * A pointer to the initialized data is returned. - * - * @returns initialized GenData - * + * The matrix V must be seeded before the main_loop() can start. + * This can be done by either seeding it with random numbers or + * using the solution from a previous model on the same dataset + * as initial seed. The latter option usually allows for a + * significant improvement in the number of iterations necessary + * because the seeded model V is closer to the optimal V. + * + * @param[in] from_model GenModel from which to copy V + * @param[in,out] to_model GenModel to which V will be copied */ -struct GenData *gensvm_init_data() +void gensvm_init_V(struct GenModel *from_model, + struct GenModel *to_model, struct GenData *data) { - struct GenData *data = Malloc(struct GenData, 1); - data->Sigma = NULL; - data->y = NULL; - data->Z = NULL; - data->RAW = NULL; - - // set default values - data->kerneltype = K_LINEAR; - data->kernelparam = NULL; - - return data; + long i, j, k; + double cmin, cmax, value; + + long n = data->n; + long m = data->m; + long K = data->K; + + if (from_model == NULL) { + for (i=0; iZ, m+1, k, i); + cmin = minimum(cmin, value); + cmax = maximum(cmax, value); + } + for (j=0; jV, K-1, i, j, value); + } + } + } else { + for (i=0; iV, K-1, i, j); + matrix_set(to_model->V, K-1, i, j, value); + } + } } /** - * @brief Allocate memory for a GenModel + * @brief Initialize instance weights * * @details - * This function can be used to allocate the memory needed for a GenModel. All - * arrays in the model are specified and initialized to 0. - * - * @param[in] model GenModel to allocate - * + * Instance weights can for instance be used to add additional weights to + * instances of certain classes. Two default weight possibilities are + * implemented here. The first is unit weights, where each instance gets + * weight 1. + * + * The second are group size correction weights, which are calculated as + * @f[ + * \rho_i = \frac{n}{Kn_k} , + * @f] + * where @f$ n_k @f$ is the number of instances in group @f$ k @f$ and + * @f$ y_i = k @f$. + * + * @param[in] data GenData with the dataset + * @param[in,out] model GenModel with the weight specification. On + * exit GenModel::rho contains the instance + * weights. */ -void gensvm_allocate_model(struct GenModel *model) +void gensvm_initialize_weights(struct GenData *data, struct GenModel *model) { - long n = model->n; - long m = model->m; - long K = model->K; - - model->W = Calloc(double, m*(K-1)); - model->t = Calloc(double, K-1); - model->V = Calloc(double, (m+1)*(K-1)); - model->Vbar = Calloc(double, (m+1)*(K-1)); - model->U = Calloc(double, K*(K-1)); - model->UU = Calloc(double, n*K*(K-1)); - model->Q = Calloc(double, n*K); - model->H = Calloc(double, n*K); - model->R = Calloc(double, n*K); - model->rho = Calloc(double, n); -} + long *groups; + long i; -/** - * @brief Reallocate memory for GenModel - * - * @details - * This function can be used to reallocate existing memory for a GenModel, - * upon a change in the model dimensions. This is used in combination with - * kernels. - * - * @param[in] model GenModel to reallocate - * @param[in] n new value of GenModel->n - * @param[in] m new value of GenModel->m - * - */ -void gensvm_reallocate_model(struct GenModel *model, long n, long m) -{ + long n = model->n; long K = model->K; - if (model->n == n && model->m == m) - return; - if (model->n != n) { - model->UU = Realloc(model->UU, double, n*K*(K-1)); - Memset(model->UU, double, n*K*(K-1)); - - model->Q = Realloc(model->Q, double, n*K); - Memset(model->Q, double, n*K); - - model->H = Realloc(model->H, double, n*K); - Memset(model->H, double, n*K); - - model->R = Realloc(model->R, double, n*K); - Memset(model->R, double, n*K); - - model->rho = Realloc(model->rho, double, n); - Memset(model->rho, double, n); - - model->n = n; + if (model->weight_idx == 1) { + for (i=0; irho[i] = 1.0; } - if (model->m != m) { - model->W = Realloc(model->W, double, m*(K-1)); - Memset(model->W, double, m*(K-1)); - - model->V = Realloc(model->V, double, (m+1)*(K-1)); - Memset(model->V, double, (m+1)*(K-1)); - - model->Vbar = Realloc(model->Vbar, double, (m+1)*(K-1)); - Memset(model->Vbar, double, (m+1)*(K-1)); - - model->m = m; - } -} - -/** - * @brief Free allocated GenModel struct - * - * @details - * Simply free a previously allocated GenModel by freeing all its component - * arrays. Note that the model struct itself is also freed here. - * - * @param[in] model GenModel to free - * - */ -void gensvm_free_model(struct GenModel *model) -{ - free(model->W); - free(model->t); - free(model->V); - free(model->Vbar); - free(model->U); - free(model->UU); - free(model->Q); - free(model->H); - free(model->rho); - free(model->R); - free(model->kernelparam); - - free(model); -} - -/** - * @brief Free allocated GenData struct - * - * @details - * Simply free a previously allocated GenData struct by freeing all its - * components. Note that the data struct itself is also freed here. - * - * @param[in] data GenData struct to free - * - */ -void gensvm_free_data(struct GenData *data) -{ - if (data == NULL) - return; - - if (data->Z == data->RAW) { - free(data->Z); + else if (model->weight_idx == 2) { + groups = Calloc(long, K); + for (i=0; iy[i]-1]++; + for (i=0; irho[i] = ((double) n)/((double) (groups[data->y[i]-1]*K)); } else { - free(data->Z); - free(data->RAW); + fprintf(stderr, "Unknown weight specification.\n"); + exit(1); } - free(data->kernelparam); - free(data->y); - free(data->Sigma); - free(data); } diff --git a/src/gensvm_io.c b/src/gensvm_io.c index c4798d8..696f46f 100644 --- a/src/gensvm_io.c +++ b/src/gensvm_io.c @@ -6,16 +6,12 @@ * * @details * This file contains functions for reading and writing model files, and data - * files. - * + * files. It also contains a function for generating a string of the current + * time, used in writing output files. +* */ -#include "globals.h" -#include "gensvm.h" #include "gensvm_io.h" -#include "gensvm_matrix.h" -#include "gensvm_strutil.h" -#include "gensvm_timer.h" /** * @brief Read data from file @@ -226,7 +222,7 @@ void gensvm_write_model(struct GenModel *model, char *output_filename) output_filename); exit(1); } - get_time_string(timestr); + gensvm_time_string(timestr); // Write output to file fprintf(fid, "Output file for GenSVM (version %1.1f)\n", VERSION); @@ -298,3 +294,49 @@ void gensvm_write_predictions(struct GenData *data, long *predy, fclose(fid); } + +/** + * @brief Get time string with UTC offset + * + * @details + * Create a string for the current system time. Include an offset of UTC for + * consistency. The format of the generated string is "DDD MMM D HH:MM:SS + * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)". + * + * @param[in,out] buffer allocated string buffer, on exit contains + * formatted string + * + */ +void gensvm_time_string(char *buffer) +{ + int diff, hours, minutes; + char timestr[MAX_LINE_LENGTH]; + time_t current_time, lt, gt; + struct tm *lclt; + + // get current time (in epoch) + current_time = time(NULL); + if (current_time == ((time_t)-1)) { + fprintf(stderr, "Failed to compute the current time.\n"); + return; + } + + // convert time to local time and create a string + lclt = localtime(¤t_time); + strftime(timestr, MAX_LINE_LENGTH, "%c", lclt); + if (timestr == NULL) { + fprintf(stderr, "Failed to convert time to string.\n"); + return; + } + + // calculate the UTC offset including DST + lt = mktime(localtime(¤t_time)); + gt = mktime(gmtime(¤t_time)); + diff = -difftime(gt, lt); + hours = (diff/3600); + minutes = (diff%3600)/60; + if (lclt->tm_isdst == 1) + hours++; + + sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes); +} diff --git a/src/gensvm_kernel.c b/src/gensvm_kernel.c index 1d7e5e4..a4f2277 100644 --- a/src/gensvm_kernel.c +++ b/src/gensvm_kernel.c @@ -11,15 +11,7 @@ * */ -#include -#include - -#include "globals.h" -#include "gensvm.h" #include "gensvm_kernel.h" -#include "gensvm_lapack.h" -#include "gensvm_matrix.h" -#include "gensvm_util.h" /** * @brief Do the preprocessing steps needed to perform kernel GenSVM @@ -451,3 +443,46 @@ double gensvm_dot_sigmoid(double *x1, double *x2, double *kernelparam, long n) value += kernelparam[1]; return tanh(value); } + +/** + * @brief Compute the eigenvalues and optionally the eigenvectors of a + * symmetric matrix. + * + * @details + * This is a wrapper function around the external LAPACK function. + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d2/d97/dsyevx_8f.html + * + * + */ +int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, + double VL, double VU, int IL, int IU, double ABSTOL, int *M, + double *W, double *Z, int LDZ, double *WORK, int LWORK, + int *IWORK, int *IFAIL) +{ + extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *Np, + double *A, int *LDAp, double *VLp, double *VUp, + int *ILp, int *IUp, double *ABSTOLp, int *M, + double *W, double *Z, int *LDZp, double *WORK, + int *LWORKp, int *IWORK, int *IFAIL, int *INFOp); + int INFO; + dsyevx_(&JOBZ, &RANGE, &UPLO, &N, A, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, + M, W, Z, &LDZ, WORK, &LWORK, IWORK, IFAIL, &INFO); + return INFO; +} + +/** + * @brief Determine double precision machine parameters. + * + * @details + * This is a wrapper function around the external LAPACK function. + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d5/dd4/dlamch_8f.html + */ +double dlamch(char CMACH) +{ + extern double dlamch_(char *CMACH); + return dlamch_(&CMACH); +} diff --git a/src/gensvm_lapack.c b/src/gensvm_lapack.c index 56dfc20..2a9c120 100644 --- a/src/gensvm_lapack.c +++ b/src/gensvm_lapack.c @@ -11,125 +11,3 @@ #include "gensvm_lapack.h" -/** - * @brief Solve AX = B where A is symmetric positive definite. - * - * @details - * Solve a linear system of equations AX = B where A is symmetric positive - * definite. This function uses the externel LAPACK routine dposv. - * - * @param[in] UPLO which triangle of A is stored - * @param[in] N order of A - * @param[in] NRHS number of columns of B - * @param[in,out] A double precision array of size (LDA, N). On - * exit contains the upper or lower factor of the - * Cholesky factorization of A. - * @param[in] LDA leading dimension of A - * @param[in,out] B double precision array of size (LDB, NRHS). On - * exit contains the N-by-NRHS solution matrix X. - * @param[in] LDB the leading dimension of B - * @returns info parameter which contains the status of the - * computation: - * - =0: success - * - <0: if -i, the i-th argument had - * an illegal value - * - >0: if i, the leading minor of A - * was not positive definite - * - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/dc/de9/group__double_p_osolve.html - */ -int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, - int LDB) -{ - extern void dposv_(char *UPLO, int *Np, int *NRHSp, double *A, - int *LDAp, double *B, int *LDBp, int *INFOp); - int INFO; - dposv_(&UPLO, &N, &NRHS, A, &LDA, B, &LDB, &INFO); - return INFO; -} - -/** - * @brief Solve a system of equations AX = B where A is symmetric. - * - * @details - * Solve a linear system of equations AX = B where A is symmetric. This - * function uses the external LAPACK routine dsysv. - * - * @param[in] UPLO which triangle of A is stored - * @param[in] N order of A - * @param[in] NRHS number of columns of B - * @param[in,out] A double precision array of size (LDA, N). On - * exit contains the block diagonal matrix D and - * the multipliers used to obtain the factor U or - * L from the factorization A = U*D*U**T or - * A = L*D*L**T. - * @param[in] LDA leading dimension of A - * @param[in] IPIV integer array containing the details of D - * @param[in,out] B double precision array of size (LDB, NRHS). On - * exit contains the N-by-NRHS matrix X - * @param[in] LDB leading dimension of B - * @param[out] WORK double precision array of size max(1,LWORK). On - * exit, WORK(1) contains the optimal LWORK - * @param[in] LWORK the length of WORK, can be used for determining - * the optimal blocksize for dsystrf. - * @returns info parameter which contains the status of the - * computation: - * - =0: success - * - <0: if -i, the i-th argument had an - * illegal value - * - >0: if i, D(i, i) is exactly zero, - * no solution can be computed. - * - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/d6/d0e/group__double_s_ysolve.html - */ -int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, - double *B, int LDB, double *WORK, int LWORK) -{ - extern void dsysv_(char *UPLO, int *Np, int *NRHSp, double *A, - int *LDAp, int *IPIV, double *B, int *LDBp, - double *WORK, int *LWORK, int *INFOp); - int INFO; - dsysv_(&UPLO, &N, &NRHS, A, &LDA, IPIV, B, &LDB, WORK, &LWORK, &INFO); - return INFO; -} - -/** - * @brief Compute the eigenvalues and optionally the eigenvectors of a - * symmetric matrix. - * - * @details - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/d2/d97/dsyevx_8f.html - * - * - */ -int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, - double VL, double VU, int IL, int IU, double ABSTOL, int *M, - double *W, double *Z, int LDZ, double *WORK, int LWORK, - int *IWORK, int *IFAIL) -{ - extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *Np, - double *A, int *LDAp, double *VLp, double *VUp, - int *ILp, int *IUp, double *ABSTOLp, int *M, - double *W, double *Z, int *LDZp, double *WORK, - int *LWORKp, int *IWORK, int *IFAIL, int *INFOp); - int INFO; - dsyevx_(&JOBZ, &RANGE, &UPLO, &N, A, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, - M, W, Z, &LDZ, WORK, &LWORK, IWORK, IFAIL, &INFO); - return INFO; -} - -/** - * @brief Determine double precision machine parameters. - * - * @details - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/d5/dd4/dlamch_8f.html - */ -double dlamch(char CMACH) -{ - extern double dlamch_(char *CMACH); - return dlamch_(&CMACH); -} diff --git a/src/gensvm_matrix.c b/src/gensvm_matrix.c deleted file mode 100644 index c2e5986..0000000 --- a/src/gensvm_matrix.c +++ /dev/null @@ -1,38 +0,0 @@ -/** - * @file gensvm_matrix.c - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Functions facilitating matrix access - * - * @details - * The functions contained in this file are used when - * accessing or writing to matrices. Seperate functions - * exist of adding and multiplying existing matrix - * elements, to ensure this is done in place. - * - */ - -#include "gensvm_matrix.h" -#include "gensvm_util.h" - -/** - * @brief print a matrix - * - * @details - * Debug function to print a matrix - * - * @param[in] M matrix - * @param[in] rows number of rows of M - * @param[in] cols number of columns of M - */ -void print_matrix(double *M, long rows, long cols) -{ - long i, j; - - for (i=0; in; + long m = model->m; + long K = model->K; + + double *B = Calloc(double, n*(K-1)); + double *ZV = Calloc(double, n*(K-1)); + double *ZAZ = Calloc(double, (m+1)*(m+1)); + double *ZAZV = Calloc(double, (m+1)*(K-1)); + double *ZAZVT = Calloc(double, (m+1)*(K-1)); + + note("Starting main loop.\n"); + note("Dataset:\n"); + note("\tn = %i\n", n); + note("\tm = %i\n", m); + note("\tK = %i\n", K); + note("Parameters:\n"); + note("\tkappa = %f\n", model->kappa); + note("\tp = %f\n", model->p); + note("\tlambda = %15.16f\n", model->lambda); + note("\tepsilon = %g\n", model->epsilon); + note("\n"); + + gensvm_simplex(model->K, model->U); + gensvm_simplex_diff(model, data); + gensvm_category_matrix(model, data); + + L = gensvm_get_loss(model, data, ZV); + Lbar = L + 2.0*model->epsilon*L; + + while ((it < MAX_ITER) && (Lbar - L)/L > model->epsilon) + { + // ensure V contains newest V and Vbar contains V from + // previous + gensvm_get_update(model, data, B, ZAZ, ZAZV, ZAZVT); + if (it > 50) + gensvm_step_doubling(model); + + Lbar = L; + L = gensvm_get_loss(model, data, ZV); + + if (it%100 == 0) + note("iter = %li, L = %15.16f, Lbar = %15.16f, " + "reldiff = %15.16f\n", it, L, Lbar, (Lbar - L)/L); + it++; + } + if (L > Lbar) + fprintf(stderr, "[WARNING]: Negative step occurred in " + "majorization.\n"); + if (it >= MAX_ITER) + fprintf(stderr, "[WARNING]: maximum number of iterations " + "reached.\n"); + + note("Optimization finished, iter = %li, loss = %15.16f, " + "rel. diff. = %15.16f\n", it-1, L, + (Lbar - L)/L); + note("Number of support vectors: %li\n", gensvm_num_sv(model, data)); + + model->training_error = (Lbar - L)/L; + + for (i=0; it[i] = matrix_get(model->V, K-1, 0, i); + for (i=1; iV, K-1, i, j); + matrix_set(model->W, K-1, i-1, j, value); + } + } + free(B); + free(ZV); + free(ZAZ); + free(ZAZV); + free(ZAZVT); +} + +/** + * @brief Calculate the current value of the loss function + * + * @details + * The current loss function value is calculated based on the matrix V in the + * given model. Note that the matrix ZV is passed explicitly to avoid having + * to reallocate memory at every step. + * + * @param[in] model GenModel structure which holds the current + * estimate V + * @param[in] data GenData structure + * @param[in,out] ZV pre-allocated matrix ZV which is updated on + * output + * @returns the current value of the loss function + */ +double gensvm_get_loss(struct GenModel *model, struct GenData *data, + double *ZV) +{ + long i, j; + long n = model->n; + long K = model->K; + long m = model->m; + + double value, rowvalue, loss = 0.0; + + gensvm_calculate_errors(model, data, ZV); + gensvm_calculate_huber(model); + + for (i=0; iH, K, i, j); + value = pow(value, model->p); + value *= matrix_get(model->R, K, i, j); + rowvalue += value; + } + rowvalue = pow(rowvalue, 1.0/(model->p)); + rowvalue *= model->rho[i]; + loss += rowvalue; + } + loss /= ((double) n); + + value = 0; + for (i=1; iV, K-1, i, j), 2.0); + } + } + loss += model->lambda * value; + + return loss; +} + +/** + * @brief Perform a single step of the majorization algorithm to update V + * + * @details + * This function contains the main update calculations of the algorithm. These + * calculations are necessary to find a new update V. The calculations exist of + * recalculating the majorization coefficients for all instances and all + * classes, and solving a linear system to find V. + * + * Because the function gensvm_get_update() is always called after a call to + * gensvm_get_loss() with the same GenModel::V, it is unnecessary to calculate + * the updated errors GenModel::Q and GenModel::H here too. This saves on + * computation time. + * + * In calculating the majorization coefficients we calculate the elements of a + * diagonal matrix A with elements + * @f[ + * A_{i, i} = \frac{1}{n} \rho_i \sum_{j \neq k} \left[ + * \varepsilon_i a_{ijk}^{(p)} + (1 - \varepsilon_i) \omega_i + * a_{ijk}^{(p)} \right], + * @f] + * where @f$ k = y_i @f$. + * Since this matrix is only used to calculate the matrix @f$ Z' A Z @f$, it is + * efficient to update a matrix ZAZ through consecutive rank 1 updates with + * a single element of A and the corresponding row of Z. The BLAS function + * dsyr is used for this. + * + * The B matrix is has rows + * @f[ + * \boldsymbol{\beta}_i' = \frac{1}{n} \rho_i \sum_{j \neq k} \left[ + * \varepsilon_i \left( b_{ijk}^{(1)} - a_{ijk}^{(1)} + * \overline{q}_i^{(kj)} \right) + (1 - \varepsilon_i) + * \omega_i \left( b_{ijk}^{(p)} - a_{ijk}^{(p)} + * \overline{q}_i^{(kj)} \right) \right] + * \boldsymbol{\delta}_{kj}' + * @f] + * This is also split into two cases, one for which @f$ \varepsilon_i = 1 @f$, + * and one for when it is 0. The 3D simplex difference matrix is used here, in + * the form of the @f$ \boldsymbol{\delta}_{kj}' @f$. + * + * Finally, the following system is solved + * @f[ + * (\textbf{Z}'\textbf{AZ} + \lambda \textbf{J})\textbf{V} = + * (\textbf{Z}'\textbf{AZ}\overline{\textbf{V}} + \textbf{Z}' + * \textbf{B}) + * @f] + * solving this system is done through dposv(). + * + * @todo + * Consider allocating IPIV and WORK at a higher level, they probably don't + * change much during the iterations. + * + * @param [in,out] model model to be updated + * @param [in] data data used in model + * @param [in] B pre-allocated matrix used for linear coefficients + * @param [in] ZAZ pre-allocated matrix used in system + * @param [in] ZAZV pre-allocated matrix used in system solving + * @param [in] ZAZVT pre-allocated matrix used in system solving + */ +void gensvm_get_update(struct GenModel *model, struct GenData *data, double *B, + double *ZAZ, double *ZAZV, double *ZAZVT) +{ + int status, class; + long i, j, k; + double Avalue, Bvalue; + double omega, value, a, b, q, h, r; + + long n = model->n; + long m = model->m; + long K = model->K; + + double kappa = model->kappa; + double p = model->p; + double *rho = model->rho; + + // constants which are used often throughout + const double a2g2 = 0.25*p*(2.0*p - 1.0)*pow((kappa+1.0)/2.0,p-2.0); + const double in = 1.0/((double) n); + + // clear matrices + Memset(B, double, n*(K-1)); + Memset(ZAZ, double, (m+1)*(m+1)); + + b = 0; + for (i=0; iH, K, i, j); + r = matrix_get(model->R, K, i, j); + value += (h*r > 0) ? 1 : 0; + omega += pow(h, p)*r; + } + class = (value <= 1.0) ? 1 : 0; + omega = (1.0/p)*pow(omega, 1.0/p - 1.0); + + Avalue = 0; + if (class == 1) { + for (j=0; jQ, K, i, j); + if (q <= -kappa) { + a = 0.25/(0.5 - kappa/2.0 - q); + b = 0.5; + } else if (q <= 1.0) { + a = 1.0/(2.0*kappa + 2.0); + b = (1.0 - q)*a; + } else { + a = -0.25/(0.5 - kappa/2.0 - q); + b = 0; + } + for (k=0; kUU, K-1, K, i, k, j); + matrix_add(B, K-1, i, k, Bvalue); + } + Avalue += a*matrix_get(model->R, K, i, j); + } + } else { + if (2.0 - p < 0.0001) { + for (j=0; jQ, K, i, j); + if (q <= -kappa) { + b = 0.5 - kappa/2.0 - q; + } else if ( q <= 1.0) { + b = pow(1.0 - q, 3.0)/( + 2.0*pow(kappa + 1.0, + 2.0)); + } else { + b = 0; + } + for (k=0; kUU, + K-1, + K, + i, + k, + j); + matrix_add( + B, + K-1, + i, + k, + Bvalue); + } + } + Avalue = 1.5*(K - 1.0); + } else { + for (j=0; jQ, K, i, j); + if (q <= (p + kappa - 1.0)/(p - 2.0)) { + a = 0.25*pow(p, 2.0)*pow( + 0.5 - kappa/2.0 - q, + p - 2.0); + } else if (q <= 1.0) { + a = a2g2; + } else { + a = 0.25*pow(p, 2.0)*pow( + (p/(p - 2.0))* + (0.5 - kappa/2.0 - q), + p - 2.0); + b = a*(2.0*q + kappa - 1.0)/ + (p - 2.0) + + 0.5*p*pow( + p/(p - 2.0)* + (0.5 - kappa/ + 2.0 - q), + p - 1.0); + } + if (q <= -kappa) { + b = 0.5*p*pow( + 0.5 - kappa/2.0 - q, + p - 1.0); + } else if ( q <= 1.0) { + b = p*pow(1.0 - q, + 2.0*p - 1.0)/ + pow(2*kappa+2.0, p); + } + for (k=0; kUU, + K-1, + K, + i, + k, + j); + matrix_add( + B, + K-1, + i, + k, + Bvalue); + } + Avalue += a*matrix_get(model->R, + K, i, j); + } + } + Avalue *= omega; + } + Avalue *= in * rho[i]; + + // Now we calculate the matrix ZAZ. Since this is + // guaranteed to be symmetric, we only calculate the + // upper part of the matrix, and then copy this over + // to the lower part after all calculations are done. + // Note that the use of dsym is faster than dspr, even + // though dspr uses less memory. + cblas_dsyr( + CblasRowMajor, + CblasUpper, + m+1, + Avalue, + &data->Z[i*(m+1)], + 1, + ZAZ, + m+1); + } + // Copy upper to lower (necessary because we need to switch + // to Col-Major order for LAPACK). + /* + for (i=0; iV, + K-1, + 0.0, + ZAZV, + K-1); + + cblas_dgemm( + CblasRowMajor, + CblasTrans, + CblasNoTrans, + m+1, + K-1, + n, + 1.0, + data->Z, + m+1, + B, + K-1, + 1.0, + ZAZV, + K-1); + + /* + * Add lambda to all diagonal elements except the first one. Recall + * that ZAZ is of size m+1 and is symmetric. + */ + i = 0; + for (j=0; jlambda; + } + + // For the LAPACK call we need to switch to Column- + // Major order. This is unnecessary for the matrix + // ZAZ because it is symmetric. The matrix ZAZV + // must be converted however. + for (i=0; iVbar; + model->Vbar = model->V; + model->V = ZAZVT; + ZAZVT = ptr; + */ + + for (i=0; iV, K-1, i, j); + matrix_set(model->Vbar, K-1, i, j, value); + value = matrix_get(ZAZV, K-1, i, j); + matrix_set(model->V, K-1, i, j, value); + } + } +} + +/** + * @brief Generate the category matrix + * + * @details + * Generate the category matrix R. The category matrix has 1's everywhere + * except at the column corresponding to the label of instance i, there the + * element is 0. + * + * @param[in,out] model corresponding GenModel + * @param[in] dataset corresponding GenData + * + */ +void gensvm_category_matrix(struct GenModel *model, struct GenData *dataset) +{ + long i, j; + long n = model->n; + long K = model->K; + + for (i=0; iy[i] != j+1) + matrix_set(model->R, K, i, j, 1.0); + else + matrix_set(model->R, K, i, j, 0.0); + } + } +} + +/** + * @brief Generate the simplex difference matrix + * + * @details + * The simplex difference matrix is a 3D matrix which is constructed + * as follows. For each instance i, the difference vectors between the row of + * the simplex matrix corresponding to the class label of instance i and the + * other rows of the simplex matrix are calculated. These difference vectors + * are stored in a matrix, which is one horizontal slice of the 3D matrix. + * + * @param[in,out] model the corresponding GenModel + * @param[in] data the corresponding GenData + * + */ +void gensvm_simplex_diff(struct GenModel *model, struct GenData *data) +{ + long i, j, k; + double value; + + long n = model->n; + long K = model->K; + + for (i=0; iU, K-1, data->y[i]-1, j); + value -= matrix_get(model->U, K-1, k, j); + matrix3_set(model->UU, K-1, K, i, j, k, value); + } + } + } +} + +/** + * @brief Use step doubling + * + * @details + * Step doubling can be used to speed up the maorization algorithm. Instead of + * using the value at the minimimum of the majorization function, the value + * ``opposite'' the majorization point is used. This can essentially cut the + * number of iterations necessary to reach the minimum in half. + * + * @param[in] model GenModel containing the augmented parameters + */ +void gensvm_step_doubling(struct GenModel *model) +{ + long i, j; + double value; + + long m = model->m; + long K = model->K; + + for (i=0; iV, K-1, i, j, 2.0); + value = - matrix_get(model->Vbar, K-1, i, j); + matrix_add(model->V, K-1, i, j, value); + } + } +} + +/** + * @brief Calculate the Huber hinge errors + * + * @details + * For each of the scalar errors in Q the Huber hinge errors are + * calculated. The Huber hinge is here defined as + * @f[ + * h(q) = + * \begin{dcases} + * 1 - q - \frac{\kappa + 1}{2} & \text{if } q \leq -\kappa \\ + * \frac{1}{2(\kappa + 1)} ( 1 - q)^2 & \text{if } q \in (-\kappa, 1] \\ + * 0 & \text{if } q > 1 + * \end{dcases} + * @f] + * + * @param[in,out] model the corresponding GenModel + */ +void gensvm_calculate_huber(struct GenModel *model) +{ + long i, j; + double q, value; + + for (i=0; in; i++) { + for (j=0; jK; j++) { + q = matrix_get(model->Q, model->K, i, j); + value = 0.0; + if (q <= -model->kappa) { + value = 1.0 - q - (model->kappa+1.0)/2.0; + } else if (q <= 1.0) { + value = 1.0/(2.0*model->kappa+2.0)*pow(1.0 - q, + 2.0); + } + matrix_set(model->H, model->K, i, j, value); + } + } +} + +/** + * @brief Calculate the scalar errors + * + * @details + * Calculate the scalar errors q based on the current estimate of V, and + * store these in Q. It is assumed that the memory for Q has already been + * allocated. In addition, the matrix ZV is calculated here. It is assigned + * to a pre-allocated block of memory, which is passed to this function. + * + * @param[in,out] model the corresponding GenModel + * @param[in] data the corresponding GenData + * @param[in,out] ZV a pointer to a memory block for ZV. On exit + * this block is updated with the new ZV matrix + * calculated with GenModel::V. + * + */ +void gensvm_calculate_errors(struct GenModel *model, struct GenData *data, + double *ZV) +{ + long i, j, k; + double a, value; + + long n = model->n; + long m = model->m; + long K = model->K; + + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + K-1, + m+1, + 1.0, + data->Z, + m+1, + model->V, + K-1, + 0.0, + ZV, + K-1); + + Memset(model->Q, double, n*K); + for (i=0; iUU, K-1, K, i, + j, k); + matrix_add(model->Q, K, i, k, value); + } + } + } +} + +/** + * @brief Solve AX = B where A is symmetric positive definite. + * + * @details + * Solve a linear system of equations AX = B where A is symmetric positive + * definite. This function is a wrapper for the external LAPACK routine + * dposv. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in] NRHS number of columns of B + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the upper or lower factor of the + * Cholesky factorization of A. + * @param[in] LDA leading dimension of A + * @param[in,out] B double precision array of size (LDB, NRHS). On + * exit contains the N-by-NRHS solution matrix X. + * @param[in] LDB the leading dimension of B + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had + * an illegal value + * - >0: if i, the leading minor of A + * was not positive definite + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/dc/de9/group__double_p_osolve.html + */ +int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, + int LDB) +{ + extern void dposv_(char *UPLO, int *Np, int *NRHSp, double *A, + int *LDAp, double *B, int *LDBp, int *INFOp); + int INFO; + dposv_(&UPLO, &N, &NRHS, A, &LDA, B, &LDB, &INFO); + return INFO; +} + +/** + * @brief Solve a system of equations AX = B where A is symmetric. + * + * @details + * Solve a linear system of equations AX = B where A is symmetric. This + * function is a wrapper for the external LAPACK routine dsysv. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in] NRHS number of columns of B + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the block diagonal matrix D and + * the multipliers used to obtain the factor U or + * L from the factorization A = U*D*U**T or + * A = L*D*L**T. + * @param[in] LDA leading dimension of A + * @param[in] IPIV integer array containing the details of D + * @param[in,out] B double precision array of size (LDB, NRHS). On + * exit contains the N-by-NRHS matrix X + * @param[in] LDB leading dimension of B + * @param[out] WORK double precision array of size max(1,LWORK). On + * exit, WORK(1) contains the optimal LWORK + * @param[in] LWORK the length of WORK, can be used for determining + * the optimal blocksize for dsystrf. + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had an + * illegal value + * - >0: if i, D(i, i) is exactly zero, + * no solution can be computed. + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d6/d0e/group__double_s_ysolve.html + */ +int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, + double *B, int LDB, double *WORK, int LWORK) +{ + extern void dsysv_(char *UPLO, int *Np, int *NRHSp, double *A, + int *LDAp, int *IPIV, double *B, int *LDBp, + double *WORK, int *LWORK, int *INFOp); + int INFO; + dsysv_(&UPLO, &N, &NRHS, A, &LDA, IPIV, B, &LDB, WORK, &LWORK, &INFO); + return INFO; +} diff --git a/src/gensvm_pred.c b/src/gensvm_pred.c index 15a6be6..8a9a43e 100644 --- a/src/gensvm_pred.c +++ b/src/gensvm_pred.c @@ -11,14 +11,6 @@ * */ -#include -#include - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_kernel.h" -#include "gensvm_matrix.h" #include "gensvm_pred.h" /** @@ -51,7 +43,7 @@ void gensvm_predict_labels(struct GenData *testdata, struct GenModel *model, U = Calloc(double, K*(K-1)); // Generate the simplex matrix - gensvm_simplex_gen(K, U); + gensvm_simplex(K, U); // Generate the simplex space vectors cblas_dgemm( diff --git a/src/gensvm_print.c b/src/gensvm_print.c new file mode 100644 index 0000000..2c00512 --- /dev/null +++ b/src/gensvm_print.c @@ -0,0 +1,88 @@ +/** + * @file util.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Utility functions + * + * @details + * This file contains several utility functions for coordinating input and + * output of data and model files. It also contains string functions. + * + */ + +#include "gensvm_print.h" + +FILE *GENSVM_OUTPUT_FILE; ///< The #GENSVM_OUTPUT_FILE specifies the + ///< output stream to which all output is + ///< written. This is done through the + ///< internal (!) + ///< function gensvm_print_string(). The + ///< advantage of using a global output + ///< stream variable is that the output can + ///< temporarily be suppressed by importing + ///< this variable through @c extern and + ///< (temporarily) setting it to NULL. + +/** + * @brief Print a given string to the specified output stream + * + * @details + * This function is used to print a given string to the output stream + * specified by #GENSVM_OUTPUT_FILE. The stream is flushed after the string + * is written to the stream. If #GENSVM_OUTPUT_FILE is NULL, nothing is + * written. Note that this function is only used by note(), it should never be + * used directly. + * + * @param[in] s string to write to the stream + * + */ +static void gensvm_print_string(const char *s) +{ + if (GENSVM_OUTPUT_FILE != NULL) { + fputs(s, GENSVM_OUTPUT_FILE); + fflush(GENSVM_OUTPUT_FILE); + } +} + +/** + * @brief Parse a formatted string and write to the output stream + * + * @details + * This function is a replacement of fprintf(), such that the output stream + * does not have to be specified at each function call. The functionality is + * exactly the same however. Writing the formatted string to the output stream + * is handled by gensvm_print_string(). + * + * @param[in] fmt String format + * @param[in] ... variable argument list for the string format + * + */ +void note(const char *fmt,...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap,fmt); + vsprintf(buf,fmt,ap); + va_end(ap); + (*gensvm_print_string)(buf); +} + +/** + * @brief Parse a formatted string and write it to standard error + * + * @details + * Shorthand for fprintf(stderr, ...) + * + * @param[in] fmt string format + * @param[in] ... variable argument list for the string format + */ +void err(const char *fmt, ...) +{ + char buf[BUFSIZ]; + va_list ap; + va_start(ap, fmt); + vsprintf(buf, fmt, ap); + va_end(ap); + fputs(buf, stderr); + fflush(stderr); +} diff --git a/src/gensvm_queue.c b/src/gensvm_queue.c new file mode 100644 index 0000000..bbf57b2 --- /dev/null +++ b/src/gensvm_queue.c @@ -0,0 +1,71 @@ +/** + * @file gensvm_queue.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for initializing and freeing a GenQueue + * + */ + +#include "gensvm_queue.h" + +/** + * @brief Initialize a GenQueue structure + * + * @details + * A GenQueue structure is initialized and the default value for the + * parameters are set. A pointer to the initialized queue is returned. + * + * @returns initialized GenQueue + */ +struct GenQueue *gensvm_init_queue() +{ + struct GenQueue *q = Malloc(struct GenQueue, 1); + + q->tasks = NULL; + q->N = 0; + q->i = 0; + + return q; +} + +/** + * @brief Free the GenQueue struct + * + * @details + * Freeing the allocated memory of the GenQueue means freeing every GenTask + * struct and then freeing the Queue. + * + * @param[in] q GenQueue to be freed + * + */ +void gensvm_free_queue(struct GenQueue *q) +{ + long i; + for (i=0; iN; i++) { + gensvm_free_task(q->tasks[i]); + } + free(q->tasks); + free(q); +} + +/** + * @brief Get new GenTask from GenQueue + * + * @details + * Return a pointer to the next GenTask in the GenQueue. If no GenTask + * instances are left, NULL is returned. The internal counter GenQueue::i is + * used for finding the next GenTask. + * + * @param[in] q GenQueue instance + * @returns pointer to next GenTask + * + */ +struct GenTask *get_next_task(struct GenQueue *q) +{ + long i = q->i; + if (i < q->N) { + q->i++; + return q->tasks[i]; + } + return NULL; +} diff --git a/src/gensvm_simplex.c b/src/gensvm_simplex.c new file mode 100644 index 0000000..1fd5f14 --- /dev/null +++ b/src/gensvm_simplex.c @@ -0,0 +1,45 @@ +/** + * @file gensvm_simplex.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Function for generating the simplex matrix + * + * @details + * Contains the function for generating the simplex matrix for a given number + * of classes. + * + */ + +#include "gensvm_simplex.h" + +/** + * @brief Generate matrix of simplex vertex coordinates + * + * @details + * Generate the simplex matrix. Each row of the created + * matrix contains the coordinate vector of a single + * vertex of the K-simplex in K-1 dimensions. The simplex + * generated is a special simplex with edges of length 1. + * The simplex matrix U must already have been allocated. + * + * @param[in] K number of classes + * @param[in,out] U simplex matrix of size K * (K-1) + */ +void gensvm_simplex(long K, double *U) +{ + long i, j; + for (i=0; ikerneltype = K_LINEAR; + t->weight_idx = 1; + t->folds = 10; + t->ID = -1; + t->p = 1.0; + t->kappa = 0.0; + t->lambda = 1.0; + t->epsilon = 1e-6; + t->kernelparam = NULL; + t->train_data = NULL; + t->test_data = NULL; + t->performance = 0.0; + + return t; +} + +/** + * @brief Free the GenTask struct + * + * @details + * Freeing the allocated memory of the GenTask means freeing _only_ the + * kernelparam array, and the task itself. The datasets are not freed, as + * these are shared between all tasks. + * + * @param[in] t GenTask to be freed + * + */ +void gensvm_free_task(struct GenTask *t) +{ + free(t->kernelparam); + free(t); +} diff --git a/src/gensvm_timer.c b/src/gensvm_timer.c index 9802091..04d93b1 100644 --- a/src/gensvm_timer.c +++ b/src/gensvm_timer.c @@ -6,13 +6,10 @@ * * @details * This file contains a simple function for calculating the time in seconds - * elapsed between two clock() calls. It also contains a function for - * generating a string of the current time, used in writing output files. + * elapsed between two clock() calls. + * */ -#include - -#include "globals.h" #include "gensvm_timer.h" /** @@ -22,53 +19,7 @@ * @param[in] e_time end time * @returns time elapsed in seconds */ -double elapsed_time(clock_t s_time, clock_t e_time) +double gensvm_elapsed_time(clock_t s_time, clock_t e_time) { return ((double) (e_time - s_time))/((double) CLOCKS_PER_SEC); } - -/** - * @brief Get time string with UTC offset - * - * @details - * Create a string for the current system time. Include an offset of UTC for - * consistency. The format of the generated string is "DDD MMM D HH:MM:SS - * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)". - * - * @param[in,out] buffer allocated string buffer, on exit contains - * formatted string - * - */ -void get_time_string(char *buffer) -{ - int diff, hours, minutes; - char timestr[MAX_LINE_LENGTH]; - time_t current_time, lt, gt; - struct tm *lclt; - - // get current time (in epoch) - current_time = time(NULL); - if (current_time == ((time_t)-1)) { - fprintf(stderr, "Failed to compute the current time.\n"); - return; - } - - // convert time to local time and create a string - lclt = localtime(¤t_time); - strftime(timestr, MAX_LINE_LENGTH, "%c", lclt); - if (timestr == NULL) { - fprintf(stderr, "Failed to convert time to string.\n"); - return; - } - - // calculate the UTC offset including DST - lt = mktime(localtime(¤t_time)); - gt = mktime(gmtime(¤t_time)); - diff = -difftime(gt, lt); - hours = (diff/3600); - minutes = (diff%3600)/60; - if (lclt->tm_isdst == 1) - hours++; - - sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes); -} diff --git a/src/gensvm_train.c b/src/gensvm_train.c deleted file mode 100644 index 371970a..0000000 --- a/src/gensvm_train.c +++ /dev/null @@ -1,538 +0,0 @@ -/** - * @file gensvm_train.c - * @author Gertjan van den Burg - * @date August 9, 2013 - * @brief Main functions for training the GenSVM solution. - * - * @details - * Contains update and loss functions used to actually find - * the optimal V. - * - */ - -#include -#include - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_lapack.h" -#include "gensvm_matrix.h" -#include "gensvm_sv.h" -#include "gensvm_train.h" -#include "gensvm_util.h" - -/** - * Maximum number of iterations of the algorithm. - */ -#define MAX_ITER 1000000000 - -/** - * @brief The main training loop for GenSVM - * - * @details - * This function is the main training function. This function - * handles the optimization of the model with the given model parameters, with - * the data given. On return the matrix GenModel::V contains the optimal - * weight matrix. - * - * In this function, step doubling is used in the majorization algorithm after - * a burn-in of 50 iterations. If the training is finished, GenModel::t and - * GenModel::W are extracted from GenModel::V. - * - * @param[in,out] model the GenModel to be trained. Contains optimal - * V on exit. - * @param[in] data the GenData to train the model with. - */ -void gensvm_optimize(struct GenModel *model, struct GenData *data) -{ - long i, j, it = 0; - double L, Lbar, value; - - long n = model->n; - long m = model->m; - long K = model->K; - - double *B = Calloc(double, n*(K-1)); - double *ZV = Calloc(double, n*(K-1)); - double *ZAZ = Calloc(double, (m+1)*(m+1)); - double *ZAZV = Calloc(double, (m+1)*(K-1)); - double *ZAZVT = Calloc(double, (m+1)*(K-1)); - - note("Starting main loop.\n"); - note("Dataset:\n"); - note("\tn = %i\n", n); - note("\tm = %i\n", m); - note("\tK = %i\n", K); - note("Parameters:\n"); - note("\tkappa = %f\n", model->kappa); - note("\tp = %f\n", model->p); - note("\tlambda = %15.16f\n", model->lambda); - note("\tepsilon = %g\n", model->epsilon); - note("\n"); - - gensvm_simplex_gen(model->K, model->U); - gensvm_simplex_diff(model, data); - gensvm_category_matrix(model, data); - - L = gensvm_get_loss(model, data, ZV); - Lbar = L + 2.0*model->epsilon*L; - - while ((it < MAX_ITER) && (Lbar - L)/L > model->epsilon) - { - // ensure V contains newest V and Vbar contains V from - // previous - gensvm_get_update(model, data, B, ZAZ, ZAZV, ZAZVT); - if (it > 50) - gensvm_step_doubling(model); - - Lbar = L; - L = gensvm_get_loss(model, data, ZV); - - if (it%100 == 0) - note("iter = %li, L = %15.16f, Lbar = %15.16f, " - "reldiff = %15.16f\n", it, L, Lbar, (Lbar - L)/L); - it++; - } - if (L > Lbar) - fprintf(stderr, "[WARNING]: Negative step occurred in " - "majorization.\n"); - if (it >= MAX_ITER) - fprintf(stderr, "[WARNING]: maximum number of iterations " - "reached.\n"); - - note("Optimization finished, iter = %li, loss = %15.16f, " - "rel. diff. = %15.16f\n", it-1, L, - (Lbar - L)/L); - note("Number of support vectors: %li\n", gensvm_num_sv(model, data)); - - model->training_error = (Lbar - L)/L; - - for (i=0; it[i] = matrix_get(model->V, K-1, 0, i); - for (i=1; iV, K-1, i, j); - matrix_set(model->W, K-1, i-1, j, value); - } - } - free(B); - free(ZV); - free(ZAZ); - free(ZAZV); - free(ZAZVT); -} - -/** - * @brief Calculate the current value of the loss function - * - * @details - * The current loss function value is calculated based on the matrix V in the - * given model. Note that the matrix ZV is passed explicitly to avoid having - * to reallocate memory at every step. - * - * @param[in] model GenModel structure which holds the current - * estimate V - * @param[in] data GenData structure - * @param[in,out] ZV pre-allocated matrix ZV which is updated on - * output - * @returns the current value of the loss function - */ -double gensvm_get_loss(struct GenModel *model, struct GenData *data, - double *ZV) -{ - long i, j; - long n = model->n; - long K = model->K; - long m = model->m; - - double value, rowvalue, loss = 0.0; - - gensvm_calculate_errors(model, data, ZV); - gensvm_calculate_huber(model); - - for (i=0; iH, K, i, j); - value = pow(value, model->p); - value *= matrix_get(model->R, K, i, j); - rowvalue += value; - } - rowvalue = pow(rowvalue, 1.0/(model->p)); - rowvalue *= model->rho[i]; - loss += rowvalue; - } - loss /= ((double) n); - - value = 0; - for (i=1; iV, K-1, i, j), 2.0); - } - } - loss += model->lambda * value; - - return loss; -} - -/** - * @brief Perform a single step of the majorization algorithm to update V - * - * @details - * This function contains the main update calculations of the algorithm. These - * calculations are necessary to find a new update V. The calculations exist of - * recalculating the majorization coefficients for all instances and all - * classes, and solving a linear system to find V. - * - * Because the function gensvm_get_update() is always called after a call to - * gensvm_get_loss() with the same GenModel::V, it is unnecessary to calculate - * the updated errors GenModel::Q and GenModel::H here too. This saves on - * computation time. - * - * In calculating the majorization coefficients we calculate the elements of a - * diagonal matrix A with elements - * @f[ - * A_{i, i} = \frac{1}{n} \rho_i \sum_{j \neq k} \left[ - * \varepsilon_i a_{ijk}^{(p)} + (1 - \varepsilon_i) \omega_i - * a_{ijk}^{(p)} \right], - * @f] - * where @f$ k = y_i @f$. - * Since this matrix is only used to calculate the matrix @f$ Z' A Z @f$, it is - * efficient to update a matrix ZAZ through consecutive rank 1 updates with - * a single element of A and the corresponding row of Z. The BLAS function - * dsyr is used for this. - * - * The B matrix is has rows - * @f[ - * \boldsymbol{\beta}_i' = \frac{1}{n} \rho_i \sum_{j \neq k} \left[ - * \varepsilon_i \left( b_{ijk}^{(1)} - a_{ijk}^{(1)} - * \overline{q}_i^{(kj)} \right) + (1 - \varepsilon_i) - * \omega_i \left( b_{ijk}^{(p)} - a_{ijk}^{(p)} - * \overline{q}_i^{(kj)} \right) \right] - * \boldsymbol{\delta}_{kj}' - * @f] - * This is also split into two cases, one for which @f$ \varepsilon_i = 1 @f$, - * and one for when it is 0. The 3D simplex difference matrix is used here, in - * the form of the @f$ \boldsymbol{\delta}_{kj}' @f$. - * - * Finally, the following system is solved - * @f[ - * (\textbf{Z}'\textbf{AZ} + \lambda \textbf{J})\textbf{V} = - * (\textbf{Z}'\textbf{AZ}\overline{\textbf{V}} + \textbf{Z}' - * \textbf{B}) - * @f] - * solving this system is done through dposv(). - * - * @todo - * Consider allocating IPIV and WORK at a higher level, they probably don't - * change much during the iterations. - * - * @param [in,out] model model to be updated - * @param [in] data data used in model - * @param [in] B pre-allocated matrix used for linear coefficients - * @param [in] ZAZ pre-allocated matrix used in system - * @param [in] ZAZV pre-allocated matrix used in system solving - * @param [in] ZAZVT pre-allocated matrix used in system solving - */ -void gensvm_get_update(struct GenModel *model, struct GenData *data, double *B, - double *ZAZ, double *ZAZV, double *ZAZVT) -{ - int status, class; - long i, j, k; - double Avalue, Bvalue; - double omega, value, a, b, q, h, r; - - long n = model->n; - long m = model->m; - long K = model->K; - - double kappa = model->kappa; - double p = model->p; - double *rho = model->rho; - - // constants which are used often throughout - const double a2g2 = 0.25*p*(2.0*p - 1.0)*pow((kappa+1.0)/2.0,p-2.0); - const double in = 1.0/((double) n); - - // clear matrices - Memset(B, double, n*(K-1)); - Memset(ZAZ, double, (m+1)*(m+1)); - - b = 0; - for (i=0; iH, K, i, j); - r = matrix_get(model->R, K, i, j); - value += (h*r > 0) ? 1 : 0; - omega += pow(h, p)*r; - } - class = (value <= 1.0) ? 1 : 0; - omega = (1.0/p)*pow(omega, 1.0/p - 1.0); - - Avalue = 0; - if (class == 1) { - for (j=0; jQ, K, i, j); - if (q <= -kappa) { - a = 0.25/(0.5 - kappa/2.0 - q); - b = 0.5; - } else if (q <= 1.0) { - a = 1.0/(2.0*kappa + 2.0); - b = (1.0 - q)*a; - } else { - a = -0.25/(0.5 - kappa/2.0 - q); - b = 0; - } - for (k=0; kUU, K-1, K, i, k, j); - matrix_add(B, K-1, i, k, Bvalue); - } - Avalue += a*matrix_get(model->R, K, i, j); - } - } else { - if (2.0 - p < 0.0001) { - for (j=0; jQ, K, i, j); - if (q <= -kappa) { - b = 0.5 - kappa/2.0 - q; - } else if ( q <= 1.0) { - b = pow(1.0 - q, 3.0)/( - 2.0*pow(kappa + 1.0, - 2.0)); - } else { - b = 0; - } - for (k=0; kUU, - K-1, - K, - i, - k, - j); - matrix_add( - B, - K-1, - i, - k, - Bvalue); - } - } - Avalue = 1.5*(K - 1.0); - } else { - for (j=0; jQ, K, i, j); - if (q <= (p + kappa - 1.0)/(p - 2.0)) { - a = 0.25*pow(p, 2.0)*pow( - 0.5 - kappa/2.0 - q, - p - 2.0); - } else if (q <= 1.0) { - a = a2g2; - } else { - a = 0.25*pow(p, 2.0)*pow( - (p/(p - 2.0))* - (0.5 - kappa/2.0 - q), - p - 2.0); - b = a*(2.0*q + kappa - 1.0)/ - (p - 2.0) + - 0.5*p*pow( - p/(p - 2.0)* - (0.5 - kappa/ - 2.0 - q), - p - 1.0); - } - if (q <= -kappa) { - b = 0.5*p*pow( - 0.5 - kappa/2.0 - q, - p - 1.0); - } else if ( q <= 1.0) { - b = p*pow(1.0 - q, - 2.0*p - 1.0)/ - pow(2*kappa+2.0, p); - } - for (k=0; kUU, - K-1, - K, - i, - k, - j); - matrix_add( - B, - K-1, - i, - k, - Bvalue); - } - Avalue += a*matrix_get(model->R, - K, i, j); - } - } - Avalue *= omega; - } - Avalue *= in * rho[i]; - - // Now we calculate the matrix ZAZ. Since this is - // guaranteed to be symmetric, we only calculate the - // upper part of the matrix, and then copy this over - // to the lower part after all calculations are done. - // Note that the use of dsym is faster than dspr, even - // though dspr uses less memory. - cblas_dsyr( - CblasRowMajor, - CblasUpper, - m+1, - Avalue, - &data->Z[i*(m+1)], - 1, - ZAZ, - m+1); - } - // Copy upper to lower (necessary because we need to switch - // to Col-Major order for LAPACK). - /* - for (i=0; iV, - K-1, - 0.0, - ZAZV, - K-1); - - cblas_dgemm( - CblasRowMajor, - CblasTrans, - CblasNoTrans, - m+1, - K-1, - n, - 1.0, - data->Z, - m+1, - B, - K-1, - 1.0, - ZAZV, - K-1); - - /* - * Add lambda to all diagonal elements except the first one. Recall - * that ZAZ is of size m+1 and is symmetric. - */ - i = 0; - for (j=0; jlambda; - } - - // For the LAPACK call we need to switch to Column- - // Major order. This is unnecessary for the matrix - // ZAZ because it is symmetric. The matrix ZAZV - // must be converted however. - for (i=0; iVbar; - model->Vbar = model->V; - model->V = ZAZVT; - ZAZVT = ptr; - */ - - for (i=0; iV, K-1, i, j); - matrix_set(model->Vbar, K-1, i, j, value); - value = matrix_get(ZAZV, K-1, i, j); - matrix_set(model->V, K-1, i, j, value); - } - } -} diff --git a/src/gensvm_train_dataset.c b/src/gensvm_train_dataset.c deleted file mode 100644 index d1650a7..0000000 --- a/src/gensvm_train_dataset.c +++ /dev/null @@ -1,766 +0,0 @@ -/** - * @file gensvm_train_dataset.c - * @author Gertjan van den Burg - * @date January, 2014 - * @brief Functions for finding the optimal parameters for the dataset - * - * @details - * The GenSVM algorithm takes a number of parameters. The functions in - * this file are used to find the optimal parameters. - */ - -#include -#include - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_crossval.h" -#include "gensvm_init.h" -#include "gensvm_kernel.h" -#include "gensvm_matrix.h" -#include "gensvm_train.h" -#include "gensvm_train_dataset.h" -#include "gensvm_pred.h" -#include "gensvm_util.h" -#include "gensvm_timer.h" - -extern FILE *GENSVM_OUTPUT_FILE; - -/** - * @brief Initialize a Queue from a Training instance - * - * @details - * A Training instance describes the grid to search over. This funtion - * creates all tasks that need to be performed and adds these to - * a Queue. Each task contains a pointer to the train and test datasets - * which are supplied. Note that the tasks are created in a specific order of - * the parameters, to ensure that the GenModel::V of a previous parameter - * set provides the best possible initial estimate of GenModel::V for the next - * parameter set. - * - * @param[in] training Training struct describing the grid search - * @param[in] queue pointer to a Queue that will be used to - * add the tasks to - * @param[in] train_data GenData of the training set - * @param[in] test_data GenData of the test set - * - */ -void make_queue(struct Training *training, struct Queue *queue, - struct GenData *train_data, struct GenData *test_data) -{ - long i, j, k; - long N, cnt = 0; - struct Task *task; - queue->i = 0; - - N = training->Np; - N *= training->Nl; - N *= training->Nk; - N *= training->Ne; - N *= training->Nw; - // these parameters are not necessarily non-zero - N *= training->Ng > 0 ? training->Ng : 1; - N *= training->Nc > 0 ? training->Nc : 1; - N *= training->Nd > 0 ? training->Nd : 1; - - queue->tasks = Calloc(struct Task *, N); - queue->N = N; - - // initialize all tasks - for (i=0; iID = i; - task->train_data = train_data; - task->test_data = test_data; - task->folds = training->folds; - task->kerneltype = training->kerneltype; - task->kernelparam = Calloc(double, training->Ng + - training->Nc + training->Nd); - queue->tasks[i] = task; - } - - // These loops mimick a large nested for loop. The advantage is that - // Nd, Nc and Ng which are on the outside of the nested for loop can - // now be zero, without large modification (see below). Whether this - // is indeed better than the nested for loop has not been tested. - cnt = 1; - i = 0; - while (i < N ) - for (j=0; jNp; j++) - for (k=0; ktasks[i]->p = training->ps[j]; - i++; - } - - cnt *= training->Np; - i = 0; - while (i < N ) - for (j=0; jNl; j++) - for (k=0; ktasks[i]->lambda = - training->lambdas[j]; - i++; - } - - cnt *= training->Nl; - i = 0; - while (i < N ) - for (j=0; jNk; j++) - for (k=0; ktasks[i]->kappa = training->kappas[j]; - i++; - } - - cnt *= training->Nk; - i = 0; - while (i < N ) - for (j=0; jNw; j++) - for (k=0; ktasks[i]->weight_idx = - training->weight_idxs[j]; - i++; - } - - cnt *= training->Nw; - i = 0; - while (i < N ) - for (j=0; jNe; j++) - for (k=0; ktasks[i]->epsilon = - training->epsilons[j]; - i++; - } - - cnt *= training->Ne; - i = 0; - while (i < N && training->Ng > 0) - for (j=0; jNg; j++) - for (k=0; ktasks[i]->kernelparam[0] = - training->gammas[j]; - i++; - } - - cnt *= training->Ng > 0 ? training->Ng : 1; - i = 0; - while (i < N && training->Nc > 0) - for (j=0; jNc; j++) - for (k=0; ktasks[i]->kernelparam[1] = - training->coefs[j]; - i++; - } - - cnt *= training->Nc > 0 ? training->Nc : 1; - i = 0; - while (i < N && training->Nd > 0) - for (j=0; jNd; j++) - for (k=0; ktasks[i]->kernelparam[2] = - training->degrees[j]; - i++; - } -} - -/** - * @brief Get new Task from Queue - * - * @details - * Return a pointer to the next Task in the Queue. If no Task instances are - * left, NULL is returned. The internal counter Queue::i is used for finding - * the next Task. - * - * @param[in] q Queue instance - * @returns pointer to next Task - * - */ -struct Task *get_next_task(struct Queue *q) -{ - long i = q->i; - if (i < q->N) { - q->i++; - return q->tasks[i]; - } - return NULL; -} - -/** - * @brief Comparison function for Tasks based on performance - * - * @details - * To be able to sort Task structures on the performance of their specific - * set of parameters, this comparison function is implemented. Task structs - * are sorted with highest performance first. - * - * @param[in] elem1 Task 1 - * @param[in] elem2 Task 2 - * @returns result of inequality of Task 1 performance over - * Task 2 performance - */ -int tasksort(const void *elem1, const void *elem2) -{ - const struct Task *t1 = (*(struct Task **) elem1); - const struct Task *t2 = (*(struct Task **) elem2); - return (t1->performance > t2->performance); -} - -/** - * @brief Comparison function for doubl - * - * @param[in] elem1 number 1 - * @param[in] elem2 number 2 - * @returns comparison of number 1 larger than number 2 - */ -int doublesort(const void *elem1, const void *elem2) -{ - const double t1 = (*(double *) elem1); - const double t2 = (*(double *) elem2); - return t1 > t2; -} - -/** - * @brief Calculate the percentile of an array of doubles - * - * @details - * The percentile of performance is used to find the top performing - * configurations. Since no standard definition of the percentile exists, we - * use the method used in MATLAB and Octave. Since calculating the percentile - * requires a sorted list of the values, a local copy is made first. - * - * @param[in] values array of doubles - * @param[in] N length of the array - * @param[in] p percentile to calculate ( 0 <= p <= 100.0 ). - * @returns the p-th percentile of the values - */ -double prctile(double *values, long N, double p) -{ - if (N == 1) - return values[0]; - - long i; - double pi, pr, boundary; - double *local = Malloc(double, N); - for (i=0; iN); - for (i=0; iN; i++) { - perf[i] = q->tasks[i]->performance; - } - boundary = prctile(perf, q->N, 95.0); - free(perf); - note("boundary determined at: %f\n", boundary); - - // find the number of tasks that perform at or above the boundary - for (i=0; iN; i++) { - if (q->tasks[i]->performance >= boundary) - N++; - } - - // create a new queue with the best tasks - nq->tasks = Malloc(struct Task *, N); - k = 0; - for (i=0; iN; i++) { - if (q->tasks[i]->performance >= boundary) - nq->tasks[k++] = q->tasks[i]; - } - nq->N = N; - nq->i = 0; - - return nq; -} - - -/** - * @brief Run repeats of the Task structs in Queue to find the best - * configuration - * - * @details - * The best performing tasks in the supplied Queue are found by taking those - * Task structs that have a performance greater or equal to the 95% percentile - * of the performance of all tasks. These tasks are then gathered in a new - * Queue. For each of the tasks in this new Queue the cross validation run is - * repeated a number of times. - * - * For each of the Task configurations that are repeated the mean performance, - * standard deviation of the performance and the mean computation time are - * reported. - * - * Finally, the overall best tasks are written to the specified output. These - * tasks are selected to have both the highest mean performance, as well as the - * smallest standard deviation in their performance. This is done as follows. - * First the 99th percentile of task performance and the 1st percentile of - * standard deviation is calculated. If a task exists for which the mean - * performance of the repeats and the standard deviation equals these values - * respectively, this task is found to be the best and is written to the - * output. If no such task exists, the 98th percentile of performance and the - * 2nd percentile of standard deviation is considered. This is repeated until - * an interval is found which contains tasks. If one or more tasks are found, - * this loop stops. - * - * @param[in] q Queue of Task structs which have already been - * run and have a Task::performance value - * @param[in] repeats Number of times to repeat the best - * configurations for consistency - * @param[in] traintype type of training to do (CV or TT) - * - */ -void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) -{ - long i, f, r, N, *cv_idx; - double p, pi, pr, pt, *time, *std, *mean, *perf; - struct Queue *nq; - struct GenData **train_folds, **test_folds; - struct GenModel *model = gensvm_init_model(); - struct Task *task = NULL; - clock_t loop_s, loop_e; - - nq = create_top_queue(q); - N = nq->N; - - note("Number of items: %li\n", nq->N); - std = Calloc(double, N); - mean = Calloc(double, N); - time = Calloc(double, N); - perf = Calloc(double, N*repeats); - - task = get_next_task(nq); - - model->n = 0; - model->m = task->train_data->m; - model->K = task->train_data->K; - gensvm_allocate_model(model); - gensvm_seed_model_V(NULL, model, task->train_data); - - cv_idx = Calloc(long, task->train_data->n); - - train_folds = Malloc(struct GenData *, task->folds); - test_folds = Malloc(struct GenData *, task->folds); - - i = 0; - while (task) { - make_model_from_task(task, model); - - time[i] = 0.0; - note("(%02li/%02li:%03li)\t", i+1, N, task->ID); - for (r=0; rtrain_data->n); - gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx); - train_folds = Malloc(struct GenData *, task->folds); - test_folds = Malloc(struct GenData *, task->folds); - for (f=0; ffolds; f++) { - train_folds[f] = gensvm_init_data(); - test_folds[f] = gensvm_init_data(); - gensvm_get_tt_split(task->train_data, train_folds[f], - test_folds[f], cv_idx, f); - gensvm_kernel_preprocess(model, train_folds[f]); - gensvm_kernel_postprocess(model, train_folds[f], - test_folds[f]); - } - - loop_s = clock(); - p = gensvm_cross_validation(model, train_folds, test_folds, - task->folds, task->train_data->n); - loop_e = clock(); - time[i] += elapsed_time(loop_s, loop_e); - matrix_set(perf, repeats, i, r, p); - mean[i] += p/((double) repeats); - note("%3.3f\t", p); - // this is done because if we reuse the V it's not a - // consistency check - gensvm_seed_model_V(NULL, model, task->train_data); - for (f=0; ffolds; f++) { - gensvm_free_data(train_folds[f]); - gensvm_free_data(test_folds[f]); - } - free(train_folds); - free(test_folds); - } - for (r=0; r 1) { - std[i] /= ((double) repeats) - 1.0; - std[i] = sqrt(std[i]); - } else { - std[i] = 0.0; - } - note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], time[i]); - task = get_next_task(nq); - i++; - } - - // find the best overall configurations: those with high average - // performance and low deviation in the performance - note("\nBest overall configuration(s):\n"); - note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t" - "mean_perf\tstd_perf\ttime_perf\n"); - p = 0.0; - bool breakout = false; - while (breakout == false) { - pi = prctile(mean, N, (100.0-p)); - pr = prctile(std, N, p); - pt = prctile(time, N, p); - for (i=0; itasks[i]->ID, - nq->tasks[i]->weight_idx, - nq->tasks[i]->epsilon, - nq->tasks[i]->p, - nq->tasks[i]->kappa, - nq->tasks[i]->lambda, - mean[i], - std[i], - time[i]); - breakout = true; - } - p += 1.0; - } - - free(cv_idx); - // make sure no double free occurs with the copied kernelparam - model->kernelparam = NULL; - gensvm_free_model(model); - free(nq->tasks); - free(nq); - free(perf); - free(std); - free(mean); - free(time); -} - -/** - * @brief Check if the kernel parameters change between tasks - * - * @details - * In the current strategy for training the kernel matrix is decomposed once, - * and tasks with the same kernel settings are performed sequentially. When a - * task needs to be done with different kernel parameters, the kernel matrix - * needs to be recalculated. This function is used to check whether this is - * the case. - * - * @param[in] newtask the next task - * @param[in] oldtask the old task - * @return whether the kernel needs to be reevaluated - */ -bool kernel_changed(struct Task *newtask, struct Task *oldtask) -{ - int i; - if (oldtask == NULL) - return true; - if (newtask->kerneltype != oldtask->kerneltype) { - return true; - } else if (newtask->kerneltype == K_POLY) { - for (i=0; i<3; i++) - if (newtask->kernelparam[i] != oldtask->kernelparam[i]) - return true; - return false; - } else if (newtask->kerneltype == K_RBF) { - if (newtask->kernelparam[0] != oldtask->kernelparam[0]) - return true; - return false; - } else if (newtask->kerneltype == K_SIGMOID) { - for (i=0; i<2; i++) - if (newtask->kernelparam[i] != oldtask->kernelparam[i]) - return true; - return false; - } - return false; -} - -/** - * @brief Run the grid search for a Queue - * - * @details - * Given a Queue of Task struct to be trained, a grid search is launched to - * find the optimal parameter configuration. As is also done within - * cross_validation(), the optimal weights of one parameter set are used as - * initial estimates for GenModel::V in the next parameter set. Note that to - * optimally exploit this feature of the optimization algorithm, the order in - * which tasks are considered is important. This is considered in - * make_queue(). - * - * The performance found by cross validation is stored in the Task struct. - * - * @param[in,out] q Queue with Task instances to run - */ - -void start_training(struct Queue *q) -{ - int f, folds; - double perf, current_max = 0; - struct Task *task = get_next_task(q); - struct Task *prevtask = NULL; - struct GenModel *model = gensvm_init_model(); - clock_t main_s, main_e, loop_s, loop_e; - - // in principle this can change between tasks, but this shouldn't be - // the case TODO - folds = task->folds; - - model->n = 0; - model->m = task->train_data->m; - model->K = task->train_data->K; - gensvm_allocate_model(model); - gensvm_seed_model_V(NULL, model, task->train_data); - - long *cv_idx = Calloc(long, task->train_data->n); - gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx); - - struct GenData **train_folds = Malloc(struct GenData *, task->folds); - struct GenData **test_folds = Malloc(struct GenData *, task->folds); - for (f=0; ftrain_data, train_folds[f], - test_folds[f], cv_idx, f); - } - - main_s = clock(); - while (task) { - make_model_from_task(task, model); - if (kernel_changed(task, prevtask)) { - note("Computing kernel"); - for (f=0; fZ != train_folds[f]->RAW) - free(train_folds[f]->Z); - if (test_folds[f]->Z != test_folds[f]->RAW) - free(test_folds[f]->Z); - gensvm_kernel_preprocess(model, - train_folds[f]); - gensvm_kernel_postprocess(model, - train_folds[f], test_folds[f]); - } - note(".\n"); - } - print_progress_string(task, q->N); - - loop_s = clock(); - perf = gensvm_cross_validation(model, train_folds, test_folds, - folds, task->train_data->n); - loop_e = clock(); - current_max = maximum(current_max, perf); - - note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, - elapsed_time(loop_s, loop_e), current_max); - - q->tasks[task->ID]->performance = perf; - prevtask = task; - task = get_next_task(q); - } - main_e = clock(); - - note("\nTotal elapsed training time: %8.8f seconds\n", - elapsed_time(main_s, main_e)); - - // make sure no double free occurs with the copied kernelparam - model->kernelparam = NULL; - gensvm_free_model(model); - for (f=0; fn, - train_folds[f]->r); - - // initialize object weights - gensvm_initialize_weights(train_folds[f], model); - - // train the model (surpressing output) - fid = GENSVM_OUTPUT_FILE; - GENSVM_OUTPUT_FILE = NULL; - gensvm_optimize(model, train_folds[f]); - GENSVM_OUTPUT_FILE = fid; - - // calculate prediction performance on test set - predy = Calloc(long, test_folds[f]->n); - gensvm_predict_labels(test_folds[f], model, predy); - performance = gensvm_prediction_perf(test_folds[f], predy); - total_perf += performance * test_folds[f]->n; - - free(predy); - } - - total_perf /= ((double) n_total); - - return total_perf; -} - - -/** - * @brief Free the Queue struct - * - * @details - * Freeing the allocated memory of the Queue means freeing every Task struct - * and then freeing the Queue. - * - * @param[in] q Queue to be freed - * - */ -void free_queue(struct Queue *q) -{ - long i; - for (i=0; iN; i++) { - free(q->tasks[i]->kernelparam); - free(q->tasks[i]); - } - free(q->tasks); - free(q); -} - -/** - * @brief Copy parameters from Task to GenModel - * - * @details - * A Task struct only contains the parameters of the GenModel to be estimated. - * This function is used to copy these parameters. - * - * @param[in] task Task instance with parameters - * @param[in,out] model GenModel to which the parameters are copied - */ -void make_model_from_task(struct Task *task, struct GenModel *model) -{ - // copy basic model parameters - model->weight_idx = task->weight_idx; - model->epsilon = task->epsilon; - model->p = task->p; - model->kappa = task->kappa; - model->lambda = task->lambda; - - // copy kernel parameters - model->kerneltype = task->kerneltype; - model->kernelparam = task->kernelparam; -} - -/** - * @brief Copy model parameters between two GenModel structs - * - * @details - * The parameters copied are GenModel::weight_idx, GenModel::epsilon, - * GenModel::p, GenModel::kappa, and GenModel::lambda. - * - * @param[in] from GenModel to copy parameters from - * @param[in,out] to GenModel to copy parameters to - */ -void copy_model(struct GenModel *from, struct GenModel *to) -{ - to->weight_idx = from->weight_idx; - to->epsilon = from->epsilon; - to->p = from->p; - to->kappa = from->kappa; - to->lambda = from->lambda; - - to->kerneltype = from->kerneltype; - switch (to->kerneltype) { - case K_LINEAR: - break; - case K_POLY: - to->kernelparam = Malloc(double, 3); - to->kernelparam[0] = from->kernelparam[0]; - to->kernelparam[1] = from->kernelparam[1]; - to->kernelparam[2] = from->kernelparam[2]; - break; - case K_RBF: - to->kernelparam = Malloc(double, 1); - to->kernelparam[0] = from->kernelparam[0]; - break; - case K_SIGMOID: - to->kernelparam = Malloc(double, 2); - to->kernelparam[0] = from->kernelparam[0]; - to->kernelparam[1] = from->kernelparam[1]; - break; - } -} - -/** - * @brief Print the description of the current task on screen - * - * @details - * To track the progress of the grid search the parameters of the current task - * are written to the output specified in GENSVM_OUTPUT_FILE. Since the - * parameters differ with the specified kernel, this function writes a - * parameter string depending on which kernel is used. - * - * @param[in] task the Task specified - * @param[in] N total number of tasks - * - */ -void print_progress_string(struct Task *task, long N) -{ - char buffer[MAX_LINE_LENGTH]; - sprintf(buffer, "(%03li/%03li)\t", task->ID+1, N); - if (task->kerneltype == K_POLY) - sprintf(buffer + strlen(buffer), "d = %2.2f\t", - task->kernelparam[2]); - if (task->kerneltype == K_POLY || task->kerneltype == K_SIGMOID) - sprintf(buffer + strlen(buffer), "c = %2.2f\t", - task->kernelparam[1]); - if (task->kerneltype == K_POLY || task->kerneltype == K_SIGMOID || - task->kerneltype == K_RBF) - sprintf(buffer + strlen(buffer), "g = %3.3f\t", - task->kernelparam[0]); - sprintf(buffer + strlen(buffer), "eps = %g\tw = %i\tk = %2.2f\t" - "l = %f\tp = %2.2f\t", task->epsilon, - task->weight_idx, task->kappa, task->lambda, task->p); - note(buffer); -} diff --git a/src/gensvm_util.c b/src/gensvm_util.c deleted file mode 100644 index d12a85c..0000000 --- a/src/gensvm_util.c +++ /dev/null @@ -1,149 +0,0 @@ -/** - * @file util.c - * @author Gertjan van den Burg - * @date January, 2014 - * @brief Utility functions - * - * @details - * This file contains several utility functions for coordinating input and - * output of data and model files. It also contains string functions. - * - */ -#include - -#include "globals.h" -#include "gensvm_util.h" - -FILE *GENSVM_OUTPUT_FILE; ///< The #GENSVM_OUTPUT_FILE specifies the - ///< output stream to which all output is - ///< written. This is done through the - ///< internal (!) - ///< function gensvm_print_string(). The - ///< advantage of using a global output - ///< stream variable is that the output can - ///< temporarily be suppressed by importing - ///< this variable through @c extern and - ///< (temporarily) setting it to NULL. - -/** - * @brief Check if any command line arguments contain string - * - * @details - * Check if any of a given array of command line arguments contains a given - * string. If the string is found, the index of the string in argv is - * returned. If the string is not found, 0 is returned. - * - * This function is copied from MSVMpack/libMSVM.c. - * - * @param[in] argc number of command line arguments - * @param[in] argv command line arguments - * @param[in] str string to find in the arguments - * @returns index of the string in the arguments if found, 0 - * otherwise - */ -int gensvm_check_argv(int argc, char **argv, char *str) -{ - int i; - int arg_str = 0; - for (i=1; in; - long K = model->K; - - for (i=0; iy[i] != j+1) - matrix_set(model->R, K, i, j, 1.0); - else - matrix_set(model->R, K, i, j, 0.0); - } - } -} - -/** - * @brief Generate the simplex difference matrix - * - * @details - * The simplex difference matrix is a 3D matrix which is constructed - * as follows. For each instance i, the difference vectors between the row of - * the simplex matrix corresponding to the class label of instance i and the - * other rows of the simplex matrix are calculated. These difference vectors - * are stored in a matrix, which is one horizontal slice of the 3D matrix. - * - * @param[in,out] model the corresponding GenModel - * @param[in] data the corresponding GenData - * - */ -void gensvm_simplex_diff(struct GenModel *model, struct GenData *data) -{ - long i, j, k; - double value; - - long n = model->n; - long K = model->K; - - for (i=0; iU, K-1, data->y[i]-1, j); - value -= matrix_get(model->U, K-1, k, j); - matrix3_set(model->UU, K-1, K, i, j, k, value); - } - } - } -} - -/** - * @brief Calculate the scalar errors - * - * @details - * Calculate the scalar errors q based on the current estimate of V, and - * store these in Q. It is assumed that the memory for Q has already been - * allocated. In addition, the matrix ZV is calculated here. It is assigned - * to a pre-allocated block of memory, which is passed to this function. - * - * @param[in,out] model the corresponding GenModel - * @param[in] data the corresponding GenData - * @param[in,out] ZV a pointer to a memory block for ZV. On exit - * this block is updated with the new ZV matrix - * calculated with GenModel::V. - * - */ -void gensvm_calculate_errors(struct GenModel *model, struct GenData *data, - double *ZV) -{ - long i, j, k; - double a, value; - - long n = model->n; - long m = model->m; - long K = model->K; - - cblas_dgemm( - CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - n, - K-1, - m+1, - 1.0, - data->Z, - m+1, - model->V, - K-1, - 0.0, - ZV, - K-1); - - Memset(model->Q, double, n*K); - for (i=0; iUU, K-1, K, i, - j, k); - matrix_add(model->Q, K, i, k, value); - } - } - } -} - -/** - * @brief Calculate the Huber hinge errors - * - * @details - * For each of the scalar errors in Q the Huber hinge errors are - * calculated. The Huber hinge is here defined as - * @f[ - * h(q) = - * \begin{dcases} - * 1 - q - \frac{\kappa + 1}{2} & \text{if } q \leq -\kappa \\ - * \frac{1}{2(\kappa + 1)} ( 1 - q)^2 & \text{if } q \in (-\kappa, 1] \\ - * 0 & \text{if } q > 1 - * \end{dcases} - * @f] - * - * @param[in,out] model the corresponding GenModel - */ -void gensvm_calculate_huber(struct GenModel *model) -{ - long i, j; - double q, value; - - for (i=0; in; i++) { - for (j=0; jK; j++) { - q = matrix_get(model->Q, model->K, i, j); - value = 0.0; - if (q <= -model->kappa) { - value = 1.0 - q - (model->kappa+1.0)/2.0; - } else if (q <= 1.0) { - value = 1.0/(2.0*model->kappa+2.0)*pow(1.0 - q, 2.0); - } - matrix_set(model->H, model->K, i, j, value); - } - } -} - -/** - * @brief seed the matrix V from an existing model or using rand - * - * @details - * The matrix V must be seeded before the main_loop() can start. - * This can be done by either seeding it with random numbers or - * using the solution from a previous model on the same dataset - * as initial seed. The latter option usually allows for a - * significant improvement in the number of iterations necessary - * because the seeded model V is closer to the optimal V. - * - * @param[in] from_model GenModel from which to copy V - * @param[in,out] to_model GenModel to which V will be copied - */ -void gensvm_seed_model_V(struct GenModel *from_model, - struct GenModel *to_model, struct GenData *data) -{ - long i, j, k; - double cmin, cmax, value; - - long n = data->n; - long m = data->m; - long K = data->K; - - if (from_model == NULL) { - for (i=0; iZ, m+1, k, i); - cmin = minimum(cmin, value); - cmax = maximum(cmax, value); - } - for (j=0; jV, K-1, i, j, value); - } - } - } else { - for (i=0; iV, K-1, i, j); - matrix_set(to_model->V, K-1, i, j, value); - } - } -} - -/** - * @brief Use step doubling - * - * @details - * Step doubling can be used to speed up the maorization algorithm. Instead of - * using the value at the minimimum of the majorization function, the value - * ``opposite'' the majorization point is used. This can essentially cut the - * number of iterations necessary to reach the minimum in half. - * - * @param[in] model GenModel containing the augmented parameters - */ -void gensvm_step_doubling(struct GenModel *model) -{ - long i, j; - double value; - - long m = model->m; - long K = model->K; - - for (i=0; iV, K-1, i, j, 2.0); - value = - matrix_get(model->Vbar, K-1, i, j); - matrix_add(model->V, K-1, i, j, value); - } - } -} - -/** - * @brief Initialize instance weights - * - * @details - * Instance weights can for instance be used to add additional weights to - * instances of certain classes. Two default weight possibilities are - * implemented here. The first is unit weights, where each instance gets - * weight 1. - * - * The second are group size correction weights, which are calculated as - * @f[ - * \rho_i = \frac{n}{Kn_k} , - * @f] - * where @f$ n_k @f$ is the number of instances in group @f$ k @f$ and - * @f$ y_i = k @f$. - * - * @param[in] data GenData with the dataset - * @param[in,out] model GenModel with the weight specification. On - * exit GenModel::rho contains the instance - * weights. - */ -void gensvm_initialize_weights(struct GenData *data, struct GenModel *model) -{ - long *groups; - long i; - - long n = model->n; - long K = model->K; - - if (model->weight_idx == 1) { - for (i=0; irho[i] = 1.0; - } - else if (model->weight_idx == 2) { - groups = Calloc(long, K); - for (i=0; iy[i]-1]++; - for (i=0; irho[i] = ((double) n)/((double) (groups[data->y[i]-1]*K)); - } else { - fprintf(stderr, "Unknown weight specification.\n"); - exit(1); - } -} -- cgit v1.2.3