diff options
55 files changed, 1740 insertions, 1857 deletions
@@ -2,3 +2,9 @@ *.swp GenSVM_train GenSVM_grid +doc/* +training/* +codegrep* +log* +gensvm +gensvm_grid @@ -7,7 +7,7 @@ DOXY=doxygen DOCDIR=doc DOXYFILE=$(DOCDIR)/Doxyfile -EXECS=GenSVM_train GenSVM_grid gensvm +EXECS=gensvm gensvm_grid .PHONY: all clean doc test @@ -29,58 +29,55 @@ test: lib/libgensvm.a $(MAKE) -C tests all lib/libgensvm.a: \ - src/libGenSVM.o \ - src/gensvm_crossval.o \ + src/gensvm_base.o \ + src/gensvm_cmdarg.o \ + src/gensvm_copy.o \ + src/gensvm_cv_util.o \ + src/gensvm_grid.o \ + src/gensvm_gridsearch.o \ src/gensvm_init.o \ src/gensvm_io.o \ src/gensvm_kernel.o \ - src/gensvm_lapack.o \ - src/gensvm_matrix.o \ src/gensvm_memory.o \ + src/gensvm_optimize.o \ src/gensvm_pred.o \ + src/gensvm_print.o \ + src/gensvm_queue.o \ + src/gensvm_simplex.o \ src/gensvm_strutil.o \ src/gensvm_sv.o \ - src/gensvm_train.o \ - src/gensvm_train_dataset.o \ - src/gensvm_timer.o \ - src/gensvm_util.o + src/gensvm_task.o \ + src/gensvm_timer.o @ar rcs lib/libgensvm.a \ - src/libGenSVM.o \ - src/gensvm_crossval.o \ + src/gensvm_base.o \ + src/gensvm_cmdarg.o \ + src/gensvm_copy.o \ + src/gensvm_cv_util.o \ + src/gensvm_grid.o \ + src/gensvm_gridsearch.o \ src/gensvm_init.o \ src/gensvm_io.o \ - src/gensvm_matrix.o \ - src/gensvm_memory.o \ src/gensvm_kernel.o \ - src/gensvm_lapack.o \ + src/gensvm_memory.o \ + src/gensvm_optimize.o \ src/gensvm_pred.o \ + src/gensvm_print.o \ + src/gensvm_queue.o \ + src/gensvm_simplex.o \ src/gensvm_strutil.o \ src/gensvm_sv.o \ - src/gensvm_train.o \ - src/gensvm_train_dataset.o \ - src/gensvm_timer.o \ - src/gensvm_util.o + src/gensvm_task.o \ + src/gensvm_timer.o @echo libgensvm.a... gensvm: src/GenSVMtraintest.c lib/libgensvm.a @$(CC) -o $@ $< $(CFLAGS) $(INCLUDE) $(LIB) -lgensvm $(LDFLAGS) - @echo gensvm... - -GenSVM_train: src/GenSVMtrain.c lib/libgensvm.a - @$(CC) -o GenSVM_train src/GenSVMtrain.c $(CFLAGS) $(INCLUDE) $(LIB)\ - -lgensvm $(LDFLAGS) - @echo GenSVM_train... + @echo gensvm ... 
-GenSVM_grid: src/GenSVMgrid.c lib/libgensvm.a - @$(CC) -o GenSVM_grid src/GenSVMgrid.c $(CFLAGS) $(INCLUDE) $(LIB) \ - -lgensvm $(LDFLAGS) - @echo GenSVM_grid... - -GenSVM_pred: src/GenSVMpred.c lib/libgensvm.a - @$(CC) -o GenSVM_pred src/GenSVMpred.c $(CFLAGS) $(INCLUDE) $(LIB) \ - -lgensvm $(LDFLAGS) - @echo GenSVM_pred... +gensvm_grid: src/GenSVMgrid.c lib/libgensvm.a + @$(CC) -o $@ $< $(CFLAGS) $(INCLUDE) $(LIB) -lgensvm $(LDFLAGS) + @echo gensvm_grid ... src/%.o: src/%.c @$(CC) $(CFLAGS) $(INCLUDE) $(LDFLAGS) -c $< -o $@ - @echo $<... + @echo $< ... diff --git a/include/gensvm_base.h b/include/gensvm_base.h new file mode 100644 index 0000000..b1f4a6b --- /dev/null +++ b/include/gensvm_base.h @@ -0,0 +1,116 @@ +/** + * @file gensvm_base.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Definitions for GenData and GenModel structures + * + * @details + * Contains documentation and declarations of GenModel and GenData. + * + */ + +#ifndef GENSVM_BASE_H +#define GENSVM_BASE_H + +// includes +#include "globals.h" + +// type declarations + +/** + * @brief A structure to represent the data. 
+ * + * @param K number of classes + * @param n number of instances + * @param m number of predictors + * @param *y pointer to vector of class labels + * @param *Z pointer to augmented data matrix + * @param *RAW pointer to augmented raw data matrix + * @param *J pointer to regularization vector + * @param kerneltype kerneltype used in GenData::Z + * @param *kernelparam kernel parameters used in GenData::Z + * + */ +struct GenData { + long K; + ///< number of classes + long n; + ///< number of instances + long m; + ///< number of predictors (width of RAW) + long r; + ///< number of eigenvalues (width of Z) + long *y; + ///< array of class labels, 1..K + double *Z; + ///< augmented data matrix (either equal to RAW or to the eigenvectors + ///< of the kernel matrix) + double *RAW; + ///< augmented raw data matrix + double *Sigma; + KernelType kerneltype; + double *kernelparam; +}; + +/** + * @brief A structure to represent a single GenSVM model. + * + */ +struct GenModel { + int weight_idx; + ///< which weights to use (1 = unit, 2 = group) + long K; + ///< number of classes in the dataset + long n; + ///< number of instances in the dataset + long m; + ///< number of predictor variables in the dataset + double epsilon; + ///< stopping criterion for the IM algorithm. + double p; + ///< parameter for the L-p norm in the loss function + double kappa; + ///< parameter for the Huber hinge function + double lambda; + ///< regularization parameter in the loss function + double *W; + ///< weight matrix + double *t; + ///< translation vector + double *V; + ///< augmented weight matrix + double *Vbar; + ///< augmented weight matrix from the previous iteration of the IM + ///< algorithm + double *U; + ///< simplex matrix + double *UU; + ///< 3D simplex difference matrix + double *Q; + ///< error matrix + double *H; + ///< Huber weighted error matrix + double *R; + ///< 0-1 auixiliary matrix, this matrix is n x K, with for row i a 0 on + ///< column y[i]-1, and 1 everywhere else. 
+ double *rho; + ///< vector of instance weights + double training_error; + ///< loss function value after training has finished + char *data_file; + ///< filename of the data + KernelType kerneltype; + ///< type of kernel used in the model + double *kernelparam; + ///< array of kernel parameters, size depends on kernel type +}; + +// function declarations +struct GenModel *gensvm_init_model(); +void gensvm_allocate_model(struct GenModel *model); +void gensvm_reallocate_model(struct GenModel *model, long n, long m); +void gensvm_free_model(struct GenModel *model); +struct GenData *gensvm_init_data(); +void gensvm_free_data(struct GenData *data); + +#endif diff --git a/include/gensvm_cmdarg.h b/include/gensvm_cmdarg.h new file mode 100644 index 0000000..ac33be8 --- /dev/null +++ b/include/gensvm_cmdarg.h @@ -0,0 +1,21 @@ +/** + * @file gensvm_cmdarg.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_cmdarg.c + * + * @details + * Function declarations for dealing with command line arguments. 
+ * + */ + +#ifndef GENSVM_CMDARG_H +#define GENSVM_CMDARG_H + +#include "globals.h" + +// function declarations +int gensvm_check_argv(int argc, char **argv, char *str); +int gensvm_check_argv_eq(int argc, char **argv, char *str); + +#endif diff --git a/include/gensvm_copy.h b/include/gensvm_copy.h new file mode 100644 index 0000000..bdf6eec --- /dev/null +++ b/include/gensvm_copy.h @@ -0,0 +1,18 @@ +/** + * @file gensvm_copy.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_copy.c + * + */ + +#ifndef GENSVM_COPY_H +#define GENSVM_COPY_H + +// includes +#include "gensvm_base.h" + +// function declarations +void gensvm_copy_model(struct GenModel *from, struct GenModel *to); + +#endif diff --git a/include/gensvm_crossval.h b/include/gensvm_cv_util.h index 3ac5fa9..ada727d 100644 --- a/include/gensvm_crossval.h +++ b/include/gensvm_cv_util.h @@ -1,8 +1,8 @@ /** - * @file crossval.h + * @file gensvm_cv_util.h * @author Gertjan van den Burg * @date January, 2014 - * @brief Header file for crossval.c + * @brief Header file for gensvm_cv_util.c * * @details * Contains function declarations for functions needed for performing cross @@ -10,11 +10,10 @@ * */ -#ifndef GENSVM_CROSSVAL_H -#define GENSVM_CROSSVAL_H +#ifndef GENSVM_CV_UTIL_H +#define GENSVM_CV_UTIL_H -// forward delaration -struct GenData; +#include "gensvm_base.h" void gensvm_make_cv_split(long N, long folds, long *cv_idx); void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, diff --git a/include/gensvm_debug.h b/include/gensvm_debug.h new file mode 100644 index 0000000..1cab4ca --- /dev/null +++ b/include/gensvm_debug.h @@ -0,0 +1,20 @@ + +/** + * @file gensvm_debug.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header for useful debug functions + * + * @details + * Contains defines useful for debugging. 
+ * + */ + +#ifndef GENSVM_DEBUG_H +#define GENSVM_DEBUG_H + +#include "gensvm_print.h" + +void print_matrix(double *M, long rows, long cols); + +#endif diff --git a/include/gensvm_grid.h b/include/gensvm_grid.h new file mode 100644 index 0000000..d335d7c --- /dev/null +++ b/include/gensvm_grid.h @@ -0,0 +1,78 @@ +/** + * @file gensvm_grid.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Structs necessary for the grid search + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for this queue and a single + * task in a queue, as well as a structure for the complete training + * scheme. Function declarations are also included. + * + */ + +#ifndef GENSVM_GRID_H +#define GENSVM_GRID_H + +#include "globals.h" + +/** + * @brief Structure for describing the entire grid search + * + * @param traintype type of training to use + * @param kerneltype type of kernel to use throughout training + * @param repeats number of repeats to be done after the grid + * search to find the parameter set with the + * most consistent high performance + * @param folds number of folds in cross validation + * @param Np size of the array of p values + * @param Nl size of the array of lambda values + * @param Nk size of the array of kappa values + * @param Ne size of the array of epsilon values + * @param Nw size of the array of weight_idx values + * @param Ng size of the array of gamma values + * @param Nc size of the array of coef values + * @param Nd size of the array of degree values + * @param *weight_idxs array of weight_idxs + * @param *ps array of p values + * @param *lambdas array of lambda values + * @param *kappas array of kappa values + * @param *epsilons array of epsilon values + * @param *gammas array of gamma values + * @param *coefs array of coef values + * @param *degrees array of degree values + * @param *train_data_file filename of train data file + * @param *test_data_file filename of 
test data file + * + */ +struct GenGrid { + TrainType traintype; + KernelType kerneltype; + long repeats; + long folds; + long Np; + long Nl; + long Nk; + long Ne; + long Nw; + long Ng; + long Nc; + long Nd; + int *weight_idxs; + double *ps; + double *lambdas; + double *kappas; + double *epsilons; + double *gammas; + double *coefs; + double *degrees; + char *train_data_file; + char *test_data_file; +}; + +// function declarations +struct GenGrid *gensvm_init_grid(); +void gensvm_free_grid(struct GenGrid *grid); + +#endif diff --git a/include/gensvm_gridsearch.h b/include/gensvm_gridsearch.h new file mode 100644 index 0000000..dcd9b93 --- /dev/null +++ b/include/gensvm_gridsearch.h @@ -0,0 +1,37 @@ +/** + * @file gensvm_gridsearch.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Header file for gensvm_gridsearch.c + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for this queue and a single + * task in a queue, as well as a structure for the complete training + * scheme. Function declarations are also included. 
+ * + */ + +#ifndef GENSVM_GRIDSEARCH_H +#define GENSVM_GRIDSEARCH_H + +// includes +#include "gensvm_cv_util.h" +#include "gensvm_init.h" +#include "gensvm_grid.h" +#include "gensvm_optimize.h" +#include "gensvm_pred.h" +#include "gensvm_queue.h" +#include "gensvm_timer.h" + +// function declarations +void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue, + struct GenData *train_data, struct GenData *test_data); +void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype); +void make_model_from_task(struct GenTask *task, struct GenModel *model); +void print_progress_string(struct GenTask *task, long N); +void start_training(struct GenQueue *q); +double gensvm_cross_validation(struct GenModel *model, + struct GenData **train_folds, struct GenData **test_folds, + int folds, long n_total); +#endif diff --git a/include/gensvm_init.h b/include/gensvm_init.h index 980366b..3f4a1cb 100644 --- a/include/gensvm_init.h +++ b/include/gensvm_init.h @@ -1,28 +1,21 @@ /** * @file gensvm_init.h * @author Gertjan van den Burg - * @date January, 2014 + * @date May, 2016 * @brief Header file for gensvm_init.c * * @details - * Contains function declarations for the initialization functions for - * GenModel and GenData structures. + * Contains function declarations for the initialization functions for the + * model weights and model V matrix. 
*/ #ifndef GENSVM_INIT_H #define GENSVM_INIT_H -// include -#include "globals.h" -#include "gensvm.h" +#include "gensvm_base.h" -struct GenModel *gensvm_init_model(); - -struct GenData *gensvm_init_data(); - -void gensvm_allocate_model(struct GenModel *model); -void gensvm_reallocate_model(struct GenModel *model, long n, long m); -void gensvm_free_model(struct GenModel *model); -void gensvm_free_data(struct GenData *data); +void gensvm_init_V(struct GenModel *from_model, struct GenModel *to_model, + struct GenData *data); +void gensvm_initialize_weights(struct GenData *data, struct GenModel *model); #endif diff --git a/include/gensvm_io.h b/include/gensvm_io.h index 4581c5f..9b0d973 100644 --- a/include/gensvm_io.h +++ b/include/gensvm_io.h @@ -12,9 +12,9 @@ #ifndef GENSVM_IO_H #define GENSVM_IO_H -// forward declarations -struct GenData; -struct GenModel; +// includes +#include "gensvm_base.h" +#include "gensvm_strutil.h" // function declarations void gensvm_read_data(struct GenData *dataset, char *data_file); @@ -24,5 +24,6 @@ void gensvm_write_model(struct GenModel *model, char *output_filename); void gensvm_write_predictions(struct GenData *data, long *predy, char *output_filename); +void gensvm_time_string(char *buffer); #endif diff --git a/include/gensvm_kernel.h b/include/gensvm_kernel.h index 45b7e62..a1fac20 100644 --- a/include/gensvm_kernel.h +++ b/include/gensvm_kernel.h @@ -14,12 +14,10 @@ #ifndef GENSVM_KERNEL_H #define GENSVM_KERNEL_H -// forward declarations -struct GenData; -struct GenModel; +// includes +#include "gensvm_base.h" // function declarations - void gensvm_kernel_preprocess(struct GenModel *model, struct GenData *data); void gensvm_kernel_postprocess(struct GenModel *model, struct GenData *traindata, struct GenData *testdata); @@ -36,5 +34,9 @@ void gensvm_make_testfactor(struct GenData *testdata, double gensvm_dot_rbf(double *x1, double *x2, double *kernelparam, long n); double gensvm_dot_poly(double *x1, double *x2, double 
*kernelparam, long n); double gensvm_dot_sigmoid(double *x1, double *x2, double *kernelparam, long n); - +int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, + double VL, double VU, int IL, int IU, double ABSTOL, + int *M, double *W, double *Z, int LDZ, double *WORK, int LWORK, + int *IWORK, int *IFAIL); +double dlamch(char CMACH); #endif diff --git a/include/gensvm_lapack.h b/include/gensvm_lapack.h index c4e58e8..843169b 100644 --- a/include/gensvm_lapack.h +++ b/include/gensvm_lapack.h @@ -12,13 +12,4 @@ #ifndef GENSVM_LAPACK_H #define GENSVM_LAPACK_H -int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, - int LDB); -int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, - double *B, int LDB, double *WORK, int LWORK); -int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, - double VL, double VU, int IL, int IU, double ABSTOL, - int *M, double *W, double *Z, int LDZ, double *WORK, int LWORK, - int *IWORK, int *IFAIL); -double dlamch(char CMACH); #endif diff --git a/include/gensvm_matrix.h b/include/gensvm_matrix.h index 5c88f0b..9982b78 100644 --- a/include/gensvm_matrix.h +++ b/include/gensvm_matrix.h @@ -2,34 +2,15 @@ * @file gensvm_matrix.h * @author Gertjan van den Burg * @date August, 2013 - * @brief Header file for gensvm_matrix.c + * @brief Header with defines for matrix access * * @details - * Contains function declarations for functions useful for dealing with matrices. + * Contains defines useful for dealing with matrices. 
* */ #ifndef GENSVM_MATRIX_H #define GENSVM_MATRIX_H -// Set a matrix element (RowMajor) -#define matrix_set(M, cols, i, j, val) M[(i)*(cols)+j] = val - -// Get a matrix element (RowMajor) -#define matrix_get(M, cols, i, j) M[(i)*(cols)+j] - -// Add to a matrix element (RowMajor) -#define matrix_add(M, cols, i, j, val) M[(i)*(cols)+j] += val - -// Multiply a matrix element (RowMajor) -#define matrix_mul(M, cols, i, j, val) M[(i)*(cols)+j] *= val - -// Set a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) -#define matrix3_set(M, N2, N3, i, j, k, val) M[k+(N3)*(j+(N2)*(i))] = val - -// Get a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) -#define matrix3_get(M, N2, N3, i, j, k) M[k+(N3)*(j+(N2)*(i))] - -void print_matrix(double *M, long rows, long cols); #endif diff --git a/include/gensvm_memory.h b/include/gensvm_memory.h index bc4aae9..08d6f2d 100644 --- a/include/gensvm_memory.h +++ b/include/gensvm_memory.h @@ -9,6 +9,8 @@ #ifndef GENSVM_MEMORY_H #define GENSVM_MEMORY_H +#include <stddef.h> + #define Calloc(type, size) \ mycalloc(__FILE__, __LINE__, size, sizeof(type)) #define Malloc(type, size) \ diff --git a/include/gensvm_optimize.h b/include/gensvm_optimize.h new file mode 100644 index 0000000..7a23bdb --- /dev/null +++ b/include/gensvm_optimize.h @@ -0,0 +1,36 @@ +/** + * @file gensvm_train.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Header file for gensvm_train.c + * + * @details + * Contains function declarations for functions used to train a single + * GenModel. 
+ * + */ + +#ifndef GENSVM_TRAIN_H +#define GENSVM_TRAIN_H + +#include "gensvm_sv.h" +#include "gensvm_print.h" +#include "gensvm_simplex.h" + +// function declarations +void gensvm_optimize(struct GenModel *model, struct GenData *data); +double gensvm_get_loss(struct GenModel *model, struct GenData *data, + double *ZV); +void gensvm_get_update(struct GenModel *model, struct GenData *data, + double *B, double *ZAZ, double *ZAZV, double *ZAZVT); +void gensvm_category_matrix(struct GenModel *model, struct GenData *data); +void gensvm_simplex_diff(struct GenModel *model, struct GenData *dataset); +void gensvm_calculate_errors(struct GenModel *model, struct GenData *data, + double *ZV); +void gensvm_calculate_huber(struct GenModel *model); +void gensvm_step_doubling(struct GenModel *model); +int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, int LDB); +int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, double *B, + int LDB, double *WORK, int LWORK); + +#endif diff --git a/include/gensvm_pred.h b/include/gensvm_pred.h index 97af01f..56e16e8 100644 --- a/include/gensvm_pred.h +++ b/include/gensvm_pred.h @@ -12,9 +12,9 @@ #ifndef GENSVM_PRED_H #define GENSVM_PRED_H -// forward declarations -struct GenData; -struct GenModel; +// includes +#include "gensvm_kernel.h" +#include "gensvm_simplex.h" // function declarations void gensvm_predict_labels(struct GenData *testdata, diff --git a/include/gensvm_print.h b/include/gensvm_print.h new file mode 100644 index 0000000..fff7af5 --- /dev/null +++ b/include/gensvm_print.h @@ -0,0 +1,22 @@ +/** + * @file gensvm_print.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_print.c + * + * @details + * Function declarations for printing to stdout and stderr. 
+ * + */ + +#ifndef GENSVM_PRINT_H +#define GENSVM_PRINT_H + +// includes +#include "globals.h" + +// function declarations +void note(const char *fmt,...); +void err(const char *fmt,...); + +#endif diff --git a/include/gensvm_queue.h b/include/gensvm_queue.h new file mode 100644 index 0000000..e8d26d6 --- /dev/null +++ b/include/gensvm_queue.h @@ -0,0 +1,40 @@ +/** + * @file gensvm_queue.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Header file for gensvm_queue.c + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for this queue. Function declarations + * for initializing and freeing the queue are also included. + * + */ + +#ifndef GENSVM_QUEUE_H +#define GENSVM_QUEUE_H + +#include "gensvm_task.h" + +/** + * @brief Simple task queue. + * + * This struct is basically just an array of pointers to Task instances, + * with a length and an index of the current task. + * + * @param **tasks array of pointers to Task structs + * @param N size of task array + * @param i index used for keeping track of the queue + */ +struct GenQueue { + struct GenTask **tasks; + long N; + long i; +}; + +// function declarations +struct GenQueue *gensvm_init_queue(); +void gensvm_free_queue(struct GenQueue *q); +struct GenTask *get_next_task(struct GenQueue *q); + +#endif diff --git a/include/gensvm_simplex.h b/include/gensvm_simplex.h new file mode 100644 index 0000000..9bb40b1 --- /dev/null +++ b/include/gensvm_simplex.h @@ -0,0 +1,18 @@ +/** + * @file gensvm_simplex.h + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Header file for gensvm_simplex.c + * + */ + +#ifndef GENSVM_SIMPLEX_H +#define GENSVM_SIMPLEX_H + +// includes +#include "globals.h" + +// forward declarations +void gensvm_simplex(long K, double *U); + +#endif diff --git a/include/gensvm_strutil.h b/include/gensvm_strutil.h index c51422f..efaa5ec 100644 --- a/include/gensvm_strutil.h +++ 
b/include/gensvm_strutil.h @@ -1,8 +1,8 @@ /** - * @file strutil.h + * @file gensvm_strutil.h * @author Gertjan van den Burg * @date August, 2013 - * @brief Header file for strutil.c + * @brief Header file for gensvm_strutil.c * * @details * Function declarations for useful string functions used in parsing @@ -13,7 +13,7 @@ #ifndef GENSVM_STRUTIL_H #define GENSVM_STRUTIL_H -#include "types.h" +#include "globals.h" bool str_startswith(const char *str, const char *pre); bool str_endswith(const char *str, const char *suf); diff --git a/include/gensvm_sv.h b/include/gensvm_sv.h index 2c7cf57..8347b95 100644 --- a/include/gensvm_sv.h +++ b/include/gensvm_sv.h @@ -12,6 +12,10 @@ #ifndef GENSVM_SV_H #define GENSVM_SV_H +// includes +#include "gensvm_base.h" + +// function declarations long gensvm_num_sv(struct GenModel *model, struct GenData *data); #endif diff --git a/include/gensvm_task.h b/include/gensvm_task.h new file mode 100644 index 0000000..98c8f26 --- /dev/null +++ b/include/gensvm_task.h @@ -0,0 +1,53 @@ +/** + * @file gensvm_task.h + * @author Gertjan van den Burg + * @date August, 2013 + * @brief Struct for a single task in the queue + * + * @details + * The grid search for the optimal parameters is done through a queue. + * This file contains struct definitions for the tasks in the queue. + * Initialization and free functions are also included. + * + */ + +#ifndef GENSVM_TASK_H +#define GENSVM_TASK_H + +#include "gensvm_base.h" + +/** + * @brief A structure for a single task in the queue. 
+ * + * @param folds number of folds in cross validation + * @param ID numeric id of the task in the queue + * @param weight_idx parameter for the GenModel + * @param p parameter for the GenModel + * @param kappa parameter for the GenModel + * @param lambda parameter for the GenModel + * @param epsilon parameter for the GenModel + * @param kerneltype parameter for the GenModel + * @param *kernelparam parameters for the GenModel + * @param *train_data pointer to the training data + * @param *test_data pointer to the test data (if any) + * @param performance performance after cross validation + */ +struct GenTask { + KernelType kerneltype; + int weight_idx; + long folds; + long ID; + double p; + double kappa; + double lambda; + double epsilon; + double *kernelparam; + struct GenData *train_data; + struct GenData *test_data; + double performance; +}; + +struct GenTask *gensvm_init_task(); +void gensvm_free_task(struct GenTask *task); + +#endif diff --git a/include/gensvm_timer.h b/include/gensvm_timer.h index 29c45cd..11e61e5 100644 --- a/include/gensvm_timer.h +++ b/include/gensvm_timer.h @@ -12,8 +12,10 @@ #ifndef GENSVM_TIMER_H #define GENSVM_TIMER_H -double elapsed_time(clock_t s_time, clock_t e_time); +// includes +#include "globals.h" -void get_time_string(char *buffer); +// function declarations +double gensvm_elapsed_time(clock_t s_time, clock_t e_time); #endif diff --git a/include/gensvm_train.h b/include/gensvm_train.h deleted file mode 100644 index 466b8e2..0000000 --- a/include/gensvm_train.h +++ /dev/null @@ -1,29 +0,0 @@ -/** - * @file gensvm_train.h - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Header file for gensvm_train.c - * - * @details - * Contains function declarations for functions used to train a single - * GenModel. 
- * - */ - -#ifndef GENSVM_TRAIN_H -#define GENSVM_TRAIN_H - -//forward declarations -struct GenData; -struct GenModel; - -// function declarations -void gensvm_optimize(struct GenModel *model, struct GenData *data); - -double gensvm_get_loss(struct GenModel *model, struct GenData *data, - double *ZV); - -void gensvm_get_update(struct GenModel *model, struct GenData *data, - double *B, double *ZAZ, double *ZAZV, double *ZAZVT); - -#endif diff --git a/include/gensvm_train_dataset.h b/include/gensvm_train_dataset.h deleted file mode 100644 index 9a3fe86..0000000 --- a/include/gensvm_train_dataset.h +++ /dev/null @@ -1,143 +0,0 @@ -/** - * @file gensvm_train_dataset.h - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Structs and functions necessary for the grid search - * - * @details - * The grid search for the optimal parameters is done through a queue. - * This file contains struct definitions for this queue and a single - * task in a queue, as well as a structure for the complete training - * scheme. Function declarations are also included. - * - */ - -#ifndef GENSVM_TRAIN_DATASET_H -#define GENSVM_TRAIN_DATASET_H - -#include "types.h" - -// forward declarations -struct GenData; -struct GenModel; - -/** - * @brief A structure for a single task in the queue. 
- * - * @param folds number of folds in cross validation - * @param ID numeric id of the task in the queue - * @param weight_idx parameter for the GenModel - * @param p parameter for the GenModel - * @param kappa parameter for the GenModel - * @param lambda parameter for the GenModel - * @param epsilon parameter for the GenModel - * @param kerneltype parameter for the GenModel - * @param *kernelparam parameters for the GenModel - * @param *train_data pointer to the training data - * @param *test_data pointer to the test data (if any) - * @param performance performance after cross validation - */ -struct Task { - KernelType kerneltype; - int weight_idx; - long folds; - long ID; - double p; - double kappa; - double lambda; - double epsilon; - double *kernelparam; - struct GenData *train_data; - struct GenData *test_data; - double performance; -}; - -/** - * @brief Simple task queue. - * - * This struct is basically just an array of pointers to Task instances, - * with a length and an index of the current task. 
- * - * @param **tasks array of pointers to Task structs - * @param N size of task array - * @param i index used for keeping track of the queue - */ -struct Queue { - struct Task **tasks; - long N; - long i; -}; - -/** - * @brief Structure for describing the entire grid search - * - * @param traintype type of training to use - * @param kerneltype type of kernel to use throughout training - * @param repeats number of repeats to be done after the grid - * search to find the parameter set with the - * most consistent high performance - * @param folds number of folds in cross validation - * @param Np size of the array of p values - * @param Nl size of the array of lambda values - * @param Nk size of the array of kappa values - * @param Ne size of the array of epsilon values - * @param Nw size of the array of weight_idx values - * @param Ng size of the array of gamma values - * @param Nc size of the array of coef values - * @param Nd size of the array of degree values - * @param *weight_idxs array of weight_idxs - * @param *ps array of p values - * @param *lambdas array of lambda values - * @param *kappas array of kappa values - * @param *epsilons array of epsilon values - * @param *gammas array of gamma values - * @param *coefs array of coef values - * @param *degrees array of degree values - * @param *train_data_file filename of train data file - * @param *test_data_file filename of test data file - * - */ -struct Training { - TrainType traintype; - KernelType kerneltype; - long repeats; - long folds; - long Np; - long Nl; - long Nk; - long Ne; - long Nw; - long Ng; - long Nc; - long Nd; - int *weight_idxs; - double *ps; - double *lambdas; - double *kappas; - double *epsilons; - double *gammas; - double *coefs; - double *degrees; - char *train_data_file; - char *test_data_file; -}; - -void make_queue(struct Training *training, struct Queue *queue, - struct GenData *train_data, struct GenData *test_data); - -struct Task *get_next_task(struct Queue *q); -void 
free_queue(struct Queue *q); - -void consistency_repeats(struct Queue *q, long repeats, TrainType traintype); - -void make_model_from_task(struct Task *task, struct GenModel *model); -void copy_model(struct GenModel *from, struct GenModel *to); - -void print_progress_string(struct Task *task, long N); - -// new -void start_training(struct Queue *q); -double gensvm_cross_validation(struct GenModel *model, - struct GenData **train_folds, struct GenData **test_folds, - int folds, long n_total); -#endif diff --git a/include/gensvm_util.h b/include/gensvm_util.h deleted file mode 100644 index 5ea2198..0000000 --- a/include/gensvm_util.h +++ /dev/null @@ -1,26 +0,0 @@ -/** - * @file util.h - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Header file for util.c - * - * @details - * Function declarations for utility functions of the program. - * - */ - -#ifndef GENSVM_UTIL_H -#define GENSVM_UTIL_H - -// forward declarations -struct GenData; -struct GenModel; - -// function declarations -int gensvm_check_argv(int argc, char **argv, char *str); -int gensvm_check_argv_eq(int argc, char **argv, char *str); - -void note(const char *fmt,...); -void err(const char *fmt,...); - -#endif diff --git a/include/globals.h b/include/globals.h index becde35..7fad7b1 100644 --- a/include/globals.h +++ b/include/globals.h @@ -18,12 +18,17 @@ #ifndef GENSVM_GLOBALS_H #define GENSVM_GLOBALS_H +#include "gensvm_memory.h" +#include "types.h" + +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <stdbool.h> #include <string.h> - -#include "gensvm_memory.h" +#include <math.h> +#include <time.h> +#include <cblas.h> #define MAX_LINE_LENGTH 1024 @@ -33,4 +38,22 @@ #define minimum(a, b) (a) < (b) ? 
(a) : (b) #endif +// Set a matrix element (RowMajor) +#define matrix_set(M, cols, i, j, val) M[(i)*(cols)+j] = val + +// Get a matrix element (RowMajor) +#define matrix_get(M, cols, i, j) M[(i)*(cols)+j] + +// Add to a matrix element (RowMajor) +#define matrix_add(M, cols, i, j, val) M[(i)*(cols)+j] += val + +// Multiply a matrix element (RowMajor) +#define matrix_mul(M, cols, i, j, val) M[(i)*(cols)+j] *= val + +// Set a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) +#define matrix3_set(M, N2, N3, i, j, k, val) M[k+(N3)*(j+(N2)*(i))] = val + +// Get a 3D matrix element (N2 = second dim, N3 = third dim, RowMajor) +#define matrix3_get(M, N2, N3, i, j, k) M[k+(N3)*(j+(N2)*(i))] + #endif diff --git a/include/libGenSVM.h b/include/libGenSVM.h index 9e2d4c2..146fc67 100644 --- a/include/libGenSVM.h +++ b/include/libGenSVM.h @@ -23,7 +23,6 @@ struct GenData; struct GenModel; // function declarations -void gensvm_simplex_gen(long K, double *U); void gensvm_category_matrix(struct GenModel *model, struct GenData *data); void gensvm_simplex_diff(struct GenModel *model, struct GenData *dataset); @@ -33,8 +32,5 @@ void gensvm_calculate_huber(struct GenModel *model); void gensvm_step_doubling(struct GenModel *model); -void gensvm_seed_model_V(struct GenModel *from_model, - struct GenModel *to_model, struct GenData *data); -void gensvm_initialize_weights(struct GenData *data, struct GenModel *model); #endif diff --git a/src/GenSVMgrid.c b/src/GenSVMgrid.c index 89b85a7..a6c749e 100644 --- a/src/GenSVMgrid.c +++ b/src/GenSVMgrid.c @@ -8,9 +8,9 @@ * This is a command line interface to the parameter grid search functionality * of the algorithm. The grid search is specified in a separate file, thereby * reducing the number of command line arguments. See - * read_training_from_file() for documentation on the training file. + * read_grid_from_file() for documentation on the grid file. * - * The program runs a grid search as specified in the training file. 
If + * The program runs a grid search as specified in the grid file. If * desired the grid search can incorporate consistency checks to find the * configuration among the best configurations which scores consistently high. * All output is written to stdout, unless the quiet mode is specified. @@ -19,18 +19,9 @@ * */ -#include <time.h> - -#include "globals.h" -#include "gensvm.h" -#include "gensvm_crossval.h" +#include "gensvm_cmdarg.h" #include "gensvm_io.h" -#include "gensvm_init.h" -#include "gensvm_pred.h" -#include "gensvm_strutil.h" -#include "gensvm_train.h" -#include "gensvm_train_dataset.h" -#include "gensvm_util.h" +#include "gensvm_gridsearch.h" #define MINARGS 2 @@ -39,10 +30,7 @@ extern FILE *GENSVM_OUTPUT_FILE; // function declarations void exit_with_help(); void parse_command_line(int argc, char **argv, char *input_filename); -void read_training_from_file(char *input_filename, struct Training *training); -struct Training *gensvm_init_training(); -struct Queue *gensvm_init_queue(); -void gensvm_free_training(struct Training *training); +void read_grid_from_file(char *input_filename, struct GenGrid *grid); /** * @brief Help function @@ -50,7 +38,7 @@ void gensvm_free_training(struct Training *training); void exit_with_help() { printf("This is GenSVM, version %1.1f\n\n", VERSION); - printf("Usage: trainGenSVMdataset [options] training_file\n"); + printf("Usage: trainGenSVMdataset [options] grid_file\n"); printf("Options:\n"); printf("-h | -help : print this help.\n"); printf("-q : quiet mode (no output)\n"); @@ -62,7 +50,7 @@ void exit_with_help() * @brief Main interface function for trainGenSVMdataset * * @details - * Main interface for the command line program. A given training file which + * Main interface for the command line program. A given grid file which * specifies a grid search over a single dataset is read. From this, a Queue * is created containing all Task instances that need to be performed in the * search. 
Depending on the type of dataset, either cross validation or @@ -78,28 +66,28 @@ int main(int argc, char **argv) { char input_filename[MAX_LINE_LENGTH]; - struct Training *training = gensvm_init_training(); + struct GenGrid *grid = gensvm_init_grid(); struct GenData *train_data = gensvm_init_data(); struct GenData *test_data = gensvm_init_data(); - struct Queue *q = gensvm_init_queue(); + struct GenQueue *q = gensvm_init_queue(); if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help") || gensvm_check_argv_eq(argc, argv, "-h") ) exit_with_help(); parse_command_line(argc, argv, input_filename); - note("Reading training file\n"); - read_training_from_file(input_filename, training); + note("Reading grid file\n"); + read_grid_from_file(input_filename, grid); - note("Reading data from %s\n", training->train_data_file); - gensvm_read_data(train_data, training->train_data_file); - if (training->traintype == TT) { - note("Reading data from %s\n", training->test_data_file); - gensvm_read_data(test_data, training->test_data_file); + note("Reading data from %s\n", grid->train_data_file); + gensvm_read_data(train_data, grid->train_data_file); + if (grid->traintype == TT) { + note("Reading data from %s\n", grid->test_data_file); + gensvm_read_data(test_data, grid->test_data_file); } note("Creating queue\n"); - make_queue(training, q, train_data, test_data); + gensvm_fill_queue(grid, q, train_data, test_data); srand(time(NULL)); @@ -107,12 +95,12 @@ int main(int argc, char **argv) start_training(q); note("Training finished\n"); - if (training->repeats > 0) { - consistency_repeats(q, training->repeats, training->traintype); + if (grid->repeats > 0) { + consistency_repeats(q, grid->repeats, grid->traintype); } - free_queue(q); - gensvm_free_training(training); + gensvm_free_queue(q); + gensvm_free_grid(grid); gensvm_free_data(train_data); gensvm_free_data(test_data); @@ -125,13 +113,13 @@ int main(int argc, char **argv) * * @details * Few arguments can be supplied to the 
command line. Only quiet mode can be - * specified, or help can be requested. The filename of the training file is - * read from the arguments. Parsing of the training file is done separately in - * read_training_from_file(). + * specified, or help can be requested. The filename of the grid file is + * read from the arguments. Parsing of the grid file is done separately in + * read_grid_from_file(). * * @param[in] argc number of command line arguments * @param[in] argv array of command line arguments - * @param[in] input_filename pre-allocated buffer for the training + * @param[in] input_filename pre-allocated buffer for the grid * filename. * */ @@ -181,21 +169,21 @@ KernelType parse_kernel_str(char *kernel_line) } /** - * @brief Read the Training struct from file + * @brief Read the GenGrid struct from file * * @details - * Read the Training struct from a file. The training file follows a specific - * format specified in @ref spec_training_file. + * Read the GenGrid struct from a file. The grid file follows a specific + * format specified in @ref spec_grid_file. * * Commonly used string functions in this function are all_doubles_str() and * all_longs_str(). * - * @param[in] input_filename filename of the training file - * @param[in] training Training structure to place the parsed + * @param[in] input_filename filename of the grid file + * @param[in] grid GenGrid structure to place the parsed * parameter grid. 
* */ -void read_training_from_file(char *input_filename, struct Training *training) +void read_grid_from_file(char *input_filename, struct GenGrid *grid) { long i, nr = 0; FILE *fid; @@ -207,108 +195,108 @@ void read_training_from_file(char *input_filename, struct Training *training) fid = fopen(input_filename, "r"); if (fid == NULL) { - fprintf(stderr, "Error opening training file %s\n", + fprintf(stderr, "Error opening grid file %s\n", input_filename); exit(1); } - training->traintype = CV; + grid->traintype = CV; while ( fgets(buffer, MAX_LINE_LENGTH, fid) != NULL ) { Memset(params, double, MAX_LINE_LENGTH); Memset(lparams, long, MAX_LINE_LENGTH); if (str_startswith(buffer, "train:")) { sscanf(buffer, "train: %s\n", train_filename); - training->train_data_file = Calloc(char, + grid->train_data_file = Calloc(char, MAX_LINE_LENGTH); - strcpy(training->train_data_file, train_filename); + strcpy(grid->train_data_file, train_filename); } else if (str_startswith(buffer, "test:")) { sscanf(buffer, "test: %s\n", test_filename); - training->test_data_file = Calloc(char, + grid->test_data_file = Calloc(char, MAX_LINE_LENGTH); - strcpy(training->test_data_file, test_filename); - training->traintype = TT; + strcpy(grid->test_data_file, test_filename); + grid->traintype = TT; } else if (str_startswith(buffer, "p:")) { nr = all_doubles_str(buffer, 2, params); - training->ps = Calloc(double, nr); + grid->ps = Calloc(double, nr); for (i=0; i<nr; i++) - training->ps[i] = params[i]; - training->Np = nr; + grid->ps[i] = params[i]; + grid->Np = nr; } else if (str_startswith(buffer, "lambda:")) { nr = all_doubles_str(buffer, 7, params); - training->lambdas = Calloc(double, nr); + grid->lambdas = Calloc(double, nr); for (i=0; i<nr; i++) - training->lambdas[i] = params[i]; - training->Nl = nr; + grid->lambdas[i] = params[i]; + grid->Nl = nr; } else if (str_startswith(buffer, "kappa:")) { nr = all_doubles_str(buffer, 6, params); - training->kappas = Calloc(double, nr); + grid->kappas = 
Calloc(double, nr); for (i=0; i<nr; i++) - training->kappas[i] = params[i]; - training->Nk = nr; + grid->kappas[i] = params[i]; + grid->Nk = nr; } else if (str_startswith(buffer, "epsilon:")) { nr = all_doubles_str(buffer, 8, params); - training->epsilons = Calloc(double, nr); + grid->epsilons = Calloc(double, nr); for (i=0; i<nr; i++) - training->epsilons[i] = params[i]; - training->Ne = nr; + grid->epsilons[i] = params[i]; + grid->Ne = nr; } else if (str_startswith(buffer, "weight:")) { nr = all_longs_str(buffer, 7, lparams); - training->weight_idxs = Calloc(int, nr); + grid->weight_idxs = Calloc(int, nr); for (i=0; i<nr; i++) - training->weight_idxs[i] = lparams[i]; - training->Nw = nr; + grid->weight_idxs[i] = lparams[i]; + grid->Nw = nr; } else if (str_startswith(buffer, "folds:")) { nr = all_longs_str(buffer, 6, lparams); - training->folds = lparams[0]; + grid->folds = lparams[0]; if (nr > 1) fprintf(stderr, "Field \"folds\" only takes " "one value. Additional " "fields are ignored.\n"); } else if (str_startswith(buffer, "repeats:")) { nr = all_longs_str(buffer, 8, lparams); - training->repeats = lparams[0]; + grid->repeats = lparams[0]; if (nr > 1) fprintf(stderr, "Field \"repeats\" only " "takes one value. 
Additional " "fields are ignored.\n"); } else if (str_startswith(buffer, "kernel:")) { - training->kerneltype = parse_kernel_str(buffer); + grid->kerneltype = parse_kernel_str(buffer); } else if (str_startswith(buffer, "gamma:")) { nr = all_doubles_str(buffer, 6, params); - if (training->kerneltype == K_LINEAR) { + if (grid->kerneltype == K_LINEAR) { fprintf(stderr, "Field \"gamma\" ignored, " "linear kernel is used.\n"); - training->Ng = 0; + grid->Ng = 0; break; } - training->gammas = Calloc(double, nr); + grid->gammas = Calloc(double, nr); for (i=0; i<nr; i++) - training->gammas[i] = params[i]; - training->Ng = nr; + grid->gammas[i] = params[i]; + grid->Ng = nr; } else if (str_startswith(buffer, "coef:")) { nr = all_doubles_str(buffer, 5, params); - if (training->kerneltype == K_LINEAR || - training->kerneltype == K_RBF) { + if (grid->kerneltype == K_LINEAR || + grid->kerneltype == K_RBF) { fprintf(stderr, "Field \"coef\" ignored with " "specified kernel.\n"); - training->Nc = 0; + grid->Nc = 0; break; } - training->coefs = Calloc(double, nr); + grid->coefs = Calloc(double, nr); for (i=0; i<nr; i++) - training->coefs[i] = params[i]; - training->Nc = nr; + grid->coefs[i] = params[i]; + grid->Nc = nr; } else if (str_startswith(buffer, "degree:")) { nr = all_doubles_str(buffer, 7, params); - if (training->kerneltype != K_POLY) { + if (grid->kerneltype != K_POLY) { fprintf(stderr, "Field \"degree\" ignored " "with specified kernel.\n"); - training->Nd = 0; + grid->Nd = 0; break; } - training->degrees = Calloc(double, nr); + grid->degrees = Calloc(double, nr); for (i=0; i<nr; i++) - training->degrees[i] = params[i]; - training->Nd = nr; + grid->degrees[i] = params[i]; + grid->Nd = nr; } else { fprintf(stderr, "Cannot find any parameters on line: " "%s\n", buffer); @@ -319,62 +307,3 @@ void read_training_from_file(char *input_filename, struct Training *training) free(lparams); fclose(fid); } - -struct Training *gensvm_init_training() -{ - struct Training *training = 
Malloc(struct Training, 1); - - // initialize to defaults - training->traintype = CV; - training->kerneltype = K_LINEAR; - training->repeats = 0; - training->folds = 10; - training->Np = 0; - training->Nl = 0; - training->Nk = 0; - training->Ne = 0; - training->Nw = 0; - training->Ng = 0; - training->Nc = 0; - training->Nd = 0; - - // set arrays to NULL - training->weight_idxs = NULL; - training->ps = NULL; - training->lambdas = NULL; - training->kappas = NULL; - training->epsilons = NULL; - training->gammas = NULL; - training->coefs = NULL; - training->degrees = NULL; - training->train_data_file = NULL; - training->test_data_file = NULL; - - return training; -} - -struct Queue *gensvm_init_queue() -{ - struct Queue *q = Malloc(struct Queue, 1); - - q->tasks = NULL; - q->N = 0; - q->i = 0; - - return q; -} - -void gensvm_free_training(struct Training *training) -{ - free(training->weight_idxs); - free(training->ps); - free(training->lambdas); - free(training->kappas); - free(training->epsilons); - free(training->gammas); - free(training->coefs); - free(training->degrees); - free(training->train_data_file); - free(training->test_data_file); - free(training); -} diff --git a/src/GenSVMpred.c b/src/GenSVMpred.c deleted file mode 100644 index 57680b1..0000000 --- a/src/GenSVMpred.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * 20140317: - * THIS FUNCTION IS DEPRECATED, SINCE IT DOES NOT WORK WITH KERNELS. - * - */ - -/** - * @file GenSVM_pred.c - * @author Gertjan van den Burg - * @date January, 2014 - * @brief Command line interface for predicting class labels - * - * @details - * This is a command line program for predicting the class labels or - * determining the predictive performance of a pre-determined model on a given - * test dataset. The predictive performance can be written to the screen or - * the predicted class labels can be written to a specified output file. This - * is done using gensvm_write_predictions(). 
- * - * The specified model file must follow the specification given in - * gensvm_write_model(). - * - * For usage information, see the program help function. - * - */ - -#include "gensvm.h" -#include "gensvm_init.h" -#include "gensvm_io.h" -#include "gensvm_pred.h" -#include "gensvm_util.h" - -#define MINARGS 3 - -extern FILE *GENSVM_OUTPUT_FILE; - -// function declarations -void exit_with_help(); -void parse_command_line(int argc, char **argv, - char *input_filename, char *output_filename, - char *model_filename); - -/** - * @brief Help function - */ -void exit_with_help() -{ - printf("This is GenSVM, version %1.1f\n\n", VERSION); - printf("Usage: predGenSVM [options] test_data_file model_file\n"); - printf("Options:\n"); - printf("-o output_file : write output to file\n"); - printf("-q : quiet mode (no output)\n"); - exit(0); -} - -/** - * @brief Main interface function for predGenSVM - * - * @details - * Main interface for the command line program. A given model file is read and - * a test dataset is initialized from the given data. The predictive - * performance (hitrate) of the model on the test set is printed to the output - * stream (default = stdout). If an output file is specified the predictions - * are written to the file. - * - * @todo - * Ensure that the program can read model files without class labels - * specified. In that case no prediction accuracy is printed to the screen. 
- * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * - */ -int main(int argc, char **argv) -{ - long *predy; - double performance; - - char input_filename[MAX_LINE_LENGTH]; - char model_filename[MAX_LINE_LENGTH]; - char output_filename[MAX_LINE_LENGTH];; - - if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help") - || gensvm_check_argv_eq(argc, argv, "-h") ) - exit_with_help(); - parse_command_line(argc, argv, input_filename, output_filename, - model_filename); - - // read the data and model - struct GenModel *model = gensvm_init_model(); - struct GenData *data = gensvm_init_data(); - gensvm_read_data(data, input_filename); - gensvm_read_model(model, model_filename); - - // check if the number of attributes in data equals that in model - if (data->m != model->m) { - fprintf(stderr, "Error: number of attributes in data (%li) " - "does not equal the number of attributes in " - "model (%li)\n", data->m, model->m); - exit(1); - } else if (data->K != model->K) { - fprintf(stderr, "Error: number of classes in data (%li) " - "does not equal the number of classes in " - "model (%li)\n", data->K, model->K); - exit(1); - } - - // predict labels and performance if test data has labels - predy = Calloc(long, data->n); - gensvm_predict_labels(data, model, predy); - if (data->y != NULL) { - performance = gensvm_prediction_perf(data, predy); - note("Predictive performance: %3.2f%%\n", performance); - } - - // if output file is specified, write predictions to it - if (gensvm_check_argv_eq(argc, argv, "-o")) { - gensvm_write_predictions(data, predy, output_filename); - note("Predictions written to: %s\n", output_filename); - } - - // free the model, data, and predictions - gensvm_free_model(model); - gensvm_free_data(data); - free(predy); - - return 0; -} - -/** - * @brief Parse command line arguments - * - * @details - * Read the data filename and model filename from the command line arguments. 
- * If specified, also read the output filename. If the quiet flag is given, - * set the global output stream to NULL. On error, exit_with_help(). - * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * @param[in] input_filename pre-allocated array for the input - * filename - * @param[in] output_filename pre-allocated array for the output - * filename - * @param[in] model_filename pre-allocated array for the model - * filename - * - */ -void parse_command_line(int argc, char **argv, char *input_filename, - char *output_filename, char *model_filename) -{ - int i; - - GENSVM_OUTPUT_FILE = stdout; - - for (i=1; i<argc; i++) { - if (argv[i][0] != '-') break; - if (++i >= argc) - exit_with_help(); - switch (argv[i-1][1]) { - case 'o': - strcpy(output_filename, argv[i]); - break; - case 'q': - GENSVM_OUTPUT_FILE = NULL; - i--; - break; - default: - fprintf(stderr, "Unknown option: -%c\n", - argv[i-1][1]); - exit_with_help(); - } - } - - if (i >= argc) - exit_with_help(); - - strcpy(input_filename, argv[i]); - i++; - strcpy(model_filename, argv[i]); -} diff --git a/src/GenSVMtrain.c b/src/GenSVMtrain.c deleted file mode 100644 index 3bb9c09..0000000 --- a/src/GenSVMtrain.c +++ /dev/null @@ -1,245 +0,0 @@ -/** - * @file GenSVM_train.c - * @author Gertjan van den Burg - * @date August, 2013 - * @brief Command line interface for training a single model with GenSVM - * - * @details - * This is a command line program for training a single model on a given - * dataset. To run a grid search over a number of parameter configurations, - * see trainGenSVMdataset.c. 
- * - */ - -#include <time.h> -#include <math.h> - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_io.h" -#include "gensvm_init.h" -#include "gensvm_kernel.h" -#include "gensvm_train.h" -#include "gensvm_util.h" - -#define MINARGS 2 - -extern FILE *GENSVM_OUTPUT_FILE; - -// function declarations -void exit_with_help(); -void parse_command_line(int argc, char **argv, struct GenModel *model, - char *input_filename, char *output_filename, char *model_filename); - -/** - * @brief Help function - */ -void exit_with_help() -{ - printf("This is GenSVM, version %1.1f\n\n", VERSION); - printf("Usage: trainGenSVM [options] training_data_file\n"); - printf("Options:\n"); - printf("-c coef : coefficient for the polynomial and sigmoid kernel\n"); - printf("-d degree : degree for the polynomial kernel\n"); - printf("-e epsilon : set the value of the stopping criterion\n"); - printf("-g gamma : parameter for the rbf, polynomial or sigmoid " - "kernel\n"); - printf("-h | -help : print this help.\n"); - printf("-k kappa : set the value of kappa used in the Huber hinge\n"); - printf("-l lambda : set the value of lambda (lambda > 0)\n"); - printf("-m model_file : use previous model as seed for W and t\n"); - printf("-o output_file : write output to file\n"); - printf("-p p-value : set the value of p in the lp norm " - "(1.0 <= p <= 2.0)\n"); - printf("-q : quiet mode (no output)\n"); - printf("-r rho : choose the weigth specification (1 = unit, 2 = " - "group)\n"); - printf("-t type: kerneltype (LINEAR=0, POLY=1, RBF=2, SIGMOID=3)\n"); - - exit(0); -} - -/** - * @brief Main interface function for trainGenSVM - * - * @details - * Main interface for the command line program. A given dataset file is read - * and a GenSVM model is trained on this data. By default the progress of the - * computations are written to stdout. See for full options of the program the - * help function. 
- * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * - */ -int main(int argc, char **argv) -{ - char input_filename[MAX_LINE_LENGTH]; - char model_filename[MAX_LINE_LENGTH]; - char output_filename[MAX_LINE_LENGTH]; - - struct GenModel *model = gensvm_init_model(); - struct GenData *data = gensvm_init_data(); - - if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help") - || gensvm_check_argv_eq(argc, argv, "-h") ) - exit_with_help(); - parse_command_line(argc, argv, model, input_filename, - output_filename, model_filename); - - // read data file - gensvm_read_data(data, input_filename); - - // copy dataset parameters to model - model->n = data->n; - model->m = data->m; - model->K = data->K; - model->data_file = input_filename; - - // allocate model - gensvm_allocate_model(model); - - // initialize kernel (if necessary) - //gensvm_make_kernel(model, data); - - // reallocate model and initialize weights - gensvm_reallocate_model(model, data->n, data->m); - gensvm_initialize_weights(data, model); - - // seed the random number generator (only place in programs is in - // command line interfaces) - srand(time(NULL)); - - if (gensvm_check_argv_eq(argc, argv, "-m")) { - struct GenModel *seed_model = gensvm_init_model(); - gensvm_read_model(seed_model, model_filename); - gensvm_seed_model_V(seed_model, model, data); - gensvm_free_model(seed_model); - } else { - gensvm_seed_model_V(NULL, model, data); - } - - // start training - gensvm_optimize(model, data); - - // write_model to file - if (gensvm_check_argv_eq(argc, argv, "-o")) { - gensvm_write_model(model, output_filename); - note("Output written to %s\n", output_filename); - } - - // free model and data - gensvm_free_model(model); - gensvm_free_data(data); - - return 0; -} - -/** - * @brief Parse command line arguments - * - * @details - * Process the command line arguments for the model parameters, and record - * them in the specified GenModel. 
An input filename for the dataset is read - * and if specified an output filename and a model filename for the seed - * model. - * - * @param[in] argc number of command line arguments - * @param[in] argv array of command line arguments - * @param[in] model initialized model - * @param[in] input_filename pre-allocated buffer for the input - * filename - * @param[in] output_filename pre-allocated buffer for the output - * filename - * @param[in] model_filename pre-allocated buffer for the model - * filename - * - */ -void parse_command_line(int argc, char **argv, struct GenModel *model, - char *input_filename, char *output_filename, char *model_filename) -{ - int i; - double gamma = 1.0, - degree = 2.0, - coef = 0.0; - - GENSVM_OUTPUT_FILE = stdout; - - // parse options - for (i=1; i<argc; i++) { - if (argv[i][0] != '-') break; - if (++i>=argc) { - exit_with_help(); - } - switch (argv[i-1][1]) { - case 'c': - coef = atof(argv[i]); - break; - case 'd': - degree = atof(argv[i]); - break; - case 'e': - model->epsilon = atof(argv[i]); - break; - case 'g': - gamma = atof(argv[i]); - break; - case 'k': - model->kappa = atof(argv[i]); - break; - case 'l': - model->lambda = atof(argv[i]); - break; - case 'm': - strcpy(model_filename, argv[i]); - break; - case 'o': - strcpy(output_filename, argv[i]); - break; - case 'p': - model->p = atof(argv[i]); - break; - case 'r': - model->weight_idx = atoi(argv[i]); - break; - case 't': - model->kerneltype = atoi(argv[i]); - break; - case 'q': - GENSVM_OUTPUT_FILE = NULL; - i--; - break; - default: - fprintf(stderr, "Unknown option: -%c\n", - argv[i-1][1]); - exit_with_help(); - } - } - - // read input filename - if (i >= argc) - exit_with_help(); - - strcpy(input_filename, argv[i]); - - // set kernel parameters - switch (model->kerneltype) { - case K_LINEAR: - break; - case K_POLY: - model->kernelparam = Calloc(double, 3); - model->kernelparam[0] = gamma; - model->kernelparam[1] = coef; - model->kernelparam[2] = degree; - break; - case 
K_RBF: - model->kernelparam = Calloc(double, 1); - model->kernelparam[0] = gamma; - break; - case K_SIGMOID: - model->kernelparam = Calloc(double, 1); - model->kernelparam[0] = gamma; - model->kernelparam[1] = coef; - } -} diff --git a/src/GenSVMtraintest.c b/src/GenSVMtraintest.c index 0199725..47cc900 100644 --- a/src/GenSVMtraintest.c +++ b/src/GenSVMtraintest.c @@ -10,17 +10,11 @@ * */ -#include <time.h> - -#include "globals.h" -#include "gensvm.h" +#include "gensvm_cmdarg.h" #include "gensvm_io.h" #include "gensvm_init.h" -#include "gensvm_kernel.h" -#include "gensvm_train.h" +#include "gensvm_optimize.h" #include "gensvm_pred.h" -#include "libGenSVM.h" -#include "gensvm_util.h" #define MINARGS 2 @@ -118,10 +112,10 @@ int main(int argc, char **argv) if (gensvm_check_argv_eq(argc, argv, "-s")) { struct GenModel *seed_model = gensvm_init_model(); gensvm_read_model(seed_model, model_inputfile); - gensvm_seed_model_V(seed_model, model, traindata); + gensvm_init_V(seed_model, model, traindata); gensvm_free_model(seed_model); } else { - gensvm_seed_model_V(NULL, model, traindata); + gensvm_init_V(NULL, model, traindata); } // start training @@ -274,4 +268,3 @@ void parse_command_line(int argc, char **argv, struct GenModel *model, model->kernelparam[1] = coef; } } - diff --git a/src/gensvm_base.c b/src/gensvm_base.c new file mode 100644 index 0000000..eddef5c --- /dev/null +++ b/src/gensvm_base.c @@ -0,0 +1,211 @@ +/** + * @file gensvm_base.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for initializing GenModel and GenData structures + * + * @details + * This file contains functions for initializing, freeing, allocating, and + * reallocating a GenModel instance. It also contains functions for + * initializing and freeing a GenData structure. In addition, default values + * for these structures are defined here (and only here). 
+ * + */ + +#include "gensvm_base.h" + +/** + * @brief Initialize a GenData structure + * + * @details + * A GenData structure is initialized and default values are set. + * A pointer to the initialized data is returned. + * + * @returns initialized GenData + * + */ +struct GenData *gensvm_init_data() +{ + struct GenData *data = Malloc(struct GenData, 1); + data->Sigma = NULL; + data->y = NULL; + data->Z = NULL; + data->RAW = NULL; + + // set default values + data->kerneltype = K_LINEAR; + data->kernelparam = NULL; + + return data; +} + +/** + * @brief Free allocated GenData struct + * + * @details + * Simply free a previously allocated GenData struct by freeing all its + * components. Note that the data struct itself is also freed here. + * + * @param[in] data GenData struct to free + * + */ +void gensvm_free_data(struct GenData *data) +{ + if (data == NULL) + return; + + if (data->Z == data->RAW) { + free(data->Z); + } else { + free(data->Z); + free(data->RAW); + } + free(data->kernelparam); + free(data->y); + free(data->Sigma); + free(data); +} + +/** + * @brief Initialize a GenModel structure + * + * @details + * A GenModel structure is initialized and the default value for the + * parameters are set. A pointer to the initialized model is returned. 
+ * + * @returns initialized GenModel + */ +struct GenModel *gensvm_init_model() +{ + struct GenModel *model = Malloc(struct GenModel, 1); + + // set default values + model->p = 1.0; + model->lambda = pow(2, -8.0); + model->epsilon = 1e-6; + model->kappa = 0.0; + model->weight_idx = 1; + model->kerneltype = K_LINEAR; + model->kernelparam = NULL; + + model->W = NULL; + model->t = NULL; + model->V = NULL; + model->Vbar = NULL; + model->U = NULL; + model->UU = NULL; + model->Q = NULL; + model->H = NULL; + model->R = NULL; + model->rho = NULL; + model->data_file = NULL; + + return model; +} + +/** + * @brief Allocate memory for a GenModel + * + * @details + * This function can be used to allocate the memory needed for a GenModel. All + * arrays in the model are specified and initialized to 0. + * + * @param[in] model GenModel to allocate + * + */ +void gensvm_allocate_model(struct GenModel *model) +{ + long n = model->n; + long m = model->m; + long K = model->K; + + model->W = Calloc(double, m*(K-1)); + model->t = Calloc(double, K-1); + model->V = Calloc(double, (m+1)*(K-1)); + model->Vbar = Calloc(double, (m+1)*(K-1)); + model->U = Calloc(double, K*(K-1)); + model->UU = Calloc(double, n*K*(K-1)); + model->Q = Calloc(double, n*K); + model->H = Calloc(double, n*K); + model->R = Calloc(double, n*K); + model->rho = Calloc(double, n); +} + +/** + * @brief Reallocate memory for GenModel + * + * @details + * This function can be used to reallocate existing memory for a GenModel, + * upon a change in the model dimensions. This is used in combination with + * kernels. 
+ * + * @param[in] model GenModel to reallocate + * @param[in] n new value of GenModel->n + * @param[in] m new value of GenModel->m + * + */ +void gensvm_reallocate_model(struct GenModel *model, long n, long m) +{ + long K = model->K; + + if (model->n == n && model->m == m) + return; + if (model->n != n) { + model->UU = Realloc(model->UU, double, n*K*(K-1)); + Memset(model->UU, double, n*K*(K-1)); + + model->Q = Realloc(model->Q, double, n*K); + Memset(model->Q, double, n*K); + + model->H = Realloc(model->H, double, n*K); + Memset(model->H, double, n*K); + + model->R = Realloc(model->R, double, n*K); + Memset(model->R, double, n*K); + + model->rho = Realloc(model->rho, double, n); + Memset(model->rho, double, n); + + model->n = n; + } + if (model->m != m) { + model->W = Realloc(model->W, double, m*(K-1)); + Memset(model->W, double, m*(K-1)); + + model->V = Realloc(model->V, double, (m+1)*(K-1)); + Memset(model->V, double, (m+1)*(K-1)); + + model->Vbar = Realloc(model->Vbar, double, (m+1)*(K-1)); + Memset(model->Vbar, double, (m+1)*(K-1)); + + model->m = m; + } +} + +/** + * @brief Free allocated GenModel struct + * + * @details + * Simply free a previously allocated GenModel by freeing all its component + * arrays. Note that the model struct itself is also freed here. 
+ * + * @param[in] model GenModel to free + * + */ +void gensvm_free_model(struct GenModel *model) +{ + free(model->W); + free(model->t); + free(model->V); + free(model->Vbar); + free(model->U); + free(model->UU); + free(model->Q); + free(model->H); + free(model->rho); + free(model->R); + free(model->kernelparam); + + free(model); +} + diff --git a/src/gensvm_cmdarg.c b/src/gensvm_cmdarg.c new file mode 100644 index 0000000..8f796bb --- /dev/null +++ b/src/gensvm_cmdarg.c @@ -0,0 +1,71 @@ +/** + * @file gensvm_cmdarg.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for dealing with command line arguments + * + * @details + * This file contains several utility functions for coordinating input and + * output of data and model files. + * + */ + +#include "gensvm_cmdarg.h" + +/** + * @brief Check if any command line arguments contain string + * + * @details + * Check if any of a given array of command line arguments contains a given + * string. If the string is found, the index of the string in argv is + * returned. If the string is not found, 0 is returned. + * + * This function is copied from MSVMpack/libMSVM.c. + * + * @param[in] argc number of command line arguments + * @param[in] argv command line arguments + * @param[in] str string to find in the arguments + * @returns index of the string in the arguments if found, 0 + * otherwise + */ +int gensvm_check_argv(int argc, char **argv, char *str) +{ + int i; + int arg_str = 0; + for (i=1; i<argc; i++) + if (strstr(argv[i], str) != NULL) { + arg_str = i; + break; + } + + return arg_str; +} + +/** + * @brief Check if a command line argument equals a string + * + * @details + * Check if any of the command line arguments is exactly equal to a given + * string. If so, return the index of the corresponding command line argument. + * If not, return 0. 
+ * + * This function is copied from MSVMpack/libMSVM.c + * + * @param[in] argc number of command line arguments + * @param[in] argv command line arguments + * @param[in] str string to find in the arguments + * @returns index of the command line argument that corresponds to + * the string, 0 if none matches. + */ +int gensvm_check_argv_eq(int argc, char **argv, char *str) +{ + int i; + int arg_str = 0; + for (i=1; i<argc; i++) + if (strcmp(argv[i], str) == 0) { + arg_str = i; + break; + } + + return arg_str; +} diff --git a/src/gensvm_copy.c b/src/gensvm_copy.c new file mode 100644 index 0000000..501e405 --- /dev/null +++ b/src/gensvm_copy.c @@ -0,0 +1,49 @@ +/** + * @file gensvm_copy.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Function for copying a GenModel instance + * + */ + +#include "gensvm_copy.h" + +/** + * @brief Copy model parameters between two GenModel structs + * + * @details + * The parameters copied are GenModel::weight_idx, GenModel::epsilon, + * GenModel::p, GenModel::kappa, and GenModel::lambda. 
+ * + * @param[in] from GenModel to copy parameters from + * @param[in,out] to GenModel to copy parameters to + */ +void gensvm_copy_model(struct GenModel *from, struct GenModel *to) +{ + to->weight_idx = from->weight_idx; + to->epsilon = from->epsilon; + to->p = from->p; + to->kappa = from->kappa; + to->lambda = from->lambda; + + to->kerneltype = from->kerneltype; + switch (to->kerneltype) { + case K_LINEAR: + break; + case K_POLY: + to->kernelparam = Malloc(double, 3); + to->kernelparam[0] = from->kernelparam[0]; + to->kernelparam[1] = from->kernelparam[1]; + to->kernelparam[2] = from->kernelparam[2]; + break; + case K_RBF: + to->kernelparam = Malloc(double, 1); + to->kernelparam[0] = from->kernelparam[0]; + break; + case K_SIGMOID: + to->kernelparam = Malloc(double, 2); + to->kernelparam[0] = from->kernelparam[0]; + to->kernelparam[1] = from->kernelparam[1]; + break; + } +} diff --git a/src/gensvm_crossval.c b/src/gensvm_cv_util.c index 8f09cb5..d9cde09 100644 --- a/src/gensvm_crossval.c +++ b/src/gensvm_cv_util.c @@ -1,5 +1,5 @@ /** - * @file crossval.c + * @file gensvm_cv_util.c * @author Gertjan van den Burg * @date January 7, 2014 * @brief Functions for cross validation @@ -13,10 +13,7 @@ * */ -#include "globals.h" -#include "gensvm.h" -#include "gensvm_crossval.h" -#include "gensvm_matrix.h" +#include "gensvm_cv_util.h" /** * @brief Create a cross validation split vector diff --git a/src/gensvm_matrix.c b/src/gensvm_debug.c index c2e5986..d94711a 100644 --- a/src/gensvm_matrix.c +++ b/src/gensvm_debug.c @@ -1,19 +1,15 @@ /** - * @file gensvm_matrix.c + * @file gensvm_debug.c * @author Gertjan van den Burg - * @date August, 2013 - * @brief Functions facilitating matrix access + * @date May, 2016 + * @brief Functions facilitating debugging * * @details - * The functions contained in this file are used when - * accessing or writing to matrices. Seperate functions - * exist of adding and multiplying existing matrix - * elements, to ensure this is done in place. 
+ * Defines functions useful for debugging matrices. * */ -#include "gensvm_matrix.h" -#include "gensvm_util.h" +#include "gensvm_debug.h" /** * @brief print a matrix diff --git a/src/gensvm_grid.c b/src/gensvm_grid.c new file mode 100644 index 0000000..ea25b89 --- /dev/null +++ b/src/gensvm_grid.c @@ -0,0 +1,62 @@ +/** + * @file gensvm_grid.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for initializing GenGrid structures + * + * @details + * This file contains functions for initializing and freeing a GenGrid + * instance. In addition, default values for this structure are defined here + * (and only here). + * + */ + +#include "gensvm_grid.h" + +struct GenGrid *gensvm_init_grid() +{ + struct GenGrid *grid = Malloc(struct GenGrid, 1); + + // initialize to defaults + grid->traintype = CV; + grid->kerneltype = K_LINEAR; + grid->repeats = 0; + grid->folds = 10; + grid->Np = 0; + grid->Nl = 0; + grid->Nk = 0; + grid->Ne = 0; + grid->Nw = 0; + grid->Ng = 0; + grid->Nc = 0; + grid->Nd = 0; + + // set arrays to NULL + grid->weight_idxs = NULL; + grid->ps = NULL; + grid->lambdas = NULL; + grid->kappas = NULL; + grid->epsilons = NULL; + grid->gammas = NULL; + grid->coefs = NULL; + grid->degrees = NULL; + grid->train_data_file = NULL; + grid->test_data_file = NULL; + + return grid; +} + +void gensvm_free_grid(struct GenGrid *grid) +{ + free(grid->weight_idxs); + free(grid->ps); + free(grid->lambdas); + free(grid->kappas); + free(grid->epsilons); + free(grid->gammas); + free(grid->coefs); + free(grid->degrees); + free(grid->train_data_file); + free(grid->test_data_file); + free(grid); +} diff --git a/src/gensvm_train_dataset.c b/src/gensvm_gridsearch.c index d1650a7..deee033 100644 --- a/src/gensvm_train_dataset.c +++ b/src/gensvm_gridsearch.c @@ -9,74 +9,60 @@ * this file are used to find the optimal parameters. 
*/ -#include <math.h> -#include <time.h> - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_crossval.h" -#include "gensvm_init.h" -#include "gensvm_kernel.h" -#include "gensvm_matrix.h" -#include "gensvm_train.h" -#include "gensvm_train_dataset.h" -#include "gensvm_pred.h" -#include "gensvm_util.h" -#include "gensvm_timer.h" +#include "gensvm_gridsearch.h" extern FILE *GENSVM_OUTPUT_FILE; /** - * @brief Initialize a Queue from a Training instance + * @brief Initialize a GenQueue from a Training instance * * @details * A Training instance describes the grid to search over. This funtion * creates all tasks that need to be performed and adds these to - * a Queue. Each task contains a pointer to the train and test datasets + * a GenQueue. Each task contains a pointer to the train and test datasets * which are supplied. Note that the tasks are created in a specific order of * the parameters, to ensure that the GenModel::V of a previous parameter * set provides the best possible initial estimate of GenModel::V for the next * parameter set. 
* - * @param[in] training Training struct describing the grid search - * @param[in] queue pointer to a Queue that will be used to + * @param[in] grid Training struct describing the grid search + * @param[in] queue pointer to a GenQueue that will be used to * add the tasks to * @param[in] train_data GenData of the training set * @param[in] test_data GenData of the test set * */ -void make_queue(struct Training *training, struct Queue *queue, +void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue, struct GenData *train_data, struct GenData *test_data) { long i, j, k; long N, cnt = 0; - struct Task *task; + struct GenTask *task; queue->i = 0; - N = training->Np; - N *= training->Nl; - N *= training->Nk; - N *= training->Ne; - N *= training->Nw; + N = grid->Np; + N *= grid->Nl; + N *= grid->Nk; + N *= grid->Ne; + N *= grid->Nw; // these parameters are not necessarily non-zero - N *= training->Ng > 0 ? training->Ng : 1; - N *= training->Nc > 0 ? training->Nc : 1; - N *= training->Nd > 0 ? training->Nd : 1; + N *= grid->Ng > 0 ? grid->Ng : 1; + N *= grid->Nc > 0 ? grid->Nc : 1; + N *= grid->Nd > 0 ? 
grid->Nd : 1; - queue->tasks = Calloc(struct Task *, N); + queue->tasks = Calloc(struct GenTask *, N); queue->N = N; // initialize all tasks for (i=0; i<N; i++) { - task = Calloc(struct Task, 1); + task = gensvm_init_task(); task->ID = i; task->train_data = train_data; task->test_data = test_data; - task->folds = training->folds; - task->kerneltype = training->kerneltype; - task->kernelparam = Calloc(double, training->Ng + - training->Nc + training->Nd); + task->folds = grid->folds; + task->kerneltype = grid->kerneltype; + task->kernelparam = Calloc(double, grid->Ng + + grid->Nc + grid->Nd); queue->tasks[i] = task; } @@ -87,125 +73,83 @@ void make_queue(struct Training *training, struct Queue *queue, cnt = 1; i = 0; while (i < N ) - for (j=0; j<training->Np; j++) + for (j=0; j<grid->Np; j++) for (k=0; k<cnt; k++) { - queue->tasks[i]->p = training->ps[j]; + queue->tasks[i]->p = grid->ps[j]; i++; } - cnt *= training->Np; + cnt *= grid->Np; i = 0; while (i < N ) - for (j=0; j<training->Nl; j++) + for (j=0; j<grid->Nl; j++) for (k=0; k<cnt; k++) { queue->tasks[i]->lambda = - training->lambdas[j]; + grid->lambdas[j]; i++; } - cnt *= training->Nl; + cnt *= grid->Nl; i = 0; while (i < N ) - for (j=0; j<training->Nk; j++) + for (j=0; j<grid->Nk; j++) for (k=0; k<cnt; k++) { - queue->tasks[i]->kappa = training->kappas[j]; + queue->tasks[i]->kappa = grid->kappas[j]; i++; } - cnt *= training->Nk; + cnt *= grid->Nk; i = 0; while (i < N ) - for (j=0; j<training->Nw; j++) + for (j=0; j<grid->Nw; j++) for (k=0; k<cnt; k++) { queue->tasks[i]->weight_idx = - training->weight_idxs[j]; + grid->weight_idxs[j]; i++; } - cnt *= training->Nw; + cnt *= grid->Nw; i = 0; while (i < N ) - for (j=0; j<training->Ne; j++) + for (j=0; j<grid->Ne; j++) for (k=0; k<cnt; k++) { queue->tasks[i]->epsilon = - training->epsilons[j]; + grid->epsilons[j]; i++; } - cnt *= training->Ne; + cnt *= grid->Ne; i = 0; - while (i < N && training->Ng > 0) - for (j=0; j<training->Ng; j++) + while (i < N && grid->Ng 
> 0) + for (j=0; j<grid->Ng; j++) for (k=0; k<cnt; k++) { queue->tasks[i]->kernelparam[0] = - training->gammas[j]; + grid->gammas[j]; i++; } - cnt *= training->Ng > 0 ? training->Ng : 1; + cnt *= grid->Ng > 0 ? grid->Ng : 1; i = 0; - while (i < N && training->Nc > 0) - for (j=0; j<training->Nc; j++) + while (i < N && grid->Nc > 0) + for (j=0; j<grid->Nc; j++) for (k=0; k<cnt; k++) { queue->tasks[i]->kernelparam[1] = - training->coefs[j]; + grid->coefs[j]; i++; } - cnt *= training->Nc > 0 ? training->Nc : 1; + cnt *= grid->Nc > 0 ? grid->Nc : 1; i = 0; - while (i < N && training->Nd > 0) - for (j=0; j<training->Nd; j++) + while (i < N && grid->Nd > 0) + for (j=0; j<grid->Nd; j++) for (k=0; k<cnt; k++) { queue->tasks[i]->kernelparam[2] = - training->degrees[j]; + grid->degrees[j]; i++; } } /** - * @brief Get new Task from Queue - * - * @details - * Return a pointer to the next Task in the Queue. If no Task instances are - * left, NULL is returned. The internal counter Queue::i is used for finding - * the next Task. - * - * @param[in] q Queue instance - * @returns pointer to next Task - * - */ -struct Task *get_next_task(struct Queue *q) -{ - long i = q->i; - if (i < q->N) { - q->i++; - return q->tasks[i]; - } - return NULL; -} - -/** - * @brief Comparison function for Tasks based on performance - * - * @details - * To be able to sort Task structures on the performance of their specific - * set of parameters, this comparison function is implemented. Task structs - * are sorted with highest performance first. 
- * - * @param[in] elem1 Task 1 - * @param[in] elem2 Task 2 - * @returns result of inequality of Task 1 performance over - * Task 2 performance - */ -int tasksort(const void *elem1, const void *elem2) -{ - const struct Task *t1 = (*(struct Task **) elem1); - const struct Task *t2 = (*(struct Task **) elem2); - return (t1->performance > t2->performance); -} - -/** * @brief Comparison function for doubl * * @param[in] elem1 number 1 @@ -256,11 +200,11 @@ double prctile(double *values, long N, double p) return boundary; } -struct Queue *create_top_queue(struct Queue *q) +struct GenQueue *create_top_queue(struct GenQueue *q) { long i, k, N = 0; double boundary, *perf; - struct Queue *nq = Malloc(struct Queue, 1); + struct GenQueue *nq = Malloc(struct GenQueue, 1); // find the 95th percentile of performance perf = Calloc(double, q->N); @@ -278,7 +222,7 @@ struct Queue *create_top_queue(struct Queue *q) } // create a new queue with the best tasks - nq->tasks = Malloc(struct Task *, N); + nq->tasks = Malloc(struct GenTask *, N); k = 0; for (i=0; i<q->N; i++) { if (q->tasks[i]->performance >= boundary) @@ -292,17 +236,17 @@ struct Queue *create_top_queue(struct Queue *q) /** - * @brief Run repeats of the Task structs in Queue to find the best + * @brief Run repeats of the GenTask structs in GenQueue to find the best * configuration * * @details - * The best performing tasks in the supplied Queue are found by taking those - * Task structs that have a performance greater or equal to the 95% percentile + * The best performing tasks in the supplied GenQueue are found by taking those + * GenTask structs that have a performance greater or equal to the 95% percentile * of the performance of all tasks. These tasks are then gathered in a new - * Queue. For each of the tasks in this new Queue the cross validation run is + * GenQueue. For each of the tasks in this new GenQueue the cross validation run is * repeated a number of times. 
* - * For each of the Task configurations that are repeated the mean performance, + * For each of the GenTask configurations that are repeated the mean performance, * standard deviation of the performance and the mean computation time are * reported. * @@ -318,21 +262,21 @@ struct Queue *create_top_queue(struct Queue *q) * an interval is found which contains tasks. If one or more tasks are found, * this loop stops. * - * @param[in] q Queue of Task structs which have already been - * run and have a Task::performance value + * @param[in] q GenQueue of GenTask structs which have already been + * run and have a GenTask::performance value * @param[in] repeats Number of times to repeat the best * configurations for consistency * @param[in] traintype type of training to do (CV or TT) * */ -void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) +void consistency_repeats(struct GenQueue *q, long repeats, TrainType traintype) { long i, f, r, N, *cv_idx; double p, pi, pr, pt, *time, *std, *mean, *perf; - struct Queue *nq; + struct GenQueue *nq; struct GenData **train_folds, **test_folds; struct GenModel *model = gensvm_init_model(); - struct Task *task = NULL; + struct GenTask *task = NULL; clock_t loop_s, loop_e; nq = create_top_queue(q); @@ -350,7 +294,7 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) model->m = task->train_data->m; model->K = task->train_data->K; gensvm_allocate_model(model); - gensvm_seed_model_V(NULL, model, task->train_data); + gensvm_init_V(NULL, model, task->train_data); cv_idx = Calloc(long, task->train_data->n); @@ -382,13 +326,13 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) p = gensvm_cross_validation(model, train_folds, test_folds, task->folds, task->train_data->n); loop_e = clock(); - time[i] += elapsed_time(loop_s, loop_e); + time[i] += gensvm_elapsed_time(loop_s, loop_e); matrix_set(perf, repeats, i, r, p); mean[i] += p/((double) repeats); note("%3.3f\t", 
p); // this is done because if we reuse the V it's not a // consistency check - gensvm_seed_model_V(NULL, model, task->train_data); + gensvm_init_V(NULL, model, task->train_data); for (f=0; f<task->folds; f++) { gensvm_free_data(train_folds[f]); gensvm_free_data(test_folds[f]); @@ -397,7 +341,8 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) free(test_folds); } for (r=0; r<repeats; r++) { - std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i], 2.0); + std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i], + 2.0); } if (r > 1) { std[i] /= ((double) repeats) - 1.0; @@ -405,7 +350,8 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) } else { std[i] = 0.0; } - note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], time[i]); + note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i], + time[i]); task = get_next_task(nq); i++; } @@ -469,7 +415,7 @@ void consistency_repeats(struct Queue *q, long repeats, TrainType traintype) * @param[in] oldtask the old task * @return whether the kernel needs to be reevaluated */ -bool kernel_changed(struct Task *newtask, struct Task *oldtask) +bool kernel_changed(struct GenTask *newtask, struct GenTask *oldtask) { int i; if (oldtask == NULL) @@ -495,10 +441,10 @@ bool kernel_changed(struct Task *newtask, struct Task *oldtask) } /** - * @brief Run the grid search for a Queue + * @brief Run the grid search for a GenQueue * * @details - * Given a Queue of Task struct to be trained, a grid search is launched to + * Given a GenQueue of GenTask struct to be trained, a grid search is launched to * find the optimal parameter configuration. As is also done within * cross_validation(), the optimal weights of one parameter set are used as * initial estimates for GenModel::V in the next parameter set. Note that to @@ -506,17 +452,17 @@ bool kernel_changed(struct Task *newtask, struct Task *oldtask) * which tasks are considered is important. 
This is considered in * make_queue(). * - * The performance found by cross validation is stored in the Task struct. + * The performance found by cross validation is stored in the GenTask struct. * - * @param[in,out] q Queue with Task instances to run + * @param[in,out] q GenQueue with GenTask instances to run */ -void start_training(struct Queue *q) +void start_training(struct GenQueue *q) { int f, folds; double perf, current_max = 0; - struct Task *task = get_next_task(q); - struct Task *prevtask = NULL; + struct GenTask *task = get_next_task(q); + struct GenTask *prevtask = NULL; struct GenModel *model = gensvm_init_model(); clock_t main_s, main_e, loop_s, loop_e; @@ -528,7 +474,7 @@ void start_training(struct Queue *q) model->m = task->train_data->m; model->K = task->train_data->K; gensvm_allocate_model(model); - gensvm_seed_model_V(NULL, model, task->train_data); + gensvm_init_V(NULL, model, task->train_data); long *cv_idx = Calloc(long, task->train_data->n); gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx); @@ -568,7 +514,8 @@ void start_training(struct Queue *q) current_max = maximum(current_max, perf); note("\t%3.3f%% (%3.3fs)\t(best = %3.3f%%)\n", perf, - elapsed_time(loop_s, loop_e), current_max); + gensvm_elapsed_time(loop_s, loop_e), + current_max); q->tasks[task->ID]->performance = perf; prevtask = task; @@ -577,7 +524,7 @@ void start_training(struct Queue *q) main_e = clock(); note("\nTotal elapsed training time: %8.8f seconds\n", - elapsed_time(main_s, main_e)); + gensvm_elapsed_time(main_s, main_e)); // make sure no double free occurs with the copied kernelparam model->kernelparam = NULL; @@ -648,37 +595,16 @@ double gensvm_cross_validation(struct GenModel *model, /** - * @brief Free the Queue struct + * @brief Copy parameters from GenTask to GenModel * * @details - * Freeing the allocated memory of the Queue means freeing every Task struct - * and then freeing the Queue. 
- * - * @param[in] q Queue to be freed - * - */ -void free_queue(struct Queue *q) -{ - long i; - for (i=0; i<q->N; i++) { - free(q->tasks[i]->kernelparam); - free(q->tasks[i]); - } - free(q->tasks); - free(q); -} - -/** - * @brief Copy parameters from Task to GenModel - * - * @details - * A Task struct only contains the parameters of the GenModel to be estimated. + * A GenTask struct only contains the parameters of the GenModel to be estimated. * This function is used to copy these parameters. * - * @param[in] task Task instance with parameters + * @param[in] task GenTask instance with parameters * @param[in,out] model GenModel to which the parameters are copied */ -void make_model_from_task(struct Task *task, struct GenModel *model) +void make_model_from_task(struct GenTask *task, struct GenModel *model) { // copy basic model parameters model->weight_idx = task->weight_idx; @@ -693,46 +619,6 @@ void make_model_from_task(struct Task *task, struct GenModel *model) } /** - * @brief Copy model parameters between two GenModel structs - * - * @details - * The parameters copied are GenModel::weight_idx, GenModel::epsilon, - * GenModel::p, GenModel::kappa, and GenModel::lambda. 
- * - * @param[in] from GenModel to copy parameters from - * @param[in,out] to GenModel to copy parameters to - */ -void copy_model(struct GenModel *from, struct GenModel *to) -{ - to->weight_idx = from->weight_idx; - to->epsilon = from->epsilon; - to->p = from->p; - to->kappa = from->kappa; - to->lambda = from->lambda; - - to->kerneltype = from->kerneltype; - switch (to->kerneltype) { - case K_LINEAR: - break; - case K_POLY: - to->kernelparam = Malloc(double, 3); - to->kernelparam[0] = from->kernelparam[0]; - to->kernelparam[1] = from->kernelparam[1]; - to->kernelparam[2] = from->kernelparam[2]; - break; - case K_RBF: - to->kernelparam = Malloc(double, 1); - to->kernelparam[0] = from->kernelparam[0]; - break; - case K_SIGMOID: - to->kernelparam = Malloc(double, 2); - to->kernelparam[0] = from->kernelparam[0]; - to->kernelparam[1] = from->kernelparam[1]; - break; - } -} - -/** * @brief Print the description of the current task on screen * * @details @@ -741,11 +627,11 @@ void copy_model(struct GenModel *from, struct GenModel *to) * parameters differ with the specified kernel, this function writes a * parameter string depending on which kernel is used. * - * @param[in] task the Task specified + * @param[in] task the GenTask specified * @param[in] N total number of tasks * */ -void print_progress_string(struct Task *task, long N) +void print_progress_string(struct GenTask *task, long N) { char buffer[MAX_LINE_LENGTH]; sprintf(buffer, "(%03li/%03li)\t", task->ID+1, N); diff --git a/src/gensvm_init.c b/src/gensvm_init.c index 947c60a..28f34ba 100644 --- a/src/gensvm_init.c +++ b/src/gensvm_init.c @@ -13,201 +13,100 @@ * */ -#include <math.h> - #include "gensvm_init.h" -/** - * @brief Initialize a GenModel structure - * - * @details - * A GenModel structure is initialized and the default value for the - * parameters are set. A pointer to the initialized model is returned. 
- * - * @returns initialized GenModel - */ -struct GenModel *gensvm_init_model() -{ - struct GenModel *model = Malloc(struct GenModel, 1); - - // set default values - model->p = 1.0; - model->lambda = pow(2, -8.0); - model->epsilon = 1e-6; - model->kappa = 0.0; - model->weight_idx = 1; - model->kerneltype = K_LINEAR; - model->kernelparam = NULL; - - model->W = NULL; - model->t = NULL; - model->V = NULL; - model->Vbar = NULL; - model->U = NULL; - model->UU = NULL; - model->Q = NULL; - model->H = NULL; - model->R = NULL; - model->rho = NULL; - model->data_file = NULL; - - return model; -} +inline double rnd() { return (double) rand()/0x7FFFFFFF; } /** - * @brief Initialize a GenData structure + * @brief seed the matrix V from an existing model or using rand * * @details - * A GenData structure is initialized and default values are set. - * A pointer to the initialized data is returned. - * - * @returns initialized GenData - * + * The matrix V must be seeded before the main_loop() can start. + * This can be done by either seeding it with random numbers or + * using the solution from a previous model on the same dataset + * as initial seed. The latter option usually allows for a + * significant improvement in the number of iterations necessary + * because the seeded model V is closer to the optimal V. 
+ * + * @param[in] from_model GenModel from which to copy V + * @param[in,out] to_model GenModel to which V will be copied */ -struct GenData *gensvm_init_data() +void gensvm_init_V(struct GenModel *from_model, + struct GenModel *to_model, struct GenData *data) { - struct GenData *data = Malloc(struct GenData, 1); - data->Sigma = NULL; - data->y = NULL; - data->Z = NULL; - data->RAW = NULL; - - // set default values - data->kerneltype = K_LINEAR; - data->kernelparam = NULL; - - return data; + long i, j, k; + double cmin, cmax, value; + + long n = data->n; + long m = data->m; + long K = data->K; + + if (from_model == NULL) { + for (i=0; i<m+1; i++) { + cmin = 1e100; + cmax = -1e100; + for (k=0; k<n; k++) { + value = matrix_get(data->Z, m+1, k, i); + cmin = minimum(cmin, value); + cmax = maximum(cmax, value); + } + for (j=0; j<K-1; j++) { + cmin = (abs(cmin) < 1e-10) ? -1 : cmin; + cmax = (abs(cmax) < 1e-10) ? 1 : cmax; + value = 1.0/cmin + (1.0/cmax - 1.0/cmin)*rnd(); + matrix_set(to_model->V, K-1, i, j, value); + } + } + } else { + for (i=0; i<m+1; i++) + for (j=0; j<K-1; j++) { + value = matrix_get(from_model->V, K-1, i, j); + matrix_set(to_model->V, K-1, i, j, value); + } + } } /** - * @brief Allocate memory for a GenModel + * @brief Initialize instance weights * * @details - * This function can be used to allocate the memory needed for a GenModel. All - * arrays in the model are specified and initialized to 0. - * - * @param[in] model GenModel to allocate - * + * Instance weights can for instance be used to add additional weights to + * instances of certain classes. Two default weight possibilities are + * implemented here. The first is unit weights, where each instance gets + * weight 1. + * + * The second are group size correction weights, which are calculated as + * @f[ + * \rho_i = \frac{n}{Kn_k} , + * @f] + * where @f$ n_k @f$ is the number of instances in group @f$ k @f$ and + * @f$ y_i = k @f$. 
+ * + * @param[in] data GenData with the dataset + * @param[in,out] model GenModel with the weight specification. On + * exit GenModel::rho contains the instance + * weights. */ -void gensvm_allocate_model(struct GenModel *model) +void gensvm_initialize_weights(struct GenData *data, struct GenModel *model) { - long n = model->n; - long m = model->m; - long K = model->K; - - model->W = Calloc(double, m*(K-1)); - model->t = Calloc(double, K-1); - model->V = Calloc(double, (m+1)*(K-1)); - model->Vbar = Calloc(double, (m+1)*(K-1)); - model->U = Calloc(double, K*(K-1)); - model->UU = Calloc(double, n*K*(K-1)); - model->Q = Calloc(double, n*K); - model->H = Calloc(double, n*K); - model->R = Calloc(double, n*K); - model->rho = Calloc(double, n); -} + long *groups; + long i; -/** - * @brief Reallocate memory for GenModel - * - * @details - * This function can be used to reallocate existing memory for a GenModel, - * upon a change in the model dimensions. This is used in combination with - * kernels. 
- * - * @param[in] model GenModel to reallocate - * @param[in] n new value of GenModel->n - * @param[in] m new value of GenModel->m - * - */ -void gensvm_reallocate_model(struct GenModel *model, long n, long m) -{ + long n = model->n; long K = model->K; - if (model->n == n && model->m == m) - return; - if (model->n != n) { - model->UU = Realloc(model->UU, double, n*K*(K-1)); - Memset(model->UU, double, n*K*(K-1)); - - model->Q = Realloc(model->Q, double, n*K); - Memset(model->Q, double, n*K); - - model->H = Realloc(model->H, double, n*K); - Memset(model->H, double, n*K); - - model->R = Realloc(model->R, double, n*K); - Memset(model->R, double, n*K); - - model->rho = Realloc(model->rho, double, n); - Memset(model->rho, double, n); - - model->n = n; + if (model->weight_idx == 1) { + for (i=0; i<n; i++) + model->rho[i] = 1.0; } - if (model->m != m) { - model->W = Realloc(model->W, double, m*(K-1)); - Memset(model->W, double, m*(K-1)); - - model->V = Realloc(model->V, double, (m+1)*(K-1)); - Memset(model->V, double, (m+1)*(K-1)); - - model->Vbar = Realloc(model->Vbar, double, (m+1)*(K-1)); - Memset(model->Vbar, double, (m+1)*(K-1)); - - model->m = m; - } -} - -/** - * @brief Free allocated GenModel struct - * - * @details - * Simply free a previously allocated GenModel by freeing all its component - * arrays. Note that the model struct itself is also freed here. - * - * @param[in] model GenModel to free - * - */ -void gensvm_free_model(struct GenModel *model) -{ - free(model->W); - free(model->t); - free(model->V); - free(model->Vbar); - free(model->U); - free(model->UU); - free(model->Q); - free(model->H); - free(model->rho); - free(model->R); - free(model->kernelparam); - - free(model); -} - -/** - * @brief Free allocated GenData struct - * - * @details - * Simply free a previously allocated GenData struct by freeing all its - * components. Note that the data struct itself is also freed here. 
- * - * @param[in] data GenData struct to free - * - */ -void gensvm_free_data(struct GenData *data) -{ - if (data == NULL) - return; - - if (data->Z == data->RAW) { - free(data->Z); + else if (model->weight_idx == 2) { + groups = Calloc(long, K); + for (i=0; i<n; i++) + groups[data->y[i]-1]++; + for (i=0; i<n; i++) + model->rho[i] = ((double) n)/((double) (groups[data->y[i]-1]*K)); } else { - free(data->Z); - free(data->RAW); + fprintf(stderr, "Unknown weight specification.\n"); + exit(1); } - free(data->kernelparam); - free(data->y); - free(data->Sigma); - free(data); } diff --git a/src/gensvm_io.c b/src/gensvm_io.c index c4798d8..696f46f 100644 --- a/src/gensvm_io.c +++ b/src/gensvm_io.c @@ -6,16 +6,12 @@ * * @details * This file contains functions for reading and writing model files, and data - * files. - * + * files. It also contains a function for generating a string of the current + * time, used in writing output files. +* */ -#include "globals.h" -#include "gensvm.h" #include "gensvm_io.h" -#include "gensvm_matrix.h" -#include "gensvm_strutil.h" -#include "gensvm_timer.h" /** * @brief Read data from file @@ -226,7 +222,7 @@ void gensvm_write_model(struct GenModel *model, char *output_filename) output_filename); exit(1); } - get_time_string(timestr); + gensvm_time_string(timestr); // Write output to file fprintf(fid, "Output file for GenSVM (version %1.1f)\n", VERSION); @@ -298,3 +294,49 @@ void gensvm_write_predictions(struct GenData *data, long *predy, fclose(fid); } + +/** + * @brief Get time string with UTC offset + * + * @details + * Create a string for the current system time. Include an offset of UTC for + * consistency. The format of the generated string is "DDD MMM D HH:MM:SS + * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)". 
+ * + * @param[in,out] buffer allocated string buffer, on exit contains + * formatted string + * + */ +void gensvm_time_string(char *buffer) +{ + int diff, hours, minutes; + char timestr[MAX_LINE_LENGTH]; + time_t current_time, lt, gt; + struct tm *lclt; + + // get current time (in epoch) + current_time = time(NULL); + if (current_time == ((time_t)-1)) { + fprintf(stderr, "Failed to compute the current time.\n"); + return; + } + + // convert time to local time and create a string + lclt = localtime(&current_time); + strftime(timestr, MAX_LINE_LENGTH, "%c", lclt); + if (timestr == NULL) { + fprintf(stderr, "Failed to convert time to string.\n"); + return; + } + + // calculate the UTC offset including DST + lt = mktime(localtime(&current_time)); + gt = mktime(gmtime(&current_time)); + diff = -difftime(gt, lt); + hours = (diff/3600); + minutes = (diff%3600)/60; + if (lclt->tm_isdst == 1) + hours++; + + sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes); +} diff --git a/src/gensvm_kernel.c b/src/gensvm_kernel.c index 1d7e5e4..a4f2277 100644 --- a/src/gensvm_kernel.c +++ b/src/gensvm_kernel.c @@ -11,15 +11,7 @@ * */ -#include <cblas.h> -#include <math.h> - -#include "globals.h" -#include "gensvm.h" #include "gensvm_kernel.h" -#include "gensvm_lapack.h" -#include "gensvm_matrix.h" -#include "gensvm_util.h" /** * @brief Do the preprocessing steps needed to perform kernel GenSVM @@ -451,3 +443,46 @@ double gensvm_dot_sigmoid(double *x1, double *x2, double *kernelparam, long n) value += kernelparam[1]; return tanh(value); } + +/** + * @brief Compute the eigenvalues and optionally the eigenvectors of a + * symmetric matrix. + * + * @details + * This is a wrapper function around the external LAPACK function. 
+ * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d2/d97/dsyevx_8f.html + * + * + */ +int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, + double VL, double VU, int IL, int IU, double ABSTOL, int *M, + double *W, double *Z, int LDZ, double *WORK, int LWORK, + int *IWORK, int *IFAIL) +{ + extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *Np, + double *A, int *LDAp, double *VLp, double *VUp, + int *ILp, int *IUp, double *ABSTOLp, int *M, + double *W, double *Z, int *LDZp, double *WORK, + int *LWORKp, int *IWORK, int *IFAIL, int *INFOp); + int INFO; + dsyevx_(&JOBZ, &RANGE, &UPLO, &N, A, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, + M, W, Z, &LDZ, WORK, &LWORK, IWORK, IFAIL, &INFO); + return INFO; +} + +/** + * @brief Determine double precision machine parameters. + * + * @details + * This is a wrapper function around the external LAPACK function. + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d5/dd4/dlamch_8f.html + */ +double dlamch(char CMACH) +{ + extern double dlamch_(char *CMACH); + return dlamch_(&CMACH); +} diff --git a/src/gensvm_lapack.c b/src/gensvm_lapack.c index 56dfc20..2a9c120 100644 --- a/src/gensvm_lapack.c +++ b/src/gensvm_lapack.c @@ -11,125 +11,3 @@ #include "gensvm_lapack.h" -/** - * @brief Solve AX = B where A is symmetric positive definite. - * - * @details - * Solve a linear system of equations AX = B where A is symmetric positive - * definite. This function uses the externel LAPACK routine dposv. - * - * @param[in] UPLO which triangle of A is stored - * @param[in] N order of A - * @param[in] NRHS number of columns of B - * @param[in,out] A double precision array of size (LDA, N). On - * exit contains the upper or lower factor of the - * Cholesky factorization of A. - * @param[in] LDA leading dimension of A - * @param[in,out] B double precision array of size (LDB, NRHS). On - * exit contains the N-by-NRHS solution matrix X. 
- * @param[in] LDB the leading dimension of B - * @returns info parameter which contains the status of the - * computation: - * - =0: success - * - <0: if -i, the i-th argument had - * an illegal value - * - >0: if i, the leading minor of A - * was not positive definite - * - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/dc/de9/group__double_p_osolve.html - */ -int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, - int LDB) -{ - extern void dposv_(char *UPLO, int *Np, int *NRHSp, double *A, - int *LDAp, double *B, int *LDBp, int *INFOp); - int INFO; - dposv_(&UPLO, &N, &NRHS, A, &LDA, B, &LDB, &INFO); - return INFO; -} - -/** - * @brief Solve a system of equations AX = B where A is symmetric. - * - * @details - * Solve a linear system of equations AX = B where A is symmetric. This - * function uses the external LAPACK routine dsysv. - * - * @param[in] UPLO which triangle of A is stored - * @param[in] N order of A - * @param[in] NRHS number of columns of B - * @param[in,out] A double precision array of size (LDA, N). On - * exit contains the block diagonal matrix D and - * the multipliers used to obtain the factor U or - * L from the factorization A = U*D*U**T or - * A = L*D*L**T. - * @param[in] LDA leading dimension of A - * @param[in] IPIV integer array containing the details of D - * @param[in,out] B double precision array of size (LDB, NRHS). On - * exit contains the N-by-NRHS matrix X - * @param[in] LDB leading dimension of B - * @param[out] WORK double precision array of size max(1,LWORK). On - * exit, WORK(1) contains the optimal LWORK - * @param[in] LWORK the length of WORK, can be used for determining - * the optimal blocksize for dsystrf. - * @returns info parameter which contains the status of the - * computation: - * - =0: success - * - <0: if -i, the i-th argument had an - * illegal value - * - >0: if i, D(i, i) is exactly zero, - * no solution can be computed. 
- * - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/d6/d0e/group__double_s_ysolve.html - */ -int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, - double *B, int LDB, double *WORK, int LWORK) -{ - extern void dsysv_(char *UPLO, int *Np, int *NRHSp, double *A, - int *LDAp, int *IPIV, double *B, int *LDBp, - double *WORK, int *LWORK, int *INFOp); - int INFO; - dsysv_(&UPLO, &N, &NRHS, A, &LDA, IPIV, B, &LDB, WORK, &LWORK, &INFO); - return INFO; -} - -/** - * @brief Compute the eigenvalues and optionally the eigenvectors of a - * symmetric matrix. - * - * @details - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/d2/d97/dsyevx_8f.html - * - * - */ -int dsyevx(char JOBZ, char RANGE, char UPLO, int N, double *A, int LDA, - double VL, double VU, int IL, int IU, double ABSTOL, int *M, - double *W, double *Z, int LDZ, double *WORK, int LWORK, - int *IWORK, int *IFAIL) -{ - extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *Np, - double *A, int *LDAp, double *VLp, double *VUp, - int *ILp, int *IUp, double *ABSTOLp, int *M, - double *W, double *Z, int *LDZp, double *WORK, - int *LWORKp, int *IWORK, int *IFAIL, int *INFOp); - int INFO; - dsyevx_(&JOBZ, &RANGE, &UPLO, &N, A, &LDA, &VL, &VU, &IL, &IU, &ABSTOL, - M, W, Z, &LDZ, WORK, &LWORK, IWORK, IFAIL, &INFO); - return INFO; -} - -/** - * @brief Determine double precision machine parameters. 
- * - * @details - * See the LAPACK documentation at: - * http://www.netlib.org/lapack/explore-html/d5/dd4/dlamch_8f.html - */ -double dlamch(char CMACH) -{ - extern double dlamch_(char *CMACH); - return dlamch_(&CMACH); -} diff --git a/src/gensvm_memory.c b/src/gensvm_memory.c index 529ef79..63c8965 100644 --- a/src/gensvm_memory.c +++ b/src/gensvm_memory.c @@ -7,7 +7,6 @@ */ #include "globals.h" // imports gensvm_memory.h -#include "gensvm_util.h" /** * @brief Wrapper for calloc() which warns when allocation fails @@ -34,7 +33,7 @@ void *mycalloc(const char *file, int line, unsigned long size, void *ptr = calloc(size, typesize); if (!ptr) { - err("Could not allocate memory: %d bytes (%s:%d)\n", + fprintf(stderr, "Couldn't allocate memory: %lu bytes (%s:%d)\n", size, file, line); exit(EXIT_FAILURE); } @@ -63,7 +62,7 @@ void *mymalloc(const char *file, int line, unsigned long size) { void *ptr = malloc(size); if (!ptr) { - err("Could not allocate memory: %d bytes (%s:%d)\n", + fprintf(stderr, "Couldn't allocate memory: %lu bytes (%s:%d)\n", size, file, line); exit(EXIT_FAILURE); } @@ -93,7 +92,7 @@ void *myrealloc(const char *file, int line, unsigned long size, void *var) { void *ptr = realloc(var, size); if (!ptr) { - err("Could not reallocate memory: %d bytes (%s:%d)\n", + fprintf(stderr, "Couldn't reallocate memory: %lu bytes (%s:%d)\n", size, file, line); exit(EXIT_FAILURE); } diff --git a/src/gensvm_train.c b/src/gensvm_optimize.c index 371970a..70b3620 100644 --- a/src/gensvm_train.c +++ b/src/gensvm_optimize.c @@ -10,17 +10,7 @@ * */ -#include <math.h> -#include <cblas.h> - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_lapack.h" -#include "gensvm_matrix.h" -#include "gensvm_sv.h" -#include "gensvm_train.h" -#include "gensvm_util.h" +#include "gensvm_optimize.h" /** * Maximum number of iterations of the algorithm. 
@@ -71,7 +61,7 @@ void gensvm_optimize(struct GenModel *model, struct GenData *data) note("\tepsilon = %g\n", model->epsilon); note("\n"); - gensvm_simplex_gen(model->K, model->U); + gensvm_simplex(model->K, model->U); gensvm_simplex_diff(model, data); gensvm_category_matrix(model, data); @@ -536,3 +526,269 @@ void gensvm_get_update(struct GenModel *model, struct GenData *data, double *B, } } } + +/** + * @brief Generate the category matrix + * + * @details + * Generate the category matrix R. The category matrix has 1's everywhere + * except at the column corresponding to the label of instance i, there the + * element is 0. + * + * @param[in,out] model corresponding GenModel + * @param[in] dataset corresponding GenData + * + */ +void gensvm_category_matrix(struct GenModel *model, struct GenData *dataset) +{ + long i, j; + long n = model->n; + long K = model->K; + + for (i=0; i<n; i++) { + for (j=0; j<K; j++) { + if (dataset->y[i] != j+1) + matrix_set(model->R, K, i, j, 1.0); + else + matrix_set(model->R, K, i, j, 0.0); + } + } +} + +/** + * @brief Generate the simplex difference matrix + * + * @details + * The simplex difference matrix is a 3D matrix which is constructed + * as follows. For each instance i, the difference vectors between the row of + * the simplex matrix corresponding to the class label of instance i and the + * other rows of the simplex matrix are calculated. These difference vectors + * are stored in a matrix, which is one horizontal slice of the 3D matrix. 
+ * + * @param[in,out] model the corresponding GenModel + * @param[in] data the corresponding GenData + * + */ +void gensvm_simplex_diff(struct GenModel *model, struct GenData *data) +{ + long i, j, k; + double value; + + long n = model->n; + long K = model->K; + + for (i=0; i<n; i++) { + for (j=0; j<K-1; j++) { + for (k=0; k<K; k++) { + value = matrix_get(model->U, K-1, data->y[i]-1, j); + value -= matrix_get(model->U, K-1, k, j); + matrix3_set(model->UU, K-1, K, i, j, k, value); + } + } + } +} + +/** + * @brief Use step doubling + * + * @details + * Step doubling can be used to speed up the majorization algorithm. Instead of + * using the value at the minimum of the majorization function, the value + * ``opposite'' the majorization point is used. This can essentially cut the + * number of iterations necessary to reach the minimum in half. + * + * @param[in] model GenModel containing the augmented parameters + */ +void gensvm_step_doubling(struct GenModel *model) +{ + long i, j; + double value; + + long m = model->m; + long K = model->K; + + for (i=0; i<m+1; i++) { + for (j=0; j<K-1; j++) { + matrix_mul(model->V, K-1, i, j, 2.0); + value = - matrix_get(model->Vbar, K-1, i, j); + matrix_add(model->V, K-1, i, j, value); + } + } +} + +/** + * @brief Calculate the Huber hinge errors + * + * @details + * For each of the scalar errors in Q the Huber hinge errors are + * calculated. 
The Huber hinge is here defined as + * @f[ + * h(q) = + * \begin{dcases} + * 1 - q - \frac{\kappa + 1}{2} & \text{if } q \leq -\kappa \\ + * \frac{1}{2(\kappa + 1)} ( 1 - q)^2 & \text{if } q \in (-\kappa, 1] \\ + * 0 & \text{if } q > 1 + * \end{dcases} + * @f] + * + * @param[in,out] model the corresponding GenModel + */ +void gensvm_calculate_huber(struct GenModel *model) +{ + long i, j; + double q, value; + + for (i=0; i<model->n; i++) { + for (j=0; j<model->K; j++) { + q = matrix_get(model->Q, model->K, i, j); + value = 0.0; + if (q <= -model->kappa) { + value = 1.0 - q - (model->kappa+1.0)/2.0; + } else if (q <= 1.0) { + value = 1.0/(2.0*model->kappa+2.0)*pow(1.0 - q, + 2.0); + } + matrix_set(model->H, model->K, i, j, value); + } + } +} + +/** + * @brief Calculate the scalar errors + * + * @details + * Calculate the scalar errors q based on the current estimate of V, and + * store these in Q. It is assumed that the memory for Q has already been + * allocated. In addition, the matrix ZV is calculated here. It is assigned + * to a pre-allocated block of memory, which is passed to this function. + * + * @param[in,out] model the corresponding GenModel + * @param[in] data the corresponding GenData + * @param[in,out] ZV a pointer to a memory block for ZV. On exit + * this block is updated with the new ZV matrix + * calculated with GenModel::V. 
+ * + */ +void gensvm_calculate_errors(struct GenModel *model, struct GenData *data, + double *ZV) +{ + long i, j, k; + double a, value; + + long n = model->n; + long m = model->m; + long K = model->K; + + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + K-1, + m+1, + 1.0, + data->Z, + m+1, + model->V, + K-1, + 0.0, + ZV, + K-1); + + Memset(model->Q, double, n*K); + for (i=0; i<n; i++) { + for (j=0; j<K-1; j++) { + a = matrix_get(ZV, K-1, i, j); + for (k=0; k<K; k++) { + value = a * matrix3_get(model->UU, K-1, K, i, + j, k); + matrix_add(model->Q, K, i, k, value); + } + } + } +} + +/** + * @brief Solve AX = B where A is symmetric positive definite. + * + * @details + * Solve a linear system of equations AX = B where A is symmetric positive + * definite. This function is a wrapper for the external LAPACK routine + * dposv. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in] NRHS number of columns of B + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the upper or lower factor of the + * Cholesky factorization of A. + * @param[in] LDA leading dimension of A + * @param[in,out] B double precision array of size (LDB, NRHS). On + * exit contains the N-by-NRHS solution matrix X. 
+ * @param[in] LDB the leading dimension of B + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had + * an illegal value + * - >0: if i, the leading minor of A + * was not positive definite + * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/dc/de9/group__double_p_osolve.html + */ +int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, + int LDB) +{ + extern void dposv_(char *UPLO, int *Np, int *NRHSp, double *A, + int *LDAp, double *B, int *LDBp, int *INFOp); + int INFO; + dposv_(&UPLO, &N, &NRHS, A, &LDA, B, &LDB, &INFO); + return INFO; +} + +/** + * @brief Solve a system of equations AX = B where A is symmetric. + * + * @details + * Solve a linear system of equations AX = B where A is symmetric. This + * function is a wrapper for the external LAPACK routine dsysv. + * + * @param[in] UPLO which triangle of A is stored + * @param[in] N order of A + * @param[in] NRHS number of columns of B + * @param[in,out] A double precision array of size (LDA, N). On + * exit contains the block diagonal matrix D and + * the multipliers used to obtain the factor U or + * L from the factorization A = U*D*U**T or + * A = L*D*L**T. + * @param[in] LDA leading dimension of A + * @param[in] IPIV integer array containing the details of D + * @param[in,out] B double precision array of size (LDB, NRHS). On + * exit contains the N-by-NRHS matrix X + * @param[in] LDB leading dimension of B + * @param[out] WORK double precision array of size max(1,LWORK). On + * exit, WORK(1) contains the optimal LWORK + * @param[in] LWORK the length of WORK, can be used for determining + * the optimal blocksize for dsystrf. + * @returns info parameter which contains the status of the + * computation: + * - =0: success + * - <0: if -i, the i-th argument had an + * illegal value + * - >0: if i, D(i, i) is exactly zero, + * no solution can be computed. 
+ * + * See the LAPACK documentation at: + * http://www.netlib.org/lapack/explore-html/d6/d0e/group__double_s_ysolve.html + */ +int dsysv(char UPLO, int N, int NRHS, double *A, int LDA, int *IPIV, + double *B, int LDB, double *WORK, int LWORK) +{ + extern void dsysv_(char *UPLO, int *Np, int *NRHSp, double *A, + int *LDAp, int *IPIV, double *B, int *LDBp, + double *WORK, int *LWORK, int *INFOp); + int INFO; + dsysv_(&UPLO, &N, &NRHS, A, &LDA, IPIV, B, &LDB, WORK, &LWORK, &INFO); + return INFO; +} diff --git a/src/gensvm_pred.c b/src/gensvm_pred.c index 15a6be6..8a9a43e 100644 --- a/src/gensvm_pred.c +++ b/src/gensvm_pred.c @@ -11,14 +11,6 @@ * */ -#include <cblas.h> -#include <math.h> - -#include "globals.h" -#include "libGenSVM.h" -#include "gensvm.h" -#include "gensvm_kernel.h" -#include "gensvm_matrix.h" #include "gensvm_pred.h" /** @@ -51,7 +43,7 @@ void gensvm_predict_labels(struct GenData *testdata, struct GenModel *model, U = Calloc(double, K*(K-1)); // Generate the simplex matrix - gensvm_simplex_gen(K, U); + gensvm_simplex(K, U); // Generate the simplex space vectors cblas_dgemm( diff --git a/src/gensvm_util.c b/src/gensvm_print.c index d12a85c..2c00512 100644 --- a/src/gensvm_util.c +++ b/src/gensvm_print.c @@ -9,10 +9,8 @@ * output of data and model files. It also contains string functions. * */ -#include <stdarg.h> -#include "globals.h" -#include "gensvm_util.h" +#include "gensvm_print.h" FILE *GENSVM_OUTPUT_FILE; ///< The #GENSVM_OUTPUT_FILE specifies the ///< output stream to which all output is @@ -26,65 +24,6 @@ FILE *GENSVM_OUTPUT_FILE; ///< The #GENSVM_OUTPUT_FILE specifies the ///< (temporarily) setting it to NULL. /** - * @brief Check if any command line arguments contain string - * - * @details - * Check if any of a given array of command line arguments contains a given - * string. If the string is found, the index of the string in argv is - * returned. If the string is not found, 0 is returned. 
- * - * This function is copied from MSVMpack/libMSVM.c. - * - * @param[in] argc number of command line arguments - * @param[in] argv command line arguments - * @param[in] str string to find in the arguments - * @returns index of the string in the arguments if found, 0 - * otherwise - */ -int gensvm_check_argv(int argc, char **argv, char *str) -{ - int i; - int arg_str = 0; - for (i=1; i<argc; i++) - if (strstr(argv[i], str) != NULL) { - arg_str = i; - break; - } - - return arg_str; -} - -/** - * @brief Check if a command line argument equals a string - * - * @details - * Check if any of the command line arguments is exactly equal to a given - * string. If so, return the index of the corresponding command line argument. - * If not, return 0. - * - * This function is copied from MSVMpack/libMSVM.c - * - * @param[in] argc number of command line arguments - * @param[in] argv command line arguments - * @param[in] str string to find in the arguments - * @returns index of the command line argument that corresponds to - * the string, 0 if none matches. - */ -int gensvm_check_argv_eq(int argc, char **argv, char *str) -{ - int i; - int arg_str = 0; - for (i=1; i<argc; i++) - if (strcmp(argv[i], str) == 0) { - arg_str = i; - break; - } - - return arg_str; -} - - -/** * @brief Print a given string to the specified output stream * * @details diff --git a/src/gensvm_queue.c b/src/gensvm_queue.c new file mode 100644 index 0000000..bbf57b2 --- /dev/null +++ b/src/gensvm_queue.c @@ -0,0 +1,71 @@ +/** + * @file gensvm_queue.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for initializing and freeing a GenQueue + * + */ + +#include "gensvm_queue.h" + +/** + * @brief Initialize a GenQueue structure + * + * @details + * A GenQueue structure is initialized and the default value for the + * parameters are set. A pointer to the initialized queue is returned. 
+ * + * @returns initialized GenQueue + */ +struct GenQueue *gensvm_init_queue() +{ + struct GenQueue *q = Malloc(struct GenQueue, 1); + + q->tasks = NULL; + q->N = 0; + q->i = 0; + + return q; +} + +/** + * @brief Free the GenQueue struct + * + * @details + * Freeing the allocated memory of the GenQueue means freeing every GenTask + * struct and then freeing the Queue. + * + * @param[in] q GenQueue to be freed + * + */ +void gensvm_free_queue(struct GenQueue *q) +{ + long i; + for (i=0; i<q->N; i++) { + gensvm_free_task(q->tasks[i]); + } + free(q->tasks); + free(q); +} + +/** + * @brief Get new GenTask from GenQueue + * + * @details + * Return a pointer to the next GenTask in the GenQueue. If no GenTask + * instances are left, NULL is returned. The internal counter GenQueue::i is + * used for finding the next GenTask. + * + * @param[in] q GenQueue instance + * @returns pointer to next GenTask + * + */ +struct GenTask *get_next_task(struct GenQueue *q) +{ + long i = q->i; + if (i < q->N) { + q->i++; + return q->tasks[i]; + } + return NULL; +} diff --git a/src/gensvm_simplex.c b/src/gensvm_simplex.c new file mode 100644 index 0000000..1fd5f14 --- /dev/null +++ b/src/gensvm_simplex.c @@ -0,0 +1,45 @@ +/** + * @file gensvm_simplex.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Function for generating the simplex matrix + * + * @details + * Contains the function for generating the simplex matrix for a given number + * of classes. + * + */ + +#include "gensvm_simplex.h" + +/** + * @brief Generate matrix of simplex vertex coordinates + * + * @details + * Generate the simplex matrix. Each row of the created + * matrix contains the coordinate vector of a single + * vertex of the K-simplex in K-1 dimensions. The simplex + * generated is a special simplex with edges of length 1. + * The simplex matrix U must already have been allocated. 
+ * + * @param[in] K number of classes + * @param[in,out] U simplex matrix of size K * (K-1) + */ +void gensvm_simplex(long K, double *U) +{ + long i, j; + for (i=0; i<K; i++) { + for (j=0; j<K-1; j++) { + if (i <= j) { + matrix_set(U, K-1, i, j, + -1.0/sqrt(2.0*(j+1)*(j+2))); + } else if (i == j+1) { + matrix_set(U, K-1, i, j, + sqrt((j+1)/(2.0*(j+2)))); + } else { + matrix_set(U, K-1, i, j, 0.0); + } + } + } +} + diff --git a/src/gensvm_strutil.c b/src/gensvm_strutil.c index 97dec81..b4c5c54 100644 --- a/src/gensvm_strutil.c +++ b/src/gensvm_strutil.c @@ -1,5 +1,5 @@ /** - * @file strutil.c + * @file gensvm_strutil.c * @author Gertjan van den Burg * @date January, 2014 * @brief Utility functions for dealing with strings @@ -9,7 +9,6 @@ * format and checking start and ends of strings. */ -#include "globals.h" #include "gensvm_strutil.h" /** diff --git a/src/gensvm_sv.c b/src/gensvm_sv.c index c61347a..8a8e6d3 100644 --- a/src/gensvm_sv.c +++ b/src/gensvm_sv.c @@ -10,8 +10,7 @@ * */ -#include "gensvm.h" -#include "gensvm_matrix.h" +#include "gensvm_sv.h" /** * @brief Calculate the number of support vectors in a model diff --git a/src/gensvm_task.c b/src/gensvm_task.c new file mode 100644 index 0000000..587c4c4 --- /dev/null +++ b/src/gensvm_task.c @@ -0,0 +1,55 @@ +/** + * @file gensvm_task.c + * @author Gertjan van den Burg + * @date May, 2016 + * @brief Functions for initializing and freeing a GenTask + * + */ + +#include "gensvm_task.h" + +/** + * @brief Initialize a GenTask structure + * + * @details + * A GenTask structure is initialized and the default value for the + * parameters are set. A pointer to the initialized GenTask is returned. 
+ * + * @returns initialized GenTask + */ +struct GenTask *gensvm_init_task() +{ + struct GenTask *t = Malloc(struct GenTask, 1); + + t->kerneltype = K_LINEAR; + t->weight_idx = 1; + t->folds = 10; + t->ID = -1; + t->p = 1.0; + t->kappa = 0.0; + t->lambda = 1.0; + t->epsilon = 1e-6; + t->kernelparam = NULL; + t->train_data = NULL; + t->test_data = NULL; + t->performance = 0.0; + + return t; +} + +/** + * @brief Free the GenTask struct + * + * @details + * Freeing the allocated memory of the GenTask means freeing _only_ the + * kernelparam array, and the task itself. The datasets are not freed, as + * these are shared between all tasks. + * + * @param[in] t GenTask to be freed + * + */ +void gensvm_free_task(struct GenTask *t) +{ + free(t->kernelparam); + free(t); +} diff --git a/src/gensvm_timer.c b/src/gensvm_timer.c index 9802091..04d93b1 100644 --- a/src/gensvm_timer.c +++ b/src/gensvm_timer.c @@ -6,13 +6,10 @@ * * @details * This file contains a simple function for calculating the time in seconds - * elapsed between two clock() calls. It also contains a function for - * generating a string of the current time, used in writing output files. + * elapsed between two clock() calls. + * */ -#include <time.h> - -#include "globals.h" #include "gensvm_timer.h" /** @@ -22,53 +19,7 @@ * @param[in] e_time end time * @returns time elapsed in seconds */ -double elapsed_time(clock_t s_time, clock_t e_time) +double gensvm_elapsed_time(clock_t s_time, clock_t e_time) { return ((double) (e_time - s_time))/((double) CLOCKS_PER_SEC); } - -/** - * @brief Get time string with UTC offset - * - * @details - * Create a string for the current system time. Include an offset of UTC for - * consistency. The format of the generated string is "DDD MMM D HH:MM:SS - * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)". 
- * - * @param[in,out] buffer allocated string buffer, on exit contains - * formatted string - * - */ -void get_time_string(char *buffer) -{ - int diff, hours, minutes; - char timestr[MAX_LINE_LENGTH]; - time_t current_time, lt, gt; - struct tm *lclt; - - // get current time (in epoch) - current_time = time(NULL); - if (current_time == ((time_t)-1)) { - fprintf(stderr, "Failed to compute the current time.\n"); - return; - } - - // convert time to local time and create a string - lclt = localtime(&current_time); - strftime(timestr, MAX_LINE_LENGTH, "%c", lclt); - if (timestr == NULL) { - fprintf(stderr, "Failed to convert time to string.\n"); - return; - } - - // calculate the UTC offset including DST - lt = mktime(localtime(&current_time)); - gt = mktime(gmtime(&current_time)); - diff = -difftime(gt, lt); - hours = (diff/3600); - minutes = (diff%3600)/60; - if (lclt->tm_isdst == 1) - hours++; - - sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes); -} diff --git a/src/libGenSVM.c b/src/libGenSVM.c index 8c22f3e..8917a3e 100644 --- a/src/libGenSVM.c +++ b/src/libGenSVM.c @@ -20,308 +20,4 @@ #include "gensvm.h" #include "gensvm_matrix.h" -inline double rnd() { return (double) rand()/0x7FFFFFFF; } - -/** - * @brief Generate matrix of simplex vertex coordinates - * - * @details - * Generate the simplex matrix. Each row of the created - * matrix contains the coordinate vector of a single - * vertex of the K-simplex in K-1 dimensions. The simplex - * generated is a special simplex with edges of length 1. - * The simplex matrix U must already have been allocated. 
- * - * @param[in] K number of classes - * @param[in,out] U simplex matrix of size K * (K-1) - */ -void gensvm_simplex_gen(long K, double *U) -{ - long i, j; - for (i=0; i<K; i++) { - for (j=0; j<K-1; j++) { - if (i <= j) { - matrix_set(U, K-1, i, j, -1.0/sqrt(2.0*(j+1)*(j+2))); - } else if (i == j+1) { - matrix_set(U, K-1, i, j, sqrt((j+1)/(2.0*(j+2)))); - } else { - matrix_set(U, K-1, i, j, 0.0); - } - } - } -} - -/** - * @brief Generate the category matrix - * - * @details - * Generate the category matrix R. The category matrix has 1's everywhere - * except at the column corresponding to the label of instance i, there the - * element is 0. - * - * @param[in,out] model corresponding GenModel - * @param[in] dataset corresponding GenData - * - */ -void gensvm_category_matrix(struct GenModel *model, struct GenData *dataset) -{ - long i, j; - long n = model->n; - long K = model->K; - - for (i=0; i<n; i++) { - for (j=0; j<K; j++) { - if (dataset->y[i] != j+1) - matrix_set(model->R, K, i, j, 1.0); - else - matrix_set(model->R, K, i, j, 0.0); - } - } -} - -/** - * @brief Generate the simplex difference matrix - * - * @details - * The simplex difference matrix is a 3D matrix which is constructed - * as follows. For each instance i, the difference vectors between the row of - * the simplex matrix corresponding to the class label of instance i and the - * other rows of the simplex matrix are calculated. These difference vectors - * are stored in a matrix, which is one horizontal slice of the 3D matrix. 
- * - * @param[in,out] model the corresponding GenModel - * @param[in] data the corresponding GenData - * - */ -void gensvm_simplex_diff(struct GenModel *model, struct GenData *data) -{ - long i, j, k; - double value; - - long n = model->n; - long K = model->K; - - for (i=0; i<n; i++) { - for (j=0; j<K-1; j++) { - for (k=0; k<K; k++) { - value = matrix_get(model->U, K-1, data->y[i]-1, j); - value -= matrix_get(model->U, K-1, k, j); - matrix3_set(model->UU, K-1, K, i, j, k, value); - } - } - } -} - -/** - * @brief Calculate the scalar errors - * - * @details - * Calculate the scalar errors q based on the current estimate of V, and - * store these in Q. It is assumed that the memory for Q has already been - * allocated. In addition, the matrix ZV is calculated here. It is assigned - * to a pre-allocated block of memory, which is passed to this function. - * - * @param[in,out] model the corresponding GenModel - * @param[in] data the corresponding GenData - * @param[in,out] ZV a pointer to a memory block for ZV. On exit - * this block is updated with the new ZV matrix - * calculated with GenModel::V. - * - */ -void gensvm_calculate_errors(struct GenModel *model, struct GenData *data, - double *ZV) -{ - long i, j, k; - double a, value; - - long n = model->n; - long m = model->m; - long K = model->K; - - cblas_dgemm( - CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - n, - K-1, - m+1, - 1.0, - data->Z, - m+1, - model->V, - K-1, - 0.0, - ZV, - K-1); - - Memset(model->Q, double, n*K); - for (i=0; i<n; i++) { - for (j=0; j<K-1; j++) { - a = matrix_get(ZV, K-1, i, j); - for (k=0; k<K; k++) { - value = a * matrix3_get(model->UU, K-1, K, i, - j, k); - matrix_add(model->Q, K, i, k, value); - } - } - } -} - -/** - * @brief Calculate the Huber hinge errors - * - * @details - * For each of the scalar errors in Q the Huber hinge errors are - * calculated. 
The Huber hinge is here defined as - * @f[ - * h(q) = - * \begin{dcases} - * 1 - q - \frac{\kappa + 1}{2} & \text{if } q \leq -\kappa \\ - * \frac{1}{2(\kappa + 1)} ( 1 - q)^2 & \text{if } q \in (-\kappa, 1] \\ - * 0 & \text{if } q > 1 - * \end{dcases} - * @f] - * - * @param[in,out] model the corresponding GenModel - */ -void gensvm_calculate_huber(struct GenModel *model) -{ - long i, j; - double q, value; - - for (i=0; i<model->n; i++) { - for (j=0; j<model->K; j++) { - q = matrix_get(model->Q, model->K, i, j); - value = 0.0; - if (q <= -model->kappa) { - value = 1.0 - q - (model->kappa+1.0)/2.0; - } else if (q <= 1.0) { - value = 1.0/(2.0*model->kappa+2.0)*pow(1.0 - q, 2.0); - } - matrix_set(model->H, model->K, i, j, value); - } - } -} - -/** - * @brief seed the matrix V from an existing model or using rand - * - * @details - * The matrix V must be seeded before the main_loop() can start. - * This can be done by either seeding it with random numbers or - * using the solution from a previous model on the same dataset - * as initial seed. The latter option usually allows for a - * significant improvement in the number of iterations necessary - * because the seeded model V is closer to the optimal V. - * - * @param[in] from_model GenModel from which to copy V - * @param[in,out] to_model GenModel to which V will be copied - */ -void gensvm_seed_model_V(struct GenModel *from_model, - struct GenModel *to_model, struct GenData *data) -{ - long i, j, k; - double cmin, cmax, value; - - long n = data->n; - long m = data->m; - long K = data->K; - - if (from_model == NULL) { - for (i=0; i<m+1; i++) { - cmin = 1e100; - cmax = -1e100; - for (k=0; k<n; k++) { - value = matrix_get(data->Z, m+1, k, i); - cmin = minimum(cmin, value); - cmax = maximum(cmax, value); - } - for (j=0; j<K-1; j++) { - cmin = (abs(cmin) < 1e-10) ? -1 : cmin; - cmax = (abs(cmax) < 1e-10) ? 
1 : cmax; - value = 1.0/cmin + (1.0/cmax - 1.0/cmin)*rnd(); - matrix_set(to_model->V, K-1, i, j, value); - } - } - } else { - for (i=0; i<m+1; i++) - for (j=0; j<K-1; j++) { - value = matrix_get(from_model->V, K-1, i, j); - matrix_set(to_model->V, K-1, i, j, value); - } - } -} - -/** - * @brief Use step doubling - * - * @details - * Step doubling can be used to speed up the maorization algorithm. Instead of - * using the value at the minimimum of the majorization function, the value - * ``opposite'' the majorization point is used. This can essentially cut the - * number of iterations necessary to reach the minimum in half. - * - * @param[in] model GenModel containing the augmented parameters - */ -void gensvm_step_doubling(struct GenModel *model) -{ - long i, j; - double value; - - long m = model->m; - long K = model->K; - - for (i=0; i<m+1; i++) { - for (j=0; j<K-1; j++) { - matrix_mul(model->V, K-1, i, j, 2.0); - value = - matrix_get(model->Vbar, K-1, i, j); - matrix_add(model->V, K-1, i, j, value); - } - } -} - -/** - * @brief Initialize instance weights - * - * @details - * Instance weights can for instance be used to add additional weights to - * instances of certain classes. Two default weight possibilities are - * implemented here. The first is unit weights, where each instance gets - * weight 1. - * - * The second are group size correction weights, which are calculated as - * @f[ - * \rho_i = \frac{n}{Kn_k} , - * @f] - * where @f$ n_k @f$ is the number of instances in group @f$ k @f$ and - * @f$ y_i = k @f$. - * - * @param[in] data GenData with the dataset - * @param[in,out] model GenModel with the weight specification. On - * exit GenModel::rho contains the instance - * weights. 
- */ -void gensvm_initialize_weights(struct GenData *data, struct GenModel *model) -{ - long *groups; - long i; - - long n = model->n; - long K = model->K; - - if (model->weight_idx == 1) { - for (i=0; i<n; i++) - model->rho[i] = 1.0; - } - else if (model->weight_idx == 2) { - groups = Calloc(long, K); - for (i=0; i<n; i++) - groups[data->y[i]-1]++; - for (i=0; i<n; i++) - model->rho[i] = ((double) n)/((double) (groups[data->y[i]-1]*K)); - } else { - fprintf(stderr, "Unknown weight specification.\n"); - exit(1); - } -} |
