From a4dfdecd380a1e0df9c83c1cff285a0903cfa50e Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Fri, 24 Jan 2014 14:05:37 +0100 Subject: moved input/output functions to seperate file --- Makefile | 33 +++- doc/mainpage.c | 23 --- doc/mainpage.dox | 23 +++ doc/specifications.c | 170 ------------------ doc/specifications.dox | 170 ++++++++++++++++++ include/msvmmaj_init.h | 4 + include/msvmmaj_io.h | 30 ++++ include/util.h | 12 -- src/msvmmaj_init.c | 123 ++++++++++++- src/msvmmaj_io.c | 322 ++++++++++++++++++++++++++++++++++ src/predMSVMMaj.c | 1 + src/trainMSVMMaj.c | 1 + src/trainMSVMMajdataset.c | 2 + src/util.c | 433 ---------------------------------------------- 14 files changed, 706 insertions(+), 641 deletions(-) delete mode 100644 doc/mainpage.c create mode 100644 doc/mainpage.dox delete mode 100644 doc/specifications.c create mode 100644 doc/specifications.dox create mode 100644 include/msvmmaj_io.h create mode 100644 src/msvmmaj_io.c diff --git a/Makefile b/Makefile index c618859..1c4ee22 100644 --- a/Makefile +++ b/Makefile @@ -11,8 +11,34 @@ all: lib/libmsvmmaj.a $(EXECS) override LDFLAGS+=-lblas -llapack -lm -lib/libmsvmmaj.a: src/libMSVMMaj.o src/util.o src/msvmmaj_matrix.o src/msvmmaj_lapack.o src/strutil.o src/crossval.o src/msvmmaj_init.o src/msvmmaj_train.o src/msvmmaj_train_dataset.o src/msvmmaj_pred.o src/timer.o src/msvmmaj_kernel.o - @ar rcs lib/libmsvmmaj.a src/libMSVMMaj.o src/util.o src/msvmmaj_matrix.o src/msvmmaj_lapack.o src/strutil.o src/crossval.o src/msvmmaj_init.o src/msvmmaj_train.o src/msvmmaj_train_dataset.o src/msvmmaj_pred.o src/timer.o src/msvmmaj_kernel.o +lib/libmsvmmaj.a: \ + src/crossval.o \ + src/libMSVMMaj.o \ + src/msvmmaj_init.o \ + src/msvmmaj_io.o \ + src/msvmmaj_kernel.o \ + src/msvmmaj_lapack.o \ + src/msvmmaj_matrix.o \ + src/msvmmaj_pred.o \ + src/msvmmaj_train.o \ + src/msvmmaj_train_dataset.o \ + src/strutil.o \ + src/timer.o \ + src/util.o + @ar rcs lib/libmsvmmaj.a \ + src/crossval.o \ + 
src/libMSVMMaj.o \ + src/msvmmaj_init.o \ + src/msvmmaj_io.o \ + src/msvmmaj_matrix.o \ + src/msvmmaj_kernel.o \ + src/msvmmaj_lapack.o \ + src/msvmmaj_pred.o \ + src/msvmmaj_train.o \ + src/msvmmaj_train_dataset.o \ + src/strutil.o \ + src/timer.o \ + src/util.o @echo libmsvmmaj.a... trainMSVMMaj: src/trainMSVMMaj.c lib/libmsvmmaj.a @@ -47,6 +73,9 @@ src/msvmmaj_init.o: @$(CC) -c -o src/msvmmaj_init.o src/msvmmaj_init.c $(CFLAGS) $(INCLUDE) @echo msvmmaj_init.o... +src/msvmmaj_io.o: + @$(CC) -c -o $@ src/msvmmaj_io.c $(CFLAGS) $(INCLUDE) + src/msvmmaj_pred.o: @$(CC) -c -o src/msvmmaj_pred.o src/msvmmaj_pred.c $(CFLAGS) $(INCLUDE) @echo msvmmaj_pred.o... diff --git a/doc/mainpage.c b/doc/mainpage.c deleted file mode 100644 index e3c4057..0000000 --- a/doc/mainpage.c +++ /dev/null @@ -1,23 +0,0 @@ -/** - * @mainpage MSVMMaj C Package Documentation - * @author Gertjan van den Burg (add future safe email adres) - * @date January, 2014 - * @version 0.01a - * - * @section intro_sec Introduction - * - * This is the introduction - * - * @section Usage - * - * Describe the usage of the program here. - * - * @section License - * - * Put some license information here. - * - * @section Reference - * - * Reference to the paper published. Include BibTeX entry as code block here. - * - */ diff --git a/doc/mainpage.dox b/doc/mainpage.dox new file mode 100644 index 0000000..e3c4057 --- /dev/null +++ b/doc/mainpage.dox @@ -0,0 +1,23 @@ +/** + * @mainpage MSVMMaj C Package Documentation + * @author Gertjan van den Burg (add future safe email adres) + * @date January, 2014 + * @version 0.01a + * + * @section intro_sec Introduction + * + * This is the introduction + * + * @section Usage + * + * Describe the usage of the program here. + * + * @section License + * + * Put some license information here. + * + * @section Reference + * + * Reference to the paper published. Include BibTeX entry as code block here. 
+ * + */ diff --git a/doc/specifications.c b/doc/specifications.c deleted file mode 100644 index 5b5a8ae..0000000 --- a/doc/specifications.c +++ /dev/null @@ -1,170 +0,0 @@ -/** - * @page spec_training_file Training Input File Specification - * - * This page specifies the training file that can be parsed by - * read_training_from_file(). Below is an example training file. - * - * @verbatim - train: /path/to/training/dataset.txt - test: /path/to/test/dataset.txt - p: 1.0 1.5 2.0 - kappa: -0.9 0.0 1.0 - lambda: 64 16 4 1 0.25 0.0625 0.015625 0.00390625 0.0009765625 0.000244140625 - epsilon: 1e-6 - weight: 1 2 - folds: 10 - kernel: LINEAR - gamma: 1e-3 1e-1 1e1 1e3 - coef: 1.0 2.0 - degree: 2.0 3.0 - @endverbatim - * - * Note that with a @c LINEAR kernel specification, the @c gamma, @c coef, and - * @c degree parameters do not need to be specified. The above merely shows - * all available parameters that can be specified in the grid search. Below - * each of the parameters are described in more detail. Arguments followed by - * an asterisk are optional. - * - * @c train: @n - * The location of the training dataset file. See @ref spec_data_file for the - * specification of a dataset file. - * - * @c test:* @n - * The location of a test dataset file. See @ref spec_data_file for the - * specification of a dataset file. This is optional, if specified the - * train/test split will be used for training. - * - * @c p: @n - * The values of the @c p parameter of the algorithm to search over. The @c p - * parameter is used in the @f$ \ell_p @f$ norm over the Huber weighted scalar - * misclassification errors. Note: @f$ 1 \leq p \leq 2 @f$. - * - * @c kappa: @n - * The values of the @c kappa parameter of the algorithm to search over. The - * @c kappa parameter is used in the Huber hinge error over the scalar - * misclassification errors. Note: @f$ \kappa > -1 @f$. - * - * @c lambda: @n - * The values of the @c lambda parameter of the algorithm to search over. 
The - * @c lambda parameter is used in the regularization term of the loss - * function. Note: @f$ \lambda > 0 @f$. - * - * @c epsilon: @n - * The values of the @c epsilon parameter of the algorithm to search over. The - * @c epsilon parameter is used as the stopping parameter in the majorization - * algorithm. Note that it often suffices to use only one epsilon value. Using - * more than one value increases the size of the grid search considerably. - * - * @c weight: @n - * The weight specifications for the algorithm to use. Two weight - * specifications are implemented: the unit weights (index = 1) and the group - * size correction weights (index = 2). See also msvmmaj_initialize_weights(). - * - * @c folds: @n - * The number of cross validation folds to use. - * - * @c kernel:* @n - * Kernel to use in training. Only one kernel can be specified. See KernelType - * for available kernel functions. Note: if multiple kernel types are - * specified on this line, only the last value will be used (see the - * implementation of parse_kernel_str() for details). If no kernel is - * specified, the @c LINEAR kernel will be used. - * - * @c gamma:* @n - * Gamma parameters for the @c RBF, @c POLY, and @c SIGMOID kernels. This - * parameter is only optional if the @c LINEAR kernel is specified. See - * msvmmaj_compute_rbf(), msvmmaj_compute_poly(), and - * msvmmaj_compute_sigmoid() for kernel specifications. - * - * @c coef:* @n - * Coefficients for the @c POLY and @c SIGMOID kernels. This parameter is only - * optional if the @c LINEAR or @c RBF kernels are used. See - * msvmmaj_compute_poly() and msvmmaj_compute_sigmoid() for kernel - * specifications. - * - * @c degree:* @n - * Degrees to search over in the grid search when the @c POLY kernel is - * specified. With other kernel specifications this parameter is unnecessary. - * See msvmmaj_compute_poly() for the polynomial kernel specification. 
- * - */ - - -/** - * @page spec_data_file Data File Specification - * - * This page describes the input file format for a dataset. This specification - * is used by msvmmaj_read_data() and msvmmaj_write_predictions(). The data - * file specification is the same as that used in MSVMpack - * (verified in v. 1.3). - * - * The file is expected to be as follows - * @verbatim -n -m -x_11 x_12 ... x_1m y_1 -x_21 x_22 ... x_2m y_2 -... -x_n1 x_n2 ... x_nm y_n -@endverbatim - * - * Here, @c n denotes the number of instances and @c m denotes the number of - * predictors. The class labels @c y_i are expected in the final column of - * each line. - * - * As an example, below the first 5 lines of the iris dataset are shown. - * - * @verbatim -150 -4 -5.10000 3.50000 1.40000 0.20000 1.00000 -4.90000 3.00000 1.40000 0.20000 1.00000 -4.70000 3.20000 1.30000 0.20000 1.00000 -@endverbatim - * - */ - -/** - * @page spec_model_file Model File Specification - * - * This page describes the input file format for a MajModel. This - * specification is used by msvmmaj_read_model() and msvmmaj_write_model(). - * The model file is designed to fully reproduce a MajModel. - * - * The model output file follows the format - * @verbatim -Output file for MSVMMaj (version 0.1) -Generated on: Tue Jan 14 12:00:00 2014 (UTC +01:00) - -Model: -p = 2.00 -lambda = 0.001 -kappa = 1.0 -epsilon = 1e-06 -weight_idx = 1 - -Data: -filename = /path/to/data_file.txt -n = 150 -m = 4 -K = 3 - -Output: --0.7693429935131153 -1.9335141926875414 -+0.3425555992439160 +1.0939198172438194 -+0.3100589593140404 +0.9872012663780092 -+0.1319873613546321 +0.1207806485439152 -+0.8052481376988456 +0.6507524553955120 -@endverbatim - * - * The first two lines of the file mainly serve a logging purpose, and are - * ignored when reading the model file. The model section fully describes the - * model parameters. Next, the data section describes the data file that was - * used in training and the size of the dataset. 
Finally, the output section - * shows the augmented weight matrix MajModel::V, in row-major order. - * - * @todo - * Write kernel specification to model file as well and adjust the format - * above. - */ diff --git a/doc/specifications.dox b/doc/specifications.dox new file mode 100644 index 0000000..5b5a8ae --- /dev/null +++ b/doc/specifications.dox @@ -0,0 +1,170 @@ +/** + * @page spec_training_file Training Input File Specification + * + * This page specifies the training file that can be parsed by + * read_training_from_file(). Below is an example training file. + * + * @verbatim + train: /path/to/training/dataset.txt + test: /path/to/test/dataset.txt + p: 1.0 1.5 2.0 + kappa: -0.9 0.0 1.0 + lambda: 64 16 4 1 0.25 0.0625 0.015625 0.00390625 0.0009765625 0.000244140625 + epsilon: 1e-6 + weight: 1 2 + folds: 10 + kernel: LINEAR + gamma: 1e-3 1e-1 1e1 1e3 + coef: 1.0 2.0 + degree: 2.0 3.0 + @endverbatim + * + * Note that with a @c LINEAR kernel specification, the @c gamma, @c coef, and + * @c degree parameters do not need to be specified. The above merely shows + * all available parameters that can be specified in the grid search. Below + * each of the parameters are described in more detail. Arguments followed by + * an asterisk are optional. + * + * @c train: @n + * The location of the training dataset file. See @ref spec_data_file for the + * specification of a dataset file. + * + * @c test:* @n + * The location of a test dataset file. See @ref spec_data_file for the + * specification of a dataset file. This is optional, if specified the + * train/test split will be used for training. + * + * @c p: @n + * The values of the @c p parameter of the algorithm to search over. The @c p + * parameter is used in the @f$ \ell_p @f$ norm over the Huber weighted scalar + * misclassification errors. Note: @f$ 1 \leq p \leq 2 @f$. + * + * @c kappa: @n + * The values of the @c kappa parameter of the algorithm to search over. 
The + * @c kappa parameter is used in the Huber hinge error over the scalar + * misclassification errors. Note: @f$ \kappa > -1 @f$. + * + * @c lambda: @n + * The values of the @c lambda parameter of the algorithm to search over. The + * @c lambda parameter is used in the regularization term of the loss + * function. Note: @f$ \lambda > 0 @f$. + * + * @c epsilon: @n + * The values of the @c epsilon parameter of the algorithm to search over. The + * @c epsilon parameter is used as the stopping parameter in the majorization + * algorithm. Note that it often suffices to use only one epsilon value. Using + * more than one value increases the size of the grid search considerably. + * + * @c weight: @n + * The weight specifications for the algorithm to use. Two weight + * specifications are implemented: the unit weights (index = 1) and the group + * size correction weights (index = 2). See also msvmmaj_initialize_weights(). + * + * @c folds: @n + * The number of cross validation folds to use. + * + * @c kernel:* @n + * Kernel to use in training. Only one kernel can be specified. See KernelType + * for available kernel functions. Note: if multiple kernel types are + * specified on this line, only the last value will be used (see the + * implementation of parse_kernel_str() for details). If no kernel is + * specified, the @c LINEAR kernel will be used. + * + * @c gamma:* @n + * Gamma parameters for the @c RBF, @c POLY, and @c SIGMOID kernels. This + * parameter is only optional if the @c LINEAR kernel is specified. See + * msvmmaj_compute_rbf(), msvmmaj_compute_poly(), and + * msvmmaj_compute_sigmoid() for kernel specifications. + * + * @c coef:* @n + * Coefficients for the @c POLY and @c SIGMOID kernels. This parameter is only + * optional if the @c LINEAR or @c RBF kernels are used. See + * msvmmaj_compute_poly() and msvmmaj_compute_sigmoid() for kernel + * specifications. 
+ * + * @c degree:* @n + * Degrees to search over in the grid search when the @c POLY kernel is + * specified. With other kernel specifications this parameter is unnecessary. + * See msvmmaj_compute_poly() for the polynomial kernel specification. + * + */ + + +/** + * @page spec_data_file Data File Specification + * + * This page describes the input file format for a dataset. This specification + * is used by msvmmaj_read_data() and msvmmaj_write_predictions(). The data + * file specification is the same as that used in MSVMpack + * (verified in v. 1.3). + * + * The file is expected to be as follows + * @verbatim +n +m +x_11 x_12 ... x_1m y_1 +x_21 x_22 ... x_2m y_2 +... +x_n1 x_n2 ... x_nm y_n +@endverbatim + * + * Here, @c n denotes the number of instances and @c m denotes the number of + * predictors. The class labels @c y_i are expected in the final column of + * each line. + * + * As an example, below the first 5 lines of the iris dataset are shown. + * + * @verbatim +150 +4 +5.10000 3.50000 1.40000 0.20000 1.00000 +4.90000 3.00000 1.40000 0.20000 1.00000 +4.70000 3.20000 1.30000 0.20000 1.00000 +@endverbatim + * + */ + +/** + * @page spec_model_file Model File Specification + * + * This page describes the input file format for a MajModel. This + * specification is used by msvmmaj_read_model() and msvmmaj_write_model(). + * The model file is designed to fully reproduce a MajModel. 
+ * + * The model output file follows the format + * @verbatim +Output file for MSVMMaj (version 0.1) +Generated on: Tue Jan 14 12:00:00 2014 (UTC +01:00) + +Model: +p = 2.00 +lambda = 0.001 +kappa = 1.0 +epsilon = 1e-06 +weight_idx = 1 + +Data: +filename = /path/to/data_file.txt +n = 150 +m = 4 +K = 3 + +Output: +-0.7693429935131153 -1.9335141926875414 ++0.3425555992439160 +1.0939198172438194 ++0.3100589593140404 +0.9872012663780092 ++0.1319873613546321 +0.1207806485439152 ++0.8052481376988456 +0.6507524553955120 +@endverbatim + * + * The first two lines of the file mainly serve a logging purpose, and are + * ignored when reading the model file. The model section fully describes the + * model parameters. Next, the data section describes the data file that was + * used in training and the size of the dataset. Finally, the output section + * shows the augmented weight matrix MajModel::V, in row-major order. + * + * @todo + * Write kernel specification to model file as well and adjust the format + * above. + */ diff --git a/include/msvmmaj_init.h b/include/msvmmaj_init.h index 6e2e36f..febfb4a 100644 --- a/include/msvmmaj_init.h +++ b/include/msvmmaj_init.h @@ -20,4 +20,8 @@ struct MajModel *msvmmaj_init_model(); struct MajData *msvmmaj_init_data(); +void msvmmaj_allocate_model(struct MajModel *model); +void msvmmaj_free_model(struct MajModel *model); +void msvmmaj_free_data(struct MajData *data); + #endif diff --git a/include/msvmmaj_io.h b/include/msvmmaj_io.h new file mode 100644 index 0000000..99fb4dc --- /dev/null +++ b/include/msvmmaj_io.h @@ -0,0 +1,30 @@ +/** + * @file msvmmaj_io.h + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Header files for msvmmaj_io.c + * + * @details + * Function declarations for input/output functions. 
+ * + */ + +#ifndef MSVMMAJ_IO_H +#define MSVMMAJ_IO_H + +#include "globals.h" + +// forward declarations +struct MajData; +struct MajModel; + +// function declarations +void msvmmaj_read_data(struct MajData *dataset, char *data_file); + +void msvmmaj_read_model(struct MajModel *model, char *model_filename); +void msvmmaj_write_model(struct MajModel *model, char *output_filename); + +void msvmmaj_write_predictions(struct MajData *data, long *predy, + char *output_filename); + +#endif diff --git a/include/util.h b/include/util.h index 995a927..375a9c2 100644 --- a/include/util.h +++ b/include/util.h @@ -19,21 +19,9 @@ struct MajData; struct MajModel; // function declarations -void msvmmaj_read_data(struct MajData *dataset, char *data_file); - -void msvmmaj_read_model(struct MajModel *model, char *model_filename); -void msvmmaj_write_model(struct MajModel *model, char *output_filename); - -void msvmmaj_write_predictions(struct MajData *data, long *predy, - char *output_filename); - int msvmmaj_check_argv(int argc, char **argv, char *str); int msvmmaj_check_argv_eq(int argc, char **argv, char *str); void note(const char *fmt,...); -void msvmmaj_allocate_model(struct MajModel *model); -void msvmmaj_free_model(struct MajModel *model); -void msvmmaj_free_data(struct MajData *data); - #endif diff --git a/src/msvmmaj_init.c b/src/msvmmaj_init.c index 14278f9..b4384be 100644 --- a/src/msvmmaj_init.c +++ b/src/msvmmaj_init.c @@ -7,7 +7,9 @@ * @details * This file contains functions for initializing a MajModel instance * and a MajData instance. In addition, default values for these - * structures are defined here (and only here). + * structures are defined here (and only here). Functions for allocating + * memory for the model structure and freeing of the model and data structures + * are also included. 
* */ @@ -62,3 +64,122 @@ struct MajData *msvmmaj_init_data() return data; } +/** + * @brief Allocate memory for a MajModel + * + * @details + * This function can be used to allocate the memory needed for a MajModel. All + * arrays in the model are specified and initialized to 0. + * + * @param[in] model MajModel to allocate + * + */ +void msvmmaj_allocate_model(struct MajModel *model) +{ + long n = model->n; + long m = model->m; + long K = model->K; + + model->W = Calloc(double, m*(K-1)); + if (model->W == NULL) { + fprintf(stderr, "Failed to allocate memory for W.\n"); + exit(1); + } + + model->t = Calloc(double, K-1); + if (model->t == NULL) { + fprintf(stderr, "Failed to allocate memory for t.\n"); + exit(1); + } + + model->V = Calloc(double, (m+1)*(K-1)); + if (model->V == NULL) { + fprintf(stderr, "Failed to allocate memory for V.\n"); + exit(1); + } + + model->Vbar = Calloc(double, (m+1)*(K-1)); + if (model->Vbar == NULL) { + fprintf(stderr, "Failed to allocate memory for Vbar.\n"); + exit(1); + } + + model->U = Calloc(double, K*(K-1)); + if (model->U == NULL) { + fprintf(stderr, "Failed to allocate memory for U.\n"); + exit(1); + } + + model->UU = Calloc(double, n*K*(K-1)); + if (model->UU == NULL) { + fprintf(stderr, "Failed to allocate memory for UU.\n"); + exit(1); + } + + model->Q = Calloc(double, n*K); + if (model->Q == NULL) { + fprintf(stderr, "Failed to allocate memory for Q.\n"); + exit(1); + } + + model->H = Calloc(double, n*K); + if (model->H == NULL) { + fprintf(stderr, "Failed to allocate memory for H.\n"); + exit(1); + } + + model->R = Calloc(double, n*K); + if (model->R == NULL) { + fprintf(stderr, "Failed to allocate memory for R.\n"); + exit(1); + } + + model->rho = Calloc(double, n); + if (model->rho == NULL) { + fprintf(stderr, "Failed to allocate memory for rho.\n"); + exit(1); + } +} + +/** + * @brief Free allocated MajModel struct + * + * @details + * Simply free a previously allocated MajModel by freeing all its component + * arrays. 
Note that the model struct itself is also freed here. + * + * @param[in] model MajModel to free + * + */ +void msvmmaj_free_model(struct MajModel *model) +{ + free(model->W); + free(model->t); + free(model->V); + free(model->Vbar); + free(model->U); + free(model->UU); + free(model->Q); + free(model->H); + free(model->rho); + free(model->R); + + free(model); +} + +/** + * @brief Free allocated MajData struct + * + * @details + * Simply free a previously allocated MajData struct by freeing all its + * components. Note that the data struct itself is also freed here. + * + * @param[in] data MajData struct to free + * + */ +void msvmmaj_free_data(struct MajData *data) +{ + free(data->Z); + free(data->y); + free(data); +} diff --git a/src/msvmmaj_io.c b/src/msvmmaj_io.c new file mode 100644 index 0000000..7abb182 --- /dev/null +++ b/src/msvmmaj_io.c @@ -0,0 +1,322 @@ +/** + * @file msvmmaj_io.c + * @author Gertjan van den Burg + * @date January, 2014 + * @brief Functions for input and output of data and model files + * + * @details + * This file contains functions for reading and writing model files, and data + * files. + * + */ + +#include + +#include "msvmmaj.h" +#include "msvmmaj_io.h" +#include "msvmmaj_matrix.h" +#include "strutil.h" + +/** + * @brief Read data from file + * + * @details + * Read the data from the data_file. The data matrix X is augmented + * with a column of ones, to get the matrix Z. The data is expected + * to follow a specific format, which is specified in the @ref spec_data_file. + * The class labels are corrected internally to correspond to the interval + * [1 .. K], where K is the total number of classes. + * + * @todo + * Make sure that this function allows datasets without class labels for + * testing. + * + * @param[in,out] dataset initialized MajData struct + * @param[in] data_file filename of the data file. 
+ */ +void msvmmaj_read_data(struct MajData *dataset, char *data_file) +{ + FILE *fid; + long i, j; + long n, m; // dimensions of data + long nr = 0; // used to check consistency of data + double value; + long K = 0; + long min_y = 1000000; + + char buf[MAX_LINE_LENGTH]; + + if ((fid = fopen(data_file, "r")) == NULL) { + fprintf(stderr, "\nERROR: datafile %s could not be opened.\n", + data_file); + exit(0); + } + + // Read data dimensions + nr += fscanf(fid, "%ld", &n); + nr += fscanf(fid, "%ld", &m); + + // Allocate memory + dataset->Z = Malloc(double, n*(m+1)); + + // Read first line of data + for (j=1; jZ, n, 0, j, value); + } + + // Check if there is a label at the end of the line + if (fgets(buf, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "ERROR: No label found on first line.\n"); + exit(1); + } + if (sscanf(buf, "%lf", &value) > 0) { + dataset->y = Malloc(long, n); + dataset->y[0] = value; + } else if (dataset->y != NULL) { + free(dataset->y); + dataset->y = NULL; + } + + // Read the rest of the file + for (i=1; iZ, m+1, i, j, value); + } + if (dataset->y != NULL) { + nr += fscanf(fid, "%lf", &value); + dataset->y[i] = (long) value; + K = maximum(K, value); + min_y = minimum(min_y, value); + } + } + fclose(fid); + + // Correct labels: must be in [1, K] + if (min_y == 0) { + for (i=0; iy[i]++; + K++; + } else if (min_y < 0 ) { + fprintf(stderr, "ERROR: wrong class labels in %s, minimum " + "value is: %ld\n", + data_file, min_y); + exit(0); + } + + if (nr < n * m) { + fprintf(stderr, "ERROR: not enough data found in %s\n", + data_file); + exit(0); + } + + // Set the column of ones + for (i=0; iZ, m+1, i, 0, 1.0); + + dataset->n = n; + dataset->m = m; + dataset->K = K; +} + + +/** + * @brief Read model from file + * + * @details + * Read a MajModel from a model file. The MajModel struct must have been + * initalized elswhere. The model file is expected to follow the @ref + * spec_model_file. 
The easiest way to generate a model file is through + * msvmmaj_write_model(), which can for instance be used in trainMSVMMaj.c. + * + * @param[in,out] model initialized MajModel + * @param[in] model_filename filename of the model file + * + */ +void msvmmaj_read_model(struct MajModel *model, char *model_filename) +{ + long i, j, nr = 0; + FILE *fid; + char buffer[MAX_LINE_LENGTH]; + char data_filename[MAX_LINE_LENGTH]; + double value = 0; + + fid = fopen(model_filename, "r"); + if (fid == NULL) { + fprintf(stderr, "Error opening model file %s\n", + model_filename); + exit(1); + } + // skip the first four lines + for (i=0; i<4; i++) + next_line(fid, model_filename); + + // read all model variables + model->p = get_fmt_double(fid, model_filename, "p = %lf"); + model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf"); + model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf"); + model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf"); + model->weight_idx = (int) get_fmt_long(fid, model_filename, + "weight_idx = %li"); + + // skip to data section + for (i=0; i<2; i++) + next_line(fid, model_filename); + + // read filename of data file + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading model file %s\n", + model_filename); + exit(1); + } + sscanf(buffer, "filename = %s\n", data_filename); + model->data_file = data_filename; + + // read all data variables + model->n = get_fmt_long(fid, model_filename, "n = %li\n"); + model->m = get_fmt_long(fid, model_filename, "m = %li\n"); + model->K = get_fmt_long(fid, model_filename, "K = %li\n"); + + // skip to output + for (i=0; i<2; i++) + next_line(fid, model_filename); + + // read the matrix V and check for consistency + model->V = Malloc(double, (model->m+1)*(model->K-1)); + for (i=0; im+1; i++) { + for (j=0; jK-1; j++) { + nr += fscanf(fid, "%lf ", &value); + matrix_set(model->V, model->K-1, i, j, value); + } + } + if (nr != (model->m+1)*(model->K-1)) { + 
fprintf(stderr, "Error reading model file %s. " + "Not enough elements of V found.\n", + model_filename); + exit(1); + } +} + +/** + * @brief Write model to file + * + * @details + * Write a MajModel to a file. The current time is specified in the file in + * UTC + offset. The model file further corresponds to the @ref + * spec_model_file. + * + * @param[in] model MajModel which contains an estimate for + * MajModel::V + * @param[in] output_filename the output file to write the model to + * + */ +void msvmmaj_write_model(struct MajModel *model, char *output_filename) +{ + FILE *fid; + long i, j; + int diff, hours, minutes; + char timestr[1000]; + time_t current_time, lt, gt; + struct tm *lclt; + + // open output file + fid = fopen(output_filename, "w"); + if (fid == NULL) { + fprintf(stderr, "Error opening output file %s", + output_filename); + exit(1); + } + + // get current time (in epoch) + current_time = time(NULL); + if (current_time == ((time_t)-1)) { + fprintf(stderr, "Failed to compute the current time.\n"); + exit(1); + } + + // convert time to local time and create a string + lclt = localtime(&current_time); + strftime(timestr, 1000, "%c", lclt); + if (timestr == NULL) { + fprintf(stderr, "Failed to convert time to string.\n"); + exit(1); + } + + // calculate the difference from UTC including DST + lt = mktime(localtime(&current_time)); + gt = mktime(gmtime(&current_time)); + diff = -difftime(gt, lt); + hours = (diff/3600); + minutes = (diff%3600)/60; + if (lclt->tm_isdst == 1) + hours++; + + // Write output to file + fprintf(fid, "Output file for MSVMMaj (version %1.1f)\n", VERSION); + fprintf(fid, "Generated on: %s (UTC %+03i:%02i)\n\n", + timestr, hours, minutes); + fprintf(fid, "Model:\n"); + fprintf(fid, "p = %15.16f\n", model->p); + fprintf(fid, "lambda = %15.16f\n", model->lambda); + fprintf(fid, "kappa = %15.16f\n", model->kappa); + fprintf(fid, "epsilon = %g\n", model->epsilon); + fprintf(fid, "weight_idx = %i\n", model->weight_idx); + fprintf(fid, "\n"); + 
fprintf(fid, "Data:\n"); + fprintf(fid, "filename = %s\n", model->data_file); + fprintf(fid, "n = %li\n", model->n); + fprintf(fid, "m = %li\n", model->m); + fprintf(fid, "K = %li\n", model->K); + fprintf(fid, "\n"); + fprintf(fid, "Output:\n"); + for (i=0; im+1; i++) { + for (j=0; jK-1; j++) { + fprintf(fid, "%+15.16f ", + matrix_get(model->V, + model->K-1, i, j)); + } + fprintf(fid, "\n"); + } + + fclose(fid); +} + +/** + * @brief Write predictions to file + * + * @details + * Write the given predictions to an output file, such that the resulting file + * corresponds to the @ref spec_data_file. + * + * @param[in] data MajData with the original instances + * @param[in] predy predictions of the class labels of the + * instances in the given MajData. Note that the + * order of the instances is assumed to be the + * same. + * @param[in] output_filename the file to which the predictions are written + * + */ +void msvmmaj_write_predictions(struct MajData *data, long *predy, + char *output_filename) +{ + long i, j; + FILE *fid; + + fid = fopen(output_filename, "w"); + if (fid == NULL) { + fprintf(stderr, "Error opening output file %s", + output_filename); + exit(1); + } + + for (i=0; in; i++) { + for (j=0; jm; j++) + fprintf(fid, "%f ", + matrix_get(data->Z, + data->m+1, i, j+1)); + fprintf(fid, "%li\n", predy[i]); + } + + fclose(fid); +} diff --git a/src/predMSVMMaj.c b/src/predMSVMMaj.c index b41b137..e67e430 100644 --- a/src/predMSVMMaj.c +++ b/src/predMSVMMaj.c @@ -20,6 +20,7 @@ #include "msvmmaj.h" #include "msvmmaj_init.h" +#include "msvmmaj_io.h" #include "msvmmaj_pred.h" #include "util.h" diff --git a/src/trainMSVMMaj.c b/src/trainMSVMMaj.c index 4bc9fec..9f71325 100644 --- a/src/trainMSVMMaj.c +++ b/src/trainMSVMMaj.c @@ -17,6 +17,7 @@ #include "msvmmaj_kernel.h" #include "libMSVMMaj.h" #include "msvmmaj.h" +#include "msvmmaj_io.h" #include "msvmmaj_init.h" #include "msvmmaj_train.h" #include "util.h" diff --git a/src/trainMSVMMajdataset.c 
b/src/trainMSVMMajdataset.c index a34c642..e9f8b8e 100644 --- a/src/trainMSVMMajdataset.c +++ b/src/trainMSVMMajdataset.c @@ -23,6 +23,8 @@ #include "crossval.h" #include "msvmmaj.h" +#include "msvmmaj_io.h" +#include "msvmmaj_init.h" #include "msvmmaj_pred.h" #include "msvmmaj_train.h" #include "msvmmaj_train_dataset.h" diff --git a/src/util.c b/src/util.c index 8e4b806..e76a074 100644 --- a/src/util.c +++ b/src/util.c @@ -8,17 +8,8 @@ * This file contains several utility functions for coordinating input and * output of data and model files. It also contains string functions. * - * @todo - * Pull this apart. - * */ -#include #include -#include - -#include "msvmmaj.h" -#include "msvmmaj_matrix.h" -#include "strutil.h" #include "util.h" @@ -33,309 +24,6 @@ FILE *MSVMMAJ_OUTPUT_FILE; ///< The #MSVMMAJ_OUTPUT_FILE specifies the ///< this variable through @c extern and ///< (temporarily) setting it to NULL. -/** - * @brief Read data from file - * - * @details - * Read the data from the data_file. The data matrix X is augmented - * with a column of ones, to get the matrix Z. The data is expected - * to follow a specific format, which is specified in the @ref spec_data_file. - * The class labels are corrected internally to correspond to the interval - * [1 .. K], where K is the total number of classes. - * - * @todo - * Make sure that this function allows datasets without class labels for - * testing. - * - * @param[in,out] dataset initialized MajData struct - * @param[in] data_file filename of the data file. 
- */ -void msvmmaj_read_data(struct MajData *dataset, char *data_file) -{ - FILE *fid; - long i, j; - long n, m; // dimensions of data - long nr = 0; // used to check consistency of data - double value; - long K = 0; - long min_y = 1000000; - - char buf[MAX_LINE_LENGTH]; - - if ((fid = fopen(data_file, "r")) == NULL) { - fprintf(stderr, "\nERROR: datafile %s could not be opened.\n", - data_file); - exit(0); - } - - // Read data dimensions - nr += fscanf(fid, "%ld", &n); - nr += fscanf(fid, "%ld", &m); - - // Allocate memory - dataset->Z = Malloc(double, n*(m+1)); - - // Read first line of data - for (j=1; jZ, n, 0, j, value); - } - - // Check if there is a label at the end of the line - if (fgets(buf, MAX_LINE_LENGTH, fid) == NULL) { - fprintf(stderr, "ERROR: No label found on first line.\n"); - exit(1); - } - if (sscanf(buf, "%lf", &value) > 0) { - dataset->y = Malloc(long, n); - dataset->y[0] = value; - } else if (dataset->y != NULL) { - free(dataset->y); - dataset->y = NULL; - } - - // Read the rest of the file - for (i=1; iZ, m+1, i, j, value); - } - if (dataset->y != NULL) { - nr += fscanf(fid, "%lf", &value); - dataset->y[i] = (long) value; - K = maximum(K, value); - min_y = minimum(min_y, value); - } - } - fclose(fid); - - // Correct labels: must be in [1, K] - if (min_y == 0) { - for (i=0; iy[i]++; - K++; - } else if (min_y < 0 ) { - fprintf(stderr, "ERROR: wrong class labels in %s, minimum " - "value is: %ld\n", - data_file, min_y); - exit(0); - } - - if (nr < n * m) { - fprintf(stderr, "ERROR: not enough data found in %s\n", - data_file); - exit(0); - } - - // Set the column of ones - for (i=0; iZ, m+1, i, 0, 1.0); - - dataset->n = n; - dataset->m = m; - dataset->K = K; -} - -/** - * @brief Read model from file - * - * @details - * Read a MajModel from a model file. The MajModel struct must have been - * initalized elswhere. The model file is expected to follow the @ref - * spec_model_file. 
The easiest way to generate a model file is through - * msvmmaj_write_model(), which can for instance be used in trainMSVMMaj.c. - * - * @param[in,out] model initialized MajModel - * @param[in] model_filename filename of the model file - * - */ -void msvmmaj_read_model(struct MajModel *model, char *model_filename) -{ - long i, j, nr = 0; - FILE *fid; - char buffer[MAX_LINE_LENGTH]; - char data_filename[MAX_LINE_LENGTH]; - double value = 0; - - fid = fopen(model_filename, "r"); - if (fid == NULL) { - fprintf(stderr, "Error opening model file %s\n", - model_filename); - exit(1); - } - // skip the first four lines - for (i=0; i<4; i++) - next_line(fid, model_filename); - - // read all model variables - model->p = get_fmt_double(fid, model_filename, "p = %lf"); - model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf"); - model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf"); - model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf"); - model->weight_idx = (int) get_fmt_long(fid, model_filename, - "weight_idx = %li"); - - // skip to data section - for (i=0; i<2; i++) - next_line(fid, model_filename); - - // read filename of data file - if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { - fprintf(stderr, "Error reading model file %s\n", - model_filename); - exit(1); - } - sscanf(buffer, "filename = %s\n", data_filename); - model->data_file = data_filename; - - // read all data variables - model->n = get_fmt_long(fid, model_filename, "n = %li\n"); - model->m = get_fmt_long(fid, model_filename, "m = %li\n"); - model->K = get_fmt_long(fid, model_filename, "K = %li\n"); - - // skip to output - for (i=0; i<2; i++) - next_line(fid, model_filename); - - // read the matrix V and check for consistency - model->V = Malloc(double, (model->m+1)*(model->K-1)); - for (i=0; im+1; i++) { - for (j=0; jK-1; j++) { - nr += fscanf(fid, "%lf ", &value); - matrix_set(model->V, model->K-1, i, j, value); - } - } - if (nr != (model->m+1)*(model->K-1)) { - 
fprintf(stderr, "Error reading model file %s. " - "Not enough elements of V found.\n", - model_filename); - exit(1); - } -} - -/** - * @brief Write model to file - * - * @details - * Write a MajModel to a file. The current time is specified in the file in - * UTC + offset. The model file further corresponds to the @ref - * spec_model_file. - * - * @param[in] model MajModel which contains an estimate for - * MajModel::V - * @param[in] output_filename the output file to write the model to - * - */ -void msvmmaj_write_model(struct MajModel *model, char *output_filename) -{ - FILE *fid; - long i, j; - int diff, hours, minutes; - char timestr[1000]; - time_t current_time, lt, gt; - struct tm *lclt; - - // open output file - fid = fopen(output_filename, "w"); - if (fid == NULL) { - fprintf(stderr, "Error opening output file %s", - output_filename); - exit(1); - } - - // get current time (in epoch) - current_time = time(NULL); - if (current_time == ((time_t)-1)) { - fprintf(stderr, "Failed to compute the current time.\n"); - exit(1); - } - - // convert time to local time and create a string - lclt = localtime(¤t_time); - strftime(timestr, 1000, "%c", lclt); - if (timestr == NULL) { - fprintf(stderr, "Failed to convert time to string.\n"); - exit(1); - } - - // calculate the difference from UTC including DST - lt = mktime(localtime(¤t_time)); - gt = mktime(gmtime(¤t_time)); - diff = -difftime(gt, lt); - hours = (diff/3600); - minutes = (diff%3600)/60; - if (lclt->tm_isdst == 1) - hours++; - - // Write output to file - fprintf(fid, "Output file for MSVMMaj (version %1.1f)\n", VERSION); - fprintf(fid, "Generated on: %s (UTC %+03i:%02i)\n\n", - timestr, hours, minutes); - fprintf(fid, "Model:\n"); - fprintf(fid, "p = %15.16f\n", model->p); - fprintf(fid, "lambda = %15.16f\n", model->lambda); - fprintf(fid, "kappa = %15.16f\n", model->kappa); - fprintf(fid, "epsilon = %g\n", model->epsilon); - fprintf(fid, "weight_idx = %i\n", model->weight_idx); - fprintf(fid, "\n"); - 
fprintf(fid, "Data:\n"); - fprintf(fid, "filename = %s\n", model->data_file); - fprintf(fid, "n = %li\n", model->n); - fprintf(fid, "m = %li\n", model->m); - fprintf(fid, "K = %li\n", model->K); - fprintf(fid, "\n"); - fprintf(fid, "Output:\n"); - for (i=0; im+1; i++) { - for (j=0; jK-1; j++) { - fprintf(fid, "%+15.16f ", - matrix_get(model->V, - model->K-1, i, j)); - } - fprintf(fid, "\n"); - } - - fclose(fid); -} - -/** - * @brief Write predictions to file - * - * @details - * Write the given predictions to an output file, such that the resulting file - * corresponds to the @ref spec_data_file. - * - * @param[in] data MajData with the original instances - * @param[in] predy predictions of the class labels of the - * instances in the given MajData. Note that the - * order of the instances is assumed to be the - * same. - * @param[in] output_filename the file to which the predictions are written - * - */ -void msvmmaj_write_predictions(struct MajData *data, long *predy, - char *output_filename) -{ - long i, j; - FILE *fid; - - fid = fopen(output_filename, "w"); - if (fid == NULL) { - fprintf(stderr, "Error opening output file %s", - output_filename); - exit(1); - } - - for (i=0; in; i++) { - for (j=0; jm; j++) - fprintf(fid, "%f ", - matrix_get(data->Z, - data->m+1, i, j+1)); - fprintf(fid, "%li\n", predy[i]); - } - - fclose(fid); -} - /** * @brief Check if any command line arguments contain string * @@ -438,124 +126,3 @@ void note(const char *fmt,...) va_end(ap); (*msvmmaj_print_string)(buf); } - -/** - * @brief Allocate memory for a MajModel - * - * @details - * This function can be used to allocate the memory needed for a MajModel. All - * arrays in the model are specified and initialized to 0. 
- * - * @param[in] model MajModel to allocate - * - */ -void msvmmaj_allocate_model(struct MajModel *model) -{ - long n = model->n; - long m = model->m; - long K = model->K; - - model->W = Calloc(double, m*(K-1)); - if (model->W == NULL) { - fprintf(stderr, "Failed to allocate memory for W.\n"); - exit(1); - } - - model->t = Calloc(double, K-1); - if (model->t == NULL) { - fprintf(stderr, "Failed to allocate memory for t.\n"); - exit(1); - } - - model->V = Calloc(double, (m+1)*(K-1)); - if (model->V == NULL) { - fprintf(stderr, "Failed to allocate memory for V.\n"); - exit(1); - } - - model->Vbar = Calloc(double, (m+1)*(K-1)); - if (model->Vbar == NULL) { - fprintf(stderr, "Failed to allocate memory for Vbar.\n"); - exit(1); - } - - model->U = Calloc(double, K*(K-1)); - if (model->U == NULL) { - fprintf(stderr, "Failed to allocate memory for U.\n"); - exit(1); - } - - model->UU = Calloc(double, n*K*(K-1)); - if (model->UU == NULL) { - fprintf(stderr, "Failed to allocate memory for UU.\n"); - exit(1); - } - - model->Q = Calloc(double, n*K); - if (model->Q == NULL) { - fprintf(stderr, "Failed to allocate memory for Q.\n"); - exit(1); - } - - model->H = Calloc(double, n*K); - if (model->H == NULL) { - fprintf(stderr, "Failed to allocate memory for H.\n"); - exit(1); - } - - model->R = Calloc(double, n*K); - if (model->R == NULL) { - fprintf(stderr, "Failed to allocate memory for R.\n"); - exit(1); - } - - model->rho = Calloc(double, n); - if (model->rho == NULL) { - fprintf(stderr, "Failed to allocate memory for rho.\n"); - exit(1); - } - -} - -/** - * @brief Free allocated MajModel struct - * - * @details - * Simply free a previously allocated MajModel by freeing all its component - * arrays. Note that the model struct itself is also freed here. 
- * - * @param[in] model MajModel to free - * - */ -void msvmmaj_free_model(struct MajModel *model) -{ - free(model->W); - free(model->t); - free(model->V); - free(model->Vbar); - free(model->U); - free(model->UU); - free(model->Q); - free(model->H); - free(model->rho); - free(model->R); - - free(model); -} - -/** - * @brief Free allocated MajData struct - * - * @details - * Simply free a previously allocated MajData struct by freeing all its - * components. Note that the data struct itself is also freed here. - * - * @param[in] data MajData struct to free - * - */ -void msvmmaj_free_data(struct MajData *data) -{ - free(data->Z); - free(data->y); - free(data); -} -- cgit v1.2.3