diff options
| -rw-r--r-- | Makefile | 5 | ||||
| -rw-r--r-- | include/libMSVMMaj.h | 5 | ||||
| -rw-r--r-- | include/util.h | 9 | ||||
| -rw-r--r-- | src/libMSVMMaj.c | 82 | ||||
| -rw-r--r-- | src/trainMSVMMaj.c | 55 | ||||
| -rw-r--r-- | src/util.c | 178 |
6 files changed, 300 insertions, 34 deletions
@@ -2,7 +2,7 @@ VERSION=0.1 CC=gcc CFLAGS=-Wall -O2 -DVERSION=$(VERSION) -g INCLUDE= -Iinclude/ -EXECS=trainMSVMMaj +EXECS=trainMSVMMaj predMSVMMaj .PHONY: all clean tar @@ -13,6 +13,9 @@ override LDFLAGS+=-lblas -llapack -lm trainMSVMMaj: src/trainMSVMMaj.c src/libMSVMMaj.o src/util.o $(CC) -o trainMSVMMaj src/trainMSVMMaj.c src/libMSVMMaj.o src/util.o $(CFLAGS) $(INCLUDE) $(LDFLAGS) +predMSVMMaj: src/predMSVMMaj.c src/libMSVMMaj.o src/util.o + $(CC) -o predMSVMMaj src/predMSVMMaj.c src/libMSVMMaj.o src/util.o $(CFLAGS) $(INCLUDE) $(LDFLAGS) + src/libMSVMMaj.o: $(CC) -c -o src/libMSVMMaj.o src/libMSVMMaj.c $(CFLAGS) $(INCLUDE) diff --git a/include/libMSVMMaj.h b/include/libMSVMMaj.h index c886ded..6db1253 100644 --- a/include/libMSVMMaj.h +++ b/include/libMSVMMaj.h @@ -1,7 +1,6 @@ #include <stdio.h> #include <stdlib.h> #include <math.h> -#include <time.h> #include <cblas.h> #include <string.h> #include "util.h" @@ -24,3 +23,7 @@ void main_loop(struct Model *model, struct Data *data); int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, int LDB); void initialize_weights(struct Data *data, struct Model *model); + +void predict_labels(struct Data *data, struct Model *model, long *predy); +double prediction_perf(struct Data *data, long *predy); + diff --git a/include/util.h b/include/util.h index 0b5009e..ec415ac 100644 --- a/include/util.h +++ b/include/util.h @@ -3,6 +3,7 @@ #include <stdlib.h> #include <math.h> #include <string.h> +#include <time.h> #include "MSVMMaj.h" #define Calloc(type, n) (type *)calloc((n), sizeof(type)) @@ -11,7 +12,12 @@ #define maximum(a, b) a > b ? a : b #define minimum(a, b) a < b ? 
a : b -void read_data(struct Data *dataset, struct Model *model, char *data_file); +void read_data(struct Data *dataset, char *data_file); + +void read_model(struct Model *model, char *model_filename); +void write_model(struct Model *model, char *output_filename); + +void write_predictions(struct Data *data, long *predy, char *output_filename); int check_argv(int argc, char **argv, char *str); int check_argv_eq(int argc, char **argv, char *str); @@ -34,3 +40,4 @@ void free_model(struct Model *model); void free_data(struct Data *data); void print_matrix(double *M, long rows, long cols); + diff --git a/src/libMSVMMaj.c b/src/libMSVMMaj.c index 04c5035..f5aeb4e 100644 --- a/src/libMSVMMaj.c +++ b/src/libMSVMMaj.c @@ -167,7 +167,6 @@ double get_msvmmaj_loss(struct Model *model, struct Data *data, double *ZV) return loss; } - /* Training loop is defined here. */ @@ -226,7 +225,7 @@ void main_loop(struct Model *model, struct Data *data) Lbar = L; L = get_msvmmaj_loss(model, data, ZV); - if (it%500 == 0) + if (it%100 == 0) info("iter = %li, L = %15.16f, Lbar = %15.16f, reldiff = %15.16f\n", it, L, Lbar, (Lbar - L)/L); it++; @@ -297,7 +296,7 @@ void msvmmaj_update(struct Model *model, struct Data *data, { // Because msvmmaj_update is always called after a call to // get_msvmmaj_loss with the latest V, it is unnecessary to recalculate - // the matrix ZV, the errors Q and the Huber errors H. Awesome! + // the matrix ZV, the errors Q, or the Huber errors H. Awesome! 
int status; long i, j, k; double Avalue, Bvalue; @@ -536,19 +535,16 @@ void msvmmaj_update(struct Model *model, struct Data *data, for (i=0; i<m+1; i++) { for (j=0; j<K-1; j++) { - value = matrix_get(model->Vbar, K-1, i, j); matrix_set(model->Vbar, K-1, i, j, matrix_get(model->V, K-1, i, j)); matrix_set(model->V, K-1, i, j, matrix_get(ZAZV, K-1, i, j)); - matrix_set(ZAZV, K-1, i, j, value); } } - } void initialize_weights(struct Data *data, struct Model *model) { - int *groups; + long *groups; long i; long n = model->n; @@ -559,7 +555,7 @@ void initialize_weights(struct Data *data, struct Model *model) model->rho[i] = 1.0; } else if (model->weight_idx == 2) { - groups = Calloc(int, K); + groups = Calloc(long, K); for (i=0; i<n; i++) { groups[data->y[i]-1]++; } @@ -572,3 +568,73 @@ void initialize_weights(struct Data *data, struct Model *model) } } + +void predict_labels(struct Data *data, struct Model *model, long *predy) +{ + long i, j, k, label; + double norm, min_dist; + + long n = data->n; // note that model->n is the size of the training sample. + long m = data->m; + long K = model->K; //data->K does not necessarily equal the original K. + + double *S = Calloc(double, K-1); + double *ZV = Calloc(double, n*(K-1)); + double *U = Calloc(double, K*(K-1)); + + // Get the simplex matrix + simplex_gen(K, U); + + // Generate the simplex-space vectors + cblas_dgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + K-1, + m+1, + 1.0, + data->Z, + m+1, + model->V, + K-1, + 0.0, + ZV, + K-1); + + // Calculate the distance to each of the vertices of the simplex. + // The closest vertex defines the class label. 
+ for (i=0; i<n; i++) { + label = 0; + min_dist = 1000000000.0; + for (j=0; j<K; j++) { + for (k=0; k<K-1; k++) { + S[k] = matrix_get(ZV, K-1, i, k) - matrix_get(U, K-1, j, k); + } + norm = cblas_dnrm2(K, S, 1); + if (norm < min_dist) { + label = j+1; + min_dist = norm; + } + } + predy[i] = label; + } + + free(ZV); + free(U); + free(S); +} + +double prediction_perf(struct Data *data, long *predy) +{ + long i, correct = 0; + double performance; + + for (i=0; i<data->n; i++) + if (data->y[i] == predy[i]) + correct++; + + performance = ((double) correct)/((double) data->n) * 100.0; + + return performance; +} diff --git a/src/trainMSVMMaj.c b/src/trainMSVMMaj.c index a16c008..32c36c4 100644 --- a/src/trainMSVMMaj.c +++ b/src/trainMSVMMaj.c @@ -3,20 +3,21 @@ #define MINARGS 2 void print_null(const char *s) {} - -void parse_command_line(int argc, char **argv, char *input_filename, struct Model *model); void exit_with_help(); +void parse_command_line(int argc, char **argv, struct Model *model, + char *input_filename, char *output_filename); void exit_with_help() { printf("This is MSVMMaj, version %1.1f\n\n", VERSION); - printf("Usage: trainMSVMMaj [options] training_data_file [output_file]\n"); + printf("Usage: trainMSVMMaj [options] training_data_file\n"); printf("Options:\n"); printf("-c folds : perform cross validation with given number of folds\n"); printf("-e epsilon : set the value of the stopping criterion\n"); printf("-h | -help : print this help.\n"); printf("-k kappa : set the value of kappa used in the Huber hinge\n"); printf("-l lambda : set the value of lambda (lambda > 0)\n"); + printf("-o output_file : write output to file\n"); printf("-p p-value : set the value of p in the lp norm (1.0 <= p <= 2.0)\n"); printf("-q : quiet mode (no output)\n"); printf("-r rho : choose the weigth specification (1 = unit, 2 = group)\n"); @@ -30,30 +31,46 @@ void exit_with_help() int main(int argc, char **argv) { char input_filename[MAX_LINE_LENGTH]; + char 
model_filename[MAX_LINE_LENGTH]; + struct Model *model = Malloc(struct Model, 1); struct Data *data = Malloc(struct Data, 1); - if (argc < MINARGS || check_argv(argc, argv, "-help") || check_argv_eq(argc, argv, "-h") ) { + if (argc < MINARGS || check_argv(argc, argv, "-help") || check_argv_eq(argc, argv, "-h") ) exit_with_help(); - } - parse_command_line(argc, argv, input_filename, model); + parse_command_line(argc, argv, model, input_filename, model_filename); - // read data and allocate all memory for the model - read_data(data, model, input_filename); + // read data file + read_data(data, input_filename); + + // copy dataset parameters to model + model->n = data->n; + model->m = data->m; + model->K = data->K; + model->data_file = input_filename; + + // allocate model and initialize weights allocate_model(model); initialize_weights(data, model); + // start training main_loop(model, data); + // write_model to file + if (check_argv_eq(argc, argv, "-o")) { + write_model(model, model_filename); + info("Output written to %s\n", model_filename); + } + // free model and data free_model(model); free_data(data); return 0; - } -void parse_command_line(int argc, char **argv, char *input_filename, struct Model *model) +void parse_command_line(int argc, char **argv, struct Model *model, + char *input_filename, char *output_filename) { int i; void (*print_func)(const char*) = NULL; @@ -72,24 +89,26 @@ void parse_command_line(int argc, char **argv, char *input_filename, struct Mode exit_with_help(); } switch (argv[i-1][1]) { - case 'p': - model->p = atof(argv[i]); - break; - case 'l': - model->lambda = atof(argv[i]); - break; case 'e': model->epsilon = atof(argv[i]); break; case 'k': model->kappa = atof(argv[i]); break; - case 'r': - model->weight_idx = atoi(argv[i]); + case 'l': + model->lambda = atof(argv[i]); + break; + case 'o': + strcpy(output_filename, argv[i]); + break; + case 'p': + model->p = atof(argv[i]); break; case 'q': print_func = &print_null; i--; + case 'r': + 
model->weight_idx = atoi(argv[i]); break; default: fprintf(stderr, "Unknown option: -%c\n", argv[i-1][1]); @@ -4,7 +4,7 @@ Read the data from the data_file. The data matrix X is augmented with a column of ones, to get the matrix Z. */ -void read_data(struct Data *dataset, struct Model *model, char *data_file) +void read_data(struct Data *dataset, char *data_file) { FILE *fid; long i, j; @@ -86,13 +86,181 @@ void read_data(struct Data *dataset, struct Model *model, char *data_file) dataset->m = m; dataset->K = K; - model->n = n; - model->m = m; - model->K = K; - info("Succesfully read data file: %s\n", data_file); } +void next_line(FILE *fid, char *filename) +{ + char buffer[MAX_LINE_LENGTH]; + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading file %s\n", filename); + exit(1); + } +} + +double get_fmt_double(FILE *fid, char *filename, const char *fmt) +{ + char buffer[MAX_LINE_LENGTH]; + double value; + + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading line from file %s\n", filename); + exit(1); + } + sscanf(buffer, fmt, &value); + + return value; +} + +long get_fmt_long(FILE *fid, char *filename, const char *fmt) +{ + char buffer[MAX_LINE_LENGTH]; + long value; + + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading line from file %s\n", filename); + exit(1); + } + sscanf(buffer, fmt, &value); + + return value; +} + + +void read_model(struct Model *model, char *model_filename) +{ + long i, j, nr = 0; + FILE *fid; + char buffer[MAX_LINE_LENGTH]; + char data_filename[MAX_LINE_LENGTH]; + double value = 0; + + fid = fopen(model_filename, "r"); + if (fid == NULL) { + fprintf(stderr, "Error opening model file %s\n", model_filename); + exit(1); + } + // skip the first four lines + for (i=0; i<4; i++) + next_line(fid, model_filename); + + // read all model variables + model->p = get_fmt_double(fid, model_filename, "p = %lf"); + model->lambda = get_fmt_double(fid, 
model_filename, "lambda = %lf"); + model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf"); + model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf"); + model->weight_idx = (int) get_fmt_long(fid, model_filename, "weight_idx = %li"); + + // skip to data section + for (i=0; i<2; i++) + next_line(fid, model_filename); + + // read filename of data file + if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) { + fprintf(stderr, "Error reading model file %s\n", model_filename); + exit(1); + } + sscanf(buffer, "filename = %s\n", data_filename); + model->data_file = data_filename; + + // read all data variables + model->n = get_fmt_long(fid, model_filename, "n = %li\n"); + model->m = get_fmt_long(fid, model_filename, "m = %li\n"); + model->K = get_fmt_long(fid, model_filename, "K = %li\n"); + + // skip to output + for (i=0; i<2; i++) + next_line(fid, model_filename); + + // read the matrix V and check for consistency + model->V = Malloc(double, (model->m+1)*(model->K-1)); + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + nr += fscanf(fid, "%lf ", &value); + matrix_set(model->V, model->K-1, i, j, value); + } + } + if (nr != (model->m+1)*(model->K-1)) { + fprintf(stderr, "Error reading model file %s. 
" + "Not enough elements of V found.\n", model_filename); + exit(1); + } + +} + + + +void write_model(struct Model *model, char *output_filename) +{ + FILE *fid; + int i, j, diff, hours, minutes; + char timestr[1000]; + time_t current_time, lt, gt; + struct tm *lclt; + + // open output file + fid = fopen(output_filename, "w"); + if (fid == NULL) { + fprintf(stderr, "Error opening output file %s", output_filename); + exit(1); + } + + // get current time (in epoch) + current_time = time(NULL); + if (current_time == ((time_t)-1)) { + fprintf(stderr, "Failed to compute the current time.\n"); + exit(1); + } + + // convert time to local time and create a string + lclt = localtime(&current_time); + strftime(timestr, 1000, "%c", lclt); + if (timestr == NULL) { + fprintf(stderr, "Failed to convert time to string.\n"); + exit(1); + } + + // calculate the difference from UTC including DST + lt = mktime(localtime(&current_time)); + gt = mktime(gmtime(&current_time)); + diff = -difftime(gt, lt); + hours = (diff/3600); + minutes = (diff%3600)/60; + if (lclt->tm_isdst == 1) + hours++; + + // Write output to file + fprintf(fid, "Output file for MSVMMaj (version %1.1f)\n", VERSION); + fprintf(fid, "Generated on: %s (UTC %+03i:%02i)\n\n", timestr, hours, minutes); + fprintf(fid, "Model:\n"); + fprintf(fid, "p = %15.16f\n", model->p); + fprintf(fid, "lambda = %15.16f\n", model->lambda); + fprintf(fid, "kappa = %15.16f\n", model->kappa); + fprintf(fid, "epsilon = %g\n", model->epsilon); + fprintf(fid, "weight_idx = %i\n", model->weight_idx); + fprintf(fid, "\n"); + fprintf(fid, "Data:\n"); + fprintf(fid, "filename = %s\n", model->data_file); + fprintf(fid, "n = %li\n", model->n); + fprintf(fid, "m = %li\n", model->m); + fprintf(fid, "K = %li\n", model->K); + fprintf(fid, "\n"); + fprintf(fid, "Output:\n"); + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + fprintf(fid, "%+15.16f ", matrix_get(model->V, model->K-1, i, j)); + } + fprintf(fid, "\n"); + } + + fclose(fid); + +} + +void 
write_predictions(struct Data *data, long *predy, char *output_filename) +{ +} + int check_argv(int argc, char **argv, char *str) { int i; |
