aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile5
-rw-r--r--include/libMSVMMaj.h5
-rw-r--r--include/util.h9
-rw-r--r--src/libMSVMMaj.c82
-rw-r--r--src/trainMSVMMaj.c55
-rw-r--r--src/util.c178
6 files changed, 300 insertions, 34 deletions
diff --git a/Makefile b/Makefile
index f400f2c..f2a6f60 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ VERSION=0.1
CC=gcc
CFLAGS=-Wall -O2 -DVERSION=$(VERSION) -g
INCLUDE= -Iinclude/
-EXECS=trainMSVMMaj
+EXECS=trainMSVMMaj predMSVMMaj
.PHONY: all clean tar
@@ -13,6 +13,9 @@ override LDFLAGS+=-lblas -llapack -lm
trainMSVMMaj: src/trainMSVMMaj.c src/libMSVMMaj.o src/util.o
$(CC) -o trainMSVMMaj src/trainMSVMMaj.c src/libMSVMMaj.o src/util.o $(CFLAGS) $(INCLUDE) $(LDFLAGS)
+predMSVMMaj: src/predMSVMMaj.c src/libMSVMMaj.o src/util.o
+ $(CC) -o predMSVMMaj src/predMSVMMaj.c src/libMSVMMaj.o src/util.o $(CFLAGS) $(INCLUDE) $(LDFLAGS)
+
src/libMSVMMaj.o:
$(CC) -c -o src/libMSVMMaj.o src/libMSVMMaj.c $(CFLAGS) $(INCLUDE)
diff --git a/include/libMSVMMaj.h b/include/libMSVMMaj.h
index c886ded..6db1253 100644
--- a/include/libMSVMMaj.h
+++ b/include/libMSVMMaj.h
@@ -1,7 +1,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
-#include <time.h>
#include <cblas.h>
#include <string.h>
#include "util.h"
@@ -24,3 +23,7 @@ void main_loop(struct Model *model, struct Data *data);
int dposv(char UPLO, int N, int NRHS, double *A, int LDA, double *B, int LDB);
void initialize_weights(struct Data *data, struct Model *model);
+
+void predict_labels(struct Data *data, struct Model *model, long *predy);
+double prediction_perf(struct Data *data, long *predy);
+
diff --git a/include/util.h b/include/util.h
index 0b5009e..ec415ac 100644
--- a/include/util.h
+++ b/include/util.h
@@ -3,6 +3,7 @@
#include <stdlib.h>
#include <math.h>
#include <string.h>
+#include <time.h>
#include "MSVMMaj.h"
#define Calloc(type, n) (type *)calloc((n), sizeof(type))
@@ -11,7 +12,12 @@
#define maximum(a, b) a > b ? a : b
#define minimum(a, b) a < b ? a : b
-void read_data(struct Data *dataset, struct Model *model, char *data_file);
+void read_data(struct Data *dataset, char *data_file);
+
+void read_model(struct Model *model, char *model_filename);
+void write_model(struct Model *model, char *output_filename);
+
+void write_predictions(struct Data *data, long *predy, char *output_filename);
int check_argv(int argc, char **argv, char *str);
int check_argv_eq(int argc, char **argv, char *str);
@@ -34,3 +40,4 @@ void free_model(struct Model *model);
void free_data(struct Data *data);
void print_matrix(double *M, long rows, long cols);
+
diff --git a/src/libMSVMMaj.c b/src/libMSVMMaj.c
index 04c5035..f5aeb4e 100644
--- a/src/libMSVMMaj.c
+++ b/src/libMSVMMaj.c
@@ -167,7 +167,6 @@ double get_msvmmaj_loss(struct Model *model, struct Data *data, double *ZV)
return loss;
}
-
/*
Training loop is defined here.
*/
@@ -226,7 +225,7 @@ void main_loop(struct Model *model, struct Data *data)
Lbar = L;
L = get_msvmmaj_loss(model, data, ZV);
- if (it%500 == 0)
+ if (it%100 == 0)
info("iter = %li, L = %15.16f, Lbar = %15.16f, reldiff = %15.16f\n",
it, L, Lbar, (Lbar - L)/L);
it++;
@@ -297,7 +296,7 @@ void msvmmaj_update(struct Model *model, struct Data *data,
{
// Because msvmmaj_update is always called after a call to
// get_msvmmaj_loss with the latest V, it is unnecessary to recalculate
- // the matrix ZV, the errors Q and the Huber errors H. Awesome!
+ // the matrix ZV, the errors Q, or the Huber errors H. Awesome!
int status;
long i, j, k;
double Avalue, Bvalue;
@@ -536,19 +535,16 @@ void msvmmaj_update(struct Model *model, struct Data *data,
for (i=0; i<m+1; i++) {
for (j=0; j<K-1; j++) {
- value = matrix_get(model->Vbar, K-1, i, j);
matrix_set(model->Vbar, K-1, i, j, matrix_get(model->V, K-1, i, j));
matrix_set(model->V, K-1, i, j, matrix_get(ZAZV, K-1, i, j));
- matrix_set(ZAZV, K-1, i, j, value);
}
}
-
}
void initialize_weights(struct Data *data, struct Model *model)
{
- int *groups;
+ long *groups;
long i;
long n = model->n;
@@ -559,7 +555,7 @@ void initialize_weights(struct Data *data, struct Model *model)
model->rho[i] = 1.0;
}
else if (model->weight_idx == 2) {
- groups = Calloc(int, K);
+ groups = Calloc(long, K);
for (i=0; i<n; i++) {
groups[data->y[i]-1]++;
}
@@ -572,3 +568,73 @@ void initialize_weights(struct Data *data, struct Model *model)
}
}
+
+/*
+	Predict a class label for every instance in data using the weight
+	matrix model->V.
+
+	The augmented data matrix data->Z (read as n x (m+1), row-major) is
+	multiplied by model->V (read as (m+1) x (K-1)) to obtain one point in
+	(K-1)-dimensional space per instance. Each point is assigned the
+	1-based index of the nearest row of the K x (K-1) matrix filled in by
+	simplex_gen.
+
+	data   dataset to predict; supplies n, m and Z
+	model  trained model; supplies K and V
+	predy  output array with room for at least data->n labels
+
+	A label of 0 is stored if no vertex is closer than the initial
+	min_dist sentinel (for instance when the distances are NaN).
+*/
+void predict_labels(struct Data *data, struct Model *model, long *predy)
+{
+	long i, j, k, label;
+	double norm, min_dist;
+
+	long n = data->n; // note that model->n is the size of the training sample.
+	long m = data->m;
+	long K = model->K; //data->K does not necessarily equal the original K.
+
+	double *S = Calloc(double, K-1);
+	double *ZV = Calloc(double, n*(K-1));
+	double *U = Calloc(double, K*(K-1));
+
+	// Get the simplex matrix
+	simplex_gen(K, U);
+
+	// Generate the simplex-space vectors: ZV = Z * V
+	cblas_dgemm(
+			CblasRowMajor,
+			CblasNoTrans,
+			CblasNoTrans,
+			n,
+			K-1,
+			m+1,
+			1.0,
+			data->Z,
+			m+1,
+			model->V,
+			K-1,
+			0.0,
+			ZV,
+			K-1);
+
+	// Calculate the distance to each of the vertices of the simplex.
+	// The closest vertex defines the class label.
+	for (i=0; i<n; i++) {
+		label = 0;
+		min_dist = 1000000000.0;
+		for (j=0; j<K; j++) {
+			for (k=0; k<K-1; k++) {
+				S[k] = matrix_get(ZV, K-1, i, k) - matrix_get(U, K-1, j, k);
+			}
+			// S holds K-1 elements; passing K here read one double
+			// past the end of the allocation and polluted the norm.
+			norm = cblas_dnrm2(K-1, S, 1);
+			if (norm < min_dist) {
+				label = j+1;
+				min_dist = norm;
+			}
+		}
+		predy[i] = label;
+	}
+
+	free(ZV);
+	free(U);
+	free(S);
+}
+
+/*
+	Return the percentage (on a 0-100 scale) of the data->n instances
+	whose entry in predy equals the corresponding label in data->y.
+	No guard is applied for data->n == 0.
+*/
+double prediction_perf(struct Data *data, long *predy)
+{
+	long idx;
+	long hits = 0;
+
+	for (idx = 0; idx < data->n; idx++) {
+		if (predy[idx] == data->y[idx]) {
+			hits++;
+		}
+	}
+
+	return ((double) hits)/((double) data->n) * 100.0;
+}
diff --git a/src/trainMSVMMaj.c b/src/trainMSVMMaj.c
index a16c008..32c36c4 100644
--- a/src/trainMSVMMaj.c
+++ b/src/trainMSVMMaj.c
@@ -3,20 +3,21 @@
#define MINARGS 2
void print_null(const char *s) {}
-
-void parse_command_line(int argc, char **argv, char *input_filename, struct Model *model);
void exit_with_help();
+void parse_command_line(int argc, char **argv, struct Model *model,
+ char *input_filename, char *output_filename);
void exit_with_help()
{
printf("This is MSVMMaj, version %1.1f\n\n", VERSION);
- printf("Usage: trainMSVMMaj [options] training_data_file [output_file]\n");
+ printf("Usage: trainMSVMMaj [options] training_data_file\n");
printf("Options:\n");
printf("-c folds : perform cross validation with given number of folds\n");
printf("-e epsilon : set the value of the stopping criterion\n");
printf("-h | -help : print this help.\n");
printf("-k kappa : set the value of kappa used in the Huber hinge\n");
printf("-l lambda : set the value of lambda (lambda > 0)\n");
+ printf("-o output_file : write output to file\n");
printf("-p p-value : set the value of p in the lp norm (1.0 <= p <= 2.0)\n");
printf("-q : quiet mode (no output)\n");
printf("-r rho : choose the weigth specification (1 = unit, 2 = group)\n");
@@ -30,30 +31,46 @@ void exit_with_help()
int main(int argc, char **argv)
{
char input_filename[MAX_LINE_LENGTH];
+ char model_filename[MAX_LINE_LENGTH];
+
struct Model *model = Malloc(struct Model, 1);
struct Data *data = Malloc(struct Data, 1);
- if (argc < MINARGS || check_argv(argc, argv, "-help") || check_argv_eq(argc, argv, "-h") ) {
+ if (argc < MINARGS || check_argv(argc, argv, "-help") || check_argv_eq(argc, argv, "-h") )
exit_with_help();
- }
- parse_command_line(argc, argv, input_filename, model);
+ parse_command_line(argc, argv, model, input_filename, model_filename);
- // read data and allocate all memory for the model
- read_data(data, model, input_filename);
+ // read data file
+ read_data(data, input_filename);
+
+ // copy dataset parameters to model
+ model->n = data->n;
+ model->m = data->m;
+ model->K = data->K;
+ model->data_file = input_filename;
+
+ // allocate model and initialize weights
allocate_model(model);
initialize_weights(data, model);
+ // start training
main_loop(model, data);
+ // write_model to file
+ if (check_argv_eq(argc, argv, "-o")) {
+ write_model(model, model_filename);
+ info("Output written to %s\n", model_filename);
+ }
+
// free model and data
free_model(model);
free_data(data);
return 0;
-
}
-void parse_command_line(int argc, char **argv, char *input_filename, struct Model *model)
+void parse_command_line(int argc, char **argv, struct Model *model,
+ char *input_filename, char *output_filename)
{
int i;
void (*print_func)(const char*) = NULL;
@@ -72,24 +89,26 @@ void parse_command_line(int argc, char **argv, char *input_filename, struct Mode
exit_with_help();
}
switch (argv[i-1][1]) {
- case 'p':
- model->p = atof(argv[i]);
- break;
- case 'l':
- model->lambda = atof(argv[i]);
- break;
case 'e':
model->epsilon = atof(argv[i]);
break;
case 'k':
model->kappa = atof(argv[i]);
break;
- case 'r':
- model->weight_idx = atoi(argv[i]);
+ case 'l':
+ model->lambda = atof(argv[i]);
+ break;
+ case 'o':
+ strcpy(output_filename, argv[i]);
+ break;
+ case 'p':
+ model->p = atof(argv[i]);
break;
case 'q':
print_func = &print_null;
i--;
+ // -q must end here: falling through into -r would overwrite
+ // model->weight_idx with atoi() of the current argument.
+ break;
+ case 'r':
+ model->weight_idx = atoi(argv[i]);
break;
default:
fprintf(stderr, "Unknown option: -%c\n", argv[i-1][1]);
diff --git a/src/util.c b/src/util.c
index 735c4be..ba48212 100644
--- a/src/util.c
+++ b/src/util.c
@@ -4,7 +4,7 @@
Read the data from the data_file. The data matrix X is augmented
with a column of ones, to get the matrix Z.
*/
-void read_data(struct Data *dataset, struct Model *model, char *data_file)
+void read_data(struct Data *dataset, char *data_file)
{
FILE *fid;
long i, j;
@@ -86,13 +86,181 @@ void read_data(struct Data *dataset, struct Model *model, char *data_file)
dataset->m = m;
dataset->K = K;
- model->n = n;
- model->m = m;
- model->K = K;
-
info("Succesfully read data file: %s\n", data_file);
}
+/*
+	Read and discard one fgets() chunk from fid, i.e. a line of up to
+	MAX_LINE_LENGTH-1 characters. On a read failure or end of file an
+	error mentioning filename is printed and the program exits.
+*/
+void next_line(FILE *fid, char *filename)
+{
+	char discard[MAX_LINE_LENGTH];
+
+	if (fgets(discard, MAX_LINE_LENGTH, fid) != NULL)
+		return;
+
+	fprintf(stderr, "Error reading file %s\n", filename);
+	exit(1);
+}
+
+/*
+	Read the next line from fid and extract one double from it with the
+	sscanf format fmt, which must contain exactly one %lf conversion.
+
+	fid       open input stream
+	filename  name of the stream, used only in error messages
+	fmt       sscanf format describing the expected line
+
+	Returns the parsed value. If the line cannot be read, or does not
+	match fmt, an error is printed and the program exits.
+*/
+double get_fmt_double(FILE *fid, char *filename, const char *fmt)
+{
+	char buffer[MAX_LINE_LENGTH];
+	double value = 0.0;
+
+	if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) {
+		fprintf(stderr, "Error reading line from file %s\n", filename);
+		exit(1);
+	}
+	// sscanf returns the number of conversions stored; anything other
+	// than 1 means value was never written.
+	if (sscanf(buffer, fmt, &value) != 1) {
+		fprintf(stderr, "Error parsing line from file %s\n", filename);
+		exit(1);
+	}
+
+	return value;
+}
+
+/*
+	Read the next line from fid and extract one long from it with the
+	sscanf format fmt, which must contain exactly one %li conversion.
+
+	fid       open input stream
+	filename  name of the stream, used only in error messages
+	fmt       sscanf format describing the expected line
+
+	Returns the parsed value. If the line cannot be read, or does not
+	match fmt, an error is printed and the program exits.
+*/
+long get_fmt_long(FILE *fid, char *filename, const char *fmt)
+{
+	char buffer[MAX_LINE_LENGTH];
+	long value = 0;
+
+	if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) {
+		fprintf(stderr, "Error reading line from file %s\n", filename);
+		exit(1);
+	}
+	// sscanf returns the number of conversions stored; anything other
+	// than 1 means value was never written.
+	if (sscanf(buffer, fmt, &value) != 1) {
+		fprintf(stderr, "Error parsing line from file %s\n", filename);
+		exit(1);
+	}
+
+	return value;
+}
+
+
+/*
+	Load a model from model_filename. The expected layout is the one
+	produced by write_model: four header lines, the model parameters
+	(p, lambda, kappa, epsilon, weight_idx), a blank line and a section
+	title, the data description (filename, n, m, K), another blank line
+	and section title, and finally the (m+1) x (K-1) matrix V.
+
+	model           structure to fill; model->V and model->data_file are
+	                allocated here
+	model_filename  path of the model file
+
+	model->data_file receives a heap-allocated copy of the file name.
+	NOTE(review): confirm whether free_model releases data_file; the
+	training program points it at a stack buffer instead.
+	File names containing whitespace are cut at the first blank because
+	they are parsed with %s.
+
+	Any read or parse error prints a message and exits.
+*/
+void read_model(struct Model *model, char *model_filename)
+{
+	long i, j;
+	FILE *fid;
+	char buffer[MAX_LINE_LENGTH];
+	char data_filename[MAX_LINE_LENGTH];
+	char *name_copy;
+	double value = 0;
+
+	fid = fopen(model_filename, "r");
+	if (fid == NULL) {
+		fprintf(stderr, "Error opening model file %s\n", model_filename);
+		exit(1);
+	}
+	// skip the first four lines
+	for (i=0; i<4; i++)
+		next_line(fid, model_filename);
+
+	// read all model variables
+	model->p = get_fmt_double(fid, model_filename, "p = %lf");
+	model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf");
+	model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf");
+	model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf");
+	model->weight_idx = (int) get_fmt_long(fid, model_filename, "weight_idx = %li");
+
+	// skip to data section
+	for (i=0; i<2; i++)
+		next_line(fid, model_filename);
+
+	// read filename of data file
+	if (fgets(buffer, MAX_LINE_LENGTH, fid) == NULL) {
+		fprintf(stderr, "Error reading model file %s\n", model_filename);
+		exit(1);
+	}
+	if (sscanf(buffer, "filename = %s\n", data_filename) != 1) {
+		fprintf(stderr, "Error reading model file %s\n", model_filename);
+		exit(1);
+	}
+	// data_filename lives on this function's stack, so the model must
+	// own a copy; storing the address of the local would dangle on return.
+	name_copy = Malloc(char, strlen(data_filename)+1);
+	if (name_copy == NULL) {
+		fprintf(stderr, "Out of memory reading model file %s\n", model_filename);
+		exit(1);
+	}
+	strcpy(name_copy, data_filename);
+	model->data_file = name_copy;
+
+	// read all data variables
+	model->n = get_fmt_long(fid, model_filename, "n = %li\n");
+	model->m = get_fmt_long(fid, model_filename, "m = %li\n");
+	model->K = get_fmt_long(fid, model_filename, "K = %li\n");
+
+	// the sizes below are derived from m and K, so reject nonsense early
+	if (model->m < 0 || model->K < 2) {
+		fprintf(stderr, "Error reading model file %s. "
+				"Invalid dimensions found.\n", model_filename);
+		exit(1);
+	}
+
+	// skip to output
+	for (i=0; i<2; i++)
+		next_line(fid, model_filename);
+
+	// read the matrix V and check for consistency
+	model->V = Malloc(double, (model->m+1)*(model->K-1));
+	if (model->V == NULL) {
+		fprintf(stderr, "Out of memory reading model file %s\n", model_filename);
+		exit(1);
+	}
+	for (i=0; i<model->m+1; i++) {
+		for (j=0; j<model->K-1; j++) {
+			// fscanf yields EOF (-1) at end of input, so every
+			// element is checked instead of summing return values.
+			if (fscanf(fid, "%lf ", &value) != 1) {
+				fprintf(stderr, "Error reading model file %s. "
+						"Not enough elements of V found.\n", model_filename);
+				exit(1);
+			}
+			matrix_set(model->V, model->K-1, i, j, value);
+		}
+	}
+
+	fclose(fid);
+}
+
+
+
+/*
+	Write a model to output_filename as text: a two-line header (program
+	version, local generation time with its UTC offset), a "Model:"
+	section with p, lambda, kappa, epsilon and weight_idx, a "Data:"
+	section with the data file name, n, m and K, and an "Output:" section
+	holding the (m+1) x (K-1) matrix V, one row per line.
+
+	model            model to write
+	output_filename  path of the file to create or overwrite
+
+	Any failure (opening or closing the file, obtaining or formatting the
+	time) prints a message and exits.
+*/
+void write_model(struct Model *model, char *output_filename)
+{
+	FILE *fid;
+	long i, j;
+	long offset;
+	int is_dst, hours, minutes;
+	char sign;
+	char timestr[1000];
+	time_t current_time, lt, gt;
+	struct tm *tmp;
+	struct tm local_tm, utc_tm;
+
+	// open output file
+	fid = fopen(output_filename, "w");
+	if (fid == NULL) {
+		fprintf(stderr, "Error opening output file %s\n", output_filename);
+		exit(1);
+	}
+
+	// get current time (in epoch)
+	current_time = time(NULL);
+	if (current_time == ((time_t)-1)) {
+		fprintf(stderr, "Failed to compute the current time.\n");
+		exit(1);
+	}
+
+	// localtime() and gmtime() may return the same static object, so
+	// each result is copied by value before the next call is made.
+	tmp = localtime(&current_time);
+	if (tmp == NULL) {
+		fprintf(stderr, "Failed to compute the current time.\n");
+		exit(1);
+	}
+	local_tm = *tmp;
+	is_dst = local_tm.tm_isdst;
+
+	tmp = gmtime(&current_time);
+	if (tmp == NULL) {
+		fprintf(stderr, "Failed to compute the current time.\n");
+		exit(1);
+	}
+	utc_tm = *tmp;
+
+	// create a string from the local time; strftime returns 0 when the
+	// result does not fit in the buffer
+	if (strftime(timestr, sizeof(timestr), "%c", &local_tm) == 0) {
+		fprintf(stderr, "Failed to convert time to string.\n");
+		exit(1);
+	}
+
+	// calculate the difference from UTC including DST: the UTC fields
+	// are interpreted by mktime as local standard time (tm_isdst is 0
+	// after gmtime), which gives the standard offset; the DST hour is
+	// added separately.
+	lt = mktime(&local_tm);
+	gt = mktime(&utc_tm);
+	offset = (long) difftime(lt, gt);
+	if (is_dst > 0)
+		offset += 3600;
+
+	// split into sign and magnitude so that negative offsets with a
+	// minutes part are not printed with a second minus sign
+	sign = (offset < 0) ? '-' : '+';
+	if (offset < 0)
+		offset = -offset;
+	hours = (int) (offset/3600);
+	minutes = (int) ((offset%3600)/60);
+
+	// Write output to file
+	fprintf(fid, "Output file for MSVMMaj (version %1.1f)\n", VERSION);
+	fprintf(fid, "Generated on: %s (UTC %c%02i:%02i)\n\n", timestr, sign, hours, minutes);
+	fprintf(fid, "Model:\n");
+	fprintf(fid, "p = %15.16f\n", model->p);
+	fprintf(fid, "lambda = %15.16f\n", model->lambda);
+	fprintf(fid, "kappa = %15.16f\n", model->kappa);
+	fprintf(fid, "epsilon = %g\n", model->epsilon);
+	fprintf(fid, "weight_idx = %i\n", model->weight_idx);
+	fprintf(fid, "\n");
+	fprintf(fid, "Data:\n");
+	fprintf(fid, "filename = %s\n", model->data_file);
+	fprintf(fid, "n = %li\n", model->n);
+	fprintf(fid, "m = %li\n", model->m);
+	fprintf(fid, "K = %li\n", model->K);
+	fprintf(fid, "\n");
+	fprintf(fid, "Output:\n");
+	for (i=0; i<model->m+1; i++) {
+		for (j=0; j<model->K-1; j++) {
+			fprintf(fid, "%+15.16f ", matrix_get(model->V, model->K-1, i, j));
+		}
+		fprintf(fid, "\n");
+	}
+
+	// fclose flushes buffered output, so a failure here means the
+	// model file may be incomplete
+	if (fclose(fid) != 0) {
+		fprintf(stderr, "Error writing output file %s\n", output_filename);
+		exit(1);
+	}
+}
+
+/*
+	Unimplemented stub: the body is empty, so calling this function has
+	no effect and nothing is written to output_filename.
+	NOTE(review): presumably meant to write the labels in predy for the
+	instances in data to output_filename -- confirm the intended format.
+*/
+void write_predictions(struct Data *data, long *predy, char *output_filename)
+{
+}
+
int check_argv(int argc, char **argv, char *str)
{
int i;