Implement fitting and prediction

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2018-02-23 17:10:16 +0000
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2018-02-23 17:10:16 +0000
commit: bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468 (patch)
tree: dcf17ae3e5fef6cee367f0da985bb4894495aee9
parent: update gensvm C library (diff)
download: rgensvm-bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468.tar.gz
rgensvm-bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468.zip
5 files changed, 380 insertions, 27 deletions
diff --git a/R/gensvm.R b/R/gensvm.R
index 1923f06..4a4ab6b 100644
--- a/R/gensvm.R
+++ b/R/gensvm.R
@@ -56,9 +56,9 @@
 #' \code{\link{plot}}, and \code{\link{gensvm.grid}}.
 #'
 #' @export
+#' @useDynLib gensvm_wrapper, .registration = TRUE
 #'
 #' @examples
-#' X <- 
 #'
 gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, 
                    weights='unit', kernel='linear', gamma='auto', coef=0.0, 
@@ -67,10 +67,10 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6,
 {
     call <- match.call()
 
-
-    # TODO: generate the random.seed value in R if it is NULL. Then you can 
-    # return it and people can still reproduce even if they forgot to set it 
-    # explicitly.
+    # Generate the random.seed value in R if it is NULL. This way users can 
+    # reproduce the run because it is returned in the output object.
+    if (is.null(random.seed))
+        random.seed <- runif(1) * (2**31 - 1)
 
     # TODO: Store a labelencoder in the object, preferably as a partially 
     # hidden item. This can then be used with prediction.
@@ -79,9 +79,13 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6,
     n.features <- ncol(X)
     n.classes <- length(unique(y))
 
-
     # Convert labels to integers
-    y.clean <- label.encode(y)
+    classes <- sort(unique(y))
+    y.clean <- match(y, classes)
+
+    # Convert gamma if it is 'auto'
+    if (gamma == 'auto')
+        gamma <- 1.0/n.features
 
     # Convert weights to index
     weight.idx <- which(c("unit", "group") == weights)
@@ -90,39 +94,43 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6,
              "Valid options are 'unit' and 'group'")
     }
 
-    # Convert kernel to index
-    kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel)
+    # Convert kernel to index (remember off-by-one for R vs. C)
+    kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel) - 1
     if (length(kernel.idx) == 0) {
         stop("Incorrect kernel specification. ",
              "Valid options are 'linear', 'poly', 'rbf', and 'sigmoid'")
     }
 
-
     out <- .Call("R_gensvm_train",
-                  as.matrix(t(X)),
+                  as.matrix(X),
                   as.integer(y.clean),
                   p,
                   lambda,
                   kappa,
                   epsilon,
                   weight.idx,
-                  kernel.idx,
+                  as.integer(kernel.idx),
                   gamma,
                   coef,
                   degree,
                   kernel.eigen.cutoff,
-                  verbose,
-                  max.iter,
-                  random.seed,
-                  seed.V)
-
+                  as.integer(verbose),
+                  as.integer(max.iter),
+                  as.integer(random.seed),
+                  seed.V,
+                  as.integer(n.objects),
+                  as.integer(n.features),
+                  as.integer(n.classes))
 
-    object <- list(call = call, lambda = lambda, kappa = kappa, 
+    object <- list(call = call, p = p, lambda = lambda, kappa = kappa, 
                    epsilon = epsilon, weights = weights, kernel = kernel, 
                    gamma = gamma, coef = coef, degree = degree, 
                    kernel.eigen.cutoff = kernel.eigen.cutoff, 
                    random.seed = random.seed, max.iter = max.iter, 
-                   V = out$V, n.iter = out$n.iter, n.support = out$n.support)
+                   n.objects = n.objects, n.features = n.features, 
+                   n.classes = n.classes, classes = classes, V = out$V, 
+                   n.iter = out$n.iter, n.support = out$n.support)
     class(object) <- "gensvm"
+
     return(object)
 }
diff --git a/R/predict.gensvm.R b/R/predict.gensvm.R
index 6cc8851..5c8f2e7 100644
--- a/R/predict.gensvm.R
+++ b/R/predict.gensvm.R
@@ -3,7 +3,7 @@
 #' @description This function predicts the class labels of new data using a 
 #' fitted GenSVM model.
 #'
-#' @param object Fitted \code{gensvm} object
+#' @param fit Fitted \code{gensvm} object
 #' @param newx Matrix of new values for \code{x} for which predictions need to 
 #' be made.
 #' @param \dots further arguments are ignored
@@ -27,10 +27,27 @@
 #'
 #'
 #'
-predict.gensvm <- function(object, newx, ...)
+predict.gensvm <- function(fit, newx, ...)
 {
-    # TODO: C library fitting prediction here (or not? with the column-major 
-    # order it may be faster to do it directly in R)
+    ## Implementation note:
+    ## - It might seem that it would be faster to do the prediction directly in 
+    ## R here, since we then don't need to switch to C, construct model and 
+    ## data structures, copy the data, etc. before doing the prediction.  
+    ## However, if you actually implement it and compare, we find that calling 
+    ## the C implementation is *much* faster than doing it in R.
+
+    # Sanity check
+    if (ncol(newx) != fit$n.features)
+        stop("Number of features of fitted model and supplied data disagree.")
+
+    y.pred.c <- .Call("R_gensvm_predict",
+                      as.matrix(newx),
+                      as.matrix(fit$V),
+                      as.integer(nrow(newx)),
+                      as.integer(ncol(newx)),
+                      as.integer(fit$n.classes)
+                      )
+    yhat <- fit$classes[y.pred.c]
 
     return(yhat)
 }
diff --git a/R/print.gensvm.R b/R/print.gensvm.R
index 8d17b0c..06a3649 100644
--- a/R/print.gensvm.R
+++ b/R/print.gensvm.R
@@ -26,9 +26,31 @@ print.gensvm <- function(object, ...)
 {
     cat("\nCall:\n")
     dput(object$call)
+    cat("\nData:\n")
+    cat("\tn.objects:", object$n.objects, "\n")
+    cat("\tn.features:", object$n.features, "\n")
+    cat("\tn.classes:", object$n.classes, "\n")
+    cat("\tclasses:", object$classes, "\n")
+    cat("Parameters:\n")
+    cat("\tp:", object$p, "\n")
+    cat("\tlambda:", object$lambda, "\n")
+    cat("\tkappa:", object$kappa, "\n")
+    cat("\tepsilon:", object$epsilon, "\n")
+    cat("\tweights:", object$weights, "\n")
+    cat("\tmax.iter:", object$max.iter, "\n")
+    cat("\trandom.seed:", object$random.seed, "\n")
+    cat("\tkernel:", object$kernel, "\n")
+    if (object$kernel %in% c("poly", "rbf", "sigmoid")) {
+        cat("\tkernel.eigen.cutoff:", object$kernel.eigen.cutoff, "\n")
+        cat("\tgamma:", object$gamma, "\n")
+    }
+    if (object$kernel %in% c("poly", "sigmoid"))
+        cat("\tcoef:", object$coef, "\n")
+    if (object$kernel == 'poly')
+        cat("\tdegree:", object$degree, "\n")
+    cat("Results:\n")
+    cat("\tn.iter:", object$n.iter, "\n")
+    cat("\tn.support:", object$n.support, "\n")
 
-    # TODO: fill this out
-    #
-    #
     invisible(object)
 }
diff --git a/R/util.labelencoder.R b/R/util.labelencoder.R
deleted file mode 100644
index 8b13789..0000000
--- a/R/util.labelencoder.R
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/src/gensvm_wrapper.c b/src/gensvm_wrapper.c
new file mode 100644
index 0000000..c3f0f7f
--- /dev/null
+++ b/src/gensvm_wrapper.c
@@ -0,0 +1,307 @@
+
+#define STRICT_R_HEADERS
+
+#include <R.h>
+#include <Rinternals.h>
+#include <R_ext/Print.h>
+
+#include "gensvm_debug.h"
+#include "gensvm_print.h"
+#include "gensvm_train.h"
+#include "gensvm_predict.h"
+
+// forward declarations
+
+SEXP R_gensvm_train( SEXP R_X, SEXP R_y, SEXP R_p, SEXP R_lambda, 
+		SEXP R_kappa, SEXP R_epsilon, SEXP R_weight_idx, 
+		SEXP R_kernel_idx, SEXP R_gamma, SEXP R_coef, SEXP R_degree, 
+		SEXP R_kernel_eigen_cutoff, SEXP R_verbose, SEXP R_max_iter, 
+		SEXP R_random_seed, SEXP R_seed_V, SEXP R_n, SEXP R_m, 
+		SEXP R_K);
+SEXP R_gensvm_predict(SEXP R_Xtest, SEXP R_V, SEXP R_n, SEXP R_m, SEXP R_K);
+
+
+void _set_verbosity(int verbosity_flag);
+void _set_seed_model(struct GenModel *model, double *V, long n, long m, 
+		long K);
+
+// Start R package stuff
+
+R_CallMethodDef callMethods[] = {
+	{"R_gensvm_train", (DL_FUNC) &R_gensvm_train, 19},
+	{"R_gensvm_predict", (DL_FUNC) &R_gensvm_predict, 5},
+	{NULL, NULL, 0}
+};
+R_CMethodDef cMethods[] = {
+	{NULL, NULL, 0}
+};
+
+void R_init_gensvm_wrapper(DllInfo *info) {
+	R_registerRoutines(info, cMethods, callMethods, NULL, NULL);
+	R_useDynamicSymbols(info, TRUE);
+}
+
+// End R package stuff
+
+void _set_verbosity(int verbosity_flag)
+{
+	extern FILE *GENSVM_OUTPUT_FILE;
+	extern FILE *GENSVM_ERROR_FILE;
+
+	if (verbosity_flag) {
+		gensvm_print_out = Rprintf;
+		gensvm_print_err = REprintf;
+	}
+	else {
+		GENSVM_OUTPUT_FILE = NULL;
+		GENSVM_ERROR_FILE = NULL;
+	}
+}
+
+void _set_seed_model(struct GenModel *model, double *V, long n, long m, long K)
+{
+	long i, j;
+	double value;
+
+	model->n = 0;
+	model->m = m;
+	model->K = K;
+
+	gensvm_allocate_model(model);
+
+	for (i=0; i<m+1; i++) {
+		for (j=0; j<K-1; j++) {
+			value = matrix_get(V, m+1, K-1, i, j);
+			matrix_set(model->V, m+1, K-1, i, j, value);
+		}
+	}
+}
+
+
+// NOTE: Let's supply X here as it is represented in R: a matrix in 
+// Column-Major order. Since we have to augment the matrix X with the column 
+// of ones to form Z, we might as well do that *and* convert to RowMajor in a 
+// single step. Otherwise we have the RowMajor version of X as well as Z in 
+// memory, which is unnecessary.
+SEXP R_gensvm_train(
+		SEXP R_X,
+		SEXP R_y,
+		SEXP R_p,
+		SEXP R_lambda,
+		SEXP R_kappa,
+		SEXP R_epsilon,
+		SEXP R_weight_idx,
+		SEXP R_kernel_idx,
+		SEXP R_gamma,
+		SEXP R_coef,
+		SEXP R_degree,
+		SEXP R_kernel_eigen_cutoff,
+		SEXP R_verbose,
+		SEXP R_max_iter,
+		SEXP R_random_seed,
+		SEXP R_seed_V,
+		SEXP R_n,
+		SEXP R_m,
+		SEXP R_K
+		)
+{
+	double *X = REAL(R_X);
+	int *y = INTEGER(R_y); // R doesn't know long?
+	double p = *REAL(R_p);
+	double lambda = *REAL(R_lambda);
+	double kappa = *REAL(R_kappa);
+	double epsilon = *REAL(R_epsilon);
+	int weight_idx = *INTEGER(R_weight_idx);
+	int kernel_idx = *INTEGER(R_kernel_idx);
+	double gamma = *REAL(R_gamma);
+	double coef = *REAL(R_coef);
+	double degree = *REAL(R_degree);
+	double kernel_eigen_cutoff = *REAL(R_kernel_eigen_cutoff);
+	int verbose = *INTEGER(R_verbose);
+	int max_iter = *INTEGER(R_max_iter);
+	int random_seed = *INTEGER(R_random_seed);
+	double *seed_V = isNull(R_seed_V) ? NULL : REAL(R_seed_V);
+	int n = *INTEGER(R_n);
+	int m = *INTEGER(R_m);
+	int K = *INTEGER(R_K);
+
+	_set_verbosity(verbose);
+
+	struct GenModel *model = gensvm_init_model();
+	struct GenModel *seed_model = NULL;
+	struct GenData *data = NULL;
+	long i, j;
+	double value;
+
+	// Set model parameters from function input arguments
+	model->p = p;
+	model->lambda = lambda;
+	model->kappa = kappa;
+	model->epsilon = epsilon;
+	model->weight_idx = weight_idx;
+	model->kerneltype = kernel_idx;
+	model->gamma = gamma;
+	model->coef = coef;
+	model->degree = degree;
+	model->kernel_eigen_cutoff = kernel_eigen_cutoff;
+	model->max_iter = max_iter;
+	model->seed = random_seed;
+
+	if (seed_V != NULL) {
+		seed_model = gensvm_init_model();
+		_set_seed_model(seed_model, seed_V, n, m, K);
+	}
+
+	data = gensvm_init_data();
+
+	data->y = Malloc(long, n);
+	for (i=0; i<n; i++)
+		data->y[i] = (long) y[i];
+
+	data->RAW = Malloc(double, n*(m+1));
+	for (i=0; i<n; i++) {
+		for (j=0; j<m; j++) {
+			value = matrix_get(X, n, m, i, j);
+			matrix_set(data->RAW, n, m+1, i, j+1, value);
+		}
+		// column of 1's
+		matrix_set(data->RAW, n, m+1, i, 0, 1.0);
+	}
+
+	data->n = n;
+	data->m = m;
+	data->r = m;
+	data->K = K;
+	data->Z = data->RAW;
+
+	// convert to sparse matrix if possible
+	if (gensvm_could_sparse(data->Z, n, m+1)) {
+		note("Converting to sparse ... ");
+		data->spZ = gensvm_dense_to_sparse(data->Z, n, m+1);
+		note("done.\n");
+		free(data->RAW);
+		data->RAW = NULL;
+		data->Z = NULL;
+	}
+
+	// actually do the training
+	gensvm_train(model, data, seed_model);
+
+	// create the output list
+	SEXP output = PROTECT(allocVector(VECSXP, 3));
+
+	// create and fill output matrix
+	SEXP R_V = PROTECT(allocMatrix(REALSXP, m+1, K-1));
+	double *rR_V = REAL(R_V);
+	for (i=0; i<m+1; i++) {
+		for (j=0; j<K-1; j++) {
+			value = matrix_get(model->V, m+1, K-1, i, j);
+			matrix_set(rR_V, m+1, K-1, i, j, value);
+		}
+	}
+
+	SEXP R_iter = PROTECT(allocVector(INTSXP, 1));
+	int *r_iter = INTEGER(R_iter);
+	r_iter[0] = model->elapsed_iter;
+
+	SEXP R_sv = PROTECT(allocVector(INTSXP, 1));
+	int *r_sv = INTEGER(R_sv);
+	r_sv[0] = gensvm_num_sv(model);
+
+	// set output list elements
+	SET_VECTOR_ELT(output, 0, R_V);
+	SET_VECTOR_ELT(output, 1, R_iter);
+	SET_VECTOR_ELT(output, 2, R_sv);
+
+	// create names
+	SEXP names = PROTECT(allocVector(STRSXP, 3));
+	SET_STRING_ELT(names, 0, mkChar("V"));
+	SET_STRING_ELT(names, 1, mkChar("n.iter"));
+	SET_STRING_ELT(names, 2, mkChar("n.support"));
+
+	// assign names to list
+	setAttrib(output, R_NamesSymbol, names);
+
+	// cleanup
+	UNPROTECT(5);
+
+	gensvm_free_model(model);
+	gensvm_free_model(seed_model);
+	gensvm_free_data(data);
+
+	return output;
+}
+
+SEXP R_gensvm_predict(
+		SEXP R_Xtest,
+		SEXP R_V,
+		SEXP R_n,
+		SEXP R_m,
+		SEXP R_K
+		)
+{
+	double *X = REAL(R_Xtest);
+	double *V = REAL(R_V);
+	int n_test = *INTEGER(R_n);
+	int m = *INTEGER(R_m);
+	int K = *INTEGER(R_K);
+
+	int i, j;
+	double value;
+
+	struct GenModel *model = gensvm_init_model();
+	model->m = m;
+	model->K = K;
+	model->U = Calloc(double, K*(K-1));
+	model->V = Calloc(double, (m+1) * (K-1));
+	for (i=0; i<m+1; i++) {
+		for (j=0; j<K-1; j++) {
+			value = matrix_get(V, m+1, K-1, i, j);
+			matrix_set(model->V, m+1, K-1, i, j, value);
+		}
+	}
+
+	struct GenData *data = gensvm_init_data();
+	data->n = n_test;
+	data->m = m;
+	data->r = m;
+	data->K = K;
+
+	data->RAW = Calloc(double, n_test*(m+1));
+
+	for (i=0; i<n_test; i++) {
+		for (j=0; j<m; j++) {
+			value = matrix_get(X, n_test, m, i, j);
+			matrix_set(data->RAW, n_test, m+1, i, j+1, value);
+		}
+		matrix_set(data->RAW, n_test, m+1, i, 0, 1.0);
+	}
+	data->Z = data->RAW;
+
+	// convert to sparse matrix if possible
+	if (gensvm_could_sparse(data->Z, n_test, m+1)) {
+		note("Converting to sparse ... ");
+		data->spZ = gensvm_dense_to_sparse(data->Z, n_test, m+1);
+		note("done.\n");
+		free(data->RAW);
+		data->RAW = NULL;
+		data->Z = NULL;
+	}
+
+	long *pred_temp = Calloc(long, n_test);
+
+	gensvm_predict_labels(data, model, pred_temp);
+
+	SEXP R_y = PROTECT(allocMatrix(INTSXP, n_test, 1));
+	int *rR_y = INTEGER(R_y);
+	for (i=0; i<n_test; i++)
+		rR_y[i] = pred_temp[i];
+
+	gensvm_free_data(data);
+	gensvm_free_model(model);
+	free(pred_temp);
+
+	UNPROTECT(1);
+
+	return(R_y);
+}
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2018-02-23 17:10:16 +0000
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2018-02-23 17:10:16 +0000
commit	bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468 (patch)
tree	dcf17ae3e5fef6cee367f0da985bb4894495aee9
parent	update gensvm C library (diff)
download	rgensvm-bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468.tar.gz rgensvm-bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468.zip