diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2018-02-23 17:10:16 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2018-02-23 17:10:16 +0000 |
| commit | bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468 (patch) | |
| tree | dcf17ae3e5fef6cee367f0da985bb4894495aee9 | |
| parent | update gensvm C library (diff) | |
| download | rgensvm-bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468.tar.gz rgensvm-bdeeb59f8f64a9c3a2e083f7e5c33a4c30a2c468.zip | |
Implement fitting and prediction
| -rw-r--r-- | R/gensvm.R | 46 | ||||
| -rw-r--r-- | R/predict.gensvm.R | 25 | ||||
| -rw-r--r-- | R/print.gensvm.R | 28 | ||||
| -rw-r--r-- | R/util.labelencoder.R | 1 | ||||
| -rw-r--r-- | src/gensvm_wrapper.c | 307 |
5 files changed, 380 insertions, 27 deletions
@@ -56,9 +56,9 @@ #' \code{\link{plot}}, and \code{\link{gensvm.grid}}. #' #' @export +#' @useDynLib gensvm_wrapper, .registration = TRUE #' #' @examples -#' X <- #' gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, weights='unit', kernel='linear', gamma='auto', coef=0.0, @@ -67,10 +67,10 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, { call <- match.call() - - # TODO: generate the random.seed value in R if it is NULL. Then you can - # return it and people can still reproduce even if they forgot to set it - # explicitly. + # Generate the random.seed value in R if it is NULL. This way users can + # reproduce the run because it is returned in the output object. + if (is.null(random.seed)) + random.seed <- runif(1) * (2**31 - 1) # TODO: Store a labelencoder in the object, preferably as a partially # hidden item. This can then be used with prediction. @@ -79,9 +79,13 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, n.features <- ncol(X) n.classes <- length(unique(y)) - # Convert labels to integers - y.clean <- label.encode(y) + classes <- sort(unique(y)) + y.clean <- match(y, classes) + + # Convert gamma if it is 'auto' + if (gamma == 'auto') + gamma <- 1.0/n.features # Convert weights to index weight.idx <- which(c("unit", "group") == weights) @@ -90,39 +94,43 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, "Valid options are 'unit' and 'group'") } - # Convert kernel to index - kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel) + # Convert kernel to index (remember off-by-one for R vs. C) + kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel) - 1 if (length(kernel.idx) == 0) { stop("Incorrect kernel specification. ", "Valid options are 'linear', 'poly', 'rbf', and 'sigmoid'") } - out <- .Call("R_gensvm_train", - as.matrix(t(X)), + as.matrix(X), as.integer(y.clean), p, lambda, kappa, epsilon, weight.idx, - kernel.idx, + as.integer(kernel.idx), gamma, coef, degree, kernel.eigen.cutoff, - verbose, - max.iter, - random.seed, - seed.V) - + as.integer(verbose), + as.integer(max.iter), + as.integer(random.seed), + seed.V, + as.integer(n.objects), + as.integer(n.features), + as.integer(n.classes)) - object <- list(call = call, lambda = lambda, kappa = kappa, + object <- list(call = call, p = p, lambda = lambda, kappa = kappa, epsilon = epsilon, weights = weights, kernel = kernel, gamma = gamma, coef = coef, degree = degree, kernel.eigen.cutoff = kernel.eigen.cutoff, random.seed = random.seed, max.iter = max.iter, - V = out$V, n.iter = out$n.iter, n.support = out$n.support) + n.objects = n.objects, n.features = n.features, + n.classes = n.classes, classes = classes, V = out$V, + n.iter = out$n.iter, n.support = out$n.support) class(object) <- "gensvm" + return(object) } diff --git a/R/predict.gensvm.R b/R/predict.gensvm.R index 6cc8851..5c8f2e7 100644 --- a/R/predict.gensvm.R +++ b/R/predict.gensvm.R @@ -3,7 +3,7 @@ #' @description This function predicts the class labels of new data using a #' fitted GenSVM model. #' -#' @param object Fitted \code{gensvm} object +#' @param fit Fitted \code{gensvm} object #' @param newx Matrix of new values for \code{x} for which predictions need to #' be made. #' @param \dots further arguments are ignored @@ -27,10 +27,27 @@ #' #' #' -predict.gensvm <- function(object, newx, ...) +predict.gensvm <- function(fit, newx, ...) { - # TODO: C library fitting prediction here (or not? with the column-major - # order it may be faster to do it directly in R) + ## Implementation note: + ## - It might seem that it would be faster to do the prediction directly in + ## R here, since we then don't need to switch to C, construct model and + ## data structures, copy the data, etc. before doing the prediction. + ## However, if you actually implement it and compare, we find that calling + ## the C implementation is *much* faster than doing it in R. + + # Sanity check + if (ncol(newx) != fit$n.features) + stop("Number of features of fitted model and supplied data disagree.") + + y.pred.c <- .Call("R_gensvm_predict", + as.matrix(newx), + as.matrix(fit$V), + as.integer(nrow(newx)), + as.integer(ncol(newx)), + as.integer(fit$n.classes) + ) + yhat <- fit$classes[y.pred.c] return(yhat) } diff --git a/R/print.gensvm.R b/R/print.gensvm.R index 8d17b0c..06a3649 100644 --- a/R/print.gensvm.R +++ b/R/print.gensvm.R @@ -26,9 +26,31 @@ print.gensvm <- function(object, ...) { cat("\nCall:\n") dput(object$call) + cat("\nData:\n") + cat("\tn.objects:", object$n.objects, "\n") + cat("\tn.features:", object$n.features, "\n") + cat("\tn.classes:", object$n.classes, "\n") + cat("\tclasses:", object$classes, "\n") + cat("Parameters:\n") + cat("\tp:", object$p, "\n") + cat("\tlambda:", object$lambda, "\n") + cat("\tkappa:", object$kappa, "\n") + cat("\tepsilon:", object$epsilon, "\n") + cat("\tweights:", object$weights, "\n") + cat("\tmax.iter:", object$max.iter, "\n") + cat("\trandom.seed:", object$random.seed, "\n") + cat("\tkernel:", object$kernel, "\n") + if (object$kernel %in% c("poly", "rbf", "sigmoid")) { + cat("\tkernel.eigen.cutoff:", object$kernel.eigen.cutoff, "\n") + cat("\tgamma:", object$gamma, "\n") + } + if (object$kernel %in% c("poly", "sigmoid")) + cat("\tcoef:", object$coef, "\n") + if (object$kernel == 'poly') + cat("\tdegree:", object$degree, "\n") + cat("Results:\n") + cat("\tn.iter:", object$n.iter, "\n") + cat("\tn.support:", object$n.support, "\n") - # TODO: fill this out - # - # invisible(object) } diff --git a/R/util.labelencoder.R b/R/util.labelencoder.R deleted file mode 100644 index 8b13789..0000000 --- a/R/util.labelencoder.R +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/gensvm_wrapper.c b/src/gensvm_wrapper.c new file mode 100644 index 0000000..c3f0f7f --- /dev/null +++ b/src/gensvm_wrapper.c @@ -0,0 +1,307 @@ + +#define STRICT_R_HEADERS + +#include <R.h> +#include <Rinternals.h> +#include <R_ext/Print.h> + +#include "gensvm_debug.h" +#include "gensvm_print.h" +#include "gensvm_train.h" +#include "gensvm_predict.h" + +// forward declarations + +SEXP R_gensvm_train( SEXP R_X, SEXP R_y, SEXP R_p, SEXP R_lambda, + SEXP R_kappa, SEXP R_epsilon, SEXP R_weight_idx, + SEXP R_kernel_idx, SEXP R_gamma, SEXP R_coef, SEXP R_degree, + SEXP R_kernel_eigen_cutoff, SEXP R_verbose, SEXP R_max_iter, + SEXP R_random_seed, SEXP R_seed_V, SEXP R_n, SEXP R_m, + SEXP R_K); +SEXP R_gensvm_predict(SEXP R_Xtest, SEXP R_V, SEXP R_n, SEXP R_m, SEXP R_K); + + +void _set_verbosity(int verbosity_flag); +void _set_seed_model(struct GenModel *model, double *V, long n, long m, + long K); + +// Start R package stuff + +R_CallMethodDef callMethods[] = { + {"R_gensvm_train", (DL_FUNC) &R_gensvm_train, 19}, + {"R_gensvm_predict", (DL_FUNC) &R_gensvm_predict, 5}, + {NULL, NULL, 0} +}; +R_CMethodDef cMethods[] = { + {NULL, NULL, 0} +}; + +void R_init_gensvm_wrapper(DllInfo *info) { + R_registerRoutines(info, cMethods, callMethods, NULL, NULL); + R_useDynamicSymbols(info, TRUE); +} + +// End R package stuff + +void _set_verbosity(int verbosity_flag) +{ + extern FILE *GENSVM_OUTPUT_FILE; + extern FILE *GENSVM_ERROR_FILE; + + if (verbosity_flag) { + gensvm_print_out = Rprintf; + gensvm_print_err = REprintf; + } + else { + GENSVM_OUTPUT_FILE = NULL; + GENSVM_ERROR_FILE = NULL; + } +} + +void _set_seed_model(struct GenModel *model, double *V, long n, long m, long K) +{ + long i, j; + double value; + + model->n = 0; + model->m = m; + model->K = K; + + gensvm_allocate_model(model); + + for (i=0; i<m+1; i++) { + for (j=0; j<K-1; j++) { + value = matrix_get(V, m+1, K-1, i, j); + matrix_set(model->V, m+1, K-1, i, j, value); + } + } +} + + +// NOTE: Let's supply X here as it is represented in R: a matrix in +// Column-Major order. Since we have to augment the matrix X with the column +// of ones to form Z, we might as well do that *and* convert to RowMajor in a +// single step. Otherwise we have the RowMajor version of X as well as Z in +// memory, which is unnecessary. +SEXP R_gensvm_train( + SEXP R_X, + SEXP R_y, + SEXP R_p, + SEXP R_lambda, + SEXP R_kappa, + SEXP R_epsilon, + SEXP R_weight_idx, + SEXP R_kernel_idx, + SEXP R_gamma, + SEXP R_coef, + SEXP R_degree, + SEXP R_kernel_eigen_cutoff, + SEXP R_verbose, + SEXP R_max_iter, + SEXP R_random_seed, + SEXP R_seed_V, + SEXP R_n, + SEXP R_m, + SEXP R_K + ) +{ + double *X = REAL(R_X); + int *y = INTEGER(R_y); // R doesn't know long? + double p = *REAL(R_p); + double lambda = *REAL(R_lambda); + double kappa = *REAL(R_kappa); + double epsilon = *REAL(R_epsilon); + int weight_idx = *INTEGER(R_weight_idx); + int kernel_idx = *INTEGER(R_kernel_idx); + double gamma = *REAL(R_gamma); + double coef = *REAL(R_coef); + double degree = *REAL(R_degree); + double kernel_eigen_cutoff = *REAL(R_kernel_eigen_cutoff); + int verbose = *INTEGER(R_verbose); + int max_iter = *INTEGER(R_max_iter); + int random_seed = *INTEGER(R_random_seed); + double *seed_V = isNull(R_seed_V) ? NULL : REAL(R_seed_V); + int n = *INTEGER(R_n); + int m = *INTEGER(R_m); + int K = *INTEGER(R_K); + + _set_verbosity(verbose); + + struct GenModel *model = gensvm_init_model(); + struct GenModel *seed_model = NULL; + struct GenData *data = NULL; + long i, j; + double value; + + // Set model parameters from function input arguments + model->p = p; + model->lambda = lambda; + model->kappa = kappa; + model->epsilon = epsilon; + model->weight_idx = weight_idx; + model->kerneltype = kernel_idx; + model->gamma = gamma; + model->coef = coef; + model->degree = degree; + model->kernel_eigen_cutoff = kernel_eigen_cutoff; + model->max_iter = max_iter; + model->seed = random_seed; + + if (seed_V != NULL) { + seed_model = gensvm_init_model(); + _set_seed_model(seed_model, seed_V, n, m, K); + } + + data = gensvm_init_data(); + + data->y = Malloc(long, n); + for (i=0; i<n; i++) + data->y[i] = (long) y[i]; + + data->RAW = Malloc(double, n*(m+1)); + for (i=0; i<n; i++) { + for (j=0; j<m; j++) { + value = matrix_get(X, n, m, i, j); + matrix_set(data->RAW, n, m+1, i, j+1, value); + } + // column of 1's + matrix_set(data->RAW, n, m+1, i, 0, 1.0); + } + + data->n = n; + data->m = m; + data->r = m; + data->K = K; + data->Z = data->RAW; + + // convert to sparse matrix if possible + if (gensvm_could_sparse(data->Z, n, m+1)) { + note("Converting to sparse ... "); + data->spZ = gensvm_dense_to_sparse(data->Z, n, m+1); + note("done.\n"); + free(data->RAW); + data->RAW = NULL; + data->Z = NULL; + } + + // actually do the training + gensvm_train(model, data, seed_model); + + // create the output list + SEXP output = PROTECT(allocVector(VECSXP, 3)); + + // create and fill output matrix + SEXP R_V = PROTECT(allocMatrix(REALSXP, m+1, K-1)); + double *rR_V = REAL(R_V); + for (i=0; i<m+1; i++) { + for (j=0; j<K-1; j++) { + value = matrix_get(model->V, m+1, K-1, i, j); + matrix_set(rR_V, m+1, K-1, i, j, value); + } + } + + SEXP R_iter = PROTECT(allocVector(INTSXP, 1)); + int *r_iter = INTEGER(R_iter); + r_iter[0] = model->elapsed_iter; + + SEXP R_sv = PROTECT(allocVector(INTSXP, 1)); + int *r_sv = INTEGER(R_sv); + r_sv[0] = gensvm_num_sv(model); + + // set output list elements + SET_VECTOR_ELT(output, 0, R_V); + SET_VECTOR_ELT(output, 1, R_iter); + SET_VECTOR_ELT(output, 2, R_sv); + + // create names + SEXP names = PROTECT(allocVector(STRSXP, 3)); + SET_STRING_ELT(names, 0, mkChar("V")); + SET_STRING_ELT(names, 1, mkChar("n.iter")); + SET_STRING_ELT(names, 2, mkChar("n.support")); + + // assign names to list + setAttrib(output, R_NamesSymbol, names); + + // cleanup + UNPROTECT(5); + + gensvm_free_model(model); + gensvm_free_model(seed_model); + gensvm_free_data(data); + + return output; +} + +SEXP R_gensvm_predict( + SEXP R_Xtest, + SEXP R_V, + SEXP R_n, + SEXP R_m, + SEXP R_K + ) +{ + double *X = REAL(R_Xtest); + double *V = REAL(R_V); + int n_test = *INTEGER(R_n); + int m = *INTEGER(R_m); + int K = *INTEGER(R_K); + + int i, j; + double value; + + struct GenModel *model = gensvm_init_model(); + model->m = m; + model->K = K; + model->U = Calloc(double, K*(K-1)); + model->V = Calloc(double, (m+1) * (K-1)); + for (i=0; i<m+1; i++) { + for (j=0; j<K-1; j++) { + value = matrix_get(V, m+1, K-1, i, j); + matrix_set(model->V, m+1, K-1, i, j, value); + } + } + + struct GenData *data = gensvm_init_data(); + data->n = n_test; + data->m = m; + data->r = m; + data->K = K; + + data->RAW = Calloc(double, n_test*(m+1)); + + for (i=0; i<n_test; i++) { + for (j=0; j<m; j++) { + value = matrix_get(X, n_test, m, i, j); + matrix_set(data->RAW, n_test, m+1, i, j+1, value); + } + matrix_set(data->RAW, n_test, m+1, i, 0, 1.0); + } + data->Z = data->RAW; + + // convert to sparse matrix if possible + if (gensvm_could_sparse(data->Z, n_test, m+1)) { + note("Converting to sparse ... "); + data->spZ = gensvm_dense_to_sparse(data->Z, n_test, m+1); + note("done.\n"); + free(data->RAW); + data->RAW = NULL; + data->Z = NULL; + } + + long *pred_temp = Calloc(long, n_test); + + gensvm_predict_labels(data, model, pred_temp); + + SEXP R_y = PROTECT(allocMatrix(INTSXP, n_test, 1)); + int *rR_y = INTEGER(R_y); + for (i=0; i<n_test; i++) + rR_y[i] = pred_temp[i]; + + gensvm_free_data(data); + gensvm_free_model(model); + free(pred_temp); + + UNPROTECT(1); + + return(R_y); +} |
