diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2018-03-27 12:31:28 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2018-03-27 12:31:28 +0100 |
| commit | 004941896bac692d354c41a3334d20ee1d4627f7 (patch) | |
| tree | 2b11e42d8524843409e2bf8deb4ceb74c8b69347 | |
| parent | updates to GenSVM C library (diff) | |
| download | rgensvm-004941896bac692d354c41a3334d20ee1d4627f7.tar.gz rgensvm-004941896bac692d354c41a3334d20ee1d4627f7.zip | |
GenSVM R package
41 files changed, 3168 insertions, 236 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..919dd47 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +.Rbuildignore +gensvm.Rproj +src/*.o +src/*.so +src/*.dll diff --git a/DESCRIPTION b/DESCRIPTION index b7e2937..3ef01aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,3 +12,4 @@ Depends: URL: https://github.com/GjjvdBurg/RGenSVM BugReports: https://github.com/GjjvdBurg/RGenSVM RoxygenNote: 5.0.1 +NeedsCompilation: yes diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..a70ecfd --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,20 @@ +# Generated by roxygen2: do not edit by hand + +S3method(coef,gensvm) +S3method(coef,gensvm.grid) +S3method(plot,gensvm) +S3method(plot,gensvm.grid) +S3method(predict,gensvm) +S3method(predict,gensvm.grid) +S3method(print,gensvm) +S3method(print,gensvm.grid) +export(gensvm) +export(gensvm.accuracy) +export(gensvm.grid) +export(gensvm.load.full.grid) +export(gensvm.load.small.grid) +export(gensvm.load.tiny.grid) +export(gensvm.maxabs.scale) +export(gensvm.refit) +export(gensvm.train.test.split) +useDynLib(gensvm_wrapper, .registration = TRUE) diff --git a/R/coef.gensvm.R b/R/coef.gensvm.R index 19ab0aa..45eeb13 100644 --- a/R/coef.gensvm.R +++ b/R/coef.gensvm.R @@ -13,7 +13,7 @@ #' (n_{classes} - 1)} matrix formed by the remaining rows. #' #' @author -#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr #' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> #' #' @references @@ -25,7 +25,11 @@ #' @export #' #' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] #' +#' fit <- gensvm(x, y) +#' V <- coef(fit) #' coef.gensvm <- function(object, ...) 
{ diff --git a/R/coef.gensvm.grid.R b/R/coef.gensvm.grid.R new file mode 100644 index 0000000..15e6525 --- /dev/null +++ b/R/coef.gensvm.grid.R @@ -0,0 +1,32 @@ +#' @title Get the parameter grid from a GenSVM Grid object +#' +#' @description Returns the parameter grid of a \code{gensvm.grid} object. +#' +#' @param object a \code{gensvm.grid} object +#' @param \dots further arguments are ignored +#' +#' @return The parameter grid of the GenSVMGrid object as a data frame. +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @method coef gensvm.grid +#' @export +#' +#' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' grid <- gensvm.grid(x, y) +#' pg <- coef(grid) +#' +coef.gensvm.grid <- function(object, ...) +{ + return(object$param.grid) +} diff --git a/R/gensvm-kernels.R b/R/gensvm-kernels.R deleted file mode 100644 index 8e445c0..0000000 --- a/R/gensvm-kernels.R +++ /dev/null @@ -1,10 +0,0 @@ -#' Kernels in GenSVM -#' -#' GenSVM can be used for both linear multiclass support vector machine -#' classification and for nonlinear classification with kernels. In general, -#' linear classification will be faster but depending on the dataset higher -#' classification performance can be achieved using a nonlinear kernel. -#' -#' The following nonlinear kernels are implemented in the GenSVM package: -#' \describe{ -#' \item{RBF}{The Radial Basis Function kernel is a commonly used kernel. 
diff --git a/R/gensvm-package.R b/R/gensvm-package.R index 13c2c31..f664577 100644 --- a/R/gensvm-package.R +++ b/R/gensvm-package.R @@ -1,15 +1,15 @@ #' GenSVM: A Generalized Multiclass Support Vector Machine #' #' The GenSVM classifier is a generalized multiclass support vector machine -#' (SVM). This classifier simultaneously aims to find decision boundaries that -#' separate the classes with as wide a margin as possible. In GenSVM, the loss -#' functions that measures how misclassifications are counted is very flexible. -#' This allows the user to tune the classifier to the dataset at hand and +#' (SVM). This classifier aims to find decision boundaries that separate the +#' classes with as wide a margin as possible. In GenSVM, the loss functions +#' that measures how misclassifications are counted is very flexible. This +#' allows the user to tune the classifier to the dataset at hand and #' potentially obtain higher classification accuracy. Moreover, this #' flexibility means that GenSVM has a number of alternative multiclass SVMs as #' special cases. One of the other advantages of GenSVM is that it is trained -#' in the primal, allowing the use of warm starts during optimization. This -#' means that for common tasks such as cross validation or repeated model +#' in the primal space, allowing the use of warm starts during optimization. +#' This means that for common tasks such as cross validation or repeated model #' fitting, GenSVM can be trained very quickly. #' #' This package provides functions for training the GenSVM model either as a @@ -26,19 +26,71 @@ #' GenSVM.} #' } #' -#' Other available functions are: +#' For the GenSVM and GenSVMGrid models the following two functions are +#' available. When applied to a GenSVMGrid object, the function is applied to +#' the best GenSVM model. 
#' \describe{ #' \item{\code{\link{plot}}}{Plot the low-dimensional \emph{simplex} space -#' where the decision boundaries are fixed.} +#' where the decision boundaries are fixed (for problems with 3 classes).} #' \item{\code{\link{predict}}}{Predict the class labels of new data using the #' GenSVM model.} -#' \item{\code{\link{coef}}}{Get the coefficients of the GenSVM model} -#' \item{\code{\link{print}}}{Print a short description of the fitted GenSVM -#' model} #' } #' +#' Moreover, for the GenSVM and GenSVMGrid models a \code{coef} function is +#' defined: +#' \describe{ +#' \item{\code{\link{coef.gensvm}}}{Get the coefficients of the fitted GenSVM +#' model.} +#' \item{\code{\link{coef.gensvm.grid}}}{Get the parameter grid of the GenSVM +#' grid search.} +#' } +#' +#' The following utility functions are also included: +#' \describe{ +#' \item{\code{\link{gensvm.accuracy}}}{Compute the accuracy score between true +#' and predicted class labels} +#' \item{\code{\link{gensvm.maxabs.scale}}}{Scale each column of the dataset by +#' its maximum absolute value, preserving sparsity and mapping the data to [-1, +#' 1]} +#' \item{\code{\link{gensvm.train.test.split}}}{Split a dataset into a training +#' and testing sample} +#' \item{\code{\link{gensvm.refit}}}{Refit a fitted GenSVM model with slightly +#' different parameters or on a different dataset} +#' } +#' +#' @section Kernels in GenSVM: +#' +#' GenSVM can be used for both linear and nonlinear multiclass support vector +#' machine classification. In general, linear classification will be faster but +#' depending on the dataset higher classification performance can be achieved +#' using a nonlinear kernel. +#' +#' The following nonlinear kernels are implemented in the GenSVM package: +#' \describe{ +#' \item{RBF}{The Radial Basis Function kernel is a well-known kernel function +#' based on the Euclidean distance between objects. 
It is defined as +#' \deqn{ +#' k(x_i, x_j) = exp( -\gamma || x_i - x_j ||^2 ) +#' } +#' } +#' \item{Polynomial}{A polynomial kernel can also be used in GenSVM. This +#' kernel function is implemented very generally and therefore takes three +#' parameters (\code{coef}, \code{gamma}, and \code{degree}). It is defined +#' as: +#' \deqn{ +#' k(x_i, x_j) = ( \gamma x_i' x_j + coef)^{degree} +#' } +#' } +#' \item{Sigmoid}{The sigmoid kernel is the final kernel implemented in +#' GenSVM. This kernel has two parameters and is implemented as follows: +#' \deqn{ +#' k(x_i, x_j) = \tanh( \gamma x_i' x_j + coef) +#' } +#' } +#' } +#' #' @author -#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr #' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> #' #' @references @@ -46,11 +98,10 @@ #' Multiclass Support Vector Machine}, Journal of Machine Learning Research, #' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. #' -#' @examples -#' +#' @aliases +#' gensvm.package #' #' @name gensvm-package #' @docType package -#' @import NULL #>NULL @@ -1,7 +1,8 @@ #' @title Fit the GenSVM model #' #' @description Fits the Generalized Multiclass Support Vector Machine model -#' with the given parameters. +#' with the given parameters. See the package documentation +#' (\code{\link{gensvm-package}}) for more general information about GenSVM. #' #' @param X data matrix with the predictors #' @param y class labels @@ -12,7 +13,8 @@ #' @param weights type of instance weights to use. Options are 'unit' for unit #' weights and 'group' for group size correction weight (eq. 4 in the paper). #' @param kernel the kernel type to use in the classifier. It must be one of -#' 'linear', 'poly', 'rbf', or 'sigmoid'. +#' 'linear', 'poly', 'rbf', or 'sigmoid'. See the section "Kernels in GenSVM" +#' in \code{\link{gensvm-package}} for more info. #' @param gamma kernel parameter for the rbf, polynomial, and sigmoid kernel. 
#' If gamma is 'auto', then 1/n_features will be used. #' @param coef parameter for the polynomial and sigmoid kernel. @@ -25,25 +27,45 @@ #' @param random.seed Seed for the random number generator (useful for #' reproducible output) #' @param max.iter Maximum number of iterations of the optimization algorithm. +#' @param seed.V Matrix to warm-start the optimization algorithm. This is +#' typically the output of \code{coef(fit)}. Note that this function will +#' silently drop seed.V if the dimensions don't match the provided data. #' #' @return A "gensvm" S3 object is returned for which the print, predict, coef, #' and plot methods are available. It has the following items: #' \item{call}{The call that was used to construct the model.} +#' \item{p}{The value of the lp norm in the loss function} #' \item{lambda}{The regularization parameter used in the model.} #' \item{kappa}{The hinge function parameter used.} #' \item{epsilon}{The stopping criterion used.} #' \item{weights}{The instance weights type used.} #' \item{kernel}{The kernel function used.} -#' \item{gamma}{The value of the gamma parameter of the kernel, if applicable}. 
+#' \item{gamma}{The value of the gamma parameter of the kernel, if applicable} #' \item{coef}{The value of the coef parameter of the kernel, if applicable} #' \item{degree}{The degree of the kernel, if applicable} #' \item{kernel.eigen.cutoff}{The cutoff value of the reduced -#' eigendecomposition of the kernel matrix} +#' eigendecomposition of the kernel matrix.} +#' \item{verbose}{Whether or not the model was fitted with progress output} #' \item{random.seed}{The random seed used to seed the model.} #' \item{max.iter}{Maximum number of iterations of the algorithm.} +#' \item{n.objects}{Number of objects in the dataset} +#' \item{n.features}{Number of features in the dataset} +#' \item{n.classes}{Number of classes in the dataset} +#' \item{classes}{Array with the actual class labels} +#' \item{V}{Coefficient matrix} +#' \item{n.iter}{Number of iterations performed in training} +#' \item{n.support}{Number of support vectors in the final model} +#' \item{training.time}{Total training time} +#' \item{X.train}{When training with nonlinear kernels, the training data is +#' needed to perform prediction. For these kernels it is therefore stored in +#' the fitted model.} +#' +#' @note +#' This function returns partial results when the computation is interrupted by +#' the user. #' #' @author -#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr #' Maintainer: Gerrit J.J. 
van den Burg <gertjanvandenburg@gmail.com> #' #' @references @@ -59,10 +81,37 @@ #' @useDynLib gensvm_wrapper, .registration = TRUE #' #' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' # fit using the default parameters +#' fit <- gensvm(x, y) +#' +#' # fit and show progress +#' fit <- gensvm(x, y, verbose=T) +#' +#' # fit with some changed parameters +#' fit <- gensvm(x, y, lambda=1e-8) +#' +#' # Early stopping defined through epsilon +#' fit <- gensvm(x, y, epsilon=1e-3) +#' +#' # Early stopping defined through max.iter +#' fit <- gensvm(x, y, max.iter=1000) #' -gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, - weights='unit', kernel='linear', gamma='auto', coef=0.0, - degree=2.0, kernel.eigen.cutoff=1e-8, verbose=0, +#' # Nonlinear training +#' fit <- gensvm(x, y, kernel='rbf') +#' fit <- gensvm(x, y, kernel='poly', degree=2, gamma=1.0) +#' +#' # Setting the random seed and comparing results +#' fit <- gensvm(x, y, random.seed=123) +#' fit2 <- gensvm(x, y, random.seed=123) +#' all.equal(coef(fit), coef(fit2)) +#' +#' +gensvm <- function(X, y, p=1.0, lambda=1e-8, kappa=0.0, epsilon=1e-6, + weights='unit', kernel='linear', gamma='auto', coef=1.0, + degree=2.0, kernel.eigen.cutoff=1e-8, verbose=FALSE, random.seed=NULL, max.iter=1e8, seed.V=NULL) { call <- match.call() @@ -72,9 +121,6 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, if (is.null(random.seed)) random.seed <- runif(1) * (2**31 - 1) - # TODO: Store a labelencoder in the object, preferably as a partially - # hidden item. This can then be used with prediction. - n.objects <- nrow(X) n.features <- ncol(X) n.classes <- length(unique(y)) @@ -90,17 +136,23 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, # Convert weights to index weight.idx <- which(c("unit", "group") == weights) if (length(weight.idx) == 0) { - stop("Incorrect weight specification. ", + cat("Error: Incorrect weight specification. 
", "Valid options are 'unit' and 'group'") + return } # Convert kernel to index (remember off-by-one for R vs. C) kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel) - 1 if (length(kernel.idx) == 0) { - stop("Incorrect kernel specification. ", + cat("Error: Incorrect kernel specification. ", "Valid options are 'linear', 'poly', 'rbf', and 'sigmoid'") + return } + seed.rows <- if(is.null(seed.V)) -1 else nrow(seed.V) + seed.cols <- if(is.null(seed.V)) -1 else ncol(seed.V) + + # Call the C train routine out <- .Call("R_gensvm_train", as.matrix(X), as.integer(y.clean), @@ -118,18 +170,24 @@ gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, as.integer(max.iter), as.integer(random.seed), seed.V, + as.integer(seed.rows), + as.integer(seed.cols), as.integer(n.objects), as.integer(n.features), as.integer(n.classes)) + # build the output object object <- list(call = call, p = p, lambda = lambda, kappa = kappa, epsilon = epsilon, weights = weights, kernel = kernel, gamma = gamma, coef = coef, degree = degree, kernel.eigen.cutoff = kernel.eigen.cutoff, - random.seed = random.seed, max.iter = max.iter, - n.objects = n.objects, n.features = n.features, - n.classes = n.classes, classes = classes, V = out$V, - n.iter = out$n.iter, n.support = out$n.support) + verbose = verbose, random.seed = random.seed, + max.iter = max.iter, n.objects = n.objects, + n.features = n.features, n.classes = n.classes, + classes = classes, V = out$V, n.iter = out$n.iter, + n.support = out$n.support, + training.time = out$training.time, + X.train = if(kernel == 'linear') NULL else X) class(object) <- "gensvm" return(object) diff --git a/R/gensvm.accuracy.R b/R/gensvm.accuracy.R new file mode 100644 index 0000000..dbcd3cc --- /dev/null +++ b/R/gensvm.accuracy.R @@ -0,0 +1,37 @@ +#' @title Compute the accuracy score +#' +#' @param y.true vector of true labels +#' @param y.pred vector of predicted labels +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. 
Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @seealso +#' \code{\link{predict.gensvm.grid}} +#' +#' @export +#' +#' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' fit <- gensvm(x, y) +#' gensvm.accuracy(predict(fit, x), y) +#' +gensvm.accuracy <- function(y.true, y.pred) +{ + n <- length(y.true) + if (n != length(y.pred)) { + cat("Error: Can't compute accuracy if vector don't have the ", + "same length\n") + return + } + + return (sum(y.true == y.pred) / n) +} diff --git a/R/gensvm.grid.R b/R/gensvm.grid.R index 37e2f7f..5d27fde 100644 --- a/R/gensvm.grid.R +++ b/R/gensvm.grid.R @@ -3,44 +3,17 @@ #' @description This function performs a cross-validated grid search of the #' model parameters to find the best hyperparameter configuration for a given #' dataset. This function takes advantage of GenSVM's ability to use warm -#' starts to speed up computation. The function also uses the GenSVM C library -#' for speed. -#' -#' There are two ways to use this function: either by providing a data frame -#' with the parameter configurations to try or by giving each of the function -#' inputs a vector of values to evaluate. In the latter case all combinations -#' of the provided values will be used (i.e. the product set). +#' starts to speed up computation. The function uses the GenSVM C library for +#' speed. #' #' @param X training data matrix. We denote the size of this matrix by #' n_samples x n_features. #' @param y training vector of class labes of length n_samples. The number of #' unique labels in this vector is denoted by n_classes. -#' @param df Data frame with parameter configurations to evaluate. 
-#' If this is provided it overrides the other parameter ranges provided. The -#' data frame must provide *all* required columns, as described below. -#' @param p vector of values to try for the \eqn{p} hyperparameter -#' for the \eqn{\ell_p} norm in the loss function. All values should be on the -#' interval [1.0, 2.0]. -#' @param lambda vector of values for the regularization parameter -#' \eqn{\lambda} in the loss function. All values should be larger than 0. -#' @param kappa vector of values for the hinge function parameter in -#' the loss function. All values should be larger than -1. -#' @param weights vector of values for the instance weights. Values -#' should be either 'unit', 'group', or both. -#' @param kernel vector of values for the kernel type. Possible -#' values are: 'linear', 'rbf', 'poly', or 'sigmoid', or any combination of -#' these values. See the article \link[=gensvm-kernels]{Kernels in GenSVM} for -#' more information. -#' @param gamma kernel parameter for the 'rbf', 'poly', and 'sigmoid' kernels. -#' If it is 'auto', 1/n_features will be used. See the article -#' \link[=gensvm-kernels]{Kernels in GenSVM} for more information. -#' @param coef kernel parameter for the 'poly' and 'sigmoid' -#' kernels. See the article \link[=gensvm-kernels]{Kernels in GenSVM} for more -#' information. -#' @param degree kernel parameter for the 'poly' kernel. See the -#' article \link[=gensvm-kernels]{Kernels in GenSVM} for more information. -#' @param max.iter maximum number of iterations to run in the -#' optimization algorithm. +#' @param param.grid String (\code{'tiny'}, \code{'small'}, or \code{'full'}) +#' or data frame with parameter configurations to evaluate. Typically this is +#' the output of \code{expand.grid}. For more details, see "Using a Parameter +#' Grid" below. #' @param refit boolean variable. If true, the best model from cross validation #' is fitted again on the entire dataset. 
#' @param scoring metric to use to evaluate the classifier performance during @@ -49,29 +22,94 @@ #' values are better. If it is NULL, the accuracy score will be used. #' @param cv the number of cross-validation folds to use or a vector with the #' same length as \code{y} where each unique value denotes a test split. -#' @param verbose boolean variable to indicate whether training details should -#' be printed. +#' @param verbose integer to indicate the level of verbosity (higher is more +#' verbose) +#' @param return.train.score whether or not to return the scores on the +#' training splits #' #' @return A "gensvm.grid" S3 object with the following items: +#' \item{call}{Call that produced this object} +#' \item{param.grid}{Sorted version of the parameter grid used in training} #' \item{cv.results}{A data frame with the cross validation results} #' \item{best.estimator}{If refit=TRUE, this is the GenSVM model fitted with #' the best hyperparameter configuration, otherwise it is NULL} -#' \item{best.score}{Mean cross-validated score for the model with the best -#' hyperparameter configuration} +#' \item{best.score}{Mean cross-validated test score for the model with the +#' best hyperparameter configuration} #' \item{best.params}{Parameter configuration that provided the highest mean -#' cross-validated score} +#' cross-validated test score} #' \item{best.index}{Row index of the cv.results data frame that corresponds to #' the best hyperparameter configuration} #' \item{n.splits}{The number of cross-validation splits} +#' \item{n.objects}{The number of instances in the data} +#' \item{n.features}{The number of features of the data} +#' \item{n.classes}{The number of classes in the data} +#' \item{classes}{Array with the unique classes in the data} +#' \item{total.time}{Training time for the grid search} +#' \item{cv.idx}{Array with cross validation indices used to split the data} +#' +#' @section Using a Parameter Grid: +#' To evaluate certain paramater 
configurations, a data frame can be supplied +#' to the \code{param.grid} argument of the function. Such a data frame can +#' easily be generated using the R function \code{expand.grid}, or could be +#' created through other ways to test specific parameter configurations. +#' +#' Three parameter grids are predefined: +#' \describe{ +#' \item{\code{'tiny'}}{This parameter grid is generated by the function +#' \code{\link{gensvm.load.tiny.grid}} and is the default parameter grid. It +#' consists of parameter configurations that are likely to perform well on +#' various datasets.} +#' \item{\code{'small'}}{This grid is generated by +#' \code{\link{gensvm.load.small.grid}} and generates a data frame with 90 +#' configurations. It is typically fast to train but contains some +#' configurations that are unlikely to perform well. It is included for +#' educational purposes.} +#' \item{\code{'full'}}{This grid loads the parameter grid as used in the +#' GenSVM paper. It consists of 342 configurations and is generated by the +#' \code{\link{gensvm.load.full.grid}} function. Note that in the GenSVM paper +#' cross validation was done with this parameter grid, but the final training +#' step used \code{epsilon=1e-8}. The \code{\link{gensvm.refit}} function is +#' useful in this scenario.} +#' } #' +#' When you provide your own parameter grid, beware that only certain column +#' names are allowed in the data frame corresponding to parameters for the +#' GenSVM model. These names are: #' +#' \describe{ +#' \item{p}{Parameter for the lp norm. Must be in [1.0, 2.0].} +#' \item{kappa}{Parameter for the Huber hinge function. Must be larger than +#' -1.} +#' \item{lambda}{Parameter for the regularization term. Must be larger than 0.} +#' \item{weight}{Instance weight specification. Allowed values are "unit" for +#' unit weights and "group" for group-size correction weights} +#' \item{epsilon}{Stopping parameter for the algorithm. 
Must be larger than 0.} +#' \item{max.iter}{Maximum number of iterations of the algorithm. Must be +#' larger than 0.} +#' \item{kernel}{The kernel to used, allowed values are "linear", "poly", +#' "rbf", and "sigmoid". The default is "linear"} +#' \item{coef}{Parameter for the "poly" and "sigmoid" kernels. See the section +#' "Kernels in GenSVM" in the code{ink{gensvm-package}} page for more info.} +#' \item{degree}{Parameter for the "poly" kernel. See the section "Kernels in +#' GenSVM" in the code{ink{gensvm-package}} page for more info.} +#' \item{gamma}{Parameter for the "poly", "rbf", and "sigmoid" kernels. See the +#' section "Kernels in GenSVM" in the code{ink{gensvm-package}} page for more +#' info.} +#' } #' -#' @section Using a DataFrame: -#' ... +#' For variables that are not present in the \code{param.grid} data frame the +#' default parameter values in the \code{\link{gensvm}} function will be used. #' +#' Note that this function reorders the parameter grid to make the warm starts +#' as efficient as possible, which is why the param.grid in the result will not +#' be the same as the param.grid in the input. +#' +#' @note +#' This function returns partial results when the computation is interrupted by +#' the user. #' #' @author -#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr #' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> #' #' @references @@ -80,37 +118,499 @@ #' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. #' #' @seealso -#' \code{\link{coef}}, \code{\link{print}}, \code{\link{predict}}, -#' \code{\link{plot}}, and \code{\link{gensvm.grid}}. -#' +#' \code{\link{predict.gensvm.grid}}, \code{\link{print.gensvm.grid}}, and +#' \code{\link{gensvm}}. 
#' #' @export #' #' @examples -#' X <- -#' - -gensvm.grid <- function(X, y, - df=NULL, - p=c(1.0, 1.5, 2.0), - lambda=c(1e-8, 1e-6, 1e-4, 1e-2, 1), - kappa=c(-0.9, 0.5, 5.0), - weights=c('unit', 'group'), - kernel=c('linear'), - gamma=c('auto'), - coef=c(0.0), - degree=c(2.0), - max.iter=c(1e8), - refit=TRUE, - scoring=NULL, - cv=3, - verbose=TRUE) +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' # use the default parameter grid +#' grid <- gensvm.grid(x, y) +#' +#' # use a smaller parameter grid +#' pg <- expand.grid(p=c(1.0, 1.5, 2.0), kappa=c(-0.9, 1.0), epsilon=c(1e-3)) +#' grid <- gensvm.grid(x, y, param.grid=pg) +#' +#' # print the result +#' print(grid) +#' +#' # Using a custom scoring function (accuracy as percentage) +#' acc.pct <- function(yt, yp) { return (100 * sum(yt == yp) / length(yt)) } +#' grid <- gensvm.grid(x, y, scoring=acc.pct) +#' +gensvm.grid <- function(X, y, param.grid='tiny', refit=TRUE, scoring=NULL, cv=3, + verbose=0, return.train.score=TRUE) { call <- match.call() + n.objects <- nrow(X) + n.features <- ncol(X) + n.classes <- length(unique(y)) + + if (is.character(param.grid)) { + if (param.grid == 'tiny') { + param.grid <- gensvm.load.tiny.grid() + } else if (param.grid == 'small') { + param.grid <- gensvm.load.small.grid() + } else if (param.grid == 'full') { + param.grid <- gensvm.load.full.grid() + } + } + + # Validate the range of the values for the gridsearch + gensvm.validate.param.grid(param.grid) + + # Sort the parameter grid for efficient warm starts + param.grid <- gensvm.sort.param.grid(param.grid) + + # Expand and convert the parameter grid for use in the C function + C.param.grid <- gensvm.expand.param.grid(param.grid, n.features) + + # Convert labels to integers + classes <- sort(unique(y)) + y.clean <- match(y, classes) + + if (is.vector(cv) && length(cv) == n.objects) { + folds <- sort(unique(cv)) + cv.idx <- match(cv, folds) - 1 + n.splits <- length(folds) + } else { + cv.idx <- gensvm.generate.cv.idx(n.objects, cv[1]) + 
n.splits <- cv + } + + results <- .Call("R_gensvm_grid", + as.matrix(X), + as.integer(y.clean), + as.matrix(C.param.grid), + as.integer(nrow(C.param.grid)), + as.integer(ncol(C.param.grid)), + as.integer(cv.idx), + as.integer(n.splits), + as.integer(verbose), + as.integer(n.objects), + as.integer(n.features), + as.integer(n.classes) + ) + + cv.results <- gensvm.cv.results(results, param.grid, cv.idx, + y.clean, scoring, + return.train.score=return.train.score) + best.index <- which.min(cv.results$rank.test.score)[1] + if (!is.na(best.index)) { # can occur when user interrupts + best.score <- cv.results$mean.test.score[best.index] + best.params <- param.grid[best.index, , drop=F] + # Remove duplicate attributes from best.params + attr(best.params, "out.attrs") <- NULL + } else { + best.score <- NA + best.params <- list() + } - object <- list(...) + if (refit && !is.na(best.index)) { + gensvm.args <- as.list(best.params) + gensvm.args$X <- X + gensvm.args$y <- y + best.estimator <- do.call(gensvm, gensvm.args) + } else { + best.estimator <- NULL + } + + object <- list(call = call, param.grid = param.grid, + cv.results = cv.results, best.estimator = best.estimator, + best.score = best.score, best.params = best.params, + best.index = best.index, n.splits = n.splits, + n.objects = n.objects, n.features = n.features, + n.classes = n.classes, classes = classes, + total.time = results$total.time, cv.idx = cv.idx) class(object) <- "gensvm.grid" return(object) } + +#' @title Load a tiny parameter grid for the GenSVM grid search +#' +#' @description This function returns a parameter grid to use in the GenSVM +#' grid search. This grid was obtained by analyzing the experiments done for +#' the GenSVM paper and selecting the configurations that achieve accuracy +#' within the 95th percentile on over 90% of the datasets. It is a good start +#' for a parameter search with a reasonably high chance of achieving good +#' performance on most datasets. 
+#' +#' Note that this grid is only tested to work well in combination with the +#' linear kernel. +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @export +#' +#' @seealso +#' \code{\link{gensvm.grid}}, \code{\link{gensvm.load.small.grid}}, +#' \code{\link{gensvm.load.full.grid}}. +#' +gensvm.load.tiny.grid <- function() +{ + df <- data.frame( + p=c(2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.5, 2.0, 2.0), + kappa=c(5.0, 5.0, 0.5, 5.0, -0.9, 5.0, 0.5, -0.9, 0.5, 0.5), + lambda = c(2^-16, 2^-18, 2^-18, 2^-18, 2^-18, 2^-14, 2^-18, + 2^-18, 2^-16, 2^-16), + weight = c('unit', 'unit', 'unit', 'group', 'unit', + 'unit', 'group', 'unit', 'unit', 'group') + ) + return(df) +} + +#' @title Load a large parameter grid for the GenSVM grid search +#' +#' @description This loads the parameter grid from the GenSVM paper. It +#' consists of 342 configurations and is constructed from all possible +#' combinations of the following parameter sets: +#' +#' \code{p = c(1.0, 1.5, 2.0)} +#' +#' \code{lambda = 2^seq(-18, 18, 2)} +#' +#' \code{kappa = c(-0.9, 0.5, 5.0)} +#' +#' \code{weight = c('unit', 'group')} +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @export +#' +#' @seealso +#' \code{\link{gensvm.grid}}, \code{\link{gensvm.load.tiny.grid}}, +#' \code{\link{gensvm.load.full.grid}}. 
+#' +gensvm.load.full.grid <- function() +{ + df <- expand.grid(p=c(1.0, 1.5, 2.0), lambda=2^seq(-18, 18, 2), + kappa=c(-0.9, 0.5, 5.0), weight=c('unit', 'group'), + epsilon=c(1e-6)) + return(df) +} + + +#' @title Load the default parameter grid for the GenSVM grid search +#' +#' @description This function loads a default parameter grid to use for the +#' GenSVM gridsearch. It contains all possible combinations of the following +#' parameter sets: +#' +#' \code{p = c(1.0, 1.5, 2.0)} +#' +#' \code{lambda = c(1e-8, 1e-6, 1e-4, 1e-2, 1)} +#' +#' \code{kappa = c(-0.9, 0.5, 5.0)} +#' +#' \code{weight = c('unit', 'group')} +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @export +#' +#' @seealso +#' \code{\link{gensvm.grid}}, \code{\link{gensvm.load.tiny.grid}}, +#' \code{\link{gensvm.load.small.grid}}. +#' +gensvm.load.small.grid <- function() +{ + df <- expand.grid(p=c(1.0, 1.5, 2.0), lambda=c(1e-8, 1e-6, 1e-4, 1e-2, 1), + kappa=c(-0.9, 0.5, 5.0), weight=c('unit', 'group')) + return(df) +} + + +#' Generate a vector of cross-validation indices +#' +#' This function generates a vector of length \code{n} with values from 0 to +#' \code{folds-1} to mark train and test splits. 
+#' +gensvm.generate.cv.idx <- function(n, folds) +{ + cv.idx <- matrix(0, n, 1) + + big.folds <- n %% folds + small.fold.size <- n %/% folds + + j <- 0 + for (i in 0:(small.fold.size * folds)) { + while (TRUE) { + idx <- round(runif(1, 1, n)) + if (cv.idx[idx] == 0) { + cv.idx[idx] <- j + j <- j + 1 + j <- (j %% folds) + break + } + } + } + + j <- 1 + i <- 0 + while (i < big.folds) { + if (cv.idx[j] == 0) { + cv.idx[j] <- i + i <- i + 1 + } + j <- j + 1 + } + + return(cv.idx) +} + +gensvm.validate.param.grid <- function(df) +{ + expected.colnames <- c("kernel", "coef", "degree", "gamma", "weight", + "kappa", "lambda", "p", "epsilon", "max.iter") + for (name in colnames(df)) { + if (!(name %in% expected.colnames)) { + stop("Invalid header name supplied in parameter grid: ", name) + } + } + + conditions <- list( + p=function(x) { x >= 1.0 && x <= 2.0 }, + kappa=function(x) { x > -1.0 }, + lambda=function(x) {x > 0.0 }, + epsilon=function(x) { x > 0.0 }, + gamma=function(x) { x != 0.0 }, + weight=function(x) { x %in% c("unit", "group") }, + kernel=function(x) { x %in% c("linear", "poly", "rbf", "sigmoid") } + ) + + for (idx in 1:nrow(df)) { + for (param in colnames(df)) { + if (!(param %in% names(conditions))) + next + func <- conditions[[param]] + value <- df[[param]][idx] + if (!func(value)) + stop("Invalid value in grid for parameter: ", param) + } + } +} + +gensvm.cv.results <- function(results, param.grid, cv.idx, y.true, + scoring, return.train.score=TRUE) +{ + n.candidates <- nrow(param.grid) + n.splits <- length(unique(cv.idx)) + + score <- if(is.function(scoring)) scoring else gensvm.accuracy + + # Build names and initialize the data.frame + names <- c("mean.fit.time", "mean.score.time", "mean.test.score") + if (return.train.score) + names <- c(names, "mean.train.score") + for (param in names(param.grid)) { + names <- c(names, sprintf("param.%s", param)) + } + names <- c(names, "rank.test.score") + for (idx in sort(unique(cv.idx))) { + names <- c(names, 
sprintf("split%i.test.score", idx)) + if (return.train.score) + names <- c(names, sprintf("split%i.train.score", idx)) + } + names <- c(names, "std.fit.time", "std.score.time", "std.test.score") + if (return.train.score) + names <- c(names, "std.train.score") + + df <- data.frame(matrix(ncol=length(names), nrow=n.candidates)) + colnames(df) <- names + + for (pidx in 1:n.candidates) { + param <- param.grid[pidx, , drop=F] + durations <- results$durations[pidx, ] + predictions <- results$predictions[pidx, ] + + fit.times <- durations + score.times <- c() + test.scores <- c() + train.scores <- c() + + is.missing <- any(is.na(durations)) + + for (test.idx in sort(unique(cv.idx))) { + score.time <- 0 + + if (return.train.score) { + y.train.pred <- predictions[cv.idx != test.idx] + y.train.true <- y.true[cv.idx != test.idx] + + start.time <- proc.time() + train.score <- score(y.train.true, y.train.pred) + stop.time <- proc.time() + score.time <- score.time + (stop.time - start.time)[3] + + train.scores <- c(train.scores, train.score) + } + + y.test.pred <- predictions[cv.idx == test.idx] + y.test.true <- y.true[cv.idx == test.idx] + + start.time <- proc.time() + test.score <- score(y.test.true, y.test.pred) + stop.time <- proc.time() + score.time <- score.time + (stop.time - start.time)[3] + + test.scores <- c(test.scores, test.score) + + score.times <- c(score.times, score.time) + } + + df$mean.fit.time[pidx] <- mean(fit.times) + df$mean.score.time[pidx] <- if(is.missing) NA else mean(score.times) + df$mean.test.score[pidx] <- mean(test.scores) + df$std.fit.time[pidx] <- sd(fit.times) + df$std.score.time[pidx] <- if(is.missing) NA else sd(score.times) + df$std.test.score[pidx] <- sd(test.scores) + if (return.train.score) { + df$mean.train.score[pidx] <- mean(train.scores) + df$std.train.score[pidx] <- sd(train.scores) + } + + for (parname in names(param.grid)) { + df[[sprintf("param.%s", parname)]][pidx] <- param[[parname]] + } + + j <- 1 + for (test.idx in 
sort(unique(cv.idx))) { + lbl <- sprintf("split%i.test.score", test.idx) + df[[lbl]][pidx] <- test.scores[j] + if (return.train.score) { + lbl <- sprintf("split%i.train.score", test.idx) + df[[lbl]][pidx] <- train.scores[j] + } + j <- j + 1 + } + } + + df$rank.test.score <- gensvm.rank.score(df$mean.test.score) + + return(df) +} + +gensvm.sort.param.grid <- function(param.grid) +{ + all.cols <- c("kernel", "coef", "degree", "gamma", "weight", "kappa", + "lambda", "p", "epsilon", "max.iter") + + order.args <- NULL + for (name in all.cols) { + if (name %in% colnames(param.grid)) { + if (name == "epsilon") + order.args <- cbind(order.args, -param.grid[[name]]) + else + order.args <- cbind(order.args, param.grid[[name]]) + } + } + sorted.pg <- param.grid[do.call(order, as.list(as.data.frame(order.args))), ] + + rownames(sorted.pg) <- NULL + + return(sorted.pg) +} + +gensvm.expand.param.grid <- function(pg, n.features) +{ + if ("kernel" %in% colnames(pg)) { + all.kernels <- c("linear", "poly", "rbf", "sigmoid") + pg$kernel <- match(pg$kernel, all.kernels) - 1 + } else { + pg$kernel <- 0 + } + + if ("weight" %in% colnames(pg)) { + all.weights <- c("unit", "group") + pg$weight <- match(pg$weight, all.weights) + } else { + pg$weight <- 1 + } + + if ("gamma" %in% colnames(pg)) { + pg$gamma[pg$gamma == "auto"] <- 1.0/n.features + } else { + pg$gamma <- 1.0/n.features + } + + if (!("degree" %in% colnames(pg))) + pg$degree <- 2.0 + if (!("coef" %in% colnames(pg))) + pg$coef <- 0.0 + if (!("p" %in% colnames(pg))) + pg$p <- 1.0 + if (!("lambda" %in% colnames(pg))) + pg$lambda <- 1e-8 + if (!("kappa" %in% colnames(pg))) + pg$kappa <- 0.0 + if (!("epsilon" %in% colnames(pg))) + pg$epsilon <- 1e-6 + if (!("max.iter" %in% colnames(pg))) + pg$max.iter <- 1e8 + + C.param.grid <- data.frame(kernel=pg$kernel, coef=pg$coef, + degree=pg$degree, gamma=pg$gamma, + weight=pg$weight, kappa=pg$kappa, + lambda=pg$lambda, p=pg$p, epsilon=pg$epsilon, + max.iter=pg$max.iter) + + 
return(C.param.grid)
+}
+
+#' @title Compute the ranks for the numbers in a given vector
+#'
+#' @details
+#' This function computes the ranks for the values in an array. The highest
+#' value gets the smallest rank. Ties are broken by assigning the smallest
+#' rank to all tied values.
+#'
+#' @param x array of numeric values
+#'
+#' @examples
+#' x <- c(7, 0.1, 0.5, 0.1, 10, 100, 200)
+#' gensvm.rank.score(x)
+#' # [1] 4 6 5 6 3 2 1
+#'
+gensvm.rank.score <- function(x)
+{
+    x <- as.array(x)
+    l <- length(x)
+    r <- 1
+    ranks <- as.vector(matrix(0, l, 1))
+    ranks[which(is.na(x))] <- NA
+    while (!all(mapply(is.na, x))) {
+        m <- max(x, na.rm=T)
+        idx <- which(x == m)
+        ranks[idx] <- r
+        r <- r + length(idx)
+        x[idx] <- NA
+    }
+
+    return(ranks)
+}
diff --git a/R/gensvm.maxabs.scale.R b/R/gensvm.maxabs.scale.R
new file mode 100644
index 0000000..6ac351b
--- /dev/null
+++ b/R/gensvm.maxabs.scale.R
@@ -0,0 +1,77 @@
+#' @title Scale each column of a matrix by its maximum absolute value
+#'
+#' @description Scaling a dataset can greatly decrease the computation time of
+#' GenSVM. This function scales the data by dividing each column of a matrix by
+#' the maximum absolute value of that column. This preserves sparsity in the
+#' data while mapping each column to the interval [-1, 1].
+#'
+#' Optionally a test dataset can be provided as well. In this case, the scaling
+#' will be computed on the first argument (\code{x}) and applied to the test
+#' dataset. Note that the return value is a list when this argument is
+#' supplied.
+#'
+#' @param x a matrix to scale
+#' @param x.test (optional) a test matrix to scale as well.
+#'
+#' @return if x.test=NULL a scaled matrix where the maximum value of the
+#' columns is 1 and the minimum value of the columns isn't below -1. If x.test
+#' is supplied, a list with elements \code{x} and \code{x.test} representing
+#' the scaled datasets.
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr
+#' Maintainer: Gerrit J.J. 
van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @export
+#'
+#' @examples
+#' x <- iris[, -5]
+#'
+#' # check the min and max of the columns
+#' apply(x, 2, min)
+#' apply(x, 2, max)
+#'
+#' # scale the data
+#' x.scale <- gensvm.maxabs.scale(x)
+#'
+#' # check again (max should be 1.0, min shouldn't be below -1)
+#' apply(x.scale, 2, min)
+#' apply(x.scale, 2, max)
+#'
+#' # with a train and test dataset
+#' x <- iris[, -5]
+#' split <- gensvm.train.test.split(x)
+#' x.train <- split$x.train
+#' x.test <- split$x.test
+#' scaled <- gensvm.maxabs.scale(x.train, x.test)
+#' x.train.scl <- scaled$x
+#' x.test.scl <- scaled$x.test
+#'
+gensvm.maxabs.scale <- function(x, x.test=NULL)
+{
+    xm <- as.matrix(x)
+    max.abs <- apply(apply(xm, 2, abs), 2, max)
+    max.abs[max.abs == 0] <- 1
+
+    scaled <- xm %*% diag(1.0 / max.abs)
+    colnames(scaled) <- colnames(x)
+    rownames(scaled) <- rownames(x)
+
+    if (!is.null(x.test)) {
+        xtm <- as.matrix(x.test)
+        scaled.test <- xtm %*% diag(1.0 / max.abs)
+        colnames(scaled.test) <- colnames(x.test)
+        rownames(scaled.test) <- rownames(x.test)
+
+        ret.val <- list(x=scaled, x.test=scaled.test)
+    } else {
+        ret.val <- scaled
+    }
+
+    return(ret.val)
+}
diff --git a/R/gensvm.refit.R b/R/gensvm.refit.R
new file mode 100644
index 0000000..a6af3fd
--- /dev/null
+++ b/R/gensvm.refit.R
@@ -0,0 +1,82 @@
+#' @title Train an already fitted model on new data
+#'
+#' @description This function can be used to train an existing model on new
+#' data or fit an existing model with slightly different parameters. It is
+#' useful for retraining without having to copy all the parameters over. One
+#' common application for this is to refit the best model found by a grid
+#' search, as illustrated in the examples.
+#'
+#' @param fit Fitted \code{gensvm} object
+#' @param X Data matrix of the new data
+#' @param y Label vector of the new data
+#' @param p,lambda,kappa,epsilon,weights,kernel,gamma,coef,degree,kernel.eigen.cutoff,max.iter,random.seed
+#' (optional) model hyperparameters. If NULL (the default) the value from the
+#' fitted model is chosen.
+#' @param verbose Turn on verbose output and fit progress. If NULL (the
+#' default) the value from the fitted model is chosen.
+#'
+#' @return a new fitted \code{gensvm} model
+#'
+#' @export
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @examples
+#' x <- iris[, -5]
+#' y <- iris[, 5]
+#'
+#' # fit a standard model and refit with slightly different parameters
+#' fit <- gensvm(x, y)
+#' fit2 <- gensvm.refit(fit, x, y, epsilon=1e-8)
+#'
+#' # refit the best model found by a grid search
+#' grid <- gensvm.grid(x, y)
+#' fit <- gensvm.refit(grid$best.estimator, x, y, epsilon=1e-8)
+#'
+#' # refit on different data
+#' idx <- runif(nrow(x)) > 0.5
+#' x1 <- x[idx, ]
+#' x2 <- x[!idx, ]
+#' y1 <- y[idx]
+#' y2 <- y[!idx]
+#'
+#' fit1 <- gensvm(x1, y1)
+#' fit2 <- gensvm.refit(fit1, x2, y2)
+#'
+gensvm.refit <- function(fit, X, y, p=NULL, lambda=NULL, kappa=NULL,
+                         epsilon=NULL, weights=NULL, kernel=NULL, gamma=NULL,
+                         coef=NULL, degree=NULL, kernel.eigen.cutoff=NULL,
+                         max.iter=NULL, verbose=NULL, random.seed=NULL)
+{
+    p <- if(is.null(p)) fit$p else p
+    lambda <- if(is.null(lambda)) fit$lambda else lambda
+    kappa <- if(is.null(kappa)) fit$kappa else kappa
+    epsilon <- if(is.null(epsilon)) fit$epsilon else epsilon
+    weights <- if(is.null(weights)) fit$weights else weights
+    kernel <- if(is.null(kernel)) fit$kernel else kernel
+    gamma <- if(is.null(gamma)) fit$gamma else gamma
+    coef <- if(is.null(coef)) fit$coef else coef
+    degree <- if(is.null(degree)) fit$degree else degree
+    kernel.eigen.cutoff <- 
(if(is.null(kernel.eigen.cutoff)) + fit$kernel.eigen.cutoff else kernel.eigen.cutoff) + max.iter <- if(is.null(max.iter)) fit$max.iter else max.iter + verbose <- if(is.null(verbose)) fit$verbose else verbose + random.seed <- if(is.null(random.seed)) fit$random.seed else random.seed + + # Setting the error handler here is necessary in case the user interrupts + # this call to gensvm. If we don't set the error handler, R will + # unnecessarily drop to a browser() session. We reset the error handler + # after the call to gensvm(). + options(error=function() {}) + newfit <- gensvm(X, y, p=p, lambda=lambda, kappa=kappa, epsilon=epsilon, + weights=weights, kernel=kernel, gamma=gamma, coef=coef, + degree=degree, kernel.eigen.cutoff=kernel.eigen.cutoff, + verbose=verbose, max.iter=max.iter, seed.V=coef(fit)) + options(error=NULL) + + return(newfit) +} diff --git a/R/gensvm.train.test.split.R b/R/gensvm.train.test.split.R new file mode 100644 index 0000000..406f80e --- /dev/null +++ b/R/gensvm.train.test.split.R @@ -0,0 +1,121 @@ +#' @title Create a train/test split of a dataset +#' +#' @description Often it is desirable to split a dataset into a training and +#' testing sample. This function is included in GenSVM to make it easy to do +#' so. The function is inspired by a similar function in Scikit-Learn. +#' +#' @param x array to split +#' @param y another array to split (typically this is a vector) +#' @param train.size size of the training dataset. This can be provided as +#' float or as int. If it's a float, it should be between 0.0 and 1.0 and +#' represents the fraction of the dataset that should be placed in the training +#' dataset. If it's an int, it represents the exact number of samples in the +#' training dataset. If it is NULL, the complement of \code{test.size} will be +#' used. +#' @param test.size size of the test dataset. Similarly to train.size both a +#' float or an int can be supplied. If it's NULL, the complement of train.size +#' will be used. 
If both train.size and test.size are NULL, a default test.size +#' of 0.25 will be used. +#' @param shuffle shuffle the rows or not +#' @param random.state seed for the random number generator (int) +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @export +#' +#' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' # using the default values +#' split <- gensvm.train.test.split(x, y) +#' +#' # using the split in a GenSVM model +#' fit <- gensvm(split$x.train, split$y.train) +#' gensvm.accuracy(split$y.test, predict(fit, split$x.test)) +#' +#' # using attach makes the results directly available +#' attach(gensvm.train.test.split(x, y)) +#' fit <- gensvm(x.train, y.train) +#' gensvm.accuracy(y.test, predict(fit, x.test)) +#' +gensvm.train.test.split <- function(x, y=NULL, train.size=NULL, test.size=NULL, + shuffle=TRUE, random.state=NULL, + return.idx=FALSE) +{ + if (!is.null(y) && dim(as.matrix(x))[1] != dim(as.matrix(y))[1]) { + cat("Error: First dimension of x and y should be equal.\n") + return + } + + n.objects <- dim(as.matrix(x))[1] + + if (is.null(train.size) && is.null(test.size)) { + test.size <- round(0.25 * n.objects) + train.size <- n.objects - test.size + } + else if (is.null(train.size)) { + if (test.size > 0.0 && test.size < 1.0) + test.size <- round(n.objects * test.size) + train.size <- n.objects - test.size + } + else if (is.null(test.size)) { + if (train.size > 0.0 && train.size < 1.0) + train.size <- round(n.objects * train.size) + test.size <- n.objects - train.size + } + else { + if (train.size > 0.0 && train.size < 1.0) + train.size <- round(n.objects * train.size) + if (test.size > 0.0 && 
test.size < 1.0) + test.size <- round(n.objects * test.size) + } + + if (!is.null(random.state)) + set.seed(random.state) + + if (shuffle) { + train.idx <- sample(n.objects, train.size) + diff <- setdiff(1:n.objects, train.idx) + test.idx <- sample(diff, test.size) + } else { + train.idx <- 1:train.size + diff <- setdiff(1:n.objects, train.idx) + test.idx <- diff[1:test.size] + } + + x.train <- x[train.idx, ] + x.test <- x[test.idx, ] + + if (!is.null(y)) { + if (is.matrix(y)) { + y.train <- y[train.idx, ] + y.test <- y[test.idx, ] + } else { + y.train <- y[train.idx] + y.test <- y[test.idx] + } + } + + out <- list( + x.train = x.train, + x.test = x.test + ) + if (!is.null(y)) { + out$y.train <- y.train + out$y.test <- y.test + } + if (return.idx) { + out$idx.train <- train.idx + out$idx.test <- test.idx + } + + return(out) +} diff --git a/R/plot.gensvm.R b/R/plot.gensvm.R new file mode 100644 index 0000000..0ce215b --- /dev/null +++ b/R/plot.gensvm.R @@ -0,0 +1,199 @@ +#' @title Plot the simplex space of the fitted GenSVM model +#' +#' @description This function creates a plot of the simplex space for a fitted +#' GenSVM model and the given data set, as long as the dataset consists of only +#' 3 classes. For more than 3 classes, the simplex space is too high +#' dimensional to easily visualize. +#' +#' @param fit A fitted \code{gensvm} object +#' @param x the dataset to plot +#' @param y.true the true data labels. If provided the objects will be colored +#' using the true labels instead of the predicted labels. This makes it easy to +#' identify misclassified objects. +#' @param with.margins plot the margins +#' @param with.shading show shaded areas for the class regions +#' @param with.legend show the legend for the class labels +#' @param center.plot ensure that the boundaries and margins are always visible +#' in the plot +#' @param ... further arguments are ignored +#' +#' @return returns the object passed as input +#' +#' @author +#' Gerrit J.J. 
van den Burg, Patrick J.F. Groenen \cr
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @method plot gensvm
+#' @export
+#'
+#' @examples
+#' x <- iris[, -5]
+#' y <- iris[, 5]
+#'
+#' # train the model
+#' fit <- gensvm(x, y)
+#'
+#' # plot the simplex space
+#' plot(fit, x)
+#'
+#' # plot and use the true colors (easier to spot misclassified samples)
+#' plot(fit, x, y.true=y)
+#'
+#' # plot only misclassified samples
+#' x.mis <- x[predict(fit, x) != y, ]
+#' y.mis.true <- y[predict(fit, x) != y]
+#' plot(fit, x.mis)
+#' plot(fit, x.mis, y.true=y.mis.true)
+#'
+plot.gensvm <- function(fit, x, y.true=NULL, with.margins=TRUE,
+                        with.shading=TRUE, with.legend=TRUE, center.plot=TRUE,
+                        ...)
+{
+    if (fit$n.classes != 3) {
+        cat("Error: Can only plot with 3 classes\n")
+        return
+    }
+
+    # Sanity check
+    if (ncol(x) != fit$n.features) {
+        cat("Error: Number of features of fitted model and testing data
+            disagree.\n")
+        return
+    }
+
+    x.train <- fit$X.train
+    if (fit$kernel != 'linear' && is.null(x.train)) {
+        cat("Error: The training data is needed to plot data for ",
+            "nonlinear GenSVM. 
This data is not present in the fitted ", + "model!\n", sep="") + return + } + if (!is.null(x.train) && ncol(x.train) != fit$n.features) { + cat("Error: Number of features of fitted model and training data disagree.") + return + } + + x <- as.matrix(x) + + if (fit$kernel == 'linear') { + V <- coef(fit) + Z <- cbind(matrix(1, dim(x)[1], 1), x) + S <- Z %*% V + y.pred.orig <- predict(fit, x) + } else { + kernels <- c("linear", "poly", "rbf", "sigmoid") + kernel.idx <- which(kernels == fit$kernel) - 1 + plotdata <- .Call("R_gensvm_plotdata_kernels", + as.matrix(x), + as.matrix(x.train), + as.matrix(fit$V), + as.integer(nrow(fit$V)), + as.integer(ncol(fit$V)), + as.integer(nrow(x.train)), + as.integer(nrow(x)), + as.integer(fit$n.features), + as.integer(fit$n.classes), + as.integer(kernel.idx), + fit$gamma, + fit$coef, + fit$degree, + fit$kernel.eigen.cutoff + ) + S <- plotdata$ZV + y.pred.orig <- plotdata$y.pred + } + + classes <- fit$classes + if (is.factor(y.pred.orig)) { + y.pred <- match(y.pred.orig, classes) + } else { + y.pred <- y.pred.orig + } + + # Define some colors + point.blue <- rgb(31, 119, 180, maxColorValue=255) + point.orange <- rgb(255, 127, 14, maxColorValue=255) + point.green <- rgb(44, 160, 44, maxColorValue=255) + fill.blue <- rgb(31, 119, 180, 51, maxColorValue=255) + fill.orange <- rgb(255, 127, 14, 51, maxColorValue=255) + fill.green <- rgb(44, 160, 44, 51, maxColorValue=255) + + colors <- as.matrix(c(point.green, point.blue, point.orange)) + markers <- as.matrix(c(15, 16, 17)) + + if (is.null(y.true)) { + col.vector <- colors[y.pred] + mark.vector <- markers[y.pred] + } else { + col.vector <- colors[y.true] + mark.vector <- markers[y.true] + } + + par(pty="s") + if (center.plot) { + new.xlim <- c(min(min(S[, 1]), -1.2), max(max(S[, 1]), 1.2)) + new.ylim <- c(min(min(S[, 2]), -0.75), max(max(S[, 2]), 1.2)) + plot(S[, 1], S[, 2], col=col.vector, pch=mark.vector, ylab='', xlab='', + asp=1, xlim=new.xlim, ylim=new.ylim) + } else { + plot(S[, 1], 
S[, 2], col=col.vector, pch=mark.vector, ylab='', xlab='', + asp=1) + } + + limits <- par("usr") + xmin <- limits[1] + xmax <- limits[2] + ymin <- limits[3] + ymax <- limits[4] + + # draw the fixed boundaries + segments(0, 0, 0, ymin) + segments(0, 0, xmax, xmax/sqrt(3)) + segments(xmin, abs(xmin)/sqrt(3), 0, 0) + + if (with.margins) { + # margin from left below decision boundary to center + segments(xmin, -xmin/sqrt(3) - sqrt(4/3), -1, -1/sqrt(3), lty=2) + + # margin from left center to down + segments(-1, -1/sqrt(3), -1, ymin, lty=2) + + # margin from right center to middle + segments(1, -1/sqrt(3), 1, ymin, lty=2) + + # margin from right center to right boundary + segments(1, -1/sqrt(3), xmax, xmax/sqrt(3) - sqrt(4/3), lty=2) + + # margin from center to top left + segments(xmin, -xmin/sqrt(3) + sqrt(4/3), 0, sqrt(4/3), lty=2) + + # margin from center to top right + segments(0, sqrt(4/3), xmax, xmax/sqrt(3) + sqrt(4/3), lty=2) + } + + if (with.shading) { + # bottom left + polygon(c(xmin, -1, -1, xmin), c(ymin, ymin, -1/sqrt(3), -xmin/sqrt(3) - + sqrt(4/3)), col=fill.green, border=NA) + # bottom right + polygon(c(1, xmax, xmax, 1), c(ymin, ymin, xmax/sqrt(3) - sqrt(4/3), + -1/sqrt(3)), col=fill.blue, border=NA) + # top + polygon(c(xmin, 0, xmax, xmax, xmin), + c(-xmin/sqrt(3) + sqrt(4/3), sqrt(4/3), xmax/sqrt(3) + sqrt(4/3), + ymax, ymax), col=fill.orange, + border=NA) + } + + if (with.legend) { + offset <- abs(xmax - xmin) * 0.05 + legend(xmax + offset, ymax, classes, col=colors, pch=markers, xpd=T) + } + + invisible(fit) +} diff --git a/R/plot.gensvm.grid.R b/R/plot.gensvm.grid.R new file mode 100644 index 0000000..da101e6 --- /dev/null +++ b/R/plot.gensvm.grid.R @@ -0,0 +1,39 @@ +#' @title Plot the simplex space of the best fitted model in the GenSVMGrid +#' +#' @description This is a wrapper which calls the plot function for the best +#' model in the provided GenSVMGrid object. See the documentation for +#' \code{\link{plot.gensvm}} for more information. 
+#' +#' @param grid A \code{gensvm.grid} object trained with refit=TRUE +#' @param x the dataset to plot +#' @param ... further arguments are passed to the plot function +#' +#' @return returns the object passed as input +#' +#' @export +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' grid <- gensvm.grid(x, y) +#' plot(grid, x) +#' +plot.gensvm.grid <- function(grid, x, ...) +{ + if (is.null(grid$best.estimator)) { + cat("Error: Can't plot, the best.estimator element is NULL\n") + return + } + fit <- grid$best.estimator + return(plot(fit, x, ...)) +} diff --git a/R/predict.gensvm.R b/R/predict.gensvm.R index 5c8f2e7..7e04fe4 100644 --- a/R/predict.gensvm.R +++ b/R/predict.gensvm.R @@ -4,8 +4,8 @@ #' fitted GenSVM model. #' #' @param fit Fitted \code{gensvm} object -#' @param newx Matrix of new values for \code{x} for which predictions need to -#' be made. +#' @param x.test Matrix of new values for \code{x} for which predictions need +#' to be made. #' @param \dots further arguments are ignored #' #' @return a vector of class labels, with the same type as the original class @@ -15,7 +15,7 @@ #' @aliases predict #' #' @author -#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr #' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> #' #' @references @@ -24,10 +24,20 @@ #' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. 
#' #' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] #' +#' # create a training and test sample +#' attach(gensvm.train.test.split(x, y)) +#' fit <- gensvm(x.train, y.train) #' +#' # predict the class labels of the test sample +#' y.test.pred <- predict(fit, x.test) #' -predict.gensvm <- function(fit, newx, ...) +#' # compute the accuracy with gensvm.accuracy +#' gensvm.accuracy(y.test, y.test.pred) +#' +predict.gensvm <- function(fit, x.test, ...) { ## Implementation note: ## - It might seem that it would be faster to do the prediction directly in @@ -37,16 +47,53 @@ predict.gensvm <- function(fit, newx, ...) ## the C implementation is *much* faster than doing it in R. # Sanity check - if (ncol(newx) != fit$n.features) - stop("Number of features of fitted model and supplied data disagree.") + if (ncol(x.test) != fit$n.features) { + cat("Error: Number of features of fitted model and testing", + "data disagree.\n") + return + } + + x.train <- fit$X.train + if (fit$kernel != 'linear' && is.null(x.train)) { + cat("Error: The training data is needed to compute predictions for ", + "nonlinear GenSVM. 
This data is not present in the fitted ", + "model!\n", sep="") + } + if (!is.null(x.train) && ncol(x.train) != fit$n.features) { + cat("Error: Number of features of fitted model and training", + "data disagree.\n") + return + } - y.pred.c <- .Call("R_gensvm_predict", - as.matrix(newx), + if (fit$kernel == 'linear') { + y.pred.c <- .Call("R_gensvm_predict", + as.matrix(x.test), as.matrix(fit$V), - as.integer(nrow(newx)), - as.integer(ncol(newx)), + as.integer(nrow(x.test)), + as.integer(ncol(x.test)), as.integer(fit$n.classes) ) + } else { + kernels <- c("linear", "poly", "rbf", "sigmoid") + kernel.idx <- which(kernels == fit$kernel) - 1 + y.pred.c <- .Call("R_gensvm_predict_kernels", + as.matrix(x.test), + as.matrix(x.train), + as.matrix(fit$V), + as.integer(nrow(fit$V)), + as.integer(ncol(fit$V)), + as.integer(nrow(x.train)), + as.integer(nrow(x.test)), + as.integer(fit$n.features), + as.integer(fit$n.classes), + as.integer(kernel.idx), + fit$gamma, + fit$coef, + fit$degree, + fit$kernel.eigen.cutoff + ) + } + yhat <- fit$classes[y.pred.c] return(yhat) diff --git a/R/predict.gensvm.grid.R b/R/predict.gensvm.grid.R new file mode 100644 index 0000000..81a0207 --- /dev/null +++ b/R/predict.gensvm.grid.R @@ -0,0 +1,47 @@ +#' @title Predict class labels from the GenSVMGrid class +#' +#' @description Predict class labels using the best model from a grid search. +#' After doing a grid search with the \code{\link{gensvm.grid}} function, this +#' function can be used to make predictions of class labels. It uses the best +#' GenSVM model found during the grid search to do the predictions. Note that +#' this model is only available if \code{refit=TRUE} was specified in the +#' \code{\link{gensvm.grid}} call (the default). +#' +#' @param grid A \code{gensvm.grid} object trained with \code{refit=TRUE} +#' @param newx Matrix of new values for \code{x} for which predictions need to +#' be computed. 
+#' @param \dots further arguments are passed to predict.gensvm() +#' +#' @return a vector of class labels, with the same type as the original class +#' labels provided to gensvm.grid() +#' +#' @export +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' # run a grid search +#' grid <- gensvm.grid(x, y) +#' +#' # predict training sample +#' y.hat <- predict(grid, x) +#' +predict.gensvm.grid <- function(grid, newx, ...) +{ + if (is.null(grid$best.estimator)) { + cat("Error: Can't predict, the best.estimator element is NULL\n") + return + } + + return(predict(grid$best.estimator, newx, ...)) +} diff --git a/R/print.gensvm.R b/R/print.gensvm.R index 06a3649..119b264 100644 --- a/R/print.gensvm.R +++ b/R/print.gensvm.R @@ -2,13 +2,14 @@ #' #' @description Prints a short description of the fitted GenSVM model #' -#' @param object A \code{gensvm} object to print +#' @param fit A \code{gensvm} object to print #' @param \dots further arguments are ignored #' -#' @return returns the object passed as input +#' @return returns the object passed as input. This can be useful for chaining +#' operations on a fit object. #' #' @author -#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr #' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> #' #' @references @@ -20,37 +21,52 @@ #' @export #' #' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] #' +#' # fit and print the model +#' fit <- gensvm(x, y) +#' print(fit) #' -print.gensvm <- function(object, ...) 
+#' # (advanced) use the fact that print returns the fitted model +#' fit <- gensvm(x, y) +#' predict(print(fit), x) +#' +print.gensvm <- function(fit, ...) { - cat("\nCall:\n") - dput(object$call) - cat("\nData:\n") - cat("\tn.objects:", object$n.objects, "\n") - cat("\tn.features:", object$n.features, "\n") - cat("\tn.classes:", object$n.classes, "\n") - cat("\tclasses:", object$classes, "\n") + cat("Data:\n") + cat("\tn.objects:", fit$n.objects, "\n") + cat("\tn.features:", fit$n.features, "\n") + cat("\tn.classes:", fit$n.classes, "\n") + if (is.factor(fit$classes)) + cat("\tclasses:", levels(fit$classes), "\n") + else + cat("\tclasses:", fit$classes, "\n") cat("Parameters:\n") - cat("\tp:", object$p, "\n") - cat("\tlambda:", object$lambda, "\n") - cat("\tkappa:", object$kappa, "\n") - cat("\tepsilon:", object$epsilon, "\n") - cat("\tweights:", object$weights, "\n") - cat("\tmax.iter:", object$max.iter, "\n") - cat("\trandom.seed:", object$random.seed, "\n") - cat("\tkernel:", object$kernel, "\n") - if (object$kernel %in% c("poly", "rbf", "sigmoid")) { - cat("\tkernel.eigen.cutoff:", object$kernel.eigen.cutoff, "\n") - cat("\tgamma:", object$gamma, "\n") + cat("\tp:", fit$p, "\n") + cat("\tlambda:", fit$lambda, "\n") + cat("\tkappa:", fit$kappa, "\n") + cat("\tepsilon:", fit$epsilon, "\n") + cat("\tweights:", fit$weights, "\n") + cat("\tmax.iter:", fit$max.iter, "\n") + cat("\trandom.seed:", fit$random.seed, "\n") + if (is.factor(fit$kernel)) { + cat("\tkernel:", levels(fit$kernel)[as.numeric(fit$kernel)], "\n") + } else { + cat("\tkernel:", fit$kernel, "\n") + } + if (fit$kernel %in% c("poly", "rbf", "sigmoid")) { + cat("\tkernel.eigen.cutoff:", fit$kernel.eigen.cutoff, "\n") + cat("\tgamma:", fit$gamma, "\n") } - if (object$kernel %in% c("poly", "sigmoid")) - cat("\tcoef:", object$coef, "\n") - if (object$kernel == 'poly') - cat("\tdegree:", object$degree, "\n") + if (fit$kernel %in% c("poly", "sigmoid")) + cat("\tcoef:", fit$coef, "\n") + if (fit$kernel == 
'poly') + cat("\tdegree:", fit$degree, "\n") cat("Results:\n") - cat("\tn.iter:", object$n.iter, "\n") - cat("\tn.support:", object$n.support, "\n") + cat("\ttime:", fit$training.time, "\n") + cat("\tn.iter:", fit$n.iter, "\n") + cat("\tn.support:", fit$n.support, "\n") - invisible(object) + invisible(fit) } diff --git a/R/print.gensvm.grid.R b/R/print.gensvm.grid.R new file mode 100644 index 0000000..88967d7 --- /dev/null +++ b/R/print.gensvm.grid.R @@ -0,0 +1,61 @@ +#' @title Print the fitted GenSVMGrid model +#' +#' @description Prints the summary of the fitted GenSVMGrid model +#' +#' @param grid a \code{gensvm.grid} object to print +#' @param \dots further arguments are ignored +#' +#' @return returns the object passed as input +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @method print gensvm.grid +#' @export +#' +#' @examples +#' x <- iris[, -5] +#' y <- iris[, 5] +#' +#' # fit a grid search and print the resulting object +#' grid <- gensvm.grid(x, y) +#' print(grid) +#' +print.gensvm.grid <- function(grid, ...) 
+{ + cat("Data:\n") + cat("\tn.objects:", grid$n.objects, "\n") + cat("\tn.features:", grid$n.features, "\n") + cat("\tn.classes:", grid$n.classes, "\n") + if (is.factor(grid$classes)) + cat("\tclasses:", levels(grid$classes), "\n") + else + cat("\tclasses:", grid$classes, "\n") + cat("Config:\n") + cat("\tNumber of cv splits:", grid$n.splits, "\n") + not.run <- sum(is.na(grid$cv.results$rank.test.score)) + if (not.run > 0) { + cat("\tParameter grid size:", dim(grid$param.grid)[1]) + cat(" (", not.run, " incomplete)", sep="") + cat("\n") + } else { + cat("\tParameter grid size:", dim(grid$param.grid)[1], "\n") + } + cat("Results:\n") + cat("\tTotal grid search time:", grid$total.time, "\n") + if (!is.na(grid$best.index)) { + best <- grid$cv.results[grid$best.index, ] + cat("\tBest mean test score:", best$mean.test.score, "\n") + cat("\tBest mean fit time:", best$mean.fit.time, "\n") + for (name in colnames(grid$best.params)) + cat("\tBest parameter", name, "=", grid$best.params[[name]], "\n") + } + + invisible(grid) +} diff --git a/man/coef.gensvm.Rd b/man/coef.gensvm.Rd new file mode 100644 index 0000000..73d7a9a --- /dev/null +++ b/man/coef.gensvm.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/coef.gensvm.R +\name{coef.gensvm} +\alias{coef.gensvm} +\title{Get the coefficients of the fitted GenSVM model} +\usage{ +\method{coef}{gensvm}(object, ...) +} +\arguments{ +\item{object}{a \code{gensvm} object} + +\item{\dots}{further arguments are ignored} +} +\value{ +The coefficients of the GenSVM model. This is a matrix of size +\eqn{(n_{features} + 1) x (n_{classes} - 1)}. This matrix is used to project +the input data to a low dimensional space using the equation: \eqn{XW + t} +where \eqn{X} is the input matrix, \eqn{t} is the first row of the matrix +returned by this function, and \eqn{W} is the \eqn{n_{features} x +(n_{classes} - 1)} matrix formed by the remaining rows. 
+} +\description{ +Returns the model coefficients of the GenSVM object +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +fit <- gensvm(x, y) +V <- coef(fit) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/coef.gensvm.grid.Rd b/man/coef.gensvm.grid.Rd new file mode 100644 index 0000000..b8f8a40 --- /dev/null +++ b/man/coef.gensvm.grid.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/coef.gensvm.grid.R +\name{coef.gensvm.grid} +\alias{coef.gensvm.grid} +\title{Get the parameter grid from a GenSVM Grid object} +\usage{ +\method{coef}{gensvm.grid}(object, ...) +} +\arguments{ +\item{object}{a \code{gensvm.grid} object} + +\item{\dots}{further arguments are ignored} +} +\value{ +The parameter grid of the GenSVMGrid object as a data frame. +} +\description{ +Returns the parameter grid of a \code{gensvm.grid} object. +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +grid <- gensvm.grid(x, y) +pg <- coef(grid) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. 
+} + diff --git a/man/gensvm-package.Rd b/man/gensvm-package.Rd new file mode 100644 index 0000000..56e28ac --- /dev/null +++ b/man/gensvm-package.Rd @@ -0,0 +1,111 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm-package.R +\docType{package} +\name{gensvm-package} +\alias{gensvm-package} +\alias{gensvm.package} +\title{GenSVM: A Generalized Multiclass Support Vector Machine} +\description{ +The GenSVM classifier is a generalized multiclass support vector machine +(SVM). This classifier aims to find decision boundaries that separate the +classes with as wide a margin as possible. In GenSVM, the loss functions +that measures how misclassifications are counted is very flexible. This +allows the user to tune the classifier to the dataset at hand and +potentially obtain higher classification accuracy. Moreover, this +flexibility means that GenSVM has a number of alternative multiclass SVMs as +special cases. One of the other advantages of GenSVM is that it is trained +in the primal space, allowing the use of warm starts during optimization. +This means that for common tasks such as cross validation or repeated model +fitting, GenSVM can be trained very quickly. +} +\details{ +This package provides functions for training the GenSVM model either as a +separate model or through a cross-validated parameter grid search. In both +cases the GenSVM C library is used for speed. Auxiliary functions for +evaluating and using the model are also provided. +} +\section{GenSVM functions}{ + +The main GenSVM functions are: +\describe{ +\item{\code{\link{gensvm}}}{Fit a GenSVM model for specific model +parameters.} +\item{\code{\link{gensvm.grid}}}{Run a cross-validated grid search for +GenSVM.} +} + +For the GenSVM and GenSVMGrid models the following two functions are +available. When applied to a GenSVMGrid object, the function is applied to +the best GenSVM model. 
+\describe{ +\item{\code{\link{plot}}}{Plot the low-dimensional \emph{simplex} space +where the decision boundaries are fixed (for problems with 3 classes).} +\item{\code{\link{predict}}}{Predict the class labels of new data using the +GenSVM model.} +} + +Moreover, for the GenSVM and GenSVMGrid models a \code{coef} function is +defined: +\describe{ +\item{\code{\link{coef.gensvm}}}{Get the coefficients of the fitted GenSVM +model.} +\item{\code{\link{coef.gensvm.grid}}}{Get the parameter grid of the GenSVM +grid search.} +} + +The following utility functions are also included: +\describe{ +\item{\code{\link{gensvm.accuracy}}}{Compute the accuracy score between true +and predicted class labels} +\item{\code{\link{gensvm.maxabs.scale}}}{Scale each column of the dataset by +its maximum absolute value, preserving sparsity and mapping the data to [-1, +1]} +\item{\code{\link{gensvm.train.test.split}}}{Split a dataset into a training +and testing sample} +\item{\code{\link{gensvm.refit}}}{Refit a fitted GenSVM model with slightly +different parameters or on a different dataset} +} +} + +\section{Kernels in GenSVM}{ + + +GenSVM can be used for both linear and nonlinear multiclass support vector +machine classification. In general, linear classification will be faster but +depending on the dataset higher classification performance can be achieved +using a nonlinear kernel. + +The following nonlinear kernels are implemented in the GenSVM package: +\describe{ + \item{RBF}{The Radial Basis Function kernel is a well-known kernel function + based on the Euclidean distance between objects. It is defined as + \deqn{ + k(x_i, x_j) = exp( -\gamma || x_i - x_j ||^2 ) + } + } + \item{Polynomial}{A polynomial kernel can also be used in GenSVM. This + kernel function is implemented very generally and therefore takes three + parameters (\code{coef}, \code{gamma}, and \code{degree}). 
It is defined + as: + \deqn{ + k(x_i, x_j) = ( \gamma x_i' x_j + coef)^{degree} + } + } + \item{Sigmoid}{The sigmoid kernel is the final kernel implemented in + GenSVM. This kernel has two parameters and is implemented as follows: + \deqn{ + k(x_i, x_j) = \tanh( \gamma x_i' x_j + coef) + } + } + } +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/gensvm.Rd b/man/gensvm.Rd new file mode 100644 index 0000000..1db0558 --- /dev/null +++ b/man/gensvm.Rd @@ -0,0 +1,136 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.R +\name{gensvm} +\alias{gensvm} +\title{Fit the GenSVM model} +\usage{ +gensvm(X, y, p = 1, lambda = 1e-08, kappa = 0, epsilon = 1e-06, + weights = "unit", kernel = "linear", gamma = "auto", coef = 1, + degree = 2, kernel.eigen.cutoff = 1e-08, verbose = FALSE, + random.seed = NULL, max.iter = 1e+08, seed.V = NULL) +} +\arguments{ +\item{X}{data matrix with the predictors} + +\item{y}{class labels} + +\item{p}{parameter for the L_p norm of the loss function (1.0 <= p <= 2.0)} + +\item{lambda}{regularization parameter for the loss function (lambda > 0)} + +\item{kappa}{parameter for the hinge function in the loss function (kappa > +-1.0)} + +\item{weights}{type of instance weights to use. Options are 'unit' for unit +weights and 'group' for group size correction weight (eq. 4 in the paper).} + +\item{kernel}{the kernel type to use in the classifier. It must be one of +'linear', 'poly', 'rbf', or 'sigmoid'. See the section "Kernels in GenSVM" +in \code{\link{gensvm-package}} for more info.} + +\item{gamma}{kernel parameter for the rbf, polynomial, and sigmoid kernel. 
+If gamma is 'auto', then 1/n_features will be used.} + +\item{coef}{parameter for the polynomial and sigmoid kernel.} + +\item{degree}{parameter for the polynomial kernel} + +\item{kernel.eigen.cutoff}{Cutoff point for the reduced eigendecomposition +used with kernel-GenSVM. Eigenvectors for which the ratio between their +corresponding eigenvalue and the largest eigenvalue is smaller than this +cutoff value will be dropped.} + +\item{verbose}{Turn on verbose output and fit progress} + +\item{random.seed}{Seed for the random number generator (useful for +reproducible output)} + +\item{max.iter}{Maximum number of iterations of the optimization algorithm.} + +\item{seed.V}{Matrix to warm-start the optimization algorithm. This is +typically the output of \code{coef(fit)}. Note that this function will +silently drop seed.V if the dimensions don't match the provided data.} +} +\value{ +A "gensvm" S3 object is returned for which the print, predict, coef, +and plot methods are available. It has the following items: +\item{call}{The call that was used to construct the model.} +\item{p}{The value of the lp norm in the loss function} +\item{lambda}{The regularization parameter used in the model.} +\item{kappa}{The hinge function parameter used.} +\item{epsilon}{The stopping criterion used.} +\item{weights}{The instance weights type used.} +\item{kernel}{The kernel function used.} +\item{gamma}{The value of the gamma parameter of the kernel, if applicable} +\item{coef}{The value of the coef parameter of the kernel, if applicable} +\item{degree}{The degree of the kernel, if applicable} +\item{kernel.eigen.cutoff}{The cutoff value of the reduced +eigendecomposition of the kernel matrix.} +\item{verbose}{Whether or not the model was fitted with progress output} +\item{random.seed}{The random seed used to seed the model.} +\item{max.iter}{Maximum number of iterations of the algorithm.} +\item{n.objects}{Number of objects in the dataset} +\item{n.features}{Number of features in 
the dataset} +\item{n.classes}{Number of classes in the dataset} +\item{classes}{Array with the actual class labels} +\item{V}{Coefficient matrix} +\item{n.iter}{Number of iterations performed in training} +\item{n.support}{Number of support vectors in the final model} +\item{training.time}{Total training time} +\item{X.train}{When training with nonlinear kernels, the training data is +needed to perform prediction. For these kernels it is therefore stored in +the fitted model.} +} +\description{ +Fits the Generalized Multiclass Support Vector Machine model +with the given parameters. See the package documentation +(\code{\link{gensvm-package}}) for more general information about GenSVM. +} +\note{ +This function returns partial results when the computation is interrupted by +the user. +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# fit using the default parameters +fit <- gensvm(x, y) + +# fit and show progress +fit <- gensvm(x, y, verbose=T) + +# fit with some changed parameters +fit <- gensvm(x, y, lambda=1e-8) + +# Early stopping defined through epsilon +fit <- gensvm(x, y, epsilon=1e-3) + +# Early stopping defined through max.iter +fit <- gensvm(x, y, max.iter=1000) + +# Nonlinear training +fit <- gensvm(x, y, kernel='rbf') +fit <- gensvm(x, y, kernel='poly', degree=2, gamma=1.0) + +# Setting the random seed and comparing results +fit <- gensvm(x, y, random.seed=123) +fit2 <- gensvm(x, y, random.seed=123) +all.equal(coef(fit), coef(fit2)) + + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} +\seealso{ +\code{\link{coef}}, \code{\link{print}}, \code{\link{predict}}, +\code{\link{plot}}, and \code{\link{gensvm.grid}}. 
+} + diff --git a/man/gensvm.accuracy.Rd b/man/gensvm.accuracy.Rd new file mode 100644 index 0000000..60a0f89 --- /dev/null +++ b/man/gensvm.accuracy.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.accuracy.R +\name{gensvm.accuracy} +\alias{gensvm.accuracy} +\title{Compute the accuracy score} +\usage{ +gensvm.accuracy(y.true, y.pred) +} +\arguments{ +\item{y.true}{vector of true labels} + +\item{y.pred}{vector of predicted labels} +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +fit <- gensvm(x, y) +gensvm.accuracy(predict(fit, x), y) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} +\seealso{ +\code{\link{predict.gensvm.grid}} +} + diff --git a/man/gensvm.generate.cv.idx.Rd b/man/gensvm.generate.cv.idx.Rd new file mode 100644 index 0000000..34f4f64 --- /dev/null +++ b/man/gensvm.generate.cv.idx.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.grid.R +\name{gensvm.generate.cv.idx} +\alias{gensvm.generate.cv.idx} +\title{Generate a vector of cross-validation indices} +\usage{ +gensvm.generate.cv.idx(n, folds) +} +\description{ +This function generates a vector of length \code{n} with values from 0 to +\code{folds-1} to mark train and test splits. 
+} + diff --git a/man/gensvm.grid.Rd b/man/gensvm.grid.Rd new file mode 100644 index 0000000..6dbec22 --- /dev/null +++ b/man/gensvm.grid.Rd @@ -0,0 +1,161 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.grid.R +\name{gensvm.grid} +\alias{gensvm.grid} +\title{Cross-validated grid search for GenSVM} +\usage{ +gensvm.grid(X, y, param.grid = "tiny", refit = TRUE, scoring = NULL, + cv = 3, verbose = 0, return.train.score = TRUE) +} +\arguments{ +\item{X}{training data matrix. We denote the size of this matrix by +n_samples x n_features.} + +\item{y}{training vector of class labes of length n_samples. The number of +unique labels in this vector is denoted by n_classes.} + +\item{param.grid}{String (\code{'tiny'}, \code{'small'}, or \code{'full'}) +or data frame with parameter configurations to evaluate. Typically this is +the output of \code{expand.grid}. For more details, see "Using a Parameter +Grid" below.} + +\item{refit}{boolean variable. If true, the best model from cross validation +is fitted again on the entire dataset.} + +\item{scoring}{metric to use to evaluate the classifier performance during +cross validation. The metric should be an R function that takes two +arguments: y_true and y_pred and that returns a float such that higher +values are better. 
If it is NULL, the accuracy score will be used.} + +\item{cv}{the number of cross-validation folds to use or a vector with the +same length as \code{y} where each unique value denotes a test split.} + +\item{verbose}{integer to indicate the level of verbosity (higher is more +verbose)} + +\item{return.train.score}{whether or not to return the scores on the +training splits} +} +\value{ +A "gensvm.grid" S3 object with the following items: +\item{call}{Call that produced this object} +\item{param.grid}{Sorted version of the parameter grid used in training} +\item{cv.results}{A data frame with the cross validation results} +\item{best.estimator}{If refit=TRUE, this is the GenSVM model fitted with +the best hyperparameter configuration, otherwise it is NULL} +\item{best.score}{Mean cross-validated test score for the model with the +best hyperparameter configuration} +\item{best.params}{Parameter configuration that provided the highest mean +cross-validated test score} +\item{best.index}{Row index of the cv.results data frame that corresponds to +the best hyperparameter configuration} +\item{n.splits}{The number of cross-validation splits} +\item{n.objects}{The number of instances in the data} +\item{n.features}{The number of features of the data} +\item{n.classes}{The number of classes in the data} +\item{classes}{Array with the unique classes in the data} +\item{total.time}{Training time for the grid search} +\item{cv.idx}{Array with cross validation indices used to split the data} +} +\description{ +This function performs a cross-validated grid search of the +model parameters to find the best hyperparameter configuration for a given +dataset. This function takes advantage of GenSVM's ability to use warm +starts to speed up computation. The function uses the GenSVM C library for +speed. +} +\note{ +This function returns partial results when the computation is interrupted by +the user. 
+} +\section{Using a Parameter Grid}{ + +To evaluate certain paramater configurations, a data frame can be supplied +to the \code{param.grid} argument of the function. Such a data frame can +easily be generated using the R function \code{expand.grid}, or could be +created through other ways to test specific parameter configurations. + +Three parameter grids are predefined: +\describe{ +\item{\code{'tiny'}}{This parameter grid is generated by the function +\code{\link{gensvm.load.tiny.grid}} and is the default parameter grid. It +consists of parameter configurations that are likely to perform well on +various datasets.} +\item{\code{'small'}}{This grid is generated by +\code{\link{gensvm.load.small.grid}} and generates a data frame with 90 +configurations. It is typically fast to train but contains some +configurations that are unlikely to perform well. It is included for +educational purposes.} +\item{\code{'full'}}{This grid loads the parameter grid as used in the +GenSVM paper. It consists of 342 configurations and is generated by the +\code{\link{gensvm.load.full.grid}} function. Note that in the GenSVM paper +cross validation was done with this parameter grid, but the final training +step used \code{epsilon=1e-8}. The \code{\link{gensvm.refit}} function is +useful in this scenario.} +} + +When you provide your own parameter grid, beware that only certain column +names are allowed in the data frame corresponding to parameters for the +GenSVM model. These names are: + +\describe{ +\item{p}{Parameter for the lp norm. Must be in [1.0, 2.0].} +\item{kappa}{Parameter for the Huber hinge function. Must be larger than +-1.} +\item{lambda}{Parameter for the regularization term. Must be larger than 0.} +\item{weight}{Instance weight specification. Allowed values are "unit" for +unit weights and "group" for group-size correction weights} +\item{epsilon}{Stopping parameter for the algorithm. 
 Must be larger than 0.} +\item{max.iter}{Maximum number of iterations of the algorithm. Must be +larger than 0.} +\item{kernel}{The kernel to use, allowed values are "linear", "poly", +"rbf", and "sigmoid". The default is "linear".} +\item{coef}{Parameter for the "poly" and "sigmoid" kernels. See the section +"Kernels in GenSVM" in the \code{\link{gensvm-package}} page for more info.} +\item{degree}{Parameter for the "poly" kernel. See the section "Kernels in +GenSVM" in the \code{\link{gensvm-package}} page for more info.} +\item{gamma}{Parameter for the "poly", "rbf", and "sigmoid" kernels. See the +section "Kernels in GenSVM" in the \code{\link{gensvm-package}} page for more +info.} +} + +For variables that are not present in the \code{param.grid} data frame the +default parameter values in the \code{\link{gensvm}} function will be used. + +Note that this function reorders the parameter grid to make the warm starts +as efficient as possible, which is why the param.grid in the result will not +be the same as the param.grid in the input. +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# use the default parameter grid +grid <- gensvm.grid(x, y) + +# use a smaller parameter grid +pg <- expand.grid(p=c(1.0, 1.5, 2.0), kappa=c(-0.9, 1.0), epsilon=c(1e-3)) +grid <- gensvm.grid(x, y, param.grid=pg) + +# print the result +print(grid) + +# Using a custom scoring function (accuracy as percentage) +acc.pct <- function(yt, yp) { return (100 * sum(yt == yp) / length(yt)) } +grid <- gensvm.grid(x, y, scoring=acc.pct) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. 
+} +\seealso{ +\code{\link{predict.gensvm.grid}}, \code{\link{print.gensvm.grid}}, and +\code{\link{gensvm}}. +} + diff --git a/man/gensvm.load.full.grid.Rd b/man/gensvm.load.full.grid.Rd new file mode 100644 index 0000000..5398ef7 --- /dev/null +++ b/man/gensvm.load.full.grid.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.grid.R +\name{gensvm.load.full.grid} +\alias{gensvm.load.full.grid} +\title{Load a large parameter grid for the GenSVM grid search} +\usage{ +gensvm.load.full.grid() +} +\description{ +This loads the parameter grid from the GenSVM paper. It +consists of 342 configurations and is constructed from all possible +combinations of the following parameter sets: + +\code{p = c(1.0, 1.5, 2.0)} + +\code{lambda = 2^seq(-18, 18, 2)} + +\code{kappa = c(-0.9, 0.5, 5.0)} + +\code{weight = c('unit', 'group')} +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} +\seealso{ +\code{\link{gensvm.grid}}, \code{\link{gensvm.load.tiny.grid}}, +\code{\link{gensvm.load.small.grid}}. +} + diff --git a/man/gensvm.load.small.grid.Rd b/man/gensvm.load.small.grid.Rd new file mode 100644 index 0000000..0866f0c --- /dev/null +++ b/man/gensvm.load.small.grid.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.grid.R +\name{gensvm.load.small.grid} +\alias{gensvm.load.small.grid} +\title{Load a small parameter grid for the GenSVM grid search} +\usage{ +gensvm.load.small.grid() +} +\description{ +This function loads a small parameter grid to use for the +GenSVM gridsearch. 
It contains all possible combinations of the following +parameter sets: + +\code{p = c(1.0, 1.5, 2.0)} + +\code{lambda = c(1e-8, 1e-6, 1e-4, 1e-2, 1)} + +\code{kappa = c(-0.9, 0.5, 5.0)} + +\code{weight = c('unit', 'group')} +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} +\seealso{ +\code{\link{gensvm.grid}}, \code{\link{gensvm.load.tiny.grid}}, +\code{\link{gensvm.load.full.grid}}. +} + diff --git a/man/gensvm.load.tiny.grid.Rd b/man/gensvm.load.tiny.grid.Rd new file mode 100644 index 0000000..9ef0694 --- /dev/null +++ b/man/gensvm.load.tiny.grid.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.grid.R +\name{gensvm.load.tiny.grid} +\alias{gensvm.load.tiny.grid} +\title{Load a tiny parameter grid for the GenSVM grid search} +\usage{ +gensvm.load.tiny.grid() +} +\description{ +This function returns a parameter grid to use in the GenSVM +grid search. This grid was obtained by analyzing the experiments done for +the GenSVM paper and selecting the configurations that achieve accuracy +within the 95th percentile on over 90\% of the datasets. It is a good start +for a parameter search with a reasonably high chance of achieving good +performance on most datasets. + +Note that this grid is only tested to work well in combination with the +linear kernel. +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. 
URL \url{http://jmlr.org/papers/v17/14-526.html}. +} +\seealso{ +\code{\link{gensvm.grid}}, \code{\link{gensvm.load.small.grid}}, +\code{\link{gensvm.load.full.grid}}. +} + diff --git a/man/gensvm.maxabs.scale.Rd b/man/gensvm.maxabs.scale.Rd new file mode 100644 index 0000000..50c6413 --- /dev/null +++ b/man/gensvm.maxabs.scale.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.maxabs.scale.R +\name{gensvm.maxabs.scale} +\alias{gensvm.maxabs.scale} +\title{Scale each column of a matrix by its maximum absolute value} +\usage{ +gensvm.maxabs.scale(x, x.test = NULL) +} +\arguments{ +\item{x}{a matrix to scale} + +\item{x.test}{(optional) a test matrix to scale as well.} +} +\value{ +if x.test=NULL a scaled matrix where the maximum value of the +columns is 1 and the minimum value of the columns isn't below -1. If x.test +is supplied, a list with elements \code{x} and \code{x.test} representing +the scaled datasets. +} +\description{ +Scaling a dataset can creatly decrease the computation time of +GenSVM. This function scales the data by dividing each column of a matrix by +the maximum absolute value of that column. This preserves sparsity in the +data while mapping each column to the interval [-1, 1]. + +Optionally a test dataset can be provided as well. In this case, the scaling +will be computed on the first argument (\code{x}) and applied to the test +dataset. Note that the return value is a list when this argument is +supplied. 
+} +\examples{ +x <- iris[, -5] + +# check the min and max of the columns +apply(x, 2, min) +apply(x, 2, max) + +# scale the data +x.scale <- gensvm.maxabs.scale(x) + +# check again (max should be 1.0, min shouldn't be below -1) +apply(x.scale, 2, min) +apply(x.scale, 2, max) + +# with a train and test dataset +x <- iris[, -5] +split <- gensvm.train.test.split(x) +x.train <- split$x.train +x.test <- split$x.test +scaled <- gensvm.maxabs.scale(x.train, x.test) +x.train.scl <- scaled$x +x.test.scl <- scaled$x.test + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/gensvm.rank.score.Rd b/man/gensvm.rank.score.Rd new file mode 100644 index 0000000..21d6bcd --- /dev/null +++ b/man/gensvm.rank.score.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.grid.R +\name{gensvm.rank.score} +\alias{gensvm.rank.score} +\title{Compute the ranks for the numbers in a given vector} +\usage{ +gensvm.rank.score(x) +} +\arguments{ +\item{x}{array of numeric values} +} +\details{ +This function computes the ranks for the values in an array. The highest +value gets the smallest rank. Ties are broken by assigning the smallest +value. 
+} +\examples{ +x <- c(7, 0.1, 0.5, 0.1, 10, 100, 200) +gensvm.rank.score(x) +# [1] 4 6 5 6 3 2 1 + +} + diff --git a/man/gensvm.refit.Rd b/man/gensvm.refit.Rd new file mode 100644 index 0000000..194cde3 --- /dev/null +++ b/man/gensvm.refit.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.refit.R +\name{gensvm.refit} +\alias{gensvm.refit} +\title{Train an already fitted model on new data} +\usage{ +gensvm.refit(fit, X, y, p = NULL, lambda = NULL, kappa = NULL, + epsilon = NULL, weights = NULL, kernel = NULL, gamma = NULL, + coef = NULL, degree = NULL, kernel.eigen.cutoff = NULL, + max.iter = NULL, verbose = NULL, random.seed = NULL) +} +\arguments{ +\item{fit}{Fitted \code{gensvm} object} + +\item{X}{Data matrix of the new data} + +\item{y}{Label vector of the new data} + +\item{verbose}{Turn on verbose output and fit progress. If NULL (the +default) the value from the fitted model is chosen.} +} +\value{ +a new fitted \code{gensvm} model +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# fit a standard model and refit with slightly different parameters +fit <- gensvm(x, y) +fit2 <- gensvm.refit(fit, x, y, epsilon=1e-8) + +# refit a model returned by a grid search +grid <- gensvm.grid(x, y) +fit <- gensvm.refit(grid$best.estimator, x, y, epsilon=1e-8) + +# refit on different data +idx <- runif(nrow(x)) > 0.5 +x1 <- x[idx, ] +x2 <- x[!idx, ] +y1 <- y[idx] +y2 <- y[!idx] + +fit1 <- gensvm(x1, y1) +fit2 <- gensvm.refit(fit1, x2, y2) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. 
+} + diff --git a/man/gensvm.train.test.split.Rd b/man/gensvm.train.test.split.Rd new file mode 100644 index 0000000..a99940f --- /dev/null +++ b/man/gensvm.train.test.split.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gensvm.train.test.split.R +\name{gensvm.train.test.split} +\alias{gensvm.train.test.split} +\title{Create a train/test split of a dataset} +\usage{ +gensvm.train.test.split(x, y = NULL, train.size = NULL, test.size = NULL, + shuffle = TRUE, random.state = NULL, return.idx = FALSE) +} +\arguments{ +\item{x}{array to split} + +\item{y}{another array to split (typically this is a vector)} + +\item{train.size}{size of the training dataset. This can be provided as +float or as int. If it's a float, it should be between 0.0 and 1.0 and +represents the fraction of the dataset that should be placed in the training +dataset. If it's an int, it represents the exact number of samples in the +training dataset. If it is NULL, the complement of \code{test.size} will be +used.} + +\item{test.size}{size of the test dataset. Similarly to train.size both a +float or an int can be supplied. If it's NULL, the complement of train.size +will be used. If both train.size and test.size are NULL, a default test.size +of 0.25 will be used.} + +\item{shuffle}{shuffle the rows or not} + +\item{random.state}{seed for the random number generator (int)} +} +\description{ +Often it is desirable to split a dataset into a training and +testing sample. This function is included in GenSVM to make it easy to do +so. The function is inspired by a similar function in Scikit-Learn. 
+} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# using the default values +split <- gensvm.train.test.split(x, y) + +# using the split in a GenSVM model +fit <- gensvm(split$x.train, split$y.train) +gensvm.accuracy(split$y.test, predict(fit, split$x.test)) + +# using attach makes the results directly available +attach(gensvm.train.test.split(x, y)) +fit <- gensvm(x.train, y.train) +gensvm.accuracy(y.test, predict(fit, x.test)) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/plot.gensvm.Rd b/man/plot.gensvm.Rd new file mode 100644 index 0000000..b597e18 --- /dev/null +++ b/man/plot.gensvm.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gensvm.R +\name{plot.gensvm} +\alias{plot.gensvm} +\title{Plot the simplex space of the fitted GenSVM model} +\usage{ +\method{plot}{gensvm}(fit, x, y.true = NULL, with.margins = TRUE, + with.shading = TRUE, with.legend = TRUE, center.plot = TRUE, ...) +} +\arguments{ +\item{fit}{A fitted \code{gensvm} object} + +\item{x}{the dataset to plot} + +\item{y.true}{the true data labels. If provided the objects will be colored +using the true labels instead of the predicted labels. 
This makes it easy to +identify misclassified objects.} + +\item{with.margins}{plot the margins} + +\item{with.shading}{show shaded areas for the class regions} + +\item{with.legend}{show the legend for the class labels} + +\item{center.plot}{ensure that the boundaries and margins are always visible +in the plot} + +\item{...}{further arguments are ignored} +} +\value{ +returns the object passed as input +} +\description{ +This function creates a plot of the simplex space for a fitted +GenSVM model and the given data set, as long as the dataset consists of only +3 classes. For more than 3 classes, the simplex space is too high +dimensional to easily visualize. +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# train the model +fit <- gensvm(x, y) + +# plot the simplex space +plot(fit, x) + +# plot and use the true colors (easier to spot misclassified samples) +plot(fit, x, y.true=y) + +# plot only misclassified samples +x.mis <- x[predict(fit, x) != y, ] +y.mis.true <- y[predict(fit, x) != y] +plot(fit, x.mis) +plot(fit, x.mis, y.true=y.mis.true) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/plot.gensvm.grid.Rd b/man/plot.gensvm.grid.Rd new file mode 100644 index 0000000..d54196f --- /dev/null +++ b/man/plot.gensvm.grid.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot.gensvm.grid.R +\name{plot.gensvm.grid} +\alias{plot.gensvm.grid} +\title{Plot the simplex space of the best fitted model in the GenSVMGrid} +\usage{ +\method{plot}{gensvm.grid}(grid, x, ...) 
+} +\arguments{ +\item{grid}{A \code{gensvm.grid} object trained with refit=TRUE} + +\item{x}{the dataset to plot} + +\item{...}{further arguments are passed to the plot function} +} +\value{ +returns the object passed as input +} +\description{ +This is a wrapper which calls the plot function for the best +model in the provided GenSVMGrid object. See the documentation for +\code{\link{plot.gensvm}} for more information. +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +grid <- gensvm.grid(x, y) +plot(grid, x) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/predict.gensvm.Rd b/man/predict.gensvm.Rd new file mode 100644 index 0000000..0c55a43 --- /dev/null +++ b/man/predict.gensvm.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.gensvm.R +\name{predict.gensvm} +\alias{predict} +\alias{predict.gensvm} +\title{Predict class labels with the GenSVM model} +\usage{ +\method{predict}{gensvm}(fit, x.test, ...) +} +\arguments{ +\item{fit}{Fitted \code{gensvm} object} + +\item{x.test}{Matrix of new values for \code{x} for which predictions need +to be made.} + +\item{\dots}{further arguments are ignored} +} +\value{ +a vector of class labels, with the same type as the original class +labels. +} +\description{ +This function predicts the class labels of new data using a +fitted GenSVM model. 
+} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# create a training and test sample +attach(gensvm.train.test.split(x, y)) +fit <- gensvm(x.train, y.train) + +# predict the class labels of the test sample +y.test.pred <- predict(fit, x.test) + +# compute the accuracy with gensvm.accuracy +gensvm.accuracy(y.test, y.test.pred) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/predict.gensvm.grid.Rd b/man/predict.gensvm.grid.Rd new file mode 100644 index 0000000..d4cbd68 --- /dev/null +++ b/man/predict.gensvm.grid.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.gensvm.grid.R +\name{predict.gensvm.grid} +\alias{predict.gensvm.grid} +\title{Predict class labels from the GenSVMGrid class} +\usage{ +\method{predict}{gensvm.grid}(grid, newx, ...) +} +\arguments{ +\item{grid}{A \code{gensvm.grid} object trained with \code{refit=TRUE}} + +\item{newx}{Matrix of new values for \code{x} for which predictions need to +be computed.} + +\item{\dots}{further arguments are passed to predict.gensvm()} +} +\value{ +a vector of class labels, with the same type as the original class +labels provided to gensvm.grid() +} +\description{ +Predict class labels using the best model from a grid search. +After doing a grid search with the \code{\link{gensvm.grid}} function, this +function can be used to make predictions of class labels. It uses the best +GenSVM model found during the grid search to do the predictions. Note that +this model is only available if \code{refit=TRUE} was specified in the +\code{\link{gensvm.grid}} call (the default). 
+} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# run a grid search +grid <- gensvm.grid(x, y) + +# predict training sample +y.hat <- predict(grid, x) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +} + diff --git a/man/print.gensvm.Rd b/man/print.gensvm.Rd new file mode 100644 index 0000000..75a44b2 --- /dev/null +++ b/man/print.gensvm.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/print.gensvm.R +\name{print.gensvm} +\alias{print.gensvm} +\title{Print the fitted GenSVM model} +\usage{ +\method{print}{gensvm}(fit, ...) +} +\arguments{ +\item{fit}{A \code{gensvm} object to print} + +\item{\dots}{further arguments are ignored} +} +\value{ +returns the object passed as input. This can be useful for chaining +operations on a fit object. +} +\description{ +Prints a short description of the fitted GenSVM model +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# fit and print the model +fit <- gensvm(x, y) +print(fit) + +# (advanced) use the fact that print returns the fitted model +fit <- gensvm(x, y) +predict(print(fit), x) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. 
+} + diff --git a/man/print.gensvm.grid.Rd b/man/print.gensvm.grid.Rd new file mode 100644 index 0000000..8a65370 --- /dev/null +++ b/man/print.gensvm.grid.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/print.gensvm.grid.R +\name{print.gensvm.grid} +\alias{print.gensvm.grid} +\title{Print the fitted GenSVMGrid model} +\usage{ +\method{print}{gensvm.grid}(grid, ...) +} +\arguments{ +\item{grid}{a \code{gensvm.grid} object to print} + +\item{\dots}{further arguments are ignored} +} +\value{ +returns the object passed as input +} +\description{ +Prints the summary of the fitted GenSVMGrid model +} +\examples{ +x <- iris[, -5] +y <- iris[, 5] + +# fit a grid search and print the resulting object +grid <- gensvm.grid(x, y) +print(grid) + +} +\author{ +Gerrit J.J. van den Burg, Patrick J.F. Groenen \cr +Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +} +\references{ +Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +Multiclass Support Vector Machine}, Journal of Machine Learning Research, +17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. 
+} + diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..856fc69 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,25 @@ +PKG_CFLAGS = -DGENSVM_R_PACKAGE -DCOLUMN_MAJOR_ORDER -g -DVERSION=0.1.4 +PKG_CPPFLAGS = -Igensvm/include/ +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) -Llibgensvm + +SOURCES = $(wildcard gensvm/src/*.c) +OBJECTS = gensvm_wrapper.o $(SOURCES:.c=.o) gensvm/lib/libgensvm.a + +.PHONY: all libgensvm clean + +all: $(SHLIB) + mv $(SHLIB) gensvm_wrapper$(SHLIB_EXT) + +$(SHLIB): gensvm/lib/libgensvm.a + +gensvm/lib/libgensvm.a: + $(AR) rcs $@ $(SOURCES:.c=.o) + +libgensvm: + (cd gensvm; $(MAKE)) + +clean: + rm -f $(SOURCES:.c=.o) + rm -f gensvm/lib/libgensvm.a + +shlib-clean: clean diff --git a/src/gensvm_wrapper.c b/src/gensvm_wrapper.c index c3f0f7f..46578d0 100644 --- a/src/gensvm_wrapper.c +++ b/src/gensvm_wrapper.c @@ -1,3 +1,26 @@ +/** + * @file gensvm_wrapper.c + * @author G.J.J. van den Burg + * @date 2018-03-26 + * @brief Wrapper code for the GenSVM R package + + * Copyright (C) G.J.J. van den Burg + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + */ #define STRICT_R_HEADERS @@ -5,10 +28,10 @@ #include <Rinternals.h> #include <R_ext/Print.h> -#include "gensvm_debug.h" #include "gensvm_print.h" #include "gensvm_train.h" #include "gensvm_predict.h" +#include "gensvm_gridsearch.h" // forward declarations @@ -16,20 +39,35 @@ SEXP R_gensvm_train( SEXP R_X, SEXP R_y, SEXP R_p, SEXP R_lambda, SEXP R_kappa, SEXP R_epsilon, SEXP R_weight_idx, SEXP R_kernel_idx, SEXP R_gamma, SEXP R_coef, SEXP R_degree, SEXP R_kernel_eigen_cutoff, SEXP R_verbose, SEXP R_max_iter, - SEXP R_random_seed, SEXP R_seed_V, SEXP R_n, SEXP R_m, - SEXP R_K); + SEXP R_random_seed, SEXP R_seed_V, SEXP R_seed_rows, + SEXP R_seed_cols, SEXP R_n, SEXP R_m, SEXP R_K); SEXP R_gensvm_predict(SEXP R_Xtest, SEXP R_V, SEXP R_n, SEXP R_m, SEXP R_K); - - -void _set_verbosity(int verbosity_flag); -void _set_seed_model(struct GenModel *model, double *V, long n, long m, - long K); +SEXP R_gensvm_predict_kernels( + SEXP R_Xtest, SEXP R_Xtrain, SEXP R_V, SEXP R_V_row, + SEXP R_V_col, SEXP R_n_train, SEXP R_n_test, SEXP R_m, + SEXP R_K, SEXP R_kernel_idx, SEXP R_gamma, SEXP R_coef, + SEXP R_degree, SEXP R_kernel_eigen_cutoff); +SEXP R_gensvm_plotdata_kernels( + SEXP R_Xtest, SEXP R_Xtrain, SEXP R_V, SEXP R_V_row, + SEXP R_V_col, SEXP R_n_train, SEXP R_n_test, SEXP R_m, + SEXP R_K, SEXP R_kernel_idx, SEXP R_gamma, SEXP R_coef, + SEXP R_degree, SEXP R_kernel_eigen_cutoff); + +SEXP R_gensvm_grid(SEXP R_X, SEXP R_y, SEXP R_df, SEXP R_df_rows, + SEXP R_df_cols, SEXP R_cv_idx, SEXP R_cv_folds, SEXP R_verbosity, + SEXP R_n, SEXP R_m, SEXP R_K); + +void _set_verbosity(int verbosity); +struct GenData *_build_gensvm_data(double *X, int *y, int n, int m, int K); // Start R package stuff R_CallMethodDef callMethods[] = { - {"R_gensvm_train", (DL_FUNC) &R_gensvm_train, 19}, + {"R_gensvm_train", (DL_FUNC) &R_gensvm_train, 21}, {"R_gensvm_predict", (DL_FUNC) &R_gensvm_predict, 5}, + {"R_gensvm_predict_kernels", (DL_FUNC) &R_gensvm_predict_kernels, 14}, + 
{"R_gensvm_plotdata_kernels", (DL_FUNC) &R_gensvm_plotdata_kernels, 14}, + {"R_gensvm_grid", (DL_FUNC) &R_gensvm_grid, 11}, {NULL, NULL, 0} }; R_CMethodDef cMethods[] = { @@ -43,46 +81,88 @@ void R_init_gensvm_wrapper(DllInfo *info) { // End R package stuff -void _set_verbosity(int verbosity_flag) +/** + * @brief Set the verbosity of the GenSVM library + * + * @description + * This sets the printing functions of the GenSVM library to print to the R + * console if desired. + * + * @param[in] verbosity if 0 all output is surpressed + * + */ +void _set_verbosity(int verbosity) { extern FILE *GENSVM_OUTPUT_FILE; extern FILE *GENSVM_ERROR_FILE; - if (verbosity_flag) { + if (verbosity) { gensvm_print_out = Rprintf; gensvm_print_err = REprintf; } else { + gensvm_print_out = gensvm_print_output_fpt; + gensvm_print_err = gensvm_print_error_fpt; GENSVM_OUTPUT_FILE = NULL; GENSVM_ERROR_FILE = NULL; } } -void _set_seed_model(struct GenModel *model, double *V, long n, long m, long K) + +/** + * @brief Construct a GenData struct from the given dataset + * + * @param[in] X + * @param[in] y can be NULL + * @param[in] n + * @param[in] m + * @param[in] K + * + * @return GenData structure + */ +struct GenData *_build_gensvm_data(double *X, int *y, int n, int m, int K) { - long i, j; + int i, j; double value; - model->n = 0; - model->m = m; - model->K = K; + struct GenData *data = gensvm_init_data(); + data->n = n; + data->m = m; + data->r = m; + data->K = K; - gensvm_allocate_model(model); + data->RAW = Calloc(double, n*(m+1)); - for (i=0; i<m+1; i++) { - for (j=0; j<K-1; j++) { - value = matrix_get(V, m+1, K-1, i, j); - matrix_set(model->V, m+1, K-1, i, j, value); + for (i=0; i<n; i++) { + for (j=0; j<m; j++) { + value = matrix_get(X, n, m, i, j); + matrix_set(data->RAW, n, m+1, i, j+1, value); } + matrix_set(data->RAW, n, m+1, i, 0, 1.0); + } + data->Z = data->RAW; + + // convert to sparse matrix if possible + if (gensvm_could_sparse(data->Z, n, m+1)) { + note("Converting to sparse 
... "); + data->spZ = gensvm_dense_to_sparse(data->Z, n, m+1); + note("done.\n"); + free(data->RAW); + data->RAW = NULL; + data->Z = NULL; } -} + if (y == NULL) { + data->y = NULL; + } else { + data->y = Malloc(long, n); + for (i=0; i<n; i++) + data->y[i] = y[i]; + } + + return data; +} -// NOTE: Let's supply X here as it is represented in R: a matrix in -// Column-Major order. Since we have to augment the matrix X with the column -// of ones to form Z, we might as well do that *and* convert to RowMajor in a -// single step. Otherwise we have the RowMajor version of X as well as Z in -// memory, which is unnecessary. SEXP R_gensvm_train( SEXP R_X, SEXP R_y, @@ -100,13 +180,15 @@ SEXP R_gensvm_train( SEXP R_max_iter, SEXP R_random_seed, SEXP R_seed_V, + SEXP R_seed_rows, + SEXP R_seed_cols, SEXP R_n, SEXP R_m, SEXP R_K ) { double *X = REAL(R_X); - int *y = INTEGER(R_y); // R doesn't know long? + int *y = INTEGER(R_y); double p = *REAL(R_p); double lambda = *REAL(R_lambda); double kappa = *REAL(R_kappa); @@ -121,6 +203,8 @@ SEXP R_gensvm_train( int max_iter = *INTEGER(R_max_iter); int random_seed = *INTEGER(R_random_seed); double *seed_V = isNull(R_seed_V) ? 
NULL : REAL(R_seed_V); + int seed_rows = *INTEGER(R_seed_rows); + int seed_cols = *INTEGER(R_seed_cols); int n = *INTEGER(R_n); int m = *INTEGER(R_m); int K = *INTEGER(R_K); @@ -129,7 +213,6 @@ SEXP R_gensvm_train( struct GenModel *model = gensvm_init_model(); struct GenModel *seed_model = NULL; - struct GenData *data = NULL; long i, j; double value; @@ -149,54 +232,38 @@ SEXP R_gensvm_train( if (seed_V != NULL) { seed_model = gensvm_init_model(); - _set_seed_model(seed_model, seed_V, n, m, K); - } - data = gensvm_init_data(); - - data->y = Malloc(long, n); - for (i=0; i<n; i++) - data->y[i] = (long) y[i]; - - data->RAW = Malloc(double, n*(m+1)); - for (i=0; i<n; i++) { - for (j=0; j<m; j++) { - value = matrix_get(X, n, m, i, j); - matrix_set(data->RAW, n, m+1, i, j+1, value); + seed_model->n = 0; + seed_model->m = seed_rows - 1; + seed_model->K = seed_cols + 1; + gensvm_allocate_model(seed_model); + + for (i=0; i<seed_model->m+1; i++) { + for (j=0; j<seed_model->K-1; j++) { + matrix_set(seed_model->V, seed_model->m+1, + seed_model->K-1, i ,j, + matrix_get(seed_V, seed_rows, + seed_cols, i, j)); + } } - // column of 1's - matrix_set(data->RAW, n, m+1, i, 0, 1.0); } - data->n = n; - data->m = m; - data->r = m; - data->K = K; - data->Z = data->RAW; - - // convert to sparse matrix if possible - if (gensvm_could_sparse(data->Z, n, m+1)) { - note("Converting to sparse ... 
"); - data->spZ = gensvm_dense_to_sparse(data->Z, n, m+1); - note("done.\n"); - free(data->RAW); - data->RAW = NULL; - data->Z = NULL; - } + struct GenData *data = _build_gensvm_data(X, y, n, m, K); // actually do the training gensvm_train(model, data, seed_model); // create the output list - SEXP output = PROTECT(allocVector(VECSXP, 3)); + SEXP output = PROTECT(allocVector(VECSXP, 4)); // create and fill output matrix - SEXP R_V = PROTECT(allocMatrix(REALSXP, m+1, K-1)); + SEXP R_V = PROTECT(allocMatrix(REALSXP, model->m+1, model->K-1)); double *rR_V = REAL(R_V); - for (i=0; i<m+1; i++) { - for (j=0; j<K-1; j++) { - value = matrix_get(model->V, m+1, K-1, i, j); - matrix_set(rR_V, m+1, K-1, i, j, value); + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + value = matrix_get(model->V, model->m+1, model->K-1, + i, j); + matrix_set(rR_V, model->m+1, model->K-1, i, j, value); } } @@ -208,22 +275,28 @@ SEXP R_gensvm_train( int *r_sv = INTEGER(R_sv); r_sv[0] = gensvm_num_sv(model); + SEXP R_time = PROTECT(allocVector(REALSXP, 1)); + double *r_time = REAL(R_time); + r_time[0] = model->elapsed_time; + // set output list elements SET_VECTOR_ELT(output, 0, R_V); SET_VECTOR_ELT(output, 1, R_iter); SET_VECTOR_ELT(output, 2, R_sv); + SET_VECTOR_ELT(output, 3, R_time); // create names - SEXP names = PROTECT(allocVector(STRSXP, 3)); + SEXP names = PROTECT(allocVector(STRSXP, 4)); SET_STRING_ELT(names, 0, mkChar("V")); SET_STRING_ELT(names, 1, mkChar("n.iter")); SET_STRING_ELT(names, 2, mkChar("n.support")); + SET_STRING_ELT(names, 3, mkChar("training.time")); // assign names to list setAttrib(output, R_NamesSymbol, names); // cleanup - UNPROTECT(5); + UNPROTECT(6); gensvm_free_model(model); gensvm_free_model(seed_model); @@ -261,43 +334,101 @@ SEXP R_gensvm_predict( } } - struct GenData *data = gensvm_init_data(); - data->n = n_test; - data->m = m; - data->r = m; - data->K = K; + struct GenData *data = _build_gensvm_data(X, NULL, n_test, m, K); - data->RAW = 
Calloc(double, n_test*(m+1)); + long *pred_temp = Calloc(long, n_test); - for (i=0; i<n_test; i++) { - for (j=0; j<m; j++) { - value = matrix_get(X, n_test, m, i, j); - matrix_set(data->RAW, n_test, m+1, i, j+1, value); + gensvm_predict_labels(data, model, pred_temp); + + SEXP R_y = PROTECT(allocMatrix(INTSXP, n_test, 1)); + int *rR_y = INTEGER(R_y); + for (i=0; i<n_test; i++) + rR_y[i] = pred_temp[i]; + + gensvm_free_data(data); + gensvm_free_model(model); + free(pred_temp); + + UNPROTECT(1); + + return(R_y); +} + +SEXP R_gensvm_predict_kernels( + SEXP R_Xtest, + SEXP R_Xtrain, + SEXP R_V, + SEXP R_V_row, + SEXP R_V_col, + SEXP R_n_train, + SEXP R_n_test, + SEXP R_m, + SEXP R_K, + SEXP R_kernel_idx, + SEXP R_gamma, + SEXP R_coef, + SEXP R_degree, + SEXP R_kernel_eigen_cutoff + ) +{ + double *X_test = REAL(R_Xtest); + double *X_train = REAL(R_Xtrain); + double *V = REAL(R_V); + int V_row = *INTEGER(R_V_row); + int V_col = *INTEGER(R_V_col); + int n_train = *INTEGER(R_n_train); + int n_test = *INTEGER(R_n_test); + int m = *INTEGER(R_m); + int K = *INTEGER(R_K); + + int kernel_idx = *INTEGER(R_kernel_idx); + double gamma = *REAL(R_gamma); + double coef = *REAL(R_coef); + double degree = *REAL(R_degree); + double kernel_eigen_cutoff = *REAL(R_kernel_eigen_cutoff); + + int i, j; + double value; + + struct GenModel *model = gensvm_init_model(); + model->n = n_train; + model->m = V_row - 1; + model->K = V_col + 1; + model->kerneltype = kernel_idx; + model->gamma = gamma; + model->coef = coef; + model->degree = degree; + model->kernel_eigen_cutoff = kernel_eigen_cutoff; + gensvm_allocate_model(model); + + struct GenData *traindata = _build_gensvm_data(X_train, NULL, n_train, + m, K); + struct GenData *testdata = _build_gensvm_data(X_test, NULL, n_test, + m, K); + + gensvm_kernel_preprocess(model, traindata); + gensvm_reallocate_model(model, traindata->n, traindata->r); + + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + value = matrix_get(V, V_row, 
V_col, i, j); + matrix_set(model->V, model->m+1, model->K-1, i, j, + value); } - matrix_set(data->RAW, n_test, m+1, i, 0, 1.0); } - data->Z = data->RAW; - // convert to sparse matrix if possible - if (gensvm_could_sparse(data->Z, n_test, m+1)) { - note("Converting to sparse ... "); - data->spZ = gensvm_dense_to_sparse(data->Z, n_test, m+1); - note("done.\n"); - free(data->RAW); - data->RAW = NULL; - data->Z = NULL; - } + gensvm_kernel_postprocess(model, traindata, testdata); long *pred_temp = Calloc(long, n_test); - - gensvm_predict_labels(data, model, pred_temp); + gensvm_predict_labels(testdata, model, pred_temp); SEXP R_y = PROTECT(allocMatrix(INTSXP, n_test, 1)); int *rR_y = INTEGER(R_y); for (i=0; i<n_test; i++) rR_y[i] = pred_temp[i]; - gensvm_free_data(data); + gensvm_free_data(traindata); + gensvm_free_data(testdata); gensvm_free_model(model); free(pred_temp); @@ -305,3 +436,256 @@ SEXP R_gensvm_predict( return(R_y); } + +SEXP R_gensvm_plotdata_kernels( + SEXP R_Xtest, + SEXP R_Xtrain, + SEXP R_V, + SEXP R_V_row, + SEXP R_V_col, + SEXP R_n_train, + SEXP R_n_test, + SEXP R_m, + SEXP R_K, + SEXP R_kernel_idx, + SEXP R_gamma, + SEXP R_coef, + SEXP R_degree, + SEXP R_kernel_eigen_cutoff + ) +{ + double *X_test = REAL(R_Xtest); + double *X_train = REAL(R_Xtrain); + double *V = REAL(R_V); + int V_row = *INTEGER(R_V_row); + int V_col = *INTEGER(R_V_col); + int n_train = *INTEGER(R_n_train); + int n_test = *INTEGER(R_n_test); + int m = *INTEGER(R_m); + int K = *INTEGER(R_K); + + int kernel_idx = *INTEGER(R_kernel_idx); + double gamma = *REAL(R_gamma); + double coef = *REAL(R_coef); + double degree = *REAL(R_degree); + double kernel_eigen_cutoff = *REAL(R_kernel_eigen_cutoff); + + int i, j; + double value; + + struct GenModel *model = gensvm_init_model(); + model->n = n_train; + model->m = V_row - 1; + model->K = V_col + 1; + model->kerneltype = kernel_idx; + model->gamma = gamma; + model->coef = coef; + model->degree = degree; + model->kernel_eigen_cutoff = 
kernel_eigen_cutoff; + gensvm_allocate_model(model); + + struct GenData *traindata = _build_gensvm_data(X_train, NULL, n_train, + m, K); + struct GenData *testdata = _build_gensvm_data(X_test, NULL, n_test, + m, K); + + gensvm_kernel_preprocess(model, traindata); + gensvm_reallocate_model(model, traindata->n, traindata->r); + + for (i=0; i<model->m+1; i++) { + for (j=0; j<model->K-1; j++) { + value = matrix_get(V, V_row, V_col, i, j); + matrix_set(model->V, model->m+1, model->K-1, i, j, + value); + } + } + + gensvm_kernel_postprocess(model, traindata, testdata); + + double *ZV = Calloc(double, n_test * (K-1)); + gensvm_calculate_ZV(model, testdata, ZV); + + long *pred_temp = Calloc(long, n_test); + gensvm_predict_labels(testdata, model, pred_temp); + + // create the output list + SEXP output = PROTECT(allocVector(VECSXP, 2)); + + // Copy predictions + SEXP R_y = PROTECT(allocMatrix(INTSXP, n_test, 1)); + int *rR_y = INTEGER(R_y); + for (i=0; i<n_test; i++) + rR_y[i] = pred_temp[i]; + + // Copy ZV + SEXP R_ZV = PROTECT(allocMatrix(REALSXP, n_test, K-1)); + double *rR_ZV = REAL(R_ZV); + for (i=0; i<n_test*(K-1); i++) + rR_ZV[i] = ZV[i]; + + SET_VECTOR_ELT(output, 0, R_y); + SET_VECTOR_ELT(output, 1, R_ZV); + + SEXP names = PROTECT(allocVector(STRSXP, 2)); + SET_STRING_ELT(names, 0, mkChar("y.pred")); + SET_STRING_ELT(names, 1, mkChar("ZV")); + + setAttrib(output, R_NamesSymbol, names); + + UNPROTECT(4); + + gensvm_free_data(traindata); + gensvm_free_data(testdata); + gensvm_free_model(model); + free(pred_temp); + free(ZV); + + return output; +} + +SEXP R_gensvm_grid( + SEXP R_X, + SEXP R_y, + SEXP R_df, + SEXP R_df_rows, + SEXP R_df_cols, + SEXP R_cv_idx, + SEXP R_cv_folds, + SEXP R_verbosity, + SEXP R_n, + SEXP R_m, + SEXP R_K + ) +{ + double *X = REAL(R_X); + int *y = INTEGER(R_y); + double *df = REAL(R_df); + int df_rows = *INTEGER(R_df_rows); + int df_cols = *INTEGER(R_df_cols); + int *icv_idx = INTEGER(R_cv_idx); + int folds = *INTEGER(R_cv_folds); + int verbosity 
= *INTEGER(R_verbosity); + int n = *INTEGER(R_n); + int m = *INTEGER(R_m); + int K = *INTEGER(R_K); + + int i, j, pred; + long *cv_idx = NULL; + double val, total_time; + + // Check input + if (df_cols < 9) { + // TODO: Raise error to R + } + + // set verbosity + _set_verbosity(verbosity); + + // copy the cv_idx array + cv_idx = Malloc(long, n); + for (i=0; i<n; i++) + cv_idx[i] = icv_idx[i]; + + // Read the data into a GenData struct + struct GenData *data = _build_gensvm_data(X, y, n, m, K); + + // Initialize and populate the queue + struct GenQueue *q = gensvm_init_queue(); + q->tasks = Malloc(struct GenTask *, df_rows); + q->N = df_rows; + + struct GenTask *t = NULL; + + for (i=0; i<df_rows; i++) { + t = gensvm_init_task(); + t->ID = i; + + t->kerneltype = matrix_get(df, df_rows, df_cols, i, 0); + t->coef = matrix_get(df, df_rows, df_cols, i, 1); + t->degree = matrix_get(df, df_rows, df_cols, i, 2); + t->gamma = matrix_get(df, df_rows, df_cols, i, 3); + t->weight_idx = matrix_get(df, df_rows, df_cols, i, 4); + t->kappa = matrix_get(df, df_rows, df_cols, i, 5); + t->lambda = matrix_get(df, df_rows, df_cols, i, 6); + t->p = matrix_get(df, df_rows, df_cols, i, 7); + t->epsilon = matrix_get(df, df_rows, df_cols, i, 8); + t->max_iter = matrix_get(df, df_rows, df_cols, i, 9); + t->folds = folds; + + t->train_data = data; + + q->tasks[i] = t; + } + + // start training + total_time = gensvm_train_queue(q, cv_idx, true, verbosity); + + // create the output list + SEXP output = PROTECT(allocVector(VECSXP, 3)); + + // copy predictions + SEXP R_predictions = PROTECT(allocMatrix(INTSXP, df_rows, n)); + int *rR_predictions = INTEGER(R_predictions); + for (i=0; i<df_rows; i++) { + t = q->tasks[i]; + if (t->predictions == NULL) { // if interrupt occurred + for (j=0; j<n; j++) + matrix_set(rR_predictions, df_rows, n, i, j, + NA_INTEGER); + } else { + for (j=0; j<n; j++) { + pred = t->predictions[j]; + pred = (pred == -1) ? 
NA_INTEGER : pred; + matrix_set(rR_predictions, df_rows, n, i, j, + pred); + } + } + } + + // copy durations + SEXP R_durations = PROTECT(allocMatrix(REALSXP, df_rows, folds)); + double *rR_durations = REAL(R_durations); + for (i=0; i<df_rows; i++) { + t = q->tasks[i]; + if (t->durations == NULL) { // if interrupt occurred + for (j=0; j<folds; j++) { + matrix_set(rR_durations, df_rows, folds, i, j, + NA_REAL); + } + } else { + for (j=0; j<folds; j++) { + val = t->durations[j]; + val = (val == -1) ? NA_REAL : val; + matrix_set(rR_durations, df_rows, folds, i, j, + val); + } + } + } + + SEXP R_time = PROTECT(allocVector(REALSXP, 1)); + double *r_time = REAL(R_time); + r_time[0] = total_time; + + // set output list elements + SET_VECTOR_ELT(output, 0, R_predictions); + SET_VECTOR_ELT(output, 1, R_durations); + SET_VECTOR_ELT(output, 2, R_time); + + // create names + SEXP names = PROTECT(allocVector(STRSXP, 3)); + SET_STRING_ELT(names, 0, mkChar("predictions")); + SET_STRING_ELT(names, 1, mkChar("durations")); + SET_STRING_ELT(names, 2, mkChar("total.time")); + + // assign names to list + setAttrib(output, R_NamesSymbol, names); + + UNPROTECT(5); + + gensvm_free_data(data); + gensvm_free_queue(q); + + free(cv_idx); + + return output; +} |
