diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2018-02-09 16:34:57 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2018-02-09 16:34:57 +0000 |
| commit | 381266b57b48e8005099b4ea5762d08455c2e5ba (patch) | |
| tree | 904fd9c0fa37ec30791a2dd5644f1fd67c5515b7 /R | |
| download | rgensvm-381266b57b48e8005099b4ea5762d08455c2e5ba.tar.gz rgensvm-381266b57b48e8005099b4ea5762d08455c2e5ba.zip | |
initial commit
Diffstat (limited to 'R')
| -rw-r--r-- | R/coef.gensvm.R | 33 | ||||
| -rw-r--r-- | R/gensvm-kernels.R | 10 | ||||
| -rw-r--r-- | R/gensvm-package.R | 56 | ||||
| -rw-r--r-- | R/gensvm.R | 128 | ||||
| -rw-r--r-- | R/gensvm.grid.R | 116 | ||||
| -rw-r--r-- | R/predict.gensvm.R | 36 | ||||
| -rw-r--r-- | R/print.gensvm.R | 34 | ||||
| -rw-r--r-- | R/util.labelencoder.R | 1 |
8 files changed, 414 insertions, 0 deletions
diff --git a/R/coef.gensvm.R b/R/coef.gensvm.R new file mode 100644 index 0000000..19ab0aa --- /dev/null +++ b/R/coef.gensvm.R @@ -0,0 +1,33 @@ +#' @title Get the coefficients of the fitted GenSVM model +#' +#' @description Returns the model coefficients of the GenSVM object +#' +#' @param object a \code{gensvm} object +#' @param \dots further arguments are ignored +#' +#' @return The coefficients of the GenSVM model. This is a matrix of size +#' \eqn{(n_{features} + 1) x (n_{classes} - 1)}. This matrix is used to project +#' the input data to a low dimensional space using the equation: \eqn{XW + t} +#' where \eqn{X} is the input matrix, \eqn{t} is the first row of the matrix +#' returned by this function, and \eqn{W} is the \eqn{n_{features} x +#' (n_{classes} - 1)} matrix formed by the remaining rows. +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @method coef gensvm +#' @export +#' +#' @examples +#' +#' +coef.gensvm <- function(object, ...) +{ + return(object$V) +} diff --git a/R/gensvm-kernels.R b/R/gensvm-kernels.R new file mode 100644 index 0000000..8e445c0 --- /dev/null +++ b/R/gensvm-kernels.R @@ -0,0 +1,10 @@ +#' Kernels in GenSVM +#' +#' GenSVM can be used for both linear multiclass support vector machine +#' classification and for nonlinear classification with kernels. In general, +#' linear classification will be faster but depending on the dataset higher +#' classification performance can be achieved using a nonlinear kernel. +#' +#' The following nonlinear kernels are implemented in the GenSVM package: +#' \describe{ +#' \item{RBF}{The Radial Basis Function kernel is a commonly used kernel. 
diff --git a/R/gensvm-package.R b/R/gensvm-package.R new file mode 100644 index 0000000..13c2c31 --- /dev/null +++ b/R/gensvm-package.R @@ -0,0 +1,56 @@ +#' GenSVM: A Generalized Multiclass Support Vector Machine +#' +#' The GenSVM classifier is a generalized multiclass support vector machine +#' (SVM). This classifier simultaneously aims to find decision boundaries that +#' separate the classes with as wide a margin as possible. In GenSVM, the loss +#' function that measures how misclassifications are counted is very flexible. +#' This allows the user to tune the classifier to the dataset at hand and +#' potentially obtain higher classification accuracy. Moreover, this +#' flexibility means that GenSVM has a number of alternative multiclass SVMs as +#' special cases. One of the other advantages of GenSVM is that it is trained +#' in the primal, allowing the use of warm starts during optimization. This +#' means that for common tasks such as cross validation or repeated model +#' fitting, GenSVM can be trained very quickly. +#' +#' This package provides functions for training the GenSVM model either as a +#' separate model or through a cross-validated parameter grid search. In both +#' cases the GenSVM C library is used for speed. Auxiliary functions for +#' evaluating and using the model are also provided. 
+#' +#' @section GenSVM functions: +#' The main GenSVM functions are: +#' \describe{ +#' \item{\code{\link{gensvm}}}{Fit a GenSVM model for specific model +#' parameters.} +#' \item{\code{\link{gensvm.grid}}}{Run a cross-validated grid search for +#' GenSVM.} +#' } +#' +#' Other available functions are: +#' \describe{ +#' \item{\code{\link{plot}}}{Plot the low-dimensional \emph{simplex} space +#' where the decision boundaries are fixed.} +#' \item{\code{\link{predict}}}{Predict the class labels of new data using the +#' GenSVM model.} +#' \item{\code{\link{coef}}}{Get the coefficients of the GenSVM model} +#' \item{\code{\link{print}}}{Print a short description of the fitted GenSVM +#' model} +#' } +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @examples +#' +#' +#' @name gensvm-package +#' @docType package +#' @import +NULL +#>NULL diff --git a/R/gensvm.R b/R/gensvm.R new file mode 100644 index 0000000..1923f06 --- /dev/null +++ b/R/gensvm.R @@ -0,0 +1,128 @@ +#' @title Fit the GenSVM model +#' +#' @description Fits the Generalized Multiclass Support Vector Machine model +#' with the given parameters. +#' +#' @param X data matrix with the predictors +#' @param y class labels +#' @param p parameter for the L_p norm of the loss function (1.0 <= p <= 2.0) +#' @param lambda regularization parameter for the loss function (lambda > 0) +#' @param kappa parameter for the hinge function in the loss function (kappa > +#' -1.0) +#' @param weights type of instance weights to use. Options are 'unit' for unit +#' weights and 'group' for group size correction weight (eq. 4 in the paper). 
+#' @param kernel the kernel type to use in the classifier. It must be one of +#' 'linear', 'poly', 'rbf', or 'sigmoid'. +#' @param gamma kernel parameter for the rbf, polynomial, and sigmoid kernels. +#' If gamma is 'auto', then 1/n_features will be used. +#' @param coef parameter for the polynomial and sigmoid kernels. +#' @param degree parameter for the polynomial kernel +#' @param kernel.eigen.cutoff Cutoff point for the reduced eigendecomposition +#' used with kernel-GenSVM. Eigenvectors for which the ratio between their +#' corresponding eigenvalue and the largest eigenvalue is smaller than this +#' cutoff value will be dropped. +#' @param verbose Turn on verbose output and fit progress +#' @param random.seed Seed for the random number generator (useful for +#' reproducible output) +#' @param max.iter Maximum number of iterations of the optimization algorithm. +#' +#' @return A "gensvm" S3 object is returned for which the print, predict, coef, +#' and plot methods are available. It has the following items: +#' \item{call}{The call that was used to construct the model.} +#' \item{lambda}{The regularization parameter used in the model.} +#' \item{kappa}{The hinge function parameter used.} +#' \item{epsilon}{The stopping criterion used.} +#' \item{weights}{The instance weights type used.} +#' \item{kernel}{The kernel function used.} +#' \item{gamma}{The value of the gamma parameter of the kernel, if applicable} +#' \item{coef}{The value of the coef parameter of the kernel, if applicable} +#' \item{degree}{The degree of the kernel, if applicable} +#' \item{kernel.eigen.cutoff}{The cutoff value of the reduced +#' eigendecomposition of the kernel matrix} +#' \item{random.seed}{The random seed used to seed the model.} +#' \item{max.iter}{Maximum number of iterations of the algorithm.} +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. 
and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @seealso +#' \code{\link{coef}}, \code{\link{print}}, \code{\link{predict}}, +#' \code{\link{plot}}, and \code{\link{gensvm.grid}}. +#' +#' @export +#' +#' @examples +#' X <- +#' +gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6, + weights='unit', kernel='linear', gamma='auto', coef=0.0, + degree=2.0, kernel.eigen.cutoff=1e-8, verbose=0, + random.seed=NULL, max.iter=1e8, seed.V=NULL) +{ + call <- match.call() + + + # TODO: generate the random.seed value in R if it is NULL. Then you can + # return it and people can still reproduce even if they forgot to set it + # explicitly. + + # TODO: Store a labelencoder in the object, preferably as a partially + # hidden item. This can then be used with prediction. + + n.objects <- nrow(X) + n.features <- ncol(X) + n.classes <- length(unique(y)) + + + # Convert labels to integers + y.clean <- label.encode(y) + + # Convert weights to index + weight.idx <- which(c("unit", "group") == weights) + if (length(weight.idx) == 0) { + stop("Incorrect weight specification. ", + "Valid options are 'unit' and 'group'") + } + + # Convert kernel to index + kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel) + if (length(kernel.idx) == 0) { + stop("Incorrect kernel specification. 
", + "Valid options are 'linear', 'poly', 'rbf', and 'sigmoid'") + } + + + out <- .Call("R_gensvm_train", + as.matrix(t(X)), + as.integer(y.clean), + p, + lambda, + kappa, + epsilon, + weight.idx, + kernel.idx, + gamma, + coef, + degree, + kernel.eigen.cutoff, + verbose, + max.iter, + random.seed, + seed.V) + + + object <- list(call = call, lambda = lambda, kappa = kappa, + epsilon = epsilon, weights = weights, kernel = kernel, + gamma = gamma, coef = coef, degree = degree, + kernel.eigen.cutoff = kernel.eigen.cutoff, + random.seed = random.seed, max.iter = max.iter, + V = out$V, n.iter = out$n.iter, n.support = out$n.support) + class(object) <- "gensvm" + return(object) +} diff --git a/R/gensvm.grid.R b/R/gensvm.grid.R new file mode 100644 index 0000000..37e2f7f --- /dev/null +++ b/R/gensvm.grid.R @@ -0,0 +1,116 @@ +#' @title Cross-validated grid search for GenSVM +#' +#' @description This function performs a cross-validated grid search of the +#' model parameters to find the best hyperparameter configuration for a given +#' dataset. This function takes advantage of GenSVM's ability to use warm +#' starts to speed up computation. The function also uses the GenSVM C library +#' for speed. +#' +#' There are two ways to use this function: either by providing a data frame +#' with the parameter configurations to try or by giving each of the function +#' inputs a vector of values to evaluate. In the latter case all combinations +#' of the provided values will be used (i.e. the product set). +#' +#' @param X training data matrix. We denote the size of this matrix by +#' n_samples x n_features. +#' @param y training vector of class labels of length n_samples. The number of +#' unique labels in this vector is denoted by n_classes. +#' @param df Data frame with parameter configurations to evaluate. +#' If this is provided it overrides the other parameter ranges provided. The +#' data frame must provide *all* required columns, as described below. 
+#' @param p vector of values to try for the \eqn{p} hyperparameter +#' for the \eqn{\ell_p} norm in the loss function. All values should be on the +#' interval [1.0, 2.0]. +#' @param lambda vector of values for the regularization parameter +#' \eqn{\lambda} in the loss function. All values should be larger than 0. +#' @param kappa vector of values for the hinge function parameter in +#' the loss function. All values should be larger than -1. +#' @param weights vector of values for the instance weights. Values +#' should be either 'unit', 'group', or both. +#' @param kernel vector of values for the kernel type. Possible +#' values are: 'linear', 'rbf', 'poly', or 'sigmoid', or any combination of +#' these values. See the article \link[=gensvm-kernels]{Kernels in GenSVM} for +#' more information. +#' @param gamma kernel parameter for the 'rbf', 'poly', and 'sigmoid' kernels. +#' If it is 'auto', 1/n_features will be used. See the article +#' \link[=gensvm-kernels]{Kernels in GenSVM} for more information. +#' @param coef kernel parameter for the 'poly' and 'sigmoid' +#' kernels. See the article \link[=gensvm-kernels]{Kernels in GenSVM} for more +#' information. +#' @param degree kernel parameter for the 'poly' kernel. See the +#' article \link[=gensvm-kernels]{Kernels in GenSVM} for more information. +#' @param max.iter maximum number of iterations to run in the +#' optimization algorithm. +#' @param refit boolean variable. If true, the best model from cross validation +#' is fitted again on the entire dataset. +#' @param scoring metric to use to evaluate the classifier performance during +#' cross validation. The metric should be an R function that takes two +#' arguments: y_true and y_pred and that returns a float such that higher +#' values are better. If it is NULL, the accuracy score will be used. +#' @param cv the number of cross-validation folds to use or a vector with the +#' same length as \code{y} where each unique value denotes a test split. 
+#' @param verbose boolean variable to indicate whether training details should +#' be printed. +#' +#' @return A "gensvm.grid" S3 object with the following items: +#' \item{cv.results}{A data frame with the cross validation results} +#' \item{best.estimator}{If refit=TRUE, this is the GenSVM model fitted with +#' the best hyperparameter configuration, otherwise it is NULL} +#' \item{best.score}{Mean cross-validated score for the model with the best +#' hyperparameter configuration} +#' \item{best.params}{Parameter configuration that provided the highest mean +#' cross-validated score} +#' \item{best.index}{Row index of the cv.results data frame that corresponds to +#' the best hyperparameter configuration} +#' \item{n.splits}{The number of cross-validation splits} +#' +#' +#' +#' @section Using a DataFrame: +#' ... +#' +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @seealso +#' \code{\link{coef}}, \code{\link{print}}, \code{\link{predict}}, +#' \code{\link{plot}}, and \code{\link{gensvm}}. +#' +#' +#' @export +#' +#' @examples +#' X <- +#' + +gensvm.grid <- function(X, y, + df=NULL, + p=c(1.0, 1.5, 2.0), + lambda=c(1e-8, 1e-6, 1e-4, 1e-2, 1), + kappa=c(-0.9, 0.5, 5.0), + weights=c('unit', 'group'), + kernel=c('linear'), + gamma=c('auto'), + coef=c(0.0), + degree=c(2.0), + max.iter=c(1e8), + refit=TRUE, + scoring=NULL, + cv=3, + verbose=TRUE) +{ + call <- match.call() + + + + object <- list(...) 
+ class(object) <- "gensvm.grid" + return(object) +} diff --git a/R/predict.gensvm.R b/R/predict.gensvm.R new file mode 100644 index 0000000..6cc8851 --- /dev/null +++ b/R/predict.gensvm.R @@ -0,0 +1,36 @@ +#' @title Predict class labels with the GenSVM model +#' +#' @description This function predicts the class labels of new data using a +#' fitted GenSVM model. +#' +#' @param object Fitted \code{gensvm} object +#' @param newx Matrix of new values for \code{x} for which predictions need to +#' be made. +#' @param \dots further arguments are ignored +#' +#' @return a vector of class labels, with the same type as the original class +#' labels. +#' +#' @export +#' @aliases predict +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @examples +#' +#' +#' +predict.gensvm <- function(object, newx, ...) +{ + # TODO: C library fitting prediction here (or not? with the column-major + # order it may be faster to do it directly in R) + + return(yhat) +} diff --git a/R/print.gensvm.R b/R/print.gensvm.R new file mode 100644 index 0000000..8d17b0c --- /dev/null +++ b/R/print.gensvm.R @@ -0,0 +1,34 @@ +#' @title Print the fitted GenSVM model +#' +#' @description Prints a short description of the fitted GenSVM model +#' +#' @param object A \code{gensvm} object to print +#' @param \dots further arguments are ignored +#' +#' @return returns the object passed as input +#' +#' @author +#' Gerrit J.J. van den Burg, Patrick J.F. Groenen +#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com> +#' +#' @references +#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). 
\emph{GenSVM: A Generalized +#' Multiclass Support Vector Machine}, Journal of Machine Learning Research, +#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}. +#' +#' @method print gensvm +#' @export +#' +#' @examples +#' +#' +print.gensvm <- function(object, ...) +{ + cat("\nCall:\n") + dput(object$call) + + # TODO: fill this out + # + # + invisible(object) +} diff --git a/R/util.labelencoder.R b/R/util.labelencoder.R new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/R/util.labelencoder.R @@ -0,0 +1 @@ + |
