aboutsummaryrefslogtreecommitdiff
path: root/R
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2018-02-09 16:34:57 +0000
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2018-02-09 16:34:57 +0000
commit381266b57b48e8005099b4ea5762d08455c2e5ba (patch)
tree904fd9c0fa37ec30791a2dd5644f1fd67c5515b7 /R
downloadrgensvm-381266b57b48e8005099b4ea5762d08455c2e5ba.tar.gz
rgensvm-381266b57b48e8005099b4ea5762d08455c2e5ba.zip
initial commit
Diffstat (limited to 'R')
-rw-r--r--R/coef.gensvm.R33
-rw-r--r--R/gensvm-kernels.R10
-rw-r--r--R/gensvm-package.R56
-rw-r--r--R/gensvm.R128
-rw-r--r--R/gensvm.grid.R116
-rw-r--r--R/predict.gensvm.R36
-rw-r--r--R/print.gensvm.R34
-rw-r--r--R/util.labelencoder.R1
8 files changed, 414 insertions, 0 deletions
diff --git a/R/coef.gensvm.R b/R/coef.gensvm.R
new file mode 100644
index 0000000..19ab0aa
--- /dev/null
+++ b/R/coef.gensvm.R
@@ -0,0 +1,33 @@
+#' @title Get the coefficients of the fitted GenSVM model
+#'
+#' @description Returns the model coefficients of the GenSVM object
+#'
+#' @param object a \code{gensvm} object
+#' @param \dots further arguments are ignored
+#'
+#' @return The coefficients of the GenSVM model. This is a matrix of size
+#' \eqn{(n_{features} + 1) x (n_{classes} - 1)}. This matrix is used to project
+#' the input data to a low dimensional space using the equation: \eqn{XW + t}
+#' where \eqn{X} is the input matrix, \eqn{t} is the first row of the matrix
+#' returned by this function, and \eqn{W} is the \eqn{n_{features} x
+#' (n_{classes} - 1)} matrix formed by the remaining rows.
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @method coef gensvm
+#' @export
+#'
+#' @examples
+#'
+#'
+coef.gensvm <- function(object, ...)
+{
+ return(object$V)
+}
diff --git a/R/gensvm-kernels.R b/R/gensvm-kernels.R
new file mode 100644
index 0000000..8e445c0
--- /dev/null
+++ b/R/gensvm-kernels.R
@@ -0,0 +1,10 @@
+#' Kernels in GenSVM
+#'
+#' GenSVM can be used for both linear multiclass support vector machine
+#' classification and for nonlinear classification with kernels. In general,
+#' linear classification will be faster but depending on the dataset higher
+#' classification performance can be achieved using a nonlinear kernel.
+#'
+#' The following nonlinear kernels are implemented in the GenSVM package:
+#' \describe{
+#' \item{RBF}{The Radial Basis Function kernel is a commonly used kernel.
diff --git a/R/gensvm-package.R b/R/gensvm-package.R
new file mode 100644
index 0000000..13c2c31
--- /dev/null
+++ b/R/gensvm-package.R
@@ -0,0 +1,56 @@
+#' GenSVM: A Generalized Multiclass Support Vector Machine
+#'
+#' The GenSVM classifier is a generalized multiclass support vector machine
+#' (SVM). This classifier simultaneously aims to find decision boundaries that
+#' separate the classes with as wide a margin as possible. In GenSVM, the loss
+#' function that measures how misclassifications are counted is very flexible.
+#' This allows the user to tune the classifier to the dataset at hand and
+#' potentially obtain higher classification accuracy. Moreover, this
+#' flexibility means that GenSVM has a number of alternative multiclass SVMs as
+#' special cases. One of the other advantages of GenSVM is that it is trained
+#' in the primal, allowing the use of warm starts during optimization. This
+#' means that for common tasks such as cross validation or repeated model
+#' fitting, GenSVM can be trained very quickly.
+#'
+#' This package provides functions for training the GenSVM model either as a
+#' separate model or through a cross-validated parameter grid search. In both
+#' cases the GenSVM C library is used for speed. Auxiliary functions for
+#' evaluating and using the model are also provided.
+#'
+#' @section GenSVM functions:
+#' The main GenSVM functions are:
+#' \describe{
+#' \item{\code{\link{gensvm}}}{Fit a GenSVM model for specific model
+#' parameters.}
+#' \item{\code{\link{gensvm.grid}}}{Run a cross-validated grid search for
+#' GenSVM.}
+#' }
+#'
+#' Other available functions are:
+#' \describe{
+#' \item{\code{\link{plot}}}{Plot the low-dimensional \emph{simplex} space
+#' where the decision boundaries are fixed.}
+#' \item{\code{\link{predict}}}{Predict the class labels of new data using the
+#' GenSVM model.}
+#' \item{\code{\link{coef}}}{Get the coefficients of the GenSVM model}
+#' \item{\code{\link{print}}}{Print a short description of the fitted GenSVM
+#' model}
+#' }
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @examples
+#'
+#'
+#' @name gensvm-package
+#' @docType package
+#' @import
+NULL
+#>NULL
diff --git a/R/gensvm.R b/R/gensvm.R
new file mode 100644
index 0000000..1923f06
--- /dev/null
+++ b/R/gensvm.R
@@ -0,0 +1,128 @@
+#' @title Fit the GenSVM model
+#'
+#' @description Fits the Generalized Multiclass Support Vector Machine model
+#' with the given parameters.
+#'
+#' @param X data matrix with the predictors
+#' @param y class labels
+#' @param p parameter for the L_p norm of the loss function (1.0 <= p <= 2.0)
+#' @param lambda regularization parameter for the loss function (lambda > 0)
+#' @param kappa parameter for the hinge function in the loss function (kappa >
+#' -1.0)
+#' @param weights type of instance weights to use. Options are 'unit' for unit
+#' weights and 'group' for group size correction weight (eq. 4 in the paper).
+#' @param kernel the kernel type to use in the classifier. It must be one of
+#' 'linear', 'poly', 'rbf', or 'sigmoid'.
+#' @param gamma kernel parameter for the rbf, polynomial, and sigmoid kernel.
+#' If gamma is 'auto', then 1/n_features will be used.
+#' @param coef parameter for the polynomial and sigmoid kernel.
+#' @param degree parameter for the polynomial kernel
+#' @param kernel.eigen.cutoff Cutoff point for the reduced eigendecomposition
+#' used with kernel-GenSVM. Eigenvectors for which the ratio between their
+#' corresponding eigenvalue and the largest eigenvalue is smaller than this
+#' cutoff value will be dropped.
+#' @param verbose Turn on verbose output and fit progress
+#' @param random.seed Seed for the random number generator (useful for
+#' reproducible output)
+#' @param max.iter Maximum number of iterations of the optimization algorithm.
+#'
+#' @return A "gensvm" S3 object is returned for which the print, predict, coef,
+#' and plot methods are available. It has the following items:
+#' \item{call}{The call that was used to construct the model.}
+#' \item{lambda}{The regularization parameter used in the model.}
+#' \item{kappa}{The hinge function parameter used.}
+#' \item{epsilon}{The stopping criterion used.}
+#' \item{weights}{The instance weights type used.}
+#' \item{kernel}{The kernel function used.}
+#' \item{gamma}{The value of the gamma parameter of the kernel, if applicable}
+#' \item{coef}{The value of the coef parameter of the kernel, if applicable}
+#' \item{degree}{The degree of the kernel, if applicable}
+#' \item{kernel.eigen.cutoff}{The cutoff value of the reduced
+#' eigendecomposition of the kernel matrix}
+#' \item{random.seed}{The random seed used to seed the model.}
+#' \item{max.iter}{Maximum number of iterations of the algorithm.}
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @seealso
+#' \code{\link{coef}}, \code{\link{print}}, \code{\link{predict}},
+#' \code{\link{plot}}, and \code{\link{gensvm.grid}}.
+#'
+#' @export
+#'
+#' @examples
+#' X <-
+#'
+gensvm <- function(X, y, p=1.0, lambda=1e-5, kappa=0.0, epsilon=1e-6,
+ weights='unit', kernel='linear', gamma='auto', coef=0.0,
+ degree=2.0, kernel.eigen.cutoff=1e-8, verbose=0,
+ random.seed=NULL, max.iter=1e8, seed.V=NULL)
+{
+ call <- match.call()
+
+
+ # TODO: generate the random.seed value in R if it is NULL. Then you can
+ # return it and people can still reproduce even if they forgot to set it
+ # explicitly.
+
+ # TODO: Store a labelencoder in the object, preferably as a partially
+ # hidden item. This can then be used with prediction.
+
+ n.objects <- nrow(X)
+ n.features <- ncol(X)
+ n.classes <- length(unique(y))
+
+
+ # Convert labels to integers
+ y.clean <- label.encode(y)
+
+ # Convert weights to index
+ weight.idx <- which(c("unit", "group") == weights)
+ if (length(weight.idx) == 0) {
+ stop("Incorrect weight specification. ",
+ "Valid options are 'unit' and 'group'")
+ }
+
+ # Convert kernel to index
+ kernel.idx <- which(c("linear", "poly", "rbf", "sigmoid") == kernel)
+ if (length(kernel.idx) == 0) {
+ stop("Incorrect kernel specification. ",
+ "Valid options are 'linear', 'poly', 'rbf', and 'sigmoid'")
+ }
+
+
+ out <- .Call("R_gensvm_train",
+ as.matrix(t(X)),
+ as.integer(y.clean),
+ p,
+ lambda,
+ kappa,
+ epsilon,
+ weight.idx,
+ kernel.idx,
+ gamma,
+ coef,
+ degree,
+ kernel.eigen.cutoff,
+ verbose,
+ max.iter,
+ random.seed,
+ seed.V)
+
+
+ object <- list(call = call, lambda = lambda, kappa = kappa,
+ epsilon = epsilon, weights = weights, kernel = kernel,
+ gamma = gamma, coef = coef, degree = degree,
+ kernel.eigen.cutoff = kernel.eigen.cutoff,
+ random.seed = random.seed, max.iter = max.iter,
+ V = out$V, n.iter = out$n.iter, n.support = out$n.support)
+ class(object) <- "gensvm"
+ return(object)
+}
diff --git a/R/gensvm.grid.R b/R/gensvm.grid.R
new file mode 100644
index 0000000..37e2f7f
--- /dev/null
+++ b/R/gensvm.grid.R
@@ -0,0 +1,116 @@
+#' @title Cross-validated grid search for GenSVM
+#'
+#' @description This function performs a cross-validated grid search of the
+#' model parameters to find the best hyperparameter configuration for a given
+#' dataset. This function takes advantage of GenSVM's ability to use warm
+#' starts to speed up computation. The function also uses the GenSVM C library
+#' for speed.
+#'
+#' There are two ways to use this function: either by providing a data frame
+#' with the parameter configurations to try or by giving each of the function
+#' inputs a vector of values to evaluate. In the latter case all combinations
+#' of the provided values will be used (i.e. the product set).
+#'
+#' @param X training data matrix. We denote the size of this matrix by
+#' n_samples x n_features.
+#' @param y training vector of class labels of length n_samples. The number of
+#' unique labels in this vector is denoted by n_classes.
+#' @param df Data frame with parameter configurations to evaluate.
+#' If this is provided it overrides the other parameter ranges provided. The
+#' data frame must provide *all* required columns, as described below.
+#' @param p vector of values to try for the \eqn{p} hyperparameter
+#' for the \eqn{\ell_p} norm in the loss function. All values should be on the
+#' interval [1.0, 2.0].
+#' @param lambda vector of values for the regularization parameter
+#' \eqn{\lambda} in the loss function. All values should be larger than 0.
+#' @param kappa vector of values for the hinge function parameter in
+#' the loss function. All values should be larger than -1.
+#' @param weights vector of values for the instance weights. Values
+#' should be either 'unit', 'group', or both.
+#' @param kernel vector of values for the kernel type. Possible
+#' values are: 'linear', 'rbf', 'poly', or 'sigmoid', or any combination of
+#' these values. See the article \link[=gensvm-kernels]{Kernels in GenSVM} for
+#' more information.
+#' @param gamma kernel parameter for the 'rbf', 'poly', and 'sigmoid' kernels.
+#' If it is 'auto', 1/n_features will be used. See the article
+#' \link[=gensvm-kernels]{Kernels in GenSVM} for more information.
+#' @param coef kernel parameter for the 'poly' and 'sigmoid'
+#' kernels. See the article \link[=gensvm-kernels]{Kernels in GenSVM} for more
+#' information.
+#' @param degree kernel parameter for the 'poly' kernel. See the
+#' article \link[=gensvm-kernels]{Kernels in GenSVM} for more information.
+#' @param max.iter maximum number of iterations to run in the
+#' optimization algorithm.
+#' @param refit boolean variable. If true, the best model from cross validation
+#' is fitted again on the entire dataset.
+#' @param scoring metric to use to evaluate the classifier performance during
+#' cross validation. The metric should be an R function that takes two
+#' arguments: y_true and y_pred and that returns a float such that higher
+#' values are better. If it is NULL, the accuracy score will be used.
+#' @param cv the number of cross-validation folds to use or a vector with the
+#' same length as \code{y} where each unique value denotes a test split.
+#' @param verbose boolean variable to indicate whether training details should
+#' be printed.
+#'
+#' @return A "gensvm.grid" S3 object with the following items:
+#' \item{cv.results}{A data frame with the cross validation results}
+#' \item{best.estimator}{If refit=TRUE, this is the GenSVM model fitted with
+#' the best hyperparameter configuration, otherwise it is NULL}
+#' \item{best.score}{Mean cross-validated score for the model with the best
+#' hyperparameter configuration}
+#' \item{best.params}{Parameter configuration that provided the highest mean
+#' cross-validated score}
+#' \item{best.index}{Row index of the cv.results data frame that corresponds to
+#' the best hyperparameter configuration}
+#' \item{n.splits}{The number of cross-validation splits}
+#'
+#'
+#'
+#' @section Using a DataFrame:
+#' ...
+#'
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @seealso
+#' \code{\link{coef}}, \code{\link{print}}, \code{\link{predict}},
+#' \code{\link{plot}}, and \code{\link{gensvm}}.
+#'
+#'
+#' @export
+#'
+#' @examples
+#' X <-
+#'
+
+gensvm.grid <- function(X, y,
+ df=NULL,
+ p=c(1.0, 1.5, 2.0),
+ lambda=c(1e-8, 1e-6, 1e-4, 1e-2, 1),
+ kappa=c(-0.9, 0.5, 5.0),
+ weights=c('unit', 'group'),
+ kernel=c('linear'),
+ gamma=c('auto'),
+ coef=c(0.0),
+ degree=c(2.0),
+ max.iter=c(1e8),
+ refit=TRUE,
+ scoring=NULL,
+ cv=3,
+ verbose=TRUE)
+{
+ call <- match.call()
+
+
+
+ object <- list(...)
+ class(object) <- "gensvm.grid"
+ return(object)
+}
diff --git a/R/predict.gensvm.R b/R/predict.gensvm.R
new file mode 100644
index 0000000..6cc8851
--- /dev/null
+++ b/R/predict.gensvm.R
@@ -0,0 +1,36 @@
+#' @title Predict class labels with the GenSVM model
+#'
+#' @description This function predicts the class labels of new data using a
+#' fitted GenSVM model.
+#'
+#' @param object Fitted \code{gensvm} object
+#' @param newx Matrix of new values for \code{x} for which predictions need to
+#' be made.
+#' @param \dots further arguments are ignored
+#'
+#' @return a vector of class labels, with the same type as the original class
+#' labels.
+#'
+#' @export
+#' @aliases predict
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @examples
+#'
+#'
+#'
+predict.gensvm <- function(object, newx, ...)
+{
+ # TODO: C library fitting prediction here (or not? with the column-major
+ # order it may be faster to do it directly in R)
+
+ return(yhat)
+}
diff --git a/R/print.gensvm.R b/R/print.gensvm.R
new file mode 100644
index 0000000..8d17b0c
--- /dev/null
+++ b/R/print.gensvm.R
@@ -0,0 +1,34 @@
+#' @title Print the fitted GenSVM model
+#'
+#' @description Prints a short description of the fitted GenSVM model
+#'
+#' @param object A \code{gensvm} object to print
+#' @param \dots further arguments are ignored
+#'
+#' @return returns the object passed as input
+#'
+#' @author
+#' Gerrit J.J. van den Burg, Patrick J.F. Groenen
+#' Maintainer: Gerrit J.J. van den Burg <gertjanvandenburg@gmail.com>
+#'
+#' @references
+#' Van den Burg, G.J.J. and Groenen, P.J.F. (2016). \emph{GenSVM: A Generalized
+#' Multiclass Support Vector Machine}, Journal of Machine Learning Research,
+#' 17(225):1--42. URL \url{http://jmlr.org/papers/v17/14-526.html}.
+#'
+#' @method print gensvm
+#' @export
+#'
+#' @examples
+#'
+#'
+print.gensvm <- function(object, ...)
+{
+ cat("\nCall:\n")
+ dput(object$call)
+
+ # TODO: fill this out
+ #
+ #
+ invisible(object)
+}
diff --git a/R/util.labelencoder.R b/R/util.labelencoder.R
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/R/util.labelencoder.R
@@ -0,0 +1 @@
+