aboutsummaryrefslogtreecommitdiff
path: root/gensvm/core.py
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2017-12-12 20:18:28 -0500
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2017-12-12 20:18:28 -0500
commit7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351 (patch)
tree574f193b67438ba739be0f41af0d89bb0fa56a2c /gensvm/core.py
parentupdate library for python package (diff)
downloadpygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.tar.gz
pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.zip
added gridsearch and extended gensvm class
Diffstat (limited to 'gensvm/core.py')
-rw-r--r--gensvm/core.py180
1 files changed, 135 insertions, 45 deletions
diff --git a/gensvm/core.py b/gensvm/core.py
index 7594eba..2776ec6 100644
--- a/gensvm/core.py
+++ b/gensvm/core.py
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
-"""
+"""Core functionality for fitting the GenSVM classifier
+
+This module contains the basic definitions to fit a single GenSVM model.
+
"""
from __future__ import print_function, division
@@ -8,7 +11,7 @@ from __future__ import print_function, division
import numpy as np
import warnings
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y, check_random_state
@@ -18,8 +21,9 @@ from sklearn.utils.validation import check_is_fitted
from . import wrapper
-def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
- degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None):
+def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma,
+ coef, degree, kernel_eigen_cutoff, verbose, max_iter,
+ random_state=None, seed_V=None):
# process the random state
rnd = check_random_state(random_state)
@@ -27,11 +31,14 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
# set the verbosity in GenSVM
wrapper.set_verbosity_wrap(verbose)
+ # convert the weight index
+ weight_idx = 1 if weights == 'unit' else 2
+
# run the actual training
raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap(
- X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
- degree, kernel_eigen_cutoff, max_iter,
- rnd.randint(np.iinfo('i').max))
+ X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma,
+ coef, degree, kernel_eigen_cutoff, max_iter,
+ rnd.randint(np.iinfo('i').max), seed_V)
# process output
if status_ == 1 and verbose > 0:
@@ -49,7 +56,7 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
return coef_, intercept_, n_iter_, n_SV_
-class GenSVM(BaseEstimator):
+class GenSVM(BaseEstimator, ClassifierMixin):
"""Generalized Multiclass Support Vector Machine Classification.
This class implements the basic GenSVM classifier. GenSVM is a generalized
@@ -57,8 +64,8 @@ class GenSVM(BaseEstimator):
errors. It is this flexibility that makes it perform well on diverse
datasets.
- This methods of this class use the GenSVM C library for the actual
- computations.
+ The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class
+ use the GenSVM C library for the actual computations.
Parameters
----------
@@ -71,16 +78,17 @@ class GenSVM(BaseEstimator):
kappa : float, optional (default=0.0)
Parameter for the hinge function in the loss function (kappa > -1.0)
- weight_idx : int, optional (default=1)
- Type of sample weights to use (1 = unit weights, 2 = size correction
- weights)
+ weights : string, optional (default='unit')
+ Type of sample weights to use. Options are 'unit' for unit weights and
+ 'group' for group size correction weights (equation 4 in the paper).
kernel : string, optional (default='linear')
Specify the kernel type to use in the classifier. It must be one of
'linear', 'poly', 'rbf', or 'sigmoid'.
- gamma : float, optional (default=1.0)
- Kernel parameter for the rbf, poly, and sigmoid kernel
+ gamma : float or 'auto', optional (default='auto')
+ Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is
+ 'auto' then 1/n_features will be used.
coef : float, optional (default=0.0)
Kernel parameter for the poly and sigmoid kernel
@@ -106,9 +114,12 @@ class GenSVM(BaseEstimator):
coef_ : array, shape = [n_features, n_classes-1]
Weights assigned to the features (coefficients in the primal problem)
- intercept_ : array, shape = [n_classes]
+ intercept_ : array, shape = [n_classes-1]
Constants in the decision function
+ combined_coef_ : array, shape = [n_features+1, n_classes-1]
+ Combined weights matrix for the seed_V parameter to the fit method
+
n_iter_ : int
The number of iterations that were run during training.
@@ -116,23 +127,45 @@ class GenSVM(BaseEstimator):
The number of support vectors that were found
- References
- ----------
- * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized
- Multiclass Support Vector Machine. Journal of Machine Learning Research,
- 17(225):1--42, 2016.
+ See Also
+ --------
+ :class:`.GenSVMGridSearchCV`:
+ Helper class to run an efficient grid search for GenSVM.
"""
- def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1,
- kernel='linear', gamma=1.0, coef=0.0, degree=2.0,
- kernel_eigen_cutoff=1e-8, verbose=0, random_state=None,
+ def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6,
+ weights='unit', kernel='linear', gamma='auto', coef=0.0,
+ degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None,
max_iter=1e8):
+
+ if not 1.0 <= p <= 2.0:
+ raise ValueError("Value for p should be within [1, 2]; got p = %r"
+ % p)
+ if not kappa > -1.0:
+ raise ValueError("Value for kappa should be larger than -1; got "
+ "kappa = %r" % kappa)
+ if not lmd > 0:
+ raise ValueError("Value for lmd should be larger than 0; got "
+ "lmd = %r" % lmd)
+ if not epsilon > 0:
+ raise ValueError("Value for epsilon should be larger than 0; got "
+ "epsilon = %r" % epsilon)
+ if gamma == 0.0:
+ raise ValueError("A gamma value of 0.0 is invalid")
+ if not weights in ('unit', 'group'):
+ raise ValueError("Unknown weight parameter specified. Should be "
+ "'unit' or 'group'; got %r" % weights)
+ if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'):
+ raise ValueError("Unknown kernel specified. Should be "
+ "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel)
+
+
self.p = p
self.lmd = lmd
self.kappa = kappa
self.epsilon = epsilon
- self.weight_idx = weight_idx
+ self.weights = weights
self.kernel = kernel
self.gamma = gamma
self.coef = coef
@@ -143,19 +176,42 @@ class GenSVM(BaseEstimator):
self.max_iter = max_iter
- def fit(self, X, y):
- if not 1.0 <= self.p <= 2.0:
- raise ValueError("Value for p should be within [1, 2]; got p = %r)"
- % self.p)
- if not self.kappa > -1.0:
- raise ValueError("Value for kappa should be larger than -1; got "
- "kappa = %r" % self.kappa)
- if not self.lmd > 0:
- raise ValueError("Value for lmd should be larger than 0; got "
- "lmd = %r" % self.lmd)
- if not self.epsilon > 0:
- raise ValueError("Value for epsilon should be larger than 0; got "
- "epsilon = %r" % self.epsilon)
+ def fit(self, X, y, seed_V=None):
+ """Fit the GenSVM model on the given data
+
+ The model can be fit with or without a seed matrix (``seed_V``). This
+ can be used to provide warm starts for the algorithm.
+
+ Parameters
+ ----------
+
+ X : array, shape = (n_observations, n_features)
+ The input data. It is expected that only numeric data is given.
+
+ y : array, shape = (n_observations, )
+ The label vector, labels can be numbers or strings.
+
+ seed_V : array, shape = (n_features+1, n_classes-1), optional
+ Seed coefficient array to use as a warm start for the optimization.
+ It can for instance be the :attr:`combined_coef_
+ <.GenSVM.combined_coef_>` attribute of a different GenSVM model.
+ This is only supported for the linear kernel.
+
+ NOTE: the size of the seed_V matrix is ``n_features+1`` by
+ ``n_classes - 1``. The number of columns of ``seed_V`` takes
+ precedence for the number of classes in the model. For example, if
+ ``y`` contains 3 different classes and ``seed_V`` has 3 columns, we
+ assume that there are actually 4 classes in the problem but one
+ class is just not represented in this training data. This can be
+ useful for problems where a certain class has only a few samples.
+
+
+ Returns
+ -------
+ self : object
+ Returns self.
+
+ """
X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64,
order="C")
@@ -163,24 +219,52 @@ class GenSVM(BaseEstimator):
if y_type not in ["binary", "multiclass"]:
raise ValueError("Label type not allowed for GenSVM: %r" % y_type)
+ if self.gamma == 'auto':
+ gamma = 1 / X.shape[1]
+ else:
+ gamma = self.gamma
+
# This is necessary because GenSVM expects classes to go from 1 to
# n_class
self.encoder = LabelEncoder()
y = self.encoder.fit_transform(y_org)
y += 1
+ n_class = len(np.unique(y))
+ if not seed_V is None and self.kernel != 'linear':
+ warnings.warn("Warm starts are only supported for the "
+ "linear kernel. The seed_V parameter will be ignored.")
+ seed_V = None
+ if not seed_V is None:
+ n_samples, n_features = X.shape
+ if seed_V.shape[1] + 1 > n_class:
+ n_class = seed_V.shape[1]
+ if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 <
+ n_class):
+ raise ValueError("Seed V must have shape [%i, %i], "
+ "but has shape [%i, %i]" % (n_features+1, n_class-1,
+ seed_V.shape[0], seed_V.shape[1]))
+
self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \
- _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon,
- self.weight_idx, self.kernel, self.gamma, self.coef,
- self.degree, self.kernel_eigen_cutoff, self.verbose,
- self.max_iter, self.random_state)
+ _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa,
+ self.epsilon, self.weights, self.kernel, gamma,
+ self.coef, self.degree, self.kernel_eigen_cutoff,
+ self.verbose, self.max_iter, self.random_state, seed_V)
return self
-
def predict(self, X):
- check_is_fitted(self, "coef_")
+ """Predict the class labels on the given data
- V = np.vstack((self.intercept_, self.coef_))
+ Parameters
+ ----------
+ X : array, shape = [n_samples, n_features]
+
+ Returns
+ -------
+ y_pred : array, shape = (n_samples, )
+
+ """
+ V = self.combined_coef_
predictions = wrapper.predict_wrap(X, V)
# Transform the classes back to the original form
@@ -188,3 +272,9 @@ class GenSVM(BaseEstimator):
outcome = self.encoder.inverse_transform(predictions)
return outcome
+
+ @property
+ def combined_coef_(self):
+ check_is_fitted(self, "coef_")
+ check_is_fitted(self, "intercept_")
+ return np.vstack((self.intercept_, self.coef_))