diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2017-12-12 20:18:28 -0500 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2017-12-12 20:18:28 -0500 |
| commit | 7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351 (patch) | |
| tree | 574f193b67438ba739be0f41af0d89bb0fa56a2c /gensvm/core.py | |
| parent | update library for python package (diff) | |
| download | pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.tar.gz pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.zip | |
added gridsearch and extended gensvm class
Diffstat (limited to 'gensvm/core.py')
| -rw-r--r-- | gensvm/core.py | 180 |
1 file changed, 135 insertions, 45 deletions
diff --git a/gensvm/core.py b/gensvm/core.py index 7594eba..2776ec6 100644 --- a/gensvm/core.py +++ b/gensvm/core.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- -""" +"""Core functionality for fitting the GenSVM classifier + +This module contains the basic definitions to fit a single GenSVM model. + """ from __future__ import print_function, division @@ -8,7 +11,7 @@ from __future__ import print_function, division import numpy as np import warnings -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import ConvergenceWarning, FitFailedWarning from sklearn.preprocessing import LabelEncoder from sklearn.utils import check_X_y, check_random_state @@ -18,8 +21,9 @@ from sklearn.utils.validation import check_is_fitted from . import wrapper -def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, - degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None): +def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma, + coef, degree, kernel_eigen_cutoff, verbose, max_iter, + random_state=None, seed_V=None): # process the random state rnd = check_random_state(random_state) @@ -27,11 +31,14 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, # set the verbosity in GenSVM wrapper.set_verbosity_wrap(verbose) + # convert the weight index + weight_idx = 1 if weights == 'unit' else 2 + # run the actual training raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap( - X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, - degree, kernel_eigen_cutoff, max_iter, - rnd.randint(np.iinfo('i').max)) + X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, + coef, degree, kernel_eigen_cutoff, max_iter, + rnd.randint(np.iinfo('i').max), seed_V) # process output if status_ == 1 and verbose > 0: @@ -49,7 +56,7 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, return coef_, 
intercept_, n_iter_, n_SV_ -class GenSVM(BaseEstimator): +class GenSVM(BaseEstimator, ClassifierMixin): """Generalized Multiclass Support Vector Machine Classification. This class implements the basic GenSVM classifier. GenSVM is a generalized @@ -57,8 +64,8 @@ class GenSVM(BaseEstimator): errors. It is this flexibility that makes it perform well on diverse datasets. - This methods of this class use the GenSVM C library for the actual - computations. + The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class + use the GenSVM C library for the actual computations. Parameters ---------- @@ -71,16 +78,17 @@ class GenSVM(BaseEstimator): kappa : float, optional (default=0.0) Parameter for the hinge function in the loss function (kappa > -1.0) - weight_idx : int, optional (default=1) - Type of sample weights to use (1 = unit weights, 2 = size correction - weights) + weights: string, optional (default='unit') + Type of sample weights to use. Options are 'unit' for unit weights and + 'group' for group size correction weights (equation 4 in the paper). kernel : string, optional (default='linear') Specify the kernel type to use in the classifier. It must be one of 'linear', 'poly', 'rbf', or 'sigmoid'. - gamma : float, optional (default=1.0) - Kernel parameter for the rbf, poly, and sigmoid kernel + gamma : float, optional (default='auto') + Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is + 'auto' then 1/n_features will be used. 
coef : float, optional (default=0.0) Kernel parameter for the poly and sigmoid kernel @@ -106,9 +114,12 @@ class GenSVM(BaseEstimator): coef_ : array, shape = [n_features, n_classes-1] Weights assigned to the features (coefficients in the primal problem) - intercept_ : array, shape = [n_classes] + intercept_ : array, shape = [n_classes-1] Constants in the decision function + combined_coef_ : array, shape = [n_features+1, n_classes-1] + Combined weights matrix for the seed_V parameter to the fit method + n_iter_ : int The number of iterations that were run during training. @@ -116,23 +127,45 @@ class GenSVM(BaseEstimator): The number of support vectors that were found - References - ---------- - * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized - Multiclass Support Vector Machine. Journal of Machine Learning Research, - 17(225):1--42, 2016. + See Also + -------- + :class:`.GenSVMGridSearchCV`: + Helper class to run an efficient grid search for GenSVM. """ - def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1, - kernel='linear', gamma=1.0, coef=0.0, degree=2.0, - kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, + def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, + weights='unit', kernel='linear', gamma='auto', coef=0.0, + degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, max_iter=1e8): + + if not 1.0 <= p <= 2.0: + raise ValueError("Value for p should be within [1, 2]; got p = %r" + % p) + if not kappa > -1.0: + raise ValueError("Value for kappa should be larger than -1; got " + "kappa = %r" % kappa) + if not lmd > 0: + raise ValueError("Value for lmd should be larger than 0; got " + "lmd = %r" % lmd) + if not epsilon > 0: + raise ValueError("Value for epsilon should be larger than 0; got " + "epsilon = %r" % epsilon) + if gamma == 0.0: + raise ValueError("A gamma value of 0.0 is invalid") + if not weights in ('unit', 'group'): + raise ValueError("Unknown weight parameter specified. 
Should be " + "'unit' or 'group'; got %r" % weights) + if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'): + raise ValueError("Unknown kernel specified. Should be " + "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel) + + self.p = p self.lmd = lmd self.kappa = kappa self.epsilon = epsilon - self.weight_idx = weight_idx + self.weights = weights self.kernel = kernel self.gamma = gamma self.coef = coef @@ -143,19 +176,42 @@ class GenSVM(BaseEstimator): self.max_iter = max_iter - def fit(self, X, y): - if not 1.0 <= self.p <= 2.0: - raise ValueError("Value for p should be within [1, 2]; got p = %r)" - % self.p) - if not self.kappa > -1.0: - raise ValueError("Value for kappa should be larger than -1; got " - "kappa = %r" % self.kappa) - if not self.lmd > 0: - raise ValueError("Value for lmd should be larger than 0; got " - "lmd = %r" % self.lmd) - if not self.epsilon > 0: - raise ValueError("Value for epsilon should be larger than 0; got " - "epsilon = %r" % self.epsilon) + def fit(self, X, y, seed_V=None): + """Fit the GenSVM model on the given data + + The model can be fit with or without a seed matrix (``seed_V``). This + can be used to provide warm starts for the algorithm. + + Parameters + ---------- + + X : array, shape = (n_observations, n_features) + The input data. It is expected that only numeric data is given. + + y : array, shape = (n_observations, ) + The label vector, labels can be numbers or strings. + + seed_V : array, shape = (n_features+1, n_classes-1), optional + Seed coefficient array to use as a warm start for the optimization. + It can for instance be the :attr:`combined_coef_ + <.GenSVM.combined_coef_>` attribute of a different GenSVM model. + This is only supported for the linear kernel. + + NOTE: the size of the seed_V matrix is ``n_features+1`` by + ``n_classes - 1``. The number of columns of ``seed_V`` is leading + for the number of classes in the model. 
For example, if ``y`` + contains 3 different classes and ``seed_V`` has 3 columns, we + assume that there are actually 4 classes in the problem but one + class is just not represented in this training data. This can be useful + for problems where a certain class has only a few samples. + + + Returns + ------- + self : object + Returns self. + + """ X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, order="C") @@ -163,24 +219,52 @@ class GenSVM(BaseEstimator): if y_type not in ["binary", "multiclass"]: raise ValueError("Label type not allowed for GenSVM: %r" % y_type) + if self.gamma == 'auto': + gamma = 1 / X.shape[1] + else: + gamma = self.gamma + # This is necessary because GenSVM expects classes to go from 1 to # n_class self.encoder = LabelEncoder() y = self.encoder.fit_transform(y_org) y += 1 + n_class = len(np.unique(y)) + if not seed_V is None and self.kernel != 'linear': + warnings.warn("Warm starts are only supported for the " + "linear kernel. The seed_V parameter will be ignored.") + seed_V = None + if not seed_V is None: + n_samples, n_features = X.shape + if seed_V.shape[1] + 1 > n_class: + n_class = seed_V.shape[1] + if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 < + n_class): + raise ValueError("Seed V must have shape [%i, %i], " + "but has shape [%i, %i]" % (n_features+1, n_class-1, + seed_V.shape[0], seed_V.shape[1])) + self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \ - _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon, - self.weight_idx, self.kernel, self.gamma, self.coef, - self.degree, self.kernel_eigen_cutoff, self.verbose, - self.max_iter, self.random_state) + _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa, + self.epsilon, self.weights, self.kernel, gamma, + self.coef, self.degree, self.kernel_eigen_cutoff, + self.verbose, self.max_iter, self.random_state, seed_V) return self - def predict(self, X): - check_is_fitted(self, "coef_") + """Predict the class labels on the given data - V = 
np.vstack((self.intercept_, self.coef_)) + Parameters + ---------- + X : array, shape = [n_samples, n_features] + + Returns + ------- + y_pred : array, shape = (n_samples, ) + + """ + V = self.combined_coef_ predictions = wrapper.predict_wrap(X, V) # Transform the classes back to the original form @@ -188,3 +272,9 @@ class GenSVM(BaseEstimator): outcome = self.encoder.inverse_transform(predictions) return outcome + + @property + def combined_coef_(self): + check_is_fitted(self, "coef_") + check_is_fitted(self, "intercept_") + return np.vstack((self.intercept_, self.coef_)) |
