added gridsearch and extended gensvm class

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2017-12-12 20:18:28 -0500
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2017-12-12 20:18:28 -0500
commit: 7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351 (patch)
tree: 574f193b67438ba739be0f41af0d89bb0fa56a2c /gensvm/core.py
parent: update library for python package (diff)
download: pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.tar.gz
pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.zip
1 files changed, 135 insertions, 45 deletions
diff --git a/gensvm/core.py b/gensvm/core.py
index 7594eba..2776ec6 100644
--- a/gensvm/core.py
+++ b/gensvm/core.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 
-"""
+"""Core functionality for fitting the GenSVM classifier
+
+This module contains the basic definitions to fit a single GenSVM model.
+
 """
 
 from __future__ import print_function, division
@@ -8,7 +11,7 @@ from __future__ import print_function, division
 import numpy as np
 import warnings
 
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
 from sklearn.preprocessing import LabelEncoder
 from sklearn.utils import check_X_y, check_random_state
@@ -18,8 +21,9 @@ from sklearn.utils.validation import check_is_fitted
 from . import wrapper
 
 
-def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, 
-        degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None):
+def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma, 
+        coef, degree, kernel_eigen_cutoff, verbose, max_iter, 
+        random_state=None, seed_V=None):
 
     # process the random state
     rnd = check_random_state(random_state)
@@ -27,11 +31,14 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
     # set the verbosity in GenSVM
     wrapper.set_verbosity_wrap(verbose)
 
+    # convert the weight index
+    weight_idx = 1 if weights == 'unit' else 2
+
     # run the actual training
     raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap(
-            X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, 
-            degree, kernel_eigen_cutoff, max_iter, 
-            rnd.randint(np.iinfo('i').max))
+            X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, 
+            coef, degree, kernel_eigen_cutoff, max_iter, 
+            rnd.randint(np.iinfo('i').max), seed_V)
 
     # process output
     if status_ == 1 and verbose > 0:
@@ -49,7 +56,7 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
     return coef_, intercept_, n_iter_, n_SV_
 
 
-class GenSVM(BaseEstimator):
+class GenSVM(BaseEstimator, ClassifierMixin):
     """Generalized Multiclass Support Vector Machine Classification.
 
     This class implements the basic GenSVM classifier. GenSVM is a generalized 
@@ -57,8 +64,8 @@ class GenSVM(BaseEstimator):
     errors. It is this flexibility that makes it perform well on diverse 
     datasets.
 
-    This methods of this class use the GenSVM C library for the actual 
-    computations.
+    The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class 
+    use the GenSVM C library for the actual computations.
 
     Parameters
     ----------
@@ -71,16 +78,17 @@ class GenSVM(BaseEstimator):
     kappa : float, optional (default=0.0)
         Parameter for the hinge function in the loss function (kappa > -1.0)
 
-    weight_idx : int, optional (default=1)
-        Type of sample weights to use (1 = unit weights, 2 = size correction 
-        weights)
+    weights : string, optional (default='unit')
+        Type of sample weights to use. Options are 'unit' for unit weights and 
+        'group' for group size correction weights (equation 4 in the paper).
 
     kernel : string, optional (default='linear')
         Specify the kernel type to use in the classifier. It must be one of 
         'linear', 'poly', 'rbf', or 'sigmoid'.
 
-    gamma : float, optional (default=1.0)
-        Kernel parameter for the rbf, poly, and sigmoid kernel
+    gamma : float or 'auto', optional (default='auto')
+        Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is 
+        'auto' then 1/n_features will be used.
 
     coef : float, optional (default=0.0)
         Kernel parameter for the poly and sigmoid kernel
@@ -106,9 +114,12 @@ class GenSVM(BaseEstimator):
     coef_ : array, shape = [n_features, n_classes-1]
         Weights assigned to the features (coefficients in the primal problem)
 
-    intercept_ : array, shape = [n_classes]
+    intercept_ : array, shape = [n_classes-1]
         Constants in the decision function
 
+    combined_coef_ : array, shape = [n_features+1, n_classes-1]
+        Combined weights matrix for the seed_V parameter to the fit method
+
     n_iter_ : int
         The number of iterations that were run during training.
 
@@ -116,23 +127,45 @@ class GenSVM(BaseEstimator):
         The number of support vectors that were found
 
 
-    References
-    ----------
-    * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized 
-    Multiclass Support Vector Machine. Journal of Machine Learning Research, 
-    17(225):1--42, 2016.
+    See Also
+    --------
+    :class:`.GenSVMGridSearchCV`:
+        Helper class to run an efficient grid search for GenSVM.
 
     """
 
-    def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1, 
-            kernel='linear', gamma=1.0, coef=0.0, degree=2.0, 
-            kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, 
+    def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, 
+            weights='unit', kernel='linear', gamma='auto', coef=0.0, 
+            degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, 
             max_iter=1e8):
+
+        if not 1.0 <= p <= 2.0:
+            raise ValueError("Value for p should be within [1, 2]; got p = %r" 
+                    % p)
+        if not kappa > -1.0:
+            raise ValueError("Value for kappa should be larger than -1; got "
+                    "kappa = %r" % kappa)
+        if not lmd > 0:
+            raise ValueError("Value for lmd should be larger than 0; got "
+                    "lmd = %r" % lmd)
+        if not epsilon > 0:
+            raise ValueError("Value for epsilon should be larger than 0; got "
+                    "epsilon = %r" % epsilon)
+        if gamma == 0.0:
+            raise ValueError("A gamma value of 0.0 is invalid")
+        if not weights in ('unit', 'group'):
+            raise ValueError("Unknown weight parameter specified. Should be "
+                    "'unit' or 'group'; got %r" % weights)
+        if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'):
+            raise ValueError("Unknown kernel specified. Should be "
+                    "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel)
+
+
         self.p = p
         self.lmd = lmd
         self.kappa = kappa
         self.epsilon = epsilon
-        self.weight_idx = weight_idx
+        self.weights = weights
         self.kernel = kernel
         self.gamma = gamma
         self.coef = coef
@@ -143,19 +176,42 @@ class GenSVM(BaseEstimator):
         self.max_iter = max_iter
 
 
-    def fit(self, X, y):
-        if not 1.0 <= self.p <= 2.0:
-            raise ValueError("Value for p should be within [1, 2]; got p = %r)" 
-                    % self.p)
-        if not self.kappa > -1.0:
-            raise ValueError("Value for kappa should be larger than -1; got "
-                    "kappa = %r" % self.kappa)
-        if not self.lmd > 0:
-            raise ValueError("Value for lmd should be larger than 0; got "
-                    "lmd = %r" % self.lmd)
-        if not self.epsilon > 0:
-            raise ValueError("Value for epsilon should be larger than 0; got "
-                    "epsilon = %r" % self.epsilon)
+    def fit(self, X, y, seed_V=None):
+        """Fit the GenSVM model on the given data
+
+        The model can be fit with or without a seed matrix (``seed_V``). This 
+        can be used to provide warm starts for the algorithm.
+
+        Parameters
+        ----------
+
+        X : array, shape = (n_observations, n_features)
+            The input data. It is expected that only numeric data is given.
+
+        y : array, shape = (n_observations, )
+            The label vector, labels can be numbers or strings.
+
+        seed_V : array, shape = (n_features+1, n_classes-1), optional
+            Seed coefficient array to use as a warm start for the optimization.  
+            It can for instance be the :attr:`combined_coef_ 
+            <.GenSVM.combined_coef_>` attribute of a different GenSVM model.  
+            This is only supported for the linear kernel.
+
+            NOTE: the size of the seed_V matrix is ``n_features+1`` by
+            ``n_classes - 1``.  The number of columns of ``seed_V``
+            determines the number of classes in the model. For example, if
+            ``y`` contains 3 different classes and ``seed_V`` has 3 columns,
+            we assume that there are actually 4 classes in the problem but one
+            class is just not represented in this training data. This can be
+            useful for problems where a certain class has only a few samples.
+
+
+        Returns
+        -------
+        self : object
+            Returns self.
+
+        """
         X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, 
                 order="C")
 
@@ -163,24 +219,52 @@ class GenSVM(BaseEstimator):
         if y_type not in ["binary", "multiclass"]:
             raise ValueError("Label type not allowed for GenSVM: %r" % y_type)
 
+        if self.gamma == 'auto':
+            gamma = 1 / X.shape[1]
+        else:
+            gamma = self.gamma
+
         # This is necessary because GenSVM expects classes to go from 1 to 
         # n_class
         self.encoder = LabelEncoder()
         y = self.encoder.fit_transform(y_org)
         y += 1
 
+        n_class = len(np.unique(y))
+        if not seed_V is None and self.kernel != 'linear':
+            warnings.warn("Warm starts are only supported for the "
+                    "linear kernel. The seed_V parameter will be ignored.")
+            seed_V = None
+        if not seed_V is None:
+            n_samples, n_features = X.shape
+            if seed_V.shape[1] + 1 > n_class:
+                n_class = seed_V.shape[1]
+            if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 < 
+                    n_class):
+                raise ValueError("Seed V must have shape [%i, %i], "
+                        "but has shape [%i, %i]" % (n_features+1, n_class-1, 
+                            seed_V.shape[0], seed_V.shape[1]))
+
         self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \
-                _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon, 
-                        self.weight_idx, self.kernel, self.gamma, self.coef, 
-                        self.degree, self.kernel_eigen_cutoff, self.verbose, 
-                        self.max_iter, self.random_state)
+                _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa, 
+                        self.epsilon, self.weights, self.kernel, gamma, 
+                        self.coef, self.degree, self.kernel_eigen_cutoff, 
+                        self.verbose, self.max_iter, self.random_state, seed_V)
         return self
 
-
     def predict(self, X):
-        check_is_fitted(self, "coef_")
+        """Predict the class labels on the given data
 
-        V = np.vstack((self.intercept_, self.coef_))
+        Parameters
+        ----------
+        X : array, shape = [n_samples, n_features]
+
+        Returns
+        -------
+        y_pred : array, shape = (n_samples, )
+
+        """
+        V = self.combined_coef_
         predictions = wrapper.predict_wrap(X, V)
 
         # Transform the classes back to the original form
@@ -188,3 +272,9 @@ class GenSVM(BaseEstimator):
         outcome = self.encoder.inverse_transform(predictions)
 
         return outcome
+
+    @property
+    def combined_coef_(self):
+        check_is_fitted(self, "coef_")
+        check_is_fitted(self, "intercept_")
+        return np.vstack((self.intercept_, self.coef_))
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2017-12-12 20:18:28 -0500
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2017-12-12 20:18:28 -0500
commit	7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351 (patch)
tree	574f193b67438ba739be0f41af0d89bb0fa56a2c /gensvm/core.py
parent	update library for python package (diff)
download	pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.tar.gz pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.zip