| field | value |
|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com>, 2017-12-12 20:18:28 -0500 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com>, 2017-12-12 20:18:28 -0500 |
| commit | 7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351 |
| tree | 574f193b67438ba739be0f41af0d89bb0fa56a2c /gensvm |
| parent | update library for python package |
| download | pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.tar.gz, pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.zip |
Added gridsearch and extended the GenSVM class
Diffstat (limited to 'gensvm')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gensvm/__init__.py | 3 |
| -rw-r--r-- | gensvm/core.py | 180 |
| -rw-r--r-- | gensvm/gridsearch.py | 575 |
| -rw-r--r-- | gensvm/sklearn_util.py | 212 |
| -rw-r--r-- | gensvm/util.py | 32 |
5 files changed, 956 insertions, 46 deletions
diff --git a/gensvm/__init__.py b/gensvm/__init__.py index 5e1a743..8fe253b 100644 --- a/gensvm/__init__.py +++ b/gensvm/__init__.py @@ -1,3 +1,4 @@ __version__ = '0.1.0' -from .models import GenSVM +from .core import GenSVM +from .gridsearch import GenSVMGridSearchCV diff --git a/gensvm/core.py b/gensvm/core.py index 7594eba..2776ec6 100644 --- a/gensvm/core.py +++ b/gensvm/core.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- -""" +"""Core functionality for fitting the GenSVM classifier + +This module contains the basic definitions to fit a single GenSVM model. + """ from __future__ import print_function, division @@ -8,7 +11,7 @@ from __future__ import print_function, division import numpy as np import warnings -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import ConvergenceWarning, FitFailedWarning from sklearn.preprocessing import LabelEncoder from sklearn.utils import check_X_y, check_random_state @@ -18,8 +21,9 @@ from sklearn.utils.validation import check_is_fitted from . import wrapper -def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, - degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None): +def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma, + coef, degree, kernel_eigen_cutoff, verbose, max_iter, + random_state=None, seed_V=None): # process the random state rnd = check_random_state(random_state) @@ -27,11 +31,14 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, # set the verbosity in GenSVM wrapper.set_verbosity_wrap(verbose) + # convert the weight index + weight_idx = 1 if weights == 'unit' else 2 + # run the actual training raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap( - X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, - degree, kernel_eigen_cutoff, max_iter, - rnd.randint(np.iinfo('i').max)) + X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, + coef, degree, kernel_eigen_cutoff, max_iter, + rnd.randint(np.iinfo('i').max), seed_V) # process output if status_ == 1 and verbose > 0: @@ -49,7 +56,7 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, return coef_, intercept_, n_iter_, n_SV_ -class GenSVM(BaseEstimator): +class GenSVM(BaseEstimator, ClassifierMixin): """Generalized Multiclass Support Vector Machine Classification. This class implements the basic GenSVM classifier. GenSVM is a generalized @@ -57,8 +64,8 @@ class GenSVM(BaseEstimator): errors. It is this flexibility that makes it perform well on diverse datasets. - This methods of this class use the GenSVM C library for the actual - computations. + The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class + use the GenSVM C library for the actual computations. Parameters ---------- @@ -71,16 +78,17 @@ class GenSVM(BaseEstimator): kappa : float, optional (default=0.0) Parameter for the hinge function in the loss function (kappa > -1.0) - weight_idx : int, optional (default=1) - Type of sample weights to use (1 = unit weights, 2 = size correction - weights) + weights: string, optional (default='unit') + Type of sample weights to use. Options are 'unit' for unit weights and + 'group' for group size correction weights (equation 4 in the paper). kernel : string, optional (default='linear') Specify the kernel type to use in the classifier. It must be one of 'linear', 'poly', 'rbf', or 'sigmoid'. 
- gamma : float, optional (default=1.0) - Kernel parameter for the rbf, poly, and sigmoid kernel + gamma : float, optional (default='auto') + Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is + 'auto' then 1/n_features will be used. coef : float, optional (default=0.0) Kernel parameter for the poly and sigmoid kernel @@ -106,9 +114,12 @@ class GenSVM(BaseEstimator): coef_ : array, shape = [n_features, n_classes-1] Weights assigned to the features (coefficients in the primal problem) - intercept_ : array, shape = [n_classes] + intercept_ : array, shape = [n_classes-1] Constants in the decision function + combined_coef_ : array, shape = [n_features+1, n_classes-1] + Combined weights matrix for the seed_V parameter to the fit method + n_iter_ : int The number of iterations that were run during training. @@ -116,23 +127,45 @@ class GenSVM(BaseEstimator): The number of support vectors that were found - References - ---------- - * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized - Multiclass Support Vector Machine. Journal of Machine Learning Research, - 17(225):1--42, 2016. + See Also + -------- + :class:`.GenSVMGridSearchCV`: + Helper class to run an efficient grid search for GenSVM. """ - def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1, - kernel='linear', gamma=1.0, coef=0.0, degree=2.0, - kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, + def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, + weights='unit', kernel='linear', gamma='auto', coef=0.0, + degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, max_iter=1e8): + + if not 1.0 <= p <= 2.0: + raise ValueError("Value for p should be within [1, 2]; got p = %r" + % p) + if not kappa > -1.0: + raise ValueError("Value for kappa should be larger than -1; got " + "kappa = %r" % kappa) + if not lmd > 0: + raise ValueError("Value for lmd should be larger than 0; got " + "lmd = %r" % lmd) + if not epsilon > 0: + raise ValueError("Value for epsilon should be larger than 0; got " + "epsilon = %r" % epsilon) + if gamma == 0.0: + raise ValueError("A gamma value of 0.0 is invalid") + if not weights in ('unit', 'group'): + raise ValueError("Unknown weight parameter specified. Should be " + "'unit' or 'group'; got %r" % weights) + if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'): + raise ValueError("Unknown kernel specified. Should be " + "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel) + + self.p = p self.lmd = lmd self.kappa = kappa self.epsilon = epsilon - self.weight_idx = weight_idx + self.weights = weights self.kernel = kernel self.gamma = gamma self.coef = coef @@ -143,19 +176,42 @@ class GenSVM(BaseEstimator): self.max_iter = max_iter - def fit(self, X, y): - if not 1.0 <= self.p <= 2.0: - raise ValueError("Value for p should be within [1, 2]; got p = %r)" - % self.p) - if not self.kappa > -1.0: - raise ValueError("Value for kappa should be larger than -1; got " - "kappa = %r" % self.kappa) - if not self.lmd > 0: - raise ValueError("Value for lmd should be larger than 0; got " - "lmd = %r" % self.lmd) - if not self.epsilon > 0: - raise ValueError("Value for epsilon should be larger than 0; got " - "epsilon = %r" % self.epsilon) + def fit(self, X, y, seed_V=None): + """Fit the GenSVM model on the given data + + The model can be fit with or without a seed matrix (``seed_V``). This + can be used to provide warm starts for the algorithm. + + Parameters + ---------- + + X : array, shape = (n_observations, n_features) + The input data. 
It is expected that only numeric data is given. + + y : array, shape = (n_observations, ) + The label vector, labels can be numbers or strings. + + seed_V : array, shape = (n_features+1, n_classes-1), optional + Seed coefficient array to use as a warm start for the optimization. + It can for instance be the :attr:`combined_coef_ + <.GenSVM.combined_coef_>` attribute of a different GenSVM model. + This is only supported for the linear kernel. + + NOTE: the size of the seed_V matrix is ``n_features+1`` by + ``n_classes - 1``. The number of columns of ``seed_V`` is leading + for the number of classes in the model. For example, if ``y`` + contains 3 different classes and ``seed_V`` has 3 columns, we + assume that there are actually 4 classes in the problem but one + class is just represented in this training data. This can be useful + for problems were a certain class has only a few samples. + + + Returns + ------- + self : object + Returns self. + + """ X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, order="C") @@ -163,24 +219,52 @@ class GenSVM(BaseEstimator): if y_type not in ["binary", "multiclass"]: raise ValueError("Label type not allowed for GenSVM: %r" % y_type) + if self.gamma == 'auto': + gamma = 1 / X.shape[1] + else: + gamma = self.gamma + # This is necessary because GenSVM expects classes to go from 1 to # n_class self.encoder = LabelEncoder() y = self.encoder.fit_transform(y_org) y += 1 + n_class = len(np.unique(y)) + if not seed_V is None and self.kernel != 'linear': + warnings.warn("Warm starts are only supported for the " + "linear kernel. The seed_V parameter will be ignored.") + seed_V = None + if not seed_V is None: + n_samples, n_features = X.shape + if seed_V.shape[1] + 1 > n_class: + n_class = seed_V.shape[1] + if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 < + n_class): + raise ValueError("Seed V must have shape [%i, %i], " + "but has shape [%i, %i]" % (n_features+1, n_class-1, + seed_V.shape[0], seed_V.shape[1])) + self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \ - _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon, - self.weight_idx, self.kernel, self.gamma, self.coef, - self.degree, self.kernel_eigen_cutoff, self.verbose, - self.max_iter, self.random_state) + _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa, + self.epsilon, self.weights, self.kernel, gamma, + self.coef, self.degree, self.kernel_eigen_cutoff, + self.verbose, self.max_iter, self.random_state, seed_V) return self - def predict(self, X): - check_is_fitted(self, "coef_") + """Predict the class labels on the given data - V = np.vstack((self.intercept_, self.coef_)) + Parameters + ---------- + X : array, shape = [n_samples, n_features] + + Returns + ------- + y_pred : array, shape = (n_samples, ) + + """ + V = self.combined_coef_ predictions = wrapper.predict_wrap(X, V) # Transform the classes back to the original form @@ -188,3 +272,9 @@ class GenSVM(BaseEstimator): outcome = self.encoder.inverse_transform(predictions) return outcome + + @property + def combined_coef_(self): + check_is_fitted(self, "coef_") + check_is_fitted(self, "intercept_") + return np.vstack((self.intercept_, self.coef_)) diff --git a/gensvm/gridsearch.py b/gensvm/gridsearch.py new file mode 100644 index 0000000..6161e0b --- /dev/null +++ b/gensvm/gridsearch.py @@ -0,0 +1,575 @@ +# -*- coding: utf-8 -*- + +"""Functions for doing an efficient GenSVM grid search + +This module contains functions to run a grid search for the GenSVM model. 
This +is implemented in a separate class because it uses the GenSVM C library to do +the actual grid search. The C routines for the grid search use warm starts for +the computations and are therefore more efficient. + +""" + +from __future__ import print_function, division + +import numpy as np +import time + +from operator import itemgetter + +from sklearn.base import ClassifierMixin, BaseEstimator, MetaEstimatorMixin +from sklearn.model_selection import ParameterGrid, check_cv +from sklearn.model_selection._search import _check_param_grid +from sklearn.model_selection._validation import _score +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_X_y +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import indexable + +from . import wrapper +from .core import GenSVM +from .sklearn_util import (_skl_format_cv_results, _skl_check_scorers, + _skl_check_is_fitted, _skl_grid_score) + + +def _sort_candidate_params(candidate_params): + if any(('epsilon' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('epsilon'), reverse=True) + if any(('p' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('p')) + if any(('lmd' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('lmd')) + if any(('kappa' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('kappa')) + if any(('weights' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('weights')) + if any(('gamma' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('gamma')) + if any(('degree' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('degree')) + if any(('coef' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('coef')) + if any(('kernel' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('kernel')) + + +def _validate_param_grid(param_grid): + """Check if the parameter values are valid + + This basically does the same checks as in the constructor of the + :class:`core.GenSVM` class, but for the entire parameter grid. + + """ + # the conditions that the parameters must satisfy + conditions = { + 'p': lambda x : 1.0 <= x <= 2.0, + 'kappa': lambda x : x > -1.0, + 'lmd': lambda x : x > 0, + 'epsilon': lambda x : x > 0, + 'gamma' : lambda x : x != 0, + 'weights' : lambda x : x in ['unit', 'group'], + } + + for param in conditions: + if param in param_grid: + if not all(map(conditions[param], param_grid[param])): + raise ValueError( + "Invalid value in grid for parameter: %s." % (param) + ) + + +class _MockEstimator(ClassifierMixin): + #This mock estimator facilitates the use of the Scorer class of + #Scikit-Learn. Basically, we want to use the _score function of + #sklearn.model_selection._validation, but we don't keep track of the + #individual estimators in the GenSVM C grid search code. With this wrapper + #we can mock an estimator for the _score function. + + #The ClassifierMixin adds the score method to the estimator. This allows us + #to leave scoring=None as the default to the GenSVMGridSearchCV class and + #ends up using the accuracy_score metric. 
+ + def __init__(self, predictions): + self.predictions = predictions + + def predict(self, X): + return self.predictions + + +def _format_results(results, cv_idx, true_y, scorers, iid, + return_train_score=True, + return_n_test_samples=True, + return_times=True, + return_parameters=False): + """Format the results from the grid search + + Parameters + ---------- + + scorer : A single callable or dict mapping scorer name to the callable + If it is a single callable, the return value for ``train_scores`` and + ``test_scores`` is a single float. + + For a dict, it should be one mapping the scorer name to the scorer + callable object / function. + + The callable object / fn should have signature + ``scorer(estimator, X, y)``. + + """ + + out = [] + candidate_params = results['params'] + n_candidates = len(candidate_params) + n_splits = len(np.unique(cv_idx)) + + is_multimetric = not callable(scorers) + + # Out must be a list of dicts of size n_params x n_splits that iterates + # over the params in the list and for each param iterates over the splits. + for param, duration, predictions in zip(results['params'], + results['duration'], results['predictions']): + for test_idx in np.unique(cv_idx): + + ret = [] + score_time = 0 + + if return_train_score: + train_pred = predictions[cv_idx != test_idx, ] + y_train = true_y[cv_idx != test_idx, ] + train_mock = _MockEstimator(train_pred) + start_time = time.time() + train_scores = _score(train_mock, None, y_train, scorers, + is_multimetric) + score_time += time.time() - start_time + ret.append(train_scores) + + test_pred = predictions[cv_idx == test_idx, ] + y_test = true_y[cv_idx == test_idx, ] + test_mock = _MockEstimator(test_pred) + start_time = time.time() + test_scores = _score(test_mock, None, y_test, scorers, + is_multimetric) + score_time += time.time() - start_time + ret.append(test_scores) + + if return_n_test_samples: + ret.append(len(y_test)) + if return_times: + fit_time = duration + ret.extend([fit_time, score_time]) + if return_parameters: + ret.append(param) + + out.append(ret) + + cv_results_ = _skl_format_cv_results(out, return_train_score, + candidate_params, n_candidates, n_splits, scorers, iid) + + return cv_results_ + + +def _fit_grid_gensvm(X, y, groups, candidate_params, scorers, cv, refit, + verbose, return_train_score, iid): + """Utility function for fitting the grid search for GenSVM + + This function sorts the parameter grid for optimal computation speed, sets + the desired verbosity, generates the cross validation splits, and calls the + low-level training routine in the Cython wrapper. + + For parameters, see :class:`.GenSVMGridSearchCV`. + + Returns + ------- + cv_results_ : dict + The cross validation results. See :func:`~GenSVMGridSearchCV.fit`. + + """ + + # sort the candidate params + # the optimal order of the parameters from inner to outer loop is: epsilon, + # p, lambda, kappa, weights, kernel, ??? + _sort_candidate_params(candidate_params) + + # set the verbosity in GenSVM + wrapper.set_verbosity_wrap(verbose) + + # NOTE: The C library can compute the accuracy score and destroy the exact + # predictions, but this doesn't allow us to compute the score per fold. So + # we always want to get the raw predictions for each grid point. 
+ store_predictions = True + + # Convert the cv variable to a cv_idx array + cv = check_cv(cv, y, classifier=True) + n_folds = cv.get_n_splits(X, y, groups) + cv_idx = np.zeros((X.shape[0], ), dtype=np.int_) - 1 + fold_idx = 0 + for train, test in cv.split(X, y, groups): + cv_idx[test, ] = fold_idx + fold_idx += 1 + + results_ = wrapper.grid_wrap( + X, + y, + candidate_params, + int(store_predictions), + cv_idx, + int(n_folds), + ) + cv_results_ = _format_results(results_, cv_idx, y, scorers, iid) + + return cv_results_, n_folds + + +class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): + """GenSVM cross validated grid search + + This class implements efficient GenSVM grid search with cross validation. + One of the strong features of GenSVM is that seeding the classifier + properly can greatly reduce total training time. This class ensures that + the grid search is done in the most efficient way possible. + + The implementation of this class is based on the `GridSearchCV`_ class in + scikit-learn. The documentation of the various parameters is therefore + mostly the same. This is done to provide the user with a familiar and + easy-to-use interface to doing a grid search with GenSVM. A separate class + was needed to benefit from the fast low-level C implementation of grid + search in the GenSVM library. + + Parameters + ---------- + param_grid : dict or list of dicts + Dictionary of parameter names (strings) as keys and lists of parameter + settings to evaluate as values, or a list of such dicts. The GenSVM + model will be evaluated at all combinations of the parameters. + + scoring : string, callable, list/tuple, dict or None + A single string (see :ref:`scoring_parameter`) or a callable (see + :ref:`scoring`) to evaluate the predictions on the test set. + + For evaluating multiple metrics, either give a list of (unique) strings + or a dict with names as keys and callables as values. + + NOTE that when using custom scorers, each scorer should return a single + value. Metric functions returning a list/array of values can be wrapped + into multiple scorers that return one value each. + + If None, the `accuracy_score`_ is used. + + iid : boolean, default=True + If True, the data is assumed to be identically distributed across the + folds, and the loss minimized is the total loss per sample and not the + mean loss across the folds. + + cv : int, cross-validation generator or an iterable, optional + Determines the cross-validation splitting strategy. Possible inputs for + cv are: + + - None, to use the default 3-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - An object to be used as a cross-validation generator. + - An iterable yielding train, test splits. + + For integer/None inputs, :class:`StratifiedKFold + <sklearn.model_selection.StratifiedKFold>` is used. In all other + cases, :class:`KFold <sklearn.model_selection.KFold>` is used. + + Refer to the `scikit-learn User Guide on cross validation`_ for the + various strategies that can be used here. + + refit : boolean, or string, default=True + Refit the GenSVM estimator with the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a string denoting the + scorer to be used to find the best parameters for refitting the + estimator at the end. 
+ + The refitted estimator is made available at the `:attr:best_estimator_ + <.GenSVMGridSearchCV.best_estimator_>` attribute and allows the user to + use the :func:`~GenSVMGridSearchCV.predict` method directly on this + :class:`.GenSVMGridSearchCV` instance. + + Also for multiple metric evaluation, the attributes :attr:`best_index_ + <.GenSVMGridSearchCV.best_index_>`, :attr:`best_score_ + <.GenSVMGridSearchCV.best_score_>` and :attr:`best_params_ + <.GenSVMGridSearchCV:best_params_>` will only be available if ``refit`` + is set and all of them will be determined w.r.t this specific scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + verbose : integer + Controls the verbosity: the higher, the more messages. + + return_train_score : boolean, default=True + If ``False``, the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>` + attribute will not include training scores. + + Examples + -------- + >>> from gensvm import GenSVMGridSearchCV + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> param_grid = {'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]} + >>> clf = GenSVMGridSearchCV(param_grid) + >>> clf.fit(iris.data, iris.target) + GenSVMGridSearchCV(cv=None, iid=True, + param_grid={'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]}, + refit=True, return_train_score=True, scoring=None, verbose=0) + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas `DataFrame`_. + + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.8 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.7 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.8 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.9 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.8, 0.7, 0.8, 0.9], + 'split1_test_score' : [0.82, 0.5, 0.7, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.82], + 'std_test_score' : [0.02, 0.01, 0.03, 0.03], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.8, 0.9, 0.7], + 'split1_train_score' : [0.82, 0.5, 0.7], + 'mean_train_score' : [0.81, 0.7, 0.7], + 'std_train_score' : [0.03, 0.03, 0.04], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], + 'std_score_time' : [0.001, 0.002, 0.003, 0.005], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE: + + The key ``'params'`` is used to store a list of parameter settings + dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. 
+ + For multi-metric evaluation, the scores for all the scorers are + available in the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>` + dict at the keys ending with that scorer's name (``'_<scorer_name>'``) + instead of ``'_score'`` shown above. ('split0_test_precision', + 'mean_train_precision' etc.) + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator which gave + highest score (or smallest loss if specified) on the left out data. Not + available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest mean + score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best parameters + for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + Notes + ----- + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + See Also + -------- + `ParameterGrid`_: + Generates all the combinations of a hyperparameter grid. + + :class:`.GenSVM`: + The GenSVM classifier + + .. _GridSearchCV: + http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html + .. _accuracy_score: + http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html + .. _scikit-learn User Guide on cross validation: + http://scikit-learn.org/stable/modules/cross_validation.html + + .. _ParameterGrid: + http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html + .. _DataFrame: + https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html + """ + + def __init__(self, param_grid, scoring=None, iid=True, cv=None, refit=True, + verbose=0, return_train_score=True): + + self.param_grid = param_grid + _check_param_grid(self.param_grid) + _validate_param_grid(self.param_grid) + + self.scoring = scoring + self.cv = cv + self.refit = refit + self.verbose = verbose + self.return_train_score = return_train_score + self.iid = iid + + def _get_param_iterator(self): + return ParameterGrid(self.param_grid) + + + def fit(self, X, y, groups=None): + """Run GenSVM grid search with all sets of parameters + + Parameters + ---------- + + X : array-like, shape = (n_samples, n_features) + Training data, where n_samples is the number of observations and + n_features is the number of features. + + y : array-like, shape = (n_samples, ) + Target vector for the training data. + + groups : array-like, with shape (n_samples, ), optional + Group labels for the samples used while splitting the dataset into + train/test sets. 
+ + Returns + ------- + self : object + Return self. + + """ + + X, y_orig = check_X_y(X, y, accept_sparse=False, dtype=np.float64, + order="C") + + y_type = type_of_target(y_orig) + if y_type not in ["binary", "multiclass"]: + raise ValueError("Label type not allowed for GenSVM: %r" % y_type) + + # This is necessary because GenSVM expects classes to go from 1 to + # n_class + self.encoder = LabelEncoder() + y = self.encoder.fit_transform(y_orig) + y += 1 + + candidate_params = list(self._get_param_iterator()) + + scorers, self.multimetric_, refit_metric = _skl_check_scorers( + self.scoring, self.refit) + + X, y, groups = indexable(X, y, groups) + + results, n_splits = _fit_grid_gensvm(X, y, groups, candidate_params, + scorers, self.cv, self.refit, self.verbose, + self.return_train_score, self.iid) + + self.cv_results_ = results + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if self.refit or not self.multimetric_: + self.best_index_ = results["rank_test_%s" % refit_metric].argmin() + self.best_params_ = candidate_params[self.best_index_] + self.best_score_ = results["mean_test_%s" % refit_metric][ + self.best_index_] + + if self.refit: + self.best_estimator_ = GenSVM(**self.best_params_) + # y_orig because GenSVM fit must know the conversion for predict to + # work correctly + self.best_estimator_.fit(X, y_orig) + + ## Store the only scorer not as a dict for single metric evaluation + self.scorer_ = scorers if self.multimetric_ else scorers['score'] + + self.cv_results_ = results + self.n_splits_ = n_splits + + return self + + + def score(self, X, y): + """Compute the score on the test data given the true labels + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test data, where n_samples is the number of observations and + n_features is the number of features. + + y : array-like, shape = (n_samples, ) + True labels for the test data. + + + Returns + ------- + score : float + + """ + _skl_check_is_fitted(self, 'score', self.refit) + return _skl_grid_score(X, y, self.scorer_, self.best_estimator_, + self.refit, self.multimetric_) + + def predict(self, X): + """Predict the class labels on the test data + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test data, where n_samples is the number of observations and + n_features is the number of features. + + Returns + ------- + y_pred : array-like, shape = (n_samples, ) + Predicted class labels of the data in X. + + """ + _skl_check_is_fitted(self, 'predict', self.refit) + return self.best_estimator_.predict(X) diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py new file mode 100644 index 0000000..9c46e0e --- /dev/null +++ b/gensvm/sklearn_util.py @@ -0,0 +1,212 @@ +"""Functions in GenSVM that are taken from Scikit-Learn + +The GenSVM Python package is designed to work in the same way as Scikit-Learn +classifiers, as this makes it easier for people familiar with Scikit-Learn to +use GenSVM. As such, some of the functionality of the GenSVM Python package is +similar to code in the Scikit-Learn package (such as formatting the grid search +results). To keep a clean separation between code from Scikit-Learn (which is +licensed under the BSD license) and code written by the author(s) of the GenSVM +package, the code from scikit-learn is placed here in explicit self-contained +functions. 
To comply with clause a of the BSD license, it is repeated below as +required. + +""" + +import numpy as np + +from collections import defaultdict +from functools import partial + +from .core import GenSVM +from .util import get_ranks + + +# BEGIN SCIKIT LEARN CODE + +""" + +New BSD License + +Copyright (c) 2007–2017 The scikit-learn developers. +All rights reserved. + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of the Scikit-learn Developers nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. + +""" + +from sklearn.exceptions import NotFittedError +from sklearn.externals import six +from sklearn.metrics.scorer import _check_multimetric_scoring +from sklearn.model_selection._validation import _aggregate_score_dicts +from sklearn.utils.fixes import MaskedArray +from sklearn.utils.validation import check_is_fitted + +def _skl_format_cv_results(out, return_train_score, candidate_params, + n_candidates, n_splits, scorers, iid): + + # if one choose to see train score, "out" will contain train score info + if return_train_score: + (train_score_dicts, test_score_dicts, test_sample_counts, fit_time, + score_time) = zip(*out) + else: + (test_score_dicts, test_sample_counts, fit_time, + score_time) = zip(*out) + + # test_score_dicts and train_score dicts are lists of dictionaries and + # we make them into dict of lists + test_scores = _aggregate_score_dicts(test_score_dicts) + if return_train_score: + train_scores = _aggregate_score_dicts(train_score_dicts) + + results = dict() + + def _store(key_name, array, weights=None, splits=False, rank=False): + """A small helper to store the scores/times to the cv_results_""" + # When iterated first by splits, then by parameters + # We want `array` to have `n_candidates` rows and `n_splits` cols. 
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, + n_splits) + if splits: + for split_i in range(n_splits): + # Uses closure to alter the results + results["split%d_%s" + % (split_i, key_name)] = array[:, split_i] + + array_means = np.average(array, axis=1, weights=weights) + results['mean_%s' % key_name] = array_means + # Weighted std is not directly available in numpy + array_stds = np.sqrt(np.average((array - + array_means[:, np.newaxis]) ** 2, + axis=1, weights=weights)) + results['std_%s' % key_name] = array_stds + + if rank: + results["rank_%s" % key_name] = np.asarray( + get_ranks(-array_means), dtype=np.int32) + + _store('fit_time', fit_time) + _store('score_time', score_time) + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate. Use defaultdict as each candidate may + # not contain all the params + param_results = defaultdict(partial(MaskedArray, + np.empty(n_candidates,), + mask=True, + dtype=object)) + for cand_i, params in enumerate(candidate_params): + for name, value in params.items(): + # An all masked empty array gets created for the key + # `"param_%s" % name` at the first occurence of `name`. + # Setting the value at an index also unmasks that index + param_results["param_%s" % name][cand_i] = value + + results.update(param_results) + # Store a list of param dicts at the key 'params' + results['params'] = candidate_params + + # NOTE test_sample counts (weights) remain the same for all candidates + test_sample_counts = np.array(test_sample_counts[:n_splits], + dtype=np.int) + for scorer_name in scorers.keys(): + # Computed the (weighted) mean and std for test scores alone + _store('test_%s' % scorer_name, test_scores[scorer_name], + splits=True, rank=True, + weights=test_sample_counts if iid else None) + if return_train_score: + _store('train_%s' % scorer_name, train_scores[scorer_name], + splits=True) + + return results + + +def _skl_check_scorers(scoring, refit): + + scorers, multimetric_ = _check_multimetric_scoring( + GenSVM(), scoring=scoring) + if multimetric_: + if refit is not False and ( + not isinstance(refit, six.string_types) or + # This will work for both dict / list (tuple) + refit not in scorers): + raise ValueError("For multi-metric scoring, the parameter " + "refit must be set to a scorer key " + "to refit an estimator with the best " + "parameter setting on the whole data and " + "make the best_* attributes " + "available for that metric. If this is not " + "needed, refit should be set to False " + "explicitly. %r was passed." % refit) + else: + refit_metric = refit + else: + refit_metric = 'score' + + return scorers, multimetric_, refit_metric + + +def _skl_check_is_fitted(estimator, method_name, refit): + if not refit: + raise NotFittedError('This %s instance was initialized ' + 'with refit=False. %s is ' + 'available only after refitting on the best ' + 'parameters. You can refit an estimator ' + 'manually using the ``best_parameters_`` ' + 'attribute' + % (type(estimator).__name__, method_name)) + else: + check_is_fitted(estimator, 'best_estimator_') + + + +def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_): + """Returns the score on the given data, if the estimator has been + refit. + + This uses the score defined by ``scoring`` where provided, and the + ``best_estimator_.score`` method otherwise. 
+ + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Input data, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + Returns + ------- + score : float + """ + if scorer_ is None: + raise ValueError("No score function explicitly defined, " + "and the estimator doesn't provide one %s" + % best_estimator_) + score = scorer_[refit] if multimetric_ else scorer_ + return score(best_estimator_, X, y) diff --git a/gensvm/util.py b/gensvm/util.py new file mode 100644 index 0000000..1e79d75 --- /dev/null +++ b/gensvm/util.py @@ -0,0 +1,32 @@ +""" +Utility functions for GenSVM + +""" + + +import numpy as np + + +def get_ranks(x): + """ + Rank data in an array. Low values get a small rank number. Ties are broken + by assigning the lowest value. + + Examples + -------- + >>> x = [7, 0.1, 0.5, 0.1, 10, 100, 200] + >>> get_ranks(x) + [4, 1, 3, 1, 5, 6, 7] + + """ + x = np.ravel(np.asarray(x)) + l = len(x) + r = 1 + ranks = np.zeros((l, )) + while not all([k is None for k in x]): + m = min([k for k in x if not k is None]) + idx = [1 if k == m else 0 for k in x] + ranks = [r if idx[k] else ranks[k] for k in range(l)] + r += sum(idx) + x = [None if idx[k] else x[k] for k in range(l)] + return ranks |
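
For reference, a minimal usage sketch of the API added in this commit, pieced together from the docstrings in the diff above. The iris dataset and the specific parameter values are only illustrative, and the snippet has not been run against the package:

```python
from sklearn.datasets import load_iris

from gensvm import GenSVM, GenSVMGridSearchCV

iris = load_iris()
X, y = iris.data, iris.target

# Fit a single GenSVM model using the new 'weights' parameter and inspect
# the combined_coef_ property introduced in this commit.
clf = GenSVM(p=1.5, kappa=0.5, weights='unit')
clf.fit(X, y)
print(clf.predict(X[:5]))

# Warm-start a second model through the new seed_V argument of fit()
# (only supported for the linear kernel, per the docstring above).
clf2 = GenSVM(p=2.0)
clf2.fit(X, y, seed_V=clf.combined_coef_)

# Run the cross-validated grid search added in gensvm/gridsearch.py.
param_grid = {'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]}
grid = GenSVMGridSearchCV(param_grid)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)
print(grid.predict(X[:5]))
```

GenSVMGridSearchCV mirrors scikit-learn's GridSearchCV interface, but delegates the loop over grid points to the GenSVM C library so that warm starts between parameter settings can be exploited.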