| field | value |
|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com>, 2017-12-12 20:18:28 -0500 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com>, 2017-12-12 20:18:28 -0500 |
| commit | 7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351 |
| tree | 574f193b67438ba739be0f41af0d89bb0fa56a2c /gensvm |
| parent | update library for python package |
| download | pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.tar.gz, pygensvm-7ed6c4ac3ea5c409c073f1db3e62d989ffe5f351.zip |
Added gridsearch and extended the GenSVM class
Diffstat (limited to 'gensvm')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gensvm/__init__.py | 3 |
| -rw-r--r-- | gensvm/core.py | 180 |
| -rw-r--r-- | gensvm/gridsearch.py | 575 |
| -rw-r--r-- | gensvm/sklearn_util.py | 212 |
| -rw-r--r-- | gensvm/util.py | 32 |
5 files changed, 956 insertions, 46 deletions
diff --git a/gensvm/__init__.py b/gensvm/__init__.py index 5e1a743..8fe253b 100644 --- a/gensvm/__init__.py +++ b/gensvm/__init__.py @@ -1,3 +1,4 @@ __version__ = '0.1.0' -from .models import GenSVM +from .core import GenSVM +from .gridsearch import GenSVMGridSearchCV diff --git a/gensvm/core.py b/gensvm/core.py index 7594eba..2776ec6 100644 --- a/gensvm/core.py +++ b/gensvm/core.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- -""" +"""Core functionality for fitting the GenSVM classifier + +This module contains the basic definitions to fit a single GenSVM model. + """ from __future__ import print_function, division @@ -8,7 +11,7 @@ from __future__ import print_function, division import numpy as np import warnings -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import ConvergenceWarning, FitFailedWarning from sklearn.preprocessing import LabelEncoder from sklearn.utils import check_X_y, check_random_state @@ -18,8 +21,9 @@ from sklearn.utils.validation import check_is_fitted from . import wrapper -def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, - degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None): +def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma, + coef, degree, kernel_eigen_cutoff, verbose, max_iter, + random_state=None, seed_V=None): # process the random state rnd = check_random_state(random_state) @@ -27,11 +31,14 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, # set the verbosity in GenSVM wrapper.set_verbosity_wrap(verbose) + # convert the weight index + weight_idx = 1 if weights == 'unit' else 2 + # run the actual training raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap( - X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, - degree, kernel_eigen_cutoff, max_iter, - rnd.randint(np.iinfo('i').max)) + X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, + coef, degree, kernel_eigen_cutoff, max_iter, + rnd.randint(np.iinfo('i').max), seed_V) # process output if status_ == 1 and verbose > 0: @@ -49,7 +56,7 @@ def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, return coef_, intercept_, n_iter_, n_SV_ -class GenSVM(BaseEstimator): +class GenSVM(BaseEstimator, ClassifierMixin): """Generalized Multiclass Support Vector Machine Classification. This class implements the basic GenSVM classifier. GenSVM is a generalized @@ -57,8 +64,8 @@ class GenSVM(BaseEstimator): errors. It is this flexibility that makes it perform well on diverse datasets. - This methods of this class use the GenSVM C library for the actual - computations. + The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class + use the GenSVM C library for the actual computations. Parameters ---------- @@ -71,16 +78,17 @@ class GenSVM(BaseEstimator): kappa : float, optional (default=0.0) Parameter for the hinge function in the loss function (kappa > -1.0) - weight_idx : int, optional (default=1) - Type of sample weights to use (1 = unit weights, 2 = size correction - weights) + weights: string, optional (default='unit') + Type of sample weights to use. Options are 'unit' for unit weights and + 'group' for group size correction weights (equation 4 in the paper). kernel : string, optional (default='linear') Specify the kernel type to use in the classifier. It must be one of 'linear', 'poly', 'rbf', or 'sigmoid'. 
- gamma : float, optional (default=1.0) - Kernel parameter for the rbf, poly, and sigmoid kernel + gamma : float, optional (default='auto') + Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is + 'auto' then 1/n_features will be used. coef : float, optional (default=0.0) Kernel parameter for the poly and sigmoid kernel @@ -106,9 +114,12 @@ class GenSVM(BaseEstimator): coef_ : array, shape = [n_features, n_classes-1] Weights assigned to the features (coefficients in the primal problem) - intercept_ : array, shape = [n_classes] + intercept_ : array, shape = [n_classes-1] Constants in the decision function + combined_coef_ : array, shape = [n_features+1, n_classes-1] + Combined weights matrix for the seed_V parameter to the fit method + n_iter_ : int The number of iterations that were run during training. @@ -116,23 +127,45 @@ class GenSVM(BaseEstimator): The number of support vectors that were found - References - ---------- - * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized - Multiclass Support Vector Machine. Journal of Machine Learning Research, - 17(225):1--42, 2016. + See Also + -------- + :class:`.GenSVMGridSearchCV`: + Helper class to run an efficient grid search for GenSVM. """ - def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1, - kernel='linear', gamma=1.0, coef=0.0, degree=2.0, - kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, + def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, + weights='unit', kernel='linear', gamma='auto', coef=0.0, + degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, max_iter=1e8): + + if not 1.0 <= p <= 2.0: + raise ValueError("Value for p should be within [1, 2]; got p = %r" + % p) + if not kappa > -1.0: + raise ValueError("Value for kappa should be larger than -1; got " + "kappa = %r" % kappa) + if not lmd > 0: + raise ValueError("Value for lmd should be larger than 0; got " + "lmd = %r" % lmd) + if not epsilon > 0: + raise ValueError("Value for epsilon should be larger than 0; got " + "epsilon = %r" % epsilon) + if gamma == 0.0: + raise ValueError("A gamma value of 0.0 is invalid") + if not weights in ('unit', 'group'): + raise ValueError("Unknown weight parameter specified. Should be " + "'unit' or 'group'; got %r" % weights) + if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'): + raise ValueError("Unknown kernel specified. Should be " + "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel) + + self.p = p self.lmd = lmd self.kappa = kappa self.epsilon = epsilon - self.weight_idx = weight_idx + self.weights = weights self.kernel = kernel self.gamma = gamma self.coef = coef @@ -143,19 +176,42 @@ class GenSVM(BaseEstimator): self.max_iter = max_iter - def fit(self, X, y): - if not 1.0 <= self.p <= 2.0: - raise ValueError("Value for p should be within [1, 2]; got p = %r)" - % self.p) - if not self.kappa > -1.0: - raise ValueError("Value for kappa should be larger than -1; got " - "kappa = %r" % self.kappa) - if not self.lmd > 0: - raise ValueError("Value for lmd should be larger than 0; got " - "lmd = %r" % self.lmd) - if not self.epsilon > 0: - raise ValueError("Value for epsilon should be larger than 0; got " - "epsilon = %r" % self.epsilon) + def fit(self, X, y, seed_V=None): + """Fit the GenSVM model on the given data + + The model can be fit with or without a seed matrix (``seed_V``). This + can be used to provide warm starts for the algorithm. + + Parameters + ---------- + + X : array, shape = (n_observations, n_features) + The input data. 
It is expected that only numeric data is given. + + y : array, shape = (n_observations, ) + The label vector, labels can be numbers or strings. + + seed_V : array, shape = (n_features+1, n_classes-1), optional + Seed coefficient array to use as a warm start for the optimization. + It can for instance be the :attr:`combined_coef_ + <.GenSVM.combined_coef_>` attribute of a different GenSVM model. + This is only supported for the linear kernel. + + NOTE: the size of the seed_V matrix is ``n_features+1`` by + ``n_classes - 1``. The number of columns of ``seed_V`` is leading + for the number of classes in the model. For example, if ``y`` + contains 3 different classes and ``seed_V`` has 3 columns, we + assume that there are actually 4 classes in the problem but one + class is just represented in this training data. This can be useful + for problems were a certain class has only a few samples. + + + Returns + ------- + self : object + Returns self. + + """ X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, order="C") @@ -163,24 +219,52 @@ class GenSVM(BaseEstimator): if y_type not in ["binary", "multiclass"]: raise ValueError("Label type not allowed for GenSVM: %r" % y_type) + if self.gamma == 'auto': + gamma = 1 / X.shape[1] + else: + gamma = self.gamma + # This is necessary because GenSVM expects classes to go from 1 to # n_class self.encoder = LabelEncoder() y = self.encoder.fit_transform(y_org) y += 1 + n_class = len(np.unique(y)) + if not seed_V is None and self.kernel != 'linear': + warnings.warn("Warm starts are only supported for the " + "linear kernel. The seed_V parameter will be ignored.") + seed_V = None + if not seed_V is None: + n_samples, n_features = X.shape + if seed_V.shape[1] + 1 > n_class: + n_class = seed_V.shape[1] + if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 < + n_class): + raise ValueError("Seed V must have shape [%i, %i], " + "but has shape [%i, %i]" % (n_features+1, n_class-1, + seed_V.shape[0], seed_V.shape[1])) + self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \ - _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon, - self.weight_idx, self.kernel, self.gamma, self.coef, - self.degree, self.kernel_eigen_cutoff, self.verbose, - self.max_iter, self.random_state) + _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa, + self.epsilon, self.weights, self.kernel, gamma, + self.coef, self.degree, self.kernel_eigen_cutoff, + self.verbose, self.max_iter, self.random_state, seed_V) return self - def predict(self, X): - check_is_fitted(self, "coef_") + """Predict the class labels on the given data - V = np.vstack((self.intercept_, self.coef_)) + Parameters + ---------- + X : array, shape = [n_samples, n_features] + + Returns + ------- + y_pred : array, shape = (n_samples, ) + + """ + V = self.combined_coef_ predictions = wrapper.predict_wrap(X, V) # Transform the classes back to the original form @@ -188,3 +272,9 @@ class GenSVM(BaseEstimator): outcome = self.encoder.inverse_transform(predictions) return outcome + + @property + def combined_coef_(self): + check_is_fitted(self, "coef_") + check_is_fitted(self, "intercept_") + return np.vstack((self.intercept_, self.coef_)) diff --git a/gensvm/gridsearch.py b/gensvm/gridsearch.py new file mode 100644 index 0000000..6161e0b --- /dev/null +++ b/gensvm/gridsearch.py @@ -0,0 +1,575 @@ +# -*- coding: utf-8 -*- + +"""Functions for doing an efficient GenSVM grid search + +This module contains functions to run a grid search for the GenSVM model. 
This +is implemented in a separate class because it uses the GenSVM C library to do +the actual grid search. The C routines for the grid search use warm starts for +the computations and are therefore more efficient. + +""" + +from __future__ import print_function, division + +import numpy as np +import time + +from operator import itemgetter + +from sklearn.base import ClassifierMixin, BaseEstimator, MetaEstimatorMixin +from sklearn.model_selection import ParameterGrid, check_cv +from sklearn.model_selection._search import _check_param_grid +from sklearn.model_selection._validation import _score +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_X_y +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import indexable + +from . import wrapper +from .core import GenSVM +from .sklearn_util import (_skl_format_cv_results, _skl_check_scorers, + _skl_check_is_fitted, _skl_grid_score) + + +def _sort_candidate_params(candidate_params): + if any(('epsilon' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('epsilon'), reverse=True) + if any(('p' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('p')) + if any(('lmd' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('lmd')) + if any(('kappa' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('kappa')) + if any(('weights' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('weights')) + if any(('gamma' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('gamma')) + if any(('degree' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('degree')) + if any(('coef' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('coef')) + if any(('kernel' in p for p in candidate_params)): + candidate_params.sort(key=itemgetter('kernel')) + + +def _validate_param_grid(param_grid): + """Check if the parameter values are valid + + This basically does the same checks as in the constructor of the + :class:`core.GenSVM` class, but for the entire parameter grid. + + """ + # the conditions that the parameters must satisfy + conditions = { + 'p': lambda x : 1.0 <= x <= 2.0, + 'kappa': lambda x : x > -1.0, + 'lmd': lambda x : x > 0, + 'epsilon': lambda x : x > 0, + 'gamma' : lambda x : x != 0, + 'weights' : lambda x : x in ['unit', 'group'], + } + + for param in conditions: + if param in param_grid: + if not all(map(conditions[param], param_grid[param])): + raise ValueError( + "Invalid value in grid for parameter: %s." % (param) + ) + + +class _MockEstimator(ClassifierMixin): + #This mock estimator facilitates the use of the Scorer class of + #Scikit-Learn. Basically, we want to use the _score function of + #sklearn.model_selection._validation, but we don't keep track of the + #individual estimators in the GenSVM C grid search code. With this wrapper + #we can mock an estimator for the _score function. + + #The ClassifierMixin adds the score method to the estimator. This allows us + #to leave scoring=None as the default to the GenSVMGridSearchCV class and + #ends up using the accuracy_score metric. 
+ + def __init__(self, predictions): + self.predictions = predictions + + def predict(self, X): + return self.predictions + + +def _format_results(results, cv_idx, true_y, scorers, iid, + return_train_score=True, + return_n_test_samples=True, + return_times=True, + return_parameters=False): + """Format the results from the grid search + + Parameters + ---------- + + scorer : A single callable or dict mapping scorer name to the callable + If it is a single callable, the return value for ``train_scores`` and + ``test_scores`` is a single float. + + For a dict, it should be one mapping the scorer name to the scorer + callable object / function. + + The callable object / fn should have signature + ``scorer(estimator, X, y)``. + + """ + + out = [] + candidate_params = results['params'] + n_candidates = len(candidate_params) + n_splits = len(np.unique(cv_idx)) + + is_multimetric = not callable(scorers) + + # Out must be a list of dicts of size n_params x n_splits that iterates + # over the params in the list and for each param iterates over the splits. + for param, duration, predictions in zip(results['params'], + results['duration'], results['predictions']): + for test_idx in np.unique(cv_idx): + + ret = [] + score_time = 0 + + if return_train_score: + train_pred = predictions[cv_idx != test_idx, ] + y_train = true_y[cv_idx != test_idx, ] + train_mock = _MockEstimator(train_pred) + start_time = time.time() + train_scores = _score(train_mock, None, y_train, scorers, + is_multimetric) + score_time += time.time() - start_time + ret.append(train_scores) + + test_pred = predictions[cv_idx == test_idx, ] + y_test = true_y[cv_idx == test_idx, ] + test_mock = _MockEstimator(test_pred) + start_time = time.time() + test_scores = _score(test_mock, None, y_test, scorers, + is_multimetric) + score_time += time.time() - start_time + ret.append(test_scores) + + if return_n_test_samples: + ret.append(len(y_test)) + if return_times: + fit_time = duration + ret.extend([fit_time, score_time]) + if return_parameters: + ret.append(param) + + out.append(ret) + + cv_results_ = _skl_format_cv_results(out, return_train_score, + candidate_params, n_candidates, n_splits, scorers, iid) + + return cv_results_ + + +def _fit_grid_gensvm(X, y, groups, candidate_params, scorers, cv, refit, + verbose, return_train_score, iid): + """Utility function for fitting the grid search for GenSVM + + This function sorts the parameter grid for optimal computation speed, sets + the desired verbosity, generates the cross validation splits, and calls the + low-level training routine in the Cython wrapper. + + For parameters, see :class:`.GenSVMGridSearchCV`. + + Returns + ------- + cv_results_ : dict + The cross validation results. See :func:`~GenSVMGridSearchCV.fit`. + + """ + + # sort the candidate params + # the optimal order of the parameters from inner to outer loop is: epsilon, + # p, lambda, kappa, weights, kernel, ??? + _sort_candidate_params(candidate_params) + + # set the verbosity in GenSVM + wrapper.set_verbosity_wrap(verbose) + + # NOTE: The C library can compute the accuracy score and destroy the exact + # predictions, but this doesn't allow us to compute the score per fold. So + # we always want to get the raw predictions for each grid point. 
+ store_predictions = True + + # Convert the cv variable to a cv_idx array + cv = check_cv(cv, y, classifier=True) + n_folds = cv.get_n_splits(X, y, groups) + cv_idx = np.zeros((X.shape[0], ), dtype=np.int_) - 1 + fold_idx = 0 + for train, test in cv.split(X, y, groups): + cv_idx[test, ] = fold_idx + fold_idx += 1 + + results_ = wrapper.grid_wrap( + X, + y, + candidate_params, + int(store_predictions), + cv_idx, + int(n_folds), + ) + cv_results_ = _format_results(results_, cv_idx, y, scorers, iid) + + return cv_results_, n_folds + + +class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): + """GenSVM cross validated grid search + + This class implements efficient GenSVM grid search with cross validation. + One of the strong features of GenSVM is that seeding the classifier + properly can greatly reduce total training time. This class ensures that + the grid search is done in the most efficient way possible. + + The implementation of this class is based on the `GridSearchCV`_ class in + scikit-learn. The documentation of the various parameters is therefore + mostly the same. This is done to provide the user with a familiar and + easy-to-use interface to doing a grid search with GenSVM. A separate class + was needed to benefit from the fast low-level C implementation of grid + search in the GenSVM library. + + Parameters + ---------- + param_grid : dict or list of dicts + Dictionary of parameter names (strings) as keys and lists of parameter + settings to evaluate as values, or a list of such dicts. The GenSVM + model will be evaluated at all combinations of the parameters. + + scoring : string, callable, list/tuple, dict or None + A single string (see :ref:`scoring_parameter`) or a callable (see + :ref:`scoring`) to evaluate the predictions on the test set. + + For evaluating multiple metrics, either give a list of (unique) strings + or a dict with names as keys and callables as values. + + NOTE that when using custom scorers, each scorer should return a single + value. Metric functions returning a list/array of values can be wrapped + into multiple scorers that return one value each. + + If None, the `accuracy_score`_ is used. + + iid : boolean, default=True + If True, the data is assumed to be identically distributed across the + folds, and the loss minimized is the total loss per sample and not the + mean loss across the folds. + + cv : int, cross-validation generator or an iterable, optional + Determines the cross-validation splitting strategy. Possible inputs for + cv are: + + - None, to use the default 3-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - An object to be used as a cross-validation generator. + - An iterable yielding train, test splits. + + For integer/None inputs, :class:`StratifiedKFold + <sklearn.model_selection.StratifiedKFold>` is used. In all other + cases, :class:`KFold <sklearn.model_selection.KFold>` is used. + + Refer to the `scikit-learn User Guide on cross validation`_ for the + various strategies that can be used here. + + refit : boolean, or string, default=True + Refit the GenSVM estimator with the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a string denoting the + scorer to be used to find the best parameters for refitting the + estimator at the end. 
+ + The refitted estimator is made available at the `:attr:best_estimator_ + <.GenSVMGridSearchCV.best_estimator_>` attribute and allows the user to + use the :func:`~GenSVMGridSearchCV.predict` method directly on this + :class:`.GenSVMGridSearchCV` instance. + + Also for multiple metric evaluation, the attributes :attr:`best_index_ + <.GenSVMGridSearchCV.best_index_>`, :attr:`best_score_ + <.GenSVMGridSearchCV.best_score_>` and :attr:`best_params_ + <.GenSVMGridSearchCV:best_params_>` will only be available if ``refit`` + is set and all of them will be determined w.r.t this specific scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + + verbose : integer + Controls the verbosity: the higher, the more messages. + + return_train_score : boolean, default=True + If ``False``, the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>` + attribute will not include training scores. + + Examples + -------- + >>> from gensvm import GenSVMGridSearchCV + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> param_grid = {'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]} + >>> clf = GenSVMGridSearchCV(param_grid) + >>> clf.fit(iris.data, iris.target) + GenSVMGridSearchCV(cv=None, iid=True, + param_grid={'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]}, + refit=True, return_train_score=True, scoring=None, verbose=0) + + Attributes + ---------- + cv_results_ : dict of numpy (masked) ndarrays + A dict with keys as column headers and values as columns, that can be + imported into a pandas `DataFrame`_. + + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.8 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.7 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.8 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.9 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.8, 0.7, 0.8, 0.9], + 'split1_test_score' : [0.82, 0.5, 0.7, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.82], + 'std_test_score' : [0.02, 0.01, 0.03, 0.03], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.8, 0.9, 0.7], + 'split1_train_score' : [0.82, 0.5, 0.7], + 'mean_train_score' : [0.81, 0.7, 0.7], + 'std_train_score' : [0.03, 0.03, 0.04], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], + 'std_score_time' : [0.001, 0.002, 0.003, 0.005], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE: + + The key ``'params'`` is used to store a list of parameter settings + dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. 
+ + For multi-metric evaluation, the scores for all the scorers are + available in the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>` + dict at the keys ending with that scorer's name (``'_<scorer_name>'``) + instead of ``'_score'`` shown above. ('split0_test_precision', + 'mean_train_precision' etc.) + + best_estimator_ : estimator or dict + Estimator that was chosen by the search, i.e. estimator which gave + highest score (or smallest loss if specified) on the left out data. Not + available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + best_score_ : float + Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_params_ : dict + Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + best_index_ : int + The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest mean + score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + scorer_ : function or a dict + Scorer function used on the held out data to choose the best parameters + for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + n_splits_ : int + The number of cross-validation splits (folds/iterations). + + Notes + ----- + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + See Also + -------- + `ParameterGrid`_: + Generates all the combinations of a hyperparameter grid. + + :class:`.GenSVM`: + The GenSVM classifier + + .. _GridSearchCV: + http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html + .. _accuracy_score: + http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html + .. _scikit-learn User Guide on cross validation: + http://scikit-learn.org/stable/modules/cross_validation.html + + .. _ParameterGrid: + http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html + .. _DataFrame: + https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html + """ + + def __init__(self, param_grid, scoring=None, iid=True, cv=None, refit=True, + verbose=0, return_train_score=True): + + self.param_grid = param_grid + _check_param_grid(self.param_grid) + _validate_param_grid(self.param_grid) + + self.scoring = scoring + self.cv = cv + self.refit = refit + self.verbose = verbose + self.return_train_score = return_train_score + self.iid = iid + + def _get_param_iterator(self): + return ParameterGrid(self.param_grid) + + + def fit(self, X, y, groups=None): + """Run GenSVM grid search with all sets of parameters + + Parameters + ---------- + + X : array-like, shape = (n_samples, n_features) + Training data, where n_samples is the number of observations and + n_features is the number of features. + + y : array-like, shape = (n_samples, ) + Target vector for the training data. + + groups : array-like, with shape (n_samples, ), optional + Group labels for the samples used while splitting the dataset into + train/test sets. 
+ + Returns + ------- + self : object + Return self. + + """ + + X, y_orig = check_X_y(X, y, accept_sparse=False, dtype=np.float64, + order="C") + + y_type = type_of_target(y_orig) + if y_type not in ["binary", "multiclass"]: + raise ValueError("Label type not allowed for GenSVM: %r" % y_type) + + # This is necessary because GenSVM expects classes to go from 1 to + # n_class + self.encoder = LabelEncoder() + y = self.encoder.fit_transform(y_orig) + y += 1 + + candidate_params = list(self._get_param_iterator()) + + scorers, self.multimetric_, refit_metric = _skl_check_scorers( + self.scoring, self.refit) + + X, y, groups = indexable(X, y, groups) + + results, n_splits = _fit_grid_gensvm(X, y, groups, candidate_params, + scorers, self.cv, self.refit, self.verbose, + self.return_train_score, self.iid) + + self.cv_results_ = results + + # For multi-metric evaluation, store the best_index_, best_params_ and + # best_score_ iff refit is one of the scorer names + # In single metric evaluation, refit_metric is "score" + if self.refit or not self.multimetric_: + self.best_index_ = results["rank_test_%s" % refit_metric].argmin() + self.best_params_ = candidate_params[self.best_index_] + self.best_score_ = results["mean_test_%s" % refit_metric][ + self.best_index_] + + if self.refit: + self.best_estimator_ = GenSVM(**self.best_params_) + # y_orig because GenSVM fit must know the conversion for predict to + # work correctly + self.best_estimator_.fit(X, y_orig) + + ## Store the only scorer not as a dict for single metric evaluation + self.scorer_ = scorers if self.multimetric_ else scorers['score'] + + self.cv_results_ = results + self.n_splits_ = n_splits + + return self + + + def score(self, X, y): + """Compute the score on the test data given the true labels + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test data, where n_samples is the number of observations and + n_features is the number of features. + + y : array-like, shape = (n_samples, ) + True labels for the test data. + + + Returns + ------- + score : float + + """ + _skl_check_is_fitted(self, 'score', self.refit) + return _skl_grid_score(X, y, self.scorer_, self.best_estimator_, + self.refit, self.multimetric_) + + def predict(self, X): + """Predict the class labels on the test data + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test data, where n_samples is the number of observations and + n_features is the number of features. + + Returns + ------- + y_pred : array-like, shape = (n_samples, ) + Predicted class labels of the data in X. + + """ + _skl_check_is_fitted(self, 'predict', self.refit) + return self.best_estimator_.predict(X) diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py new file mode 100644 index 0000000..9c46e0e --- /dev/null +++ b/gensvm/sklearn_util.py @@ -0,0 +1,212 @@ +"""Functions in GenSVM that are taken from Scikit-Learn + +The GenSVM Python package is designed to work in the same way as Scikit-Learn +classifiers, as this makes it easier for people familiar with Scikit-Learn to +use GenSVM. As such, some of the functionality of the GenSVM Python package is +similar to code in the Scikit-Learn package (such as formatting the grid search +results). To keep a clean separation between code from Scikit-Learn (which is +licensed under the BSD license) and code written by the author(s) of the GenSVM +package, the code from scikit-learn is placed here in explicit self-contained +functions. 
To comply with clause a of the BSD license, it is repeated below as +required. + +""" + +import numpy as np + +from collections import defaultdict +from functools import partial + +from .core import GenSVM +from .util import get_ranks + + +# BEGIN SCIKIT LEARN CODE + +""" + +New BSD License + +Copyright (c) 2007–2017 The scikit-learn developers. +All rights reserved. + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of the Scikit-learn Developers nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. + +""" + +from sklearn.exceptions import NotFittedError +from sklearn.externals import six +from sklearn.metrics.scorer import _check_multimetric_scoring +from sklearn.model_selection._validation import _aggregate_score_dicts +from sklearn.utils.fixes import MaskedArray +from sklearn.utils.validation import check_is_fitted + +def _skl_format_cv_results(out, return_train_score, candidate_params, + n_candidates, n_splits, scorers, iid): + + # if one choose to see train score, "out" will contain train score info + if return_train_score: + (train_score_dicts, test_score_dicts, test_sample_counts, fit_time, + score_time) = zip(*out) + else: + (test_score_dicts, test_sample_counts, fit_time, + score_time) = zip(*out) + + # test_score_dicts and train_score dicts are lists of dictionaries and + # we make them into dict of lists + test_scores = _aggregate_score_dicts(test_score_dicts) + if return_train_score: + train_scores = _aggregate_score_dicts(train_score_dicts) + + results = dict() + + def _store(key_name, array, weights=None, splits=False, rank=False): + """A small helper to store the scores/times to the cv_results_""" + # When iterated first by splits, then by parameters + # We want `array` to have `n_candidates` rows and `n_splits` cols. 
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, + n_splits) + if splits: + for split_i in range(n_splits): + # Uses closure to alter the results + results["split%d_%s" + % (split_i, key_name)] = array[:, split_i] + + array_means = np.average(array, axis=1, weights=weights) + results['mean_%s' % key_name] = array_means + # Weighted std is not directly available in numpy + array_stds = np.sqrt(np.average((array - + array_means[:, np.newaxis]) ** 2, + axis=1, weights=weights)) + results['std_%s' % key_name] = array_stds + + if rank: + results["rank_%s" % key_name] = np.asarray( + get_ranks(-array_means), dtype=np.int32) + + _store('fit_time', fit_time) + _store('score_time', score_time) + # Use one MaskedArray and mask all the places where the param is not + # applicable for that candidate. Use defaultdict as each candidate may + # not contain all the params + param_results = defaultdict(partial(MaskedArray, + np.empty(n_candidates,), + mask=True, + dtype=object)) + for cand_i, params in enumerate(candidate_params): + for name, value in params.items(): + # An all masked empty array gets created for the key + # `"param_%s" % name` at the first occurence of `name`. + # Setting the value at an index also unmasks that index + param_results["param_%s" % name][cand_i] = value + + results.update(param_results) + # Store a list of param dicts at the key 'params' + results['params'] = candidate_params + + # NOTE test_sample counts (weights) remain the same for all candidates + test_sample_counts = np.array(test_sample_counts[:n_splits], + dtype=np.int) + for scorer_name in scorers.keys(): + # Computed the (weighted) mean and std for test scores alone + _store('test_%s' % scorer_name, test_scores[scorer_name], + splits=True, rank=True, + weights=test_sample_counts if iid else None) + if return_train_score: + _store('train_%s' % scorer_name, train_scores[scorer_name], + splits=True) + + return results + + +def _skl_check_scorers(scoring, refit): + + scorers, multimetric_ = _check_multimetric_scoring( + GenSVM(), scoring=scoring) + if multimetric_: + if refit is not False and ( + not isinstance(refit, six.string_types) or + # This will work for both dict / list (tuple) + refit not in scorers): + raise ValueError("For multi-metric scoring, the parameter " + "refit must be set to a scorer key " + "to refit an estimator with the best " + "parameter setting on the whole data and " + "make the best_* attributes " + "available for that metric. If this is not " + "needed, refit should be set to False " + "explicitly. %r was passed." % refit) + else: + refit_metric = refit + else: + refit_metric = 'score' + + return scorers, multimetric_, refit_metric + + +def _skl_check_is_fitted(estimator, method_name, refit): + if not refit: + raise NotFittedError('This %s instance was initialized ' + 'with refit=False. %s is ' + 'available only after refitting on the best ' + 'parameters. You can refit an estimator ' + 'manually using the ``best_parameters_`` ' + 'attribute' + % (type(estimator).__name__, method_name)) + else: + check_is_fitted(estimator, 'best_estimator_') + + + +def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_): + """Returns the score on the given data, if the estimator has been + refit. + + This uses the score defined by ``scoring`` where provided, and the + ``best_estimator_.score`` method otherwise. 
+ + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Input data, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + Returns + ------- + score : float + """ + if scorer_ is None: + raise ValueError("No score function explicitly defined, " + "and the estimator doesn't provide one %s" + % best_estimator_) + score = scorer_[refit] if multimetric_ else scorer_ + return score(best_estimator_, X, y) diff --git a/gensvm/util.py b/gensvm/util.py new file mode 100644 index 0000000..1e79d75 --- /dev/null +++ b/gensvm/util.py @@ -0,0 +1,32 @@ +""" +Utility functions for GenSVM + +""" + + +import numpy as np + + +def get_ranks(x): + """ + Rank data in an array. Low values get a small rank number. Ties are broken + by assigning the lowest value. + + Examples + -------- + >>> x = [7, 0.1, 0.5, 0.1, 10, 100, 200] + >>> get_ranks(x) + [4, 1, 3, 1, 5, 6, 7] + + """ + x = np.ravel(np.asarray(x)) + l = len(x) + r = 1 + ranks = np.zeros((l, )) + while not all([k is None for k in x]): + m = min([k for k in x if not k is None]) + idx = [1 if k == m else 0 for k in x] + ranks = [r if idx[k] else ranks[k] for k in range(l)] + r += sum(idx) + x = [None if idx[k] else x[k] for k in range(l)] + return ranks |
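
For reference, a minimal usage sketch of the API added in this commit, pieced together from the docstrings in the diff above. The iris dataset and the specific parameter values are only illustrative, and the snippet has not been run against the package:

```python
from sklearn.datasets import load_iris

from gensvm import GenSVM, GenSVMGridSearchCV

iris = load_iris()
X, y = iris.data, iris.target

# Fit a single GenSVM model using the new 'weights' parameter and inspect
# the combined_coef_ property introduced in this commit.
clf = GenSVM(p=1.5, kappa=0.5, weights='unit')
clf.fit(X, y)
print(clf.predict(X[:5]))

# Warm-start a second model through the new seed_V argument of fit()
# (only supported for the linear kernel, per the docstring above).
clf2 = GenSVM(p=2.0)
clf2.fit(X, y, seed_V=clf.combined_coef_)

# Run the cross-validated grid search added in gensvm/gridsearch.py.
param_grid = {'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]}
grid = GenSVMGridSearchCV(param_grid)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)
print(grid.predict(X[:5]))
```

GenSVMGridSearchCV mirrors scikit-learn's GridSearchCV interface, but delegates the loop over grid points to the GenSVM C library so that warm starts between parameter settings can be exploited.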