Diffstat (limited to 'gensvm/sklearn_util.py')
| -rw-r--r-- | gensvm/sklearn_util.py | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py
new file mode 100644
index 0000000..9c46e0e
--- /dev/null
+++ b/gensvm/sklearn_util.py
@@ -0,0 +1,212 @@
+"""Functions in GenSVM that are taken from Scikit-Learn
+
+The GenSVM Python package is designed to work in the same way as Scikit-Learn
+classifiers, as this makes it easier for people familiar with Scikit-Learn to
+use GenSVM. As such, some of the functionality of the GenSVM Python package is
+similar to code in the Scikit-Learn package (such as formatting the grid search
+results). To keep a clean separation between code from Scikit-Learn (which is
+licensed under the BSD license) and code written by the author(s) of the GenSVM
+package, the code from scikit-learn is placed here in explicit self-contained
+functions. To comply with clause a of the BSD license, it is repeated below as
+required.
+
+"""
+
+import numpy as np
+
+from collections import defaultdict
+from functools import partial
+
+from .core import GenSVM
+from .util import get_ranks
+
+
+# BEGIN SCIKIT LEARN CODE
+
+"""
+
+New BSD License
+
+Copyright (c) 2007–2017 The scikit-learn developers.
+All rights reserved.
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  a. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+  b. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+  c. Neither the name of the Scikit-learn Developers nor the names of
+     its contributors may be used to endorse or promote products
+     derived from this software without specific prior written
+     permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
+"""
+
+from sklearn.exceptions import NotFittedError
+from sklearn.externals import six
+from sklearn.metrics.scorer import _check_multimetric_scoring
+from sklearn.model_selection._validation import _aggregate_score_dicts
+from sklearn.utils.fixes import MaskedArray
+from sklearn.utils.validation import check_is_fitted
+
+
+def _skl_format_cv_results(out, return_train_score, candidate_params,
+                           n_candidates, n_splits, scorers, iid):
+
+    # if one chooses to see train scores, "out" will contain train score info
+    if return_train_score:
+        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
+         score_time) = zip(*out)
+    else:
+        (test_score_dicts, test_sample_counts, fit_time,
+         score_time) = zip(*out)
+
+    # test_score_dicts and train_score_dicts are lists of dictionaries and
+    # we make them into a dict of lists
+    test_scores = _aggregate_score_dicts(test_score_dicts)
+    if return_train_score:
+        train_scores = _aggregate_score_dicts(train_score_dicts)
+
+    results = dict()
+
+    def _store(key_name, array, weights=None, splits=False, rank=False):
+        """A small helper to store the scores/times to the cv_results_"""
+        # When iterated first by splits, then by parameters,
+        # we want `array` to have `n_candidates` rows and `n_splits` cols.
+        array = np.array(array, dtype=np.float64).reshape(n_candidates,
+                                                          n_splits)
+        if splits:
+            for split_i in range(n_splits):
+                # Uses closure to alter the results
+                results["split%d_%s"
+                        % (split_i, key_name)] = array[:, split_i]
+
+        array_means = np.average(array, axis=1, weights=weights)
+        results['mean_%s' % key_name] = array_means
+        # Weighted std is not directly available in numpy
+        array_stds = np.sqrt(np.average((array -
+                                         array_means[:, np.newaxis]) ** 2,
+                                        axis=1, weights=weights))
+        results['std_%s' % key_name] = array_stds
+
+        if rank:
+            results["rank_%s" % key_name] = np.asarray(
+                get_ranks(-array_means), dtype=np.int32)
+
+    _store('fit_time', fit_time)
+    _store('score_time', score_time)
+    # Use one MaskedArray and mask all the places where the param is not
+    # applicable for that candidate. Use defaultdict as each candidate may
+    # not contain all the params
+    param_results = defaultdict(partial(MaskedArray,
+                                        np.empty(n_candidates,),
+                                        mask=True,
+                                        dtype=object))
+    for cand_i, params in enumerate(candidate_params):
+        for name, value in params.items():
+            # An all-masked empty array gets created for the key
+            # `"param_%s" % name` at the first occurrence of `name`.
+            # Setting the value at an index also unmasks that index
+            param_results["param_%s" % name][cand_i] = value
+
+    results.update(param_results)
+    # Store a list of param dicts at the key 'params'
+    results['params'] = candidate_params
+
+    # NOTE: test_sample_counts (weights) remain the same for all candidates
+    test_sample_counts = np.array(test_sample_counts[:n_splits],
+                                  dtype=np.int)
+    for scorer_name in scorers.keys():
+        # Compute the (weighted) mean and std for the test scores alone
+        _store('test_%s' % scorer_name, test_scores[scorer_name],
+               splits=True, rank=True,
+               weights=test_sample_counts if iid else None)
+        if return_train_score:
+            _store('train_%s' % scorer_name, train_scores[scorer_name],
+                   splits=True)
+
+    return results
+
+
+def _skl_check_scorers(scoring, refit):
+
+    scorers, multimetric_ = _check_multimetric_scoring(
+        GenSVM(), scoring=scoring)
+    if multimetric_:
+        if refit is not False and (
+                not isinstance(refit, six.string_types) or
+                # This will work for both dict / list (tuple)
+                refit not in scorers):
+            raise ValueError("For multi-metric scoring, the parameter "
+                             "refit must be set to a scorer key "
+                             "to refit an estimator with the best "
+                             "parameter setting on the whole data and "
+                             "make the best_* attributes "
+                             "available for that metric. If this is not "
+                             "needed, refit should be set to False "
+                             "explicitly. %r was passed." % refit)
+        else:
+            refit_metric = refit
+    else:
+        refit_metric = 'score'
+
+    return scorers, multimetric_, refit_metric
+
+
+def _skl_check_is_fitted(estimator, method_name, refit):
+    if not refit:
+        raise NotFittedError('This %s instance was initialized '
+                             'with refit=False. %s is '
+                             'available only after refitting on the best '
+                             'parameters. You can refit an estimator '
+                             'manually using the ``best_parameters_`` '
+                             'attribute'
+                             % (type(estimator).__name__, method_name))
+    else:
+        check_is_fitted(estimator, 'best_estimator_')
+
+
+def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_):
+    """Returns the score on the given data, if the estimator has been refit.
+
+    This uses the score defined by ``scoring`` where provided, and the
+    ``best_estimator_.score`` method otherwise.
+
+    Parameters
+    ----------
+    X : array-like, shape = [n_samples, n_features]
+        Input data, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    Returns
+    -------
+    score : float
+    """
+    if scorer_ is None:
+        raise ValueError("No score function explicitly defined, "
+                         "and the estimator doesn't provide one %s"
+                         % best_estimator_)
+    score = scorer_[refit] if multimetric_ else scorer_
+    return score(best_estimator_, X, y)
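The `_store` helper in `_skl_format_cv_results` computes the weighted standard deviation by hand because numpy's `np.std` takes no `weights` argument. The following is a minimal standalone sketch of that mean/std/rank logic; the scores and fold sizes are made up, and the argsort-based ranking is a tie-ignoring stand-in for the package's `get_ranks` helper:

import numpy as np

# Hypothetical per-split test scores: 2 candidates x 3 CV splits.
scores = np.array([[0.80, 0.82, 0.78],
                   [0.90, 0.88, 0.86]])
# Hypothetical test-fold sizes; these act as the weights when iid=True.
weights = np.array([50, 40, 60])

# Weighted mean per candidate ('mean_test_score' in cv_results_).
means = np.average(scores, axis=1, weights=weights)

# Weighted std via the weighted average of squared deviations
# ('std_test_score'), mirroring the computation inside _store.
stds = np.sqrt(np.average((scores - means[:, np.newaxis]) ** 2,
                          axis=1, weights=weights))

# Rank 1 goes to the highest mean ('rank_test_score').
ranks = np.argsort(np.argsort(-means)) + 1

print(means, stds, ranks)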

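Taken together, these helpers would slot into a grid-search estimator roughly as follows. This is a sketch, not code from the package: the `GenSVMGridSearchCV` name and the `_run_cv` placeholder are hypothetical, and it assumes the scikit-learn version this module targets (one that still ships `sklearn.externals.six` and `sklearn.metrics.scorer`).

from gensvm.sklearn_util import (_skl_check_scorers, _skl_format_cv_results,
                                 _skl_check_is_fitted, _skl_grid_score)


class GenSVMGridSearchCV(object):
    """Hypothetical wrapper, for illustration only."""

    def __init__(self, param_grid, scoring=None, refit=True, iid=True):
        self.param_grid = param_grid
        self.scoring = scoring
        self.refit = refit
        self.iid = iid

    def _run_cv(self, X, y, scorers):
        # Placeholder for the actual cross-validation loop, which would fit
        # GenSVM for each candidate and return the raw per-fold results.
        raise NotImplementedError

    def fit(self, X, y):
        # Validate the scoring/refit combination before doing any work.
        scorers, self.multimetric_, refit_metric = _skl_check_scorers(
            self.scoring, self.refit)
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        out, candidate_params, n_candidates, n_splits = self._run_cv(
            X, y, scorers)

        # Assemble the cv_results_ dict in the Scikit-Learn layout.
        self.cv_results_ = _skl_format_cv_results(
            out, False, candidate_params, n_candidates, n_splits,
            scorers, self.iid)
        return self

    def score(self, X, y=None):
        # Only meaningful after refitting on the best parameters.
        _skl_check_is_fitted(self, 'score', self.refit)
        return _skl_grid_score(X, y, self.scorer_, self.best_estimator_,
                               self.refit, self.multimetric_)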