Diffstat (limited to 'gensvm/sklearn_util.py')
-rw-r--r--  gensvm/sklearn_util.py  212
1 file changed, 212 insertions, 0 deletions
diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py
new file mode 100644
index 0000000..9c46e0e
--- /dev/null
+++ b/gensvm/sklearn_util.py
@@ -0,0 +1,212 @@
+"""Functions in GenSVM that are taken from Scikit-Learn
+
+The GenSVM Python package is designed to work in the same way as Scikit-Learn
+classifiers, as this makes it easier for people familiar with Scikit-Learn to
+use GenSVM. As such, some of the functionality of the GenSVM Python package is
+similar to code in the Scikit-Learn package (such as formatting the grid search
+results). To keep a clean separation between code from Scikit-Learn (which is
+licensed under the BSD license) and code written by the author(s) of the GenSVM
+package, the code from scikit-learn is placed here in explicit self-contained
+functions. To comply with clause a of the BSD license, it is repeated below as
+required.
+
+"""
+
+import numpy as np
+
+from collections import defaultdict
+from functools import partial
+
+from .core import GenSVM
+from .util import get_ranks
+
+
+# BEGIN SCIKIT LEARN CODE
+
+"""
+
+New BSD License
+
+Copyright (c) 2007–2017 The scikit-learn developers.
+All rights reserved.
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ a. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ b. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ c. Neither the name of the Scikit-learn Developers nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
+"""
+
+from sklearn.exceptions import NotFittedError
+from sklearn.externals import six
+from sklearn.metrics.scorer import _check_multimetric_scoring
+from sklearn.model_selection._validation import _aggregate_score_dicts
+from sklearn.utils.fixes import MaskedArray
+from sklearn.utils.validation import check_is_fitted
+
+def _skl_format_cv_results(out, return_train_score, candidate_params,
+                           n_candidates, n_splits, scorers, iid):
+
+    # if one chooses to see train scores, "out" will contain train score info
+    if return_train_score:
+        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
+         score_time) = zip(*out)
+    else:
+        (test_score_dicts, test_sample_counts, fit_time,
+         score_time) = zip(*out)
+
+    # test_score_dicts and train_score_dicts are lists of dictionaries;
+    # we turn them into a dict of lists
+    test_scores = _aggregate_score_dicts(test_score_dicts)
+    if return_train_score:
+        train_scores = _aggregate_score_dicts(train_score_dicts)
+
+    results = dict()
+
+    def _store(key_name, array, weights=None, splits=False, rank=False):
+        """A small helper to store the scores/times to the cv_results_"""
+        # The scores come ordered with splits varying fastest (all splits
+        # for one candidate, then the next), so `array` reshapes to
+        # `n_candidates` rows and `n_splits` columns.
+        array = np.array(array, dtype=np.float64).reshape(n_candidates,
+                                                          n_splits)
+        if splits:
+            for split_i in range(n_splits):
+                # Uses closure to alter the results
+                results["split%d_%s"
+                        % (split_i, key_name)] = array[:, split_i]
+
+        array_means = np.average(array, axis=1, weights=weights)
+        results['mean_%s' % key_name] = array_means
+        # Weighted std is not directly available in numpy
+        array_stds = np.sqrt(np.average((array -
+                                         array_means[:, np.newaxis]) ** 2,
+                                        axis=1, weights=weights))
+        results['std_%s' % key_name] = array_stds
+
+        if rank:
+            results["rank_%s" % key_name] = np.asarray(
+                get_ranks(-array_means), dtype=np.int32)
+
+    _store('fit_time', fit_time)
+    _store('score_time', score_time)
+    # Use one MaskedArray and mask all the places where the param is not
+    # applicable for that candidate. Use defaultdict as each candidate may
+    # not contain all the params
+    param_results = defaultdict(partial(MaskedArray,
+                                        np.empty(n_candidates,),
+                                        mask=True,
+                                        dtype=object))
+    for cand_i, params in enumerate(candidate_params):
+        for name, value in params.items():
+            # An all masked empty array gets created for the key
+            # `"param_%s" % name` at the first occurrence of `name`.
+            # Setting the value at an index also unmasks that index
+            param_results["param_%s" % name][cand_i] = value
+
+    results.update(param_results)
+    # Store a list of param dicts at the key 'params'
+    results['params'] = candidate_params
+
+    # NOTE: test_sample_counts (weights) remain the same for all candidates
+    test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=int)
+    for scorer_name in scorers.keys():
+        # Compute the (weighted) mean and std for test scores alone
+        _store('test_%s' % scorer_name, test_scores[scorer_name],
+               splits=True, rank=True,
+               weights=test_sample_counts if iid else None)
+        if return_train_score:
+            _store('train_%s' % scorer_name, train_scores[scorer_name],
+                   splits=True)
+
+    return results
+
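+
+# A minimal usage sketch (illustrative only, not part of the borrowed
+# code): suppose a grid search evaluated two candidates over two CV
+# splits with a single accuracy scorer and return_train_score=False.
+# Each entry of `out` then holds the test score dict, the test sample
+# count, the fit time, and the score time; all values below are assumed
+# for illustration.
+#
+#     out = [({'accuracy': 0.80}, 50, 0.1, 0.01),  # candidate 1, split 1
+#            ({'accuracy': 0.90}, 50, 0.1, 0.01),  # candidate 1, split 2
+#            ({'accuracy': 0.70}, 50, 0.1, 0.01),  # candidate 2, split 1
+#            ({'accuracy': 0.75}, 50, 0.1, 0.01)]  # candidate 2, split 2
+#     candidate_params = [{'p': 1.0}, {'p': 2.0}]
+#     results = _skl_format_cv_results(out, False, candidate_params,
+#                                      n_candidates=2, n_splits=2,
+#                                      scorers={'accuracy': None}, iid=True)
+#
+# `results` then maps keys such as 'mean_test_accuracy',
+# 'std_test_accuracy', 'rank_test_accuracy', 'param_p', and 'params' to
+# per-candidate arrays (here of length 2).
+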
+
+def _skl_check_scorers(scoring, refit):
+
+    scorers, multimetric_ = _check_multimetric_scoring(
+        GenSVM(), scoring=scoring)
+    if multimetric_:
+        if refit is not False and (
+                not isinstance(refit, six.string_types) or
+                # This will work for both dict / list (tuple)
+                refit not in scorers):
+            raise ValueError("For multi-metric scoring, the parameter "
+                             "refit must be set to a scorer key "
+                             "to refit an estimator with the best "
+                             "parameter setting on the whole data and "
+                             "make the best_* attributes "
+                             "available for that metric. If this is not "
+                             "needed, refit should be set to False "
+                             "explicitly. %r was passed." % refit)
+        else:
+            refit_metric = refit
+    else:
+        refit_metric = 'score'
+
+    return scorers, multimetric_, refit_metric
+
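+
+# Usage sketch (illustrative only): with a single metric, the refit key
+# falls back to the default 'score':
+#
+#     scorers, multimetric_, refit_metric = _skl_check_scorers(
+#         scoring='accuracy', refit=True)
+#     # multimetric_ is False, refit_metric == 'score'
+#
+# With multiple metrics, refit must name one of them, e.g.
+#
+#     _skl_check_scorers(scoring=['accuracy', 'f1_macro'], refit='accuracy')
+#
+# otherwise the ValueError above is raised.
+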
+
+def _skl_check_is_fitted(estimator, method_name, refit):
+    if not refit:
+        raise NotFittedError('This %s instance was initialized '
+                             'with refit=False. %s is '
+                             'available only after refitting on the best '
+                             'parameters. You can refit an estimator '
+                             'manually using the ``best_params_`` '
+                             'attribute.'
+                             % (type(estimator).__name__, method_name))
+    else:
+        check_is_fitted(estimator, 'best_estimator_')
+
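+
+# Usage sketch (illustrative only, `grid` is an assumed grid search
+# object): called as a guard at the top of methods such as predict or
+# score,
+#
+#     _skl_check_is_fitted(grid, 'predict', refit=False)
+#
+# raises NotFittedError, while with refit=True it only verifies that the
+# `best_estimator_` attribute exists on `grid`.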
+
+
+def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_):
+    """Returns the score on the given data, if the estimator has been refit.
+
+    This uses the score defined by ``scoring`` where provided, and the
+    ``best_estimator_.score`` method otherwise.
+
+    Parameters
+    ----------
+    X : array-like, shape = [n_samples, n_features]
+        Input data, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    Returns
+    -------
+    score : float
+    """
+    if scorer_ is None:
+        raise ValueError("No score function explicitly defined, "
+                         "and the estimator doesn't provide one %s"
+                         % best_estimator_)
+    score = scorer_[refit] if multimetric_ else scorer_
+    return score(best_estimator_, X, y)
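+
+
+# Usage sketch (illustrative only; the `grid` attributes are assumptions
+# following the scikit-learn naming convention): after a multi-metric
+# grid search has been refit on 'accuracy',
+#
+#     score = _skl_grid_score(X_test, y_test, scorer_=grid.scorer_,
+#                             best_estimator_=grid.best_estimator_,
+#                             refit='accuracy', multimetric_=True)
+#
+# selects grid.scorer_['accuracy'] and applies it to the refit estimator
+# on (X_test, y_test).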