| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-01-15 12:21:24 +0000 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-01-15 12:21:24 +0000 |
| commit | d1ddd504802072d930170b802d2cf98fb309cd46 | |
| tree | 2421ba88a37d686eca467cf85960ab7da4ae991a /gensvm/sklearn_util.py | |
| parent | Move wrapper to better folder structure | |
Code formatting with Black
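The rewrapped code below is consistent with Black run at a 79-character line length, e.g. `black --line-length 79 gensvm/sklearn_util.py`. The exact invocation is not recorded in the commit; Black's default limit is 88, but several of the reflowed lines below wrap at exactly 79 characters, which suggests a custom setting.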
Diffstat (limited to 'gensvm/sklearn_util.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gensvm/sklearn_util.py | 148 |

1 file changed, 89 insertions(+), 59 deletions(-)
```diff
diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py
index 0829bf6..05d9618 100644
--- a/gensvm/sklearn_util.py
+++ b/gensvm/sklearn_util.py
@@ -68,16 +68,30 @@
 from sklearn.model_selection._validation import _aggregate_score_dicts
 from sklearn.utils.fixes import MaskedArray
 from sklearn.utils.validation import check_is_fitted
 
-def _skl_format_cv_results(out, return_train_score, candidate_params,
-                           n_candidates, n_splits, scorers, iid):
+
+def _skl_format_cv_results(
+    out,
+    return_train_score,
+    candidate_params,
+    n_candidates,
+    n_splits,
+    scorers,
+    iid,
+):
     # if one choose to see train score, "out" will contain train score info
     if return_train_score:
-        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
-         score_time) = zip(*out)
+        (
+            train_score_dicts,
+            test_score_dicts,
+            test_sample_counts,
+            fit_time,
+            score_time,
+        ) = zip(*out)
     else:
-        (test_score_dicts, test_sample_counts, fit_time,
-         score_time) = zip(*out)
+        (test_score_dicts, test_sample_counts, fit_time, score_time) = zip(
+            *out
+        )
 
     # test_score_dicts and train_score dicts are lists of dictionaries and
     # we make them into dict of lists
@@ -91,35 +105,39 @@ def _skl_format_cv_results(out, return_train_score, candidate_params,
         """A small helper to store the scores/times to the cv_results_"""
         # When iterated first by splits, then by parameters
         # We want `array` to have `n_candidates` rows and `n_splits` cols.
-        array = np.array(array, dtype=np.float64).reshape(n_candidates,
-                                                          n_splits)
+        array = np.array(array, dtype=np.float64).reshape(
+            n_candidates, n_splits
+        )
         if splits:
             for split_i in range(n_splits):
                 # Uses closure to alter the results
-                results["split%d_%s"
-                        % (split_i, key_name)] = array[:, split_i]
+                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]
 
         array_means = np.average(array, axis=1, weights=weights)
-        results['mean_%s' % key_name] = array_means
+        results["mean_%s" % key_name] = array_means
         # Weighted std is not directly available in numpy
-        array_stds = np.sqrt(np.average((array -
-                                         array_means[:, np.newaxis]) ** 2,
-                                        axis=1, weights=weights))
-        results['std_%s' % key_name] = array_stds
+        array_stds = np.sqrt(
+            np.average(
+                (array - array_means[:, np.newaxis]) ** 2,
+                axis=1,
+                weights=weights,
+            )
+        )
+        results["std_%s" % key_name] = array_stds
 
         if rank:
             results["rank_%s" % key_name] = np.asarray(
-                get_ranks(-array_means), dtype=np.int32)
+                get_ranks(-array_means), dtype=np.int32
+            )
 
-    _store('fit_time', fit_time)
-    _store('score_time', score_time)
+    _store("fit_time", fit_time)
+    _store("score_time", score_time)
     # Use one MaskedArray and mask all the places where the param is not
     # applicable for that candidate. Use defaultdict as each candidate may
     # not contain all the params
-    param_results = defaultdict(partial(MaskedArray,
-                                        np.empty(n_candidates,),
-                                        mask=True,
-                                        dtype=object))
+    param_results = defaultdict(
+        partial(MaskedArray, np.empty(n_candidates), mask=True, dtype=object)
+    )
     for cand_i, params in enumerate(candidate_params):
         for name, value in params.items():
             # An all masked empty array gets created for the key
@@ -129,19 +147,25 @@ def _skl_format_cv_results(out, return_train_score, candidate_params,
     results.update(param_results)
 
     # Store a list of param dicts at the key 'params'
-    results['params'] = candidate_params
+    results["params"] = candidate_params
 
     # NOTE test_sample counts (weights) remain the same for all candidates
-    test_sample_counts = np.array(test_sample_counts[:n_splits],
-                                  dtype=np.int)
+    test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int)
 
     for scorer_name in scorers.keys():
         # Computed the (weighted) mean and std for test scores alone
-        _store('test_%s' % scorer_name, test_scores[scorer_name],
-               splits=True, rank=True,
-               weights=test_sample_counts if iid else None)
+        _store(
+            "test_%s" % scorer_name,
+            test_scores[scorer_name],
+            splits=True,
+            rank=True,
+            weights=test_sample_counts if iid else None,
+        )
         if return_train_score:
-            _store('train_%s' % scorer_name, train_scores[scorer_name],
-                   splits=True)
+            _store(
+                "train_%s" % scorer_name,
+                train_scores[scorer_name],
+                splits=True,
+            )
 
     return results
 
@@ -149,44 +173,49 @@
 
 def _skl_check_scorers(scoring, refit):
     scorers, multimetric_ = _check_multimetric_scoring(
-        GenSVM(), scoring=scoring)
+        GenSVM(), scoring=scoring
+    )
     if multimetric_:
         if refit is not False and (
-                not isinstance(refit, six.string_types) or
-                # This will work for both dict / list (tuple)
-                refit not in scorers):
-            raise ValueError("For multi-metric scoring, the parameter "
-                             "refit must be set to a scorer key "
-                             "to refit an estimator with the best "
-                             "parameter setting on the whole data and "
-                             "make the best_* attributes "
-                             "available for that metric. If this is not "
-                             "needed, refit should be set to False "
-                             "explicitly. %r was passed." % refit)
+            not isinstance(refit, six.string_types)
+            or
+            # This will work for both dict / list (tuple)
+            refit not in scorers
+        ):
+            raise ValueError(
+                "For multi-metric scoring, the parameter "
+                "refit must be set to a scorer key "
+                "to refit an estimator with the best "
+                "parameter setting on the whole data and "
+                "make the best_* attributes "
+                "available for that metric. If this is not "
+                "needed, refit should be set to False "
+                "explicitly. %r was passed." % refit
+            )
         else:
             refit_metric = refit
     else:
-        refit_metric = 'score'
+        refit_metric = "score"
     return scorers, multimetric_, refit_metric
 
 
 def _skl_check_is_fitted(estimator, method_name, refit):
     if not refit:
-        raise NotFittedError('This %s instance was initialized '
-                             'with refit=False. %s is '
-                             'available only after refitting on the best '
-                             'parameters. You can refit an estimator '
-                             'manually using the ``best_parameters_`` '
-                             'attribute'
-                             % (type(estimator).__name__, method_name))
+        raise NotFittedError(
+            "This %s instance was initialized "
+            "with refit=False. %s is "
+            "available only after refitting on the best "
+            "parameters. You can refit an estimator "
+            "manually using the ``best_parameters_`` "
+            "attribute" % (type(estimator).__name__, method_name)
+        )
     else:
-        check_is_fitted(estimator, 'best_estimator_')
-
+        check_is_fitted(estimator, "best_estimator_")
 
 
 def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_):
-    """Returns the score on the given data, if the estimator has been
-    refit.
+    """Returns the score on the given data, if the estimator has been refit.
 
     This uses the score defined by ``scoring`` where provided, and the
@@ -206,9 +235,10 @@ def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_):
     -------
     score : float
     """
-    if scorer_ is None:
-        raise ValueError("No score function explicitly defined, "
-                         "and the estimator doesn't provide one %s"
-                         % best_estimator_)
-    score = scorer_[refit] if multimetric_ else scorer_
-    return score(best_estimator_, X, y)
+    if scorer_ is None:
+        raise ValueError(
+            "No score function explicitly defined, "
+            "and the estimator doesn't provide one %s" % best_estimator_
+        )
+    score = scorer_[refit] if multimetric_ else scorer_
+    return score(best_estimator_, X, y)
```
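A few of the reformatted constructs are worth unpacking. First, `out` is a flat list with one result tuple per (candidate, split) evaluation, and `zip(*out)` transposes it into one sequence per field. A minimal sketch with made-up values, following the tuple layout of the non-`return_train_score` branch above:

```python
# One tuple per (candidate, split) evaluation:
# (test_score_dict, n_test_samples, fit_time, score_time)
out = [
    ({"score": 0.80}, 100, 0.51, 0.01),
    ({"score": 0.70}, 120, 0.62, 0.02),
]

# zip(*out) transposes the list of tuples into per-field sequences.
(test_score_dicts, test_sample_counts, fit_time, score_time) = zip(*out)

print(test_sample_counts)  # (100, 120)
```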
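Second, as the comment in `_store` notes, NumPy has no direct weighted standard deviation, so it is built from two `np.average` calls: a weighted mean, then the square root of the weighted average of squared deviations. A standalone sketch with hypothetical scores and fold sizes, not part of the GenSVM code:

```python
import numpy as np

# Hypothetical scores: 2 candidates x 3 splits, weighted by test-fold size.
array = np.array([[0.80, 0.90, 0.70], [0.60, 0.65, 0.70]])
weights = np.array([100, 120, 80])

# Weighted mean per candidate (row).
array_means = np.average(array, axis=1, weights=weights)

# Weighted std is not directly available in numpy: take the square root
# of the weighted average of squared deviations from the weighted mean.
array_stds = np.sqrt(
    np.average(
        (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
    )
)

print(array_means)  # approx. [0.813 0.647]
print(array_stds)
```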
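Finally, the `param_results` pattern: because each candidate's parameter dict may lack some keys, every per-parameter column starts as an all-masked array, and only the positions a candidate actually sets become unmasked. The sketch below uses `numpy.ma.MaskedArray` directly rather than the `MaskedArray` the module imports from `sklearn.utils.fixes`, and the parameter names are invented for illustration:

```python
from collections import defaultdict
from functools import partial

import numpy as np
from numpy.ma import MaskedArray

# Three hypothetical candidates; the second one sets no "degree".
candidate_params = [
    {"kernel": "poly", "degree": 2},
    {"kernel": "linear"},
    {"kernel": "poly", "degree": 3},
]
n_candidates = len(candidate_params)

# Each new key defaults to an all-masked object array of length n_candidates.
param_results = defaultdict(
    partial(MaskedArray, np.empty(n_candidates), mask=True, dtype=object)
)
for cand_i, params in enumerate(candidate_params):
    for name, value in params.items():
        # Assigning to a position also clears its mask.
        param_results["param_%s" % name][cand_i] = value

print(param_results["param_degree"])  # [2 -- 3]
```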
