| field | value | date |
|---|---|---|
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-01-15 12:21:24 +0000 |
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-01-15 12:21:24 +0000 |
| commit | d1ddd504802072d930170b802d2cf98fb309cd46 (patch) | |
| tree | 2421ba88a37d686eca467cf85960ab7da4ae991a /gensvm | |
| parent | Move wrapper to better folder structure (diff) | |
| download | pygensvm-d1ddd504802072d930170b802d2cf98fb309cd46.tar.gz, pygensvm-d1ddd504802072d930170b802d2cf98fb309cd46.zip | |
Code formatting with Black
Diffstat (limited to 'gensvm')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gensvm/__init__.py | 2 |
| -rw-r--r-- | gensvm/core.py | 179 |
| -rw-r--r-- | gensvm/gridsearch.py | 251 |
| -rw-r--r-- | gensvm/sklearn_util.py | 148 |
| -rw-r--r-- | gensvm/util.py | 2 |
5 files changed, 370 insertions, 212 deletions
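The commit records only the reformatted output, not the Black invocation that produced it. The wrapping at roughly 79 columns and the single-to-double quote normalization are consistent with running Black with a non-default line length, but the exact version and configuration are not part of this diff. A minimal sketch using Black's Python API is shown below; note that Black releases from around the date of this commit took a `line_length` argument on `format_str` instead of a `Mode` object, so the precise call depends on the installed version.

```python
# Hypothetical reproduction of the formatting in this commit -- the exact
# Black version, line length, and configuration are NOT recorded in the diff.
import black

source = "__version__ = '0.1.7'\n"

# Recent Black releases: format_str takes a Mode/FileMode object.
formatted = black.format_str(source, mode=black.FileMode(line_length=79))
print(formatted)  # __version__ = "0.1.7"  (quotes normalized, as in the diff)
```

The command-line equivalent would be along the lines of `black --line-length 79 gensvm/`, again assuming that line length.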
diff --git a/gensvm/__init__.py b/gensvm/__init__.py index 712da42..430b929 100644 --- a/gensvm/__init__.py +++ b/gensvm/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -__version__ = '0.1.7' +__version__ = "0.1.7" from .core import GenSVM from .gridsearch import GenSVMGridSearchCV diff --git a/gensvm/core.py b/gensvm/core.py index 77a3a7f..edd5236 100644 --- a/gensvm/core.py +++ b/gensvm/core.py @@ -21,9 +21,25 @@ from sklearn.utils.validation import check_is_fitted from .cython_wrapper import wrapper -def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma, - coef, degree, kernel_eigen_cutoff, verbose, max_iter, - random_state=None, seed_V=None): +def _fit_gensvm( + X, + y, + n_class, + p, + lmd, + kappa, + epsilon, + weights, + kernel, + gamma, + coef, + degree, + kernel_eigen_cutoff, + verbose, + max_iter, + random_state=None, + seed_V=None, +): # process the random state rnd = check_random_state(random_state) @@ -32,23 +48,41 @@ def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma, wrapper.set_verbosity_wrap(verbose) # convert the weight index - weight_idx = 1 if weights == 'unit' else 2 + weight_idx = 1 if weights == "unit" else 2 # run the actual training raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap( - X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, - coef, degree, kernel_eigen_cutoff, max_iter, - rnd.randint(np.iinfo('i').max), seed_V) + X, + y, + n_class, + p, + lmd, + kappa, + epsilon, + weight_idx, + kernel, + gamma, + coef, + degree, + kernel_eigen_cutoff, + max_iter, + rnd.randint(np.iinfo("i").max), + seed_V, + ) # process output if status_ == 1 and verbose > 0: - warnings.warn("GenSVM optimization prematurely ended due to a " - "incorrect step in the optimization algorithm.", - FitFailedWarning) + warnings.warn( + "GenSVM optimization prematurely ended due to a " + "incorrect step in the optimization algorithm.", + FitFailedWarning, + ) if status_ == 2 and verbose > 0: - warnings.warn("GenSVM failed to converge, increase " - "the number of iterations.", ConvergenceWarning) + warnings.warn( + "GenSVM failed to converge, increase " "the number of iterations.", + ConvergenceWarning, + ) coef_ = raw_coef_[1:, :] intercept_ = raw_coef_[0, :] @@ -141,32 +175,53 @@ class GenSVM(BaseEstimator, ClassifierMixin): """ - def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, - weights='unit', kernel='linear', gamma='auto', coef=1.0, - degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, - max_iter=1e8): + def __init__( + self, + p=1.0, + lmd=1e-5, + kappa=0.0, + epsilon=1e-6, + weights="unit", + kernel="linear", + gamma="auto", + coef=1.0, + degree=2.0, + kernel_eigen_cutoff=1e-8, + verbose=0, + random_state=None, + max_iter=1e8, + ): if not 1.0 <= p <= 2.0: - raise ValueError("Value for p should be within [1, 2]; got p = %r" - % p) + raise ValueError( + "Value for p should be within [1, 2]; got p = %r" % p + ) if not kappa > -1.0: - raise ValueError("Value for kappa should be larger than -1; got " - "kappa = %r" % kappa) + raise ValueError( + "Value for kappa should be larger than -1; got " + "kappa = %r" % kappa + ) if not lmd > 0: - raise ValueError("Value for lmd should be larger than 0; got " - "lmd = %r" % lmd) + raise ValueError( + "Value for lmd should be larger than 0; got " "lmd = %r" % lmd + ) if not epsilon > 0: - raise ValueError("Value for epsilon should be larger than 0; got " - "epsilon = %r" % epsilon) + raise ValueError( + "Value for epsilon should be 
larger than 0; got " + "epsilon = %r" % epsilon + ) if gamma == 0.0: raise ValueError("A gamma value of 0.0 is invalid") - if not weights in ('unit', 'group'): - raise ValueError("Unknown weight parameter specified. Should be " - "'unit' or 'group'; got %r" % weights) - if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'): - raise ValueError("Unknown kernel specified. Should be " - "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel) - + if not weights in ("unit", "group"): + raise ValueError( + "Unknown weight parameter specified. Should be " + "'unit' or 'group'; got %r" % weights + ) + if not kernel in ("linear", "rbf", "poly", "sigmoid"): + raise ValueError( + "Unknown kernel specified. Should be " + "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel + ) self.p = p self.lmd = lmd @@ -182,7 +237,6 @@ class GenSVM(BaseEstimator, ClassifierMixin): self.random_state = random_state self.max_iter = max_iter - def fit(self, X, y, seed_V=None): """Fit the GenSVM model on the given data @@ -219,44 +273,69 @@ class GenSVM(BaseEstimator, ClassifierMixin): Returns self. """ - X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, - order="C") + X, y_org = check_X_y( + X, y, accept_sparse=False, dtype=np.float64, order="C" + ) y_type = type_of_target(y_org) if y_type not in ["binary", "multiclass"]: raise ValueError("Label type not allowed for GenSVM: %r" % y_type) - if self.gamma == 'auto': + if self.gamma == "auto": gamma = 1 / X.shape[1] else: gamma = self.gamma - # This is necessary because GenSVM expects classes to go from 1 to + # This is necessary because GenSVM expects classes to go from 1 to # n_class self.encoder = LabelEncoder() y = self.encoder.fit_transform(y_org) y += 1 n_class = len(np.unique(y)) - if not seed_V is None and self.kernel != 'linear': - warnings.warn("Warm starts are only supported for the " - "linear kernel. The seed_V parameter will be ignored.") + if not seed_V is None and self.kernel != "linear": + warnings.warn( + "Warm starts are only supported for the " + "linear kernel. The seed_V parameter will be ignored." 
+ ) seed_V = None if not seed_V is None: n_samples, n_features = X.shape if seed_V.shape[1] + 1 > n_class: n_class = seed_V.shape[1] - if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 < - n_class): - raise ValueError("Seed V must have shape [%i, %i], " - "but has shape [%i, %i]" % (n_features+1, n_class-1, - seed_V.shape[0], seed_V.shape[1])) - - self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \ - _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa, - self.epsilon, self.weights, self.kernel, gamma, - self.coef, self.degree, self.kernel_eigen_cutoff, - self.verbose, self.max_iter, self.random_state, seed_V) + if seed_V.shape[0] - 1 != n_features or ( + seed_V.shape[1] + 1 < n_class + ): + raise ValueError( + "Seed V must have shape [%i, %i], " + "but has shape [%i, %i]" + % ( + n_features + 1, + n_class - 1, + seed_V.shape[0], + seed_V.shape[1], + ) + ) + + self.coef_, self.intercept_, self.n_iter_, self.n_support_ = _fit_gensvm( + X, + y, + n_class, + self.p, + self.lmd, + self.kappa, + self.epsilon, + self.weights, + self.kernel, + gamma, + self.coef, + self.degree, + self.kernel_eigen_cutoff, + self.verbose, + self.max_iter, + self.random_state, + seed_V, + ) return self def predict(self, X): diff --git a/gensvm/gridsearch.py b/gensvm/gridsearch.py index d5ea31e..dc835f9 100644 --- a/gensvm/gridsearch.py +++ b/gensvm/gridsearch.py @@ -27,29 +27,33 @@ from sklearn.utils.validation import indexable from .cython_wrapper import wrapper from .core import GenSVM -from .sklearn_util import (_skl_format_cv_results, _skl_check_scorers, - _skl_check_is_fitted, _skl_grid_score) +from .sklearn_util import ( + _skl_format_cv_results, + _skl_check_scorers, + _skl_check_is_fitted, + _skl_grid_score, +) def _sort_candidate_params(candidate_params): - if any(('epsilon' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('epsilon'), reverse=True) - if any(('p' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('p')) - if any(('lmd' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('lmd')) - if any(('kappa' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('kappa')) - if any(('weights' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('weights')) - if any(('gamma' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('gamma')) - if any(('degree' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('degree')) - if any(('coef' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('coef')) - if any(('kernel' in p for p in candidate_params)): - candidate_params.sort(key=itemgetter('kernel')) + if any(("epsilon" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("epsilon"), reverse=True) + if any(("p" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("p")) + if any(("lmd" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("lmd")) + if any(("kappa" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("kappa")) + if any(("weights" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("weights")) + if any(("gamma" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("gamma")) + if any(("degree" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("degree")) + if any(("coef" in p for p in candidate_params)): + candidate_params.sort(key=itemgetter("coef")) + if any(("kernel" in 
p for p in candidate_params)): + candidate_params.sort(key=itemgetter("kernel")) def _validate_param_grid(param_grid): @@ -61,32 +65,32 @@ def _validate_param_grid(param_grid): """ # the conditions that the parameters must satisfy conditions = { - 'p': lambda x : 1.0 <= x <= 2.0, - 'kappa': lambda x : x > -1.0, - 'lmd': lambda x : x > 0, - 'epsilon': lambda x : x > 0, - 'gamma' : lambda x : x != 0, - 'weights' : lambda x : x in ['unit', 'group'], - } + "p": lambda x: 1.0 <= x <= 2.0, + "kappa": lambda x: x > -1.0, + "lmd": lambda x: x > 0, + "epsilon": lambda x: x > 0, + "gamma": lambda x: x != 0, + "weights": lambda x: x in ["unit", "group"], + } for param in conditions: if param in param_grid: if not all(map(conditions[param], param_grid[param])): raise ValueError( - "Invalid value in grid for parameter: %s." % (param) - ) + "Invalid value in grid for parameter: %s." % (param) + ) class _MockEstimator(ClassifierMixin): - #This mock estimator facilitates the use of the Scorer class of - #Scikit-Learn. Basically, we want to use the _score function of - #sklearn.model_selection._validation, but we don't keep track of the - #individual estimators in the GenSVM C grid search code. With this wrapper - #we can mock an estimator for the _score function. + # This mock estimator facilitates the use of the Scorer class of + # Scikit-Learn. Basically, we want to use the _score function of + # sklearn.model_selection._validation, but we don't keep track of the + # individual estimators in the GenSVM C grid search code. With this wrapper + # we can mock an estimator for the _score function. - #The ClassifierMixin adds the score method to the estimator. This allows us - #to leave scoring=None as the default to the GenSVMGridSearchCV class and - #ends up using the accuracy_score metric. + # The ClassifierMixin adds the score method to the estimator. This allows us + # to leave scoring=None as the default to the GenSVMGridSearchCV class and + # ends up using the accuracy_score metric. def __init__(self, predictions): self.predictions = predictions @@ -95,11 +99,17 @@ class _MockEstimator(ClassifierMixin): return self.predictions -def _format_results(results, cv_idx, true_y, scorers, iid, - return_train_score=True, - return_n_test_samples=True, - return_times=True, - return_parameters=False): +def _format_results( + results, + cv_idx, + true_y, + scorers, + iid, + return_train_score=True, + return_n_test_samples=True, + return_times=True, + return_parameters=False, +): """Format the results from the grid search Parameters @@ -118,49 +128,52 @@ def _format_results(results, cv_idx, true_y, scorers, iid, """ out = [] - candidate_params = results['params'] + candidate_params = results["params"] n_candidates = len(candidate_params) n_splits = len(np.unique(cv_idx)) is_multimetric = not callable(scorers) - # Out must be a list of dicts of size n_params x n_splits that iterates + # Out must be a list of dicts of size n_params x n_splits that iterates # over the params in the list and for each param iterates over the splits. 
- for param, duration, predictions in zip(results['params'], - results['duration'], results['predictions']): + for param, duration, predictions in zip( + results["params"], results["duration"], results["predictions"] + ): for test_idx in np.unique(cv_idx): ret = [] score_time = 0 if return_train_score: - train_pred = predictions[cv_idx != test_idx, ] - y_train = true_y[cv_idx != test_idx, ] + train_pred = predictions[cv_idx != test_idx,] + y_train = true_y[cv_idx != test_idx,] train_mock = _MockEstimator(train_pred) start_time = time.time() - train_scores = _score(train_mock, None, y_train, scorers, - is_multimetric) + train_scores = _score( + train_mock, None, y_train, scorers, is_multimetric + ) score_time += time.time() - start_time ret.append(train_scores) - test_pred = predictions[cv_idx == test_idx, ] - y_test = true_y[cv_idx == test_idx, ] + test_pred = predictions[cv_idx == test_idx,] + y_test = true_y[cv_idx == test_idx,] test_mock = _MockEstimator(test_pred) start_time = time.time() - test_scores = _score(test_mock, None, y_test, scorers, - is_multimetric) + test_scores = _score( + test_mock, None, y_test, scorers, is_multimetric + ) score_time += time.time() - start_time ret.append(test_scores) if return_n_test_samples: ret.append(len(y_test)) if return_times: - # Note, the C library returns the duration for a task (i.e. all - # splits). The _skkl_format_cv_results() computes the mean of - # the values, which should represent the average time per - # split. To compute this correctly, we here divide by the - # number of splits. Since we calculate the mean later, the mean - # is still correct, but this is not the exact fit_time for this + # Note, the C library returns the duration for a task (i.e. all + # splits). The _skkl_format_cv_results() computes the mean of + # the values, which should represent the average time per + # split. To compute this correctly, we here divide by the + # number of splits. Since we calculate the mean later, the mean + # is still correct, but this is not the exact fit_time for this # fold. fit_time = duration / n_splits ret.extend([fit_time, score_time]) @@ -169,14 +182,31 @@ def _format_results(results, cv_idx, true_y, scorers, iid, out.append(ret) - cv_results_ = _skl_format_cv_results(out, return_train_score, - candidate_params, n_candidates, n_splits, scorers, iid) + cv_results_ = _skl_format_cv_results( + out, + return_train_score, + candidate_params, + n_candidates, + n_splits, + scorers, + iid, + ) return cv_results_ -def _fit_grid_gensvm(X, y, groups, candidate_params, scorers, cv, refit, - verbose, return_train_score, iid): +def _fit_grid_gensvm( + X, + y, + groups, + candidate_params, + scorers, + cv, + refit, + verbose, + return_train_score, + iid, +): """Utility function for fitting the grid search for GenSVM This function sorts the parameter grid for optimal computation speed, sets @@ -193,35 +223,30 @@ def _fit_grid_gensvm(X, y, groups, candidate_params, scorers, cv, refit, """ # sort the candidate params - # the optimal order of the parameters from inner to outer loop is: epsilon, + # the optimal order of the parameters from inner to outer loop is: epsilon, # p, lambda, kappa, weights, kernel, ??? _sort_candidate_params(candidate_params) # set the verbosity in GenSVM wrapper.set_verbosity_wrap(verbose) - # NOTE: The C library can compute the accuracy score and destroy the exact - # predictions, but this doesn't allow us to compute the score per fold. 
So + # NOTE: The C library can compute the accuracy score and destroy the exact + # predictions, but this doesn't allow us to compute the score per fold. So # we always want to get the raw predictions for each grid point. store_predictions = True # Convert the cv variable to a cv_idx array cv = check_cv(cv, y, classifier=True) n_folds = cv.get_n_splits(X, y, groups) - cv_idx = np.zeros((X.shape[0], ), dtype=np.int_) - 1 + cv_idx = np.zeros((X.shape[0],), dtype=np.int_) - 1 fold_idx = 0 for train, test in cv.split(X, y, groups): - cv_idx[test, ] = fold_idx + cv_idx[test,] = fold_idx fold_idx += 1 results_ = wrapper.grid_wrap( - X, - y, - candidate_params, - int(store_predictions), - cv_idx, - int(n_folds), - ) + X, y, candidate_params, int(store_predictions), cv_idx, int(n_folds) + ) cv_results_ = _format_results(results_, cv_idx, y, scorers, iid) return cv_results_, n_folds @@ -449,8 +474,16 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html """ - def __init__(self, param_grid, scoring=None, iid=True, cv=None, refit=True, - verbose=0, return_train_score=True): + def __init__( + self, + param_grid, + scoring=None, + iid=True, + cv=None, + refit=True, + verbose=0, + return_train_score=True, + ): self.param_grid = param_grid _check_param_grid(self.param_grid) @@ -466,7 +499,6 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): def _get_param_iterator(self): return ParameterGrid(self.param_grid) - def fit(self, X, y, groups=None): """Run GenSVM grid search with all sets of parameters @@ -491,14 +523,15 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): """ - X, y_orig = check_X_y(X, y, accept_sparse=False, dtype=np.float64, - order="C") + X, y_orig = check_X_y( + X, y, accept_sparse=False, dtype=np.float64, order="C" + ) y_type = type_of_target(y_orig) if y_type not in ["binary", "multiclass"]: raise ValueError("Label type not allowed for GenSVM: %r" % y_type) - # This is necessary because GenSVM expects classes to go from 1 to + # This is necessary because GenSVM expects classes to go from 1 to # n_class self.encoder = LabelEncoder() y = self.encoder.fit_transform(y_orig) @@ -507,13 +540,23 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): candidate_params = list(self._get_param_iterator()) scorers, self.multimetric_, refit_metric = _skl_check_scorers( - self.scoring, self.refit) + self.scoring, self.refit + ) X, y, groups = indexable(X, y, groups) - results, n_splits = _fit_grid_gensvm(X, y, groups, candidate_params, - scorers, self.cv, self.refit, self.verbose, - self.return_train_score, self.iid) + results, n_splits = _fit_grid_gensvm( + X, + y, + groups, + candidate_params, + scorers, + self.cv, + self.refit, + self.verbose, + self.return_train_score, + self.iid, + ) self.cv_results_ = results @@ -524,23 +567,23 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): self.best_index_ = results["rank_test_%s" % refit_metric].argmin() self.best_params_ = candidate_params[self.best_index_] self.best_score_ = results["mean_test_%s" % refit_metric][ - self.best_index_] + self.best_index_ + ] if self.refit: self.best_estimator_ = GenSVM(**self.best_params_) - # y_orig because GenSVM fit must know the conversion for predict to + # y_orig because GenSVM fit must know the conversion for predict to # work correctly self.best_estimator_.fit(X, y_orig) ## Store the only scorer not as a dict for single metric evaluation - self.scorer_ = scorers if self.multimetric_ 
else scorers['score'] + self.scorer_ = scorers if self.multimetric_ else scorers["score"] self.cv_results_ = results self.n_splits_ = n_splits return self - def score(self, X, y): """Compute the score on the test data given the true labels @@ -559,9 +602,15 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): score : float """ - _skl_check_is_fitted(self, 'score', self.refit) - return _skl_grid_score(X, y, self.scorer_, self.best_estimator_, - self.refit, self.multimetric_) + _skl_check_is_fitted(self, "score", self.refit) + return _skl_grid_score( + X, + y, + self.scorer_, + self.best_estimator_, + self.refit, + self.multimetric_, + ) def predict(self, X): """Predict the class labels on the test data @@ -578,7 +627,7 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin): Predicted class labels of the data in X. """ - _skl_check_is_fitted(self, 'predict', self.refit) + _skl_check_is_fitted(self, "predict", self.refit) return self.best_estimator_.predict(X) @@ -609,11 +658,11 @@ def load_default_grid(): used as input for the :class:`.GenSVMGridSearchCV` class. """ pg = { - 'lmd': [pow(2, x) for x in range(-18, 19, 2)], - 'kappa': [-0.9, 0.5, 5.0], - 'p': [1.0, 1.5, 2.0], - 'weights': ['unit', 'group'], - 'epsilon': [1e-8], - 'kernel': ['linear'] - } + "lmd": [pow(2, x) for x in range(-18, 19, 2)], + "kappa": [-0.9, 0.5, 5.0], + "p": [1.0, 1.5, 2.0], + "weights": ["unit", "group"], + "epsilon": [1e-8], + "kernel": ["linear"], + } return pg diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py index 0829bf6..05d9618 100644 --- a/gensvm/sklearn_util.py +++ b/gensvm/sklearn_util.py @@ -68,16 +68,30 @@ from sklearn.model_selection._validation import _aggregate_score_dicts from sklearn.utils.fixes import MaskedArray from sklearn.utils.validation import check_is_fitted -def _skl_format_cv_results(out, return_train_score, candidate_params, - n_candidates, n_splits, scorers, iid): + +def _skl_format_cv_results( + out, + return_train_score, + candidate_params, + n_candidates, + n_splits, + scorers, + iid, +): # if one choose to see train score, "out" will contain train score info if return_train_score: - (train_score_dicts, test_score_dicts, test_sample_counts, fit_time, - score_time) = zip(*out) + ( + train_score_dicts, + test_score_dicts, + test_sample_counts, + fit_time, + score_time, + ) = zip(*out) else: - (test_score_dicts, test_sample_counts, fit_time, - score_time) = zip(*out) + (test_score_dicts, test_sample_counts, fit_time, score_time) = zip( + *out + ) # test_score_dicts and train_score dicts are lists of dictionaries and # we make them into dict of lists @@ -91,35 +105,39 @@ def _skl_format_cv_results(out, return_train_score, candidate_params, """A small helper to store the scores/times to the cv_results_""" # When iterated first by splits, then by parameters # We want `array` to have `n_candidates` rows and `n_splits` cols. 
- array = np.array(array, dtype=np.float64).reshape(n_candidates, - n_splits) + array = np.array(array, dtype=np.float64).reshape( + n_candidates, n_splits + ) if splits: for split_i in range(n_splits): # Uses closure to alter the results - results["split%d_%s" - % (split_i, key_name)] = array[:, split_i] + results["split%d_%s" % (split_i, key_name)] = array[:, split_i] array_means = np.average(array, axis=1, weights=weights) - results['mean_%s' % key_name] = array_means + results["mean_%s" % key_name] = array_means # Weighted std is not directly available in numpy - array_stds = np.sqrt(np.average((array - - array_means[:, np.newaxis]) ** 2, - axis=1, weights=weights)) - results['std_%s' % key_name] = array_stds + array_stds = np.sqrt( + np.average( + (array - array_means[:, np.newaxis]) ** 2, + axis=1, + weights=weights, + ) + ) + results["std_%s" % key_name] = array_stds if rank: results["rank_%s" % key_name] = np.asarray( - get_ranks(-array_means), dtype=np.int32) + get_ranks(-array_means), dtype=np.int32 + ) - _store('fit_time', fit_time) - _store('score_time', score_time) + _store("fit_time", fit_time) + _store("score_time", score_time) # Use one MaskedArray and mask all the places where the param is not # applicable for that candidate. Use defaultdict as each candidate may # not contain all the params - param_results = defaultdict(partial(MaskedArray, - np.empty(n_candidates,), - mask=True, - dtype=object)) + param_results = defaultdict( + partial(MaskedArray, np.empty(n_candidates), mask=True, dtype=object) + ) for cand_i, params in enumerate(candidate_params): for name, value in params.items(): # An all masked empty array gets created for the key @@ -129,19 +147,25 @@ def _skl_format_cv_results(out, return_train_score, candidate_params, results.update(param_results) # Store a list of param dicts at the key 'params' - results['params'] = candidate_params + results["params"] = candidate_params # NOTE test_sample counts (weights) remain the same for all candidates - test_sample_counts = np.array(test_sample_counts[:n_splits], - dtype=np.int) + test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) for scorer_name in scorers.keys(): # Computed the (weighted) mean and std for test scores alone - _store('test_%s' % scorer_name, test_scores[scorer_name], - splits=True, rank=True, - weights=test_sample_counts if iid else None) + _store( + "test_%s" % scorer_name, + test_scores[scorer_name], + splits=True, + rank=True, + weights=test_sample_counts if iid else None, + ) if return_train_score: - _store('train_%s' % scorer_name, train_scores[scorer_name], - splits=True) + _store( + "train_%s" % scorer_name, + train_scores[scorer_name], + splits=True, + ) return results @@ -149,44 +173,49 @@ def _skl_format_cv_results(out, return_train_score, candidate_params, def _skl_check_scorers(scoring, refit): scorers, multimetric_ = _check_multimetric_scoring( - GenSVM(), scoring=scoring) + GenSVM(), scoring=scoring + ) if multimetric_: if refit is not False and ( - not isinstance(refit, six.string_types) or - # This will work for both dict / list (tuple) - refit not in scorers): - raise ValueError("For multi-metric scoring, the parameter " - "refit must be set to a scorer key " - "to refit an estimator with the best " - "parameter setting on the whole data and " - "make the best_* attributes " - "available for that metric. If this is not " - "needed, refit should be set to False " - "explicitly. %r was passed." 
% refit) + not isinstance(refit, six.string_types) + or + # This will work for both dict / list (tuple) + refit not in scorers + ): + raise ValueError( + "For multi-metric scoring, the parameter " + "refit must be set to a scorer key " + "to refit an estimator with the best " + "parameter setting on the whole data and " + "make the best_* attributes " + "available for that metric. If this is not " + "needed, refit should be set to False " + "explicitly. %r was passed." % refit + ) else: refit_metric = refit else: - refit_metric = 'score' + refit_metric = "score" return scorers, multimetric_, refit_metric def _skl_check_is_fitted(estimator, method_name, refit): if not refit: - raise NotFittedError('This %s instance was initialized ' - 'with refit=False. %s is ' - 'available only after refitting on the best ' - 'parameters. You can refit an estimator ' - 'manually using the ``best_parameters_`` ' - 'attribute' - % (type(estimator).__name__, method_name)) + raise NotFittedError( + "This %s instance was initialized " + "with refit=False. %s is " + "available only after refitting on the best " + "parameters. You can refit an estimator " + "manually using the ``best_parameters_`` " + "attribute" % (type(estimator).__name__, method_name) + ) else: - check_is_fitted(estimator, 'best_estimator_') - + check_is_fitted(estimator, "best_estimator_") def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_): - """Returns the score on the given data, if the estimator has been + """Returns the score on the given data, if the estimator has been refit. This uses the score defined by ``scoring`` where provided, and the @@ -206,9 +235,10 @@ def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_): ------- score : float """ - if scorer_ is None: - raise ValueError("No score function explicitly defined, " - "and the estimator doesn't provide one %s" - % best_estimator_) - score = scorer_[refit] if multimetric_ else scorer_ - return score(best_estimator_, X, y) + if scorer_ is None: + raise ValueError( + "No score function explicitly defined, " + "and the estimator doesn't provide one %s" % best_estimator_ + ) + score = scorer_[refit] if multimetric_ else scorer_ + return score(best_estimator_, X, y) diff --git a/gensvm/util.py b/gensvm/util.py index 8d2a3e4..0b7cd1d 100644 --- a/gensvm/util.py +++ b/gensvm/util.py @@ -24,7 +24,7 @@ def get_ranks(x): x = np.ravel(np.asarray(x)) l = len(x) r = 1 - ranks = np.zeros((l, )) + ranks = np.zeros((l,)) while not all([k is None for k in x]): m = min([k for k in x if not k is None]) idx = [1 if k == m else 0 for k in x] |
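For orientation, the public surface touched by this reformatting (GenSVM, GenSVMGridSearchCV, and grid parameter names such as `p`, `lmd`, `kappa`, `weights`) is unchanged in behaviour. A minimal usage sketch follows, assuming the package API shown in the diff; the synthetic data and the small grid are illustrative assumptions, not taken from the repository.

```python
# Illustrative sketch only; dataset and grid values are assumptions.
import numpy as np
from gensvm import GenSVM, GenSVMGridSearchCV

rng = np.random.RandomState(42)
X = rng.rand(150, 4)
y = rng.randint(0, 3, size=150)  # three classes, since GenSVM is a multiclass SVM

# A small grid using parameter names that appear in this diff.
param_grid = {"p": [1.0, 2.0], "lmd": [1e-4, 1e-2], "kappa": [0.0, 5.0]}

search = GenSVMGridSearchCV(param_grid, cv=3, refit=True)
search.fit(X, y)
print(search.best_params_, search.best_score_)
print(search.predict(X[:5]))

# Single-model fit with explicit hyperparameters, mirroring GenSVM.__init__.
clf = GenSVM(p=1.5, lmd=1e-3, kappa=0.0).fit(X, y)
print(clf.predict(X[:5]))
```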
