path: root/gensvm
author     Gertjan van den Burg <gertjanvandenburg@gmail.com>  2019-01-15 12:21:24 +0000
committer  Gertjan van den Burg <gertjanvandenburg@gmail.com>  2019-01-15 12:21:24 +0000
commit     d1ddd504802072d930170b802d2cf98fb309cd46 (patch)
tree       2421ba88a37d686eca467cf85960ab7da4ae991a /gensvm
parent     Move wrapper to better folder structure (diff)
download   pygensvm-d1ddd504802072d930170b802d2cf98fb309cd46.tar.gz
           pygensvm-d1ddd504802072d930170b802d2cf98fb309cd46.zip
Code formatting with Black
Diffstat (limited to 'gensvm')
-rw-r--r--  gensvm/__init__.py        2
-rw-r--r--  gensvm/core.py          179
-rw-r--r--  gensvm/gridsearch.py    251
-rw-r--r--  gensvm/sklearn_util.py  148
-rw-r--r--  gensvm/util.py            2
5 files changed, 370 insertions, 212 deletions
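
The commit records only the reformatted output; the changes below are mostly string-quote normalization and call/signature re-wrapping. A minimal sketch of reproducing this style of reformatting through Black's Python API follows; the Black version and line length used for this commit are not recorded here, so the 79-character limit is an assumption inferred from where the rewrapped lines below break.

# Hypothetical reproduction of the reformatting in this commit (not part of the repo).
# Assumes a Black release that exposes FileMode/format_str; line length is a guess.
import black

source = 'def f(a,b):\n    return {"a":a,"b":b}\n'
formatted = black.format_str(source, mode=black.FileMode(line_length=79))
print(formatted)
# def f(a, b):
#     return {"a": a, "b": b}
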
diff --git a/gensvm/__init__.py b/gensvm/__init__.py
index 712da42..430b929 100644
--- a/gensvm/__init__.py
+++ b/gensvm/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-__version__ = '0.1.7'
+__version__ = "0.1.7"
from .core import GenSVM
from .gridsearch import GenSVMGridSearchCV
diff --git a/gensvm/core.py b/gensvm/core.py
index 77a3a7f..edd5236 100644
--- a/gensvm/core.py
+++ b/gensvm/core.py
@@ -21,9 +21,25 @@ from sklearn.utils.validation import check_is_fitted
from .cython_wrapper import wrapper
-def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma,
- coef, degree, kernel_eigen_cutoff, verbose, max_iter,
- random_state=None, seed_V=None):
+def _fit_gensvm(
+ X,
+ y,
+ n_class,
+ p,
+ lmd,
+ kappa,
+ epsilon,
+ weights,
+ kernel,
+ gamma,
+ coef,
+ degree,
+ kernel_eigen_cutoff,
+ verbose,
+ max_iter,
+ random_state=None,
+ seed_V=None,
+):
# process the random state
rnd = check_random_state(random_state)
@@ -32,23 +48,41 @@ def _fit_gensvm(X, y, n_class, p, lmd, kappa, epsilon, weights, kernel, gamma,
wrapper.set_verbosity_wrap(verbose)
# convert the weight index
- weight_idx = 1 if weights == 'unit' else 2
+ weight_idx = 1 if weights == "unit" else 2
# run the actual training
raw_coef_, n_SV_, n_iter_, training_error_, status_ = wrapper.train_wrap(
- X, y, n_class, p, lmd, kappa, epsilon, weight_idx, kernel, gamma,
- coef, degree, kernel_eigen_cutoff, max_iter,
- rnd.randint(np.iinfo('i').max), seed_V)
+ X,
+ y,
+ n_class,
+ p,
+ lmd,
+ kappa,
+ epsilon,
+ weight_idx,
+ kernel,
+ gamma,
+ coef,
+ degree,
+ kernel_eigen_cutoff,
+ max_iter,
+ rnd.randint(np.iinfo("i").max),
+ seed_V,
+ )
# process output
if status_ == 1 and verbose > 0:
- warnings.warn("GenSVM optimization prematurely ended due to a "
- "incorrect step in the optimization algorithm.",
- FitFailedWarning)
+ warnings.warn(
+ "GenSVM optimization prematurely ended due to a "
+ "incorrect step in the optimization algorithm.",
+ FitFailedWarning,
+ )
if status_ == 2 and verbose > 0:
- warnings.warn("GenSVM failed to converge, increase "
- "the number of iterations.", ConvergenceWarning)
+ warnings.warn(
+ "GenSVM failed to converge, increase " "the number of iterations.",
+ ConvergenceWarning,
+ )
coef_ = raw_coef_[1:, :]
intercept_ = raw_coef_[0, :]
@@ -141,32 +175,53 @@ class GenSVM(BaseEstimator, ClassifierMixin):
"""
- def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6,
- weights='unit', kernel='linear', gamma='auto', coef=1.0,
- degree=2.0, kernel_eigen_cutoff=1e-8, verbose=0, random_state=None,
- max_iter=1e8):
+ def __init__(
+ self,
+ p=1.0,
+ lmd=1e-5,
+ kappa=0.0,
+ epsilon=1e-6,
+ weights="unit",
+ kernel="linear",
+ gamma="auto",
+ coef=1.0,
+ degree=2.0,
+ kernel_eigen_cutoff=1e-8,
+ verbose=0,
+ random_state=None,
+ max_iter=1e8,
+ ):
if not 1.0 <= p <= 2.0:
- raise ValueError("Value for p should be within [1, 2]; got p = %r"
- % p)
+ raise ValueError(
+ "Value for p should be within [1, 2]; got p = %r" % p
+ )
if not kappa > -1.0:
- raise ValueError("Value for kappa should be larger than -1; got "
- "kappa = %r" % kappa)
+ raise ValueError(
+ "Value for kappa should be larger than -1; got "
+ "kappa = %r" % kappa
+ )
if not lmd > 0:
- raise ValueError("Value for lmd should be larger than 0; got "
- "lmd = %r" % lmd)
+ raise ValueError(
+ "Value for lmd should be larger than 0; got " "lmd = %r" % lmd
+ )
if not epsilon > 0:
- raise ValueError("Value for epsilon should be larger than 0; got "
- "epsilon = %r" % epsilon)
+ raise ValueError(
+ "Value for epsilon should be larger than 0; got "
+ "epsilon = %r" % epsilon
+ )
if gamma == 0.0:
raise ValueError("A gamma value of 0.0 is invalid")
- if not weights in ('unit', 'group'):
- raise ValueError("Unknown weight parameter specified. Should be "
- "'unit' or 'group'; got %r" % weights)
- if not kernel in ('linear', 'rbf', 'poly', 'sigmoid'):
- raise ValueError("Unknown kernel specified. Should be "
- "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel)
-
+ if not weights in ("unit", "group"):
+ raise ValueError(
+ "Unknown weight parameter specified. Should be "
+ "'unit' or 'group'; got %r" % weights
+ )
+ if not kernel in ("linear", "rbf", "poly", "sigmoid"):
+ raise ValueError(
+ "Unknown kernel specified. Should be "
+ "'linear', 'rbf', 'poly', or 'sigmoid'; got %r" % kernel
+ )
self.p = p
self.lmd = lmd
@@ -182,7 +237,6 @@ class GenSVM(BaseEstimator, ClassifierMixin):
self.random_state = random_state
self.max_iter = max_iter
-
def fit(self, X, y, seed_V=None):
"""Fit the GenSVM model on the given data
@@ -219,44 +273,69 @@ class GenSVM(BaseEstimator, ClassifierMixin):
Returns self.
"""
- X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64,
- order="C")
+ X, y_org = check_X_y(
+ X, y, accept_sparse=False, dtype=np.float64, order="C"
+ )
y_type = type_of_target(y_org)
if y_type not in ["binary", "multiclass"]:
raise ValueError("Label type not allowed for GenSVM: %r" % y_type)
- if self.gamma == 'auto':
+ if self.gamma == "auto":
gamma = 1 / X.shape[1]
else:
gamma = self.gamma
- # This is necessary because GenSVM expects classes to go from 1 to
+ # This is necessary because GenSVM expects classes to go from 1 to
# n_class
self.encoder = LabelEncoder()
y = self.encoder.fit_transform(y_org)
y += 1
n_class = len(np.unique(y))
- if not seed_V is None and self.kernel != 'linear':
- warnings.warn("Warm starts are only supported for the "
- "linear kernel. The seed_V parameter will be ignored.")
+ if not seed_V is None and self.kernel != "linear":
+ warnings.warn(
+ "Warm starts are only supported for the "
+ "linear kernel. The seed_V parameter will be ignored."
+ )
seed_V = None
if not seed_V is None:
n_samples, n_features = X.shape
if seed_V.shape[1] + 1 > n_class:
n_class = seed_V.shape[1]
- if seed_V.shape[0] - 1 != n_features or (seed_V.shape[1] + 1 <
- n_class):
- raise ValueError("Seed V must have shape [%i, %i], "
- "but has shape [%i, %i]" % (n_features+1, n_class-1,
- seed_V.shape[0], seed_V.shape[1]))
-
- self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \
- _fit_gensvm(X, y, n_class, self.p, self.lmd, self.kappa,
- self.epsilon, self.weights, self.kernel, gamma,
- self.coef, self.degree, self.kernel_eigen_cutoff,
- self.verbose, self.max_iter, self.random_state, seed_V)
+ if seed_V.shape[0] - 1 != n_features or (
+ seed_V.shape[1] + 1 < n_class
+ ):
+ raise ValueError(
+ "Seed V must have shape [%i, %i], "
+ "but has shape [%i, %i]"
+ % (
+ n_features + 1,
+ n_class - 1,
+ seed_V.shape[0],
+ seed_V.shape[1],
+ )
+ )
+
+ self.coef_, self.intercept_, self.n_iter_, self.n_support_ = _fit_gensvm(
+ X,
+ y,
+ n_class,
+ self.p,
+ self.lmd,
+ self.kappa,
+ self.epsilon,
+ self.weights,
+ self.kernel,
+ gamma,
+ self.coef,
+ self.degree,
+ self.kernel_eigen_cutoff,
+ self.verbose,
+ self.max_iter,
+ self.random_state,
+ seed_V,
+ )
return self
def predict(self, X):
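
The core.py hunks above only re-wrap the GenSVM estimator without changing behaviour. For orientation, here is a minimal usage sketch based on the constructor, fit, and predict signatures visible in this diff; the dataset and the parameter values are arbitrary illustrations, not part of the commit.

# Usage sketch of the estimator reformatted above; parameter names come from the
# __init__ signature in this diff, the iris data is only a stand-in example.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from gensvm import GenSVM

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = GenSVM(p=1.5, lmd=1e-5, kappa=0.0, weights="unit", kernel="linear")
clf.fit(X_train, y_train)
print("accuracy:", np.mean(clf.predict(X_test) == y_test))
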
diff --git a/gensvm/gridsearch.py b/gensvm/gridsearch.py
index d5ea31e..dc835f9 100644
--- a/gensvm/gridsearch.py
+++ b/gensvm/gridsearch.py
@@ -27,29 +27,33 @@ from sklearn.utils.validation import indexable
from .cython_wrapper import wrapper
from .core import GenSVM
-from .sklearn_util import (_skl_format_cv_results, _skl_check_scorers,
- _skl_check_is_fitted, _skl_grid_score)
+from .sklearn_util import (
+ _skl_format_cv_results,
+ _skl_check_scorers,
+ _skl_check_is_fitted,
+ _skl_grid_score,
+)
def _sort_candidate_params(candidate_params):
- if any(('epsilon' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('epsilon'), reverse=True)
- if any(('p' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('p'))
- if any(('lmd' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('lmd'))
- if any(('kappa' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('kappa'))
- if any(('weights' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('weights'))
- if any(('gamma' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('gamma'))
- if any(('degree' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('degree'))
- if any(('coef' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('coef'))
- if any(('kernel' in p for p in candidate_params)):
- candidate_params.sort(key=itemgetter('kernel'))
+ if any(("epsilon" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("epsilon"), reverse=True)
+ if any(("p" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("p"))
+ if any(("lmd" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("lmd"))
+ if any(("kappa" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("kappa"))
+ if any(("weights" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("weights"))
+ if any(("gamma" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("gamma"))
+ if any(("degree" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("degree"))
+ if any(("coef" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("coef"))
+ if any(("kernel" in p for p in candidate_params)):
+ candidate_params.sort(key=itemgetter("kernel"))
def _validate_param_grid(param_grid):
@@ -61,32 +65,32 @@ def _validate_param_grid(param_grid):
"""
# the conditions that the parameters must satisfy
conditions = {
- 'p': lambda x : 1.0 <= x <= 2.0,
- 'kappa': lambda x : x > -1.0,
- 'lmd': lambda x : x > 0,
- 'epsilon': lambda x : x > 0,
- 'gamma' : lambda x : x != 0,
- 'weights' : lambda x : x in ['unit', 'group'],
- }
+ "p": lambda x: 1.0 <= x <= 2.0,
+ "kappa": lambda x: x > -1.0,
+ "lmd": lambda x: x > 0,
+ "epsilon": lambda x: x > 0,
+ "gamma": lambda x: x != 0,
+ "weights": lambda x: x in ["unit", "group"],
+ }
for param in conditions:
if param in param_grid:
if not all(map(conditions[param], param_grid[param])):
raise ValueError(
- "Invalid value in grid for parameter: %s." % (param)
- )
+ "Invalid value in grid for parameter: %s." % (param)
+ )
class _MockEstimator(ClassifierMixin):
- #This mock estimator facilitates the use of the Scorer class of
- #Scikit-Learn. Basically, we want to use the _score function of
- #sklearn.model_selection._validation, but we don't keep track of the
- #individual estimators in the GenSVM C grid search code. With this wrapper
- #we can mock an estimator for the _score function.
+ # This mock estimator facilitates the use of the Scorer class of
+ # Scikit-Learn. Basically, we want to use the _score function of
+ # sklearn.model_selection._validation, but we don't keep track of the
+ # individual estimators in the GenSVM C grid search code. With this wrapper
+ # we can mock an estimator for the _score function.
- #The ClassifierMixin adds the score method to the estimator. This allows us
- #to leave scoring=None as the default to the GenSVMGridSearchCV class and
- #ends up using the accuracy_score metric.
+ # The ClassifierMixin adds the score method to the estimator. This allows us
+ # to leave scoring=None as the default to the GenSVMGridSearchCV class and
+ # ends up using the accuracy_score metric.
def __init__(self, predictions):
self.predictions = predictions
@@ -95,11 +99,17 @@ class _MockEstimator(ClassifierMixin):
return self.predictions
-def _format_results(results, cv_idx, true_y, scorers, iid,
- return_train_score=True,
- return_n_test_samples=True,
- return_times=True,
- return_parameters=False):
+def _format_results(
+ results,
+ cv_idx,
+ true_y,
+ scorers,
+ iid,
+ return_train_score=True,
+ return_n_test_samples=True,
+ return_times=True,
+ return_parameters=False,
+):
"""Format the results from the grid search
Parameters
@@ -118,49 +128,52 @@ def _format_results(results, cv_idx, true_y, scorers, iid,
"""
out = []
- candidate_params = results['params']
+ candidate_params = results["params"]
n_candidates = len(candidate_params)
n_splits = len(np.unique(cv_idx))
is_multimetric = not callable(scorers)
- # Out must be a list of dicts of size n_params x n_splits that iterates
+ # Out must be a list of dicts of size n_params x n_splits that iterates
# over the params in the list and for each param iterates over the splits.
- for param, duration, predictions in zip(results['params'],
- results['duration'], results['predictions']):
+ for param, duration, predictions in zip(
+ results["params"], results["duration"], results["predictions"]
+ ):
for test_idx in np.unique(cv_idx):
ret = []
score_time = 0
if return_train_score:
- train_pred = predictions[cv_idx != test_idx, ]
- y_train = true_y[cv_idx != test_idx, ]
+ train_pred = predictions[cv_idx != test_idx,]
+ y_train = true_y[cv_idx != test_idx,]
train_mock = _MockEstimator(train_pred)
start_time = time.time()
- train_scores = _score(train_mock, None, y_train, scorers,
- is_multimetric)
+ train_scores = _score(
+ train_mock, None, y_train, scorers, is_multimetric
+ )
score_time += time.time() - start_time
ret.append(train_scores)
- test_pred = predictions[cv_idx == test_idx, ]
- y_test = true_y[cv_idx == test_idx, ]
+ test_pred = predictions[cv_idx == test_idx,]
+ y_test = true_y[cv_idx == test_idx,]
test_mock = _MockEstimator(test_pred)
start_time = time.time()
- test_scores = _score(test_mock, None, y_test, scorers,
- is_multimetric)
+ test_scores = _score(
+ test_mock, None, y_test, scorers, is_multimetric
+ )
score_time += time.time() - start_time
ret.append(test_scores)
if return_n_test_samples:
ret.append(len(y_test))
if return_times:
- # Note, the C library returns the duration for a task (i.e. all
- # splits). The _skkl_format_cv_results() computes the mean of
- # the values, which should represent the average time per
- # split. To compute this correctly, we here divide by the
- # number of splits. Since we calculate the mean later, the mean
- # is still correct, but this is not the exact fit_time for this
+ # Note, the C library returns the duration for a task (i.e. all
+ # splits). The _skkl_format_cv_results() computes the mean of
+ # the values, which should represent the average time per
+ # split. To compute this correctly, we here divide by the
+ # number of splits. Since we calculate the mean later, the mean
+ # is still correct, but this is not the exact fit_time for this
# fold.
fit_time = duration / n_splits
ret.extend([fit_time, score_time])
@@ -169,14 +182,31 @@ def _format_results(results, cv_idx, true_y, scorers, iid,
out.append(ret)
- cv_results_ = _skl_format_cv_results(out, return_train_score,
- candidate_params, n_candidates, n_splits, scorers, iid)
+ cv_results_ = _skl_format_cv_results(
+ out,
+ return_train_score,
+ candidate_params,
+ n_candidates,
+ n_splits,
+ scorers,
+ iid,
+ )
return cv_results_
-def _fit_grid_gensvm(X, y, groups, candidate_params, scorers, cv, refit,
- verbose, return_train_score, iid):
+def _fit_grid_gensvm(
+ X,
+ y,
+ groups,
+ candidate_params,
+ scorers,
+ cv,
+ refit,
+ verbose,
+ return_train_score,
+ iid,
+):
"""Utility function for fitting the grid search for GenSVM
This function sorts the parameter grid for optimal computation speed, sets
@@ -193,35 +223,30 @@ def _fit_grid_gensvm(X, y, groups, candidate_params, scorers, cv, refit,
"""
# sort the candidate params
- # the optimal order of the parameters from inner to outer loop is: epsilon,
+ # the optimal order of the parameters from inner to outer loop is: epsilon,
# p, lambda, kappa, weights, kernel, ???
_sort_candidate_params(candidate_params)
# set the verbosity in GenSVM
wrapper.set_verbosity_wrap(verbose)
- # NOTE: The C library can compute the accuracy score and destroy the exact
- # predictions, but this doesn't allow us to compute the score per fold. So
+ # NOTE: The C library can compute the accuracy score and destroy the exact
+ # predictions, but this doesn't allow us to compute the score per fold. So
# we always want to get the raw predictions for each grid point.
store_predictions = True
# Convert the cv variable to a cv_idx array
cv = check_cv(cv, y, classifier=True)
n_folds = cv.get_n_splits(X, y, groups)
- cv_idx = np.zeros((X.shape[0], ), dtype=np.int_) - 1
+ cv_idx = np.zeros((X.shape[0],), dtype=np.int_) - 1
fold_idx = 0
for train, test in cv.split(X, y, groups):
- cv_idx[test, ] = fold_idx
+ cv_idx[test,] = fold_idx
fold_idx += 1
results_ = wrapper.grid_wrap(
- X,
- y,
- candidate_params,
- int(store_predictions),
- cv_idx,
- int(n_folds),
- )
+ X, y, candidate_params, int(store_predictions), cv_idx, int(n_folds)
+ )
cv_results_ = _format_results(results_, cv_idx, y, scorers, iid)
return cv_results_, n_folds
@@ -449,8 +474,16 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
"""
- def __init__(self, param_grid, scoring=None, iid=True, cv=None, refit=True,
- verbose=0, return_train_score=True):
+ def __init__(
+ self,
+ param_grid,
+ scoring=None,
+ iid=True,
+ cv=None,
+ refit=True,
+ verbose=0,
+ return_train_score=True,
+ ):
self.param_grid = param_grid
_check_param_grid(self.param_grid)
@@ -466,7 +499,6 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
def _get_param_iterator(self):
return ParameterGrid(self.param_grid)
-
def fit(self, X, y, groups=None):
"""Run GenSVM grid search with all sets of parameters
@@ -491,14 +523,15 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
"""
- X, y_orig = check_X_y(X, y, accept_sparse=False, dtype=np.float64,
- order="C")
+ X, y_orig = check_X_y(
+ X, y, accept_sparse=False, dtype=np.float64, order="C"
+ )
y_type = type_of_target(y_orig)
if y_type not in ["binary", "multiclass"]:
raise ValueError("Label type not allowed for GenSVM: %r" % y_type)
- # This is necessary because GenSVM expects classes to go from 1 to
+ # This is necessary because GenSVM expects classes to go from 1 to
# n_class
self.encoder = LabelEncoder()
y = self.encoder.fit_transform(y_orig)
@@ -507,13 +540,23 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
candidate_params = list(self._get_param_iterator())
scorers, self.multimetric_, refit_metric = _skl_check_scorers(
- self.scoring, self.refit)
+ self.scoring, self.refit
+ )
X, y, groups = indexable(X, y, groups)
- results, n_splits = _fit_grid_gensvm(X, y, groups, candidate_params,
- scorers, self.cv, self.refit, self.verbose,
- self.return_train_score, self.iid)
+ results, n_splits = _fit_grid_gensvm(
+ X,
+ y,
+ groups,
+ candidate_params,
+ scorers,
+ self.cv,
+ self.refit,
+ self.verbose,
+ self.return_train_score,
+ self.iid,
+ )
self.cv_results_ = results
@@ -524,23 +567,23 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
self.best_params_ = candidate_params[self.best_index_]
self.best_score_ = results["mean_test_%s" % refit_metric][
- self.best_index_]
+ self.best_index_
+ ]
if self.refit:
self.best_estimator_ = GenSVM(**self.best_params_)
- # y_orig because GenSVM fit must know the conversion for predict to
+ # y_orig because GenSVM fit must know the conversion for predict to
# work correctly
self.best_estimator_.fit(X, y_orig)
## Store the only scorer not as a dict for single metric evaluation
- self.scorer_ = scorers if self.multimetric_ else scorers['score']
+ self.scorer_ = scorers if self.multimetric_ else scorers["score"]
self.cv_results_ = results
self.n_splits_ = n_splits
return self
-
def score(self, X, y):
"""Compute the score on the test data given the true labels
@@ -559,9 +602,15 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
score : float
"""
- _skl_check_is_fitted(self, 'score', self.refit)
- return _skl_grid_score(X, y, self.scorer_, self.best_estimator_,
- self.refit, self.multimetric_)
+ _skl_check_is_fitted(self, "score", self.refit)
+ return _skl_grid_score(
+ X,
+ y,
+ self.scorer_,
+ self.best_estimator_,
+ self.refit,
+ self.multimetric_,
+ )
def predict(self, X):
"""Predict the class labels on the test data
@@ -578,7 +627,7 @@ class GenSVMGridSearchCV(BaseEstimator, MetaEstimatorMixin):
Predicted class labels of the data in X.
"""
- _skl_check_is_fitted(self, 'predict', self.refit)
+ _skl_check_is_fitted(self, "predict", self.refit)
return self.best_estimator_.predict(X)
@@ -609,11 +658,11 @@ def load_default_grid():
used as input for the :class:`.GenSVMGridSearchCV` class.
"""
pg = {
- 'lmd': [pow(2, x) for x in range(-18, 19, 2)],
- 'kappa': [-0.9, 0.5, 5.0],
- 'p': [1.0, 1.5, 2.0],
- 'weights': ['unit', 'group'],
- 'epsilon': [1e-8],
- 'kernel': ['linear']
- }
+ "lmd": [pow(2, x) for x in range(-18, 19, 2)],
+ "kappa": [-0.9, 0.5, 5.0],
+ "p": [1.0, 1.5, 2.0],
+ "weights": ["unit", "group"],
+ "epsilon": [1e-8],
+ "kernel": ["linear"],
+ }
return pg
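
As with core.py, the gridsearch.py changes are purely cosmetic. A hypothetical end-to-end sketch of the GenSVMGridSearchCV interface touched above is shown below; it uses a small made-up parameter grid instead of load_default_grid (whose full grid is much larger), and the dataset is again only an example.

# Illustration of the grid-search interface reformatted above (not from the commit).
from sklearn.datasets import load_wine

from gensvm import GenSVMGridSearchCV

X, y = load_wine(return_X_y=True)

# small illustrative grid; names and valid ranges follow _validate_param_grid above
param_grid = {
    "p": [1.0, 2.0],
    "lmd": [1e-6, 1e-4, 1e-2],
    "kappa": [-0.9, 0.0, 1.0],
    "weights": ["unit", "group"],
}

gs = GenSVMGridSearchCV(param_grid, cv=5, refit=True)
gs.fit(X, y)
print(gs.best_params_, gs.best_score_)
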
diff --git a/gensvm/sklearn_util.py b/gensvm/sklearn_util.py
index 0829bf6..05d9618 100644
--- a/gensvm/sklearn_util.py
+++ b/gensvm/sklearn_util.py
@@ -68,16 +68,30 @@ from sklearn.model_selection._validation import _aggregate_score_dicts
from sklearn.utils.fixes import MaskedArray
from sklearn.utils.validation import check_is_fitted
-def _skl_format_cv_results(out, return_train_score, candidate_params,
- n_candidates, n_splits, scorers, iid):
+
+def _skl_format_cv_results(
+ out,
+ return_train_score,
+ candidate_params,
+ n_candidates,
+ n_splits,
+ scorers,
+ iid,
+):
# if one choose to see train score, "out" will contain train score info
if return_train_score:
- (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
- score_time) = zip(*out)
+ (
+ train_score_dicts,
+ test_score_dicts,
+ test_sample_counts,
+ fit_time,
+ score_time,
+ ) = zip(*out)
else:
- (test_score_dicts, test_sample_counts, fit_time,
- score_time) = zip(*out)
+ (test_score_dicts, test_sample_counts, fit_time, score_time) = zip(
+ *out
+ )
# test_score_dicts and train_score dicts are lists of dictionaries and
# we make them into dict of lists
@@ -91,35 +105,39 @@ def _skl_format_cv_results(out, return_train_score, candidate_params,
"""A small helper to store the scores/times to the cv_results_"""
# When iterated first by splits, then by parameters
# We want `array` to have `n_candidates` rows and `n_splits` cols.
- array = np.array(array, dtype=np.float64).reshape(n_candidates,
- n_splits)
+ array = np.array(array, dtype=np.float64).reshape(
+ n_candidates, n_splits
+ )
if splits:
for split_i in range(n_splits):
# Uses closure to alter the results
- results["split%d_%s"
- % (split_i, key_name)] = array[:, split_i]
+ results["split%d_%s" % (split_i, key_name)] = array[:, split_i]
array_means = np.average(array, axis=1, weights=weights)
- results['mean_%s' % key_name] = array_means
+ results["mean_%s" % key_name] = array_means
# Weighted std is not directly available in numpy
- array_stds = np.sqrt(np.average((array -
- array_means[:, np.newaxis]) ** 2,
- axis=1, weights=weights))
- results['std_%s' % key_name] = array_stds
+ array_stds = np.sqrt(
+ np.average(
+ (array - array_means[:, np.newaxis]) ** 2,
+ axis=1,
+ weights=weights,
+ )
+ )
+ results["std_%s" % key_name] = array_stds
if rank:
results["rank_%s" % key_name] = np.asarray(
- get_ranks(-array_means), dtype=np.int32)
+ get_ranks(-array_means), dtype=np.int32
+ )
- _store('fit_time', fit_time)
- _store('score_time', score_time)
+ _store("fit_time", fit_time)
+ _store("score_time", score_time)
# Use one MaskedArray and mask all the places where the param is not
# applicable for that candidate. Use defaultdict as each candidate may
# not contain all the params
- param_results = defaultdict(partial(MaskedArray,
- np.empty(n_candidates,),
- mask=True,
- dtype=object))
+ param_results = defaultdict(
+ partial(MaskedArray, np.empty(n_candidates), mask=True, dtype=object)
+ )
for cand_i, params in enumerate(candidate_params):
for name, value in params.items():
# An all masked empty array gets created for the key
@@ -129,19 +147,25 @@ def _skl_format_cv_results(out, return_train_score, candidate_params,
results.update(param_results)
# Store a list of param dicts at the key 'params'
- results['params'] = candidate_params
+ results["params"] = candidate_params
# NOTE test_sample counts (weights) remain the same for all candidates
- test_sample_counts = np.array(test_sample_counts[:n_splits],
- dtype=np.int)
+ test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int)
for scorer_name in scorers.keys():
# Computed the (weighted) mean and std for test scores alone
- _store('test_%s' % scorer_name, test_scores[scorer_name],
- splits=True, rank=True,
- weights=test_sample_counts if iid else None)
+ _store(
+ "test_%s" % scorer_name,
+ test_scores[scorer_name],
+ splits=True,
+ rank=True,
+ weights=test_sample_counts if iid else None,
+ )
if return_train_score:
- _store('train_%s' % scorer_name, train_scores[scorer_name],
- splits=True)
+ _store(
+ "train_%s" % scorer_name,
+ train_scores[scorer_name],
+ splits=True,
+ )
return results
@@ -149,44 +173,49 @@ def _skl_format_cv_results(out, return_train_score, candidate_params,
def _skl_check_scorers(scoring, refit):
scorers, multimetric_ = _check_multimetric_scoring(
- GenSVM(), scoring=scoring)
+ GenSVM(), scoring=scoring
+ )
if multimetric_:
if refit is not False and (
- not isinstance(refit, six.string_types) or
- # This will work for both dict / list (tuple)
- refit not in scorers):
- raise ValueError("For multi-metric scoring, the parameter "
- "refit must be set to a scorer key "
- "to refit an estimator with the best "
- "parameter setting on the whole data and "
- "make the best_* attributes "
- "available for that metric. If this is not "
- "needed, refit should be set to False "
- "explicitly. %r was passed." % refit)
+ not isinstance(refit, six.string_types)
+ or
+ # This will work for both dict / list (tuple)
+ refit not in scorers
+ ):
+ raise ValueError(
+ "For multi-metric scoring, the parameter "
+ "refit must be set to a scorer key "
+ "to refit an estimator with the best "
+ "parameter setting on the whole data and "
+ "make the best_* attributes "
+ "available for that metric. If this is not "
+ "needed, refit should be set to False "
+ "explicitly. %r was passed." % refit
+ )
else:
refit_metric = refit
else:
- refit_metric = 'score'
+ refit_metric = "score"
return scorers, multimetric_, refit_metric
def _skl_check_is_fitted(estimator, method_name, refit):
if not refit:
- raise NotFittedError('This %s instance was initialized '
- 'with refit=False. %s is '
- 'available only after refitting on the best '
- 'parameters. You can refit an estimator '
- 'manually using the ``best_parameters_`` '
- 'attribute'
- % (type(estimator).__name__, method_name))
+ raise NotFittedError(
+ "This %s instance was initialized "
+ "with refit=False. %s is "
+ "available only after refitting on the best "
+ "parameters. You can refit an estimator "
+ "manually using the ``best_parameters_`` "
+ "attribute" % (type(estimator).__name__, method_name)
+ )
else:
- check_is_fitted(estimator, 'best_estimator_')
-
+ check_is_fitted(estimator, "best_estimator_")
def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_):
- """Returns the score on the given data, if the estimator has been
+ """Returns the score on the given data, if the estimator has been
refit.
This uses the score defined by ``scoring`` where provided, and the
@@ -206,9 +235,10 @@ def _skl_grid_score(X, y, scorer_, best_estimator_, refit, multimetric_):
-------
score : float
"""
- if scorer_ is None:
- raise ValueError("No score function explicitly defined, "
- "and the estimator doesn't provide one %s"
- % best_estimator_)
- score = scorer_[refit] if multimetric_ else scorer_
- return score(best_estimator_, X, y)
+ if scorer_ is None:
+ raise ValueError(
+ "No score function explicitly defined, "
+ "and the estimator doesn't provide one %s" % best_estimator_
+ )
+ score = scorer_[refit] if multimetric_ else scorer_
+ return score(best_estimator_, X, y)
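
One detail in the _store helper above is worth spelling out: NumPy provides np.average with weights but no weighted standard deviation, so the code derives it from a second weighted average of squared deviations. A standalone sketch of that computation, with made-up scores and fold sizes, is given below.

# Standalone sketch of the weighted mean/std computation used in _store above.
import numpy as np

# scores laid out as in _store: n_candidates rows, n_splits columns
array = np.array([[0.90, 0.80, 0.95],
                  [0.70, 0.75, 0.72]])
weights = np.array([50, 50, 40])  # e.g. per-fold test sizes when iid=True

array_means = np.average(array, axis=1, weights=weights)
# weighted std built from a second weighted average, as in the diff
array_stds = np.sqrt(
    np.average((array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights)
)
print(array_means, array_stds)
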
diff --git a/gensvm/util.py b/gensvm/util.py
index 8d2a3e4..0b7cd1d 100644
--- a/gensvm/util.py
+++ b/gensvm/util.py
@@ -24,7 +24,7 @@ def get_ranks(x):
x = np.ravel(np.asarray(x))
l = len(x)
r = 1
- ranks = np.zeros((l, ))
+ ranks = np.zeros((l,))
while not all([k is None for k in x]):
m = min([k for k in x if not k is None])
idx = [1 if k == m else 0 for k in x]