diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-03-06 12:24:33 -0500 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2019-03-06 12:24:33 -0500 |
| commit | 61423e7d4b98eeb8bb73ebb5786bd1477d99ee23 (patch) | |
| tree | 0ab7f1fa1c05b7451d7b7cd3cd867b93f0b988f2 /gensvm | |
| parent | Extract durations array (diff) | |
| download | pygensvm-61423e7d4b98eeb8bb73ebb5786bd1477d99ee23.tar.gz pygensvm-61423e7d4b98eeb8bb73ebb5786bd1477d99ee23.zip | |
Add support for interrupted grid search
Diffstat (limited to 'gensvm')
| -rw-r--r-- | gensvm/gridsearch.py | 55 | ||||
| -rw-r--r-- | gensvm/util.py | 31 |
2 files changed, 50 insertions, 36 deletions
diff --git a/gensvm/gridsearch.py b/gensvm/gridsearch.py index fbb9168..d62ab1d 100644 --- a/gensvm/gridsearch.py +++ b/gensvm/gridsearch.py @@ -99,6 +99,23 @@ class _MockEstimator(ClassifierMixin): return self.predictions +def _wrap_score(y_pred, y_true, scorers, is_multimetric): + start_time = time.time() + results = {} + # we use -1 to signify missing predictions because numpy has no integer NaN + if np.any(y_pred < 0): + if is_multimetric: + for name in scorers: + results[name] = np.nan + else: + results["score"] = np.nan + else: + estimator = _MockEstimator(y_pred) + results = _score(estimator, None, y_true, scorers, is_multimetric) + score_time = time.time() - start_time + return results, score_time + + def _format_results( results, cv_idx, @@ -136,46 +153,38 @@ def _format_results( # Out must be a list of dicts of size n_params x n_splits that iterates # over the params in the list and for each param iterates over the splits. - for param, duration, predictions in zip( - results["params"], results["duration"], results["predictions"] + for param, durations, predictions in zip( + results["params"], results["durations"], results["predictions"] ): - for test_idx in np.unique(cv_idx): + fit_times = durations + is_missing = np.any(np.isnan(durations)) + for test_idx in sorted(np.unique(cv_idx)): ret = [] score_time = 0 if return_train_score: train_pred = predictions[cv_idx != test_idx,] y_train = true_y[cv_idx != test_idx,] - train_mock = _MockEstimator(train_pred) - start_time = time.time() - train_scores = _score( - train_mock, None, y_train, scorers, is_multimetric + train_score, score_t = _wrap_score( + train_pred, y_train, scorers, is_multimetric ) - score_time += time.time() - start_time - ret.append(train_scores) + score_time += score_t + ret.append(train_score) test_pred = predictions[cv_idx == test_idx,] y_test = true_y[cv_idx == test_idx,] - test_mock = _MockEstimator(test_pred) - start_time = time.time() - test_scores = _score( - test_mock, None, y_test, 
scorers, is_multimetric + test_score, score_t = _wrap_score( + test_pred, y_test, scorers, is_multimetric ) - score_time += time.time() - start_time - ret.append(test_scores) + score_time += score_t + ret.append(test_score) if return_n_test_samples: ret.append(len(y_test)) if return_times: - # Note, the C library returns the duration for a task (i.e. all - # splits). The _skkl_format_cv_results() computes the mean of - # the values, which should represent the average time per - # split. To compute this correctly, we here divide by the - # number of splits. Since we calculate the mean later, the mean - # is still correct, but this is not the exact fit_time for this - # fold. - fit_time = duration / n_splits + fit_time = fit_times[test_idx] + score_time = np.nan if is_missing else score_time ret.extend([fit_time, score_time]) if return_parameters: ret.append(param) diff --git a/gensvm/util.py b/gensvm/util.py index 0b7cd1d..046f3be 100644 --- a/gensvm/util.py +++ b/gensvm/util.py @@ -9,10 +9,11 @@ Utility functions for GenSVM import numpy as np -def get_ranks(x): +def get_ranks(a): """ Rank data in an array. Low values get a small rank number. Ties are broken - by assigning the lowest value. + by assigning the lowest value (this corresponds to ``rankdata(a, + method='min')`` in SciPy). 
Examples -------- @@ -21,14 +22,18 @@ def get_ranks(x): [4, 1, 3, 1, 5, 6, 7] """ - x = np.ravel(np.asarray(x)) - l = len(x) - r = 1 - ranks = np.zeros((l,)) - while not all([k is None for k in x]): - m = min([k for k in x if not k is None]) - idx = [1 if k == m else 0 for k in x] - ranks = [r if idx[k] else ranks[k] for k in range(l)] - r += sum(idx) - x = [None if idx[k] else x[k] for k in range(l)] - return ranks + orig = np.ravel(np.asarray(a)) + arr = orig[~np.isnan(orig)] + sorter = np.argsort(arr, kind="quicksort") + inv = np.empty(sorter.size, dtype=np.intp) + inv[sorter] = np.arange(sorter.size, dtype=np.intp) + + arr = arr[sorter] + obs = np.r_[True, arr[1:] != arr[:-1]] + dense = obs.cumsum()[inv] + + count = np.r_[np.nonzero(obs)[0], len(obs)] + ranks = np.zeros_like(orig) + ranks[~np.isnan(orig)] = count[dense - 1] + 1 + ranks[np.isnan(orig)] = np.max(ranks) + 1 + return list(ranks) |
