aboutsummaryrefslogtreecommitdiff
path: root/gensvm
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2019-03-06 12:24:33 -0500
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2019-03-06 12:24:33 -0500
commit61423e7d4b98eeb8bb73ebb5786bd1477d99ee23 (patch)
tree0ab7f1fa1c05b7451d7b7cd3cd867b93f0b988f2 /gensvm
parentExtract durations array (diff)
downloadpygensvm-61423e7d4b98eeb8bb73ebb5786bd1477d99ee23.tar.gz
pygensvm-61423e7d4b98eeb8bb73ebb5786bd1477d99ee23.zip
Add support for interrupted grid search
Diffstat (limited to 'gensvm')
-rw-r--r--gensvm/gridsearch.py55
-rw-r--r--gensvm/util.py31
2 files changed, 50 insertions, 36 deletions
diff --git a/gensvm/gridsearch.py b/gensvm/gridsearch.py
index fbb9168..d62ab1d 100644
--- a/gensvm/gridsearch.py
+++ b/gensvm/gridsearch.py
@@ -99,6 +99,23 @@ class _MockEstimator(ClassifierMixin):
return self.predictions
+def _wrap_score(y_pred, y_true, scorers, is_multimetric):
+ start_time = time.time()
+ results = {}
+ # we use -1 to signify missing predictions because numpy has no integer NaN
+ if np.any(y_pred < 0):
+ if is_multimetric:
+ for name in scorers:
+ results[name] = np.nan
+ else:
+ results["score"] = np.nan
+ else:
+ estimator = _MockEstimator(y_pred)
+ results = _score(estimator, None, y_true, scorers, is_multimetric)
+ score_time = time.time() - start_time
+ return results, score_time
+
+
def _format_results(
results,
cv_idx,
@@ -136,46 +153,38 @@ def _format_results(
# Out must be a list of dicts of size n_params x n_splits that iterates
# over the params in the list and for each param iterates over the splits.
- for param, duration, predictions in zip(
- results["params"], results["duration"], results["predictions"]
+ for param, durations, predictions in zip(
+ results["params"], results["durations"], results["predictions"]
):
- for test_idx in np.unique(cv_idx):
+ fit_times = durations
+ is_missing = np.any(np.isnan(durations))
+ for test_idx in sorted(np.unique(cv_idx)):
ret = []
score_time = 0
if return_train_score:
train_pred = predictions[cv_idx != test_idx,]
y_train = true_y[cv_idx != test_idx,]
- train_mock = _MockEstimator(train_pred)
- start_time = time.time()
- train_scores = _score(
- train_mock, None, y_train, scorers, is_multimetric
+ train_score, score_t = _wrap_score(
+ train_pred, y_train, scorers, is_multimetric
)
- score_time += time.time() - start_time
- ret.append(train_scores)
+ score_time += score_t
+ ret.append(train_score)
test_pred = predictions[cv_idx == test_idx,]
y_test = true_y[cv_idx == test_idx,]
- test_mock = _MockEstimator(test_pred)
- start_time = time.time()
- test_scores = _score(
- test_mock, None, y_test, scorers, is_multimetric
+ test_score, score_t = _wrap_score(
+ test_pred, y_test, scorers, is_multimetric
)
- score_time += time.time() - start_time
- ret.append(test_scores)
+ score_time += score_t
+ ret.append(test_score)
if return_n_test_samples:
ret.append(len(y_test))
if return_times:
- # Note, the C library returns the duration for a task (i.e. all
- # splits). The _skkl_format_cv_results() computes the mean of
- # the values, which should represent the average time per
- # split. To compute this correctly, we here divide by the
- # number of splits. Since we calculate the mean later, the mean
- # is still correct, but this is not the exact fit_time for this
- # fold.
- fit_time = duration / n_splits
+ fit_time = fit_times[test_idx]
+ score_time = np.nan if is_missing else score_time
ret.extend([fit_time, score_time])
if return_parameters:
ret.append(param)
diff --git a/gensvm/util.py b/gensvm/util.py
index 0b7cd1d..046f3be 100644
--- a/gensvm/util.py
+++ b/gensvm/util.py
@@ -9,10 +9,11 @@ Utility functions for GenSVM
import numpy as np
-def get_ranks(x):
+def get_ranks(a):
"""
Rank data in an array. Low values get a small rank number. Ties are broken
- by assigning the lowest value.
+ by assigning the lowest value (this corresponds to ``rankdata(a,
+ method='min')`` in SciPy).
Examples
--------
@@ -21,14 +22,18 @@ def get_ranks(x):
[4, 1, 3, 1, 5, 6, 7]
"""
- x = np.ravel(np.asarray(x))
- l = len(x)
- r = 1
- ranks = np.zeros((l,))
- while not all([k is None for k in x]):
- m = min([k for k in x if not k is None])
- idx = [1 if k == m else 0 for k in x]
- ranks = [r if idx[k] else ranks[k] for k in range(l)]
- r += sum(idx)
- x = [None if idx[k] else x[k] for k in range(l)]
- return ranks
+ orig = np.ravel(np.asarray(a))
+ arr = orig[~np.isnan(orig)]
+ sorter = np.argsort(arr, kind="quicksort")
+ inv = np.empty(sorter.size, dtype=np.intp)
+ inv[sorter] = np.arange(sorter.size, dtype=np.intp)
+
+ arr = arr[sorter]
+ obs = np.r_[True, arr[1:] != arr[:-1]]
+ dense = obs.cumsum()[inv]
+
+ count = np.r_[np.nonzero(obs)[0], len(obs)]
+ ranks = np.zeros_like(orig)
+ ranks[~np.isnan(orig)] = count[dense - 1] + 1
+ ranks[np.isnan(orig)] = np.max(ranks) + 1
+ return list(ranks)