diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2017-10-06 16:44:11 +0200 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2017-10-06 16:44:11 +0200 |
| commit | ba0225b7ab0556b2019935d5f5786863c0a01e6a (patch) | |
| tree | 42c89e4d5d863f8498c1efa9cd5d25da0e240480 /gensvm | |
| download | pygensvm-ba0225b7ab0556b2019935d5f5786863c0a01e6a.tar.gz pygensvm-ba0225b7ab0556b2019935d5f5786863c0a01e6a.zip | |
initial commit
Diffstat (limited to 'gensvm')
| -rw-r--r-- | gensvm/__init__.py | 3 | ||||
| -rw-r--r-- | gensvm/models.py | 190 | ||||
| -rw-r--r-- | gensvm/pyx_gensvm.pxd | 91 | ||||
| -rw-r--r-- | gensvm/pyx_gensvm.pyx | 123 |
4 files changed, 407 insertions, 0 deletions
diff --git a/gensvm/__init__.py b/gensvm/__init__.py new file mode 100644 index 0000000..5e1a743 --- /dev/null +++ b/gensvm/__init__.py @@ -0,0 +1,3 @@ +__version__ = '0.1.0' + +from .models import GenSVM diff --git a/gensvm/models.py b/gensvm/models.py new file mode 100644 index 0000000..f06374b --- /dev/null +++ b/gensvm/models.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- + +""" +""" + +from __future__ import print_function, division + +import numpy as np +import warnings + +from sklearn.base import BaseEstimator +from sklearn.exceptions import ConvergenceWarning, FitFailedWarning +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_X_y, check_random_state +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.validation import check_is_fitted + +from . import pyx_gensvm + + +def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, + degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None): + + # process the random state + rnd = check_random_state(random_state) + + # set the verbosity in GenSVM + pyx_gensvm.set_verbosity_wrap(verbose) + + # run the actual training + raw_coef_, n_SV_, n_iter_, training_error_, status_ = pyx_gensvm.train_wrap( + X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, + degree, kernel_eigen_cutoff, max_iter, + rnd.randint(np.iinfo('i').max)) + + # process output + if status_ == 1 and verbose > 0: + warnings.warn("GenSVM optimization prematurely ended due to a " + "incorrect step in the optimization algorithm.", + FitFailedWarning) + + if status_ == 2 and verbose > 0: + warnings.warn("GenSVM failed to converge, increase " + "the number of iterations.", ConvergenceWarning) + + coef_ = raw_coef_[1:, :] + intercept_ = raw_coef_[0, :] + + return coef_, intercept_, n_iter_, n_SV_ + + +class GenSVM(BaseEstimator): + """Generalized Multiclass Support Vector Machine Classification. + + This class implements the basic GenSVM classifier. GenSVM is a generalized + multiclass SVM which is flexible in the weighting of misclassification + errors. It is this flexibility that makes it perform well on diverse + datasets. + + This methods of this class use the GenSVM C library for the actual + computations. + + Parameters + ---------- + p : float, optional (default=1.0) + Parameter for the L_p norm of the loss function (1.0 <= p <= 2.0) + + lmd : float, optional (default=1e-5) + Parameter for the regularization term of the loss function (lmd > 0) + + kappa : float, optional (default=0.0) + Parameter for the hinge function in the loss function (kappa > -1.0) + + weight_idx : int, optional (default=1) + Type of sample weights to use (1 = unit weights, 2 = size correction + weights) + + kernel : string, optional (default='linear') + Specify the kernel type to use in the classifier. It must be one of + 'linear', 'poly', 'rbf', or 'sigmoid'. + + gamma : float, optional (default=1.0) + Kernel parameter for the rbf, poly, and sigmoid kernel + + coef : float, optional (default=0.0) + Kernel parameter for the poly and sigmoid kernel + + degree : float, optional (default=2.0) + Kernel parameter for the poly kernel + + kernel_eigen_cutoff : float, optional (default=1e-8) + Cutoff point for the reduced eigendecomposition used with + kernel-GenSVM. Eigenvectors for which the ratio between their + corresponding eigenvalue and the largest eigenvalue is smaller than the + cutoff will be dropped. + + verbose : int, (default=0) + Enable verbose output + + max_iter : int, (default=1e8) + The maximum number of iterations to be run. + + + Attributes + ---------- + coef_ : array, shape = [n_features, n_classes-1] + Weights assigned to the features (coefficients in the primal problem) + + intercept_ : array, shape = [n_classes] + Constants in the decision function + + n_iter_ : int + The number of iterations that were run during training. + + n_support_ : int + The number of support vectors that were found + + + References + ---------- + * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized + Multiclass Support Vector Machine. Journal of Machine Learning Research, + 17(225):1--42, 2016. + + """ + + def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1, + kernel='linear', gamma=1.0, coef=0.0, degree=2.0, + kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, + max_iter=1e8): + self.p = p + self.lmd = lmd + self.kappa = kappa + self.epsilon = epsilon + self.weight_idx = weight_idx + self.kernel = kernel + self.gamma = gamma + self.coef = coef + self.degree = degree + self.kernel_eigen_cutoff = kernel_eigen_cutoff + self.verbose = verbose + self.random_state = random_state + self.max_iter = max_iter + + + def fit(self, X, y): + if not 1.0 <= self.p <= 2.0: + raise ValueError("Value for p should be within [1, 2]; got p = %r)" + % self.p) + if not self.kappa > -1.0: + raise ValueError("Value for kappa should be larger than -1; got " + "kappa = %r" % self.kappa) + if not self.lmd > 0: + raise ValueError("Value for lmd should be larger than 0; got " + "lmd = %r" % self.lmd) + if not self.epsilon > 0: + raise ValueError("Value for epsilon should be larger than 0; got " + "epsilon = %r" % self.epsilon) + X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, + order="C") + + y_type = type_of_target(y_org) + if y_type not in ["binary", "multiclass"]: + raise ValueError("Label type not allowed for GenSVM: %r" % y_type) + + # This is necessary because GenSVM expects classes to go from 1 to + # n_class + self.encoder = LabelEncoder() + y = self.encoder.fit_transform(y_org) + y += 1 + + self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \ + _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon, + self.weight_idx, self.kernel, self.gamma, self.coef, + self.degree, self.kernel_eigen_cutoff, self.verbose, + self.max_iter, self.random_state) + return self + + + def predict(self, X): + check_is_fitted(self, "coef_") + + V = np.vstack((self.intercept_, self.coef_)) + predictions = pyx_gensvm.predict_wrap(X, V) + + # Transform the classes back to the original form + predictions -= 1 + outcome = self.encoder.inverse_transform(predictions) + + return outcome diff --git a/gensvm/pyx_gensvm.pxd b/gensvm/pyx_gensvm.pxd new file mode 100644 index 0000000..be4d5f5 --- /dev/null +++ b/gensvm/pyx_gensvm.pxd @@ -0,0 +1,91 @@ +cimport numpy as np + +# Includes + +cdef extern from "gensvm_globals.h": + # Stuff for kerneltype + ctypedef enum KernelType: + pass + +cdef extern from "gensvm_sparse.h": + # stuff for GenSparse + + cdef struct GenSparse: + long nnz + long n_row + long n_col + double *values + long *ia + long *ja + + GenSparse *gensvm_init_sparse() + void gensvm_free_sparse(GenSparse *) + + +cdef extern from "gensvm_base.h": + + cdef struct GenData: + long K + long n + long m + long r + long *y + double *Z + GenSparse *spZ + double *RAW + double *Sigma + KernelType kerneltype + double *kernelparam + + cdef struct GenModel: + int weight_idx + long K + long n + long m + double epsilon + double p + double kappa + double lmd + double *V + double *Vbar + double *U + double *UU + double *Q + double *H + double *rho + double training_error + KernelType kerneltype + double *kernelparam + double kernel_eigen_cutoff + + GenModel *gensvm_init_model() + void gensvm_free_model(GenModel *) + + GenData *gensvm_init_data() + void gensvm_free_data(GenData *) + +cdef extern from "gensvm_train.h": + + void gensvm_train(GenModel *, GenData *, GenModel *) nogil + +cdef extern from "gensvm_sv.h": + + long gensvm_num_sv(GenModel *) + +cdef extern from "gensvm_helper.c": + + ctypedef char* char_const_ptr "char const *" + void set_model(GenModel *, double, double, double, double, int, int, + double, double, double, double, long, long) + void set_data(GenData *, char *, char *, np.npy_intp *, long) + char_const_ptr check_model(GenModel *) + void copy_V(void *, GenModel *) + long get_iter_count(GenModel *) + double get_training_error(GenModel *) + int get_status(GenModel *) + long get_n(GenModel *) + long get_m(GenModel *) + long get_K(GenModel *) + void free_data(GenData *) + void set_verbosity(int) + void gensvm_predict(char *, char *, long, long, long, char *) nogil diff --git a/gensvm/pyx_gensvm.pyx b/gensvm/pyx_gensvm.pyx new file mode 100644 index 0000000..394d4ca --- /dev/null +++ b/gensvm/pyx_gensvm.pyx @@ -0,0 +1,123 @@ +""" +Wrapper for GenSVM + +Not implemented yet: + - vector of instance weights + - class weights + - seed model + - max_iter = -1 for unlimited + +""" + +from __future__ import print_function + +import numpy as np +cimport numpy as np + +cimport pyx_gensvm + +np.import_array() + +GENSVM_KERNEL_TYPES = ["linear", "poly", "rbf", "sigmoid"] + +def train_wrap( + np.ndarray[np.float64_t, ndim=2, mode='c'] X, + np.ndarray[np.int_t, ndim=1, mode='c'] y, + double p=1.0, + double lmd=pow(2, -8), + double kappa=0.0, + double epsilon=1e-6, + int weight_idx=1, + str kernel='linear', + double gamma=1.0, + double coef=0.0, + double degree=2.0, + double kernel_eigen_cutoff=1e-8, + int max_iter=100000000, + int random_seed=-1): + """ + """ + + # Initialize model and data + cdef GenModel *model = gensvm_init_model() + cdef GenData *data = gensvm_init_data() + cdef long n_obs + cdef long n_var + cdef long n_class + + # get the kernel index + kernel_index = GENSVM_KERNEL_TYPES.index(kernel) + + # get the number of classes + classes = np.unique(y) + n_obs = X.shape[0] + n_var = X.shape[1] + n_class = classes.shape[0] + + # Set the data + set_data(data, X.data, y.data, X.shape, n_class) + + # Set the model + set_model(model, p, lmd, kappa, epsilon, weight_idx, kernel_index, degree, + gamma, coef, kernel_eigen_cutoff, max_iter, random_seed) + + # Check the parameters + error_msg = check_model(model) + if error_msg: + gensvm_free_model(model) + free_data(data) + error_repl = error_msg.decode('utf-8') + raise ValueError(error_repl) + + # Do the actual training + with nogil: + gensvm_train(model, data, NULL) + + # copy the results + cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V + V = np.empty((n_var+1, n_class-1)) + copy_V(V.data, model) + + # get other results from model + iter_count = get_iter_count(model) + training_error = get_training_error(model) + fit_status = get_status(model) + n_SV = gensvm_num_sv(model) + + # free model and data + gensvm_free_model(model); + free_data(data); + + return (V, n_SV, iter_count, training_error, fit_status) + +def predict_wrap( + np.ndarray[np.float64_t, ndim=2, mode='c'] X, + np.ndarray[np.float64_t, ndim=2, mode='c'] V + ): + """ + """ + + cdef long n_test_obs + cdef long n_var + cdef long n_class + + n_test_obs = X.shape[0] + n_var = X.shape[1] + n_class = V.shape[1] + 1 + + # output vector + cdef np.ndarray[np.int_t, ndim=1, mode='c'] predictions + predictions = np.empty((n_test_obs, ), dtype=np.int) + + # do the prediction + with nogil: + gensvm_predict(X.data, V.data, n_test_obs, n_var, n_class, + predictions.data) + + return predictions + +def set_verbosity_wrap(int verbosity): + """ + Control verbosity of gensvm library + """ + set_verbosity(verbosity) |
