aboutsummaryrefslogtreecommitdiff
path: root/gensvm
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2017-10-06 16:44:11 +0200
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2017-10-06 16:44:11 +0200
commitba0225b7ab0556b2019935d5f5786863c0a01e6a (patch)
tree42c89e4d5d863f8498c1efa9cd5d25da0e240480 /gensvm
downloadpygensvm-ba0225b7ab0556b2019935d5f5786863c0a01e6a.tar.gz
pygensvm-ba0225b7ab0556b2019935d5f5786863c0a01e6a.zip
initial commit
Diffstat (limited to 'gensvm')
-rw-r--r--gensvm/__init__.py3
-rw-r--r--gensvm/models.py190
-rw-r--r--gensvm/pyx_gensvm.pxd91
-rw-r--r--gensvm/pyx_gensvm.pyx123
4 files changed, 407 insertions, 0 deletions
diff --git a/gensvm/__init__.py b/gensvm/__init__.py
new file mode 100644
index 0000000..5e1a743
--- /dev/null
+++ b/gensvm/__init__.py
@@ -0,0 +1,3 @@
+# Version string of the gensvm Python package.
+__version__ = '0.1.0'
+
+# Re-export the estimator so that it can be imported as ``gensvm.GenSVM``.
+from .models import GenSVM
diff --git a/gensvm/models.py b/gensvm/models.py
new file mode 100644
index 0000000..f06374b
--- /dev/null
+++ b/gensvm/models.py
@@ -0,0 +1,190 @@
+# -*- coding: utf-8 -*-
+
+"""
+"""
+
+from __future__ import print_function, division
+
+import numpy as np
+import warnings
+
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
+from sklearn.preprocessing import LabelEncoder
+from sklearn.utils import check_X_y, check_random_state
+from sklearn.utils.multiclass import type_of_target
+from sklearn.utils.validation import check_is_fitted
+
+from . import pyx_gensvm
+
+
+def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
+ degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None):
+
+ # process the random state
+ rnd = check_random_state(random_state)
+
+ # set the verbosity in GenSVM
+ pyx_gensvm.set_verbosity_wrap(verbose)
+
+ # run the actual training
+ raw_coef_, n_SV_, n_iter_, training_error_, status_ = pyx_gensvm.train_wrap(
+ X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef,
+ degree, kernel_eigen_cutoff, max_iter,
+ rnd.randint(np.iinfo('i').max))
+
+ # process output
+ if status_ == 1 and verbose > 0:
+ warnings.warn("GenSVM optimization prematurely ended due to a "
+ "incorrect step in the optimization algorithm.",
+ FitFailedWarning)
+
+ if status_ == 2 and verbose > 0:
+ warnings.warn("GenSVM failed to converge, increase "
+ "the number of iterations.", ConvergenceWarning)
+
+ coef_ = raw_coef_[1:, :]
+ intercept_ = raw_coef_[0, :]
+
+ return coef_, intercept_, n_iter_, n_SV_
+
+
+class GenSVM(BaseEstimator):
+ """Generalized Multiclass Support Vector Machine Classification.
+
+ This class implements the basic GenSVM classifier. GenSVM is a generalized
+ multiclass SVM which is flexible in the weighting of misclassification
+ errors. It is this flexibility that makes it perform well on diverse
+ datasets.
+
+ This methods of this class use the GenSVM C library for the actual
+ computations.
+
+ Parameters
+ ----------
+ p : float, optional (default=1.0)
+ Parameter for the L_p norm of the loss function (1.0 <= p <= 2.0)
+
+ lmd : float, optional (default=1e-5)
+ Parameter for the regularization term of the loss function (lmd > 0)
+
+ kappa : float, optional (default=0.0)
+ Parameter for the hinge function in the loss function (kappa > -1.0)
+
+ weight_idx : int, optional (default=1)
+ Type of sample weights to use (1 = unit weights, 2 = size correction
+ weights)
+
+ kernel : string, optional (default='linear')
+ Specify the kernel type to use in the classifier. It must be one of
+ 'linear', 'poly', 'rbf', or 'sigmoid'.
+
+ gamma : float, optional (default=1.0)
+ Kernel parameter for the rbf, poly, and sigmoid kernel
+
+ coef : float, optional (default=0.0)
+ Kernel parameter for the poly and sigmoid kernel
+
+ degree : float, optional (default=2.0)
+ Kernel parameter for the poly kernel
+
+ kernel_eigen_cutoff : float, optional (default=1e-8)
+ Cutoff point for the reduced eigendecomposition used with
+ kernel-GenSVM. Eigenvectors for which the ratio between their
+ corresponding eigenvalue and the largest eigenvalue is smaller than the
+ cutoff will be dropped.
+
+ verbose : int, (default=0)
+ Enable verbose output
+
+ max_iter : int, (default=1e8)
+ The maximum number of iterations to be run.
+
+
+ Attributes
+ ----------
+ coef_ : array, shape = [n_features, n_classes-1]
+ Weights assigned to the features (coefficients in the primal problem)
+
+ intercept_ : array, shape = [n_classes]
+ Constants in the decision function
+
+ n_iter_ : int
+ The number of iterations that were run during training.
+
+ n_support_ : int
+ The number of support vectors that were found
+
+
+ References
+ ----------
+ * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized
+ Multiclass Support Vector Machine. Journal of Machine Learning Research,
+ 17(225):1--42, 2016.
+
+ """
+
+ def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1,
+ kernel='linear', gamma=1.0, coef=0.0, degree=2.0,
+ kernel_eigen_cutoff=1e-8, verbose=0, random_state=None,
+ max_iter=1e8):
+ self.p = p
+ self.lmd = lmd
+ self.kappa = kappa
+ self.epsilon = epsilon
+ self.weight_idx = weight_idx
+ self.kernel = kernel
+ self.gamma = gamma
+ self.coef = coef
+ self.degree = degree
+ self.kernel_eigen_cutoff = kernel_eigen_cutoff
+ self.verbose = verbose
+ self.random_state = random_state
+ self.max_iter = max_iter
+
+
+ def fit(self, X, y):
+ if not 1.0 <= self.p <= 2.0:
+ raise ValueError("Value for p should be within [1, 2]; got p = %r)"
+ % self.p)
+ if not self.kappa > -1.0:
+ raise ValueError("Value for kappa should be larger than -1; got "
+ "kappa = %r" % self.kappa)
+ if not self.lmd > 0:
+ raise ValueError("Value for lmd should be larger than 0; got "
+ "lmd = %r" % self.lmd)
+ if not self.epsilon > 0:
+ raise ValueError("Value for epsilon should be larger than 0; got "
+ "epsilon = %r" % self.epsilon)
+ X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64,
+ order="C")
+
+ y_type = type_of_target(y_org)
+ if y_type not in ["binary", "multiclass"]:
+ raise ValueError("Label type not allowed for GenSVM: %r" % y_type)
+
+ # This is necessary because GenSVM expects classes to go from 1 to
+ # n_class
+ self.encoder = LabelEncoder()
+ y = self.encoder.fit_transform(y_org)
+ y += 1
+
+ self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \
+ _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon,
+ self.weight_idx, self.kernel, self.gamma, self.coef,
+ self.degree, self.kernel_eigen_cutoff, self.verbose,
+ self.max_iter, self.random_state)
+ return self
+
+
+ def predict(self, X):
+ check_is_fitted(self, "coef_")
+
+ V = np.vstack((self.intercept_, self.coef_))
+ predictions = pyx_gensvm.predict_wrap(X, V)
+
+ # Transform the classes back to the original form
+ predictions -= 1
+ outcome = self.encoder.inverse_transform(predictions)
+
+ return outcome
diff --git a/gensvm/pyx_gensvm.pxd b/gensvm/pyx_gensvm.pxd
new file mode 100644
index 0000000..be4d5f5
--- /dev/null
+++ b/gensvm/pyx_gensvm.pxd
@@ -0,0 +1,91 @@
+cimport numpy as np
+
+# Includes
+
+# Declarations taken from gensvm_globals.h. KernelType is declared without
+# any enumerators, so this file only uses it as the type of the kerneltype
+# struct fields declared further down.
+cdef extern from "gensvm_globals.h":
+ # Stuff for kerneltype
+ ctypedef enum KernelType:
+ pass
+
+# Declarations taken from gensvm_sparse.h: the GenSparse struct and its
+# init/free functions.
+cdef extern from "gensvm_sparse.h":
+ # stuff for GenSparse
+
+ # NOTE(review): the nnz/values/ia/ja fields look like a compressed sparse
+ # row layout -- confirm against gensvm_sparse.h.
+ cdef struct GenSparse:
+ long nnz
+ long n_row
+ long n_col
+ double *values
+ long *ia
+ long *ja
+
+ GenSparse *gensvm_init_sparse()
+ void gensvm_free_sparse(GenSparse *)
+
+
+# Declarations taken from gensvm_base.h: the GenData and GenModel structs and
+# their init/free functions.
+cdef extern from "gensvm_base.h":
+
+ # Data container handed to gensvm_train.
+ # NOTE(review): K, n and m are presumably the number of classes,
+ # observations and features -- confirm against gensvm_base.h.
+ cdef struct GenData:
+ long K
+ long n
+ long m
+ long r
+ long *y
+ double *Z
+ GenSparse *spZ
+ double *RAW
+ double *Sigma
+ KernelType kerneltype
+ double *kernelparam
+
+ # Model container; holds the parameters (weight_idx, epsilon, p, kappa,
+ # lmd, kernel settings) next to pointers to the library's working arrays.
+ cdef struct GenModel:
+ int weight_idx
+ long K
+ long n
+ long m
+ double epsilon
+ double p
+ double kappa
+ double lmd
+ double *V
+ double *Vbar
+ double *U
+ double *UU
+ double *Q
+ double *H
+ double *rho
+ double training_error
+ KernelType kerneltype
+ double *kernelparam
+ double kernel_eigen_cutoff
+
+ GenModel *gensvm_init_model()
+ void gensvm_free_model(GenModel *)
+
+ GenData *gensvm_init_data()
+ void gensvm_free_data(GenData *)
+
+# Training entry point from gensvm_train.h, declared nogil so that it can be
+# called with the GIL released.
+# NOTE(review): the third argument is presumably an optional seed model; the
+# wrapper in pyx_gensvm.pyx always passes NULL -- confirm in gensvm_train.h.
+cdef extern from "gensvm_train.h":
+
+ void gensvm_train(GenModel *, GenData *, GenModel *) nogil
+
+# Support vector count from gensvm_sv.h; the wrapper reports its return value
+# as the number of support vectors of a trained model.
+cdef extern from "gensvm_sv.h":
+
+ long gensvm_num_sv(GenModel *)
+
+# Helper functions from gensvm_helper.c that are called by the wrappers in
+# pyx_gensvm.pyx.
+cdef extern from "gensvm_helper.c":
+
+ # char pointer type that is emitted as "char const *" in the generated C
+ ctypedef char* char_const_ptr "char const *"
+ # called as set_model(model, p, lmd, kappa, epsilon, weight_idx,
+ # kernel_index, degree, gamma, coef, kernel_eigen_cutoff, max_iter,
+ # random_seed)
+ void set_model(GenModel *, double, double, double, double, int, int,
+ double, double, double, double, long, long)
+ # called as set_data(data, X.data, y.data, X.shape, n_class)
+ void set_data(GenData *, char *, char *, np.npy_intp *, long)
+ # the wrapper treats a non-empty return value as an error message
+ char_const_ptr check_model(GenModel *)
+ # called as copy_V(V.data, model) with V of shape (n_var+1, n_class-1)
+ void copy_V(void *, GenModel *)
+ long get_iter_count(GenModel *)
+ double get_training_error(GenModel *)
+ int get_status(GenModel *)
+ long get_n(GenModel *)
+ long get_m(GenModel *)
+ long get_K(GenModel *)
+ void free_data(GenData *)
+ void set_verbosity(int)
+ # called as gensvm_predict(X.data, V.data, n_test_obs, n_var, n_class,
+ # predictions.data)
+ void gensvm_predict(char *, char *, long, long, long, char *) nogil
diff --git a/gensvm/pyx_gensvm.pyx b/gensvm/pyx_gensvm.pyx
new file mode 100644
index 0000000..394d4ca
--- /dev/null
+++ b/gensvm/pyx_gensvm.pyx
@@ -0,0 +1,123 @@
+"""
+Wrapper for GenSVM
+
+Not implemented yet:
+ - vector of instance weights
+ - class weights
+ - seed model
+ - max_iter = -1 for unlimited
+
+"""
+
+from __future__ import print_function
+
+import numpy as np
+cimport numpy as np
+
+cimport pyx_gensvm
+
+np.import_array()
+
+GENSVM_KERNEL_TYPES = ["linear", "poly", "rbf", "sigmoid"]
+
+def train_wrap(
+        np.ndarray[np.float64_t, ndim=2, mode='c'] X,
+        np.ndarray[np.int_t, ndim=1, mode='c'] y,
+        double p=1.0,
+        double lmd=pow(2, -8),
+        double kappa=0.0,
+        double epsilon=1e-6,
+        int weight_idx=1,
+        str kernel='linear',
+        double gamma=1.0,
+        double coef=0.0,
+        double degree=2.0,
+        double kernel_eigen_cutoff=1e-8,
+        int max_iter=100000000,
+        int random_seed=-1):
+    """
+    Train a GenSVM model with the C library and return the results.
+
+    X must be a C-contiguous 2-d float64 array and y a C-contiguous 1-d
+    integer array. The remaining arguments are copied into the GenSVM model
+    with set_model and checked with check_model.
+
+    Returns the tuple (V, n_SV, iter_count, training_error, fit_status),
+    where V is a float64 array of shape (n_features + 1, n_classes - 1)
+    filled by copy_V.
+
+    Raises ValueError when the kernel name is unknown or when check_model
+    reports an error.
+    """
+
+    cdef GenModel *model
+    cdef GenData *data
+    cdef long n_var
+    cdef long n_class
+    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V
+
+    # Resolve the kernel before anything is allocated in C, so that an
+    # unknown kernel name can not leak the model and data structs.
+    if kernel not in GENSVM_KERNEL_TYPES:
+        raise ValueError("Unknown kernel %r; expected one of %r"
+                % (kernel, GENSVM_KERNEL_TYPES))
+    kernel_index = GENSVM_KERNEL_TYPES.index(kernel)
+
+    # get the number of variables and classes
+    classes = np.unique(y)
+    n_var = X.shape[1]
+    n_class = classes.shape[0]
+
+    # Initialize model and data
+    model = gensvm_init_model()
+    data = gensvm_init_data()
+
+    try:
+        # Set the data
+        set_data(data, X.data, y.data, X.shape, n_class)
+
+        # Set the model
+        set_model(model, p, lmd, kappa, epsilon, weight_idx, kernel_index,
+                degree, gamma, coef, kernel_eigen_cutoff, max_iter,
+                random_seed)
+
+        # Check the parameters
+        error_msg = check_model(model)
+        if error_msg:
+            error_repl = error_msg.decode('utf-8')
+            raise ValueError(error_repl)
+
+        # Do the actual training
+        with nogil:
+            gensvm_train(model, data, NULL)
+
+        # copy the results
+        V = np.empty((n_var+1, n_class-1))
+        copy_V(V.data, model)
+
+        # get other results from model
+        iter_count = get_iter_count(model)
+        training_error = get_training_error(model)
+        fit_status = get_status(model)
+        n_SV = gensvm_num_sv(model)
+    finally:
+        # free model and data, also when an error was raised above
+        gensvm_free_model(model)
+        free_data(data)
+
+    return (V, n_SV, iter_count, training_error, fit_status)
+
+def predict_wrap(
+        np.ndarray[np.float64_t, ndim=2, mode='c'] X,
+        np.ndarray[np.float64_t, ndim=2, mode='c'] V
+        ):
+    """
+    Predict class labels for the rows of X with the coefficient matrix V.
+
+    X must be a C-contiguous 2-d float64 array of shape
+    (n_test_obs, n_var) and V a C-contiguous 2-d float64 array of shape
+    (n_var + 1, n_class - 1).
+
+    Returns a 1-d integer array of length n_test_obs that is filled by
+    gensvm_predict.
+
+    Raises ValueError when the shapes of X and V are inconsistent.
+    """
+
+    cdef long n_test_obs
+    cdef long n_var
+    cdef long n_class
+    cdef np.ndarray[np.int_t, ndim=1, mode='c'] predictions
+
+    n_test_obs = X.shape[0]
+    n_var = X.shape[1]
+    n_class = V.shape[1] + 1
+
+    # Only raw pointers and the sizes taken from X are passed to the C
+    # library, so make sure V has the matching number of rows first.
+    if V.shape[0] != n_var + 1:
+        raise ValueError("Shape mismatch: V has %d rows, expected %d "
+                "(number of columns of X plus one)"
+                % (V.shape[0], n_var + 1))
+
+    # output vector; np.int_ matches the np.int_t buffer type (the np.int
+    # alias is no longer available in recent NumPy releases)
+    predictions = np.empty((n_test_obs, ), dtype=np.int_)
+
+    # do the prediction
+    with nogil:
+        gensvm_predict(X.data, V.data, n_test_obs, n_var, n_class,
+                predictions.data)
+
+    return predictions
+
+def set_verbosity_wrap(int verbosity):
+    """
+    Pass the requested verbosity level on to the GenSVM library.
+    """
+    set_verbosity(verbosity)