7 files changed, 533 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cb8022d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,57 @@
+#
+# Makefile for easier installation and cleanup.
+#
+# Uses self-documenting macros from here:
+# http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
+
+# Name of the Python package; used by the coverage and clean targets.
+PACKAGE=gensvm
+DOC_DIR='./docs/'
+
+# Every target in this file is a command, not a file, so all of them are
+# declared phony. This matters most for `dist`: `setup.py sdist` creates a
+# ./dist directory, after which a non-phony `dist` target would be considered
+# up to date and silently skipped.
+.PHONY: help in inplace install install2 test test2 cover clean develop develop2 dist dist2
+
+.DEFAULT_GOAL := help
+
+# Print an overview of the documented targets. grep selects every rule line in
+# the parsed makefile(s) that carries a trailing `## description`; awk splits
+# each match on that marker and prints the target name in cyan, padded to 15
+# characters, followed by the description. `$$` passes a literal `$` to the
+# shell. Targets without a `##` comment (such as this one) are not listed.
+help:
+	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\
+		 awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\
+		 %s\n", $$1, $$2}'
+
+# Compile the extension module next to the Python sources.
+inplace:
+	python setup.py build_ext --inplace
+
+# Short alias for `inplace`.
+in: inplace
+
+# Build the extension in place, then install the package for the current user
+# only (--user), using whatever interpreter `python` resolves to.
+install: ## Install for the current user using the default python command
+	python setup.py build_ext --inplace
+	python setup.py install --user
+
+# Same two steps as `install`, run explicitly with the `python2` command.
+install2: ## Install for the current user using the python2 command
+	python2 setup.py build_ext --inplace
+	python2 setup.py install --user
+
+# Both test targets first install the development version of the package
+# through their `develop`/`develop2` prerequisite.
+test: develop ## Run nosetests using the default nosetests command
+	nosetests --verbose
+
+test2: develop2 ## Run nosetests using the nosetests2 command
+	nosetests2 --verbose
+
+# Runs the plain `test` target first, then repeats the run with coverage
+# measurement restricted to $(PACKAGE) and writes an HTML report to ./cover.
+cover: test ## Test unit test coverage using default nosetests
+	nosetests \
+		--with-coverage \
+		--cover-package=$(PACKAGE) \
+		--cover-erase \
+		--cover-inclusive \
+		--cover-branches \
+		--cover-html \
+		--cover-html-dir=cover
+
+# Remove the build/dist/egg-info directories, the compiled extension module(s)
+# inside the package directory, and the MANIFEST file.
+clean: ## Clean build dist and egg directories left after install
+	rm -rf \
+		./dist \
+		./build \
+		./$(PACKAGE).egg-info \
+		gensvm/pyx_gensvm*.so
+	rm -f MANIFEST
+
+# `setup.py develop --user` installs the package for the current user in
+# development mode; the test targets depend on this.
+develop: ## Install a development version of the package needed for testing
+	python setup.py develop --user
+
+# Same as `develop`, run with the `python2` command (prerequisite of `test2`).
+develop2: ## Install a development version of the package needed for testing (python2)
+	python2 setup.py develop --user
+
+# Build a source distribution with `setup.py sdist`.
+dist: ## Make Python source distribution
+	python setup.py sdist
+
+# Same as `dist`, run with the `python2` command.
+dist2: ## Make Python 2 source distribution
+	python2 setup.py sdist
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/README.rst
diff --git a/gensvm/__init__.py b/gensvm/__init__.py
new file mode 100644
index 0000000..5e1a743
--- /dev/null
+++ b/gensvm/__init__.py
@@ -0,0 +1,3 @@
+# Package version. setup.py extracts this value with a regular expression
+# that expects the exact form  __version__ = '<version>'  (single quotes), so
+# keep the assignment on one line in this format.
+__version__ = '0.1.0'
+
+# Expose the estimator at package level: ``from gensvm import GenSVM``.
+from .models import GenSVM
diff --git a/gensvm/models.py b/gensvm/models.py
new file mode 100644
index 0000000..f06374b
--- /dev/null
+++ b/gensvm/models.py
@@ -0,0 +1,190 @@
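+# -*- coding: utf-8 -*-
+
+"""
+Scikit-learn style estimator for the GenSVM classifier.
+
+The numerical work is delegated to the compiled ``pyx_gensvm`` extension
+module, which wraps the GenSVM C library.
+"""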
+
+from __future__ import print_function, division
+
+import numpy as np
+import warnings
+
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
+from sklearn.preprocessing import LabelEncoder
+from sklearn.utils import check_X_y, check_random_state
+from sklearn.utils.multiclass import type_of_target
+from sklearn.utils.validation import check_is_fitted
+
+from . import pyx_gensvm
+
+
+def _fit_gensvm(X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, 
+        degree, kernel_eigen_cutoff, verbose, max_iter, random_state=None):
+
+    # process the random state
+    rnd = check_random_state(random_state)
+
+    # set the verbosity in GenSVM
+    pyx_gensvm.set_verbosity_wrap(verbose)
+
+    # run the actual training
+    raw_coef_, n_SV_, n_iter_, training_error_, status_ = pyx_gensvm.train_wrap(
+            X, y, p, lmd, kappa, epsilon, weight_idx, kernel, gamma, coef, 
+            degree, kernel_eigen_cutoff, max_iter, 
+            rnd.randint(np.iinfo('i').max))
+
+    # process output
+    if status_ == 1 and verbose > 0:
+        warnings.warn("GenSVM optimization prematurely ended due to a "
+                "incorrect step in the optimization algorithm.", 
+                FitFailedWarning)
+
+    if status_ == 2 and verbose > 0:
+        warnings.warn("GenSVM failed to converge, increase "
+                "the number of iterations.", ConvergenceWarning)
+
+    coef_ = raw_coef_[1:, :]
+    intercept_ = raw_coef_[0, :]
+
+    return coef_, intercept_, n_iter_, n_SV_
+
+
+class GenSVM(BaseEstimator):
+    """Generalized Multiclass Support Vector Machine Classification.
+
+    This class implements the basic GenSVM classifier. GenSVM is a generalized 
+    multiclass SVM which is flexible in the weighting of misclassification 
+    errors. It is this flexibility that makes it perform well on diverse 
+    datasets.
+
+    This methods of this class use the GenSVM C library for the actual 
+    computations.
+
+    Parameters
+    ----------
+    p : float, optional (default=1.0)
+        Parameter for the L_p norm of the loss function (1.0 <= p <= 2.0)
+
+    lmd : float, optional (default=1e-5)
+        Parameter for the regularization term of the loss function (lmd > 0)
+
+    kappa : float, optional (default=0.0)
+        Parameter for the hinge function in the loss function (kappa > -1.0)
+
+    weight_idx : int, optional (default=1)
+        Type of sample weights to use (1 = unit weights, 2 = size correction 
+        weights)
+
+    kernel : string, optional (default='linear')
+        Specify the kernel type to use in the classifier. It must be one of 
+        'linear', 'poly', 'rbf', or 'sigmoid'.
+
+    gamma : float, optional (default=1.0)
+        Kernel parameter for the rbf, poly, and sigmoid kernel
+
+    coef : float, optional (default=0.0)
+        Kernel parameter for the poly and sigmoid kernel
+
+    degree : float, optional (default=2.0)
+        Kernel parameter for the poly kernel
+
+    kernel_eigen_cutoff : float, optional (default=1e-8)
+        Cutoff point for the reduced eigendecomposition used with 
+        kernel-GenSVM. Eigenvectors for which the ratio between their 
+        corresponding eigenvalue and the largest eigenvalue is smaller than the 
+        cutoff will be dropped.
+
+    verbose : int, (default=0)
+        Enable verbose output
+
+    max_iter : int, (default=1e8)
+        The maximum number of iterations to be run.
+
+
+    Attributes
+    ----------
+    coef_ : array, shape = [n_features, n_classes-1]
+        Weights assigned to the features (coefficients in the primal problem)
+
+    intercept_ : array, shape = [n_classes]
+        Constants in the decision function
+
+    n_iter_ : int
+        The number of iterations that were run during training.
+
+    n_support_ : int
+        The number of support vectors that were found
+
+
+    References
+    ----------
+    * Van den Burg, G.J.J. and Groenen, P.J.F.. GenSVM: A Generalized 
+    Multiclass Support Vector Machine. Journal of Machine Learning Research, 
+    17(225):1--42, 2016.
+
+    """
+
+    def __init__(self, p=1.0, lmd=1e-5, kappa=0.0, epsilon=1e-6, weight_idx=1, 
+            kernel='linear', gamma=1.0, coef=0.0, degree=2.0, 
+            kernel_eigen_cutoff=1e-8, verbose=0, random_state=None, 
+            max_iter=1e8):
+        self.p = p
+        self.lmd = lmd
+        self.kappa = kappa
+        self.epsilon = epsilon
+        self.weight_idx = weight_idx
+        self.kernel = kernel
+        self.gamma = gamma
+        self.coef = coef
+        self.degree = degree
+        self.kernel_eigen_cutoff = kernel_eigen_cutoff
+        self.verbose = verbose
+        self.random_state = random_state
+        self.max_iter = max_iter
+
+
+    def fit(self, X, y):
+        if not 1.0 <= self.p <= 2.0:
+            raise ValueError("Value for p should be within [1, 2]; got p = %r)" 
+                    % self.p)
+        if not self.kappa > -1.0:
+            raise ValueError("Value for kappa should be larger than -1; got "
+                    "kappa = %r" % self.kappa)
+        if not self.lmd > 0:
+            raise ValueError("Value for lmd should be larger than 0; got "
+                    "lmd = %r" % self.lmd)
+        if not self.epsilon > 0:
+            raise ValueError("Value for epsilon should be larger than 0; got "
+                    "epsilon = %r" % self.epsilon)
+        X, y_org = check_X_y(X, y, accept_sparse=False, dtype=np.float64, 
+                order="C")
+
+        y_type = type_of_target(y_org)
+        if y_type not in ["binary", "multiclass"]:
+            raise ValueError("Label type not allowed for GenSVM: %r" % y_type)
+
+        # This is necessary because GenSVM expects classes to go from 1 to 
+        # n_class
+        self.encoder = LabelEncoder()
+        y = self.encoder.fit_transform(y_org)
+        y += 1
+
+        self.coef_, self.intercept_, self.n_iter_, self.n_support_ = \
+                _fit_gensvm(X, y, self.p, self.lmd, self.kappa, self.epsilon, 
+                        self.weight_idx, self.kernel, self.gamma, self.coef, 
+                        self.degree, self.kernel_eigen_cutoff, self.verbose, 
+                        self.max_iter, self.random_state)
+        return self
+
+
+    def predict(self, X):
+        check_is_fitted(self, "coef_")
+
+        V = np.vstack((self.intercept_, self.coef_))
+        predictions = pyx_gensvm.predict_wrap(X, V)
+
+        # Transform the classes back to the original form
+        predictions -= 1
+        outcome = self.encoder.inverse_transform(predictions)
+
+        return outcome
diff --git a/gensvm/pyx_gensvm.pxd b/gensvm/pyx_gensvm.pxd
new file mode 100644
index 0000000..be4d5f5
--- /dev/null
+++ b/gensvm/pyx_gensvm.pxd
@@ -0,0 +1,91 @@
+cimport numpy as np
+
+# Includes
+
+cdef extern from "gensvm_globals.h":
+    # KernelType is declared as an opaque enum: no members are listed here,
+    # so Cython code can only use it as the type of struct fields.
+    ctypedef enum KernelType:
+        pass
+
+cdef extern from "gensvm_sparse.h":
+    # Sparse matrix struct of the GenSVM library together with its
+    # allocation and deallocation functions.
+    # NOTE(review): the field names (values / ia / ja) suggest a compressed
+    # row storage layout -- confirm against gensvm_sparse.h.
+
+    cdef struct GenSparse:
+        long nnz
+        long n_row
+        long n_col
+        double *values
+        long *ia
+        long *ja
+
+    GenSparse *gensvm_init_sparse()
+    void gensvm_free_sparse(GenSparse *)
+
+
+cdef extern from "gensvm_base.h":
+    # Data and model structs of the GenSVM library plus their init/free
+    # functions. Only the fields listed here are visible to Cython code.
+    # NOTE(review): K, n and m presumably hold the number of classes,
+    # observations and features respectively -- confirm against gensvm_base.h.
+
+    cdef struct GenData:
+        long K
+        long n
+        long m
+        long r
+        long *y
+        double *Z
+        GenSparse *spZ
+        double *RAW
+        double *Sigma
+        KernelType kerneltype
+        double *kernelparam
+
+    cdef struct GenModel:
+        int weight_idx
+        long K
+        long n
+        long m
+        double epsilon
+        double p
+        double kappa
+        double lmd
+        double *V
+        double *Vbar
+        double *U
+        double *UU
+        double *Q
+        double *H
+        double *rho
+        double training_error
+        KernelType kerneltype
+        double *kernelparam
+        double kernel_eigen_cutoff
+
+    # Allocation / deallocation of model structs.
+    GenModel *gensvm_init_model()
+    void gensvm_free_model(GenModel *)
+
+    # Allocation / deallocation of data structs.
+    GenData *gensvm_init_data()
+    void gensvm_free_data(GenData *)
+
+cdef extern from "gensvm_train.h":
+
+    # Training entry point of the C library. Declared nogil so that it can be
+    # called inside a `with nogil` block.
+    # NOTE(review): the third argument is presumably an optional seed model
+    # (the wrapper passes NULL) -- confirm against gensvm_train.h.
+    void gensvm_train(GenModel *, GenData *, GenModel *) nogil
+
+cdef extern from "gensvm_sv.h":
+
+    # Returns the number of support vectors of a model.
+    long gensvm_num_sv(GenModel *)
+
+cdef extern from "gensvm_helper.c":
+    # Helper functions that sit between the Cython wrapper and the GenSVM
+    # structs. The argument meanings below follow the calls in
+    # pyx_gensvm.pyx.
+
+    # `char const *` spelled so that Cython emits the const qualifier.
+    ctypedef char* char_const_ptr "char const *"
+    # Arguments after the model, in order: p, lmd, kappa, epsilon, weight_idx,
+    # kernel index, degree, gamma, coef, kernel_eigen_cutoff, max_iter,
+    # random seed.
+    void set_model(GenModel *, double, double, double, double, int, int, 
+            double, double, double, double, long, long)
+    # Arguments after the data struct: raw buffer of X, raw buffer of y, the
+    # shape array of X, and the number of classes.
+    void set_data(GenData *, char *, char *, np.npy_intp *, long)
+    # Returns an error message for an invalid model; the wrapper treats a
+    # false (NULL/empty) result as "no error".
+    char_const_ptr check_model(GenModel *)
+    # Copies the coefficient matrix of the model into the given buffer.
+    void copy_V(void *, GenModel *)
+    # Accessors for results and dimensions stored in the model.
+    long get_iter_count(GenModel *)
+    double get_training_error(GenModel *)
+    int get_status(GenModel *)
+    long get_n(GenModel *)
+    long get_m(GenModel *)
+    long get_K(GenModel *)
+    # Releases a data struct that was filled with set_data.
+    void free_data(GenData *)
+    # Sets the verbosity of the GenSVM library.
+    void set_verbosity(int)
+    # Arguments, in order: raw buffer of X, raw buffer of V, number of test
+    # observations, number of features, number of classes, and the output
+    # buffer for the predicted labels.
+    void gensvm_predict(char *, char *, long, long, long, char *) nogil
diff --git a/gensvm/pyx_gensvm.pyx b/gensvm/pyx_gensvm.pyx
new file mode 100644
index 0000000..394d4ca
--- /dev/null
+++ b/gensvm/pyx_gensvm.pyx
@@ -0,0 +1,123 @@
+"""
+Wrapper for GenSVM
+
+Not implemented yet:
+    - vector of instance weights
+    - class weights
+    - seed model
+    - max_iter = -1 for unlimited
+
+"""
+
+from __future__ import print_function
+
+import numpy as np
+cimport numpy as np
+
+cimport pyx_gensvm
+
+np.import_array()
+
+# The position of a kernel name in this list is the kernel index that is
+# handed to the C helper, so the order must not be changed.
+GENSVM_KERNEL_TYPES = ["linear", "poly", "rbf", "sigmoid"]
+
+def train_wrap(
+        np.ndarray[np.float64_t, ndim=2, mode='c'] X,
+        np.ndarray[np.int_t, ndim=1, mode='c'] y,
+        double p=1.0,
+        double lmd=pow(2, -8),
+        double kappa=0.0,
+        double epsilon=1e-6,
+        int weight_idx=1,
+        str kernel='linear',
+        double gamma=1.0,
+        double coef=0.0,
+        double degree=2.0,
+        double kernel_eigen_cutoff=1e-8,
+        int max_iter=100000000,
+        int random_seed=-1):
+    """
+    Train a GenSVM model with the C library.
+
+    Parameters
+    ----------
+    X : C-contiguous float64 array, shape = [n_samples, n_features]
+        Training data.
+    y : C-contiguous integer array, shape = [n_samples]
+        Class labels; the number of classes is the number of unique values.
+    p, lmd, kappa, epsilon, weight_idx, gamma, coef, degree,
+    kernel_eigen_cutoff, max_iter, random_seed :
+        Model settings, forwarded to the C helper ``set_model``.
+    kernel : str
+        One of the names in ``GENSVM_KERNEL_TYPES``.
+
+    Returns
+    -------
+    tuple
+        ``(V, n_SV, iter_count, training_error, fit_status)`` where ``V`` is
+        a float64 array of shape ``(n_features + 1, n_classes - 1)``.
+
+    Raises
+    ------
+    ValueError
+        If ``kernel`` is unknown or if the C library rejects the model
+        parameters.
+    """
+
+    cdef GenModel *model
+    cdef GenData *data
+    cdef long n_var
+    cdef long n_class
+    cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V
+
+    # Validate the kernel before anything is allocated on the C side, so that
+    # an unknown kernel name cannot leak the model and data structs.
+    if kernel not in GENSVM_KERNEL_TYPES:
+        raise ValueError("Unknown kernel %r; expected one of %r"
+                % (kernel, GENSVM_KERNEL_TYPES))
+    kernel_index = GENSVM_KERNEL_TYPES.index(kernel)
+
+    # get the problem dimensions
+    classes = np.unique(y)
+    n_var = X.shape[1]
+    n_class = classes.shape[0]
+
+    # Initialize model and data
+    model = gensvm_init_model()
+    data = gensvm_init_data()
+
+    try:
+        # Set the data
+        set_data(data, X.data, y.data, X.shape, n_class)
+
+        # Set the model (note the degree, gamma, coef order of the helper)
+        set_model(model, p, lmd, kappa, epsilon, weight_idx, kernel_index,
+                degree, gamma, coef, kernel_eigen_cutoff, max_iter,
+                random_seed)
+
+        # Check the parameters
+        error_msg = check_model(model)
+        if error_msg:
+            raise ValueError(error_msg.decode('utf-8'))
+
+        # Do the actual training
+        with nogil:
+            gensvm_train(model, data, NULL)
+
+        # copy the results
+        V = np.empty((n_var+1, n_class-1))
+        copy_V(V.data, model)
+
+        # get other results from model
+        iter_count = get_iter_count(model)
+        training_error = get_training_error(model)
+        fit_status = get_status(model)
+        n_SV = gensvm_num_sv(model)
+    finally:
+        # free model and data on every exit path, including errors
+        gensvm_free_model(model)
+        free_data(data)
+
+    return (V, n_SV, iter_count, training_error, fit_status)
+
+def predict_wrap(
+        np.ndarray[np.float64_t, ndim=2, mode='c'] X,
+        np.ndarray[np.float64_t, ndim=2, mode='c'] V
+        ):
+    """
+    Predict class labels with the C library.
+
+    Parameters
+    ----------
+    X : C-contiguous float64 array, shape = [n_samples, n_features]
+        Samples to classify.
+    V : C-contiguous float64 array, shape = [n_features + 1, n_classes - 1]
+        Coefficient matrix as returned by ``train_wrap``.
+
+    Returns
+    -------
+    predictions : integer array, shape = [n_samples]
+        Labels as written by the C routine ``gensvm_predict``.
+
+    Raises
+    ------
+    ValueError
+        If the number of rows of V does not equal the number of columns of
+        X plus one.
+    """
+
+    cdef long n_test_obs
+    cdef long n_var
+    cdef long n_class
+    cdef np.ndarray[np.int_t, ndim=1, mode='c'] predictions
+
+    n_test_obs = X.shape[0]
+    n_var = X.shape[1]
+    n_class = V.shape[1] + 1
+
+    # The C routine is only told the dimensions of X, so a V with a different
+    # number of rows would be read out of bounds.
+    if V.shape[0] != n_var + 1:
+        raise ValueError("V has %d rows, expected %d (number of features "
+                "in X plus one)" % (V.shape[0], n_var + 1))
+
+    # output vector; np.int_ matches the np.int_t buffer type (the np.int
+    # alias no longer exists in current NumPy releases)
+    predictions = np.empty((n_test_obs, ), dtype=np.int_)
+
+    # do the prediction
+    with nogil:
+        gensvm_predict(X.data, V.data, n_test_obs, n_var, n_class,
+                predictions.data)
+
+    return predictions
+
+def set_verbosity_wrap(int verbosity):
+    """
+    Control verbosity of gensvm library
+
+    The integer is forwarded unchanged to the C helper ``set_verbosity``.
+    """
+    set_verbosity(verbosity)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6d369ef
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import re
+import numpy
+
+from numpy.distutils.core import setup
+from numpy.distutils.misc_util import Configuration
+from sklearn._build_utils import get_blas_info, maybe_cythonize_extensions
+
+
+def configuration(parent_package='', top_path=None):
+    config = Configuration('gensvm', parent_package, top_path)
+
+    # gensvm module
+    cblas_libs, blas_info = get_blas_info()
+    if os.name == 'posix':
+        cblas_libs.append('m')
+
+    gensvm_sources = [
+            os.path.join('gensvm', 'pyx_gensvm.pyx'),
+            os.path.join('gensvm', 'src', 'gensvm', 'src', '*.c'),
+            ]
+
+    gensvm_depends = [
+            os.path.join('gensvm', 'src', 'gensvm', 'include', '*.h'),
+            os.path.join('gensvm', 'src', 'gensvm', 'gensvm_helper.c')
+            ]
+
+    config.add_extension('pyx_gensvm',
+            sources=gensvm_sources,
+            libraries=cblas_libs,
+            include_dirs=[
+                os.path.join('gensvm', 'src', 'gensvm'),
+                os.path.join('gensvm', 'src', 'gensvm', 'include'),
+                numpy.get_include(),
+                blas_info.pop('include_dirs', [])],
+            extra_compile_args=blas_info.pop('extra_compile_args', []),
+            depends=gensvm_depends,
+            **blas_info)
+    # end gensvm module
+
+    maybe_cythonize_extensions(top_path, config)
+
+    return config
+
+
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+
+if __name__ == '__main__':
+
+    version = re.search("__version__ = '([^']+)'", 
+            open('gensvm/__init__.py').read()).group(1)
+
+    attr = configuration(top_path='').todict()
+
+    attr['description'] = 'Python package for the GenSVM classifier'
+    attr['long_description'] = read('README.rst')
+    attr['packages'] = ['gensvm']
+    attr['url'] = "https://github.com/GjjvdBurg/PyGenSVM"
+    attr['author'] = "G.J.J. van den Burg"
+    attr['author_email'] = "gertjanvandenburg@gmail.com"
+    attr['license'] = 'GPL v2'
+    attr['install_requires'] = ['scikit-learn', 'numpy']
+
+    setup(**attr)