diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2017-12-13 14:08:00 -0500 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2017-12-13 14:08:00 -0500 |
| commit | 804df540d007b13fe8e0d1bb3df535e84618ef9f (patch) | |
| tree | 992d8c17f655a05d4caf034b2a09ac549a353a13 /docs | |
| parent | Manually generate restructured text for class documentation (diff) | |
| download | pygensvm-804df540d007b13fe8e0d1bb3df535e84618ef9f.tar.gz pygensvm-804df540d007b13fe8e0d1bb3df535e84618ef9f.zip | |
amend last commit
Diffstat (limited to 'docs')
| -rw-r--r-- | docs/cls_gensvm.rst | 113 | ||||
| -rw-r--r-- | docs/cls_gridsearch.rst | 271 | ||||
| -rw-r--r-- | docs/generate_autodocs.py | 76 |
3 files changed, 460 insertions, 0 deletions
diff --git a/docs/cls_gensvm.rst b/docs/cls_gensvm.rst new file mode 100644 index 0000000..082df8f --- /dev/null +++ b/docs/cls_gensvm.rst @@ -0,0 +1,113 @@ + +.. py:class:: GenSVM(p=1.0, lmd=1e-05, kappa=0.0, epsilon=1e-06, weights='unit', kernel='linear', gamma='auto', coef=0.0, degree=2.0, kernel_eigen_cutoff=1e-08, verbose=0, random_state=None, max_iter=100000000.0) + :noindex: + :module: gensvm.core + + Generalized Multiclass Support Vector Machine Classification. + + This class implements the basic GenSVM classifier. GenSVM is a generalized + multiclass SVM which is flexible in the weighting of misclassification + errors. It is this flexibility that makes it perform well on diverse + datasets. + + The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class + use the GenSVM C library for the actual computations. + + :param p: Parameter for the L_p norm of the loss function (1.0 <= p <= 2.0) + :type p: float, optional (default=1.0) + :param lmd: Parameter for the regularization term of the loss function (lmd > 0) + :type lmd: float, optional (default=1e-5) + :param kappa: Parameter for the hinge function in the loss function (kappa > -1.0) + :type kappa: float, optional (default=0.0) + :param weights: Type of sample weights to use. Options are 'unit' for unit weights and + 'group' for group size correction weights (equation 4 in the paper). + :type weights: string, optional (default='unit') + :param kernel: Specify the kernel type to use in the classifier. It must be one of + 'linear', 'poly', 'rbf', or 'sigmoid'. + :type kernel: string, optional (default='linear') + :param gamma: Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is + 'auto' then 1/n_features will be used. 
+ :type gamma: float, optional (default='auto') + :param coef: Kernel parameter for the poly and sigmoid kernel + :type coef: float, optional (default=0.0) + :param degree: Kernel parameter for the poly kernel + :type degree: float, optional (default=2.0) + :param kernel_eigen_cutoff: Cutoff point for the reduced eigendecomposition used with + kernel-GenSVM. Eigenvectors for which the ratio between their + corresponding eigenvalue and the largest eigenvalue is smaller than the + cutoff will be dropped. + :type kernel_eigen_cutoff: float, optional (default=1e-8) + :param verbose: Enable verbose output + :type verbose: int, (default=0) + :param max_iter: The maximum number of iterations to be run. + :type max_iter: int, (default=1e8) + + .. attribute:: coef_ + + *array, shape = [n_features, n_classes-1]* -- Weights assigned to the features (coefficients in the primal problem) + + .. attribute:: intercept_ + + *array, shape = [n_classes-1]* -- Constants in the decision function + + .. attribute:: combined_coef_ + + *array, shape = [n_features+1, n_classes-1]* -- Combined weights matrix for the seed_V parameter to the fit method + + .. attribute:: n_iter_ + + *int* -- The number of iterations that were run during training. + + .. attribute:: n_support_ + + *int* -- The number of support vectors that were found + + .. seealso:: + + :class:`.GenSVMGridSearchCV` + Helper class to run an efficient grid search for GenSVM. + + + .. py:method:: GenSVM.fit(X, y, seed_V=None) + :noindex: + :module: gensvm.core + + Fit the GenSVM model on the given data + + The model can be fit with or without a seed matrix (``seed_V``). This + can be used to provide warm starts for the algorithm. + + :param X: The input data. It is expected that only numeric data is given. + :type X: array, shape = (n_observations, n_features) + :param y: The label vector, labels can be numbers or strings. 
+ :type y: array, shape = (n_observations, ) + :param seed_V: Seed coefficient array to use as a warm start for the optimization. + It can for instance be the :attr:`combined_coef_ + <.GenSVM.combined_coef_>` attribute of a different GenSVM model. + This is only supported for the linear kernel. + + NOTE: the size of the seed_V matrix is ``n_features+1`` by + ``n_classes - 1``. The number of columns of ``seed_V`` is leading + for the number of classes in the model. For example, if ``y`` + contains 3 different classes and ``seed_V`` has 3 columns, we + assume that there are actually 4 classes in the problem but one + class is just not represented in this training data. This can be useful + for problems where a certain class has only a few samples. + :type seed_V: array, shape = (n_features+1, n_classes-1), optional + + :returns: **self** -- Returns self. + :rtype: object + + + .. py:method:: GenSVM.predict(X) + :noindex: + :module: gensvm.core + + Predict the class labels on the given data + + :param X: + :type X: array, shape = [n_samples, n_features] + + :returns: **y_pred** + :rtype: array, shape = (n_samples, ) + diff --git a/docs/cls_gridsearch.rst b/docs/cls_gridsearch.rst new file mode 100644 index 0000000..8708123 --- /dev/null +++ b/docs/cls_gridsearch.rst @@ -0,0 +1,271 @@ + +.. py:class:: GenSVMGridSearchCV(param_grid, scoring=None, iid=True, cv=None, refit=True, verbose=0, return_train_score=True) + :noindex: + :module: gensvm.gridsearch + + GenSVM cross validated grid search + + This class implements efficient GenSVM grid search with cross validation. + One of the strong features of GenSVM is that seeding the classifier + properly can greatly reduce total training time. This class ensures that + the grid search is done in the most efficient way possible. + + The implementation of this class is based on the `GridSearchCV`_ class in + scikit-learn. The documentation of the various parameters is therefore + mostly the same. 
This is done to provide the user with a familiar and + easy-to-use interface to doing a grid search with GenSVM. A separate class + was needed to benefit from the fast low-level C implementation of grid + search in the GenSVM library. + + :param param_grid: Dictionary of parameter names (strings) as keys and lists of parameter + settings to evaluate as values, or a list of such dicts. The GenSVM + model will be evaluated at all combinations of the parameters. + :type param_grid: dict or list of dicts + :param scoring: A single string (see :ref:`scoring_parameter`) or a callable (see + :ref:`scoring`) to evaluate the predictions on the test set. + + For evaluating multiple metrics, either give a list of (unique) strings + or a dict with names as keys and callables as values. + + NOTE that when using custom scorers, each scorer should return a single + value. Metric functions returning a list/array of values can be wrapped + into multiple scorers that return one value each. + + If None, the `accuracy_score`_ is used. + :type scoring: string, callable, list/tuple, dict or None + :param iid: If True, the data is assumed to be identically distributed across the + folds, and the loss minimized is the total loss per sample and not the + mean loss across the folds. + :type iid: boolean, default=True + :param cv: Determines the cross-validation splitting strategy. Possible inputs for + cv are: + + - None, to use the default 3-fold cross validation, + - integer, to specify the number of folds in a `(Stratified)KFold`, + - An object to be used as a cross-validation generator. + - An iterable yielding train, test splits. + + For integer/None inputs, :class:`StratifiedKFold + <sklearn.model_selection.StratifiedKFold>` is used. In all other + cases, :class:`KFold <sklearn.model_selection.KFold>` is used. + + Refer to the `scikit-learn User Guide on cross validation`_ for the + various strategies that can be used here. 
+ :type cv: int, cross-validation generator or an iterable, optional + :param refit: Refit the GenSVM estimator with the best found parameters on the whole + dataset. + + For multiple metric evaluation, this needs to be a string denoting the + scorer to be used to find the best parameters for refitting the + estimator at the end. + + The refitted estimator is made available at the :attr:`best_estimator_ + <.GenSVMGridSearchCV.best_estimator_>` attribute and allows the user to + use the :func:`~GenSVMGridSearchCV.predict` method directly on this + :class:`.GenSVMGridSearchCV` instance. + + Also for multiple metric evaluation, the attributes :attr:`best_index_ + <.GenSVMGridSearchCV.best_index_>`, :attr:`best_score_ + <.GenSVMGridSearchCV.best_score_>` and :attr:`best_params_ + <.GenSVMGridSearchCV.best_params_>` will only be available if ``refit`` + is set and all of them will be determined w.r.t this specific scorer. + + See ``scoring`` parameter to know more about multiple metric + evaluation. + :type refit: boolean, or string, default=True + :param verbose: Controls the verbosity: the higher, the more messages. + :type verbose: integer + :param return_train_score: If ``False``, the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>` + attribute will not include training scores. + :type return_train_score: boolean, default=True + + .. rubric:: Examples + + >>> from gensvm import GenSVMGridSearchCV + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> param_grid = {'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]} + >>> clf = GenSVMGridSearchCV(param_grid) + >>> clf.fit(iris.data, iris.target) + GenSVMGridSearchCV(cv=None, iid=True, + param_grid={'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]}, + refit=True, return_train_score=True, scoring=None, verbose=0) + + .. attribute:: cv_results_ + + *dict of numpy (masked) ndarrays* -- A dict with keys as column headers and values as columns, that can be + imported into a pandas `DataFrame`_. 
+ + For instance the below given table + + +------------+-----------+------------+-----------------+---+---------+ + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| + +============+===========+============+=================+===+=========+ + | 'poly' | -- | 2 | 0.8 |...| 2 | + +------------+-----------+------------+-----------------+---+---------+ + | 'poly' | -- | 3 | 0.7 |...| 4 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.1 | -- | 0.8 |...| 3 | + +------------+-----------+------------+-----------------+---+---------+ + | 'rbf' | 0.2 | -- | 0.9 |...| 1 | + +------------+-----------+------------+-----------------+---+---------+ + + will be represented by a ``cv_results_`` dict of:: + + { + 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], + mask = [False False False False]...) + 'param_gamma': masked_array(data = [-- -- 0.1 0.2], + mask = [ True True False False]...), + 'param_degree': masked_array(data = [2.0 3.0 -- --], + mask = [False False True True]...), + 'split0_test_score' : [0.8, 0.7, 0.8, 0.9], + 'split1_test_score' : [0.82, 0.5, 0.7, 0.78], + 'mean_test_score' : [0.81, 0.60, 0.75, 0.82], + 'std_test_score' : [0.02, 0.01, 0.03, 0.03], + 'rank_test_score' : [2, 4, 3, 1], + 'split0_train_score' : [0.8, 0.9, 0.7], + 'split1_train_score' : [0.82, 0.5, 0.7], + 'mean_train_score' : [0.81, 0.7, 0.7], + 'std_train_score' : [0.03, 0.03, 0.04], + 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], + 'std_fit_time' : [0.01, 0.02, 0.01, 0.01], + 'mean_score_time' : [0.007, 0.06, 0.04, 0.04], + 'std_score_time' : [0.001, 0.002, 0.003, 0.005], + 'params' : [{'kernel': 'poly', 'degree': 2}, ...], + } + + NOTE: + + The key ``'params'`` is used to store a list of parameter settings + dicts for all the parameter candidates. + + The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and + ``std_score_time`` are all in seconds. 
+ + For multi-metric evaluation, the scores for all the scorers are + available in the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>` + dict at the keys ending with that scorer's name (``'_<scorer_name>'``) + instead of ``'_score'`` shown above. ('split0_test_precision', + 'mean_train_precision' etc.) + + .. attribute:: best_estimator_ + + *estimator or dict* -- Estimator that was chosen by the search, i.e. estimator which gave + highest score (or smallest loss if specified) on the left out data. Not + available if ``refit=False``. + + See ``refit`` parameter for more information on allowed values. + + .. attribute:: best_score_ + + *float* -- Mean cross-validated score of the best_estimator + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + .. attribute:: best_params_ + + *dict* -- Parameter setting that gave the best results on the hold out data. + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + .. attribute:: best_index_ + + *int* -- The index (of the ``cv_results_`` arrays) which corresponds to the best + candidate parameter setting. + + The dict at ``search.cv_results_['params'][search.best_index_]`` gives + the parameter setting for the best model, that gives the highest mean + score (``search.best_score_``). + + For multi-metric evaluation, this is present only if ``refit`` is + specified. + + .. attribute:: scorer_ + + *function or a dict* -- Scorer function used on the held out data to choose the best parameters + for the model. + + For multi-metric evaluation, this attribute holds the validated + ``scoring`` dict which maps the scorer key to the scorer callable. + + .. attribute:: n_splits_ + + *int* -- The number of cross-validation splits (folds/iterations). + + .. rubric:: Notes + + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + .. 
seealso:: + + `ParameterGrid`_: + Generates all the combinations of a hyperparameter grid. + + :class:`.GenSVM`: + The GenSVM classifier + + .. _GridSearchCV: + http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html + .. _accuracy_score: + http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html + .. _scikit-learn User Guide on cross validation: + http://scikit-learn.org/stable/modules/cross_validation.html + + .. _ParameterGrid: + http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html + .. _DataFrame: + https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html + + + .. py:method:: GenSVMGridSearchCV.fit(X, y, groups=None) + :noindex: + :module: gensvm.gridsearch + + Run GenSVM grid search with all sets of parameters + + :param X: Training data, where n_samples is the number of observations and + n_features is the number of features. + :type X: array-like, shape = (n_samples, n_features) + :param y: Target vector for the training data. + :type y: array-like, shape = (n_samples, ) + :param groups: Group labels for the samples used while splitting the dataset into + train/test sets. + :type groups: array-like, with shape (n_samples, ), optional + + :returns: **self** -- Return self. + :rtype: object + + + .. py:method:: GenSVMGridSearchCV.predict(X) + :noindex: + :module: gensvm.gridsearch + + Predict the class labels on the test data + + :param X: Test data, where n_samples is the number of observations and + n_features is the number of features. + :type X: array-like, shape = (n_samples, n_features) + + :returns: **y_pred** -- Predicted class labels of the data in X. + :rtype: array-like, shape = (n_samples, ) + + + .. 
py:method:: GenSVMGridSearchCV.score(X, y) + :noindex: + :module: gensvm.gridsearch + + Compute the score on the test data given the true labels + + :param X: Test data, where n_samples is the number of observations and + n_features is the number of features. + :type X: array-like, shape = (n_samples, n_features) + :param y: True labels for the test data. + :type y: array-like, shape = (n_samples, ) + + :returns: **score** + :rtype: float + diff --git a/docs/generate_autodocs.py b/docs/generate_autodocs.py new file mode 100644 index 0000000..6138287 --- /dev/null +++ b/docs/generate_autodocs.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +""" +This script manually generates the autodoc RST files for the classes we want to +document. By doing this, we can generate the documentation on Read The Docs +(RTD). If we try to use vanilla autodoc, we run into the problem that a +working Blas installation is necessary to install the GenSVM python package and +this is not available in the RTD VM. + +Author: Gertjan van den Burg + +""" + +import os + +from docutils.statemachine import StringList, ViewList + +from sphinx.ext.autodoc import AutoDirective, ClassDocumenter, Options +from sphinx.application import Sphinx +from sphinx.environment import BuildEnvironment + +BASE_DIR = '/home/gertjan/Dropbox/phd/research/msvm/python/start_here/' +DOCDIR = os.path.join(BASE_DIR, 'gensvm', 'docs') + +CLASSES = [ + 'GenSVMGridSearchCV', + 'GenSVM' + ] + +FULL_NAMES = { + 'GenSVM': 'gensvm.core.GenSVM', + 'GenSVMGridSearchCV': 'gensvm.gridsearch.GenSVMGridSearchCV' + } + +OUTPUT_FILES = { + 'GenSVMGridSearchCV': os.path.join(DOCDIR, 'cls_gridsearch.rst'), + 'GenSVM': os.path.join(DOCDIR, 'cls_gensvm.rst') + } + + +def load_app(): + srcdir = DOCDIR[:] + confdir = DOCDIR[:] + outdir = os.path.join(BASE_DIR, 'gensvm_docs', 'html') + doctreedir = os.path.join(BASE_DIR, 'gensvm_docs', 'doctrees') + buildername = 'html' + + app = Sphinx(srcdir, confdir, outdir, doctreedir, buildername) + return 
app + + +def generate_autodoc(app, cls): + ad = AutoDirective(name='autoclass', arguments=[FULL_NAMES[cls]], + options={'noindex': True}, content=StringList([], items=[]), + lineno=0, content_offset=1, block_text='', state=None, + state_machine=None) + + ad.env = BuildEnvironment(app) + ad.genopt = Options(noindex=True) + ad.filename_set = set() + ad.result = ViewList() + + documenter = ClassDocumenter(ad, ad.arguments[0]) + documenter.generate(all_members=True) + + with open(OUTPUT_FILES[cls], 'w') as fid: + for line in ad.result: + fid.write(line + '\n') + +def main(): + app = load_app() + for cls in CLASSES: + generate_autodoc(app, cls) + +if __name__ == '__main__': + main() |
