amend last commit

author: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2017-12-13 14:08:00 -0500
committer: Gertjan van den Burg <gertjanvandenburg@gmail.com> 2017-12-13 14:08:00 -0500
commit: 804df540d007b13fe8e0d1bb3df535e84618ef9f (patch)
tree: 992d8c17f655a05d4caf034b2a09ac549a353a13 /docs
parent: Manually generate restructured text for class documentation (diff)
download: pygensvm-804df540d007b13fe8e0d1bb3df535e84618ef9f.tar.gz
pygensvm-804df540d007b13fe8e0d1bb3df535e84618ef9f.zip
3 files changed, 460 insertions, 0 deletions
diff --git a/docs/cls_gensvm.rst b/docs/cls_gensvm.rst
new file mode 100644
index 0000000..082df8f
--- /dev/null
+++ b/docs/cls_gensvm.rst
@@ -0,0 +1,113 @@
+
+.. py:class:: GenSVM(p=1.0, lmd=1e-05, kappa=0.0, epsilon=1e-06, weights='unit', kernel='linear', gamma='auto', coef=0.0, degree=2.0, kernel_eigen_cutoff=1e-08, verbose=0, random_state=None, max_iter=100000000.0)
+   :noindex:
+   :module: gensvm.core
+
+   Generalized Multiclass Support Vector Machine Classification.
+   
+   This class implements the basic GenSVM classifier. GenSVM is a generalized
+   multiclass SVM which is flexible in the weighting of misclassification
+   errors. It is this flexibility that makes it perform well on diverse
+   datasets.
+   
+   The :func:`~GenSVM.fit` and :func:`~GenSVM.predict` methods of this class
+   use the GenSVM C library for the actual computations.
+   
+   :param p: Parameter for the L_p norm of the loss function (1.0 <= p <= 2.0)
+   :type p: float, optional (default=1.0)
+   :param lmd: Parameter for the regularization term of the loss function (lmd > 0)
+   :type lmd: float, optional (default=1e-5)
+   :param kappa: Parameter for the hinge function in the loss function (kappa > -1.0)
+   :type kappa: float, optional (default=0.0)
+   :param weights: Type of sample weights to use. Options are 'unit' for unit weights and
+                   'group' for group size correction weights (equation 4 in the paper).
+   :type weights: string, optional (default='unit')
+   :param kernel: Specify the kernel type to use in the classifier. It must be one of
+                  'linear', 'poly', 'rbf', or 'sigmoid'.
+   :type kernel: string, optional (default='linear')
+   :param gamma: Kernel parameter for the rbf, poly, and sigmoid kernel. If gamma is
+                 'auto' then 1/n_features will be used.
+   :type gamma: float or 'auto', optional (default='auto')
+   :param coef: Kernel parameter for the poly and sigmoid kernel
+   :type coef: float, optional (default=0.0)
+   :param degree: Kernel parameter for the poly kernel
+   :type degree: float, optional (default=2.0)
+   :param kernel_eigen_cutoff: Cutoff point for the reduced eigendecomposition used with
+                               kernel-GenSVM. Eigenvectors for which the ratio between their
+                               corresponding eigenvalue and the largest eigenvalue is smaller than the
+                               cutoff will be dropped.
+   :type kernel_eigen_cutoff: float, optional (default=1e-8)
+   :param verbose: Enable verbose output
+   :type verbose: int, (default=0)
+   :param max_iter: The maximum number of iterations to be run.
+   :type max_iter: int, (default=1e8)
+   
+   .. attribute:: coef_
+   
+      *array, shape = [n_features, n_classes-1]* -- Weights assigned to the features (coefficients in the primal problem)
+   
+   .. attribute:: intercept_
+   
+      *array, shape = [n_classes-1]* -- Constants in the decision function
+   
+   .. attribute:: combined_coef_
+   
+      *array, shape = [n_features+1, n_classes-1]* -- Combined weights matrix for the seed_V parameter to the fit method
+   
+   .. attribute:: n_iter_
+   
+      *int* -- The number of iterations that were run during training.
+   
+   .. attribute:: n_support_
+   
+      *int* -- The number of support vectors that were found
+   
+   .. seealso::
+   
+      :class:`.GenSVMGridSearchCV`
+          Helper class to run an efficient grid search for GenSVM.
+   
+   
+   .. py:method:: GenSVM.fit(X, y, seed_V=None)
+      :noindex:
+      :module: gensvm.core
+   
+      Fit the GenSVM model on the given data
+      
+      The model can be fit with or without a seed matrix (``seed_V``). This
+      can be used to provide warm starts for the algorithm.
+      
+      :param X: The input data. It is expected that only numeric data is given.
+      :type X: array, shape = (n_observations, n_features)
+      :param y: The label vector, labels can be numbers or strings.
+      :type y: array, shape = (n_observations, )
+      :param seed_V: Seed coefficient array to use as a warm start for the optimization.
+                     It can for instance be the :attr:`combined_coef_
+                     <.GenSVM.combined_coef_>` attribute of a different GenSVM model.
+                     This is only supported for the linear kernel.
+      
+                     NOTE: the size of the seed_V matrix is ``n_features+1`` by
+                     ``n_classes - 1``.  The number of columns of ``seed_V`` determines
+                     the number of classes in the model. For example, if ``y``
+                     contains 3 different classes and ``seed_V`` has 3 columns, we
+                     assume that there are actually 4 classes in the problem but one
+                     class is just not represented in this training data. This can be
+                     useful for problems where a certain class has only a few samples.
+      :type seed_V: array, shape = (n_features+1, n_classes-1), optional
+      
+      :returns: **self** -- Returns self.
+      :rtype: object
+      
+   
+   .. py:method:: GenSVM.predict(X)
+      :noindex:
+      :module: gensvm.core
+   
+      Predict the class labels on the given data
+      
+      :param X:
+      :type X: array, shape = [n_samples, n_features]
+      
+      :returns: **y_pred**
+      :rtype: array, shape = (n_samples, )
+      
diff --git a/docs/cls_gridsearch.rst b/docs/cls_gridsearch.rst
new file mode 100644
index 0000000..8708123
--- /dev/null
+++ b/docs/cls_gridsearch.rst
@@ -0,0 +1,271 @@
+
+.. py:class:: GenSVMGridSearchCV(param_grid, scoring=None, iid=True, cv=None, refit=True, verbose=0, return_train_score=True)
+   :noindex:
+   :module: gensvm.gridsearch
+
+   GenSVM cross validated grid search
+   
+   This class implements efficient GenSVM grid search with cross validation.
+   One of the strong features of GenSVM is that seeding the classifier
+   properly can greatly reduce total training time. This class ensures that
+   the grid search is done in the most efficient way possible.
+   
+   The implementation of this class is based on the `GridSearchCV`_ class in
+   scikit-learn. The documentation of the various parameters is therefore
+   mostly the same. This is done to provide the user with a familiar and
+   easy-to-use interface to doing a grid search with GenSVM. A separate class
+   was needed to benefit from the fast low-level C implementation of grid
+   search in the GenSVM library.
+   
+   :param param_grid: Dictionary of parameter names (strings) as keys and lists of parameter
+                      settings to evaluate as values, or a list of such dicts. The GenSVM
+                      model will be evaluated at all combinations of the parameters.
+   :type param_grid: dict or list of dicts
+   :param scoring: A single string (see :ref:`scoring_parameter`) or a callable (see
+                   :ref:`scoring`) to evaluate the predictions on the test set.
+   
+                   For evaluating multiple metrics, either give a list of (unique) strings
+                   or a dict with names as keys and callables as values.
+   
+                   NOTE that when using custom scorers, each scorer should return a single
+                   value. Metric functions returning a list/array of values can be wrapped
+                   into multiple scorers that return one value each.
+   
+                   If None, the `accuracy_score`_ is used.
+   :type scoring: string, callable, list/tuple, dict or None
+   :param iid: If True, the data is assumed to be identically distributed across the
+               folds, and the loss minimized is the total loss per sample and not the
+               mean loss across the folds.
+   :type iid: boolean, default=True
+   :param cv: Determines the cross-validation splitting strategy. Possible inputs for
+              cv are:
+   
+                - None, to use the default 3-fold cross validation,
+                - integer, to specify the number of folds in a `(Stratified)KFold`,
+                - An object to be used as a cross-validation generator.
+                - An iterable yielding train, test splits.
+   
+              For integer/None inputs, :class:`StratifiedKFold
+              <sklearn.model_selection.StratifiedKFold>` is used.  In all other
+              cases, :class:`KFold <sklearn.model_selection.KFold>` is used.
+   
+              Refer to the `scikit-learn User Guide on cross validation`_ for the
+              various strategies that can be used here.
+   :type cv: int, cross-validation generator or an iterable, optional
+   :param refit: Refit the GenSVM estimator with the best found parameters on the whole
+                 dataset.
+   
+                 For multiple metric evaluation, this needs to be a string denoting the
+                 scorer to be used to find the best parameters for refitting the
+                 estimator at the end.
+   
+                 The refitted estimator is made available at the :attr:`best_estimator_
+                 <.GenSVMGridSearchCV.best_estimator_>` attribute and allows the user to
+                 use the :func:`~GenSVMGridSearchCV.predict` method directly on this
+                 :class:`.GenSVMGridSearchCV` instance.
+   
+                 Also for multiple metric evaluation, the attributes :attr:`best_index_
+                 <.GenSVMGridSearchCV.best_index_>`, :attr:`best_score_
+                 <.GenSVMGridSearchCV.best_score_>` and :attr:`best_params_
+                 <.GenSVMGridSearchCV.best_params_>` will only be available if ``refit``
+                 is set and all of them will be determined w.r.t this specific scorer.
+   
+                 See ``scoring`` parameter to know more about multiple metric
+                 evaluation.
+   :type refit: boolean, or string, default=True
+   :param verbose: Controls the verbosity: the higher, the more messages.
+   :type verbose: integer
+   :param return_train_score: If ``False``, the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>`
+                              attribute will not include training scores.
+   :type return_train_score: boolean, default=True
+   
+   .. rubric:: Examples
+   
+   >>> from gensvm import GenSVMGridSearchCV
+   >>> from sklearn.datasets import load_iris
+   >>> iris = load_iris()
+   >>> param_grid = {'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]}
+   >>> clf = GenSVMGridSearchCV(param_grid)
+   >>> clf.fit(iris.data, iris.target)
+   GenSVMGridSearchCV(cv=None, iid=True,
+         param_grid={'p': [1.0, 2.0], 'kappa': [-0.9, 0.0, 1.0]},
+         refit=True, return_train_score=True, scoring=None, verbose=0)
+   
+   .. attribute:: cv_results_
+   
+      *dict of numpy (masked) ndarrays* -- A dict with keys as column headers and values as columns, that can be
+      imported into a pandas `DataFrame`_.
+   
+      For instance the below given table
+   
+      +------------+-----------+------------+-----------------+---+---------+
+      |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|
+      +============+===========+============+=================+===+=========+
+      |  'poly'    |     --    |      2     |        0.8      |...|    2    |
+      +------------+-----------+------------+-----------------+---+---------+
+      |  'poly'    |     --    |      3     |        0.7      |...|    4    |
+      +------------+-----------+------------+-----------------+---+---------+
+      |  'rbf'     |     0.1   |     --     |        0.8      |...|    3    |
+      +------------+-----------+------------+-----------------+---+---------+
+      |  'rbf'     |     0.2   |     --     |        0.9      |...|    1    |
+      +------------+-----------+------------+-----------------+---+---------+
+   
+      will be represented by a ``cv_results_`` dict of::
+   
+          {
+          'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
+                                       mask = [False False False False]...)
+          'param_gamma': masked_array(data = [-- -- 0.1 0.2],
+                                      mask = [ True  True False False]...),
+          'param_degree': masked_array(data = [2.0 3.0 -- --],
+                                       mask = [False False  True  True]...),
+          'split0_test_score'  : [0.8, 0.7, 0.8, 0.9],
+          'split1_test_score'  : [0.82, 0.5, 0.7, 0.78],
+          'mean_test_score'    : [0.81, 0.60, 0.75, 0.82],
+          'std_test_score'     : [0.02, 0.01, 0.03, 0.03],
+          'rank_test_score'    : [2, 4, 3, 1],
+          'split0_train_score' : [0.8, 0.9, 0.7],
+          'split1_train_score' : [0.82, 0.5, 0.7],
+          'mean_train_score'   : [0.81, 0.7, 0.7],
+          'std_train_score'    : [0.03, 0.03, 0.04],
+          'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
+          'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
+          'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
+          'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
+          'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
+          }
+   
+      NOTE:
+   
+      The key ``'params'`` is used to store a list of parameter settings
+      dicts for all the parameter candidates.
+   
+      The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
+      ``std_score_time`` are all in seconds.
+   
+      For multi-metric evaluation, the scores for all the scorers are
+      available in the :attr:`cv_results_ <.GenSVMGridSearchCV.cv_results_>`
+      dict at the keys ending with that scorer's name (``'_<scorer_name>'``)
+      instead of ``'_score'`` shown above. ('split0_test_precision',
+      'mean_train_precision' etc.)
+   
+   .. attribute:: best_estimator_
+   
+      *estimator or dict* -- Estimator that was chosen by the search, i.e. estimator which gave
+      highest score (or smallest loss if specified) on the left out data. Not
+      available if ``refit=False``.
+   
+      See ``refit`` parameter for more information on allowed values.
+   
+   .. attribute:: best_score_
+   
+      *float* -- Mean cross-validated score of the best_estimator
+   
+      For multi-metric evaluation, this is present only if ``refit`` is
+      specified.
+   
+   .. attribute:: best_params_
+   
+      *dict* -- Parameter setting that gave the best results on the hold out data.
+   
+      For multi-metric evaluation, this is present only if ``refit`` is
+      specified.
+   
+   .. attribute:: best_index_
+   
+      *int* -- The index (of the ``cv_results_`` arrays) which corresponds to the best
+      candidate parameter setting.
+   
+      The dict at ``search.cv_results_['params'][search.best_index_]`` gives
+      the parameter setting for the best model, that gives the highest mean
+      score (``search.best_score_``).
+   
+      For multi-metric evaluation, this is present only if ``refit`` is
+      specified.
+   
+   .. attribute:: scorer_
+   
+      *function or a dict* -- Scorer function used on the held out data to choose the best parameters
+      for the model.
+   
+      For multi-metric evaluation, this attribute holds the validated
+      ``scoring`` dict which maps the scorer key to the scorer callable.
+   
+   .. attribute:: n_splits_
+   
+      *int* -- The number of cross-validation splits (folds/iterations).
+   
+   .. rubric:: Notes
+   
+   The parameters selected are those that maximize the score of the left out
+   data, unless an explicit score is passed in which case it is used instead.
+   
+   .. seealso::
+   
+      `ParameterGrid`_:
+          Generates all the combinations of a hyperparameter grid.
+   
+      :class:`.GenSVM`:
+          The GenSVM classifier
+   
+      .. _GridSearchCV:
+          http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
+      .. _accuracy_score:
+          http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
+      .. _scikit-learn User Guide on cross validation:
+          http://scikit-learn.org/stable/modules/cross_validation.html
+   
+      .. _ParameterGrid:
+          http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterGrid.html
+      .. _DataFrame:
+          https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
+   
+   
+   .. py:method:: GenSVMGridSearchCV.fit(X, y, groups=None)
+      :noindex:
+      :module: gensvm.gridsearch
+   
+      Run GenSVM grid search with all sets of parameters
+      
+      :param X: Training data, where n_samples is the number of observations and
+                n_features is the number of features.
+      :type X: array-like, shape = (n_samples, n_features)
+      :param y: Target vector for the training data.
+      :type y: array-like, shape = (n_samples, )
+      :param groups: Group labels for the samples used while splitting the dataset into
+                     train/test sets.
+      :type groups: array-like, with shape (n_samples, ), optional
+      
+      :returns: **self** -- Return self.
+      :rtype: object
+      
+   
+   .. py:method:: GenSVMGridSearchCV.predict(X)
+      :noindex:
+      :module: gensvm.gridsearch
+   
+      Predict the class labels on the test data
+      
+      :param X: Test data, where n_samples is the number of observations and
+                n_features is the number of features.
+      :type X: array-like, shape = (n_samples, n_features)
+      
+      :returns: **y_pred** -- Predicted class labels of the data in X.
+      :rtype: array-like, shape = (n_samples, )
+      
+   
+   .. py:method:: GenSVMGridSearchCV.score(X, y)
+      :noindex:
+      :module: gensvm.gridsearch
+   
+      Compute the score on the test data given the true labels
+      
+      :param X: Test data, where n_samples is the number of observations and
+                n_features is the number of features.
+      :type X: array-like, shape = (n_samples, n_features)
+      :param y: True labels for the test data.
+      :type y: array-like, shape = (n_samples, )
+      
+      :returns: **score**
+      :rtype: float
+      
diff --git a/docs/generate_autodocs.py b/docs/generate_autodocs.py
new file mode 100644
index 0000000..6138287
--- /dev/null
+++ b/docs/generate_autodocs.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+"""
+This script manually generates the autodoc RST files for the classes we want to 
+document. By doing this, we can generate the documentation on Read The Docs 
+(RTD).  If we try to use vanilla autodoc, we run into the problem that a 
+working BLAS installation is necessary to install the GenSVM Python package and 
+this is not available in the RTD VM.
+
+Author: Gertjan van den Burg
+
+"""
+
+import os
+
+from docutils.statemachine import StringList, ViewList
+
+from sphinx.ext.autodoc import AutoDirective, ClassDocumenter, Options
+from sphinx.application import Sphinx
+from sphinx.environment import BuildEnvironment
+
+BASE_DIR = '/home/gertjan/Dropbox/phd/research/msvm/python/start_here/'
+DOCDIR = os.path.join(BASE_DIR, 'gensvm', 'docs')
+
+CLASSES = [
+        'GenSVMGridSearchCV',
+        'GenSVM'
+        ]
+
+FULL_NAMES = {
+        'GenSVM': 'gensvm.core.GenSVM',
+        'GenSVMGridSearchCV': 'gensvm.gridsearch.GenSVMGridSearchCV'
+        }
+
+OUTPUT_FILES = {
+        'GenSVMGridSearchCV': os.path.join(DOCDIR, 'cls_gridsearch.rst'),
+        'GenSVM': os.path.join(DOCDIR, 'cls_gensvm.rst')
+        }
+
+
+def load_app():
+    srcdir = DOCDIR[:]
+    confdir = DOCDIR[:]
+    outdir = os.path.join(BASE_DIR, 'gensvm_docs', 'html')
+    doctreedir = os.path.join(BASE_DIR, 'gensvm_docs', 'doctrees')
+    buildername = 'html'
+
+    app = Sphinx(srcdir, confdir, outdir, doctreedir, buildername)
+    return app
+
+
+def generate_autodoc(app, cls):
+    ad = AutoDirective(name='autoclass', arguments=[FULL_NAMES[cls]], 
+            options={'noindex': True}, content=StringList([], items=[]), 
+            lineno=0, content_offset=1, block_text='', state=None, 
+            state_machine=None)
+
+    ad.env = BuildEnvironment(app)
+    ad.genopt = Options(noindex=True)
+    ad.filename_set = set()
+    ad.result = ViewList()
+
+    documenter = ClassDocumenter(ad, ad.arguments[0])
+    documenter.generate(all_members=True)
+
+    with open(OUTPUT_FILES[cls], 'w') as fid:
+        for line in ad.result:
+            fid.write(line + '\n')
+
+def main():
+    app = load_app()
+    for cls in CLASSES:
+        generate_autodoc(app, cls)
+
+if __name__ == '__main__':
+    main()
author	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2017-12-13 14:08:00 -0500
committer	Gertjan van den Burg <gertjanvandenburg@gmail.com>	2017-12-13 14:08:00 -0500
commit	804df540d007b13fe8e0d1bb3df535e84618ef9f (patch)
tree	992d8c17f655a05d4caf034b2a09ac549a353a13 /docs
parent	Manually generate restructured text for class documentation (diff)
download	pygensvm-804df540d007b13fe8e0d1bb3df535e84618ef9f.tar.gz pygensvm-804df540d007b13fe8e0d1bb3df535e84618ef9f.zip