aboutsummaryrefslogtreecommitdiff
path: root/include/msvmmaj_train_dataset.h
blob: 0889626fc0317cb660caba4e03895cdf205e6ff3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/**
 * @file msvmmaj_train_dataset.h
 * @author Gertjan van den Burg
 * @date August, 2013
 * @brief Structs and functions necessary for the grid search
 *
 * @details
 * The grid search for the optimal parameters is done through a queue.
 * This file contains struct definitions for this queue and a single
 * task in a queue, as well as a structure for the complete training
 * scheme. Function declarations are also included.
 *
 */

#ifndef MSVMMAJ_TRAIN_DATASET_H
#define MSVMMAJ_TRAIN_DATASET_H

#include "globals.h"
#include "types.h"

/**
 * @brief A structure for a single task in the queue.
 *
 * A task bundles one set of MajModel parameters with pointers to the data
 * it applies to, plus the performance recorded after cross validation.
 * The fields are listed below in declaration order.
 *
 * @param kerneltype 	parameter for the MajModel
 * @param weight_idx 	parameter for the MajModel
 * @param folds 	number of folds in cross validation
 * @param ID 		numeric id of the task in the queue
 * @param p 		parameter for the MajModel
 * @param kappa 	parameter for the MajModel
 * @param lambda 	parameter for the MajModel
 * @param epsilon 	parameter for the MajModel
 * @param *kernelparam 	parameters for the MajModel, stored as an array of
 * 			doubles (NOTE(review): presumably the kernel
 * 			parameters belonging to kerneltype; length and
 * 			ownership are not visible here -- confirm)
 * @param *train_data 	pointer to the training data
 * @param *test_data 	pointer to the test data (if any)
 * 			(NOTE(review): presumably NULL when there is no
 * 			test data -- confirm against the callers)
 * @param performance 	performance after cross validation
 */
struct Task {
	KernelType kerneltype;
	int weight_idx;
	long folds;
	long ID;
	double p;
	double kappa;
	double lambda;
	double epsilon;
	double *kernelparam;
	struct MajData *train_data;
	struct MajData *test_data;
	double performance;
};

/**
 * @brief Simple task queue.
 *
 * This struct is just an array of pointers to Task instances, together
 * with the length of that array and the index of the current task.
 *
 * @param **tasks 	array of pointers to Task structs
 * @param N 		size of task array
 * @param i 		index used for keeping track of the queue
 * 			(NOTE(review): whether this is the next task to hand
 * 			out or the last one handed out is not visible in this
 * 			header -- confirm in get_next_task())
 */
struct Queue {
	struct Task **tasks;
	long N;
	long i;
};

/**
 * @brief Structure for describing the entire grid search
 *
 * Each N* field gives the size of the corresponding array of candidate
 * values further down in the struct. The fields are listed below in
 * declaration order.
 *
 * @param traintype 		type of training to use
 * @param kerneltype 		type of kernel to use throughout training
 * @param repeats 		number of repeats to be done after the grid
 * 				search to find the parameter set with the
 * 				most consistent high performance
 * @param folds 		number of folds in cross validation
 * @param Np 			size of the array of p values (ps)
 * @param Nl 			size of the array of lambda values (lambdas)
 * @param Nk 			size of the array of kappa values (kappas)
 * @param Ne 			size of the array of epsilon values (epsilons)
 * @param Nw 			size of the array of weight_idx values
 * 				(weight_idxs)
 * @param Ng 			size of the array of gamma values (gammas)
 * @param Nc 			size of the array of coef values (coefs)
 * @param Nd 			size of the array of degree values (degrees)
 * @param *weight_idxs 		array of weight_idxs
 * @param *ps 			array of p values
 * @param *lambdas 		array of lambda values
 * @param *kappas 		array of kappa values
 * @param *epsilons 		array of epsilon values
 * @param *gammas 		array of gamma values
 * @param *coefs 		array of coef values
 * @param *degrees 		array of degree values (stored as doubles)
 * @param *train_data_file 	filename of train data file
 * @param *test_data_file 	filename of test data file
 *
 */
struct Training {
	TrainType traintype;
	KernelType kerneltype;
	long repeats;
	long folds;
	long Np;
	long Nl;
	long Nk;
	long Ne;
	long Nw;
	long Ng;
	long Nc;
	long Nd;
	int *weight_idxs;
	double *ps;
	double *lambdas;
	double *kappas;
	double *epsilons;
	double *gammas;
	double *coefs;
	double *degrees;
	char *train_data_file;
	char *test_data_file;
};

/*
 * Forward declarations so the prototypes below are usable on their own.
 * A struct tag that is first seen inside a prototype's parameter list only
 * has prototype scope, which makes it a different type from the one callers
 * pass in. Declaring the tags at file scope avoids that; repeating the
 * declaration is harmless if another header has already provided it.
 */
struct MajData;
struct MajModel;

/**
 * @brief Fill a Queue for the grid search described by a Training struct.
 *
 * NOTE(review): the implementation is not visible from this header.
 * Presumably one Task is created per parameter combination in @p training,
 * each pointing at @p train_data and @p test_data -- confirm.
 *
 * @param training 	description of the grid search
 * @param queue 	queue to fill
 * @param train_data 	training data
 * @param test_data 	test data (if any)
 */
void make_queue(struct Training *training, struct Queue *queue,
		struct MajData *train_data, struct MajData *test_data);

/**
 * @brief Get the next Task from the queue.
 *
 * NOTE(review): behaviour when the queue is exhausted is not visible from
 * this header (presumably NULL) -- confirm before relying on it.
 *
 * @param q 	queue to take the task from
 * @return 	pointer to a Task
 */
struct Task *get_next_task(struct Queue *q);

/**
 * @brief Run the tasks in the queue.
 *
 * NOTE(review): "tt" presumably stands for train/test -- confirm.
 *
 * @param q 	queue of tasks
 */
void start_training_tt(struct Queue *q);

/**
 * @brief Run the tasks in the queue.
 *
 * NOTE(review): "cv" presumably stands for cross validation -- confirm.
 *
 * @param q 	queue of tasks
 */
void start_training_cv(struct Queue *q);

/**
 * @brief Free a Queue.
 *
 * NOTE(review): exactly what is released (the tasks, their kernelparam
 * arrays, the Queue struct itself) is not visible from this header --
 * confirm.
 *
 * @param q 	queue to free
 */
void free_queue(struct Queue *q);

/**
 * @brief Repeat tasks after the grid search to check consistency.
 *
 * NOTE(review): presumably implements the "repeats" step described for
 * struct Training (finding the parameter set with the most consistent
 * high performance) -- confirm.
 *
 * @param q 		queue of tasks
 * @param repeats 	number of repeats
 * @param traintype 	type of training to use
 */
void consistency_repeats(struct Queue *q, long repeats, TrainType traintype);

/**
 * @brief Run cross validation for a model on a dataset.
 *
 * NOTE(review): the meaning and scale of the returned value are not
 * visible from this header (presumably a performance measure) -- confirm.
 *
 * @param model 	model to use
 * @param data 		data to cross validate on
 * @param folds 	number of folds in cross validation
 * @return 		result of the cross validation
 */
double cross_validation(struct MajModel *model, struct MajData *data,
		long folds);

/**
 * @brief Set up a MajModel from the parameters stored in a Task.
 *
 * @param task 		task to take the parameters from
 * @param model 	model to set up
 */
void make_model_from_task(struct Task *task, struct MajModel *model);

/**
 * @brief Copy a MajModel.
 *
 * NOTE(review): which members are copied, and whether the copy is deep,
 * is not visible from this header -- confirm.
 *
 * @param from 	source model
 * @param to 	destination model
 */
void copy_model(struct MajModel *from, struct MajModel *to);

/**
 * @brief Reallocate a MajModel.
 *
 * NOTE(review): n and m are presumably the new number of instances and
 * the new number of predictors -- confirm.
 *
 * @param model 	model to reallocate
 * @param n 		new size (see note)
 * @param m 		new size (see note)
 */
void msvmmaj_reallocate_model(struct MajModel *model, long n, long m);

/**
 * @brief Print a progress string for a task.
 *
 * NOTE(review): N is presumably the total number of tasks in the queue --
 * confirm.
 *
 * @param task 	task to report on
 * @param N 	count used in the progress string (see note)
 */
void print_progress_string(struct Task *task, long N);
#endif