/**
 * @file gensvm_io.c
 * @author G.J.J. van den Burg
 * @date 2014-01-07
 * @brief Functions for input and output of data and model files
 *
 * @details
 * This file contains functions for reading and writing model files, and data
 * files. It also contains a function for generating a string of the current
 * time, used in writing output files.
 *
 * @copyright
 Copyright 2016, G.J.J. van den Burg.

 This file is part of GenSVM.

 GenSVM is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 GenSVM is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with GenSVM. If not, see <http://www.gnu.org/licenses/>.

 */

#include "gensvm_io.h"

/**
 * @brief Read data from file
 *
 * @details
 * Read the data from the data_file. The data matrix X is augmented
 * with a column of ones, to get the matrix Z. The data is expected
 * to follow a specific format, which is specified in the @ref spec_data_file.
 * The class labels are assumed to be in the interval [1 .. K], which can be
 * checked using the function gensvm_check_outcome_contiguous().
 *
 * @param[in,out] 	dataset 	initialized GenData struct
 * @param[in] 		data_file 	filename of the data file.
*/ void gensvm_read_data(struct GenData *dataset, char *data_file) { FILE *fid = NULL; long i, j, n, m, nr = 0, K = 0; double value; char buf[GENSVM_MAX_LINE_LENGTH]; if ((fid = fopen(data_file, "r")) == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: Datafile %s could not be opened.\n", data_file); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } // Read data dimensions nr += fscanf(fid, "%ld", &n); nr += fscanf(fid, "%ld", &m); // Allocate memory dataset->RAW = Malloc(double, n*(m+1)); // Read first line of data for (j=1; jRAW, m+1, 0, j, value); } if (fgets(buf, GENSVM_MAX_LINE_LENGTH, fid) == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: No label found on first line.\n"); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } // Check if there is a label at the end of the line if (sscanf(buf, "%lf", &value) > 0) { dataset->y = Malloc(long, n); dataset->y[0] = value; K = 1; } else { free(dataset->y); dataset->y = NULL; } // Read the rest of the file for (i=1; iRAW, m+1, i, j, value); } if (dataset->y != NULL) { nr += fscanf(fid, "%lf", &value); dataset->y[i] = (long) value; K = maximum(K, dataset->y[i]); } } fclose(fid); if (nr < n * m) { // LCOV_EXCL_START err("[GenSVM Error]: not enough data found in %s\n", data_file); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } // Set the column of ones for (i=0; iRAW, m+1, i, 0, 1.0); dataset->n = n; dataset->m = m; dataset->r = m; dataset->K = K; dataset->Z = dataset->RAW; if (gensvm_could_sparse(dataset->Z, n, m+1)) { note("Converting to sparse ... 
"); dataset->spZ = gensvm_dense_to_sparse(dataset->Z, n, m+1); note("done.\n"); free(dataset->RAW); dataset->RAW = NULL; dataset->Z = NULL; } } /** * @brief Print an error to the screen and exit (copied from LibSVM) * * @param[in] line_num line number where the error occured * */ void exit_input_error(int line_num) { err("[GenSVM Error]: Wrong input format on line: %i\n", line_num); exit(EXIT_FAILURE); } /** * @brief Read data from a file in LibSVM/SVMlight format * * @details * This function reads data from a file where the data is stored in * LibSVM/SVMlight format. The file format is described in @ref * spec_libsvm_data_file. This is a sparse data format, which can be * beneficial for certain applications. The advantage of having this function * here is twofold: 1) existing datasets where data is stored in * LibSVM/SVMlight format can be easily used in GenSVM, and 2) sparse datasets * which are too large for memory when kept in dense format can be loaded * efficiently into GenSVM. * * @note * This code is based on the read_problem() function in the svm-train.c * file of LibSVM. It has however been expanded to be able to handle data * files without labels. * * @note * This file tries to detect whether 1-based or 0-based indexing is used in * the data file. By default 1-based indexing is used, but if an index is * found with value 0, 0-based indexing is assumed. 
* * @sa * gensvm_read_problem() * * @param[in] data GenData structure * @param[in] data_file filename of the datafile * */ void gensvm_read_data_libsvm(struct GenData *data, char *data_file) { bool do_sparse, zero_based = false; long i, j, n, m, K, nnz, cnt, tmp, index, row_cnt, num_labels, min_index = 1; int n_big, n_small, big_start; double value; FILE *fid = NULL; char *label = NULL, *endptr = NULL, **big_parts = NULL, **small_parts = NULL; char buf[GENSVM_MAX_LINE_LENGTH]; fid = fopen(data_file, "r"); if (fid == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: Datafile %s could not be opened.\n", data_file); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } // first count the number of elements n = 0; m = -1; num_labels = 0; nnz = 0; while (fgets(buf, GENSVM_MAX_LINE_LENGTH, fid) != NULL) { // split the string in labels and/or index:value pairs big_parts = str_split(buf, " \t", &n_big); // record if this line has a label (first part has no colon) num_labels += (!str_contains_char(big_parts[0], ':')); // check for each part if it is a index:value pair for (i=0; i 0 && num_labels != n) { err("[GenSVM Error]: There are some lines with missing " "labels. 
Please fix this before " "continuing.\n"); exit(EXIT_FAILURE); } // don't forget the column of ones nnz += n; // deal with 0-based or 1-based indexing in the LibSVM file if (min_index == 0) { m++; zero_based = true; } // check if sparsity is worth it do_sparse = gensvm_nnz_comparison(nnz, n, m+1); if (do_sparse) { data->spZ = gensvm_init_sparse(); data->spZ->nnz = nnz; data->spZ->n_row = n; data->spZ->n_col = m+1; data->spZ->values = Calloc(double, nnz); data->spZ->ia = Calloc(long, n+1); data->spZ->ja = Calloc(long, nnz); data->spZ->ia[0] = 0; } else { data->RAW = Calloc(double, n*(m+1)); data->Z = data->RAW; } if (num_labels > 0) data->y = Calloc(long, n); K = 0; cnt = 0; for (i=0; iy[i] = tmp; // keep track of maximum K K = maximum(K, data->y[i]); // increment big part index big_start++; } row_cnt = 0; // set the first element in the row to 1 if (do_sparse) { data->spZ->values[cnt] = 1.0; data->spZ->ja[cnt] = 0; cnt++; row_cnt++; } else { matrix_set(data->RAW, m+1, i, 0, 1.0); } // read the rest of the line for (j=big_start; jspZ->values[cnt] = value; data->spZ->ja[cnt] = index + zero_based; cnt++; row_cnt++; } else { matrix_set(data->RAW, m+1, i, index + zero_based, value); } // free the small parts free(small_parts[0]); free(small_parts[1]); free(small_parts); } if (do_sparse) { data->spZ->ia[i+1] = data->spZ->ia[i] + row_cnt; } // free the big parts for (j=0; jn = n; data->m = m; data->r = m; data->K = K; } /** * @brief Read model from file * * @details * Read a GenModel from a model file. The GenModel struct must have been * initalized elswhere. The model file is expected to follow the @ref * spec_model_file. The easiest way to generate a model file is through * gensvm_write_model(), which can for instance be used in trainGenSVM.c. 
* * @param[in,out] model initialized GenModel * @param[in] model_filename filename of the model file * */ void gensvm_read_model(struct GenModel *model, char *model_filename) { long i, j, nr = 0; FILE *fid = NULL; char buffer[GENSVM_MAX_LINE_LENGTH]; char data_filename[GENSVM_MAX_LINE_LENGTH]; double value = 0; fid = fopen(model_filename, "r"); if (fid == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: Couldn't open model file %s\n", model_filename); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } // skip the first four lines for (i=0; i<4; i++) next_line(fid, model_filename); // read all model variables model->p = get_fmt_double(fid, model_filename, "p = %lf"); model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf"); model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf"); model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf"); model->weight_idx = (int) get_fmt_long(fid, model_filename, "weight_idx = %li"); // skip to data section for (i=0; i<2; i++) next_line(fid, model_filename); // read filename of data file if (fgets(buffer, GENSVM_MAX_LINE_LENGTH, fid) == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: Error reading from model file %s\n", model_filename); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } sscanf(buffer, "filename = %s\n", data_filename); model->data_file = Calloc(char, GENSVM_MAX_LINE_LENGTH); strcpy(model->data_file, data_filename); // read all data variables model->n = get_fmt_long(fid, model_filename, "n = %li\n"); model->m = get_fmt_long(fid, model_filename, "m = %li\n"); model->K = get_fmt_long(fid, model_filename, "K = %li\n"); // skip to output for (i=0; i<2; i++) next_line(fid, model_filename); // read the matrix V and check for consistency model->V = Malloc(double, (model->m+1)*(model->K-1)); for (i=0; im+1; i++) { for (j=0; jK-1; j++) { nr += fscanf(fid, "%lf ", &value); matrix_set(model->V, model->K-1, i, j, value); } } if (nr != (model->m+1)*(model->K-1)) { // LCOV_EXCL_START err("[GenSVM Error] Error reading 
from model file %s. " "Not enough elements of V found.\n", model_filename); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } } /** * @brief Write model to file * * @details * Write a GenModel to a file. The current time is specified in the file in * UTC + offset. The model file further corresponds to the @ref * spec_model_file. * * @param[in] model GenModel which contains an estimate for * GenModel::V * @param[in] output_filename the output file to write the model to * */ void gensvm_write_model(struct GenModel *model, char *output_filename) { FILE *fid = NULL; long i, j; char timestr[GENSVM_MAX_LINE_LENGTH]; // open output file fid = fopen(output_filename, "w"); if (fid == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: Error opening output file %s\n", output_filename); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } gensvm_time_string(timestr); // Write output to file fprintf(fid, "Output file for GenSVM (version %s)\n", VERSION_STRING); fprintf(fid, "Generated on: %s\n\n", timestr); fprintf(fid, "Model:\n"); fprintf(fid, "p = %15.16f\n", model->p); fprintf(fid, "lambda = %15.16f\n", model->lambda); fprintf(fid, "kappa = %15.16f\n", model->kappa); fprintf(fid, "epsilon = %g\n", model->epsilon); fprintf(fid, "weight_idx = %i\n", model->weight_idx); fprintf(fid, "\n"); fprintf(fid, "Data:\n"); fprintf(fid, "filename = %s\n", model->data_file); fprintf(fid, "n = %li\n", model->n); fprintf(fid, "m = %li\n", model->m); fprintf(fid, "K = %li\n", model->K); fprintf(fid, "\n"); fprintf(fid, "Output:\n"); for (i=0; im+1; i++) { for (j=0; jK-1; j++) { if (j > 0) fprintf(fid, " "); fprintf(fid, "%+15.16f", matrix_get(model->V, model->K-1, i, j)); } fprintf(fid, "\n"); } fclose(fid); } /** * @brief Write predictions to file * * @details * Write the given predictions to an output file, such that the resulting file * corresponds to the @ref spec_data_file. 
* * @param[in] data GenData with the original instances * @param[in] predy predictions of the class labels of the * instances in the given GenData. Note that the * order of the instances is assumed to be the * same. * @param[in] output_filename the file to which the predictions are written * */ void gensvm_write_predictions(struct GenData *data, long *predy, char *output_filename) { long i, j; FILE *fid = NULL; fid = fopen(output_filename, "w"); if (fid == NULL) { // LCOV_EXCL_START err("[GenSVM Error]: Error opening output file %s\n", output_filename); exit(EXIT_FAILURE); // LCOV_EXCL_STOP } fprintf(fid, "%li\n", data->n); fprintf(fid, "%li\n", data->m); for (i=0; in; i++) { for (j=0; jm; j++) fprintf(fid, "%.16f ", matrix_get(data->Z, data->m+1, i, j+1)); fprintf(fid, "%li\n", predy[i]); } fclose(fid); } /** * @brief Get time string with UTC offset * * @details * Create a string for the current system time. Include an offset of UTC for * consistency. The format of the generated string is "DDD MMM D HH:MM:SS * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)". 
* * @param[in,out] buffer allocated string buffer, on exit contains * formatted string * */ void gensvm_time_string(char *buffer) { int diff, hours, minutes; char timestr[GENSVM_MAX_LINE_LENGTH]; time_t current_time, lt, gt; struct tm *lclt = NULL; // get current time (in epoch) current_time = time(NULL); if (current_time == ((time_t)-1)) { // LCOV_EXCL_START err("[GenSVM Error]: Failed to compute the current time.\n"); return; // LCOV_EXCL_STOP } // convert time to local time and create a string lclt = localtime(¤t_time); strftime(timestr, GENSVM_MAX_LINE_LENGTH, "%c", lclt); if (timestr == NULL) { err("[GenSVM Error]: Failed to convert time to string.\n"); return; } // calculate the UTC offset including DST lt = mktime(localtime(¤t_time)); gt = mktime(gmtime(¤t_time)); diff = -difftime(gt, lt); hours = (diff/3600); minutes = (diff%3600)/60; if (lclt->tm_isdst == 1) hours++; sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes); }