diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
| commit | 7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch) | |
| tree | 10aa6710599230c889ec44407a065ee303a79348 /examples | |
| download | TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.tar.gz TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.zip | |
Initial commit
Diffstat (limited to 'examples')
| -rw-r--r-- | examples/R/README.md | 34 | ||||
| -rw-r--r-- | examples/R/load_dataset.R | 41 | ||||
| -rw-r--r-- | examples/python/README.md | 59 | ||||
| -rw-r--r-- | examples/python/load_dataset.py | 113 |
4 files changed, 247 insertions, 0 deletions
diff --git a/examples/R/README.md b/examples/R/README.md new file mode 100644 index 0000000..14ce4bf --- /dev/null +++ b/examples/R/README.md @@ -0,0 +1,34 @@ +# Loading a TCPD dataset into R + +The file ``load_dataset.R`` contains the function ``load.dataset`` that reads +the JSON file into an R dataframe. The +[RJSONIO](https://cran.r-project.org/web/packages/RJSONIO/index.html) package +is required: + +```R +> install.packages('RJSONIO') +``` + +Simply run: + +```R +> source('./load_dataset.R') +> df <- load.dataset('../../datasets/ozone/ozone.json') +> df + t Total Emissions +1 0 380000 +2 1 400000 +3 2 440000 +4 3 480000 +5 4 510000 +6 5 540000 +7 6 580000 +8 7 630000 +``` + +Notice that the time axis in TCPD is always 0-based. This needs to be taken +into account when comparing detection results to the human annotations. (This +is an unfortunate consequence of the differences between indexing in R and +Python.) + +Missing observations in time series are represented with a ``NA`` value. diff --git a/examples/R/load_dataset.R b/examples/R/load_dataset.R new file mode 100644 index 0000000..8ef0e22 --- /dev/null +++ b/examples/R/load_dataset.R @@ -0,0 +1,41 @@ +#' --- +#' title: Example code to load a TCPD time series +#' author: G.J.J. van den Burg +#' date: 2020-01-06 +#' license: See the LICENSE file. 
#' copyright: 2019, The Alan Turing Institute
#' ---

library(RJSONIO)

#' Load a TCPD time series from a JSON file into a data frame.
#'
#' @param filename Path to a TCPD dataset JSON file. The file is expected to
#'   provide `n_obs`, `n_dim`, `time$index` and a `series` list whose entries
#'   hold a `raw` vector/list of observations and, optionally, a `label`.
#' @return A data frame whose first column `t` is the time index stored in
#'   the file, followed by one column per series. JSON `null` observations
#'   become `NA`.
load.dataset <- function(filename)
{
    data <- fromJSON(filename)

    # time index exactly as stored in the file
    tidx <- data$time$index

    cols <- character(0)
    mat <- NULL

    # seq_len() yields an empty sequence for a count of 0, whereas 1:0 would
    # iterate over c(1, 0) and index out of range.
    for (j in seq_len(data$n_dim)) {
        s <- data$series[[j]]

        # Build the whole column at once instead of growing a vector
        # element by element; NULL entries (JSON null) are mapped to NA.
        v <- unlist(lapply(seq_len(data$n_obs), function(i) {
            val <- s$raw[[i]]
            if (is.null(val)) NA else val
        }))

        # Fall back to a positional name when the series has no label, so
        # the number of column names always matches the number of columns.
        label <- if (is.null(s$label)) paste0("V", j) else s$label

        cols <- c(cols, label)
        mat <- cbind(mat, v)
    }

    mat <- cbind(tidx, mat)
    colnames(mat) <- c('t', cols)

    df <- as.data.frame(mat)
    return(df)
}
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Example code for loading a dataset to a TimeSeries object.

Note that this code requires Pandas to be available.

Author: Gertjan van den Burg
Copyright: The Alan Turing Institute, 2019
License: See LICENSE file.

"""

import json
import numpy as np
import pandas as pd


class TimeSeries:
    """Container for a (possibly multivariate) time series.

    Parameters
    ----------
    t : array-like
        Integer time index, one entry per observation.
    y : numpy.ndarray
        Observations, indexed as ``y[observation, dimension]``.
    name : str, optional
        Short name of the series.
    longname : str, optional
        Descriptive name of the series.
    datestr : array-like, optional
        Date/time labels of the time axis.
    datefmt : str, optional
        Format string describing the entries of ``datestr``.
    columns : list of str, optional
        Column labels, one per dimension of ``y``. When omitted, ``df``
        uses ``V1``, ``V2``, ...
    """

    def __init__(
        self,
        t,
        y,
        name=None,
        longname=None,
        datestr=None,
        datefmt=None,
        columns=None,
    ):
        self.t = t
        self.y = y

        self.name = name
        self.longname = longname
        self.datestr = datestr
        self.datefmt = datefmt
        self.columns = columns

        # whether the series is stored as zero-based or one-based
        self.zero_based = True

    @property
    def n_obs(self):
        """Number of observations (length of the time index)."""
        return len(self.t)

    @property
    def n_dim(self):
        """Number of dimensions (second axis of ``y``)."""
        return self.y.shape[1]

    @property
    def shape(self):
        """Tuple ``(n_obs, n_dim)``."""
        return (self.n_obs, self.n_dim)

    @classmethod
    def from_json(cls, filename):
        """Load a time series from a TCPD-style JSON file.

        The file must provide ``name``, ``longname``, ``n_obs``, ``n_dim``,
        ``time.index`` and a ``series`` list whose entries carry a ``raw``
        list of observations and optionally a ``label``. When ``time`` has
        a ``format`` key, ``time.raw`` is loaded as the date labels.

        Missing observations (JSON ``null``) are stored as ``nan``.
        """
        with open(filename, "rb") as fp:
            data = json.load(fp)

        time = data["time"]

        # ravel (rather than squeeze) so that a single-observation series
        # still yields a 1-D index that supports len().
        tidx = np.ravel(np.array(time["index"]))

        if "format" in time:
            datefmt = time["format"]
            datestr = np.array(time["raw"])
        else:
            datefmt = None
            datestr = None

        y = np.zeros((data["n_obs"], data["n_dim"]))
        columns = []

        for idx, series in enumerate(data["series"]):
            columns.append(series.get("label", "V%i" % (idx + 1)))
            # ``y`` is a float array, so every series is converted as
            # float64 regardless of its declared type: this maps JSON null
            # to nan (an integer dtype cannot hold it) and avoids the
            # ``np.int`` alias that was removed from NumPy.
            y[:, idx] = np.array(series["raw"], dtype=np.float64)

        return cls(
            tidx,
            y,
            name=data["name"],
            longname=data["longname"],
            datefmt=datefmt,
            datestr=datestr,
            columns=columns,
        )

    @property
    def df(self):
        """The series as a pandas DataFrame with a ``t`` column first."""
        columns = self.columns
        if columns is None:
            # same fallback naming as used by from_json for unlabeled series
            columns = ["V%i" % (i + 1) for i in range(self.n_dim)]

        frame = {"t": self.t}
        for i, col in enumerate(columns):
            frame[col] = self.y[:, i]
        return pd.DataFrame(frame)

    def make_one_based(self):
        """ Convert the time index to a one-based time index.

        Calling this more than once has no further effect. The index is
        kept as a NumPy array.
        """
        if self.zero_based:
            self.t = np.asarray(self.t) + 1
            self.zero_based = False

    def __repr__(self):
        return "TimeSeries(name=%s, n_obs=%s, n_dim=%s)" % (
            self.name,
            self.n_obs,
            self.n_dim,
        )

    def __str__(self):
        return repr(self)
