diff options
Diffstat (limited to 'examples/python')
| -rw-r--r-- | examples/python/README.md | 59 | ||||
| -rw-r--r-- | examples/python/load_dataset.py | 113 |
2 files changed, 172 insertions, 0 deletions
diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 0000000..157ba9a --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1,59 @@ +# Loading a TCPD time series in Python + +The ``load_dataset.py`` file contains example code to load a time series as a +``TimeSeries`` object. + +```python +>>> from load_dataset import TimeSeries +>>> ts = TimeSeries.from_json('../../datasets/ozone/ozone.json') +``` + +To export the time series as a [pandas +DataFrame](https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#dataframe), +simply use: + +```python +>>> ts.df + t Total Emissions +0 0 380000.0 +1 1 400000.0 +2 2 440000.0 +3 3 480000.0 +4 4 510000.0 +5 5 540000.0 +... +``` + +The ``TimeSeries`` instance ``ts`` has an integer time axis at ``ts.t`` and +the observations at ``ts.y``. The time axis is zero-based by default. If you +prefer to use a one-based indexing, simply run: + +```python +>>> ts.make_one_based() +>>> ts.df + t Total Emissions +0 1 380000.0 +1 2 400000.0 +2 3 440000.0 +3 4 480000.0 +4 5 510000.0 +5 6 540000.0 +... +``` + +Many of the time series in TCPD have date or datetime labels for the time +axis. This axis can be retrieved using: + +```python +>>> ts.datestr +array(['1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', + ... + '2009', '2010', '2011', '2012', '2013', '2014'], dtype='<U4') +``` + +which uses the date format stored in ``ts.datefmt``. + +```python +>>> ts.datefmt +'%Y' +``` diff --git a/examples/python/load_dataset.py b/examples/python/load_dataset.py new file mode 100644 index 0000000..59cbb1a --- /dev/null +++ b/examples/python/load_dataset.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Example code for loading a dataset to a TimeSeries object. + +Note that this code requires Pandas to be available. + +Author: Gertjan van den Burg +Copyright: The Alan Turing Institute, 2019 +License: See LICENSE file. 
import json
import numpy as np
import pandas as pd


class TimeSeries:
    """In-memory representation of a (possibly multivariate) time series.

    Parameters
    ----------
    t : array-like
        Integer time index, one entry per observation.
    y : numpy.ndarray
        Observations as a 2-D array with one row per observation and one
        column per dimension.
    name : str, optional
        Short name of the series.
    longname : str, optional
        Descriptive name of the series.
    datestr : array-like of str, optional
        Raw date/datetime labels of the time axis.
    datefmt : str, optional
        Format string describing the entries of ``datestr``.
    columns : list of str, optional
        Labels for the columns of ``y``.
    """

    def __init__(
        self,
        t,
        y,
        name=None,
        longname=None,
        datestr=None,
        datefmt=None,
        columns=None,
    ):
        self.t = t
        self.y = y

        self.name = name
        self.longname = longname
        self.datestr = datestr
        self.datefmt = datefmt
        self.columns = columns

        # whether the series is stored as zero-based or one-based
        self.zero_based = True

    @property
    def n_obs(self):
        """Number of observations (length of the time index)."""
        return len(self.t)

    @property
    def n_dim(self):
        """Number of dimensions (columns of ``y``)."""
        return self.y.shape[1]

    @property
    def shape(self):
        """Tuple ``(n_obs, n_dim)``."""
        return (self.n_obs, self.n_dim)

    @classmethod
    def from_json(cls, filename):
        """Load a time series from a JSON file.

        The file is expected to provide the keys ``name``, ``longname``,
        ``n_obs``, ``n_dim``, ``time`` (with ``index`` and, optionally,
        ``format`` and ``raw``) and ``series`` (a list of objects with
        ``type``, ``raw`` and, optionally, ``label``).

        Parameters
        ----------
        filename : str
            Path of the JSON file to read.

        Returns
        -------
        TimeSeries
            The loaded series, with a time index as stored in the file.
        """
        with open(filename, "rb") as fp:
            data = json.load(fp)

        # atleast_1d keeps a single-observation index usable with len(),
        # since squeeze alone would collapse it to a 0-d array.
        tidx = np.atleast_1d(np.squeeze(np.array(data["time"]["index"])))

        if "format" in data["time"]:
            datefmt = data["time"]["format"]
            datestr = np.array(data["time"]["raw"])
        else:
            datefmt = None
            datestr = None

        y = np.zeros((data["n_obs"], data["n_dim"]))
        columns = []

        for idx, series in enumerate(data["series"]):
            columns.append(series.get("label", "V%i" % (idx + 1)))
            # ``np.int`` was an alias of the builtin ``int`` and has been
            # removed from NumPy (1.24); the builtin is the replacement.
            thetype = int if series["type"] == "integer" else np.float64
            y[:, idx] = np.array(series["raw"], dtype=thetype)

        return cls(
            tidx,
            y,
            name=data["name"],
            longname=data["longname"],
            datefmt=datefmt,
            datestr=datestr,
            columns=columns,
        )

    @property
    def df(self):
        """The series as a pandas DataFrame.

        The frame has a ``t`` column holding the time index followed by one
        column per dimension of ``y``. When no column labels were supplied,
        the labels ``V1``, ``V2``, ... are used.
        """
        columns = self.columns
        if columns is None:
            columns = ["V%i" % (i + 1) for i in range(self.n_dim)]

        d = {"t": self.t}
        for i, col in enumerate(columns):
            d[col] = self.y[:, i]
        return pd.DataFrame(d)

    def make_one_based(self):
        """Convert the time index to a one-based time index.

        Calling this more than once has no further effect.
        """
        if self.zero_based:
            # Vectorised shift so that ``t`` stays a NumPy array instead of
            # being turned into a plain list.
            self.t = np.asarray(self.t) + 1
            self.zero_based = False

    def __repr__(self):
        return "TimeSeries(name=%s, n_obs=%s, n_dim=%s)" % (
            self.name,
            self.n_obs,
            self.n_dim,
        )

    def __str__(self):
        return repr(self)
