diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
| commit | 7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch) | |
| tree | 10aa6710599230c889ec44407a065ee303a79348 /datasets/global_co2 | |
| download | TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.tar.gz TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.zip | |
Initial commit
Diffstat (limited to 'datasets/global_co2')
| -rw-r--r-- | datasets/global_co2/.gitignore | 3 | ||||
| -rw-r--r-- | datasets/global_co2/README.md | 26 | ||||
| -rw-r--r-- | datasets/global_co2/get_global_co2.py | 192 | ||||
| -rw-r--r-- | datasets/global_co2/global_co2.png | bin | 0 -> 13341 bytes |
4 files changed, 221 insertions, 0 deletions
diff --git a/datasets/global_co2/.gitignore b/datasets/global_co2/.gitignore new file mode 100644 index 0000000..21a3adb --- /dev/null +++ b/datasets/global_co2/.gitignore @@ -0,0 +1,3 @@ +global_co2.json +mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv +old/ diff --git a/datasets/global_co2/README.md b/datasets/global_co2/README.md new file mode 100644 index 0000000..d0bfa61 --- /dev/null +++ b/datasets/global_co2/README.md @@ -0,0 +1,26 @@ +# Global Monthly CO2 levels + +This dataset concerns monthly global hemispheric means of carbon dioxide in +air. The data is part of the CMIP6 dataset, developed by Meinshausen et al. + +When using this data, please cite: + +```bib +@article{meinshausen2017historical, + title={Historical greenhouse gas concentrations for climate modelling ({CMIP6})}, + author={Meinshausen, M. and Vogel, E. and Nauels, A. and Lorbacher, K. and Meinshausen, N. and Etheridge, D. M. and Fraser, P. J. and Montzka, S. A. and Rayner, P. J. and Trudinger, C. M. and Krummel, P. B. and Beyerle, U. and Canadell, J. G. and Daniel, J. S. and Enting, I. G. and Law, R. M. and Lunder, C. R. and O'Doherty, S. and Prinn, R. G. and Reimann, S. and Rubino, M. and Velders, G. J. M. and Vollmer, M. K. and Wang, R. H. J. and Weiss, R.}, + journal={Geoscientific Model Development}, + volume={10}, + pages={2057--2116}, + year={2017}, + publisher={Copernicus} +} +``` + +While it appears that the data is in the public domain, it is not clear what +license it is under. We therefore download it from the original source (hosted +at ETH Zurich) and convert it to our dataset format. The original data is +sampled every 4 years and cropped to recent history to reduce the length of +the series. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Collect the global_co2 dataset

See the README file for more information.

Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute

"""


import argparse
import hashlib
import json
import os

from functools import wraps
from urllib.request import urlretrieve


CSV_URL = "ftp://data.iac.ethz.ch/CMIP6/input4MIPs/UoM/GHGConc/CMIP/mon/atmos/UoM-CMIP-1-1-0/GHGConc/gr3-GMNHSH/v20160701/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv"

# Expected MD5 checksums of the downloaded CSV and the generated JSON file.
MD5_CSV = "a3d42f5e339f4c652b8ae80e830b6941"
MD5_JSON = "7c8edd8887f51a6f841cc9d806ab4e56"

NAME_CSV = "mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv"
NAME_JSON = "global_co2.json"

# Keep every 48th monthly observation, i.e. one sample every four years
# (see the README: subsampling reduces the length of the series).
SAMPLE = 48

# Month abbreviation -> month number, used by reformat_time. Defined at
# module level so the table is built once, not on every call.
MONTHS = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


class ValidationError(Exception):
    """Raised when a downloaded or generated file fails its MD5 check."""

    def __init__(self, filename):
        message = (
            "Validating the file '%s' failed. \n"
            "Please raise an issue on the GitHub page for this project "
            "if the error persists." % filename
        )
        super().__init__(message)


def check_md5sum(filename, checksum):
    """Return True iff the MD5 digest of ``filename`` equals ``checksum``."""
    with open(filename, "rb") as fp:
        data = fp.read()
    h = hashlib.md5(data).hexdigest()
    return h == checksum


def validate(checksum):
    """Decorator that validates the target file.

    The wrapped function must be called with a ``target_path`` keyword
    argument. If the target already exists and matches ``checksum`` the
    wrapped function is skipped; otherwise it is run and the file it
    produces is verified.

    Raises
    ------
    ValueError
        If ``target_path`` is not supplied as a keyword argument.
    FileNotFoundError
        If the wrapped function did not create the target file.
    ValidationError
        If the resulting file does not match ``checksum``.
    """

    def validate_decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            target = kwargs.get("target_path", None)
            if target is None:
                # Previously this fell through to os.path.exists(None) and
                # crashed with an opaque TypeError; fail clearly instead.
                raise ValueError(
                    "The 'target_path' keyword argument is required."
                )
            if os.path.exists(target) and check_md5sum(target, checksum):
                return
            out = func(*args, **kwargs)
            if not os.path.exists(target):
                raise FileNotFoundError("Target file expected at: %s" % target)
            if not check_md5sum(target, checksum):
                raise ValidationError(target)
            return out

        return wrapper

    return validate_decorator


@validate(MD5_CSV)
def get_csv(target_path=None):
    """Download the source CSV from the ETH Zurich FTP server."""
    urlretrieve(CSV_URL, target_path)


def reformat_time(datestr):
    """Reformat a ``DD-MMM-YYYY[ ...]`` date string to ``YYYY-MM-DD``.

    For example ``"15-Jan-1999 00:00:00"`` becomes ``"1999-01-15"``. Any
    trailing time-of-day component after the year is discarded.
    """
    dd, mmm, rest = datestr.split("-")
    yyyy = rest.split(" ")[0]
    m = MONTHS.get(mmm)
    return "%s-%02d-%s" % (yyyy, m, dd)


@validate(MD5_JSON)
def write_json(csv_path, target_path=None):
    """Convert the downloaded CSV to the TCPD JSON dataset format.

    The series is subsampled (one observation every SAMPLE months) and
    trimmed to years >= 1600 before being written to ``target_path``.
    """
    # Imported lazily so the module (and the 'clean' action) still works
    # when the third-party clevercsv package is not installed.
    import clevercsv

    with open(csv_path, "r", newline="", encoding="ascii") as fp:
        reader = clevercsv.reader(
            fp, delimiter=",", quotechar="", escapechar=""
        )
        rows = list(reader)

    header = rows.pop(0)
    rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0]

    as_dicts = [{h: v for h, v in zip(header, row)} for row in rows]
    by_date = {
        reformat_time(d["datetime"]): float(d["data_mean_global"])
        for d in as_dicts
    }

    # trim off anything before 1600 (string comparison is safe here because
    # the year is zero-padded to four digits in the reformatted keys)
    by_date = {k: v for k, v in by_date.items() if k.split("-")[0] >= "1600"}

    time = sorted(by_date.keys())
    values = [by_date[t] for t in time]

    name = "global_co2"
    longname = "Global CO2"
    time_fmt = "%Y-%m-%d"
    series = [{"label": "Mean", "type": "float", "raw": values}]

    data = {
        "name": name,
        "longname": longname,
        "n_obs": len(values),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }
    # NOTE: the original code deleted data["time"] when time was None, but
    # time is always a (possibly empty) list here, so that branch was dead.

    with open(target_path, "w") as fp:
        json.dump(data, fp, indent="\t")


def collect(output_dir="."):
    """Download the CSV and write the JSON dataset into ``output_dir``."""
    csv_path = os.path.join(output_dir, NAME_CSV)
    json_path = os.path.join(output_dir, NAME_JSON)

    get_csv(target_path=csv_path)
    write_json(csv_path, target_path=json_path)


def clean(output_dir="."):
    """Remove the downloaded CSV and generated JSON from ``output_dir``."""
    csv_path = os.path.join(output_dir, NAME_CSV)
    json_path = os.path.join(output_dir, NAME_JSON)

    if os.path.exists(csv_path):
        os.unlink(csv_path)
    if os.path.exists(json_path):
        os.unlink(json_path)


def parse_args(default_output_dir="."):
    """Parse command line arguments.

    Parameters
    ----------
    default_output_dir : str
        Value used for ``--output-dir`` when it is not given on the
        command line (backward-compatible addition; the default preserves
        the previous behavior).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o",
        "--output-dir",
        help="output directory to use",
        default=default_output_dir,
    )
    parser.add_argument(
        "action",
        choices=["collect", "clean"],
        help="Action to perform",
        default="collect",
        nargs="?",
    )
    return parser.parse_args()


def main(output_dir="."):
    # Bug fix: the output_dir parameter used to be accepted and then
    # silently ignored. It now serves as the default for --output-dir,
    # so main(output_dir=...) works while the CLI flag still wins.
    args = parse_args(default_output_dir=output_dir)
    if args.action == "collect":
        collect(output_dir=args.output_dir)
    elif args.action == "clean":
        clean(output_dir=args.output_dir)


if __name__ == "__main__":
    main()
