diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-03-10 12:27:53 +0000 |
| commit | 7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e (patch) | |
| tree | 10aa6710599230c889ec44407a065ee303a79348 /datasets/global_co2 | |
| download | TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.tar.gz TCPD-7c6c2e09e3ad1d41f26869cb7b9f9882175c8a6e.zip | |
Initial commit
Diffstat (limited to 'datasets/global_co2')
| -rw-r--r-- | datasets/global_co2/.gitignore | 3 | ||||
| -rw-r--r-- | datasets/global_co2/README.md | 26 | ||||
| -rw-r--r-- | datasets/global_co2/get_global_co2.py | 192 | ||||
| -rw-r--r-- | datasets/global_co2/global_co2.png | bin | 0 -> 13341 bytes |
4 files changed, 221 insertions, 0 deletions
diff --git a/datasets/global_co2/.gitignore b/datasets/global_co2/.gitignore new file mode 100644 index 0000000..21a3adb --- /dev/null +++ b/datasets/global_co2/.gitignore @@ -0,0 +1,3 @@ +global_co2.json +mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv +old/ diff --git a/datasets/global_co2/README.md b/datasets/global_co2/README.md new file mode 100644 index 0000000..d0bfa61 --- /dev/null +++ b/datasets/global_co2/README.md @@ -0,0 +1,26 @@ +# Global Monthly CO2 levels + +This dataset concerns monthly global hemispheric means of carbon dioxide in +air. The data is part of the CMIP6 dataset, developed by Meinshausen et al. + +When using this data, please cite: + +```bib +@article{meinshausen2017historical, + title={Historical greenhouse gas concentrations for climate modelling ({CMIP6})}, + author={Meinshausen, M. and Vogel, E. and Nauels, A. and Lorbacher, K. and Meinshausen, N. and Etheridge, D. M. and Fraser, P. J. and Montzka, S. A. and Rayner, P. J. and Trudinger, C. M. and Krummel, P. B. and Beyerle, U. and Canadell, J. G. and Daniel, J. S. and Enting, I. G. and Law, R. M. and Lunder, C. R. and O'Doherty, S. and Prinn, R. G. and Reimann, S. and Rubino, M. and Velders, G. J. M. and Vollmer, M. K. and Wang, R. H. J. and Weiss, R.}, + journal={Geoscientific Model Development}, + volume={10}, + pages={2057--2116}, + year={2017}, + publisher={Copernicus} +} +``` + +While it appears that the data is in the public domain, it is not clear what +license it is under. We therefore download it from the original source (hosted +at ETH Zurich) and convert it to our dataset format. The original data is +sampled every 4 years and cropped to recent history to reduce the length of +the series. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Collect the global_co2 dataset

See the README file for more information.

Author: G.J.J. van den Burg
License: This file is part of TCPD, see the top-level LICENSE file.
Copyright: 2019, The Alan Turing Institute

"""


import argparse
import hashlib
import json
import os

from functools import wraps
from urllib.request import urlretrieve


CSV_URL = "ftp://data.iac.ethz.ch/CMIP6/input4MIPs/UoM/GHGConc/CMIP/mon/atmos/UoM-CMIP-1-1-0/GHGConc/gr3-GMNHSH/v20160701/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv"

# Expected MD5 checksums of the downloaded CSV and the generated JSON file.
MD5_CSV = "a3d42f5e339f4c652b8ae80e830b6941"
MD5_JSON = "7c8edd8887f51a6f841cc9d806ab4e56"

NAME_CSV = "mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv"
NAME_JSON = "global_co2.json"

# Keep every 48th monthly observation, i.e. one sample every four years
# (see the README: subsampling reduces the length of the series).
SAMPLE = 48

# Month abbreviation -> month number, used by reformat_time. Defined at
# module level so the table is built once, not on every call.
MONTHS = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


class ValidationError(Exception):
    """Raised when a downloaded or generated file fails its MD5 check."""

    def __init__(self, filename):
        message = (
            "Validating the file '%s' failed. \n"
            "Please raise an issue on the GitHub page for this project "
            "if the error persists." % filename
        )
        super().__init__(message)


def check_md5sum(filename, checksum):
    """Return True iff the MD5 digest of ``filename`` equals ``checksum``."""
    with open(filename, "rb") as fp:
        data = fp.read()
    h = hashlib.md5(data).hexdigest()
    return h == checksum


def validate(checksum):
    """Decorator that validates the target file.

    The wrapped function must be called with a ``target_path`` keyword
    argument. If the target already exists and matches ``checksum`` the
    wrapped function is skipped; otherwise it is run and the file it
    produces is verified.

    Raises
    ------
    ValueError
        If ``target_path`` is not supplied as a keyword argument.
    FileNotFoundError
        If the wrapped function did not create the target file.
    ValidationError
        If the resulting file does not match ``checksum``.
    """

    def validate_decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            target = kwargs.get("target_path", None)
            if target is None:
                # Previously this fell through to os.path.exists(None) and
                # crashed with an opaque TypeError; fail clearly instead.
                raise ValueError(
                    "The 'target_path' keyword argument is required."
                )
            if os.path.exists(target) and check_md5sum(target, checksum):
                return
            out = func(*args, **kwargs)
            if not os.path.exists(target):
                raise FileNotFoundError("Target file expected at: %s" % target)
            if not check_md5sum(target, checksum):
                raise ValidationError(target)
            return out

        return wrapper

    return validate_decorator


@validate(MD5_CSV)
def get_csv(target_path=None):
    """Download the source CSV from the ETH Zurich FTP server."""
    urlretrieve(CSV_URL, target_path)


def reformat_time(datestr):
    """Reformat a ``DD-MMM-YYYY[ ...]`` date string to ``YYYY-MM-DD``.

    For example ``"15-Jan-1999 00:00:00"`` becomes ``"1999-01-15"``. Any
    trailing time-of-day component after the year is discarded.
    """
    dd, mmm, rest = datestr.split("-")
    yyyy = rest.split(" ")[0]
    m = MONTHS.get(mmm)
    return "%s-%02d-%s" % (yyyy, m, dd)


@validate(MD5_JSON)
def write_json(csv_path, target_path=None):
    """Convert the downloaded CSV to the TCPD JSON dataset format.

    The series is subsampled (one observation every SAMPLE months) and
    trimmed to years >= 1600 before being written to ``target_path``.
    """
    # Imported lazily so the module (and the 'clean' action) still works
    # when the third-party clevercsv package is not installed.
    import clevercsv

    with open(csv_path, "r", newline="", encoding="ascii") as fp:
        reader = clevercsv.reader(
            fp, delimiter=",", quotechar="", escapechar=""
        )
        rows = list(reader)

    header = rows.pop(0)
    rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0]

    as_dicts = [{h: v for h, v in zip(header, row)} for row in rows]
    by_date = {
        reformat_time(d["datetime"]): float(d["data_mean_global"])
        for d in as_dicts
    }

    # trim off anything before 1600 (string comparison is safe here because
    # the year is zero-padded to four digits in the reformatted keys)
    by_date = {k: v for k, v in by_date.items() if k.split("-")[0] >= "1600"}

    time = sorted(by_date.keys())
    values = [by_date[t] for t in time]

    name = "global_co2"
    longname = "Global CO2"
    time_fmt = "%Y-%m-%d"
    series = [{"label": "Mean", "type": "float", "raw": values}]

    data = {
        "name": name,
        "longname": longname,
        "n_obs": len(values),
        "n_dim": len(series),
        "time": {
            "type": "string",
            "format": time_fmt,
            "index": list(range(len(time))),
            "raw": time,
        },
        "series": series,
    }
    # NOTE: the original code deleted data["time"] when time was None, but
    # time is always a (possibly empty) list here, so that branch was dead.

    with open(target_path, "w") as fp:
        json.dump(data, fp, indent="\t")


def collect(output_dir="."):
    """Download the CSV and write the JSON dataset into ``output_dir``."""
    csv_path = os.path.join(output_dir, NAME_CSV)
    json_path = os.path.join(output_dir, NAME_JSON)

    get_csv(target_path=csv_path)
    write_json(csv_path, target_path=json_path)


def clean(output_dir="."):
    """Remove the downloaded CSV and generated JSON from ``output_dir``."""
    csv_path = os.path.join(output_dir, NAME_CSV)
    json_path = os.path.join(output_dir, NAME_JSON)

    if os.path.exists(csv_path):
        os.unlink(csv_path)
    if os.path.exists(json_path):
        os.unlink(json_path)


def parse_args(default_output_dir="."):
    """Parse command line arguments.

    Parameters
    ----------
    default_output_dir : str
        Value used for ``--output-dir`` when it is not given on the
        command line (backward-compatible addition; the default preserves
        the previous behavior).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o",
        "--output-dir",
        help="output directory to use",
        default=default_output_dir,
    )
    parser.add_argument(
        "action",
        choices=["collect", "clean"],
        help="Action to perform",
        default="collect",
        nargs="?",
    )
    return parser.parse_args()


def main(output_dir="."):
    # Bug fix: the output_dir parameter used to be accepted and then
    # silently ignored. It now serves as the default for --output-dir,
    # so main(output_dir=...) works while the CLI flag still wins.
    args = parse_args(default_output_dir=output_dir)
    if args.action == "collect":
        collect(output_dir=args.output_dir)
    elif args.action == "clean":
        clean(output_dir=args.output_dir)


if __name__ == "__main__":
    main()
