Diffstat (limited to 'datasets/global_co2/get_global_co2.py')
-rw-r--r--  datasets/global_co2/get_global_co2.py  192
1 file changed, 192 insertions(+), 0 deletions(-)
diff --git a/datasets/global_co2/get_global_co2.py b/datasets/global_co2/get_global_co2.py
new file mode 100644
index 0000000..526f8a8
--- /dev/null
+++ b/datasets/global_co2/get_global_co2.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Collect the global_co2 dataset
+
+See the README file for more information.
+
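+Usage: python get_global_co2.py [-o OUTPUT_DIR] [{collect,clean}]
+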
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+
+import argparse
+import clevercsv
+import hashlib
+import json
+import os
+
+from functools import wraps
+from urllib.request import urlretrieve
+
+
+CSV_URL = "ftp://data.iac.ethz.ch/CMIP6/input4MIPs/UoM/GHGConc/CMIP/mon/atmos/UoM-CMIP-1-1-0/GHGConc/gr3-GMNHSH/v20160701/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv"
+
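+# MD5 checksums used to verify the downloaded CSV and the generated JSON file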
+MD5_CSV = "a3d42f5e339f4c652b8ae80e830b6941"
+MD5_JSON = "7c8edd8887f51a6f841cc9d806ab4e56"
+
+NAME_CSV = "mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv"
+NAME_JSON = "global_co2.json"
+
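+# Subsampling rate: keep every 48th monthly record (one observation per four
+# years)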
+SAMPLE = 48
+
+
+class ValidationError(Exception):
+ def __init__(self, filename):
+ message = (
+ "Validating the file '%s' failed. \n"
+ "Please raise an issue on the GitHub page for this project "
+ "if the error persists." % filename
+ )
+ super().__init__(message)
+
+
+def check_md5sum(filename, checksum):
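+    """Check that the MD5 checksum of a file matches the expected value."""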
+ with open(filename, "rb") as fp:
+ data = fp.read()
+ h = hashlib.md5(data).hexdigest()
+ return h == checksum
+
+
+def validate(checksum):
+ """Decorator that validates the target file."""
+
+ def validate_decorator(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+            target = kwargs.get("target_path", None)
+            if target is None:
+                raise ValueError("target_path must be provided")
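+            # skip the work if a valid target file already exists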
+ if os.path.exists(target) and check_md5sum(target, checksum):
+ return
+ out = func(*args, **kwargs)
+ if not os.path.exists(target):
+ raise FileNotFoundError("Target file expected at: %s" % target)
+ if not check_md5sum(target, checksum):
+ raise ValidationError(target)
+ return out
+
+ return wrapper
+
+ return validate_decorator
+
+
+@validate(MD5_CSV)
+def get_csv(target_path=None):
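+    """Download the raw CSV file from the source URL."""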
+ urlretrieve(CSV_URL, target_path)
+
+
+def reformat_time(datestr):
+ """ From MMM-YY to %Y-%m """
+ MONTHS = {
+ "Jan": 1,
+ "Feb": 2,
+ "Mar": 3,
+ "Apr": 4,
+ "May": 5,
+ "Jun": 6,
+ "Jul": 7,
+ "Aug": 8,
+ "Sep": 9,
+ "Oct": 10,
+ "Nov": 11,
+ "Dec": 12,
+ }
+ dd, mmm, rest = datestr.split("-")
+ yyyy = rest.split(" ")[0]
+ m = MONTHS.get(mmm)
+ return "%s-%02d-%s" % (yyyy, m, dd)
+
+
+@validate(MD5_JSON)
+def write_json(csv_path, target_path=None):
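+    """Convert the raw CSV file to a JSON file in the dataset format."""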
+ with open(csv_path, "r", newline="", encoding="ascii") as fp:
+ reader = clevercsv.reader(
+ fp, delimiter=",", quotechar="", escapechar=""
+ )
+ rows = list(reader)
+
+ header = rows.pop(0)
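+    # subsample the series by keeping every SAMPLE-th row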
+ rows = [r for i, r in enumerate(rows) if i % SAMPLE == 0]
+
+    as_dicts = [dict(zip(header, row)) for row in rows]
+ by_date = {
+ reformat_time(d["datetime"]): float(d["data_mean_global"])
+ for d in as_dicts
+ }
+
+    # trim off anything before the year 1600; the string comparison is safe
+    # because years in the source data are zero-padded to four digits
+ by_date = {k: v for k, v in by_date.items() if k.split("-")[0] >= "1600"}
+
+ time = sorted(by_date.keys())
+ values = [by_date[t] for t in time]
+
+ name = "global_co2"
+ longname = "Global CO2"
+ time_fmt = "%Y-%m-%d"
+ series = [{"label": "Mean", "type": "float", "raw": values}]
+
+ data = {
+ "name": name,
+ "longname": longname,
+ "n_obs": len(values),
+ "n_dim": len(series),
+ "time": {
+ "type": "string",
+ "format": time_fmt,
+ "index": list(range(len(time))),
+ "raw": time,
+ },
+ "series": series,
+ }
+
+ with open(target_path, "w") as fp:
+ json.dump(data, fp, indent="\t")
+
+
+def collect(output_dir="."):
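+    """Download the raw data and convert it to the JSON format."""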
+    csv_path = os.path.join(output_dir, NAME_CSV)
+ json_path = os.path.join(output_dir, NAME_JSON)
+
+ get_csv(target_path=csv_path)
+ write_json(csv_path, target_path=json_path)
+
+
+def clean(output_dir="."):
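+    """Remove the downloaded CSV and the generated JSON file, if present."""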
+    csv_path = os.path.join(output_dir, NAME_CSV)
+ json_path = os.path.join(output_dir, NAME_JSON)
+
+ if os.path.exists(csv_path):
+ os.unlink(csv_path)
+ if os.path.exists(json_path):
+ os.unlink(json_path)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-o", "--output-dir", help="output directory to use", default="."
+ )
+ parser.add_argument(
+ "action",
+ choices=["collect", "clean"],
+ help="Action to perform",
+ default="collect",
+ nargs="?",
+ )
+ return parser.parse_args()
+
+
+def main():
+ args = parse_args()
+ if args.action == "collect":
+ collect(output_dir=args.output_dir)
+ elif args.action == "clean":
+ clean(output_dir=args.output_dir)
+
+
+if __name__ == "__main__":
+ main()