From 4654fe77c06b6f6396b349eee07d9eb3a374d18d Mon Sep 17 00:00:00 2001 From: Gertjan van den Burg Date: Mon, 4 May 2020 22:56:12 +0100 Subject: Address rounding differences in bee_waggle_6 To construct this timeseries from the original data we compute the sine and cosine of the head angle of the bee. On different systems this can result in slight differences in the data due to rounding. This commit adds a known version of the dataset with a different rounding than the original one and adds a comment to the user when the checksum is not matched exactly. --- datasets/bee_waggle_6/get_bee_waggle_6.py | 45 +++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 11 deletions(-) (limited to 'datasets') diff --git a/datasets/bee_waggle_6/get_bee_waggle_6.py b/datasets/bee_waggle_6/get_bee_waggle_6.py index 6f80042..1a2033b 100644 --- a/datasets/bee_waggle_6/get_bee_waggle_6.py +++ b/datasets/bee_waggle_6/get_bee_waggle_6.py @@ -18,6 +18,7 @@ import json import math import os import zipfile +import sys from functools import wraps from urllib.request import urlretrieve @@ -26,8 +27,11 @@ ZIP_URL = "https://web.archive.org/web/20191114130815if_/https://www.cc.gatech.e MD5_ZIP = "039843dc15c72fd5450eeb11c6e5599c" MD5_JSON = "4f03feafecb3be0b069b3cb0d6b17d4f" -# alternative checksum for small rounding errors -MD5_JSON_2 = "71311783488ee5f1122545d24c15429b" +# known alternative checksums for small rounding errors +MD5_JSON_X = [ + "71311783488ee5f1122545d24c15429b", + "3632e004b540de5c3eb049fb5591d044", +] NAME_ZIP = "psslds.zip" NAME_JSON = "bee_waggle_6.json" @@ -50,7 +54,7 @@ def check_md5sum(filename, checksum): return h == checksum -def validate(checksum, alternative_checksum=None): +def validate(checksum, alt_checksums=None): """Decorator that validates the target file.""" def validate_decorator(func): @@ -61,18 +65,37 @@ def validate(checksum, alternative_checksum=None): return if ( os.path.exists(target) - and alternative_checksum - and check_md5sum(target, alternative_checksum) + and alt_checksums + and any(check_md5sum(target, c) for c in alt_checksums) ): + print( + "Note: Matched alternative checksum for %s. " + "This indicates that small differences exist compared to " + "the original version of this time series, likely due to " + "rounding differences. Usually this is nothing to " + "worry about." % target, + file=sys.stderr, + ) return out = func(*args, **kwargs) if not os.path.exists(target): raise FileNotFoundError("Target file expected at: %s" % target) - if not (check_md5sum(target, checksum) or ( - alternative_checksum - and check_md5sum(target, alternative_checksum) - )): - raise ValidationError(target) + if not ( + check_md5sum(target, checksum) + or ( + alt_checksums + and any(check_md5sum(target, c) for c in alt_checksums) + ) + ): + print( + "Warning: Generated dataset %s didn't match a " + "known checksum. This is likely due to " + "rounding differences caused by " + "different system architectures. Minor differences in " + "algorithm performance can occur for this dataset. " + % target, + file=sys.stderr, + ) return out return wrapper @@ -85,7 +108,7 @@ def download_zip(target_path=None): urlretrieve(ZIP_URL, target_path) -@validate(MD5_JSON, MD5_JSON_2) +@validate(MD5_JSON, MD5_JSON_X) def write_json(zip_path, target_path=None): with zipfile.ZipFile(zip_path) as thezip: with thezip.open("psslds/zips/data/sequence6/btf/ximage.btf") as fp: -- cgit v1.2.3