diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-08-17 20:56:12 +0100 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-08-17 20:56:12 +0100 |
| commit | 38800a8e4ce2328548a10ea31062089900385075 (patch) | |
| tree | 53259b8ed39a6e9d9b39cb894a2e45780c666f42 | |
| parent | Use read_table instead of read_csv (diff) | |
| parent | Add retries to all download scripts (diff) | |
| download | TCPD-38800a8e4ce2328548a10ea31062089900385075.tar.gz TCPD-38800a8e4ce2328548a10ea31062089900385075.zip | |
Merge branch 'bugfix/measles_download' into master
| -rw-r--r-- | datasets/apple/get_apple.py | 33 | ||||
| -rw-r--r-- | datasets/bee_waggle_6/get_bee_waggle_6.py | 16 | ||||
| -rw-r--r-- | datasets/bitcoin/get_bitcoin.py | 19 | ||||
| -rw-r--r-- | datasets/homeruns/get_homeruns.py | 17 | ||||
| -rw-r--r-- | datasets/iceland_tourism/get_iceland_tourism.py | 18 | ||||
| -rw-r--r-- | datasets/measles/get_measles.py | 18 | ||||
| -rw-r--r-- | datasets/occupancy/get_occupancy.py | 17 | ||||
| -rw-r--r-- | datasets/ratner_stock/get_ratner_stock.py | 31 | ||||
| -rw-r--r-- | datasets/scanline_126007/get_scanline_126007.py | 18 | ||||
| -rw-r--r-- | datasets/scanline_42049/get_scanline_42049.py | 17 |
10 files changed, 178 insertions, 26 deletions
diff --git a/datasets/apple/get_apple.py b/datasets/apple/get_apple.py index 89f8483..76d8e27 100644 --- a/datasets/apple/get_apple.py +++ b/datasets/apple/get_apple.py @@ -21,8 +21,11 @@ import hashlib import json import os import yfinance +import sys +import time from functools import wraps +from urllib.error import URLError MD5_CSV = "9021c03bb9fea3f16ecc812d77926168" MD5_JSON = "22edb48471bd3711f7a6e15de6413643" @@ -71,15 +74,27 @@ def validate(checksum): def write_csv(target_path=None): - aapl = yfinance.download( - "AAPL", - start="1996-12-12", - end="2004-05-15", - progress=False, - rounding=False, - threads=False - ) - aapl.round(6).to_csv(target_path, float_format="%.6f") + count = 0 + while count < 5: + count += 1 + try: + aapl = yfinance.download( + "AAPL", + start="1996-12-12", + end="2004-05-15", + progress=False, + rounding=False, + threads=False, + ) + aapl.round(6).to_csv(target_path, float_format="%.6f") + return + except URLError as err: + print( + "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) @validate(MD5_JSON) diff --git a/datasets/bee_waggle_6/get_bee_waggle_6.py b/datasets/bee_waggle_6/get_bee_waggle_6.py index 1a2033b..4bdd702 100644 --- a/datasets/bee_waggle_6/get_bee_waggle_6.py +++ b/datasets/bee_waggle_6/get_bee_waggle_6.py @@ -19,9 +19,11 @@ import math import os import zipfile import sys +import time from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError ZIP_URL = "https://web.archive.org/web/20191114130815if_/https://www.cc.gatech.edu/%7Eborg/ijcv_psslds/psslds.zip" @@ -105,7 +107,19 @@ def validate(checksum, alt_checksums=None): @validate(MD5_ZIP) def download_zip(target_path=None): - urlretrieve(ZIP_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(ZIP_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download zip. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) @validate(MD5_JSON, MD5_JSON_X) diff --git a/datasets/bitcoin/get_bitcoin.py b/datasets/bitcoin/get_bitcoin.py index e0b2917..281b093 100644 --- a/datasets/bitcoin/get_bitcoin.py +++ b/datasets/bitcoin/get_bitcoin.py @@ -13,13 +13,16 @@ Copyright: 2019, The Alan Turing Institute """ import argparse +import clevercsv import hashlib import json import os -import clevercsv +import sys +import time from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError CSV_URL = "https://web.archive.org/web/20191114131838if_/https://api.blockchain.info/charts/market-price?timespan=all&format=csv" @@ -70,7 +73,19 @@ def validate(checksum): @validate(MD5_CSV) def get_market_price(target_path=None): - urlretrieve(CSV_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(CSV_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) @validate(MD5_JSON) diff --git a/datasets/homeruns/get_homeruns.py b/datasets/homeruns/get_homeruns.py index 6093484..dab616c 100644 --- a/datasets/homeruns/get_homeruns.py +++ b/datasets/homeruns/get_homeruns.py @@ -17,9 +17,12 @@ import clevercsv import hashlib import json import os +import sys +import time from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError # Original source of the batting csv file CSV_URL = "https://web.archive.org/web/20191128150525if_/https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/242285f8f5e8981327cf50c07355fb034833ce4a/core/Batting.csv" @@ -70,7 +73,19 @@ def validate(checksum): @validate(MD5_CSV) def download_csv(target_path=None): - urlretrieve(CSV_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(CSV_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) def read_csv(csv_file): diff --git a/datasets/iceland_tourism/get_iceland_tourism.py b/datasets/iceland_tourism/get_iceland_tourism.py index 752f07d..b9c8347 100644 --- a/datasets/iceland_tourism/get_iceland_tourism.py +++ b/datasets/iceland_tourism/get_iceland_tourism.py @@ -17,9 +17,12 @@ import hashlib import json import os import xlrd +import sys +import time from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx" @@ -84,7 +87,20 @@ def validate(checksum): @validate(MD5_XLSX) def download_xlsx(target_path=None): - urlretrieve(XLSX_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(XLSX_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download xlsx. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) + def format_ym(year, month): diff --git a/datasets/measles/get_measles.py b/datasets/measles/get_measles.py index eb58824..13f6dc9 100644 --- a/datasets/measles/get_measles.py +++ b/datasets/measles/get_measles.py @@ -17,9 +17,12 @@ import clevercsv import hashlib import json import os +import sys +import time from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError DAT_URL = "https://web.archive.org/web/20191128124615if_/https://ms.mcmaster.ca/~bolker/measdata/ewmeas.dat" @@ -70,7 +73,20 @@ def validate(checksum): @validate(MD5_DAT) def download_zip(target_path=None): - urlretrieve(DAT_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(DAT_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download zip. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) + @validate(MD5_JSON) diff --git a/datasets/occupancy/get_occupancy.py b/datasets/occupancy/get_occupancy.py index 0b590fa..aa12514 100644 --- a/datasets/occupancy/get_occupancy.py +++ b/datasets/occupancy/get_occupancy.py @@ -17,9 +17,12 @@ import clevercsv import hashlib import json import os +import sys +import time from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError SAMPLE = 16 @@ -72,7 +75,19 @@ def validate(checksum): @validate(MD5_TXT) def download_txt(target_path=None): - urlretrieve(TXT_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(TXT_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download txt. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) @validate(MD5_JSON) diff --git a/datasets/ratner_stock/get_ratner_stock.py b/datasets/ratner_stock/get_ratner_stock.py index 4559608..61fed1f 100644 --- a/datasets/ratner_stock/get_ratner_stock.py +++ b/datasets/ratner_stock/get_ratner_stock.py @@ -18,8 +18,11 @@ import hashlib import json import os import yfinance +import sys +import time from functools import wraps +from urllib.error import URLError MD5_CSV = "db7406dc7d4eb480d73b4fe6c4bb00be" MD5_JSON = "f7086ff916f35b88463bf8fd1857815e" @@ -68,14 +71,26 @@ def validate(checksum): def write_csv(target_path=None): - sig = yfinance.download( - "SIG", - start="1988-07-14", - end="1995-08-23", - progress=False, - rounding=False, - ) - sig.round(6).to_csv(target_path, float_format="%.6f") + count = 0 + while count < 5: + count += 1 + try: + sig = yfinance.download( + "SIG", + start="1988-07-14", + end="1995-08-23", + progress=False, + rounding=False, + ) + sig.round(6).to_csv(target_path, float_format="%.6f") + return + except URLError as err: + print( + "Error occurred (%r) when trying to download csv. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) @validate(MD5_JSON) diff --git a/datasets/scanline_126007/get_scanline_126007.py b/datasets/scanline_126007/get_scanline_126007.py index ba41774..7845bb1 100644 --- a/datasets/scanline_126007/get_scanline_126007.py +++ b/datasets/scanline_126007/get_scanline_126007.py @@ -17,10 +17,13 @@ import hashlib import os import numpy as np import json +import sys +import time from PIL import Image from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError IMG_URL = "https://web.archive.org/web/20070611200633im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/126007.jpg" @@ -70,7 +73,20 @@ def validate(checksum): @validate(MD5_IMG) def download_img(target_path=None): - urlretrieve(IMG_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(IMG_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download img. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) + @validate(MD5_JSON) diff --git a/datasets/scanline_42049/get_scanline_42049.py b/datasets/scanline_42049/get_scanline_42049.py index 45f81ce..4c6e56b 100644 --- a/datasets/scanline_42049/get_scanline_42049.py +++ b/datasets/scanline_42049/get_scanline_42049.py @@ -17,10 +17,13 @@ import hashlib import os import numpy as np import json +import sys +import time from PIL import Image from functools import wraps from urllib.request import urlretrieve +from urllib.error import URLError IMG_URL = "https://web.archive.org/web/20070611230044im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/42049.jpg" @@ -70,7 +73,19 @@ def validate(checksum): @validate(MD5_IMG) def download_img(target_path=None): - urlretrieve(IMG_URL, target_path) + count = 0 + while count < 5: + count += 1 + try: + urlretrieve(IMG_URL, target_path) + return + except URLError as err: + print( + "Error occurred (%r) when trying to download img. Retrying in 5 seconds" + % err, + sys.stderr, + ) + time.sleep(5) @validate(MD5_JSON) |
