about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-08-17 20:44:42 +0100
committer Gertjan van den Burg <gertjanvandenburg@gmail.com> 2020-08-17 20:44:42 +0100
commit 9480e00023b1315609000cf256c59425e9efdccd (patch)
tree 53259b8ed39a6e9d9b39cb894a2e45780c666f42
parent Add retries to the get_measles script (diff)
download TCPD-9480e00023b1315609000cf256c59425e9efdccd.tar.gz
TCPD-9480e00023b1315609000cf256c59425e9efdccd.zip
Add retries to all download scripts
-rw-r--r-- datasets/apple/get_apple.py 33
-rw-r--r-- datasets/bee_waggle_6/get_bee_waggle_6.py 16
-rw-r--r-- datasets/bitcoin/get_bitcoin.py 19
-rw-r--r-- datasets/homeruns/get_homeruns.py 17
-rw-r--r-- datasets/iceland_tourism/get_iceland_tourism.py 18
-rw-r--r-- datasets/occupancy/get_occupancy.py 17
-rw-r--r-- datasets/ratner_stock/get_ratner_stock.py 31
-rw-r--r-- datasets/scanline_126007/get_scanline_126007.py 18
-rw-r--r-- datasets/scanline_42049/get_scanline_42049.py 17
9 files changed, 161 insertions, 25 deletions
diff --git a/datasets/apple/get_apple.py b/datasets/apple/get_apple.py
index 89f8483..76d8e27 100644
--- a/datasets/apple/get_apple.py
+++ b/datasets/apple/get_apple.py
@@ -21,8 +21,11 @@ import hashlib
import json
import os
import yfinance
+import sys
+import time
from functools import wraps
+from urllib.error import URLError
MD5_CSV = "9021c03bb9fea3f16ecc812d77926168"
MD5_JSON = "22edb48471bd3711f7a6e15de6413643"
@@ -71,15 +74,27 @@ def validate(checksum):
def write_csv(target_path=None):
- aapl = yfinance.download(
- "AAPL",
- start="1996-12-12",
- end="2004-05-15",
- progress=False,
- rounding=False,
- threads=False
- )
- aapl.round(6).to_csv(target_path, float_format="%.6f")
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ aapl = yfinance.download(
+ "AAPL",
+ start="1996-12-12",
+ end="2004-05-15",
+ progress=False,
+ rounding=False,
+ threads=False,
+ )
+ aapl.round(6).to_csv(target_path, float_format="%.6f")
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download csv. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
@validate(MD5_JSON)
diff --git a/datasets/bee_waggle_6/get_bee_waggle_6.py b/datasets/bee_waggle_6/get_bee_waggle_6.py
index 1a2033b..4bdd702 100644
--- a/datasets/bee_waggle_6/get_bee_waggle_6.py
+++ b/datasets/bee_waggle_6/get_bee_waggle_6.py
@@ -19,9 +19,11 @@ import math
import os
import zipfile
import sys
+import time
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
ZIP_URL = "https://web.archive.org/web/20191114130815if_/https://www.cc.gatech.edu/%7Eborg/ijcv_psslds/psslds.zip"
@@ -105,7 +107,19 @@ def validate(checksum, alt_checksums=None):
@validate(MD5_ZIP)
def download_zip(target_path=None):
- urlretrieve(ZIP_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(ZIP_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download zip. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
@validate(MD5_JSON, MD5_JSON_X)
diff --git a/datasets/bitcoin/get_bitcoin.py b/datasets/bitcoin/get_bitcoin.py
index e0b2917..281b093 100644
--- a/datasets/bitcoin/get_bitcoin.py
+++ b/datasets/bitcoin/get_bitcoin.py
@@ -13,13 +13,16 @@ Copyright: 2019, The Alan Turing Institute
"""
import argparse
+import clevercsv
import hashlib
import json
import os
-import clevercsv
+import sys
+import time
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
CSV_URL = "https://web.archive.org/web/20191114131838if_/https://api.blockchain.info/charts/market-price?timespan=all&format=csv"
@@ -70,7 +73,19 @@ def validate(checksum):
@validate(MD5_CSV)
def get_market_price(target_path=None):
- urlretrieve(CSV_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(CSV_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download csv. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
@validate(MD5_JSON)
diff --git a/datasets/homeruns/get_homeruns.py b/datasets/homeruns/get_homeruns.py
index 6093484..dab616c 100644
--- a/datasets/homeruns/get_homeruns.py
+++ b/datasets/homeruns/get_homeruns.py
@@ -17,9 +17,12 @@ import clevercsv
import hashlib
import json
import os
+import sys
+import time
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
# Original source of the batting csv file
CSV_URL = "https://web.archive.org/web/20191128150525if_/https://raw.githubusercontent.com/chadwickbureau/baseballdatabank/242285f8f5e8981327cf50c07355fb034833ce4a/core/Batting.csv"
@@ -70,7 +73,19 @@ def validate(checksum):
@validate(MD5_CSV)
def download_csv(target_path=None):
- urlretrieve(CSV_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(CSV_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download csv. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
def read_csv(csv_file):
diff --git a/datasets/iceland_tourism/get_iceland_tourism.py b/datasets/iceland_tourism/get_iceland_tourism.py
index 752f07d..b9c8347 100644
--- a/datasets/iceland_tourism/get_iceland_tourism.py
+++ b/datasets/iceland_tourism/get_iceland_tourism.py
@@ -17,9 +17,12 @@ import hashlib
import json
import os
import xlrd
+import sys
+import time
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
XLSX_URL = "https://web.archive.org/web/20191121170223if_/https://www.ferdamalastofa.is/static/files/ferdamalastofa/Frettamyndir/2019/nov/visitors-to-iceland-2002-2019-oct.xlsx"
@@ -84,7 +87,20 @@ def validate(checksum):
@validate(MD5_XLSX)
def download_xlsx(target_path=None):
- urlretrieve(XLSX_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(XLSX_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download xlsx. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
+
def format_ym(year, month):
diff --git a/datasets/occupancy/get_occupancy.py b/datasets/occupancy/get_occupancy.py
index 0b590fa..aa12514 100644
--- a/datasets/occupancy/get_occupancy.py
+++ b/datasets/occupancy/get_occupancy.py
@@ -17,9 +17,12 @@ import clevercsv
import hashlib
import json
import os
+import sys
+import time
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
SAMPLE = 16
@@ -72,7 +75,19 @@ def validate(checksum):
@validate(MD5_TXT)
def download_txt(target_path=None):
- urlretrieve(TXT_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(TXT_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download txt. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
@validate(MD5_JSON)
diff --git a/datasets/ratner_stock/get_ratner_stock.py b/datasets/ratner_stock/get_ratner_stock.py
index 4559608..61fed1f 100644
--- a/datasets/ratner_stock/get_ratner_stock.py
+++ b/datasets/ratner_stock/get_ratner_stock.py
@@ -18,8 +18,11 @@ import hashlib
import json
import os
import yfinance
+import sys
+import time
from functools import wraps
+from urllib.error import URLError
MD5_CSV = "db7406dc7d4eb480d73b4fe6c4bb00be"
MD5_JSON = "f7086ff916f35b88463bf8fd1857815e"
@@ -68,14 +71,26 @@ def validate(checksum):
def write_csv(target_path=None):
- sig = yfinance.download(
- "SIG",
- start="1988-07-14",
- end="1995-08-23",
- progress=False,
- rounding=False,
- )
- sig.round(6).to_csv(target_path, float_format="%.6f")
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ sig = yfinance.download(
+ "SIG",
+ start="1988-07-14",
+ end="1995-08-23",
+ progress=False,
+ rounding=False,
+ )
+ sig.round(6).to_csv(target_path, float_format="%.6f")
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download csv. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
@validate(MD5_JSON)
diff --git a/datasets/scanline_126007/get_scanline_126007.py b/datasets/scanline_126007/get_scanline_126007.py
index ba41774..7845bb1 100644
--- a/datasets/scanline_126007/get_scanline_126007.py
+++ b/datasets/scanline_126007/get_scanline_126007.py
@@ -17,10 +17,13 @@ import hashlib
import os
import numpy as np
import json
+import sys
+import time
from PIL import Image
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
IMG_URL = "https://web.archive.org/web/20070611200633im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/126007.jpg"
@@ -70,7 +73,20 @@ def validate(checksum):
@validate(MD5_IMG)
def download_img(target_path=None):
- urlretrieve(IMG_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(IMG_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download img. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
+
@validate(MD5_JSON)
diff --git a/datasets/scanline_42049/get_scanline_42049.py b/datasets/scanline_42049/get_scanline_42049.py
index 45f81ce..4c6e56b 100644
--- a/datasets/scanline_42049/get_scanline_42049.py
+++ b/datasets/scanline_42049/get_scanline_42049.py
@@ -17,10 +17,13 @@ import hashlib
import os
import numpy as np
import json
+import sys
+import time
from PIL import Image
from functools import wraps
from urllib.request import urlretrieve
+from urllib.error import URLError
IMG_URL = "https://web.archive.org/web/20070611230044im_/http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/segbench/BSDS300/html/images/plain/normal/gray/42049.jpg"
@@ -70,7 +73,19 @@ def validate(checksum):
@validate(MD5_IMG)
def download_img(target_path=None):
- urlretrieve(IMG_URL, target_path)
+ count = 0
+ while count < 5:
+ count += 1
+ try:
+ urlretrieve(IMG_URL, target_path)
+ return
+ except URLError as err:
+ print(
+ "Error occurred (%r) when trying to download img. Retrying in 5 seconds"
+ % err,
+ sys.stderr,
+ )
+ time.sleep(5)
@validate(MD5_JSON)