aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGertjan van den Burg <gertjanvandenburg@gmail.com>2020-04-05 17:26:40 +0100
committerGertjan van den Burg <gertjanvandenburg@gmail.com>2020-04-05 17:26:40 +0100
commitfee27cb0c492b40fea0a61929c1f132d513f03ac (patch)
tree882e67660ebfc800e9d1971435b02ee2bd2537b9
parentAdd zenodo doi badge (diff)
downloadTCPD-fee27cb0c492b40fea0a61929c1f132d513f03ac.tar.gz
TCPD-fee27cb0c492b40fea0a61929c1f132d513f03ac.zip
Make robocall script more robust
-rw-r--r--datasets/robocalls/get_robocalls.py31
1 files changed, 29 insertions, 2 deletions
diff --git a/datasets/robocalls/get_robocalls.py b/datasets/robocalls/get_robocalls.py
index 8d76e87..9a1cccb 100644
--- a/datasets/robocalls/get_robocalls.py
+++ b/datasets/robocalls/get_robocalls.py
@@ -19,11 +19,19 @@ import hashlib
import json
import os
import requests
+import sys
+import time
from functools import wraps
URL = "https://web.archive.org/web/20191027130452/https://robocallindex.com/history/time"
+HEADERS = {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
+ "Safari/537.36"
+}
+
MD5_JSON = "f67ec0ccb50f2a835912e5c51932c083"
MONTHS = {
@@ -86,9 +94,28 @@ def validate(checksum):
# We can't validate the HTML as the wayback machine inserts the retrieval time
# in the HTML, so the checksum is not constant.
def write_html(target_path=None):
-    req = requests.get(URL)
+    """Save the page at URL to target_path, retrying failed downloads.
+
+    Raises ValueError when none of the attempts returns an OK response.
+    """
+    tries = 10
+    for count in range(1, tries + 1):
+        try:
+            res = requests.get(URL, headers=HEADERS)
+        except requests.exceptions.ConnectionError:
+            res = None
+        if res is not None and res.ok:
+            break  # success: stop instead of requesting the page again
+        print(
+            "(%i/%i) Error getting URL %s. Retrying in 5 seconds."
+            % (count, tries, URL),
+            file=sys.stderr,
+        )
+        time.sleep(5)
+    else:
+        raise ValueError("Couldn't retrieve URL %s" % URL)
    with open(target_path, "wb") as fp:
-        fp.write(req.content)
+        fp.write(res.content)
@validate(MD5_JSON)