Diffstat (limited to 'datasets/robocalls/get_robocalls.py')
-rw-r--r--  datasets/robocalls/get_robocalls.py  196
1 file changed, 196 insertions, 0 deletions
diff --git a/datasets/robocalls/get_robocalls.py b/datasets/robocalls/get_robocalls.py
new file mode 100644
index 0000000..8d76e87
--- /dev/null
+++ b/datasets/robocalls/get_robocalls.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Collect the robocalls dataset
+
+See the README file for more information.
+
+Author: G.J.J. van den Burg
+License: This file is part of TCPD, see the top-level LICENSE file.
+Copyright: 2019, The Alan Turing Institute
+
+"""
+
+
+import argparse
+import bs4
+import hashlib
+import json
+import os
+import requests
+
+from functools import wraps
+
+URL = "https://web.archive.org/web/20191027130452/https://robocallindex.com/history/time"
+
+MD5_JSON = "f67ec0ccb50f2a835912e5c51932c083"
+
+MONTHS = {
+    "January": 1,
+    "February": 2,
+    "March": 3,
+    "April": 4,
+    "May": 5,
+    "June": 6,
+    "July": 7,
+    "August": 8,
+    "September": 9,
+    "October": 10,
+    "November": 11,
+    "December": 12,
+}
+
+
+NAME_HTML = "robocalls.html"
+NAME_JSON = "robocalls.json"
+
+
+class ValidationError(Exception):
+    def __init__(self, filename):
+        super().__init__(
+            "Validating the file '%s' failed. \n"
+            "Please raise an issue on the GitHub page for this project \n"
+            "if the error persists." % filename
+        )
+
+
+def check_md5sum(filename, checksum):
+    with open(filename, "rb") as fp:
+        data = fp.read()
+    h = hashlib.md5(data).hexdigest()
+    return h == checksum
+
+
+def validate(checksum):
+    """Decorator that validates the target file."""
+
+    def validate_decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            target = kwargs.get("target_path", None)
+            if os.path.exists(target) and check_md5sum(target, checksum):
+                return
+            out = func(*args, **kwargs)
+            if not os.path.exists(target):
+                raise FileNotFoundError("Target file expected at: %s" % target)
+            if not check_md5sum(target, checksum):
+                raise ValidationError(target)
+            return out
+
+        return wrapper
+
+    return validate_decorator
+
+
+# We can't validate the HTML as the Wayback Machine inserts the retrieval time
+# in the HTML, so the checksum is not constant.
+def write_html(target_path=None):
+    req = requests.get(URL)
+    with open(target_path, "wb") as fp:
+        fp.write(req.content)
+
+
+@validate(MD5_JSON)
+def write_json(html_path, target_path=None):
+    with open(html_path, "rb") as fp:
+        soup = bs4.BeautifulSoup(fp, "html.parser")
+
+    items = []
+
+    table = soup.find(id="robocallers-detail-table-1")
+    for row in table.find_all(attrs={"class": "month-row"}):
+        tds = row.find_all("td")
+        month_year = tds[0].a.text
+        amount = tds[1].text
+
+        month, year = month_year.split(" ")
+        value = int(amount.replace(",", ""))
+
+        month_idx = MONTHS[month]
+
+        items.append({"time": "%s-%02d" % (year, month_idx), "value": value})
+
+    # During initial (manual) data collection it wasn't noticed that the first
+    # observation is at April 2015, not May 2015. Technically, this means that
+    # this series has a missing value at May 2015. However, because the
+    # annotators have considered the series as a consecutive series without
+    # the missing value, we do not add it in here. This way, the file that
+    # this script creates corresponds to what the annotators and algorithms
+    # have seen during the study.
+    apr2015 = next(it for it in items if it["time"] == "2015-04")
+    apr2015["time"] = "2015-05"
+
+    by_date = {it["time"]: it["value"] for it in items}
+
+    # remove the observations that were not part of the original dataset
+    del by_date["2019-09"]
+
+    time = sorted(by_date.keys())
+    values = [by_date[t] for t in time]
+
+    series = [{"label": "V1", "type": "int", "raw": values}]
+
+    data = {
+        "name": "robocalls",
+        "longname": "Robocalls",
+        "n_obs": len(time),
+        "n_dim": len(series),
+        "time": {
+            "type": "string",
+            "format": "%Y-%m",
+            "index": list(range(0, len(time))),
+            "raw": time,
+        },
+        "series": series,
+    }
+
+    with open(target_path, "w") as fp:
+        json.dump(data, fp, indent="\t")
+
+
+def collect(output_dir="."):
+    html_path = os.path.join(output_dir, NAME_HTML)
+    json_path = os.path.join(output_dir, NAME_JSON)
+
+    write_html(target_path=html_path)
+    write_json(html_path, target_path=json_path)
+
+
+def clean(output_dir="."):
+    html_path = os.path.join(output_dir, NAME_HTML)
+    json_path = os.path.join(output_dir, NAME_JSON)
+
+    if os.path.exists(html_path):
+        os.unlink(html_path)
+
+    if os.path.exists(json_path):
+        os.unlink(json_path)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o", "--output-dir", help="output directory to use", default="."
+    )
+    parser.add_argument(
+        "action",
+        choices=["collect", "clean"],
+        help="Action to perform",
+        default="collect",
+        nargs="?",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    if args.action == "collect":
+        collect(output_dir=args.output_dir)
+    elif args.action == "clean":
+        clean(output_dir=args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
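For reference, the command-line interface defined in parse_args is invoked as `python get_robocalls.py collect -o <output-dir>`, which downloads the archived page to robocalls.html and writes robocalls.json; the `clean` action removes both files. The snippet below is a minimal sketch of a downstream consumer of the generated robocalls.json, assuming the script has been run in the current working directory; the field names follow the schema built in write_json above and the loop body is purely illustrative.

import json

# Load the dataset file produced by get_robocalls.py (assumed to sit in
# the current working directory).
with open("robocalls.json") as fp:
    data = json.load(fp)

# "time" holds the "%Y-%m" month labels; "series" holds the monthly counts.
months = data["time"]["raw"]
values = data["series"][0]["raw"]
assert data["n_obs"] == len(months) == len(values)

for month, value in zip(months, values):
    print("%s: %d" % (month, value))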
