diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-12-15 23:27:58 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-12-15 23:27:58 +0000 |
| commit | a2a65bfb5ad1ca7039347ff06a8758f6fb42e479 (patch) | |
| tree | e24a131456442e868d04ed42eb77748064e1951c /datasets | |
| parent | Update comment in apple download script (diff) | |
| download | TCPD-a2a65bfb5ad1ca7039347ff06a8758f6fb42e479.tar.gz TCPD-a2a65bfb5ad1ca7039347ff06a8758f6fb42e479.zip | |
Use openpyxl for xlrd file
Diffstat (limited to 'datasets')
| -rw-r--r-- | datasets/iceland_tourism/get_iceland_tourism.py | 22 |
1 files changed, 12 insertions, 10 deletions
diff --git a/datasets/iceland_tourism/get_iceland_tourism.py b/datasets/iceland_tourism/get_iceland_tourism.py index b9c8347..c177a8c 100644 --- a/datasets/iceland_tourism/get_iceland_tourism.py +++ b/datasets/iceland_tourism/get_iceland_tourism.py @@ -15,8 +15,8 @@ Copyright: 2019, The Alan Turing Institute import argparse import hashlib import json +import openpyxl import os -import xlrd import sys import time @@ -102,7 +102,6 @@ def download_xlsx(target_path=None): time.sleep(5) - def format_ym(year, month): midx = MONTHS[month] return "%i-%02d" % (int(year), midx) @@ -110,27 +109,30 @@ def format_ym(year, month): @validate(MD5_JSON) def write_json(xlsx_path, target_path=None): - wb = xlrd.open_workbook(xlsx_path) - ws = wb.sheet_by_index(2) + wb = openpyxl.load_workbook(xlsx_path) + ws = wb.worksheets[2] + + rows = list(ws.rows) # hardcoding these row indices, not worth doing it nicely - header = ws.row(2) + header = rows[2] + column_idx = [ i for i, c in enumerate(header) - if c.ctype == xlrd.XL_CELL_NUMBER and 2003 <= c.value < 2020 + if c.data_type == "n" and c.value and 2003 <= c.value < 2020 ] visitors = [] - r_offset = 3 + r_offset = 4 for c in column_idx: for r in range(r_offset, r_offset + 12): - cell = ws.cell(r, c) - if cell.ctype == xlrd.XL_CELL_EMPTY: + cell = ws.cell(r, c + 1) + if cell.value is None or str(cell.value) == "": continue year = header[c].value - month = ws.cell(r, 0).value + month = ws.cell(r, 1).value datestr = format_ym(year, month) # eliminate some observations that were not in the original dataset if datestr in ["2019-08", "2019-09", "2019-10"]: |
