diff options
| author | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-12-15 23:27:58 +0000 |
|---|---|---|
| committer | Gertjan van den Burg <gertjanvandenburg@gmail.com> | 2020-12-15 23:27:58 +0000 |
| commit | a2a65bfb5ad1ca7039347ff06a8758f6fb42e479 (patch) | |
| tree | e24a131456442e868d04ed42eb77748064e1951c | |
| parent | Update comment in apple download script (diff) | |
| download | TCPD-a2a65bfb5ad1ca7039347ff06a8758f6fb42e479.tar.gz TCPD-a2a65bfb5ad1ca7039347ff06a8758f6fb42e479.zip | |
Use openpyxl for xlrd file
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | datasets/iceland_tourism/get_iceland_tourism.py | 22 |
| -rw-r--r-- | requirements.txt | 2 |
2 files changed, 13 insertions, 11 deletions
```diff
diff --git a/datasets/iceland_tourism/get_iceland_tourism.py b/datasets/iceland_tourism/get_iceland_tourism.py
index b9c8347..c177a8c 100644
--- a/datasets/iceland_tourism/get_iceland_tourism.py
+++ b/datasets/iceland_tourism/get_iceland_tourism.py
@@ -15,8 +15,8 @@ Copyright: 2019, The Alan Turing Institute
 import argparse
 import hashlib
 import json
+import openpyxl
 import os
-import xlrd
 import sys
 import time
 
@@ -102,7 +102,6 @@ def download_xlsx(target_path=None):
             time.sleep(5)
 
 
-
 def format_ym(year, month):
     midx = MONTHS[month]
     return "%i-%02d" % (int(year), midx)
@@ -110,27 +109,30 @@ def format_ym(year, month):
 
 @validate(MD5_JSON)
 def write_json(xlsx_path, target_path=None):
-    wb = xlrd.open_workbook(xlsx_path)
-    ws = wb.sheet_by_index(2)
+    wb = openpyxl.load_workbook(xlsx_path)
+    ws = wb.worksheets[2]
+
+    rows = list(ws.rows)
 
     # hardcoding these row indices, not worth doing it nicely
-    header = ws.row(2)
+    header = rows[2]
+
     column_idx = [
         i
         for i, c in enumerate(header)
-        if c.ctype == xlrd.XL_CELL_NUMBER and 2003 <= c.value < 2020
+        if c.data_type == "n" and c.value and 2003 <= c.value < 2020
     ]
 
     visitors = []
 
-    r_offset = 3
+    r_offset = 4
     for c in column_idx:
         for r in range(r_offset, r_offset + 12):
-            cell = ws.cell(r, c)
-            if cell.ctype == xlrd.XL_CELL_EMPTY:
+            cell = ws.cell(r, c + 1)
+            if cell.value is None or str(cell.value) == "":
                 continue
             year = header[c].value
-            month = ws.cell(r, 0).value
+            month = ws.cell(r, 1).value
             datestr = format_ym(year, month)
             # eliminate some observations that were not in the original dataset
             if datestr in ["2019-08", "2019-09", "2019-10"]:
diff --git a/requirements.txt b/requirements.txt
index 70427d1..508da1b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,6 @@ beautifulsoup4>=4.8.1
 clevercsv>=0.4.7
 numpy>=1.17
 requests>=2.22.0
-xlrd>=1.1.0
 yfinance>=0.1.45
 jsonschema>=3.2.0
+openpyxl
```
