Created
September 22, 2021 00:08
-
-
Save jeanpaulrsoucy/6bb99e38c6c83520041516567a84f7e2 to your computer and use it in GitHub Desktop.
Download archived BC COVID-19 hospitalization and ICU data into a single spreadsheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download archived BC COVID-19 hospitalization and ICU data from the Archive of Canadian COVID-19 Data
# and extract the data from the JSON files into a single CSV spreadsheet
# https://github.com/ccodwg/Covid19CanadaArchive
# Known issue: a few dates may contain duplicate data because the archive tool sometimes downloaded the same file twice in one day
# load modules
import os | |
import re | |
import json | |
from datetime import datetime | |
import requests # download files | |
import pandas as pd # manipulate data frames | |
from boto3 import client # Amazon S3 client | |
# (optional) change working directory where files will be downloaded
# os.chdir('/path/to/directory')
os.chdir('/home/jprs/Desktop/dl')

# define path to files of interest (e.g., BC hospitalization and ICU data)
file_path = 'archive/bc/cumulative-case-death-recover-hosp-icu-by-rha'

# get list of available files
# A single list call returns at most 1,000 keys, so page through the listing
# instead of silently truncating it. A page with no matching objects has no
# 'Contents' key, hence the .get() with an empty default.
cli = client('s3')
files = [
    obj['Key']
    for page in cli.get_paginator('list_objects_v2').paginate(
        Bucket='data.opencovid.ca', Prefix=file_path)
    for obj in page.get('Contents', [])
]

# (optional) filter out supplementary material from list of files (does not apply in this case)
# pat = re.compile('^.*/supplementary/') # match files in supplementary folder
# files = [s for s in files if not pat.match(s)]
# download files into working directory
base_url = 'http://data.opencovid.ca/'  # base URL of archive
for file in files:
    # a timeout keeps a stalled connection from hanging the script forever
    req = requests.get(base_url + file, timeout=60)
    # fail loudly on an HTTP error instead of saving an error page as if it were data
    req.raise_for_status()
    with open(os.path.basename(file), 'wb') as f:
        f.write(req.content)

# create list of downloaded files (local names are the basenames of the archive keys)
file_list = [os.path.basename(key) for key in files]
# combine downloaded JSON files into a single spreadsheet
data = []  # one data frame per downloaded file
for file in file_list:
    ## open file and read in as JSON
    with open(file, 'r', encoding='utf-8') as f:
        d = json.load(f)
    d = d['features']  # access content
    d = pd.json_normalize(d)  # flatten
    ## clean column names (raw string so that '\.' reaches the regex engine unchanged)
    d.columns = d.columns.str.replace(r'^attributes\.', '', regex=True)
    ## add file datetime column, parsed from the 'YYYY-MM-DD_HH-MM' stamp just before '.json'
    ## NOTE(review): the time zone of this stamp is not stated in this script - confirm
    d.insert(0, 'file_datetime', datetime.strptime(file[-21:-5], '%Y-%m-%d_%H-%M'))
    ## append to list
    data.append(d)

## convert to single data frame
if not data:
    # pd.concat raises an opaque error on an empty list; say what actually went wrong
    raise ValueError('no downloaded files to combine (file_list is empty)')
data = pd.concat(data)

# write final result as CSV
data.to_csv('data_out.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment