Skip to content

Instantly share code, notes, and snippets.

@jeanpaulrsoucy
Created September 22, 2021 00:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeanpaulrsoucy/6bb99e38c6c83520041516567a84f7e2 to your computer and use it in GitHub Desktop.
Save jeanpaulrsoucy/6bb99e38c6c83520041516567a84f7e2 to your computer and use it in GitHub Desktop.
Download archived BC COVID-19 hospitalization and ICU data into a single spreadsheet
# Download archived BC COVID-19 hospitalization data from the Archive of Canadian COVID-19 Data
# and extract data from JSON files into a single CSV spreadsheet
# https://github.com/ccodwg/Covid19CanadaArchive
# bug: there may be a few dates with duplicate data from when the archive tool downloaded the same file twice in one day
# load modules
import os
import re
import json
from datetime import datetime
import requests # download files
import pandas as pd # manipulate data frames
from boto3 import client # Amazon S3 client
# (optional) change the working directory; downloaded files and the output CSV
# are written using relative paths, so they end up in this directory
# os.chdir('/path/to/directory')
# NOTE: the path below is machine-specific; edit it (or remove the call) before
# running elsewhere, because os.chdir raises FileNotFoundError when the
# directory does not exist
os.chdir('/home/jprs/Desktop/dl')
# define path to files of interest (e.g., BC hospitalization and ICU data)
file_path = 'archive/bc/cumulative-case-death-recover-hosp-icu-by-rha'
# get list of available files
# a single listing request returns at most 1000 keys, so page through the
# results to avoid silently dropping files once the prefix holds more than that
cli = client('s3')
paginator = cli.get_paginator('list_objects_v2')
files = [
    obj['Key']
    for page in paginator.paginate(Bucket='data.opencovid.ca', Prefix=file_path)
    # 'Contents' is absent from a page when no key matches the prefix
    for obj in page.get('Contents', [])
]
# fail early with a readable message instead of an opaque error further down
if not files:
    raise RuntimeError('no archived files found under prefix ' + repr(file_path))
# (optional) filter out supplementary material from list of files (does not apply in this case)
# pat = re.compile('^.*/supplementary/') # match files in supplementary folder
# files = [s for s in files if not pat.match(s)]
# download files into working directory
base_url = 'http://data.opencovid.ca/' # base URL of archive
# create list of downloaded files (local file names, in download order)
file_list = []
for key in files:
    local_name = os.path.basename(key) # keep only the file name, drop the key's folders
    # time out rather than hang indefinitely on a stalled connection
    resp = requests.get(base_url + key, timeout=60)
    # stop on an HTTP error instead of saving the error body as if it were data
    resp.raise_for_status()
    with open(local_name, 'wb') as out:
        out.write(resp.content)
    file_list.append(local_name)
# combine downloaded JSON files into a single spreadsheet
frames = [] # one data frame per archived file
for name in file_list:
    ## open file and read in as JSON
    # read as UTF-8 explicitly rather than relying on the platform default encoding
    with open(name, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    features = payload['features'] # access content
    frame = pd.json_normalize(features) # flatten
    # raw string so that '\.' reaches the regex engine as an escaped literal dot
    frame.columns = frame.columns.str.replace(r'^attributes\.', '', regex=True) # clean column names
    ## add file datetime column
    # the timestamp is the 16 characters that precede the last 5 characters of
    # the file name (presumably the '.json' extension)
    # NOTE(review): the timezone of these timestamps is not stated in this
    # script; confirm against the archive's documentation
    file_datetime = datetime.strptime(name[-21:-5], '%Y-%m-%d_%H-%M')
    frame.insert(0, 'file_datetime', file_datetime)
    ## append to list
    frames.append(frame)
## convert to single data frame
data = pd.concat(frames)
# write final result as CSV
data.to_csv('data_out.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment