# jeanpaulrsoucy/bc-archive-data.py

## bc-archive-data.py
# Download archived BC COVID-19 hospitalization data from the Archive of Canadian COVID-19 Data
# and extract data from JSON files into a single CSV spreadsheet
# https://github.com/ccodwg/Covid19CanadaArchive

# bug: there may be a few dates with duplicate data from when the archive tool downloaded the same file twice in one day

# load modules
import os
import re
import json
from datetime import datetime
import requests # download files
import pandas as pd # manipulate data frames
from boto3 import client # Amazon S3 client

# (optional) change working directory where files will be downloaded
# os.chdir('/path/to/directory')
# NOTE: the directory must already exist; os.chdir raises FileNotFoundError otherwise
os.chdir('/home/jprs/Desktop/dl')

# define path to files of interest (e.g., BC hospitalization and ICU data)
# this is used as the key prefix when listing the bucket below
file_path = 'archive/bc/cumulative-case-death-recover-hosp-icu-by-rha'

# get list of available files
# a single list call returns at most 1,000 keys, so page through the listing
# instead of silently dropping everything past the first page
cli = client('s3')
pages = cli.get_paginator('list_objects_v2').paginate(Bucket='data.opencovid.ca', Prefix=file_path)
# a page without matching keys has no 'Contents' entry, hence the .get() default
files = [obj['Key'] for page in pages for obj in page.get('Contents', [])]

# (optional) filter out supplementary material from list of files (does not apply in this case)
# pat = re.compile('^.*/supplementary/') # match files in supplementary folder
# files = [s for s in files if not pat.match(s)]

# download files into working directory
# each file is saved under its base name (the part of the key after the last '/')
base_url = 'http://data.opencovid.ca/' # base URL of archive
for file in files:
  # time out rather than hang forever on a stalled connection
  req = requests.get(base_url + file, timeout=60)
  # fail loudly on an HTTP error so an error page is never saved as a data file
  req.raise_for_status()
  with open(os.path.basename(file), 'wb') as f:
    f.write(req.content)

# create list of downloaded files (local names, in the same order as 'files')
file_list = [os.path.basename(file) for file in files]

# combine downloaded JSON files into a single spreadsheet
data = [] # one data frame per downloaded file
for file in file_list:
  ## open file and read in as JSON (decode explicitly as UTF-8 rather than the platform default)
  with open(file, 'r', encoding='utf-8') as f:
    d = json.load(f)
  d = d['features'] # access content
  d = pd.json_normalize(d) # flatten nested records into dotted column names
  ## clean column names (raw string so that '\.' is not treated as an invalid string escape)
  d.columns = d.columns.str.replace(r'^attributes\.', '', regex=True)
  ## add file datetime column, parsed from the 16 characters that precede the last 5
  ## characters of the file name (presumably the '.json' extension)
  ## NOTE(review): the time zone of these timestamps is not stated here -- confirm against the archive
  d.insert(0, 'file_datetime', datetime.strptime(file[-21:-5], '%Y-%m-%d_%H-%M'))
  ## append to list
  data.append(d)
## convert to single data frame (renumber rows so index labels are not repeated across files)
data = pd.concat(data, ignore_index=True)

# write final result as CSV
data.to_csv('data_out.csv', index=False)
	# Download archived BC COVID-19 hospitalization data from the Archive of Canadian COVID-19 Data
	# and extract data from JSON files into a single CSV spreadsheet
	# https://github.com/ccodwg/Covid19CanadaArchive

	# bug: there may be a few dates with duplicate data from when the archive tool downloaded the same file twice in one day

	# load modules
	import os
	import re
	import json
	from datetime import datetime
	import requests # download files
	import pandas as pd # manipulate data frames
	from boto3 import client # Amazon S3 client

	# (optional) change working directory where files will be downloaded
	# os.chdir('/path/to/directory')
	# NOTE: the directory must already exist; os.chdir raises FileNotFoundError otherwise
	os.chdir('/home/jprs/Desktop/dl')

	# define path to files of interest (e.g., BC hospitalization and ICU data)
	# this is used as the key prefix when listing the bucket below
	file_path = 'archive/bc/cumulative-case-death-recover-hosp-icu-by-rha'

	# get list of available files
	# a single list call returns at most 1,000 keys, so page through the listing
	# instead of silently dropping everything past the first page
	cli = client('s3')
	pages = cli.get_paginator('list_objects_v2').paginate(Bucket='data.opencovid.ca', Prefix=file_path)
	# a page without matching keys has no 'Contents' entry, hence the .get() default
	files = [obj['Key'] for page in pages for obj in page.get('Contents', [])]

	# (optional) filter out supplementary material from list of files (does not apply in this case)
	# pat = re.compile('^.*/supplementary/') # match files in supplementary folder
	# files = [s for s in files if not pat.match(s)]

	# download files into working directory
	# each file is saved under its base name (the part of the key after the last '/')
	base_url = 'http://data.opencovid.ca/' # base URL of archive
	for file in files:
		# time out rather than hang forever on a stalled connection
		req = requests.get(base_url + file, timeout=60)
		# fail loudly on an HTTP error so an error page is never saved as a data file
		req.raise_for_status()
		with open(os.path.basename(file), 'wb') as f:
			f.write(req.content)

	# create list of downloaded files (local names, in the same order as 'files')
	file_list = [os.path.basename(file) for file in files]

	# combine downloaded JSON files into a single spreadsheet
	data = [] # one data frame per downloaded file
	for file in file_list:
		## open file and read in as JSON (decode explicitly as UTF-8 rather than the platform default)
		with open(file, 'r', encoding='utf-8') as f:
			d = json.load(f)
		d = d['features'] # access content
		d = pd.json_normalize(d) # flatten nested records into dotted column names
		## clean column names (raw string so that '\.' is not treated as an invalid string escape)
		d.columns = d.columns.str.replace(r'^attributes\.', '', regex=True)
		## add file datetime column, parsed from the 16 characters that precede the last 5
		## characters of the file name (presumably the '.json' extension)
		## NOTE(review): the time zone of these timestamps is not stated here -- confirm against the archive
		d.insert(0, 'file_datetime', datetime.strptime(file[-21:-5], '%Y-%m-%d_%H-%M'))
		## append to list
		data.append(d)
	## convert to single data frame (renumber rows so index labels are not repeated across files)
	data = pd.concat(data, ignore_index=True)

	# write final result as CSV
	data.to_csv('data_out.csv', index=False)