Generate CSV of daily deltas from nychealth/coronavirus-data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh
#####
# This script provides a delta history of all updates to case-hosp-death.csv
# based on the AS_OF from summary.csv
# https://github.com/nychealth/coronavirus-data/blob/master/case-hosp-death.csv
# https://github.com/nychealth/coronavirus-data/blob/master/summary.csv
#
# By: Jehiah Czebotar
# https://jehiah.cz/
#
#
# Usage:
#
# run from a checkout of https://github.com/nychealth/coronavirus-data
#####
# Find all the commits of case-hosp-death.csv. `git log` lists the newest
# commit first, so the last array element is the oldest revision.
COMMITS=($(git log --format="%H" -- case-hosp-death.csv))
if (( ${#COMMITS[@]} == 0 )); then
  # Without any commits every later `git show` would be handed an empty
  # revision; fail loudly instead of emitting a misleading CSV.
  echo "no commits found for case-hosp-death.csv; run from a checkout of nychealth/coronavirus-data" >&2
  exit 1
fi

# Field separators used by `read`: comma plus newline, carriage return, tab
# and space.
NEW_IFS=$'\n\r\t ,'

# header
echo "AS_OF,DATE_OF_INTEREST,NEW_COVID_CASE_COUNT,HOSPITALIZED_CASE_COUNT,DEATH_COUNT"

# Output the values from the first version of case-hosp-death.csv unchanged;
# there is no earlier revision to diff against.
AS_OF="March 25, 5:30 p.m." # from 2d90af0522c65ee77d6287a1f0469264648e3681

# zsh arrays are 1-based, so index ${#COMMITS[@]} is the last (oldest) commit.
# Command substitution drops trailing newlines and the here-string below adds
# exactly one back, which normalises files lacking a final newline; gsed
# strips stray carriage returns at end of line.
ORIGINAL_CSV=$(git show "${COMMITS[${#COMMITS[@]}]}:case-hosp-death.csv" | gsed -e 's/\r$//')

while IFS="$NEW_IFS" read -r DATE_OF_INTEREST NEW_COVID_CASE_COUNT HOSPITALIZED_CASE_COUNT DEATH_COUNT; do
  # ignore the header; $DATE_OF_INTEREST might have BOM so check second field
  if [ "$NEW_COVID_CASE_COUNT" = "NEW_COVID_CASE_COUNT" ]; then
    continue
  fi
  echo "\"${AS_OF}\",${DATE_OF_INTEREST},${NEW_COVID_CASE_COUNT},${HOSPITALIZED_CASE_COUNT},${DEATH_COUNT}"
done <<< "$ORIGINAL_CSV"
# Iterate each following version, and output only the delta against the
# version immediately before it.
# Per-date counts of the previous revision, keyed by DATE_OF_INTEREST.
declare -A CASE_MAP
declare -A HOSPITALIZED_MAP
declare -A DEATH_MAP

# Print case-hosp-death.csv as stored in commit $1, with carriage returns at
# end of line removed.
_csv_at_commit() {
  local rev=$1
  # Keep the braces: in zsh a bare "$rev:c..." would be parsed as a modifier.
  git show "${rev}:case-hosp-death.csv" | gsed -e 's/\r$//'
}

# zsh arrays are 1-based: COMMITS[1] is the newest commit and
# COMMITS[${#COMMITS[@]}] the oldest, whose rows were emitted verbatim earlier
# in the script. Walk from the second-oldest commit down to index 1; index 0
# is empty in zsh and would make `git show` read the index, not a commit.
for (( i = ${#COMMITS[@]} - 1; i >= 1; i-- )); do
  # compare between $COMMITS[$i] (more recent) and $COMMITS[$i + 1] (previous)
  CASE_MAP=()
  HOSPITALIZED_MAP=()
  DEATH_MAP=()

  # Pre-processing: handle a missing trailing newline and stray carriage
  # returns. Command substitution drops trailing newlines and the
  # here-strings below add exactly one back.
  PREVIOUS_CSV=$(_csv_at_commit "${COMMITS[$(( i + 1 ))]}")
  CURRENT_CSV=$(_csv_at_commit "${COMMITS[$i]}")
  # AS_OF is the second double-quote-delimited field on the last line of
  # summary.csv at the same commit.
  AS_OF=$(git show "${COMMITS[$i]}:summary.csv" | tail -1 | awk -F '"' '{print $2}')

  # Load the previous revision into the lookup maps. Redirecting into the
  # loop keeps it in the current shell so the map assignments persist.
  while IFS="$NEW_IFS" read -r DATE_OF_INTEREST NEW_COVID_CASE_COUNT HOSPITALIZED_CASE_COUNT DEATH_COUNT; do
    # ignore the header; $DATE_OF_INTEREST might have BOM so check second field
    if [ "$NEW_COVID_CASE_COUNT" = "NEW_COVID_CASE_COUNT" ]; then
      continue
    fi
    CASE_MAP[${DATE_OF_INTEREST}]=$NEW_COVID_CASE_COUNT
    HOSPITALIZED_MAP[${DATE_OF_INTEREST}]=$HOSPITALIZED_CASE_COUNT
    # strip a trailing whitespace character left on the last field, if any
    DEATH_MAP[${DATE_OF_INTEREST}]=${DEATH_COUNT%%[[:space:]]}
  done <<< "$PREVIOUS_CSV"

  # Diff every row of the current revision against the previous one.
  while IFS="$NEW_IFS" read -r DATE_OF_INTEREST NEW_COVID_CASE_COUNT HOSPITALIZED_CASE_COUNT DEATH_COUNT; do
    # ignore the header; $DATE_OF_INTEREST might have BOM so check second field
    if [ "$NEW_COVID_CASE_COUNT" = "NEW_COVID_CASE_COUNT" ]; then
      continue
    fi
    # DEBUG
    # echo "DATE_OF_INTEREST $DATE_OF_INTEREST CASE $NEW_COVID_CASE_COUNT H $HOSPITALIZED_CASE_COUNT D $DEATH_COUNT"
    # echo "PREV CASE ${CASE_MAP[$DATE_OF_INTEREST]} H ${HOSPITALIZED_MAP[$DATE_OF_INTEREST]} D ${DEATH_MAP[$DATE_OF_INTEREST]}"

    # When either side is missing (a new date, or an empty field) there is
    # nothing to subtract, so the current value is reported as the delta.
    if [ -z "$NEW_COVID_CASE_COUNT" ] || [ -z "${CASE_MAP[$DATE_OF_INTEREST]}" ]; then
      CASE_DELTA="$NEW_COVID_CASE_COUNT"
    else
      CASE_DELTA=$(( NEW_COVID_CASE_COUNT - ${CASE_MAP[$DATE_OF_INTEREST]} ))
    fi
    if [ -z "$HOSPITALIZED_CASE_COUNT" ] || [ -z "${HOSPITALIZED_MAP[$DATE_OF_INTEREST]}" ]; then
      HOSPITALIZED_DELTA="$HOSPITALIZED_CASE_COUNT"
    else
      HOSPITALIZED_DELTA=$(( HOSPITALIZED_CASE_COUNT - ${HOSPITALIZED_MAP[$DATE_OF_INTEREST]} ))
    fi
    if [ -z "$DEATH_COUNT" ] || [ -z "${DEATH_MAP[$DATE_OF_INTEREST]}" ]; then
      DEATH_DELTA="$DEATH_COUNT"
    else
      DEATH_DELTA=$(( DEATH_COUNT - ${DEATH_MAP[$DATE_OF_INTEREST]} ))
    fi

    # Emit the row when at least one of the three counters changed; rows
    # whose deltas are all 0 carry no information.
    if [ "${CASE_DELTA}" != 0 ] || [ "${HOSPITALIZED_DELTA}" != 0 ] || [ "${DEATH_DELTA}" != 0 ]; then
      echo "\"${AS_OF}\",${DATE_OF_INTEREST},${CASE_DELTA},${HOSPITALIZED_DELTA},${DEATH_DELTA}"
    fi
  done <<< "$CURRENT_CSV"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment