## Kat Li YiLi225

## process_data.R
###### Process data and subset on countries
# sort(unique(series_df$Country.Region))
## Countries kept for the per-country analysis
selected_countries <- c('China', 'Italy', 'US', 'Iran')

## sum group by country by status
## Locate the date columns. read.csv() (check.names = TRUE) prefixes column
## names that start with a digit with 'X', so the pattern is anchored to the
## start of the name; an unanchored 'X' would also match any unrelated column
## that merely contains a capital X.
date_col_idx <- grep('^X', colnames(series_df))

country_data = series_df %>%
  filter(Country.Region %in% selected_countries) %>%
  select(c(Country.Region, Status, colnames(.)[date_col_idx])) %>%

## check_cols.R
### check whether the column names of 3 datasets match up
### Collect the column names of every dataset. lapply() always returns a list,
### so this keeps working even when the datasets differ in column count
### (sapply() would silently switch from a matrix to a list in that case).
columns <- lapply(series_data_, colnames)

### Compare every dataset's column names against the first one instead of
### writing out each pair by hand: this scales to any number of datasets and
### any number of columns.
### TRUE only when all datasets have identical column names in the same order.
all(vapply(columns, identical, logical(1), columns[[1]]))

## combine.R
#### Append a list of datasets into one single data frame
#### Stack every data frame in the list into one data frame (row-wise)
series_df <- do.call(rbind, series_data_)

##### (optional) Shorten two long country names
series_df[series_df$Country.Region %in% 'United Kingdom', 'Country.Region'] <- 'UK'
series_df[series_df$Country.Region %in% 'Korea, South', 'Country.Region'] <- 'Korea S'

## helper.R
#### helper function for trajectory line chart:
#### note: this function relies on a global variable!!!
helper_vis_continuous <- function(dat = country_data, country = 'US') {
  ## vertical structure for ggplot
  current_dat = dat %>%
    filter(Country.Region == country) %>%
    reshape2::melt(.) %>%
    set_colnames(c('Country', 'Status', 'Date', 'Total'))

  ## starts with the date when 1st case was confirmed for this country

## wide_to_long.R
#### Select the countries for plotting, and convert wide format to long
## Keep the selected countries, melt the wide table to long format, then give
## the four resulting columns plotting-friendly names.
current_dat <- set_colnames(
  reshape2::melt(
    filter(country_data, Country.Region %in% selected_countries)
  ),
  c('Country', 'Status', 'Date', 'Total')
)

## starts with the date when 1st case confirmed
find_case1_onwards <- function(country_name) {
  case1_idx = which(current_dat[current_dat$Country == country_name, 'Total'] > 0)[1]


## ploty_facet_wrap.R
#### Plotly with ggplot facet_wrap
## One panel per country (free scales); one line per status, distinguished by
## both colour and linetype.
## - `data =` is spelled out rather than relying on partial matching of `dat`.
## - aes() replaces the deprecated aes_string(); the mapped columns are the same.
## - the title is a plain string: sprintf() had no format arguments to fill in.
country_plot <- ggplot(data = current_dat,
                       aes(x = Date, y = Total, color = Status, group = Status, linetype = Status)) +
  geom_line(lwd = 1.2) +

  facet_wrap(~ Country, scales = "free") +

  labs(title = 'Trajectories of the Status of Coronavirus \n') +
  xlab('Date') + ylab('Total Numbers') +
  theme_bw() +

## heatmap.R
####### Heatmap from 2/22 to 3/28
library(gplots)
## 1000-step colour ramp: light blue -> black -> red
my_palette <- colorRampPalette(c("light blue", "black", "red"))(n = 1000)

## keep only the rows for confirmed cases
heatmap_dat <- country_data %>%
  filter(Status == 'confirmed')

## subset: from Feb 22nd onwards
## match() returns a single index or NA, so a missing column is reported
## clearly instead of failing inside `:` with "argument of length 0".
col_idx <- match('2_22_20', colnames(heatmap_dat))
if (is.na(col_idx)) {
  stop("Column '2_22_20' not found in heatmap_dat; check the date column names",
       call. = FALSE)
}
## keep the first two columns plus every column from '2_22_20' to the last one
heatmap_dat <- heatmap_dat[, c(1, 2, col_idx:ncol(heatmap_dat))]

## animate_1.R
#####======= static and animated plots
## Largest total in the data; the vertical measures below are all derived
## from it (presumably used to lay out the bar chart built from
## confirmed_formatted -- their use is not shown in this snippet).
max_total <- max(confirmed_formatted$Total)
eighth_of_max <- floor(max_total / 8)

total_text_y <- 0.87 * max_total
panel_size_y <- max_total * 1.15
## eight evenly spaced values from one eighth of the maximum upwards
vline_original_y <- seq(eighth_of_max, max_total, by = eighth_of_max)

## text sizes
country_font_size <- 10
bar_end_num_size <- 11

staticplot = ggplot(confirmed_formatted,

## animate_2.R
#### Specify the transition length and ease_aes to give it a smoother transition
## Relative lengths handed to transition_states(): no pause on each state,
## 3 units spent moving between consecutive states.
current_state_len <- 0
current_transition_len <- 3

## Animate the static plot over Date; {closest_state} in the title is filled
## in with the state currently being shown.
## Title typo fixed: 'Spead' -> 'Spread'.
anim <- staticplot +
  transition_states(Date, transition_length = current_transition_len, state_length = current_state_len) +
  ease_aes('cubic-in-out') +
  view_follow(fixed_x = TRUE, fixed_y = c(-10, NA))  +
  labs(title = 'Spread of Confirmed Cases per day: {closest_state}',
       subtitle = 'Top 10 Countries/Regions',

## read_in_data.R
######## Point to your data directory
## List everything in the data directory, then keep only the CSV files.
## The pattern is a regular expression: the dot is escaped and the match is
## anchored to the end of the name, so only names that really end in ".csv"
## qualify (a bare '.csv' would also match e.g. "mycsv_notes.txt").
series_all_files <- list.files(series_data_dir)
series_data_files <- series_all_files[grepl('\\.csv$', series_all_files)]

print(sprintf('Total data files = %s', length(series_data_files)))

## Read each CSV file into a data frame.
series_data_ <- lapply(series_data_files,
                       function(i) {
                         ## file.path() builds the path with the proper separator
                         dat <- read.csv(file.path(series_data_dir, i), stringsAsFactors = FALSE)
                         ## file name without its trailing ".csv" extension only
                         file_ <- sub('\\.csv$', '', i)
	###### Process data and subset on countries
	# sort(unique(series_df$Country.Region))
	## Countries kept for the per-country analysis
	selected_countries <- c('China', 'Italy', 'US', 'Iran')

	## sum group by country by status
	## Locate the date columns. read.csv() (check.names = TRUE) prefixes column
	## names that start with a digit with 'X', so the pattern is anchored to the
	## start of the name; an unanchored 'X' would also match any unrelated column
	## that merely contains a capital X.
	date_col_idx <- grep('^X', colnames(series_df))

	country_data = series_df %>%
	filter(Country.Region %in% selected_countries) %>%
	select(c(Country.Region, Status, colnames(.)[date_col_idx])) %>%
	### check whether the column names of 3 datasets match up
	### Collect the column names of every dataset. lapply() always returns a list,
	### so this keeps working even when the datasets differ in column count
	### (sapply() would silently switch from a matrix to a list in that case).
	columns <- lapply(series_data_, colnames)

	### Compare every dataset's column names against the first one instead of
	### writing out each pair by hand: this scales to any number of datasets and
	### any number of columns.
	### TRUE only when all datasets have identical column names in the same order.
	all(vapply(columns, identical, logical(1), columns[[1]]))
	#### Append a list of datasets into one single data frame
	#### Stack every data frame in the list into one data frame (row-wise)
	series_df <- do.call(rbind, series_data_)

	##### (optional) Shorten two long country names
	series_df[series_df$Country.Region %in% 'United Kingdom', 'Country.Region'] <- 'UK'
	series_df[series_df$Country.Region %in% 'Korea, South', 'Country.Region'] <- 'Korea S'
	#### helper function for trajectory line chart:
	#### note: this function relies on a global variable!!!
	helper_vis_continuous <- function(dat = country_data, country = 'US') {
	## vertical structure for ggplot
	current_dat = dat %>%
	filter(Country.Region == country) %>%
	reshape2::melt(.) %>%
	set_colnames(c('Country', 'Status', 'Date', 'Total'))

	## starts with the date when 1st case was confirmed for this country
	#### Select the countries for plotting, and convert wide format to long
	## Keep the selected countries, melt the wide table to long format, then give
	## the four resulting columns plotting-friendly names.
	current_dat <- set_colnames(
	  reshape2::melt(
	    filter(country_data, Country.Region %in% selected_countries)
	  ),
	  c('Country', 'Status', 'Date', 'Total')
	)

	## starts with the date when 1st case confirmed
	find_case1_onwards <- function(country_name) {
	case1_idx = which(current_dat[current_dat$Country == country_name, 'Total'] > 0)[1]
	#### Plotly with ggplot facet_wrap
	## One panel per country (free scales); one line per status, distinguished by
	## both colour and linetype.
	## - `data =` is spelled out rather than relying on partial matching of `dat`.
	## - aes() replaces the deprecated aes_string(); the mapped columns are the same.
	## - the title is a plain string: sprintf() had no format arguments to fill in.
	country_plot <- ggplot(data = current_dat,
	                       aes(x = Date, y = Total, color = Status, group = Status, linetype = Status)) +
	  geom_line(lwd = 1.2) +

	  facet_wrap(~ Country, scales = "free") +

	  labs(title = 'Trajectories of the Status of Coronavirus \n') +
	  xlab('Date') + ylab('Total Numbers') +
	  theme_bw() +
	####### Heatmap from 2/22 to 3/28
	library(gplots)
	## 1000-step colour ramp: light blue -> black -> red
	my_palette <- colorRampPalette(c("light blue", "black", "red"))(n = 1000)

	## keep only the rows for confirmed cases
	heatmap_dat <- country_data %>%
	  filter(Status == 'confirmed')

	## subset: from Feb 22nd onwards
	## match() returns a single index or NA, so a missing column is reported
	## clearly instead of failing inside `:` with "argument of length 0".
	col_idx <- match('2_22_20', colnames(heatmap_dat))
	if (is.na(col_idx)) {
	  stop("Column '2_22_20' not found in heatmap_dat; check the date column names",
	       call. = FALSE)
	}
	## keep the first two columns plus every column from '2_22_20' to the last one
	heatmap_dat <- heatmap_dat[, c(1, 2, col_idx:ncol(heatmap_dat))]
	#####======= static and animated plots
	## Largest total in the data; the vertical measures below are all derived
	## from it (presumably used to lay out the bar chart built from
	## confirmed_formatted -- their use is not shown in this snippet).
	max_total <- max(confirmed_formatted$Total)
	eighth_of_max <- floor(max_total / 8)

	total_text_y <- 0.87 * max_total
	panel_size_y <- max_total * 1.15
	## eight evenly spaced values from one eighth of the maximum upwards
	vline_original_y <- seq(eighth_of_max, max_total, by = eighth_of_max)

	## text sizes
	country_font_size <- 10
	bar_end_num_size <- 11

	staticplot = ggplot(confirmed_formatted,
	#### Specify the transition length and ease_aes to give it a smoother transition
	## Relative lengths handed to transition_states(): no pause on each state,
	## 3 units spent moving between consecutive states.
	current_state_len <- 0
	current_transition_len <- 3

	## Animate the static plot over Date; {closest_state} in the title is filled
	## in with the state currently being shown.
	## Title typo fixed: 'Spead' -> 'Spread'.
	anim <- staticplot +
	  transition_states(Date, transition_length = current_transition_len, state_length = current_state_len) +
	  ease_aes('cubic-in-out') +
	  view_follow(fixed_x = TRUE, fixed_y = c(-10, NA)) +
	  labs(title = 'Spread of Confirmed Cases per day: {closest_state}',
	       subtitle = 'Top 10 Countries/Regions',
	######## Point to your data directory
	## List everything in the data directory, then keep only the CSV files.
	## The pattern is a regular expression: the dot is escaped and the match is
	## anchored to the end of the name, so only names that really end in ".csv"
	## qualify (a bare '.csv' would also match e.g. "mycsv_notes.txt").
	series_all_files <- list.files(series_data_dir)
	series_data_files <- series_all_files[grepl('\\.csv$', series_all_files)]

	print(sprintf('Total data files = %s', length(series_data_files)))

	## Read each CSV file into a data frame.
	series_data_ <- lapply(series_data_files,
	                       function(i) {
	                         ## file.path() builds the path with the proper separator
	                         dat <- read.csv(file.path(series_data_dir, i), stringsAsFactors = FALSE)
	                         ## file name without its trailing ".csv" extension only
	                         file_ <- sub('\\.csv$', '', i)