TonyLadson/cut_and_stack.R

## cut_and_stack.R
######################################################################################
#
# Cut and Stack Plot
#
#######################################################################################

library(tidyverse)
library(lubridate)
library(gridExtra)
library(scales)
library(padr)


# Read in sample flow data
# Available on Google Drive at
# https://drive.google.com/open?id=1-g1rOfeOxquPow4YM1aifdEkdBboUzu4

# For advice on reading in files stored on Google Drive
# https://stackoverflow.com/questions/33135060/read-csv-file-hosted-on-google-drive

id <- "1-g1rOfeOxquPow4YM1aifdEkdBboUzu4" # google file ID

flow_daily <- read_csv(file = sprintf("https://docs.google.com/uc?id=%s&export=download", id),
         col_types = cols(
           X1 = col_character(),
           X2 = col_double(),
           X3 = col_integer(),
           X4 = col_double(),
           X5 = col_integer(),
           X6 = col_double(),
           X7 = col_integer()
         ),
         col_names = FALSE,
         skip = 17)

# On my computer
# project_path <- "/Users/anthonyladson/Dropbox/Grad Cert/2017/Unit 1/Hydrology/Casey's Weir/"
# flow_daily <- read_csv(file =str_c(project_path, '404216_20170628/404216.csv' ),
#                        col_types = cols(
#                          X1 = col_character(),
#                          X2 = col_double(),
#                          X3 = col_integer(),
#                          X4 = col_double(),
#                          X5 = col_integer(),
#                          X6 = col_double(),
#                          X7 = col_integer()
#                        ),
#                        col_names = FALSE,
#                        skip = 17
# )

names(flow_daily) <- c('date',
                       'flow_mean',
                       'qcode_mean',
                       'flow_min',
                       'qcode_min',
                       'flow_max',
                       'qcode_max')

# Convert date

flow_daily <- flow_daily %>%
   mutate(date = as.Date(str_sub(date, 1, 10), format = '%d/%m/%Y'))  # Convert time, just use days


#______________________________________________________________________________________
# Checks


# start and end of the file

head(flow_daily)
tail(flow_daily)


# Missing data

colSums(is.na(flow_daily))

# Remove first line (contains missing data and add columns we'll use later)

flow_daily <- flow_daily %>%
  slice(2:nrow(flow_daily)) # There is only one missing value, which is the first, so lets remove it.


# Check time step

x <- flow_daily %>%
  mutate(date_diff = date - lag(date)) %>%
  count(date_diff) # ok
res <- assertthat::assert_that(x[1,2] == nrow(flow_daily)-1)


# # Look for incomplete years
# # This isn't relevant for cut and stack plots
#
# flow_daily %>%
#   mutate(flow_year = year(date)) %>%
#   group_by(flow_year) %>%
#   summarise(days_measured = n()) %>%
#   filter(!(days_measured %in% c(365,366))) # Years without 365 or 366 days
#
# # Delete 1972 and 2017 (first and last year)
#
# flow_daily <- flow_daily %>%
#   filter(!(lubridate::year(date) %in% c(1972, 2017)))

# Zero values

colSums(dplyr::near(flow_daily[,-1], 0))
# looks fine
# date  flow_mean qcode_mean   flow_min  qcode_min   flow_max  qcode_max  flow_year flow_month
# 0        560          0        783          0        557          0          0          0
# jday
# 0


# plot all data
flow_daily %>%
  ggplot(aes(date, flow_mean)) +
  geom_line() +
  scale_x_date(name = 'Date',
               date_breaks = '2 years',
               date_labels = '%Y',
               limits = c(as.Date('1972-06-01'), as.Date('2017-7-31'))) + # These limits are a work around to so x-axis goes from 1972-2018
  scale_y_continuous(name = 'Mean daily flow (ML/d)',
                     labels = comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


## Cut and stack

# ggplot version
# Uses ideas from 'Stacked Time Series in R'
# https://bscheng.com/2016/10/17/stacked-time-series-in-r/


# start and end dates
flow_daily %>%
  dplyr::summarise(min(date), max(date))


# # A tibble: 1 x 2
# `min(flow_year)` `max(flow_year)`
# <dbl>            <dbl>
#   1             1973             2016


# We will cut and stack by decades so
# Pad out to complete decades

flow_daily <- flow_daily %>% pad('day', start_val = as.Date('1970-01-01'))
flow_daily <- flow_daily %>% pad('day', end_val = as.Date('2019-12-31'))

# Add additional date information
flow_daily <- flow_daily %>%
  mutate(flow_year = year(date)) %>%
  mutate(flow_month = month(date)) %>%
  mutate(jday = yday(date))

# Add decade variable

flow_daily <- flow_daily %>%
  mutate(decade = case_when(
    flow_year >= 1970 & flow_year <= 1979 ~ '1970-1979',
    flow_year >= 1980 & flow_year <= 1989 ~ '1980-1989',
    flow_year >= 1990 & flow_year <= 1999 ~ '1990-1999',
    flow_year >= 2000 & flow_year <= 2009 ~ '2000-2009',
    flow_year >= 2010 & flow_year <= 2019 ~ '2010-2019',
    TRUE ~ NA_character_))

# We need to make the plots individually and stack them
# We need to fix the y-axis scale
# Use mean daily flow

#glimpse(flow_daily)


# check the max and min flow to set the scales
flow_daily %>%
  dplyr::summarise(min(flow_mean, na.rm = TRUE), max(flow_mean, na.rm = TRUE))

# # A tibble: 1 x 2
# `min(flow_mean, na.rm = TRUE)` `max(flow_mean, na.rm = TRUE)`
# <dbl>                          <dbl>
#   1                              0                         71366.

upper_limit <- 72000

p1979_1979 <-
  flow_daily %>%
  filter(decade == '1970-1979') %>%
  ggplot(aes(date, flow_mean)) + geom_line() +
  scale_x_date(name = '',
                date_breaks = "1 years",
                date_labels = '%Y') +
  scale_y_continuous(name = '',
                     labels = comma,
                     limits = c(0, upper_limit))

p1979_1979 # check graph


p1980_1989 <-
  flow_daily %>%
  filter(decade == '1980-1989') %>%
  ggplot(aes(date, flow_mean)) + geom_line() +
  scale_x_date(name = '',
               date_breaks = "1 years",
               date_labels = '%Y') +
  scale_y_continuous(name = '',
                     labels = comma,
                     limits = c(0, upper_limit))

p1980_1989

p1990_1999 <-
  flow_daily %>%
  filter(decade == '1990-1999') %>%
  ggplot(aes(date, flow_mean)) + geom_line() +
  scale_x_date(name = '',
               date_breaks = "1 years",
               date_labels = '%Y') +
  scale_y_continuous(name = 'Mean daily flow (ML/d)',
                     labels = comma,
                     limits = c(0, upper_limit))


p1990_1999

p2000_2009 <-
  flow_daily %>%
  filter(decade == '2000-2009') %>%
  ggplot(aes(date, flow_mean)) + geom_line() +
  scale_x_date(name = '',
               date_breaks = "1 years",
               date_labels = '%Y') +
  scale_y_continuous(name = '',
                     labels = comma,
                     limits = c(0, upper_limit))

p2000_2009

p2010_2019 <-
  flow_daily %>%
  filter(decade == '2010-2019') %>%
  ggplot(aes(date, flow_mean)) + geom_line() +
  scale_x_date(name = '',
               date_breaks = "1 years",
               date_labels = '%Y') +
  scale_y_continuous(name = '',
                     labels = comma,
                     limits = c(0, upper_limit))


p2010_2019

# Stick together

gridExtra::grid.arrange(p1979_1979, p1980_1989, p1990_1999, p2000_2009, p2010_2019, ncol = 1)
	######################################################################################
	#
	# Cut and Stack Plot
	#
	#######################################################################################

	library(tidyverse)
	library(lubridate)
	library(gridExtra)
	library(scales)
	library(padr)



	# Read in sample flow data
	# Available on Google Drive at
	# https://drive.google.com/open?id=1-g1rOfeOxquPow4YM1aifdEkdBboUzu4

	# For advice on reading in files stored on Google Drive
	# https://stackoverflow.com/questions/33135060/read-csv-file-hosted-on-google-drive

	id <- "1-g1rOfeOxquPow4YM1aifdEkdBboUzu4" # google file ID

	flow_daily <- read_csv(file = sprintf("https://docs.google.com/uc?id=%s&export=download", id),
	col_types = cols(
	X1 = col_character(),
	X2 = col_double(),
	X3 = col_integer(),
	X4 = col_double(),
	X5 = col_integer(),
	X6 = col_double(),
	X7 = col_integer()
	),
	col_names = FALSE,
	skip = 17)

	# On my computer
	# project_path <- "/Users/anthonyladson/Dropbox/Grad Cert/2017/Unit 1/Hydrology/Casey's Weir/"
	# flow_daily <- read_csv(file =str_c(project_path, '404216_20170628/404216.csv' ),
	# col_types = cols(
	# X1 = col_character(),
	# X2 = col_double(),
	# X3 = col_integer(),
	# X4 = col_double(),
	# X5 = col_integer(),
	# X6 = col_double(),
	# X7 = col_integer()
	# ),
	# col_names = FALSE,
	# skip = 17
	# )

	names(flow_daily) <- c('date',
	'flow_mean',
	'qcode_mean',
	'flow_min',
	'qcode_min',
	'flow_max',
	'qcode_max')

	# Convert date

	flow_daily <- flow_daily %>%
	mutate(date = as.Date(str_sub(date, 1, 10), format = '%d/%m/%Y')) # Convert time, just use days


	#______________________________________________________________________________________
	# Checks


	# start and end of the file

	head(flow_daily)
	tail(flow_daily)


	# Missing data

	colSums(is.na(flow_daily))

	# Remove first line (contains missing data and add columns we'll use later)

	flow_daily <- flow_daily %>%
	slice(2:nrow(flow_daily)) # There is only one missing value, which is the first, so lets remove it.


	# Check time step

	x <- flow_daily %>%
	mutate(date_diff = date - lag(date)) %>%
	count(date_diff) # ok
	res <- assertthat::assert_that(x[1,2] == nrow(flow_daily)-1)



	# # Look for incomplete years
	# # This isn't relevant for cut and stack plots
	#
	# flow_daily %>%
	# mutate(flow_year = year(date)) %>%
	# group_by(flow_year) %>%
	# summarise(days_measured = n()) %>%
	# filter(!(days_measured %in% c(365,366))) # Years without 365 or 366 days
	#
	# # Delete 1972 and 2017 (first and last year)
	#
	# flow_daily <- flow_daily %>%
	# filter(!(lubridate::year(date) %in% c(1972, 2017)))

	# Zero values

	colSums(dplyr::near(flow_daily[,-1], 0))
	# looks fine
	# date flow_mean qcode_mean flow_min qcode_min flow_max qcode_max flow_year flow_month
	# 0 560 0 783 0 557 0 0 0
	# jday
	# 0


	# plot all data
	flow_daily %>%
	ggplot(aes(date, flow_mean)) +
	geom_line() +
	scale_x_date(name = 'Date',
	date_breaks = '2 years',
	date_labels = '%Y',
	limits = c(as.Date('1972-06-01'), as.Date('2017-7-31'))) + # These limits are a work around to so x-axis goes from 1972-2018
	scale_y_continuous(name = 'Mean daily flow (ML/d)',
	labels = comma) +
	theme(axis.text.x = element_text(angle = 45, hjust = 1))



	## Cut and stack

	# ggplot version
	# Uses ideas from 'Stacked Time Series in R'
	# https://bscheng.com/2016/10/17/stacked-time-series-in-r/


	# start and end dates
	flow_daily %>%
	dplyr::summarise(min(date), max(date))


	# # A tibble: 1 x 2
	# `min(flow_year)` `max(flow_year)`
	# <dbl> <dbl>
	# 1 1973 2016


	# We will cut and stack by decades so
	# Pad out to complete decades

	flow_daily <- flow_daily %>% pad('day', start_val = as.Date('1970-01-01'))
	flow_daily <- flow_daily %>% pad('day', end_val = as.Date('2019-12-31'))

	# Add additional date information
	flow_daily <- flow_daily %>%
	mutate(flow_year = year(date)) %>%
	mutate(flow_month = month(date)) %>%
	mutate(jday = yday(date))

	# Add decade variable

	flow_daily <- flow_daily %>%
	mutate(decade = case_when(
	flow_year >= 1970 & flow_year <= 1979 ~ '1970-1979',
	flow_year >= 1980 & flow_year <= 1989 ~ '1980-1989',
	flow_year >= 1990 & flow_year <= 1999 ~ '1990-1999',
	flow_year >= 2000 & flow_year <= 2009 ~ '2000-2009',
	flow_year >= 2010 & flow_year <= 2019 ~ '2010-2019',
	TRUE ~ NA_character_))

	# We need to make the plots individually and stack them
	# We need to fix the y-axis scale
	# Use mean daily flow

	#glimpse(flow_daily)


	# check the max and min flow to set the scales
	flow_daily %>%
	dplyr::summarise(min(flow_mean, na.rm = TRUE), max(flow_mean, na.rm = TRUE))

	# # A tibble: 1 x 2
	# `min(flow_mean, na.rm = TRUE)` `max(flow_mean, na.rm = TRUE)`
	# <dbl> <dbl>
	# 1 0 71366.

	upper_limit <- 72000

	p1979_1979 <-
	flow_daily %>%
	filter(decade == '1970-1979') %>%
	ggplot(aes(date, flow_mean)) + geom_line() +
	scale_x_date(name = '',
	date_breaks = "1 years",
	date_labels = '%Y') +
	scale_y_continuous(name = '',
	labels = comma,
	limits = c(0, upper_limit))

	p1979_1979 # check graph



	p1980_1989 <-
	flow_daily %>%
	filter(decade == '1980-1989') %>%
	ggplot(aes(date, flow_mean)) + geom_line() +
	scale_x_date(name = '',
	date_breaks = "1 years",
	date_labels = '%Y') +
	scale_y_continuous(name = '',
	labels = comma,
	limits = c(0, upper_limit))

	p1980_1989

	p1990_1999 <-
	flow_daily %>%
	filter(decade == '1990-1999') %>%
	ggplot(aes(date, flow_mean)) + geom_line() +
	scale_x_date(name = '',
	date_breaks = "1 years",
	date_labels = '%Y') +
	scale_y_continuous(name = 'Mean daily flow (ML/d)',
	labels = comma,
	limits = c(0, upper_limit))



	p1990_1999

	p2000_2009 <-
	flow_daily %>%
	filter(decade == '2000-2009') %>%
	ggplot(aes(date, flow_mean)) + geom_line() +
	scale_x_date(name = '',
	date_breaks = "1 years",
	date_labels = '%Y') +
	scale_y_continuous(name = '',
	labels = comma,
	limits = c(0, upper_limit))

	p2000_2009

	p2010_2019 <-
	flow_daily %>%
	filter(decade == '2010-2019') %>%
	ggplot(aes(date, flow_mean)) + geom_line() +
	scale_x_date(name = '',
	date_breaks = "1 years",
	date_labels = '%Y') +
	scale_y_continuous(name = '',
	labels = comma,
	limits = c(0, upper_limit))


	p2010_2019

	# Stick together

	gridExtra::grid.arrange(p1979_1979, p1980_1989, p1990_1999, p2000_2009, p2010_2019, ncol = 1)