Last active
June 15, 2019 04:41
-
-
Save TonyLadson/c67fa8efe355ba97375fd40a411618d7 to your computer and use it in GitHub Desktop.
Analysis of point rainfall temporal patterns from Australian Rainfall and Runoff https://tonyladson.wordpress.com/2019/04/16/arr-point-temporal-patterns/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I previously looked at areal patterns https://tonyladson.wordpress.com/2019/04/02/check-arr-temporal-patterns-before-you-use-them/ | |
# This blog extends the analysis to point patterns | |
# All data files are available at https://drive.google.com/open?id=1-RSPFSuAbOFZRIivO8YHfNlBJijwcMRT | |
# Packages ---------------------------------------------------------------- | |
devtools::install_github("thomasp85/patchwork") | |
library(tidyverse) | |
library(stringr) | |
library(rlang) | |
library(ggmap) | |
library(patchwork) | |
#help(package = 'stringr') | |
# Load data ------------------------------------------ | |
# Download point pattern increment files and all states files ----------------------------------------------------- | |
# There are 12 regions | |
# 1. Rangelands West | |
# 2. Wet Tropics | |
# 3. Rangelands | |
# 4. Central Slopes | |
# 5. Monsoonal North | |
# 6. Murray Basin | |
# 7. East Coast North | |
# 8. East Coast South | |
# 9. Southern Slopes Mainland | |
# 10. Southern Slopes Tasmania | |
# 11. East flatlands | |
# 12. West flatlands | |
# Two files for each region | |
# An increments file - which has the temporal patterns | |
# An Allstats file which as information on the storms that were used to generate the temporal patterns. | |
# Get all the point Increment files | |
# All data files are available at https://drive.google.com/open?id=1-RSPFSuAbOFZRIivO8YHfNlBJijwcMRT | |
list.files('../data') | |
all_increment_files <- str_subset(list.files('../data'), '(Increments)') # Extract files that contain 'Increments' | |
point_increment_files <- all_increment_files[!str_detect(all_increment_files, 'Areal')] # Remove those that contain 'Areal' | |
length(point_increment_files) # 12, as expected | |
point_increment_files | |
# [1] "CS_Increments.csv" "ECnorth_Increments.csv" "ECsouth_Increments.csv" "FLTeast_Increments .csv" "FLTwest_Increments.csv" | |
# [6] "MB_Increments.csv" "MN_Increments.csv" "R_Increments.csv" "Rwest_Increments.csv" "SSmainland_Increments.csv" | |
# [11] "SStas_Increments.csv" "WT_Increments.csv" | |
# list the All Stats files | |
all_stats_files <- str_subset(list.files('../data'), 'AllStats') # Extract files that contain 'AllStats' | |
point_stats_files <- all_stats_files[!str_detect(all_stats_files, 'Areal')] | |
point_stats_files | |
# [1] "CS_AllStats.csv" "ECnorth_AllStats.csv" "ECsouth_AllStats.csv" "FLTeast_AllStats .csv" "FLTwest_AllStats.csv" "MB_AllStats.csv" | |
# [7] "MN_AllStats.csv" "R_AllStats.csv" "Rwest_AllStats .csv" "SSmainland_AllStats.csv" "SStas_AllStats.csv" "WT_AllStats.csv" | |
# Function to download an all stats file, convert to a dataframe and store the whole lot in a data frame | |
Get_burst_stats <- function(fname) { | |
# Read in all the patterns | |
# One pattern per line | |
#fname <- "CS_AllStats.csv" | |
burst_stats <- read_lines(file = file.path('../data', fname), skip = 1, n_max = -1L) | |
# Split the strings at commas | |
burst_stats <- str_split(burst_stats, ',') | |
# Convert to data frame | |
burst_stats <- as_data_frame(rlang::exec('rbind', !!!burst_stats)) | |
# name the columns | |
names(burst_stats) <- c('eventID','region', 'region_source', | |
'burst_duration','burst_loading', 'original_burst_depth', | |
'aep_window', 'aep_source', 'burst_start', 'burst_end', | |
'event_ref_number', 'pluvio_ref', 'offical_gauge', | |
'lat', 'lon') | |
burst_stats | |
} | |
# Try on one file | |
Get_burst_stats("MN_AllStats.csv") | |
# Get stats for all regions | |
all_regions_stats <- map_df(.x = point_stats_files, .f = Get_burst_stats) | |
all_regions_stats | |
# Function to calcuate the NUI - non-uniformity index for a rainfall pattern | |
Chisquare_p <- function(p) { | |
p <- p[-(1:5)] # Remove the first 5 elements | |
p <- as.numeric(p) # Convert to numeric | |
p_e <- rep(100/length(p), times = length(p)) # Expected | |
chi <- sum(((p - p_e)^2)/p_e) # o - e | |
-log(-pchisq(chi, df = length(p) - 1, log.p = TRUE)) | |
# chisquare p value setting log = TRUE because some values are very small | |
# Then take the log of the this value to make it a reasonable size | |
} | |
# Function to read in patterns, calculate NUI and return a data frame with information: | |
# eventID, duration_min, duration_hr, interval_min, region, NUI | |
# fname is the file name of the increments file | |
Calc_NUI_for_patterns <- function(fname) { | |
#fname <- "CS_Increments.csv" | |
# Read in all the patterns | |
# One pattern per line | |
pat <- read_lines(file = file.path('../data', fname), skip = 1, n_max = -1L) | |
#pat | |
# remove trialing commas | |
pat <- str_replace(pat, '[,]*$', '') | |
#pat | |
# Split the strings at commas | |
pat <- str_split(pat, ',') | |
#pat | |
# Calculate the Chisquare statistic for each pattern | |
chisq_stat <- unlist(map(.x = pat, .f = Chisquare_p)) | |
#barplot(sort(chisq_stat)) | |
# Function to grab the first 5 fields from a temporal pattern | |
# The first 5 entres are: | |
# 1. EventID | |
# 2. Duration (minutes) | |
# 3. Time step | |
# 4. Region | |
# 5. AEP | |
First_five <- function(p) { | |
p <- p[1:5] | |
} | |
names(pat) <- 1:length(pat) # pat needs to have names for map_df to work | |
pat_db <- map_df(.x = pat, .f= First_five) # get first 5 fields | |
pat_db <- as_data_frame(t(as.matrix(pat_db))) # need to transpose to convert to data frame | |
names(pat_db) <- c('eventID', 'duration_min', 'time_step', 'region', 'aep') | |
# Add the non_uniformity index value NUI | |
pat_db <- pat_db %>% | |
mutate(NUI = chisq_stat) %>% | |
mutate(duration_min = as.integer(duration_min)) %>% | |
mutate(row_n = 1:length(pat)) %>% | |
mutate(duration_hr = as.integer(duration_min/60)) %>% | |
select(row_n, eventID, duration_min, duration_hr, time_step, region, aep, NUI) | |
# return the data for each region | |
pat_db %>% | |
arrange(desc(NUI)) | |
} | |
# Try on one file | |
Calc_NUI_for_patterns("CS_Increments.csv") | |
# Process all increment files to creat a summary for all patterns | |
all_point_patterns_sum <- map_df(.x = point_increment_files, .f = Calc_NUI_for_patterns) | |
all_point_patterns_sum <- all_point_patterns_sum %>% arrange(desc(NUI)) | |
all_point_patterns_sum | |
#Attach storm data to pattern summary | |
all_point_patterns_sum <- all_point_patterns_sum %>% | |
left_join(all_regions_stats, by = c("region" = "region", "eventID" = "eventID")) %>% | |
arrange(desc(NUI)) | |
all_point_patterns_sum %>% View | |
all_point_patterns_sum %>% | |
select(2:9) %>% View | |
# row_n eventID duration_min duration_hr time_step region aep NUI region_source | |
# <int> <chr> <int> <int> <chr> <chr> <chr> <dbl> <chr> | |
# 1 665 1208 8640 144 180 Rangelands (West) frequent 419. Rangelands (West) | |
# 2 706 1182 10080 168 180 Rangelands (West) intermediate 416. Rangelands (West) | |
# 3 697 1973 10080 168 180 Rangelands frequent 375. Rangelands | |
# 4 676 1183 8640 144 180 Rangelands intermediate 357. Rangelands | |
# 5 676 1181 8640 144 180 Rangelands (West) intermediate 357. Rangelands (West) | |
# 6 695 2870 10080 168 180 Central Slopes frequent 342. Central Slopes | |
# 7 703 1083 10080 168 180 Rangelands intermediate 341. Rangelands | |
# 8 705 1079 10080 168 180 Rangelands (West) intermediate 341. Rangelands (West) | |
# 9 696 1971 10080 168 180 Rangelands frequent 338. Rangelands | |
# 10 673 1082 8640 144 180 Rangelands intermediate 326. Rangelands | |
# 11 673 1078 8640 144 180 Rangelands (West) intermediate 326. Rangelands (West) | |
# 12 691 7838 10080 168 180 S-SW Flatlands (East) frequent 305. S-SW Flatlands (East) | |
# 13 662 7837 8640 144 180 S-SW Flatlands (East) frequent 303. S-SW Flatlands (East) | |
# 14 700 1979 10080 168 180 Rangelands frequent 284. Rangelands | |
# 15 633 7836 7200 120 180 S-SW Flatlands (East) frequent 283. S-SW Flatlands (East) | |
# 16 644 1081 7200 120 180 Rangelands intermediate 283. Rangelands | |
# 17 644 1077 7200 120 180 Rangelands (West) intermediate 283. Rangelands (West) | |
# 18 699 5767 10080 168 180 East Coast (North) frequent 279. East Coast (North) | |
# 19 698 1264 10080 168 180 Rangelands (West) frequent 278. Rangelands (West) | |
# 20 699 2874 10080 168 180 Central Slopes frequent 274. Central Slopes | |
# 21 694 1263 10080 168 180 Rangelands frequent 272. Rangelands | |
#saveRDS(all_point_patterns_sum, file = file.path('../data', 'all_point_patterns_sum.rds')) | |
# Process patterns for graphing ------------------------------------------- | |
# Function to create list of patterns | |
# Patterns need to be stored in a list rather than a data frame because the number of increments vary | |
List_patterns <- function(fname) { | |
pat <- read_lines(file = file.path('../data', fname ), skip = 1, n_max = -1L) | |
# strip off training commas | |
pat <- str_replace(pat, '[,]*$', '') | |
# Split the strings at commas | |
pat <- str_split(pat, ',') | |
# Get the region names and the event IDs so that we can name list | |
# elements and then pull individual patterns out. | |
# The region names are the 4th element and the EventID number is the first | |
patIDs <- map_chr(.x = 1:length(pat), .f = function(i)(str_c (pat[[i]][4], pat[[i]][1], sep = '_'))) | |
patIDs <- str_to_lower(patIDs) # convert to lower case | |
patIDs <- str_replace_all(patIDs, ' ', '_') # convert to snake case | |
names(pat) <- patIDs | |
return(pat) | |
} | |
# Apply to all increments files | |
all_point_increments <- map(.x = point_increment_files, .f = List_patterns) | |
all_point_increments | |
# This produces a list of 12 lists | |
# Use purrr::flatten to make into a single list | |
all_point_increments <- purrr::flatten(all_point_increments) | |
#head(all_increments) | |
#tail(all_increments) | |
# What are the region names | |
all_names <- names(all_point_increments) | |
unique(str_replace(all_names, '[0-9]+', '')) # remove the trailling numbers so we are just left with the names, | |
# [1] "central_slopes_" "east_coast_(north)_" "east_coast_(south)_" "s-sw_flatlands_(east)_" "s-sw_flatlands_(west)_" | |
# [6] "murray_basin_" "monsoonal_north_" "rangelands_" "rangelands_(west)_" "southern_slopes_(mainland)_" | |
# [11] "southern_slopes_(tasmania)_" "wet_tropics_" | |
# Save all_point_increments file for use in other scripts | |
#saveRDS(all_point_increments, file = file.path('../data','all_point_increments.rds')) | |
# Function to graph a pattern given the region name and event id e.g. central_slopes_3565 | |
Graph_pattern <- function(pat_name_id) { | |
pattern_ids = names(all_point_increments) | |
# if only a pattern number is provided | |
# change to region_number format, i.e. find the pattern in the names of the all_increments list | |
if(is.numeric(pat_name_id)) { | |
pat_name_id = pattern_ids[(str_detect(pattern_ids, as.character(pat_name_id)))] | |
} | |
pat_data <- all_point_increments[[pat_name_id]] | |
my_eventID <- pat_data[1] | |
my_duration <- pat_data[2] | |
my_increments <- pat_data[3] | |
my_region <- pat_data[4] | |
my_aep <- pat_data[5] | |
my_breaks <- seq(0, 60, by = 5) # The maximum number of increments is 56 | |
# so setting this to 60 will always work | |
my_title <- sprintf('Areal Temporal Pattern %s',my_eventID) | |
my_subtitle <- sprintf('Region = %s, AEP = %s, \nDuration = %s min (%.0f hour), %i increments of %s min', | |
my_region, my_aep, my_duration, as.numeric(my_duration)/60, as.numeric(my_duration)/as.numeric(my_increments), my_increments) | |
# remove header information just leaving numbers | |
pat_data <- as.numeric(pat_data[-1:-5] ) | |
# Create dataframe for plotting | |
pat_data_df <- data_frame(increments = 1:length(pat_data), percentages = pat_data ) | |
p <- pat_data_df %>% | |
ggplot(aes(x = increments, y = pat_data)) + | |
geom_col(fill = 'blue', colour = 'black') + | |
scale_y_continuous(name = 'Percentage in each increment') + | |
scale_x_continuous(breaks = my_breaks) + | |
labs(title = my_title, | |
subtitle = my_subtitle) + | |
theme_grey(base_size = 5) | |
# five minors per major | |
#https://stackoverflow.com/questions/39691591/ggplot2-integer-multiple-of-minor-breaks-per-major-break | |
majors <- ggplot_build(p)$layout$panel_params[[1]]$x.major_source | |
multiplier <- 5 | |
minors <- seq(from = min(majors), | |
to = max(majors), | |
length.out = ((length(majors) - 1) * multiplier) + 1) | |
p <- p + scale_x_continuous(name = 'Increments', breaks = my_breaks, minor_breaks = minors) | |
suppressMessages(print(p)) | |
} | |
# Create graphs | |
p1 <- Graph_pattern(1208) | |
p2 <-Graph_pattern(1182) | |
p3 <-Graph_pattern(1973) | |
p4 <-Graph_pattern(1183) | |
p <- p1 + p2 + p3 + p4 | |
ggsave(file.path('../figures', 'point_patterns_1-4.png'), p, width = 4, height = 3) | |
p5 <- Graph_pattern(1181) | |
p6 <-Graph_pattern(2870) | |
p7 <-Graph_pattern(1083) | |
p8 <-Graph_pattern(1079) | |
p <- p5 + p6 + p7 + p8 | |
p | |
ggsave(file.path('../figures', 'point_patterns_2-8.png'), p, width = 4, height = 3) | |
# Peaks 24 hours apart ---------------------------------------------------- | |
peak_diff_pattern <- function(pat_name_id) { | |
pat_data <- all_point_increments[[pat_name_id]] | |
my_eventID <- pat_data[1] | |
my_duration <- pat_data[2] | |
my_increments <- pat_data[3] | |
my_region <- pat_data[4] | |
my_area <- pat_data[5] | |
pat_data <- as.numeric(pat_data[-1:-5] ) | |
stopifnot(sum(pat_data) == 100) | |
pat_data_df <- data_frame(inc_num = 1:length(pat_data), | |
tt = as.numeric(my_increments) * inc_num, | |
pc = as.numeric(pat_data)) | |
pat_data_df %>% | |
arrange(desc(pc)) %>% | |
summarise(eventID = my_eventID, | |
peak_1 = pc[1], | |
peak_2 = pc[2], | |
diff_time = abs(tt[1] - tt[2]), | |
diff_time_hr = diff_time/60) | |
} | |
# process all patterns | |
diffs <- map_df(.x = 1:length(all_point_increments), .f = peak_diff_pattern) | |
diffs | |
# Search for patterns were the peaks are large and 24 hours apart | |
diffs %>% | |
filter(diff_time_hr == 24) %>% | |
filter(peak_1 >15 & peak_2 >15) %>% | |
arrange(desc(peak_1), desc(peak_2)) %>% View | |
diffs %>% | |
filter(diff_time_hr == 24) %>% | |
filter(peak_1 >15 & peak_2 >15) %>% | |
arrange(desc(peak_1), desc(peak_2)) | |
# # A tibble: 14 x 5 | |
# eventID peak_1 peak_2 diff_time diff_time_hr | |
# <chr> <dbl> <dbl> <dbl> <dbl> | |
# 1 5636 31.0 30.6 1440 24 | |
# 2 5679 29.6 15.2 1440 24 | |
# 3 5635 27.1 22.7 1440 24 | |
# 4 7695 26.5 15.6 1440 24 | |
# 5 3561 26.1 19.3 1440 24 | |
# 6 9006 25.3 15.2 1440 24 | |
# 7 7696 23.4 15.0 1440 24 | |
# 8 6343 21.6 16.5 1440 24 | |
# 9 8386 19.5 15.4 1440 24 | |
# 10 6376 19 17.0 1440 24 | |
# 11 892 18.2 17.8 1440 24 | |
# 12 4142 17.6 15.9 1440 24 | |
# 13 7729 17.5 16.5 1440 24 | |
# 14 8368 17.0 16.0 1440 24 | |
# Check the statistics for these events | |
ids <- diffs %>% | |
filter(diff_time_hr == 24) %>% | |
filter(peak_1 >15 & peak_2 >15) %>% | |
arrange(desc(peak_1), desc(peak_2)) %>% | |
slice(1:10) %>% | |
select(eventID) %>% pull | |
all_point_patterns_sum %>% | |
filter(eventID %in% (ids)) %>% View | |
p1 <- Graph_pattern(5636) | |
p2 <-Graph_pattern(5679) | |
p3 <-Graph_pattern(5635) | |
p4 <-Graph_pattern(7695) | |
p <- p1 + p2 + p3 + p4 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment