Skip to content

Instantly share code, notes, and snippets.

@johnDorian
Created September 22, 2010 00:45
Show Gist options
  • Save johnDorian/590889 to your computer and use it in GitHub Desktop.
Save johnDorian/590889 to your computer and use it in GitHub Desktop.
#####################################################################################################
### Aim: To test the significance of randomly allocating flow to 'midnight' water quality samples.
### Date Created: Thursday 2nd September 2010
### Author: Jason Lessels
### Packages required: TSAgg_0.2-1,geoR
### Notes: The script can easily be modified for the other water quality parameters, and the amount of simulations. Things to check:
###Both WQ and discharge must have the same initial time stamp (hours) before aggregation.
###WQ variable modify lines 52,60,146,186
###line 99 changes the amount of simulations to run.
###line 156 determines when the bushfire occurred.
###lines 200:220 need to specify the names of the likfit models.
###Aggregation, 77,40, and the switch statements in 110:130 - is explained.
#####################################################################################################
###Load the TSAgg package.
install.packages("geoR")
install.packages("TSAgg")
library(TSAgg)
###Set the working dir.
setwd("E:/4th YEAR PROJECT/data/Burke")
setwd("~/Documents/code/francesca/")
###load the quality data
quality<-read.csv("Burke_worked.csv",header=T)
###What is the format of the dates?
head(quality)
tail(quality)
###Load the flow data - takes awhile
flow<-read.table("2122052.TXT",header=T,sep=",")[,-3]
###Fix the names up
names(flow)<-c("date","discharge")
#get rid of missing values
flow<-flow[!is.na(flow[,2]),]
##No bug But everything is easier to work with when midnight is the first observation.
flow<-flow[-c(1:8),]
##Check out that all is well
head(flow)
###Convert the flow data into a timeSeries formatted data frame
flow<-timeSeries(flow$date,"%d/%m/%Y %H:%M",data=flow$discharge)
###The aggregation can take some time for long datasets like this one.
flow.3hr<-hoursAgg(flow,sum,3)
####Make sure all is well
head(flow.3hr)
tail(flow.3hr)
##Change the names of the aggregated data frame.
names(flow.3hr)<-c("date","discharge")
###format the time stamps
q2<-timeSeries(quality$date,"%d/%m/%Y %H:%M")
###Add the quality values to the new timeSeries object, as the timeSeries function dosen't allow for factors (Sample.Type)
q2<-data.frame(q2,quality[,3:7])
###Get rid of any missing TP values, as this is what we are interested in.
q2<-q2[!is.na(q2$Phosphorus.Total..mg.L.),]
###Subset all samples that are collected between midnight and 12.59.59
temp<-q2[q2[,"hour"]==0,]
###Remove all automatically collected samples, as it is highly possible for an automatic sample to have been collected at this time.
midnight.wq<-temp[temp[,"Sample.Type"]!="A",]
###Create a dataset that is the opposite of the above so the two can be joined after the random samples have been produced. This means that all samples not collected at midnight, or they have been collected using an Automatic sampler.
other.wq<-q2[q2[,"hour"]!=0|q2[,"Sample.Type"]=="A",]
###Grab the date and the TP column
other.wq<-other.wq[,c(1,12)]
head(other.wq)
tail(other.wq)
#########################################################
###Test to see if the above separation worked properly
###Check that all is well, by checking the lengths of the two subsetted data frames and comparing to that of the quality tp object (q2)
length(midnight.wq[,1])
length(other.wq[,1])
##Is the length of all quality samples the same as the length of the problem and non problem samples.
length(q2[,1])==(length(other.wq[,1])+length(midnight.wq[,1]))
#########################################################
###Merge the flow with the water quality that dosent have midnight observations.
###Sort the dates so they are in chronological order.
other.wq<-other.wq[order(other.wq[,1]),]
###Format the dates so we can change the hour for merging with flow
qual<-timeSeries(other.wq$dates,"%Y-%m-%d %H:%M:%S")
###Get the aggregated hourly block that the quality observation falls into in respect to the hourly aggregation of the flow.
temphour<-floor(qual$hour/3)*3
###Create a temp time stamp for merging with the flow - this will be changed back after the merging takes place
temptime<-strptime(paste(qual$day,qual$month,qual$year,temphour,0),"%d %m %Y %H %M",tz="GMT")
#Create a temp qual object for th merging with the new aggregated dates.
tempqual<-data.frame(date=temptime,tp=other.wq[,2])
###Merge the quality with flow
temp<-merge(flow.3hr,tempqual,by="date",all.y=T)
#Now change the dates back to the original time stamp of the quality observation
temp$date<-qual$date
###Now get rid of the missing flow values.
other.wq<-temp[!is.na(temp[,2]),]
#############################################################################################################
##########Below randomly allocates a discharge value from the aggregated flow data, between 8am and 5pm.#####
#############################################################################################################
###This sets how many realisations you want this can be changed to anything you want...
simulations<-100
###This creates a matrix for the randomly sampled flow
new.data<-matrix(ncol=simulations,nrow=length(midnight.wq[,1]))
###Create an object for the dates that the sampled flow represents.
dates<-matrix(ncol=simulations,nrow=length(midnight.wq[,1]))
##Cycle through each of the midnight samples
for(i in 1:length(midnight.wq[,1])){
#Check to see if flow is recorded on this day and only continue if it is.
if(length(flow.3hr[(flow.3hr[,1]==as.POSIXct(as.Date(midnight.wq[i,1]))),1])==1){
#Get the row number in the larger flow dataset, for midnight.
day.id<-as.numeric(rownames(flow.3hr[(flow.3hr[,1]==as.POSIXct(as.Date(midnight.wq[i,1]))),]))
#Random sample which hourly aggregated block to use, do this for the length of simulations.
#|1|2,2,2|3,3,3|4,4| = |8am|9am,10am,11am|12pm,1pm,2pm|3pm,4pm| - therefore each hour has equal probabilty, but the blocks arent of equal probablity.
sampled<-sample(c(1,2,2,2,3,3,3,4,4),simulations,replace=T)
#Now cycle through the cols of the current midnight sample (i) and sample based on the sampled value for this col.
for(j in 1:simulations){
###Use a switch statement. There nice and quick and the one slected is based on the sampled value of this col.
#First switch statment gets the randomly allocated flow value.
switch(sampled[j],
new.data[i,j]<-flow.3hr[(day.id+2),2],
new.data[i,j]<-flow.3hr[(day.id+3),2],
new.data[i,j]<-flow.3hr[(day.id+4),2],
new.data[i,j]<-flow.3hr[(day.id+5),2]
)
#Second switch statement gets the time of the randomly allocated flow.
switch(sampled[j],
dates[i,j]<-paste(flow.3hr[(day.id+2),1]),
dates[i,j]<-paste(flow.3hr[(day.id+3),1]),
dates[i,j]<-paste(flow.3hr[(day.id+4),1]),
dates[i,j]<-paste(flow.3hr[(day.id+5),1])
)
}
}
}
############Now merge the newly randomly created value with the good values.
################################
### Put it all back together ###
################################
##Create a list for to store the simulated data for each realisation
all.output<-list()
for(i in 1:simulations){
#Create the dataset for this realisation
sim.<-data.frame(date=dates[,i],discharge=new.data[,i],tp=midnight.wq$Phosphorus.Total..mg.L.)
#Get rid of the missing values
sim.<-sim.[!is.na(sim.[,2]),]
#join this realisation with the unchanged observations
together<-rbind(other.wq,sim.)
#Now get the dates in order.
together<-together[order(together[,1]),]
together$numericdist<-as.numeric(difftime(together[,1],together[1,1],units="days"))
together$bush.fire<-as.numeric(together[,1]>as.POSIXct("2002-01-01 00:00:00"))
all.output[[i]]<-together
}
##########################################################
##########################################################
################### MODEL FITTING ########################
##########################################################
##########################################################
#####First trend is just flow
ini.<-matrix(NA,ncol=2,nrow=7)
ini.[,1]=c(0.1,0.1,0.1,0.1,0.1,0.1,0.1)
ini.[,2]=c(2,3,4,5,6,7,8)
likfitmods<-list()
likfitmods[["flowspatial"]]<-vector()
likfitmods[["firespatial"]]<-vector()
likfitmods[["flowfirespatial"]]<-vector()
Geo<-list()
library(geoR)
for(i in 1:simulations){#testing
#Get lambda values for the boxcox transformtion
#lambda<-boxcox.fit(all.output[[i]]$tp)$lambda
#flowlambda<-boxcox.fit(all.output[[i]]$discharge)$lambda
##Using a log transform change lambda to 0 for log transform.
lambda=0
flowlambda=0
#Create a data frame for creating geoR object
tempData<-data.frame(x=all.output[[i]]$numericdist,y=1,tp=all.output[[i]]$tp,flw=BCtransform(all.output[[i]]$discharge,flowlambda)$data,bfire=as.factor(all.output[[i]]$bush.fire))
#Get rid of any duplicates using the jitter function. Note that the latest geoR package is required.
tempData[,1:2]<-jitterDupCoords(tempData[,1:2],max=0.01)
#Create a geodata object of the data.
Geo[[i]]<-as.geodata(tempData,coords.col=1:2,data.col=3,covar.col=4:5)
###Plot a variogram of the data, using lambda to transform the data
#plot(variog(tempGeo,lambda=lambda,max.dist=30,trend=~fre))
###Use likfit to optimise the variogram model.
##############################
##############################
####### The models ###########
##############################
##############################
##The flow model
likfitmods[["flow"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw,Geo[[i]]),cov.model="matern")
#Check to see if the model has a temporal trend.
if(summary(likfitmods[["flow"]][[i]])$likelihood$AIC<summary(likfitmods[["flow"]][[i]])$ns.likelihood$AIC)likfitmods[["flowspatial"]][i]=1 else likfitmods[["flowspatial"]][i]<-0
##The fire model
likfitmods[["fire"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw+bfire,Geo[[i]]),cov.model="matern")
#Check to see if the model has a temporal trend.
if(summary(likfitmods[["fire"]][[i]])$likelihood$AIC<summary(likfitmods[["fire"]][[i]])$ns.likelihood$AIC)likfitmods[["firespatial"]][i]=1 else likfitmods[["firespatial"]][i]<-0
##The flow and fire model
likfitmods[["flowfire"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw*bfire,Geo[[i]]),cov.model="matern")
#Check to see if the model has a temporal trend.
if(summary(likfitmods[["flowfire"]][[i]])$likelihood$AIC<summary(likfitmods[["flowfire"]][[i]])$ns.likelihood$AIC)likfitmods[["flowfirespatial"]][i]=1 else likfitmods[["flowfirespatial"]][i]<-0
cat("We are now up to", i,"\n")
#
print(i)
}
###Save the output
save(likfitmods,file="likfitmods_Burke_TP.Rdata")
save(all.output,file="simulated_data_Burke_TP.Rdata")
save(Geo,file="JitteredGeoObjects_Burke_TP.Rdata")
###Get the P values and determine if the interaction is significant.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment