actuaryactually/data_wrangling.R

## data_wrangling.R
#preliminaries:
  wkg.dir = "C:/OldWorld/Blog/DataWrangling" # define where we'll work, save files, etc. for this post
  setwd(wkg.dir)

#Install a helper piece of code (pacman) and then load required code libraries in one-go:
  install.packages("pacman")
  pacman::p_load(ChainLadder,ggplot2,data.table,tidyr)
  #Step 1: load some data and view
  RAA
  plot(RAA/1e3,lattice=TRUE, main="RAA Data by Accident Year (USD000s)",xlab="Dev Yrs")

#Step 1b: additional output needed for blog only; commented out in production copy on GitHub:
  png("RAA.Dev.png")
  plot(RAA/1e3,lattice=TRUE, main="RAA Data by Accident Year (USD000s)",xlab="Dev Yrs")
  dev.off()

#Step 2: Manuipulation of data

  #zeroes begone...
  print(RAA, na.print="")

  #convert to incremental
  ( x = cum2incr(RAA) )
  print(x, na.print="")
  ( plot.x = plot(x/1e3,lattice=TRUE, main="RAA Incremental Data by Accident Year (USD000s)",xlab="Dev Yrs") )


  #for blog outputs only - dropped in final post to site:
  png("RAA.IncrementalDev.png")
  plot.x
  dev.off()


  #convert incremental to cumulative:
  ( y = incr2cum(x) )

  #convert triangles to flat data format
  (pivot.format = as.data.frame(RAA))

    #check - that our manipulations all tie up - i.e. this should return a triangle of zeroes, which it does:
  ( RAA-y )


# Step 3: Finesse with ggplot

#Let's download some example auto claims data taken from CAS website:

  #the following line is commented out. In principle, it should allow directly download from the website into R.
  #however, I have left it comments, as you may get identified (wrongly) as a bot by CAS website.
  #As a result, it's easier to manually download the file to a local drive and proceed as shown:

  #mydat = fread('http://www.casact.org/research/reserve_data/ppauto_pos.csv')

  mydat=read.csv("ppauto_pos.csv") #assuming you've saved the csv to the wkg.dir location

#Home-in on one company and pluck-out columns of interest:
  StateFarm = subset(mydat,subset = mydat$GRNAME=="State Farm Mut Grp")
  StateFarm.Data = StateFarm[,c(2:3,5:8,11)]
  StateFarm.Data$AccidentYear = as.numeric(StateFarm.Data$AccidentYear)
  StateFarm.Data$DevelopmentLag = as.numeric(StateFarm.Data$DevelopmentLag)
  head(StateFarm.Data) # check results - this has the minimum we need, shows first 6 rows only

  #now we convert this into a format that ChainLadder likes...
  (paid.tri = as.triangle(StateFarm.Data,origin="AccidentYear",dev="DevelopmentLag",value="CumPaidLoss_B"))
  (inc.tri = as.triangle(StateFarm.Data,origin="AccidentYear",dev="DevelopmentLag",value="IncurLoss_B"))


  #lastly overwrite data values so it appears as if we're working with data as at 1997 year end - i.e. the normal situation
    paid.tri[row(paid.tri)+col(paid.tri)>nrow(paid.tri)+1] = NA
    inc.tri[row(inc.tri)+col(inc.tri)>nrow(inc.tri)+1] = NA

  #convert triangles to frame format and purge N/As
    paid.frame = as.data.frame(paid.tri)
    paid.frame$type = "paid"
    paid.frame = subset(paid.frame,subset = paid.frame$value!="NA")

    inc.frame = as.data.frame(inc.tri)
    inc.frame$type = "incurred"
    inc.frame = subset(inc.frame,subset = inc.frame$value!="NA")

  #in practical situations we are likely to have a view on the ultimate loss cost.  FOr this blogpost, let's suppose that we want to run with a 90% loss ratio for all years
    ULR = 0.9
    net.prem = unique(StateFarm.Data$EarnedPremNet_B)
    net.ult = data.frame(cbind(AccidentYear=1988:1997,Ult=ULR*net.prem))

  #create a frame showing the ultimates is the same format as the paid and incurred information - good tip here on vlookup approach, https://www.rforexcelusers.com/vlookup-in-r/
  ult.frame = merge(net.ult[, c("AccidentYear", "Ult")],
        inc.frame[, c("AccidentYear", "DevelopmentLag","type")])

  #reorder and rename columns, then override the type
  ult.frame = ult.frame[,c(1,3,2,4)]
  colnames(ult.frame) = colnames(inc.frame)
  ult.frame$type = "Ult"
  ult.frame = ult.frame[ order(ult.frame$AccidentYear, ult.frame$DevelopmentLag), ]

  #combine the three frames together:
  data.combined = data.frame(rbind(paid.frame,inc.frame,ult.frame))

  #clean up temp variables
  rm(paid.frame,inc.frame,ult.frame)

  #now plot (finally!)

  p = ggplot(data=data.combined, aes(x=DevelopmentLag, y = value/1e6, color=type)) +
    geom_point()+geom_line() + facet_wrap(~AccidentYear)

  #ok, let's improve the output in a few ways

  #axes limits and ticks
  p = p+ylim(0,20)
  p = p+ scale_x_continuous(breaks=seq(0, 10, 2))  # Ticks from 0-10, every 2


  p = p+labs(title = "State Farm Mutual - Motor Loss Development",
       subtitle = "Incurred losses show downward development, highlighting redundancy of case reserves",
       caption = "Source: http://www.casact.org/research/reserve_data/ppauto_pos.csv",
       x = "Development Yr", y = "USDm")

  p = p+theme(axis.text=element_text(size=8),
              axis.title=element_text(size=10,face="italic"))

  #color scale
  p = p+ scale_colour_brewer(palette = "Dark2") #other options are available here: https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf

  #write to LAN:
  png("Prettier_Output.png")
  p
  dev.off()
	#preliminaries:
	wkg.dir = "C:/OldWorld/Blog/DataWrangling" # define where we'll work, save files, etc. for this post
	setwd(wkg.dir)

	#Install a helper piece of code (pacman) and then load required code libraries in one-go:
	install.packages("pacman")
	pacman::p_load(ChainLadder,ggplot2,data.table,tidyr)
	#Step 1: load some data and view
	RAA
	plot(RAA/1e3,lattice=TRUE, main="RAA Data by Accident Year (USD000s)",xlab="Dev Yrs")

	#Step 1b: additional output needed for blog only; commented out in production copy on GitHub:
	png("RAA.Dev.png")
	plot(RAA/1e3,lattice=TRUE, main="RAA Data by Accident Year (USD000s)",xlab="Dev Yrs")
	dev.off()

	#Step 2: Manuipulation of data

	#zeroes begone...
	print(RAA, na.print="")

	#convert to incremental
	( x = cum2incr(RAA) )
	print(x, na.print="")
	( plot.x = plot(x/1e3,lattice=TRUE, main="RAA Incremental Data by Accident Year (USD000s)",xlab="Dev Yrs") )


	#for blog outputs only - dropped in final post to site:
	png("RAA.IncrementalDev.png")
	plot.x
	dev.off()


	#convert incremental to cumulative:
	( y = incr2cum(x) )

	#convert triangles to flat data format
	(pivot.format = as.data.frame(RAA))

	#check - that our manipulations all tie up - i.e. this should return a triangle of zeroes, which it does:
	( RAA-y )


	# Step 3: Finesse with ggplot

	#Let's download some example auto claims data taken from CAS website:

	#the following line is commented out. In principle, it should allow directly download from the website into R.
	#however, I have left it comments, as you may get identified (wrongly) as a bot by CAS website.
	#As a result, it's easier to manually download the file to a local drive and proceed as shown:

	#mydat = fread('http://www.casact.org/research/reserve_data/ppauto_pos.csv')

	mydat=read.csv("ppauto_pos.csv") #assuming you've saved the csv to the wkg.dir location

	#Home-in on one company and pluck-out columns of interest:
	StateFarm = subset(mydat,subset = mydat$GRNAME=="State Farm Mut Grp")
	StateFarm.Data = StateFarm[,c(2:3,5:8,11)]
	StateFarm.Data$AccidentYear = as.numeric(StateFarm.Data$AccidentYear)
	StateFarm.Data$DevelopmentLag = as.numeric(StateFarm.Data$DevelopmentLag)
	head(StateFarm.Data) # check results - this has the minimum we need, shows first 6 rows only

	#now we convert this into a format that ChainLadder likes...
	(paid.tri = as.triangle(StateFarm.Data,origin="AccidentYear",dev="DevelopmentLag",value="CumPaidLoss_B"))
	(inc.tri = as.triangle(StateFarm.Data,origin="AccidentYear",dev="DevelopmentLag",value="IncurLoss_B"))


	#lastly overwrite data values so it appears as if we're working with data as at 1997 year end - i.e. the normal situation
	paid.tri[row(paid.tri)+col(paid.tri)>nrow(paid.tri)+1] = NA
	inc.tri[row(inc.tri)+col(inc.tri)>nrow(inc.tri)+1] = NA

	#convert triangles to frame format and purge N/As
	paid.frame = as.data.frame(paid.tri)
	paid.frame$type = "paid"
	paid.frame = subset(paid.frame,subset = paid.frame$value!="NA")

	inc.frame = as.data.frame(inc.tri)
	inc.frame$type = "incurred"
	inc.frame = subset(inc.frame,subset = inc.frame$value!="NA")

	#in practical situations we are likely to have a view on the ultimate loss cost. FOr this blogpost, let's suppose that we want to run with a 90% loss ratio for all years
	ULR = 0.9
	net.prem = unique(StateFarm.Data$EarnedPremNet_B)
	net.ult = data.frame(cbind(AccidentYear=1988:1997,Ult=ULR*net.prem))

	#create a frame showing the ultimates is the same format as the paid and incurred information - good tip here on vlookup approach, https://www.rforexcelusers.com/vlookup-in-r/
	ult.frame = merge(net.ult[, c("AccidentYear", "Ult")],
	inc.frame[, c("AccidentYear", "DevelopmentLag","type")])

	#reorder and rename columns, then override the type
	ult.frame = ult.frame[,c(1,3,2,4)]
	colnames(ult.frame) = colnames(inc.frame)
	ult.frame$type = "Ult"
	ult.frame = ult.frame[ order(ult.frame$AccidentYear, ult.frame$DevelopmentLag), ]

	#combine the three frames together:
	data.combined = data.frame(rbind(paid.frame,inc.frame,ult.frame))

	#clean up temp variables
	rm(paid.frame,inc.frame,ult.frame)

	#now plot (finally!)

	p = ggplot(data=data.combined, aes(x=DevelopmentLag, y = value/1e6, color=type)) +
	geom_point()+geom_line() + facet_wrap(~AccidentYear)

	#ok, let's improve the output in a few ways

	#axes limits and ticks
	p = p+ylim(0,20)
	p = p+ scale_x_continuous(breaks=seq(0, 10, 2)) # Ticks from 0-10, every 2


	p = p+labs(title = "State Farm Mutual - Motor Loss Development",
	subtitle = "Incurred losses show downward development, highlighting redundancy of case reserves",
	caption = "Source: http://www.casact.org/research/reserve_data/ppauto_pos.csv",
	x = "Development Yr", y = "USDm")

	p = p+theme(axis.text=element_text(size=8),
	axis.title=element_text(size=10,face="italic"))

	#color scale
	p = p+ scale_colour_brewer(palette = "Dark2") #other options are available here: https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf

	#write to LAN:
	png("Prettier_Output.png")
	p
	dev.off()