Skip to content
{{ message }}

Instantly share code, notes, and snippets.

# actuaryactually/data_wrangling.R

Created Jan 24, 2017
i'm a cowboy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 #preliminaries: wkg.dir = "C:/OldWorld/Blog/DataWrangling" # define where we'll work, save files, etc. for this post setwd(wkg.dir) #Install a helper piece of code (pacman) and then load required code libraries in one-go: install.packages("pacman") pacman::p_load(ChainLadder,ggplot2,data.table,tidyr) #Step 1: load some data and view RAA plot(RAA/1e3,lattice=TRUE, main="RAA Data by Accident Year (USD000s)",xlab="Dev Yrs") #Step 1b: additional output needed for blog only; commented out in production copy on GitHub: png("RAA.Dev.png") plot(RAA/1e3,lattice=TRUE, main="RAA Data by Accident Year (USD000s)",xlab="Dev Yrs") dev.off() #Step 2: Manuipulation of data #zeroes begone... print(RAA, na.print="") #convert to incremental ( x = cum2incr(RAA) ) print(x, na.print="") ( plot.x = plot(x/1e3,lattice=TRUE, main="RAA Incremental Data by Accident Year (USD000s)",xlab="Dev Yrs") ) #for blog outputs only - dropped in final post to site: png("RAA.IncrementalDev.png") plot.x dev.off() #convert incremental to cumulative: ( y = incr2cum(x) ) #convert triangles to flat data format (pivot.format = as.data.frame(RAA)) #check - that our manipulations all tie up - i.e. this should return a triangle of zeroes, which it does: ( RAA-y ) # Step 3: Finesse with ggplot #Let's download some example auto claims data taken from CAS website: #the following line is commented out. In principle, it should allow directly download from the website into R. #however, I have left it comments, as you may get identified (wrongly) as a bot by CAS website. #As a result, it's easier to manually download the file to a local drive and proceed as shown: #mydat = fread('http://www.casact.org/research/reserve_data/ppauto_pos.csv') mydat=read.csv("ppauto_pos.csv") #assuming you've saved the csv to the wkg.dir location #Home-in on one company and pluck-out columns of interest: StateFarm = subset(mydat,subset = mydat\$GRNAME=="State Farm Mut Grp") StateFarm.Data = StateFarm[,c(2:3,5:8,11)] StateFarm.Data\$AccidentYear = as.numeric(StateFarm.Data\$AccidentYear) StateFarm.Data\$DevelopmentLag = as.numeric(StateFarm.Data\$DevelopmentLag) head(StateFarm.Data) # check results - this has the minimum we need, shows first 6 rows only #now we convert this into a format that ChainLadder likes... (paid.tri = as.triangle(StateFarm.Data,origin="AccidentYear",dev="DevelopmentLag",value="CumPaidLoss_B")) (inc.tri = as.triangle(StateFarm.Data,origin="AccidentYear",dev="DevelopmentLag",value="IncurLoss_B")) #lastly overwrite data values so it appears as if we're working with data as at 1997 year end - i.e. the normal situation paid.tri[row(paid.tri)+col(paid.tri)>nrow(paid.tri)+1] = NA inc.tri[row(inc.tri)+col(inc.tri)>nrow(inc.tri)+1] = NA #convert triangles to frame format and purge N/As paid.frame = as.data.frame(paid.tri) paid.frame\$type = "paid" paid.frame = subset(paid.frame,subset = paid.frame\$value!="NA") inc.frame = as.data.frame(inc.tri) inc.frame\$type = "incurred" inc.frame = subset(inc.frame,subset = inc.frame\$value!="NA") #in practical situations we are likely to have a view on the ultimate loss cost. FOr this blogpost, let's suppose that we want to run with a 90% loss ratio for all years ULR = 0.9 net.prem = unique(StateFarm.Data\$EarnedPremNet_B) net.ult = data.frame(cbind(AccidentYear=1988:1997,Ult=ULR*net.prem)) #create a frame showing the ultimates is the same format as the paid and incurred information - good tip here on vlookup approach, https://www.rforexcelusers.com/vlookup-in-r/ ult.frame = merge(net.ult[, c("AccidentYear", "Ult")], inc.frame[, c("AccidentYear", "DevelopmentLag","type")]) #reorder and rename columns, then override the type ult.frame = ult.frame[,c(1,3,2,4)] colnames(ult.frame) = colnames(inc.frame) ult.frame\$type = "Ult" ult.frame = ult.frame[ order(ult.frame\$AccidentYear, ult.frame\$DevelopmentLag), ] #combine the three frames together: data.combined = data.frame(rbind(paid.frame,inc.frame,ult.frame)) #clean up temp variables rm(paid.frame,inc.frame,ult.frame) #now plot (finally!) p = ggplot(data=data.combined, aes(x=DevelopmentLag, y = value/1e6, color=type)) + geom_point()+geom_line() + facet_wrap(~AccidentYear) #ok, let's improve the output in a few ways #axes limits and ticks p = p+ylim(0,20) p = p+ scale_x_continuous(breaks=seq(0, 10, 2)) # Ticks from 0-10, every 2 p = p+labs(title = "State Farm Mutual - Motor Loss Development", subtitle = "Incurred losses show downward development, highlighting redundancy of case reserves", caption = "Source: http://www.casact.org/research/reserve_data/ppauto_pos.csv", x = "Development Yr", y = "USDm") p = p+theme(axis.text=element_text(size=8), axis.title=element_text(size=10,face="italic")) #color scale p = p+ scale_colour_brewer(palette = "Dark2") #other options are available here: https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf #write to LAN: png("Prettier_Output.png") p dev.off()
to join this conversation on GitHub. Already have an account? Sign in to comment