Created
April 30, 2017 20:49
-
-
Save arubino322/6e24ec21b76189715ab8ca8da7885ac3 to your computer and use it in GitHub Desktop.
Claims Investigation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### global.R ### | |
library(dplyr) | |
library(data.table) | |
library(tidyr) | |
library(lubridate) | |
library(ggplot2) | |
library(maps) | |
library(plotly) | |
library(dygraphs) | |
library(xts) # as.Date is masked by zoo!! | |
library(reshape2) | |
library(googleVis) | |
library(leaflet) | |
library(DT) | |
claims <- read.table("tidy_claims.tsv", header = T, sep = " ") # space separated for some reason | |
type_df <- claims %>% group_by(Claim.Type) %>% | |
summarise(n = n(), | |
overall_ratio = n / nrow(claims)) | |
type_dis <- claims %>% group_by(Claim.Type, Disposition) %>% | |
summarise(n = n()) %>% | |
spread(Disposition, n) %>% | |
rename(Approved = `Approve in Full`) | |
joined_typedis <- left_join(type_dis, type_df, by = "Claim.Type") %>% | |
mutate(approved_ratio = Approved / n, | |
denied_ratio = Deny / n, | |
settled_ratio = Settle / n) %>% | |
select(- overall_ratio) | |
# Adding Total.Claims will make calculations easier | |
claims <- claims %>% mutate(Total.Claims = Audio.Video + | |
Automobile.Parts.Accessories + | |
Baggage.Cases.Purses + | |
Books.Magazines.Other + | |
Cameras + | |
Clothing + | |
Computer.Accessories + | |
Cosmetics.Grooming + | |
Crafting.Hobby + | |
Currency + | |
Food.Drink + | |
Home.Decor + | |
Household.Items + | |
Hunting.Fishing.Items + | |
Jewelry.Watches + | |
Medical.Science + | |
Musical.Instruments.Accessories + | |
Office.Equipment.Supplies + | |
Other + | |
Outdoor.Items + | |
Personal.Accessories + | |
Personal.Electronics + | |
Personal.Navigation + | |
Sporting.Equipment.Supplies + | |
Tools.Home.Improvement.Supplies + | |
Toys.Games) | |
# How bout by month. Need to create a month column from the incident date | |
claims$Month_Yr <- format(as.Date(claims$Incident.Date), "%Y-%m") | |
claims$Year <- year(claims$Incident.Date) | |
claims$Month.Dot.Year <- gsub("-", ".", claims$Month_Yr) | |
claims$Month.Dot.Year <- as.numeric(claims$Month.Dot.Year) | |
by_date <- claims %>% filter(Claim.Type != "Compliment") %>% | |
group_by(Month_Yr, Claim.Type) %>% | |
summarise(n = n()) %>% | |
arrange(Month_Yr) | |
by_date$Full.Date <- paste0(by_date$Month_Yr, "-01") | |
by_date$Full.Date <- as.Date(as.character(by_date$Full.Date)) | |
x = by_date | |
x$Month_Yr <- NULL | |
x <- as.data.frame(x) | |
x <- dcast(x, Full.Date ~ Claim.Type, value.var = "n") | |
x_by_date <- xts(x, order.by = x$Full.Date) | |
# sort of hard to interpret, except that claims have been pretty steady for the past | |
# few years. What if we do it by month only. | |
claims$Month <- month(claims$Incident.Date) | |
by_month <- claims %>% filter(Claim.Type != "Compliment") %>% | |
group_by(Month, Claim.Type, Year) %>% | |
filter(Claim.Type != "Compliment") %>% | |
summarise(avg_claim_amount = mean(Total.Claims), | |
median_claim_amount = median(Total.Claims)) | |
# I want to see the counts of each item!! How many of what was lost by Airport | |
items_by_airport <- claims %>% filter(Airport.Code %in% c("ATL", "LAX", "ORD", "DFW", "JFK", | |
"DEN", "SFO", "CLT", "LAS", "PHX")) %>% | |
group_by(Airport.Code, Year, Disposition) %>% | |
summarise(audio_video = sum(Audio.Video), | |
automobile = sum(Automobile.Parts.Accessories), | |
baggage = sum(Baggage.Cases.Purses), | |
books = sum(Books.Magazines.Other), | |
cameras = sum(Cameras), | |
clothing = sum(Clothing), | |
computer = sum(Computer.Accessories), | |
cosmetics = sum(Cosmetics.Grooming), | |
crafting = sum(Crafting.Hobby), | |
currency = sum(Currency), | |
food = sum(Food.Drink), | |
home_decor = sum(Home.Decor), | |
household_items = sum(Household.Items), | |
hunting_items = sum(Hunting.Fishing.Items), | |
jewelry = sum(Jewelry.Watches), | |
medical = sum(Medical.Science), | |
music_instruments = sum(Musical.Instruments.Accessories), | |
office_supplies = sum(Office.Equipment.Supplies), | |
outdoor_items = sum(Outdoor.Items), | |
pers_accessories = sum(Personal.Accessories), | |
pers_electronics = sum(Personal.Electronics), | |
pers_navigation = sum(Personal.Navigation), | |
sport_supplies = sum(Sporting.Equipment.Supplies), | |
home_improve_supplies = sum(Tools.Home.Improvement.Supplies), | |
toys = sum(Toys.Games), | |
travel_accessories = sum(Travel.Accessories)) | |
### by airline | |
items_by_airline <- claims %>% filter(Airline.Name %like% "^American Airlines" | | |
Airline.Name %like% "Southwest Airlines" | | |
Airline.Name %like% "Delta Air Lines" | | |
Airline.Name %like% "UAL" | | |
Airline.Name %like% "Jet Blue" | | |
Airline.Name %like% "Alaska Airlines" | | |
Airline.Name %like% "Spirit Airlines") %>% | |
group_by(Airline.Name, Year, Disposition) %>% | |
summarise(audio_video = sum(Audio.Video), | |
automobile = sum(Automobile.Parts.Accessories), | |
baggage = sum(Baggage.Cases.Purses), | |
books = sum(Books.Magazines.Other), | |
cameras = sum(Cameras), | |
clothing = sum(Clothing), | |
computer = sum(Computer.Accessories), | |
cosmetics = sum(Cosmetics.Grooming), | |
crafting = sum(Crafting.Hobby), | |
currency = sum(Currency), | |
food = sum(Food.Drink), | |
home_decor = sum(Home.Decor), | |
household_items = sum(Household.Items), | |
hunting_items = sum(Hunting.Fishing.Items), | |
jewelry = sum(Jewelry.Watches), | |
medical = sum(Medical.Science), | |
music_instruments = sum(Musical.Instruments.Accessories), | |
office_supplies = sum(Office.Equipment.Supplies), | |
outdoor_items = sum(Outdoor.Items), | |
pers_accessories = sum(Personal.Accessories), | |
pers_electronics = sum(Personal.Electronics), | |
pers_navigation = sum(Personal.Navigation), | |
sport_supplies = sum(Sporting.Equipment.Supplies), | |
home_improve_supplies = sum(Tools.Home.Improvement.Supplies), | |
toys = sum(Toys.Games), | |
travel_accessories = sum(Travel.Accessories)) | |
# create a heatmap with dataframe using plotly. | |
# Top 5 airline claims: Delta, Southwest, American, UAL, USAir. Now by Airport | |
# Let's join the lat lon from the other file with the airport codes in this file | |
latlon <- read.csv("Airport_Codes_mapped_to_Latitude_Longitude_in_the_United_States.csv", header = TRUE) | |
latlon <- latlon %>% rename(Airport.Code = locationID) | |
claims <- left_join(claims, latlon, by = "Airport.Code") | |
lltest <- claims %>% group_by(Latitude, Longitude) %>% | |
summarise(n = n()) %>% | |
arrange(desc(n)) | |
ggplot(lltest, aes(x = Latitude, y = Longitude)) + geom_point() | |
######## compare top10 airports and airlines | |
top10airports <- read.csv("top10airports.csv", header = T) | |
# Let's do top10airlines first. Create a dataframe from claims that has total count of | |
# claims grouped by airline and year for 2015 only. We'll need to gather the data and tidy | |
# it (rename columns also) | |
top10airports <- top10airports %>% | |
rename("2015" = est_2015_flights, "2014" = est_2014_flights) %>% | |
gather(key = "Year", value = "flights", 3:4) | |
top10airports$Year <- as.numeric(top10airports$Year) | |
top10ac <- claims %>% filter(Airport.Code %in% c('ATL','LAX','ORD','DFW', | |
'JFK','DEN','SFO', | |
'CLT','LAS','PHX') | |
& (Year == 2014 | Year == 2015)) %>% | |
group_by(Airport.Code, Year, Claim.Type) %>% | |
summarise(total = sum(Total.Claims)) %>% | |
arrange(Year) | |
top10ac <- left_join(top10ac, top10airports, by = c("Airport.Code", "Year")) | |
top10ac <- top10ac %>% mutate(Claim.Rate = total / flights) | |
top10ac$Year <- as.character(top10ac$Year) | |
## Now do airlines | |
# tidy | |
top10airlines <- read.csv("top10airlines.csv", header = T) | |
top10airlines <- top10airlines %>% rename("2015" = est_2015_flights, "2014" = est_2014_flights) %>% | |
gather(key = "Year", value = "flights", 2:3) | |
# filter claims for top 10 airlines | |
top10al <- claims %>% filter((as.character(Airline.Name) %like% "^American Airlines" | | |
Airline.Name %like% "Southwest Airlines" | | |
Airline.Name %like% "Delta Air Lines" | | |
Airline.Name %like% "UAL" | | |
Airline.Name %like% "Jet Blue" | | |
Airline.Name %like% "Alaska Airlines" | | |
Airline.Name %like% "Spirit Airlines") & | |
(Year == 2014 | Year == 2015)) %>% | |
group_by(Airline.Name, Year, Claim.Type) %>% | |
summarise(total_claims = sum(Total.Claims)) | |
# join them, but first, trim the white space | |
for (i in 1:nrow(top10al)) { | |
trimws(top10al$Airline.Name[i], c("both")) | |
} | |
top10al$Year <- as.character(top10al$Year) | |
jtl <- read.csv("jointop10al.csv", header = T) | |
# i did this because whitespaces were causing problems with joining and i was too lazy | |
# to figure out how to fix. | |
top10al <- jtl | |
top10al$Year <- as.character(top10al$Year) | |
# add claim rate | |
top10al <- top10al %>% mutate(Claim.Rate = total_claims / Flights) | |
top10al <- top10al %>% select(- concat) | |
###### leaflet ######## | |
leaf_data <- claims %>% group_by(Latitude, Longitude, Year, Airport.Code) %>% | |
summarise(all_claims = sum(Total.Claims)) %>% | |
filter(all_claims >= 25) %>% | |
arrange(desc(all_claims)) | |
leaf_data$Longitude <- paste0("-",leaf_data$Longitude) | |
leaf_data$Longitude <- as.numeric(leaf_data$Longitude) | |
##### renderTables ##### | |
airport_table <- read.csv("airport_table.csv", header = TRUE) | |
airline_table <- read.csv("airline_table.csv", header = TRUE) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(shiny) | |
library(dygraphs) | |
# Define server logic required to draw a histogram | |
shinyServer(function(input, output, session) { | |
# let's get it. | |
leaf_map <- reactive({ | |
leaf_data %>% | |
filter(Year %in% input$year_range) | |
}) | |
output$map <- renderLeaflet({ | |
leaflet(leaf_map()) %>% setView(lng = -98.5556, lat = 39.8097, zoom = 4) %>% | |
addProviderTiles(providers$Stamen.Terrain) %>% | |
addCircles(lng = ~Longitude, lat = ~Latitude, weight = 1, | |
radius = ~(all_claims) * 1000, popup = ~Airport.Code, | |
label = ~paste0(Airport.Code, ": ", all_claims), | |
labelOptions = labelOptions(noHide = T)) | |
}) | |
output$timeSeries <- renderDygraph({ | |
dygraph(x_by_date, main = "Claims") %>% | |
dyRangeSelector() | |
}) | |
output$facetPlot <- renderPlot({ | |
by_month %>% filter(Year == input$general_year & avg_claim_amount != 0) %>% | |
ggplot(aes(x = Month, y = avg_claim_amount)) + | |
geom_histogram(binwidth = 0.2, stat = "identity") + facet_wrap( ~ Claim.Type) + | |
scale_x_discrete(limits = c("Jan", "Feb", "Mar", "Apr", | |
"May", "June", "July", "Aug", | |
"Sept", "Oct", "Nov", "Dec")) + | |
theme(axis.text.x = element_text(angle = 90)) | |
}) | |
### heatmap 1 | |
output$airportItems <- renderPlotly({ | |
top_air_items <- items_by_airport %>% filter(Year == input$airport_year | |
& Disposition == input$disposition) %>% | |
group_by(Airport.Code) %>% select(-Year, -Disposition) | |
row.names(top_air_items) <- top_air_items$Airport.Code | |
airport_matrix <- data.matrix(top_air_items)[,2:27] | |
plot_ly(data = top_air_items, | |
x = c("ATL", "CLT", "DEN", "DFW", "JFK", | |
"LAS","LAX", "ORD", "SFO", "PHX"), | |
y = c("audio_video", "automobile", "baggage", | |
"books", "cameras", "clothing", "computer", | |
"cosmetics", "crafting", "currency", "food", | |
"home_decor", "household_items", "hunting_items", | |
"jewelry", "medical", "music_instruments", | |
"office_supplies", "outdoor_items", "pers_accessories", | |
"pers_electronics", "pers_navigation", "sport_supplies", | |
"home_improve_supplies", "toys", "travel_accessories"), | |
z = t(airport_matrix), type = "heatmap") %>% | |
layout(yaxis = list(tickangle = 30), margin = list(l=100)) | |
}) | |
### for heatmap 2 | |
output$airlineItems <- renderPlotly({ | |
top_line_items <- items_by_airline %>% filter(Year == input$airline_year | |
& Disposition == input$disposition2) %>% | |
group_by(Airline.Name) %>% select(-Year, -Disposition) | |
row.names(top_line_items) <- top_line_items$Airline.Name | |
airline_matrix <- data.matrix(top_line_items)[,2:27] | |
plot_ly(data = top_line_items, | |
x = c("Alaska", "American", "Delta", "JetBlue", | |
"Southwest", "Spirit","United"), | |
y = c("audio_video", "automobile", "baggage", | |
"books", "cameras", "clothing", "computer", | |
"cosmetics", "crafting", "currency", "food", | |
"home_decor", "household_items", "hunting_items", | |
"jewelry", "medical", "music_instruments", | |
"office_supplies", "outdoor_items", "pers_accessories", | |
"pers_electronics", "pers_navigation", "sport_supplies", | |
"home_improve_supplies", "toys", "travel_accessories"), | |
z = t(airline_matrix), type = "heatmap") %>% | |
layout(yaxis = list(tickangle = 30), margin = list(l=100)) | |
}) | |
output$airportTable <- renderDataTable({ | |
datatable(airport_table) | |
}) | |
output$airlineTable <- renderDataTable({ | |
datatable(airline_table) | |
}) | |
output$airportRate <- renderPlot({ | |
top10ac %>% filter(Airport.Code != 'SFO') %>% | |
group_by(Airport.Code, Year) %>% | |
summarise(total_claims = sum(total), | |
total_flights = mean(flights)) %>% | |
mutate(claim_rate = total_claims / total_flights) %>% | |
select(-total_claims, -total_flights) %>% | |
ggplot(aes(x = Airport.Code, y = claim_rate)) + | |
geom_col(aes(fill = Year), position = "dodge") + | |
scale_y_continuous(labels = scales::percent) + | |
ylab("Scale Rate") + xlab("Airport") | |
}) | |
output$airlineRate <- renderPlot({ | |
top10al %>% filter(Airline.Name != "Republic Airways") %>% | |
group_by(Airline.Name, Year) %>% | |
summarise(Total_Claims = sum(total_claims), | |
Total_Flights = mean(Flights)) %>% | |
mutate(claim_rate = Total_Claims / Total_Flights) %>% | |
select(-Total_Claims, -Total_Flights) %>% | |
ggplot(aes(x = Airline.Name, y = claim_rate)) + | |
geom_col(aes(fill = Year), stat = "identity", position = "dodge") + | |
scale_y_continuous(labels = scales::percent) + | |
ylab("Claim Rate") + xlab("Airline") + | |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
}) | |
output$airlineAndType <- renderPlot({ | |
top10al %>% filter(Airline.Name != "Republic Airways") %>% | |
group_by(Airline.Name, Claim.Type) %>% | |
summarise(Total_Claims = sum(total_claims), | |
Total_Flights = mean(Flights)) %>% | |
mutate(claim_rate = Total_Claims / Total_Flights) %>% | |
select(-Total_Claims, -Total_Flights) %>% | |
ggplot(aes(Airline.Name, claim_rate)) + | |
geom_col(aes(fill = Claim.Type), position = "dodge") + | |
scale_y_continuous(labels = scales::percent) + | |
ylab("Claim Rate") + xlab("Airline and Claim Type") + | |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
}) | |
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(shiny) | |
library(shinydashboard) | |
# Define UI for application that draws a histogram | |
dashboardPage(skin = "blue", | |
dashboardHeader(title = "Where the Angriest Passengers Are", titleWidth = 400), | |
dashboardSidebar( | |
sidebarMenu( | |
menuItem("Map of Total Claims", tabName = "map", icon = icon("map")), | |
menuItem("Claims Over Time", tabName = "claimTS", icon = icon("line-chart"), | |
menuSubItem("Time Series", tabName = "month_and_day"), | |
menuSubItem("Month Breakdown", tabName = "facet_wrap")), | |
menuItem("Itemized Breakdown", tabName = "prop", icon = icon("thermometer"), | |
menuSubItem("Airport Heatmap", tabName = "airport_heat"), | |
menuSubItem("Airline Heatmap", tabName = "airline_heat")), | |
menuItem("Flight and Passenger Data", tabName = "flights", icon = icon("plane"), | |
menuSubItem("Airport Flight Data", tabName = "airport_table"), | |
menuSubItem("Airline Flight Data", tabName = "airline_table")), | |
menuItem("Claim Rates", tabName = "rates", icon = icon("rocket"), | |
menuSubItem("Claim Rate by Airport per Year", tabName = "airport_rate"), | |
menuSubItem("Claim Rate by Airline per Year", tabName = "airline_rate"), | |
menuSubItem("Claim Rate by Airline and Type", tabName = "airline_and_type") | |
) | |
) | |
), | |
dashboardBody( | |
tabItems( | |
tabItem(tabName = "map", | |
h2("Geographic Claims"), | |
fluidRow( | |
absolutePanel(top = 10, right = 10, | |
sliderInput("year_range", h4("Select Year"), | |
min = min(leaf_data$Year), | |
max = max(leaf_data$Year), | |
value = 2010:2015) | |
), | |
leafletOutput("map", height = 600, width = "100%") | |
) | |
), | |
tabItem(tabName = "month_and_day", | |
h2("Time Series of Claims"), | |
fluidRow( | |
box(width = 12, | |
dygraphOutput("timeSeries")) | |
) | |
), | |
tabItem(tabName = "facet_wrap", | |
h2("Average Claim Type Per Month"), | |
fluidRow(box( | |
selectInput("general_year", h4("Select Year"), | |
choices = sort(unique(by_month$Year))) | |
)), | |
fluidRow( | |
box(width = 12, | |
plotOutput("facetPlot")) | |
) | |
), | |
tabItem(tabName = "airport_table", | |
h2("Airport Flight Data for 2014-2015"), | |
fluidRow(box( | |
DT::dataTableOutput('airportTable'), width = 12) | |
) | |
), | |
tabItem(tabName = "airline_table", | |
h2("Airline Flight Data for 2014-2015"), | |
fluidRow(box( | |
DT::dataTableOutput('airlineTable'), width = 12) | |
) | |
), | |
tabItem(tabName = "airport_heat", | |
h2("Airport Heatmap"), | |
fluidRow( | |
absolutePanel(top = 125, left = 250, | |
selectInput('airport_year', h4("Select Year"), | |
choices = sort(unique(items_by_airport$Year)))) | |
), | |
fluidRow( | |
absolutePanel(top = 125, right = 10, | |
selectInput('disposition', h4("Select Status"), | |
choices = sort(unique(items_by_airport$Disposition)))) | |
), | |
fluidRow( | |
absolutePanel(top = 200, width = 12, | |
box(status = "success", | |
plotlyOutput("airportItems"))) | |
) | |
), | |
tabItem(tabName = "airline_heat", | |
h2("Airline Heatmap"), | |
fluidRow(absolutePanel(top = 125, left = 250, | |
selectInput("airline_year", h4("Select Year"), | |
choices = sort(unique(items_by_airline$Year))) | |
)), | |
fluidRow(absolutePanel(top = 125, right = 10, | |
selectInput("disposition2", h4("Select Status"), | |
choices = sort(unique(items_by_airline$Disposition))) | |
)), | |
fluidRow( | |
absolutePanel(top = 200, width = 12, | |
box(status = "success", | |
plotlyOutput("airlineItems")) | |
) | |
) | |
), | |
tabItem(tabName = "airport_rate", | |
h2("Claim Rates by Airport"), | |
plotOutput("airportRate") | |
), | |
tabItem(tabName = "airline_rate", | |
h2("Claim Rates by Airline"), | |
plotOutput("airlineRate") | |
), | |
tabItem(tabName = "airline_and_type", | |
h2("Claim Rates by Airline and Type"), | |
plotOutput("airlineAndType") | |
) | |
) | |
) | |
) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment