Skip to content

Instantly share code, notes, and snippets.

@arubino322
Created April 30, 2017 20:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arubino322/6e24ec21b76189715ab8ca8da7885ac3 to your computer and use it in GitHub Desktop.
Save arubino322/6e24ec21b76189715ab8ca8da7885ac3 to your computer and use it in GitHub Desktop.
Claims Investigation
### global.R ###
library(dplyr)
library(data.table)
library(tidyr)
library(lubridate)
library(ggplot2)
library(maps)
library(plotly)
library(dygraphs)
library(xts) # as.Date is masked by zoo!!
library(reshape2)
library(googleVis)
library(leaflet)
library(DT)
claims <- read.table("tidy_claims.tsv", header = T, sep = " ") # space separated for some reason
type_df <- claims %>% group_by(Claim.Type) %>%
summarise(n = n(),
overall_ratio = n / nrow(claims))
type_dis <- claims %>% group_by(Claim.Type, Disposition) %>%
summarise(n = n()) %>%
spread(Disposition, n) %>%
rename(Approved = `Approve in Full`)
joined_typedis <- left_join(type_dis, type_df, by = "Claim.Type") %>%
mutate(approved_ratio = Approved / n,
denied_ratio = Deny / n,
settled_ratio = Settle / n) %>%
select(- overall_ratio)
# Adding Total.Claims will make calculations easier
claims <- claims %>% mutate(Total.Claims = Audio.Video +
Automobile.Parts.Accessories +
Baggage.Cases.Purses +
Books.Magazines.Other +
Cameras +
Clothing +
Computer.Accessories +
Cosmetics.Grooming +
Crafting.Hobby +
Currency +
Food.Drink +
Home.Decor +
Household.Items +
Hunting.Fishing.Items +
Jewelry.Watches +
Medical.Science +
Musical.Instruments.Accessories +
Office.Equipment.Supplies +
Other +
Outdoor.Items +
Personal.Accessories +
Personal.Electronics +
Personal.Navigation +
Sporting.Equipment.Supplies +
Tools.Home.Improvement.Supplies +
Toys.Games)
# How bout by month. Need to create a month column from the incident date
claims$Month_Yr <- format(as.Date(claims$Incident.Date), "%Y-%m")
claims$Year <- year(claims$Incident.Date)
claims$Month.Dot.Year <- gsub("-", ".", claims$Month_Yr)
claims$Month.Dot.Year <- as.numeric(claims$Month.Dot.Year)
by_date <- claims %>% filter(Claim.Type != "Compliment") %>%
group_by(Month_Yr, Claim.Type) %>%
summarise(n = n()) %>%
arrange(Month_Yr)
by_date$Full.Date <- paste0(by_date$Month_Yr, "-01")
by_date$Full.Date <- as.Date(as.character(by_date$Full.Date))
x = by_date
x$Month_Yr <- NULL
x <- as.data.frame(x)
x <- dcast(x, Full.Date ~ Claim.Type, value.var = "n")
x_by_date <- xts(x, order.by = x$Full.Date)
# sort of hard to interpret, except that claims have been pretty steady for the past
# few years. What if we do it by month only.
claims$Month <- month(claims$Incident.Date)
by_month <- claims %>% filter(Claim.Type != "Compliment") %>%
group_by(Month, Claim.Type, Year) %>%
filter(Claim.Type != "Compliment") %>%
summarise(avg_claim_amount = mean(Total.Claims),
median_claim_amount = median(Total.Claims))
# I want to see the counts of each item!! How many of what was lost by Airport
items_by_airport <- claims %>% filter(Airport.Code %in% c("ATL", "LAX", "ORD", "DFW", "JFK",
"DEN", "SFO", "CLT", "LAS", "PHX")) %>%
group_by(Airport.Code, Year, Disposition) %>%
summarise(audio_video = sum(Audio.Video),
automobile = sum(Automobile.Parts.Accessories),
baggage = sum(Baggage.Cases.Purses),
books = sum(Books.Magazines.Other),
cameras = sum(Cameras),
clothing = sum(Clothing),
computer = sum(Computer.Accessories),
cosmetics = sum(Cosmetics.Grooming),
crafting = sum(Crafting.Hobby),
currency = sum(Currency),
food = sum(Food.Drink),
home_decor = sum(Home.Decor),
household_items = sum(Household.Items),
hunting_items = sum(Hunting.Fishing.Items),
jewelry = sum(Jewelry.Watches),
medical = sum(Medical.Science),
music_instruments = sum(Musical.Instruments.Accessories),
office_supplies = sum(Office.Equipment.Supplies),
outdoor_items = sum(Outdoor.Items),
pers_accessories = sum(Personal.Accessories),
pers_electronics = sum(Personal.Electronics),
pers_navigation = sum(Personal.Navigation),
sport_supplies = sum(Sporting.Equipment.Supplies),
home_improve_supplies = sum(Tools.Home.Improvement.Supplies),
toys = sum(Toys.Games),
travel_accessories = sum(Travel.Accessories))
### by airline
items_by_airline <- claims %>% filter(Airline.Name %like% "^American Airlines" |
Airline.Name %like% "Southwest Airlines" |
Airline.Name %like% "Delta Air Lines" |
Airline.Name %like% "UAL" |
Airline.Name %like% "Jet Blue" |
Airline.Name %like% "Alaska Airlines" |
Airline.Name %like% "Spirit Airlines") %>%
group_by(Airline.Name, Year, Disposition) %>%
summarise(audio_video = sum(Audio.Video),
automobile = sum(Automobile.Parts.Accessories),
baggage = sum(Baggage.Cases.Purses),
books = sum(Books.Magazines.Other),
cameras = sum(Cameras),
clothing = sum(Clothing),
computer = sum(Computer.Accessories),
cosmetics = sum(Cosmetics.Grooming),
crafting = sum(Crafting.Hobby),
currency = sum(Currency),
food = sum(Food.Drink),
home_decor = sum(Home.Decor),
household_items = sum(Household.Items),
hunting_items = sum(Hunting.Fishing.Items),
jewelry = sum(Jewelry.Watches),
medical = sum(Medical.Science),
music_instruments = sum(Musical.Instruments.Accessories),
office_supplies = sum(Office.Equipment.Supplies),
outdoor_items = sum(Outdoor.Items),
pers_accessories = sum(Personal.Accessories),
pers_electronics = sum(Personal.Electronics),
pers_navigation = sum(Personal.Navigation),
sport_supplies = sum(Sporting.Equipment.Supplies),
home_improve_supplies = sum(Tools.Home.Improvement.Supplies),
toys = sum(Toys.Games),
travel_accessories = sum(Travel.Accessories))
# create a heatmap with dataframe using plotly.
# Top 5 airline claims: Delta, Southwest, American, UAL, USAir. Now by Airport
# Let's join the lat lon from the other file with the airport codes in this file
latlon <- read.csv("Airport_Codes_mapped_to_Latitude_Longitude_in_the_United_States.csv", header = TRUE)
latlon <- latlon %>% rename(Airport.Code = locationID)
claims <- left_join(claims, latlon, by = "Airport.Code")
lltest <- claims %>% group_by(Latitude, Longitude) %>%
summarise(n = n()) %>%
arrange(desc(n))
ggplot(lltest, aes(x = Latitude, y = Longitude)) + geom_point()
######## compare top10 airports and airlines
top10airports <- read.csv("top10airports.csv", header = T)
# Let's do top10airlines first. Create a dataframe from claims that has total count of
# claims grouped by airline and year for 2015 only. We'll need to gather the data and tidy
# it (rename columns also)
top10airports <- top10airports %>%
rename("2015" = est_2015_flights, "2014" = est_2014_flights) %>%
gather(key = "Year", value = "flights", 3:4)
top10airports$Year <- as.numeric(top10airports$Year)
top10ac <- claims %>% filter(Airport.Code %in% c('ATL','LAX','ORD','DFW',
'JFK','DEN','SFO',
'CLT','LAS','PHX')
& (Year == 2014 | Year == 2015)) %>%
group_by(Airport.Code, Year, Claim.Type) %>%
summarise(total = sum(Total.Claims)) %>%
arrange(Year)
top10ac <- left_join(top10ac, top10airports, by = c("Airport.Code", "Year"))
top10ac <- top10ac %>% mutate(Claim.Rate = total / flights)
top10ac$Year <- as.character(top10ac$Year)
## Now do airlines
# tidy
top10airlines <- read.csv("top10airlines.csv", header = T)
top10airlines <- top10airlines %>% rename("2015" = est_2015_flights, "2014" = est_2014_flights) %>%
gather(key = "Year", value = "flights", 2:3)
# filter claims for top 10 airlines
top10al <- claims %>% filter((as.character(Airline.Name) %like% "^American Airlines" |
Airline.Name %like% "Southwest Airlines" |
Airline.Name %like% "Delta Air Lines" |
Airline.Name %like% "UAL" |
Airline.Name %like% "Jet Blue" |
Airline.Name %like% "Alaska Airlines" |
Airline.Name %like% "Spirit Airlines") &
(Year == 2014 | Year == 2015)) %>%
group_by(Airline.Name, Year, Claim.Type) %>%
summarise(total_claims = sum(Total.Claims))
# join them, but first, trim the white space
for (i in 1:nrow(top10al)) {
trimws(top10al$Airline.Name[i], c("both"))
}
top10al$Year <- as.character(top10al$Year)
jtl <- read.csv("jointop10al.csv", header = T)
# i did this because whitespaces were causing problems with joining and i was too lazy
# to figure out how to fix.
top10al <- jtl
top10al$Year <- as.character(top10al$Year)
# add claim rate
top10al <- top10al %>% mutate(Claim.Rate = total_claims / Flights)
top10al <- top10al %>% select(- concat)
###### leaflet ########
leaf_data <- claims %>% group_by(Latitude, Longitude, Year, Airport.Code) %>%
summarise(all_claims = sum(Total.Claims)) %>%
filter(all_claims >= 25) %>%
arrange(desc(all_claims))
leaf_data$Longitude <- paste0("-",leaf_data$Longitude)
leaf_data$Longitude <- as.numeric(leaf_data$Longitude)
##### renderTables #####
airport_table <- read.csv("airport_table.csv", header = TRUE)
airline_table <- read.csv("airline_table.csv", header = TRUE)
library(shiny)
library(dygraphs)
# Define server logic required to draw a histogram
shinyServer(function(input, output, session) {
# let's get it.
leaf_map <- reactive({
leaf_data %>%
filter(Year %in% input$year_range)
})
output$map <- renderLeaflet({
leaflet(leaf_map()) %>% setView(lng = -98.5556, lat = 39.8097, zoom = 4) %>%
addProviderTiles(providers$Stamen.Terrain) %>%
addCircles(lng = ~Longitude, lat = ~Latitude, weight = 1,
radius = ~(all_claims) * 1000, popup = ~Airport.Code,
label = ~paste0(Airport.Code, ": ", all_claims),
labelOptions = labelOptions(noHide = T))
})
output$timeSeries <- renderDygraph({
dygraph(x_by_date, main = "Claims") %>%
dyRangeSelector()
})
output$facetPlot <- renderPlot({
by_month %>% filter(Year == input$general_year & avg_claim_amount != 0) %>%
ggplot(aes(x = Month, y = avg_claim_amount)) +
geom_histogram(binwidth = 0.2, stat = "identity") + facet_wrap( ~ Claim.Type) +
scale_x_discrete(limits = c("Jan", "Feb", "Mar", "Apr",
"May", "June", "July", "Aug",
"Sept", "Oct", "Nov", "Dec")) +
theme(axis.text.x = element_text(angle = 90))
})
### heatmap 1
output$airportItems <- renderPlotly({
top_air_items <- items_by_airport %>% filter(Year == input$airport_year
& Disposition == input$disposition) %>%
group_by(Airport.Code) %>% select(-Year, -Disposition)
row.names(top_air_items) <- top_air_items$Airport.Code
airport_matrix <- data.matrix(top_air_items)[,2:27]
plot_ly(data = top_air_items,
x = c("ATL", "CLT", "DEN", "DFW", "JFK",
"LAS","LAX", "ORD", "SFO", "PHX"),
y = c("audio_video", "automobile", "baggage",
"books", "cameras", "clothing", "computer",
"cosmetics", "crafting", "currency", "food",
"home_decor", "household_items", "hunting_items",
"jewelry", "medical", "music_instruments",
"office_supplies", "outdoor_items", "pers_accessories",
"pers_electronics", "pers_navigation", "sport_supplies",
"home_improve_supplies", "toys", "travel_accessories"),
z = t(airport_matrix), type = "heatmap") %>%
layout(yaxis = list(tickangle = 30), margin = list(l=100))
})
### for heatmap 2
output$airlineItems <- renderPlotly({
top_line_items <- items_by_airline %>% filter(Year == input$airline_year
& Disposition == input$disposition2) %>%
group_by(Airline.Name) %>% select(-Year, -Disposition)
row.names(top_line_items) <- top_line_items$Airline.Name
airline_matrix <- data.matrix(top_line_items)[,2:27]
plot_ly(data = top_line_items,
x = c("Alaska", "American", "Delta", "JetBlue",
"Southwest", "Spirit","United"),
y = c("audio_video", "automobile", "baggage",
"books", "cameras", "clothing", "computer",
"cosmetics", "crafting", "currency", "food",
"home_decor", "household_items", "hunting_items",
"jewelry", "medical", "music_instruments",
"office_supplies", "outdoor_items", "pers_accessories",
"pers_electronics", "pers_navigation", "sport_supplies",
"home_improve_supplies", "toys", "travel_accessories"),
z = t(airline_matrix), type = "heatmap") %>%
layout(yaxis = list(tickangle = 30), margin = list(l=100))
})
output$airportTable <- renderDataTable({
datatable(airport_table)
})
output$airlineTable <- renderDataTable({
datatable(airline_table)
})
output$airportRate <- renderPlot({
top10ac %>% filter(Airport.Code != 'SFO') %>%
group_by(Airport.Code, Year) %>%
summarise(total_claims = sum(total),
total_flights = mean(flights)) %>%
mutate(claim_rate = total_claims / total_flights) %>%
select(-total_claims, -total_flights) %>%
ggplot(aes(x = Airport.Code, y = claim_rate)) +
geom_col(aes(fill = Year), position = "dodge") +
scale_y_continuous(labels = scales::percent) +
ylab("Scale Rate") + xlab("Airport")
})
output$airlineRate <- renderPlot({
top10al %>% filter(Airline.Name != "Republic Airways") %>%
group_by(Airline.Name, Year) %>%
summarise(Total_Claims = sum(total_claims),
Total_Flights = mean(Flights)) %>%
mutate(claim_rate = Total_Claims / Total_Flights) %>%
select(-Total_Claims, -Total_Flights) %>%
ggplot(aes(x = Airline.Name, y = claim_rate)) +
geom_col(aes(fill = Year), stat = "identity", position = "dodge") +
scale_y_continuous(labels = scales::percent) +
ylab("Claim Rate") + xlab("Airline") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
})
output$airlineAndType <- renderPlot({
top10al %>% filter(Airline.Name != "Republic Airways") %>%
group_by(Airline.Name, Claim.Type) %>%
summarise(Total_Claims = sum(total_claims),
Total_Flights = mean(Flights)) %>%
mutate(claim_rate = Total_Claims / Total_Flights) %>%
select(-Total_Claims, -Total_Flights) %>%
ggplot(aes(Airline.Name, claim_rate)) +
geom_col(aes(fill = Claim.Type), position = "dodge") +
scale_y_continuous(labels = scales::percent) +
ylab("Claim Rate") + xlab("Airline and Claim Type") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
})
})
library(shiny)
library(shinydashboard)
# Define UI for application that draws a histogram
dashboardPage(skin = "blue",
dashboardHeader(title = "Where the Angriest Passengers Are", titleWidth = 400),
dashboardSidebar(
sidebarMenu(
menuItem("Map of Total Claims", tabName = "map", icon = icon("map")),
menuItem("Claims Over Time", tabName = "claimTS", icon = icon("line-chart"),
menuSubItem("Time Series", tabName = "month_and_day"),
menuSubItem("Month Breakdown", tabName = "facet_wrap")),
menuItem("Itemized Breakdown", tabName = "prop", icon = icon("thermometer"),
menuSubItem("Airport Heatmap", tabName = "airport_heat"),
menuSubItem("Airline Heatmap", tabName = "airline_heat")),
menuItem("Flight and Passenger Data", tabName = "flights", icon = icon("plane"),
menuSubItem("Airport Flight Data", tabName = "airport_table"),
menuSubItem("Airline Flight Data", tabName = "airline_table")),
menuItem("Claim Rates", tabName = "rates", icon = icon("rocket"),
menuSubItem("Claim Rate by Airport per Year", tabName = "airport_rate"),
menuSubItem("Claim Rate by Airline per Year", tabName = "airline_rate"),
menuSubItem("Claim Rate by Airline and Type", tabName = "airline_and_type")
)
)
),
dashboardBody(
tabItems(
tabItem(tabName = "map",
h2("Geographic Claims"),
fluidRow(
absolutePanel(top = 10, right = 10,
sliderInput("year_range", h4("Select Year"),
min = min(leaf_data$Year),
max = max(leaf_data$Year),
value = 2010:2015)
),
leafletOutput("map", height = 600, width = "100%")
)
),
tabItem(tabName = "month_and_day",
h2("Time Series of Claims"),
fluidRow(
box(width = 12,
dygraphOutput("timeSeries"))
)
),
tabItem(tabName = "facet_wrap",
h2("Average Claim Type Per Month"),
fluidRow(box(
selectInput("general_year", h4("Select Year"),
choices = sort(unique(by_month$Year)))
)),
fluidRow(
box(width = 12,
plotOutput("facetPlot"))
)
),
tabItem(tabName = "airport_table",
h2("Airport Flight Data for 2014-2015"),
fluidRow(box(
DT::dataTableOutput('airportTable'), width = 12)
)
),
tabItem(tabName = "airline_table",
h2("Airline Flight Data for 2014-2015"),
fluidRow(box(
DT::dataTableOutput('airlineTable'), width = 12)
)
),
tabItem(tabName = "airport_heat",
h2("Airport Heatmap"),
fluidRow(
absolutePanel(top = 125, left = 250,
selectInput('airport_year', h4("Select Year"),
choices = sort(unique(items_by_airport$Year))))
),
fluidRow(
absolutePanel(top = 125, right = 10,
selectInput('disposition', h4("Select Status"),
choices = sort(unique(items_by_airport$Disposition))))
),
fluidRow(
absolutePanel(top = 200, width = 12,
box(status = "success",
plotlyOutput("airportItems")))
)
),
tabItem(tabName = "airline_heat",
h2("Airline Heatmap"),
fluidRow(absolutePanel(top = 125, left = 250,
selectInput("airline_year", h4("Select Year"),
choices = sort(unique(items_by_airline$Year)))
)),
fluidRow(absolutePanel(top = 125, right = 10,
selectInput("disposition2", h4("Select Status"),
choices = sort(unique(items_by_airline$Disposition)))
)),
fluidRow(
absolutePanel(top = 200, width = 12,
box(status = "success",
plotlyOutput("airlineItems"))
)
)
),
tabItem(tabName = "airport_rate",
h2("Claim Rates by Airport"),
plotOutput("airportRate")
),
tabItem(tabName = "airline_rate",
h2("Claim Rates by Airline"),
plotOutput("airlineRate")
),
tabItem(tabName = "airline_and_type",
h2("Claim Rates by Airline and Type"),
plotOutput("airlineAndType")
)
)
)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment