Skip to content

Instantly share code, notes, and snippets.

@ZipporahPolinskyNagel
Created February 5, 2018 04:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ZipporahPolinskyNagel/d6fe5726451a621e3a11f62757e02139 to your computer and use it in GitHub Desktop.
Save ZipporahPolinskyNagel/d6fe5726451a621e3a11f62757e02139 to your computer and use it in GitHub Desktop.
Zipporah's Shiny Project
library(ggplot2)
library(dplyr)
library(data.table)
library(tidyr)
# Load the raw charge records; `charges` keeps the untouched import and
# `charges.df` is the working copy that the rest of the app reads.
charges <- read.csv("charges.csv", stringsAsFactors = FALSE)
charges.df <- as.data.frame(charges)

# Lower-case the names of the first 23 columns only.
leading_cols <- 1:23
names(charges.df)[leading_cols] <- tolower(names(charges.df)[leading_cols])

# Parse the violation date (month/day/4-digit-year) and derive numeric
# month and year columns from it.
violation_dates <- as.Date(charges.df$violation.date, "%m/%d/%Y")
charges.df$violation.date <- violation_dates
charges.df$month <- as.numeric(month(violation_dates))
charges.df$year <- as.numeric(year(violation_dates))

# Split "industry" on " - " into name and id, then split "charge" on " -"
# into id and description.
# TODO remove blank industry id at 70 locations for Other (industry_name)
charges.df <- charges.df %>%
  separate(col = industry,
           into = c("industry.name", "industry.id"),
           sep = " \\- ") %>%
  separate(col = charge,
           into = c("charge.id", "charge.desc"),
           sep = " \\-")

# Charge category: drop everything up to and including "§ " from the charge
# id, then strip any parenthesised parts together with surrounding spaces.
after_section_sign <- gsub("..+§ ", "", charges.df$charge.id)
charges.df$charge.cat <- gsub(" *\\(.*?\\) *", "", after_section_sign)
#fix boro casing
# Upper-case the first character of each string, leaving the rest unchanged.
#
# s: character vector (other atomic types are coerced with as.character()).
# Returns a character vector of the same length. Empty strings stay empty and
# missing values stay NA (plain paste() would otherwise spell them "NANA").
capFirst <- function(s) {
  s <- as.character(s)
  capitalized <- paste0(toupper(substring(s, 1, 1)), substring(s, 2))
  capitalized[is.na(s)] <- NA_character_
  capitalized
}
# Normalise borough casing (all lower case, then first letter upper case).
charges.df$borough <- capFirst(tolower(charges.df$borough))

# Keep only rows with a non-empty borough whose name does not contain
# "Outside".
charges.df <- charges.df %>%
  filter(!grepl('Outside', borough), borough != "")
library(shiny)
#server code
# Packages for the word cloud. They are attached once here rather than on
# every call of resetChargeData(). tm is attached explicitly because
# Corpus(), tm_map(), TermDocumentMatrix() and the text transformers used
# below come from it.
library(tm)
library(wordcloud) # this requires the tm and NLP packages
library(devtools)  # not referenced below; kept so the attached set is unchanged
library(SnowballC) # only needed if the stemDocument step is re-enabled

# Build a word-frequency table from free text.
#
# descriptions: character vector; all elements are pooled into one document.
# Returns a data frame with columns `word` (character) and `freq` (numeric),
# sorted by decreasing frequency.
buildWordFrequencies <- function(descriptions) {
  entire_desc_text <- paste(descriptions, collapse = " ")
  docs <- Corpus(VectorSource(entire_desc_text))

  # Replace separator characters with spaces before tokenising.
  to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  for (pattern in c("/", "@", "\\|")) {
    docs <- tm_map(docs, to_space, pattern)
  }

  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # App-specific stop words.
  docs <- tm_map(docs, removeWords, c("near", "required", "upon", "use"))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)
  # docs <- tm_map(docs, stemDocument)

  term_counts <- rowSums(as.matrix(TermDocumentMatrix(docs)))
  term_counts <- sort(term_counts, decreasing = TRUE)
  data.frame(
    # as.character() keeps the column present when there are no terms at all.
    word = as.character(names(term_counts)),
    freq = term_counts,
    stringsAsFactors = FALSE
  )
}

# Summarise the charge data for the most frequent industries and charge types.
#
# charge_count:   how many of the most frequent charge ids to keep.
# industry_count: how many of the most frequent industries to keep.
#
# Reads the global data frame `charges.df`. "Most frequent" is measured by
# number of rows; ties at the cut-off are all kept (top_n() semantics).
#
# Returns a named list:
#   charge_by_boro     - charge totals per borough, largest first
#   charge_by_industry - charge totals per industry, largest first
#   charge_by_chargeid - charge totals per charge id with one description each
#   word_freq          - word/frequency table for the word cloud
resetChargeData <- function(charge_count, industry_count) {
  top_industries <- charges.df %>%
    group_by(industry.name) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    top_n(industry_count, count)

  # Charge ids are ranked over all rows, not only the top industries.
  top_chargeids <- charges.df %>%
    group_by(charge.id) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    top_n(charge_count, count)

  charges.df.filtered <- charges.df %>%
    semi_join(top_industries, by = "industry.name") %>%
    semi_join(top_chargeids, by = "charge.id")

  charge_by_boro <- charges.df.filtered %>%
    group_by(borough) %>%
    summarize(count = sum(charge.count)) %>%
    arrange(desc(count))

  charge_by_industry <- charges.df.filtered %>%
    group_by(industry.name) %>%
    summarize(count = sum(charge.count)) %>%
    arrange(desc(count)) %>%
    top_n(industry_count, count)

  # The weight column is named explicitly: without it top_n() ranks by the
  # last column, which here would be `description` rather than `count`.
  charge_by_chargeid <- charges.df.filtered %>%
    group_by(charge.id) %>%
    summarize(count = sum(charge.count), description = first(charge.desc)) %>%
    arrange(desc(count)) %>%
    top_n(charge_count, count)

  # NOTE(review): the word cloud text is read from a `description` column of
  # the charge rows, while the text split out of `charge` is `charge.desc`.
  # Confirm which column is intended.
  word_freq <- buildWordFrequencies(as.vector(charges.df.filtered$description))

  list(
    charge_by_boro = charge_by_boro,
    charge_by_industry = charge_by_industry,
    charge_by_chargeid = charge_by_chargeid,
    word_freq = word_freq
  )
}

# Server logic: a bar chart by the selected drill-down, a word cloud, and a
# table of charge types, all driven by the two slider inputs.
shinyServer(function(input, output) {
  # Computed once per change of either slider and shared by all outputs.
  # The tables are taken from the value returned by resetChargeData(); its
  # local variables are not visible out here.
  charge_data <- reactive({
    resetChargeData(input$charge_number, input$industry_number)
  })

  output$bar <- renderPlot({
    tables <- charge_data()
    view <- switch(
      input$drilldown,
      "Borough" = list(
        data = tables$charge_by_boro,
        column = "borough",
        label = "Borough"
      ),
      "Industry" = list(
        data = tables$charge_by_industry,
        column = "industry.name",
        label = "Industry"
      ),
      "Charge type" = list(
        data = tables$charge_by_chargeid,
        column = "charge.id",
        label = "Charge ID"
      )
    )
    req(view)

    # Put the category and its count side by side so both aesthetics are
    # resolved from the plot data.
    plot_data <- data.frame(
      label = view$data[[view$column]],
      count = view$data$count,
      stringsAsFactors = FALSE
    )

    ggplot(data = plot_data,
           aes(x = reorder(label, count), y = count, fill = count)) +
      geom_bar(stat = "identity") +
      labs(x = view$label,
           y = "Number of charges") +
      theme_classic() +
      coord_flip()
  })

  # repeatable() makes the cloud layout the same on every redraw.
  wordcloud_rep <- repeatable(wordcloud)

  output$words <- renderPlot({
    word_freq <- charge_data()$word_freq
    wordcloud_rep(words = word_freq$word, freq = word_freq$freq,
                  min.freq = 2, scale = c(2.5, .25),
                  max.words = 500, random.order = FALSE, rot.per = 0.15,
                  colors = brewer.pal(8, "Dark2"))
  })

  output$charges <- renderTable({
    charge_data()$charge_by_chargeid
  })
})
#
# This is the user-interface definition of a Shiny web application. You can
# run the application by clicking 'Run App' above.
library(shiny)
# Define UI for application that draws bar graph
# Sidebar control: which dimension the bar chart is broken down by.
drilldown_buttons <- radioButtons(
  "drilldown",
  label = "Drill down by ",
  choices = c("Borough", "Industry", "Charge type"),
  selected = "Borough"
)

# Sidebar control: how many of the top industries to include.
industry_slider <- sliderInput(
  "industry_number",
  label = "No. Industries",
  min = 1, max = 50,
  value = 20, step = 1,
  pre = "", sep = "",
  animate = TRUE
)

# Sidebar control: how many of the top charge types to include.
charge_slider <- sliderInput(
  "charge_number",
  label = "No. Charge types",
  min = 1, max = 50,
  value = 20, step = 1,
  pre = "", sep = "",
  animate = TRUE
)

# One tab per output rendered by the server.
output_tabs <- tabsetPanel(
  tabPanel("Plots", plotOutput("bar", width = "100%")),
  tabPanel("Word cloud", plotOutput("words", width = "100%")),
  #tabPanel("Over time", tableOutput("table")),
  tabPanel("Charge types", tableOutput("charges"))
)

# Page layout: title, sidebar with the three controls, tabbed main panel.
shinyUI(fluidPage(
  theme = "bootstrap.css",
  headerPanel("Department of Consumer Affairs charges for 2016-2017"),
  sidebarPanel(
    drilldown_buttons,
    #checkboxGroupInput("year", label = "Year",
    #                   choices = c("2016","2017"),
    #                   selected = "2016"),
    industry_slider,
    charge_slider
  ),
  mainPanel(output_tabs)
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment