# ZipporahPolinskyNagel/global.R

## global.R
library(ggplot2)
library(dplyr)
library(data.table)
library(tidyr)


# Read the raw charges export; text columns stay character, not factor.
charges <- read.csv("charges.csv", stringsAsFactors = FALSE)
charges.df <- as.data.frame(charges)

# Lower-case the first 23 column names so later code can refer to them
# uniformly.
names(charges.df)[1:23] <- tolower(names(charges.df)[1:23])

charges.df <- charges.df %>%
  # Parse the violation date (month/day/4-digit year) and derive numeric
  # month and year columns from it.
  mutate(
    violation.date = as.Date(violation.date, format = "%m/%d/%Y"),
    month = as.numeric(month(violation.date)),
    year = as.numeric(year(violation.date))
  ) %>%
  # Split "industry" on " - " into a name and an id.
  # TODO remove blank industry id at 70 locations for Other (industry_name)
  separate(
    col = industry,
    into = c("industry.name", "industry.id"),
    sep = " \\- "
  ) %>%
  # Split "charge" on " -" into an id and a free-text description.
  separate(
    col = charge,
    into = c("charge.id", "charge.desc"),
    sep = " \\-"
  ) %>%
  # Charge category: the charge id with everything through the "§ " marker
  # removed, then with any parenthesised parts (and adjacent spaces) stripped.
  mutate(
    charge.cat = gsub(" *\\(.*?\\) *", "", gsub("..+§ ", "", charge.id))
  )

#fix boro casing
# Upper-case the first character of each element of `s`, leaving the rest of
# each string untouched. Vectorised over `s`; empty strings stay empty.
capFirst <- function(s) {
  initial <- toupper(substring(s, 1, 1))
  remainder <- substring(s, 2)
  paste0(initial, remainder)
}

# Normalise borough casing: lower-case everything, then capitalise the first
# letter.
charges.df$borough <- capFirst(tolower(charges.df$borough))

# Drop rows whose borough contains "Outside" and rows with a blank borough.
charges.df <- filter(
  charges.df,
  !grepl("Outside", borough),
  borough != ""
)

## server.R
library(shiny)


#server code
# Text-mining packages for the word cloud. They are attached once, when this
# file is sourced, rather than on every recomputation. tm supplies Corpus(),
# tm_map(), TermDocumentMatrix(), stopwords() and the remove*/strip* helpers
# used below.
library(tm)
library(wordcloud)
library(devtools)
library(SnowballC)

# Build a word-frequency table from a vector of charge descriptions.
#
# descriptions: character vector of free-text descriptions; NA entries are
#   ignored.
# Returns a data frame with columns `word` and `freq`, ordered by decreasing
# frequency. It has zero rows when there is no text to analyse.
wordFrequencies <- function(descriptions) {
  descriptions <- descriptions[!is.na(descriptions)]
  if (length(descriptions) == 0) {
    return(data.frame(word = character(0), freq = numeric(0)))
  }

  # The whole selection is analysed as a single document.
  docs <- Corpus(VectorSource(paste(descriptions, collapse = " ")))

  # Turn "/", "@" and "|" into spaces so the words around them are split.
  to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  docs <- tm_map(docs, to_space, "[/@|]")

  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # Extra stop words on top of the standard English list.
  docs <- tm_map(docs, removeWords, c("near", "required", "upon", "use"))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)

  term_counts <- sort(
    rowSums(as.matrix(TermDocumentMatrix(docs))),
    decreasing = TRUE
  )
  data.frame(word = names(term_counts), freq = term_counts)
}

# Compute every summary the UI displays for the current slider settings.
#
# charge_count:   number of most frequent charge ids to keep.
# industry_count: number of most frequent industries to keep.
#
# Reads the global `charges.df`. Returns a named list:
#   charge_by_boro     - total charge count per borough
#   charge_by_industry - total charge count per industry
#   charge_by_chargeid - total charge count and a description per charge id
#   d                  - word/frequency table of the charge descriptions
# Everything is returned rather than assigned, because assignments made
# inside a function are not visible to its callers.
resetChargeData <- function(charge_count, industry_count) {
  # The "top" industries and charge ids are ranked by number of rows.
  top_industries <- charges.df %>%
    group_by(industry.name) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    top_n(industry_count, count)

  top_chargeids <- charges.df %>%
    group_by(charge.id) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    top_n(charge_count, count)

  charges.df.filtered <- charges.df %>%
    semi_join(top_industries, by = "industry.name") %>%
    semi_join(top_chargeids, by = "charge.id")

  charge_by_boro <- charges.df.filtered %>%
    group_by(borough) %>%
    summarize(count = sum(charge.count)) %>%
    arrange(desc(count))

  charge_by_industry <- charges.df.filtered %>%
    group_by(industry.name) %>%
    summarize(count = sum(charge.count)) %>%
    arrange(desc(count)) %>%
    top_n(industry_count, count)

  # `wt = count` is explicit: without it top_n() ranks by the last column,
  # which here would be the description text.
  charge_by_chargeid <- charges.df.filtered %>%
    group_by(charge.id) %>%
    summarize(
      count = sum(charge.count),
      description = first(charge.desc)
    ) %>%
    arrange(desc(count)) %>%
    top_n(charge_count, count)

  # `charge.desc` is the description column created when `charge` is split;
  # the filtered frame has no `description` column of its own.
  list(
    charge_by_boro = charge_by_boro,
    charge_by_industry = charge_by_industry,
    charge_by_chargeid = charge_by_chargeid,
    d = wordFrequencies(charges.df.filtered$charge.desc)
  )
}

shinyServer(function(input, output) {

  # One shared, cached computation per slider change, used by all outputs.
  charge_data <- reactive({
    resetChargeData(input$charge_number, input$industry_number)
  })

  # Horizontal bar chart of charge counts for the selected drill-down.
  output$bar <- renderPlot({
    summaries <- charge_data()
    view <- switch(
      input$drilldown,
      "Borough" = list(
        data = summaries$charge_by_boro,
        column = "borough",
        label = "Borough"
      ),
      "Industry" = list(
        data = summaries$charge_by_industry,
        column = "industry.name",
        label = "Industry"
      ),
      "Charge type" = list(
        data = summaries$charge_by_chargeid,
        column = "charge.id",
        label = "Charge ID"
      )
    )

    # Order the bars by count.
    plot_data <- view$data
    plot_data$category <- reorder(plot_data[[view$column]], plot_data$count)

    ggplot(plot_data, aes(x = category, y = count, fill = count)) +
      geom_bar(stat = "identity") +
      labs(x = view$label, y = "Number of charges") +
      theme_classic() +
      coord_flip()
  })

  # repeatable() keeps the word placement stable between redraws.
  wordcloud_rep <- repeatable(wordcloud)

  output$words <- renderPlot({
    word_freq <- charge_data()$d
    validate(need(
      nrow(word_freq) > 0,
      "No charge descriptions available for this selection."
    ))
    wordcloud_rep(
      words = word_freq$word, freq = word_freq$freq, min.freq = 2,
      scale = c(2.5, .25), max.words = 500, random.order = FALSE,
      rot.per = 0.15, colors = brewer.pal(8, "Dark2")
    )
  })

  output$charges <- renderTable({
    charge_data()$charge_by_chargeid
  })

})

## ui.R
#
# This is the user-interface definition of a Shiny web application. You can
# run the application by clicking 'Run App' above.
library(shiny)

# Define UI for application that draws bar graph
# Page layout: a sidebar with the drill-down selector and the two "top N"
# sliders, and a main panel with one tab per output.
shinyUI(local({

  # Both sliders share range, default and behaviour; only the input id and
  # the label differ.
  topNSlider <- function(id, label) {
    sliderInput(id,
      label = label,
      min = 1, max = 50,
      value = 20, step = 1,
      pre = "", sep = "",
      animate = TRUE
    )
  }

  controls <- sidebarPanel(
    # choose the drilldown
    radioButtons("drilldown",
      label = "Drill down by ",
      choices = c("Borough", "Industry", "Charge type"),
      selected = "Borough"
    ),

    #checkboxGroupInput("year", label = "Year",
    #                   choices = c("2016","2017"),
    #                   selected = "2016"),

    topNSlider("industry_number", "No. Industries"),
    topNSlider("charge_number", "No. Charge types")
  )

  results <- mainPanel(
    tabsetPanel(
      tabPanel("Plots", plotOutput("bar", width = "100%")),
      tabPanel("Word cloud", plotOutput("words", width = "100%")),
      #tabPanel("Over time", tableOutput("table")),
      tabPanel("Charge types", tableOutput("charges"))
    )
  )

  fluidPage(
    theme = "bootstrap.css",
    # Application title
    headerPanel("Department of Consumer Affairs charges for 2016-2017"),
    controls,
    results
  )
}))
	library(ggplot2)
	library(dplyr)
	library(data.table)
	library(tidyr)


	# Read the raw charges export; text columns stay character, not factor.
	charges <- read.csv("charges.csv", stringsAsFactors = FALSE)
	charges.df <- as.data.frame(charges)

	# Lower-case the first 23 column names so later code can refer to them
	# uniformly.
	names(charges.df)[1:23] <- tolower(names(charges.df)[1:23])

	# Parse the violation date (month/day/4-digit year) and derive numeric
	# month and year columns from it.
	charges.df$violation.date <- as.Date(charges.df$violation.date, "%m/%d/%Y")
	charges.df$month <- as.numeric(month(charges.df$violation.date))
	charges.df$year <- as.numeric(year(charges.df$violation.date))

	# Split "industry" on " - " into a name and an id.
	# TODO remove blank industry id at 70 locations for Other (industry_name)
	charges.df <- separate(
	  data = charges.df, col = industry,
	  into = c("industry.name", "industry.id"), sep = " \\- "
	)

	# Split "charge" on " -" into an id and a free-text description.
	charges.df <- separate(
	  data = charges.df, col = charge,
	  into = c("charge.id", "charge.desc"), sep = " \\-"
	)

	# Charge category: drop everything through the "§ " marker, then strip
	# every parenthesised part together with the spaces around it. The lazy
	# ".*?" lets parentheticals of any length match, each one separately.
	charges.df$charge.cat <- gsub("..+§ ", "", charges.df$charge.id)
	charges.df$charge.cat <- gsub(" *\\(.*?\\) *", "", charges.df$charge.cat)

	#fix boro casing
	# Upper-case the first character of each element of `s`, leaving the rest
	# of each string untouched. Vectorised over `s`; empty strings stay empty.
	capFirst <- function(s) {
	  initial <- toupper(substring(s, 1, 1))
	  remainder <- substring(s, 2)
	  paste0(initial, remainder)
	}

	# Normalise borough casing: lower-case everything, then capitalise the
	# first letter.
	charges.df$borough <- capFirst(tolower(charges.df$borough))

	# Drop rows whose borough contains "Outside" and rows with a blank borough.
	charges.df <- filter(
	  charges.df,
	  !grepl("Outside", borough),
	  borough != ""
	)
	library(shiny)


	#server code
	# Text-mining packages for the word cloud. They are attached once, when
	# this file is sourced, rather than on every recomputation. tm supplies
	# Corpus(), tm_map(), TermDocumentMatrix(), stopwords() and the
	# remove*/strip* helpers used below.
	library(tm)
	library(wordcloud)
	library(devtools)
	library(SnowballC)

	# Build a word-frequency table from a vector of charge descriptions.
	#
	# descriptions: character vector of free-text descriptions; NA entries are
	#   ignored.
	# Returns a data frame with columns `word` and `freq`, ordered by
	# decreasing frequency. It has zero rows when there is no text to analyse.
	wordFrequencies <- function(descriptions) {
	  descriptions <- descriptions[!is.na(descriptions)]
	  if (length(descriptions) == 0) {
	    return(data.frame(word = character(0), freq = numeric(0)))
	  }

	  # The whole selection is analysed as a single document.
	  docs <- Corpus(VectorSource(paste(descriptions, collapse = " ")))

	  # Turn "/", "@" and "|" into spaces so the words around them are split.
	  to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
	  docs <- tm_map(docs, to_space, "[/@|]")

	  docs <- tm_map(docs, content_transformer(tolower))
	  docs <- tm_map(docs, removeNumbers)
	  docs <- tm_map(docs, removeWords, stopwords("english"))
	  # Extra stop words on top of the standard English list.
	  docs <- tm_map(docs, removeWords, c("near", "required", "upon", "use"))
	  docs <- tm_map(docs, removePunctuation)
	  docs <- tm_map(docs, stripWhitespace)

	  term_counts <- sort(
	    rowSums(as.matrix(TermDocumentMatrix(docs))),
	    decreasing = TRUE
	  )
	  data.frame(word = names(term_counts), freq = term_counts)
	}

	# Compute every summary the UI displays for the current slider settings.
	#
	# charge_count:   number of most frequent charge ids to keep.
	# industry_count: number of most frequent industries to keep.
	#
	# Reads the global `charges.df`. Returns a named list:
	#   charge_by_boro     - total charge count per borough
	#   charge_by_industry - total charge count per industry
	#   charge_by_chargeid - total charge count and a description per charge id
	#   d                  - word/frequency table of the charge descriptions
	# Everything is returned rather than assigned, because assignments made
	# inside a function are not visible to its callers.
	resetChargeData <- function(charge_count, industry_count) {
	  # The "top" industries and charge ids are ranked by number of rows.
	  top_industries <- charges.df %>%
	    group_by(industry.name) %>%
	    summarize(count = n()) %>%
	    arrange(desc(count)) %>%
	    top_n(industry_count, count)

	  top_chargeids <- charges.df %>%
	    group_by(charge.id) %>%
	    summarize(count = n()) %>%
	    arrange(desc(count)) %>%
	    top_n(charge_count, count)

	  charges.df.filtered <- charges.df %>%
	    semi_join(top_industries, by = "industry.name") %>%
	    semi_join(top_chargeids, by = "charge.id")

	  charge_by_boro <- charges.df.filtered %>%
	    group_by(borough) %>%
	    summarize(count = sum(charge.count)) %>%
	    arrange(desc(count))

	  charge_by_industry <- charges.df.filtered %>%
	    group_by(industry.name) %>%
	    summarize(count = sum(charge.count)) %>%
	    arrange(desc(count)) %>%
	    top_n(industry_count, count)

	  # `wt = count` is explicit: without it top_n() ranks by the last column,
	  # which here would be the description text.
	  charge_by_chargeid <- charges.df.filtered %>%
	    group_by(charge.id) %>%
	    summarize(
	      count = sum(charge.count),
	      description = first(charge.desc)
	    ) %>%
	    arrange(desc(count)) %>%
	    top_n(charge_count, count)

	  # `charge.desc` is the description column created when `charge` is split;
	  # the filtered frame has no `description` column of its own.
	  list(
	    charge_by_boro = charge_by_boro,
	    charge_by_industry = charge_by_industry,
	    charge_by_chargeid = charge_by_chargeid,
	    d = wordFrequencies(charges.df.filtered$charge.desc)
	  )
	}

	shinyServer(function(input, output) {

	  # One shared, cached computation per slider change, used by all outputs.
	  charge_data <- reactive({
	    resetChargeData(input$charge_number, input$industry_number)
	  })

	  # Horizontal bar chart of charge counts for the selected drill-down.
	  output$bar <- renderPlot({
	    summaries <- charge_data()
	    view <- switch(
	      input$drilldown,
	      "Borough" = list(
	        data = summaries$charge_by_boro,
	        column = "borough",
	        label = "Borough"
	      ),
	      "Industry" = list(
	        data = summaries$charge_by_industry,
	        column = "industry.name",
	        label = "Industry"
	      ),
	      "Charge type" = list(
	        data = summaries$charge_by_chargeid,
	        column = "charge.id",
	        label = "Charge ID"
	      )
	    )

	    # Order the bars by count.
	    plot_data <- view$data
	    plot_data$category <- reorder(plot_data[[view$column]], plot_data$count)

	    ggplot(plot_data, aes(x = category, y = count, fill = count)) +
	      geom_bar(stat = "identity") +
	      labs(x = view$label, y = "Number of charges") +
	      theme_classic() +
	      coord_flip()
	  })

	  # repeatable() keeps the word placement stable between redraws.
	  wordcloud_rep <- repeatable(wordcloud)

	  output$words <- renderPlot({
	    word_freq <- charge_data()$d
	    validate(need(
	      nrow(word_freq) > 0,
	      "No charge descriptions available for this selection."
	    ))
	    wordcloud_rep(
	      words = word_freq$word, freq = word_freq$freq, min.freq = 2,
	      scale = c(2.5, .25), max.words = 500, random.order = FALSE,
	      rot.per = 0.15, colors = brewer.pal(8, "Dark2")
	    )
	  })

	  output$charges <- renderTable({
	    charge_data()$charge_by_chargeid
	  })

	})
	#
	# This is the user-interface definition of a Shiny web application. You can
	# run the application by clicking 'Run App' above.
	library(shiny)

	# Define UI for application that draws bar graph
	# Page layout: a sidebar with the drill-down selector and the two "top N"
	# sliders, and a main panel with one tab per output.
	shinyUI(local({

	  # Both sliders share range, default and behaviour; only the input id and
	  # the label differ.
	  topNSlider <- function(id, label) {
	    sliderInput(id,
	      label = label,
	      min = 1, max = 50,
	      value = 20, step = 1,
	      pre = "", sep = "",
	      animate = TRUE
	    )
	  }

	  controls <- sidebarPanel(
	    # choose the drilldown
	    radioButtons("drilldown",
	      label = "Drill down by ",
	      choices = c("Borough", "Industry", "Charge type"),
	      selected = "Borough"
	    ),

	    #checkboxGroupInput("year", label = "Year",
	    # choices = c("2016","2017"),
	    # selected = "2016"),

	    topNSlider("industry_number", "No. Industries"),
	    topNSlider("charge_number", "No. Charge types")
	  )

	  results <- mainPanel(
	    tabsetPanel(
	      tabPanel("Plots", plotOutput("bar", width = "100%")),
	      tabPanel("Word cloud", plotOutput("words", width = "100%")),
	      #tabPanel("Over time", tableOutput("table")),
	      tabPanel("Charge types", tableOutput("charges"))
	    )
	  )

	  fluidPage(
	    theme = "bootstrap.css",
	    # Application title
	    headerPanel("Department of Consumer Affairs charges for 2016-2017"),
	    controls,
	    results
	  )
	}))