## Political classifier, largely adapted from Machine Learning for Hackers.
# File-Name: polClassify.R
# Edited: 2013-03-20
# Orig.Author: Drew Conway (drew.conway@nyu.edu)
#
# Packages Used: tm, ggplot2, stringr
#
# All source code is copyright (c) 2012, under the Simplified BSD License.
# For more information on the Simplified BSD License see: http://www.opensource.org/licenses/bsd-license.php
# All images and materials produced by this code are licensed under the Creative Commons
# Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/
# All rights reserved.
# NOTE: If you are running this in the R console you must use the 'setwd' command to set the
# working directory for the console to wherever you have saved this file prior to running.
# Otherwise you will see errors when loading data or saving figures!
## modified by Alex Hanna (alex.hanna@gmail.com) for use with classifying political polarization
# Load libraries
library(tm)
library(ggplot2)
library(stringr)
# Column names for the tab-separated tweet dump: tweet fields first,
# then the corresponding fields of the retweeted status (rt.*)
colNames <- c(
  "status_id", "created_at", "text", "source", "coords",
  "user.id", "user.name", "user.screen_name", "user.level",
  "user.description", "user.location", "user.url",
  "user.followers_count", "user.friends_count",
  "user.listed_count", "user.statuses_count",
  "rt.status_id", "rt.created_at", "rt.text", "rt.source", "rt.coords",
  "rt.user.id", "rt.user.name", "rt.user.screen_name", "rt.user.level",
  "rt.user.description", "rt.user.location", "rt.user.url",
  "rt.user.followers_count", "rt.user.friends_count",
  "rt.user.listed_count", "rt.user.statuses_count"
)
# Return a single-element vector containing just the message body.
# This is a very simple approach, as we are only using words as
# features. (Legacy helper from the original email classifier: it
# assumes the body begins after the first blank line of the file.)
get.msg <- function(path)
{
  con <- file(path, open = "rt", encoding = "latin1")
  text <- readLines(con)
  # The message body always begins after the first blank line
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  close(con)
  return(paste(msg, collapse = "\n"))
}
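# Usage sketch: the path below is hypothetical; any plain-text file
# with a header block, a blank line, and then a body would work.
# msg.body <- get.msg("data/example_message.txt")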
# Create a TermDocumentMatrix (TDM) from a vector of documents.
# The TDM control can be modified, and the sparsity level can be
# altered. This TDM is used to create the feature set with which we
# train our classifier.
get.tdm <- function(doc.vec)
{
  control <- list(stopwords = TRUE,
                  removePunctuation = TRUE,
                  removeNumbers = TRUE,
                  minDocFreq = 2)
  doc.corpus <- Corpus(VectorSource(doc.vec))
  doc.tdm <- TermDocumentMatrix(doc.corpus, control)
  return(doc.tdm)
}
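# Quick sketch of get.tdm on a toy document vector (the strings are
# illustrative only); Terms() lists the terms the TDM retained after
# stopword, punctuation, and number removal.
toy.tdm <- get.tdm(c("recall the governor", "the governor responds"))
Terms(toy.tdm)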
# This function takes a file path to a message file and a string,
# the term parameter, and returns the count of that term in
# the message body.
count.word <- function(path, term)
{
  msg <- get.msg(path)
  msg.corpus <- Corpus(VectorSource(msg))
  # Hard-coded TDM control
  control <- list(stopwords = TRUE,
                  removePunctuation = TRUE,
                  removeNumbers = TRUE)
  msg.tdm <- TermDocumentMatrix(msg.corpus, control)
  word.freq <- rowSums(as.matrix(msg.tdm))
  term.freq <- word.freq[which(names(word.freq) == term)]
  # Use ifelse because term.freq is a zero-length vector when the
  # term does not appear
  return(ifelse(length(term.freq) > 0, term.freq, 0))
}
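# Usage sketch (hypothetical path): count how often "recall" appears
# in one message body.
# count.word("data/example_message.txt", "recall")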
# This is our workhorse function for classifying messages. It takes
# two required parameters: the text of a message to classify, and
# a data frame of the trained data. The function also takes two
# optional parameters. First, a prior over the probability that a message
# belongs to the class (which we set to 0.5, i.e. naive), and a constant
# value for the probability of words in the message that are not in our
# training data. The function returns the naive Bayes probability that
# the given message belongs to the class represented by training.df.
classify.email <- function(msg, training.df, prior = 0.5, c = 1e-6)
{
  # Use the support functions to get the message text into a
  # workable format
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  # Find the intersection of the message's words and the training terms
  msg.match <- intersect(names(msg.freq), training.df$term)
  # Now perform the naive Bayes calculation
  if(length(msg.match) < 1)
  {
    return(prior * c ^ (length(msg.freq)))
  }
  else
  {
    match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
    return(prior * prod(match.probs) * c ^ (length(msg.freq) - length(msg.match)))
  }
}
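# A minimal sketch of classify.email on toy data; the terms and
# occurrence rates below are invented purely for illustration.
toy.train <- data.frame(term = c("recall", "walker"),
                        occurrence = c(0.2, 0.1),
                        stringsAsFactors = FALSE)
# Matched terms contribute their occurrence rates to the product;
# unmatched terms each contribute the small constant c
classify.email("recall walker rally", toy.train)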
# Load the tab-separated tweet data
df <- read.delim('/project/hanna/wirecall/wirecall.highdetail.csv',
                 header = F, sep = "\t",
                 col.names = colNames,
                 stringsAsFactors = F, quote = "")
df$user.screen_name <- tolower(df$user.screen_name)
## left and right users (screen names are lowercase to match the
## lowercased user.screen_name column)
left.users <- c("legaleagle", "spudlovr", "barrett4wi", "wisdems", "repgwenmoore", "markpocan")
right.users <- c("scottkwalker", "govwalker", "turnsc", "gopbrad", "repvos",
                 "tommyforwi", "afpwi", "tppatriots", "vickimckenna", "win8ive",
                 "maciverwisc", "mattbatzel", "scottwalkerhq", "jbvanhollen",
                 "ourcountrypac", "pbartel", "brickm", "wisgop", "rebeccaforreal",
                 "danerepublicans", "the_rga", "prolifewi", "mediatrackers")
left.docs <- df$text[df$user.screen_name %in% left.users]
right.docs <- df$text[df$user.screen_name %in% right.users]
## get lengths
left.length <- length(left.docs)
right.length <- length(right.docs)
## shuffle each set, then take the first half of each for training
left.docs <- sample(left.docs, left.length)
right.docs <- sample(right.docs, right.length)
left.training <- left.docs[1:floor(left.length / 2)]
right.training <- right.docs[1:floor(right.length / 2)]
# With all of our support functions written, we can train the classifier.
# First, create a TermDocumentMatrix from the LEFT training messages
left.tdm <- get.tdm(left.training)
# Create a data frame that provides the feature set from the LEFT training data
left.matrix <- as.matrix(left.tdm)
left.counts <- rowSums(left.matrix)
left.df <- data.frame(cbind(names(left.counts),
                            as.numeric(left.counts)),
                      stringsAsFactors = FALSE)
names(left.df) <- c("term", "frequency")
left.df$frequency <- as.numeric(left.df$frequency)
# Share of training documents in which each term occurs
left.occurrence <- sapply(1:nrow(left.matrix),
                          function(i)
                          {
                            length(which(left.matrix[i, ] > 0)) / ncol(left.matrix)
                          })
# Share of all term occurrences accounted for by each term
left.density <- left.df$frequency / sum(left.df$frequency)
# Add the term density and occurrence rate
left.df <- transform(left.df,
                     density = left.density,
                     occurrence = left.occurrence)
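# Inspection aid (not part of the pipeline): the highest-occurrence
# terms should look recognizably "left" if the training split is sensible
head(left.df[with(left.df, order(-occurrence)), ])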
# Now do the same for the RIGHT training messages
right.tdm <- get.tdm(right.training)
right.matrix <- as.matrix(right.tdm)
right.counts <- rowSums(right.matrix)
right.df <- data.frame(cbind(names(right.counts),
                             as.numeric(right.counts)),
                       stringsAsFactors = FALSE)
names(right.df) <- c("term", "frequency")
right.df$frequency <- as.numeric(right.df$frequency)
right.occurrence <- sapply(1:nrow(right.matrix),
                           function(i)
                           {
                             length(which(right.matrix[i, ] > 0)) / ncol(right.matrix)
                           })
right.density <- right.df$frequency / sum(right.df$frequency)
right.df <- transform(right.df,
                      density = right.density,
                      occurrence = right.occurrence)
## peel off 10,000 random messages to classify
rest.docs <- sample(df$text, 10000)
# Run the classifier against the sampled messages under each model
rest.lefttest <- sapply(rest.docs,
                        function(p) classify.email(p, training.df = left.df))
rest.righttest <- sapply(rest.docs,
                         function(p) classify.email(p, training.df = right.df))
# TRUE means the message scores higher under the left model
rest.res <- rest.lefttest > rest.righttest
summary(rest.res)
# Finally, classify the held-out test data using the classifier developed
# above. The decision rule is: classify a message as left if
# Pr(left | msg) > Pr(right | msg)
left.classifier <- function(msg)
{
  pr.left <- classify.email(msg, left.df, prior = 0.5)
  pr.right <- classify.email(msg, right.df, prior = 0.5)
  return(c(pr.left, pr.right, ifelse(pr.left > pr.right, 1, 0)))
}
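# Sketch: classify one toy message (the string is illustrative only).
# Returns c(Pr.Left, Pr.Right, class), where class is 1 for left, 0 for right
left.classifier("recall rally at the capitol")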
# Classify the held-out halves (starting at floor(n / 2) + 1 keeps the
# test sets disjoint from the training sets)
left.class <- suppressWarnings(lapply(left.docs[(floor(left.length / 2) + 1):left.length],
                                      function(p)
                                      {
                                        left.classifier(p)
                                      }))
right.class <- suppressWarnings(lapply(right.docs[(floor(right.length / 2) + 1):right.length],
                                       function(p)
                                       {
                                         left.classifier(p)
                                       }))
# Create a single, final, data frame with all of the classification data in it
left.matrix <- do.call(rbind, left.class)
left.final <- cbind(left.matrix, "left")
right.matrix <- do.call(rbind, right.class)
right.final <- cbind(right.matrix, "right")
class.matrix <- rbind(left.final, right.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.Left", "Pr.Right", "Class", "Type")
class.df$Pr.Left <- as.numeric(class.df$Pr.Left)
class.df$Pr.Right <- as.numeric(class.df$Pr.Right)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)
# Create final plot of results; points below the 45-degree line have
# Pr(Left) > Pr(Right) and are therefore classified as left
class.plot <- ggplot(class.df, aes(x = log(Pr.Left), y = log(Pr.Right))) +
  geom_point(aes(color = Type), alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1) +
  scale_color_manual(values = c("left" = "blue",
                                "right" = "red"),
                     name = "Message Type") +
  xlab("log[Pr(Left)]") +
  ylab("log[Pr(Right)]") +
  theme_bw()
ggsave(plot = class.plot,
       filename = file.path("../img", "final_class_LR.png"),
       height = 10,
       width = 10)
# Return the proportions of TRUE and FALSE values in a logical vector
get.results <- function(bool.vector)
{
  results <- c(length(bool.vector[which(bool.vector == TRUE)]) / length(bool.vector),
               length(bool.vector[which(bool.vector == FALSE)]) / length(bool.vector))
  return(results)
}
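# Sketch: on a toy logical vector, get.results returns the shares of
# TRUE and FALSE values, here c(2/3, 1/3)
get.results(c(TRUE, TRUE, FALSE))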
# Summarize results as a 2x2 table: rows are the true message type,
# columns are the share classified as Left vs. Right
left.col <- get.results(subset(class.df, Type == "left")$Class)
right.col <- get.results(subset(class.df, Type == "right")$Class)
class.res <- rbind(left.col, right.col)
colnames(class.res) <- c("Left", "Right")
print(class.res)
# Save the training data for later reuse
write.csv(left.df, file.path("data", "left_df.csv"), row.names = FALSE)
write.csv(right.df, file.path("data", "right_df.csv"), row.names = FALSE)