## Political classifier, largely adapted from Machine Learning for Hackers.
# File-Name: polClassify.R
# Edited: 2013-03-20
# Orig.Author: Drew Conway (drew.conway@nyu.edu)
#
# Packages Used: tm, ggplot2, stringr
#
# All source code is copyright (c) 2012, under the Simplified BSD License.
# For more information on the Simplified BSD License see: http://www.opensource.org/licenses/bsd-license.php
# All images and materials produced by this code are licensed under the Creative Commons
# Attribution-Share Alike 3.0 United States License: http://creativecommons.org/licenses/by-sa/3.0/us/
# All rights reserved.
# NOTE: If you are running this in the R console you must use the 'setwd' command to set the
# working directory for the console to wherever you have saved this file prior to running.
# Otherwise you will see errors when loading data or saving figures!
## modified by Alex Hanna (alex.hanna@gmail.com) for use with classifying political polarization
# Load libraries
library(tm)
library(ggplot2)
library(stringr)
# Column names for the tab-separated tweet dump: tweet fields first,
# then the corresponding fields of the retweeted status (rt.*)
colNames <- c(
  "status_id", "created_at", "text", "source", "coords",
  "user.id", "user.name", "user.screen_name", "user.level",
  "user.description", "user.location", "user.url",
  "user.followers_count", "user.friends_count",
  "user.listed_count", "user.statuses_count",
  "rt.status_id", "rt.created_at", "rt.text", "rt.source", "rt.coords",
  "rt.user.id", "rt.user.name", "rt.user.screen_name", "rt.user.level",
  "rt.user.description", "rt.user.location", "rt.user.url",
  "rt.user.followers_count", "rt.user.friends_count",
  "rt.user.listed_count", "rt.user.statuses_count"
)
# Return a single-element vector containing just the message body.
# This is a very simple approach, as we are only using words as
# features. (Legacy helper from the original email classifier: it
# assumes the body begins after the first blank line of the file.)
get.msg <- function(path)
{
  con <- file(path, open = "rt", encoding = "latin1")
  text <- readLines(con)
  # The message body always begins after the first blank line
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  close(con)
  return(paste(msg, collapse = "\n"))
}
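# Usage sketch: the path below is hypothetical; any plain-text file
# with a header block, a blank line, and then a body would work.
# msg.body <- get.msg("data/example_message.txt")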
# Create a TermDocumentMatrix (TDM) from a vector of documents.
# The TDM control can be modified, and the sparsity level can be
# altered. This TDM is used to create the feature set with which we
# train our classifier.
get.tdm <- function(doc.vec)
{
  control <- list(stopwords = TRUE,
                  removePunctuation = TRUE,
                  removeNumbers = TRUE,
                  minDocFreq = 2)
  doc.corpus <- Corpus(VectorSource(doc.vec))
  doc.tdm <- TermDocumentMatrix(doc.corpus, control)
  return(doc.tdm)
}
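# Quick sketch of get.tdm on a toy document vector (the strings are
# illustrative only); Terms() lists the terms the TDM retained after
# stopword, punctuation, and number removal.
toy.tdm <- get.tdm(c("recall the governor", "the governor responds"))
Terms(toy.tdm)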
# This function takes a file path to a message file and a string,
# the term parameter, and returns the count of that term in
# the message body.
count.word <- function(path, term)
{
  msg <- get.msg(path)
  msg.corpus <- Corpus(VectorSource(msg))
  # Hard-coded TDM control
  control <- list(stopwords = TRUE,
                  removePunctuation = TRUE,
                  removeNumbers = TRUE)
  msg.tdm <- TermDocumentMatrix(msg.corpus, control)
  word.freq <- rowSums(as.matrix(msg.tdm))
  term.freq <- word.freq[which(names(word.freq) == term)]
  # Use ifelse because term.freq is a zero-length vector when the
  # term does not appear
  return(ifelse(length(term.freq) > 0, term.freq, 0))
}
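# Usage sketch (hypothetical path): count how often "recall" appears
# in one message body.
# count.word("data/example_message.txt", "recall")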
# This is our workhorse function for classifying messages. It takes
# two required parameters: the text of a message to classify, and
# a data frame of the trained data. The function also takes two
# optional parameters. First, a prior over the probability that a message
# belongs to the class (which we set to 0.5, i.e. naive), and a constant
# value for the probability of words in the message that are not in our
# training data. The function returns the naive Bayes probability that
# the given message belongs to the class represented by training.df.
classify.email <- function(msg, training.df, prior = 0.5, c = 1e-6)
{
  # Use the support functions to get the message text into a
  # workable format
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  # Find the intersection of the message's words and the training terms
  msg.match <- intersect(names(msg.freq), training.df$term)
  # Now perform the naive Bayes calculation
  if(length(msg.match) < 1)
  {
    return(prior * c ^ (length(msg.freq)))
  }
  else
  {
    match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
    return(prior * prod(match.probs) * c ^ (length(msg.freq) - length(msg.match)))
  }
}
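# A minimal sketch of classify.email on toy data; the terms and
# occurrence rates below are invented purely for illustration.
toy.train <- data.frame(term = c("recall", "walker"),
                        occurrence = c(0.2, 0.1),
                        stringsAsFactors = FALSE)
# Matched terms contribute their occurrence rates to the product;
# unmatched terms each contribute the small constant c
classify.email("recall walker rally", toy.train)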
# Load the tab-separated tweet data
df <- read.delim('/project/hanna/wirecall/wirecall.highdetail.csv',
                 header = F, sep = "\t",
                 col.names = colNames,
                 stringsAsFactors = F, quote = "")
df$user.screen_name <- tolower(df$user.screen_name)
## left and right users (screen names are lowercase to match the
## lowercased user.screen_name column)
left.users <- c("legaleagle", "spudlovr", "barrett4wi", "wisdems", "repgwenmoore", "markpocan")
right.users <- c("scottkwalker", "govwalker", "turnsc", "gopbrad", "repvos",
                 "tommyforwi", "afpwi", "tppatriots", "vickimckenna", "win8ive",
                 "maciverwisc", "mattbatzel", "scottwalkerhq", "jbvanhollen",
                 "ourcountrypac", "pbartel", "brickm", "wisgop", "rebeccaforreal",
                 "danerepublicans", "the_rga", "prolifewi", "mediatrackers")
left.docs <- df$text[df$user.screen_name %in% left.users]
right.docs <- df$text[df$user.screen_name %in% right.users]
## get lengths
left.length <- length(left.docs)
right.length <- length(right.docs)
## shuffle each set, then take the first half of each for training
left.docs <- sample(left.docs, left.length)
right.docs <- sample(right.docs, right.length)
left.training <- left.docs[1:floor(left.length / 2)]
right.training <- right.docs[1:floor(right.length / 2)]
# With all of our support functions written, we can train the classifier.
# First, create a TermDocumentMatrix from the LEFT training messages
left.tdm <- get.tdm(left.training)
# Create a data frame that provides the feature set from the LEFT training data
left.matrix <- as.matrix(left.tdm)
left.counts <- rowSums(left.matrix)
left.df <- data.frame(cbind(names(left.counts),
                            as.numeric(left.counts)),
                      stringsAsFactors = FALSE)
names(left.df) <- c("term", "frequency")
left.df$frequency <- as.numeric(left.df$frequency)
# Share of training documents in which each term occurs
left.occurrence <- sapply(1:nrow(left.matrix),
                          function(i)
                          {
                            length(which(left.matrix[i, ] > 0)) / ncol(left.matrix)
                          })
# Share of all term occurrences accounted for by each term
left.density <- left.df$frequency / sum(left.df$frequency)
# Add the term density and occurrence rate
left.df <- transform(left.df,
                     density = left.density,
                     occurrence = left.occurrence)
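# Inspection aid (not part of the pipeline): the highest-occurrence
# terms should look recognizably "left" if the training split is sensible
head(left.df[with(left.df, order(-occurrence)), ])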
# Now do the same for the RIGHT training messages
right.tdm <- get.tdm(right.training)
right.matrix <- as.matrix(right.tdm)
right.counts <- rowSums(right.matrix)
right.df <- data.frame(cbind(names(right.counts),
                             as.numeric(right.counts)),
                       stringsAsFactors = FALSE)
names(right.df) <- c("term", "frequency")
right.df$frequency <- as.numeric(right.df$frequency)
right.occurrence <- sapply(1:nrow(right.matrix),
                           function(i)
                           {
                             length(which(right.matrix[i, ] > 0)) / ncol(right.matrix)
                           })
right.density <- right.df$frequency / sum(right.df$frequency)
right.df <- transform(right.df,
                      density = right.density,
                      occurrence = right.occurrence)
## peel off 10,000 random messages to classify
rest.docs <- sample(df$text, 10000)
# Run the classifier against the sampled messages under each model
rest.lefttest <- sapply(rest.docs,
                        function(p) classify.email(p, training.df = left.df))
rest.righttest <- sapply(rest.docs,
                         function(p) classify.email(p, training.df = right.df))
# TRUE means the message scores higher under the left model
rest.res <- rest.lefttest > rest.righttest
summary(rest.res)
# Finally, classify the held-out test data using the classifier developed
# above. The decision rule is: classify a message as left if
# Pr(left | msg) > Pr(right | msg)
left.classifier <- function(msg)
{
  pr.left <- classify.email(msg, left.df, prior = 0.5)
  pr.right <- classify.email(msg, right.df, prior = 0.5)
  return(c(pr.left, pr.right, ifelse(pr.left > pr.right, 1, 0)))
}
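# Sketch: classify one toy message (the string is illustrative only).
# Returns c(Pr.Left, Pr.Right, class), where class is 1 for left, 0 for right
left.classifier("recall rally at the capitol")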
# Classify the held-out halves (starting at floor(n / 2) + 1 keeps the
# test sets disjoint from the training sets)
left.class <- suppressWarnings(lapply(left.docs[(floor(left.length / 2) + 1):left.length],
                                      function(p)
                                      {
                                        left.classifier(p)
                                      }))
right.class <- suppressWarnings(lapply(right.docs[(floor(right.length / 2) + 1):right.length],
                                       function(p)
                                       {
                                         left.classifier(p)
                                       }))
# Create a single, final, data frame with all of the classification data in it
left.matrix <- do.call(rbind, left.class)
left.final <- cbind(left.matrix, "left")
right.matrix <- do.call(rbind, right.class)
right.final <- cbind(right.matrix, "right")
class.matrix <- rbind(left.final, right.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("Pr.Left", "Pr.Right", "Class", "Type")
class.df$Pr.Left <- as.numeric(class.df$Pr.Left)
class.df$Pr.Right <- as.numeric(class.df$Pr.Right)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)
# Create final plot of results; points below the 45-degree line have
# Pr(Left) > Pr(Right) and are therefore classified as left
class.plot <- ggplot(class.df, aes(x = log(Pr.Left), y = log(Pr.Right))) +
  geom_point(aes(color = Type), alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1) +
  scale_color_manual(values = c("left" = "blue",
                                "right" = "red"),
                     name = "Message Type") +
  xlab("log[Pr(Left)]") +
  ylab("log[Pr(Right)]") +
  theme_bw()
ggsave(plot = class.plot,
       filename = file.path("../img", "final_class_LR.png"),
       height = 10,
       width = 10)
# Return the proportions of TRUE and FALSE values in a logical vector
get.results <- function(bool.vector)
{
  results <- c(length(bool.vector[which(bool.vector == TRUE)]) / length(bool.vector),
               length(bool.vector[which(bool.vector == FALSE)]) / length(bool.vector))
  return(results)
}
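# Sketch: on a toy logical vector, get.results returns the shares of
# TRUE and FALSE values, here c(2/3, 1/3)
get.results(c(TRUE, TRUE, FALSE))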
# Summarize results as a 2x2 table: rows are the true message type,
# columns are the share classified as Left vs. Right
left.col <- get.results(subset(class.df, Type == "left")$Class)
right.col <- get.results(subset(class.df, Type == "right")$Class)
class.res <- rbind(left.col, right.col)
colnames(class.res) <- c("Left", "Right")
print(class.res)
# Save the training data for later reuse
write.csv(left.df, file.path("data", "left_df.csv"), row.names = FALSE)
write.csv(right.df, file.path("data", "right_df.csv"), row.names = FALSE)