Skip to content

Instantly share code, notes, and snippets.

@rjweiss
Created November 22, 2013 08:26
Show Gist options
  • Save rjweiss/7596650 to your computer and use it in GitHub Desktop.
Save rjweiss/7596650 to your computer and use it in GitHub Desktop.
etcML blogpost demo code
library(ggplot2)
library(stringr)
library(scales)
library(plyr)
library(car)
# ---- Epic sentiment classifier: load, label and filter predictions ----

# The job file from etcML as is (tab-separated, with a header row).
predictions <- read.csv("job831_predictions.tsv", sep = "\t")
# My metadata from the archive; no header row, so columns are V1, V2, ...
# NOTE(review): this is read with read.csv's default comma separator despite
# the .tsv extension -- confirm the file's actual delimiter.
metadata <- read.csv("etcml_iarchive_0106_full.tsv", header = FALSE,
                     stringsAsFactors = FALSE)

# The two files are joined purely by row position, so they must line up
# one-to-one; fail loudly rather than let a shorter column be recycled.
stopifnot(nrow(predictions) == nrow(metadata))
predictions$channel <- metadata$V3
predictions$show <- metadata$V4
predictions$date <- metadata$V5
predictions$time <- metadata$V6

# For now, just look at CNN, CSPAN, FOXNEWS and MSNBC.
predictions <- predictions[
  predictions$channel %in% c("CNN", "FOXNEWS", "MSNBC", "CSPAN"),
]

# Relabel as positive, neutral and negative from the negative-class
# probability. include.lowest = TRUE keeps a probability of exactly 0 in the
# "positive" bin; without it cut() returns NA for that value.
predictions$prediction3 <- cut(
  predictions$neg_probability,
  breaks = c(0, 0.3, 0.7, 1),
  labels = c("positive", "neutral", "negative"),
  include.lowest = TRUE
)

# Drop neutral statements for now, then rebuild the factor so the unused
# "neutral" level disappears.
predictions <- predictions[!predictions$prediction3 %in% "neutral", ]
predictions$prediction3 <- factor(predictions$prediction3)

# Keep statements that contain 'obama' or 'president'. A single alternation
# regex is required: passing c('obama', 'president') as the pattern recycles
# the two patterns across rows, so each row is tested against only one of them.
mentions <- str_detect(predictions$text, "obama|president")
# Treat a missing text as "no mention" instead of producing an all-NA row.
obama_predictions <- predictions[!is.na(mentions) & mentions, ]
# Shows to keep when filtering the per-show results for the plots.
shows <- c(
  "Anderson_Cooper_360",
  "CNN_Newsroom",
  "FOX_and_Friends",
  "Hannity",
  "Larry_King_Live",
  "Hardball_With_Chris_Matthews",
  "The_Rachel_Maddow_Show",
  "Morning_Joe",
  "The_OReilly_Factor",
  "U.S._House_of_Representatives",
  "Countdown_With_Keith_Olbermann"
)
# ---- Epic sentiment classifier: positive/negative ratio per show ----

# Number of statements per channel / show / sentiment label.
df <- count(obama_predictions, vars = c("channel", "show", "prediction3"))

# Ratio of positive to negative statements for each show. Counts are looked
# up by label and summed, rather than taken positionally as first/second row:
# a show that has only one label, or that appears on more than one channel,
# does not have exactly the rows (positive, negative) in that order.
df <- ddply(df, .(show), function(x) {
  n_positive <- sum(x$freq[x$prediction3 %in% "positive"])
  n_negative <- sum(x$freq[x$prediction3 %in% "negative"])
  # Inf when a show has no negative statements; filtered out below.
  data.frame(ratio = n_positive / n_negative)
})

# Restrict to the shows of interest and to ratios that can be drawn.
blogpost <- df[df$show %in% shows & is.finite(df$ratio), ]

# Dot plot with shows on the y axis, ordered by their ratio.
epic <- ggplot(blogpost, aes(x = ratio, y = reorder(show, ratio))) +
  geom_point(size = 5) +
  scale_x_continuous(limits = c(0, 2)) +
  ylab("") +
  xlab('Ratio of positive to negative statements mentioning "Obama"') +
  ggtitle("etcML Epic Sentiment Classifier") +
  theme_gray(30) # base font size of 30
ggsave("epic_sentiment_obama_iarchive.svg", epic)
# ---- Hard news classifier: load, label and filter predictions ----

predictions2 <- read.csv("job836_predictions.tsv", sep = "\t")
# Map the numeric prediction codes onto sentiment labels.
predictions2$label <- car::recode(
  predictions2$prediction,
  recodes = "3='negative'; 2='neutral'; 1='positive'"
)

# Text and show metadata are attached by row position from `metadata`
# (loaded earlier in this script), so the row counts must match exactly.
stopifnot(nrow(predictions2) == nrow(metadata))
predictions2$text <- metadata$V2
predictions2$channel <- metadata$V3
predictions2$show <- metadata$V4
predictions2$date <- metadata$V5
predictions2$time <- metadata$V6

# Same channel subset as for the first classifier.
predictions2 <- predictions2[
  predictions2$channel %in% c("CNN", "FOXNEWS", "MSNBC", "CSPAN"),
]

# Drop neutral statements, then turn the remaining labels into a factor.
predictions2 <- predictions2[!predictions2$label %in% "neutral", ]
predictions2$label <- factor(predictions2$label)

# Keep statements that contain 'obama' or 'president'. A single alternation
# regex is required: passing c('obama', 'president') as the pattern recycles
# the two patterns across rows, so each row is tested against only one of them.
mentions2 <- str_detect(predictions2$text, "obama|president")
# Treat a missing text as "no mention" instead of producing an all-NA row.
obama_predictions2 <- predictions2[!is.na(mentions2) & mentions2, ]
# ---- Hard news classifier: positive/negative ratio per show ----

# Number of statements per channel / show / sentiment label.
df <- count(obama_predictions2, vars = c("channel", "show", "label"))

# Ratio of positive to negative statements for each show. Counts are looked
# up by label rather than by row position: `label` is a factor built from
# strings, so its levels sort alphabetically ("negative" before "positive")
# and a positional first/second ratio would be negative/positive -- the
# inverse of what the axis label says. Summing by label also handles shows
# that have only one label or that appear on more than one channel.
df <- ddply(df, .(show), function(x) {
  n_positive <- sum(x$freq[x$label %in% "positive"])
  n_negative <- sum(x$freq[x$label %in% "negative"])
  # Inf when a show has no negative statements; filtered out below.
  data.frame(ratio = n_positive / n_negative)
})

# Restrict to the shows of interest and to ratios that can be drawn.
blogpost2 <- df[df$show %in% shows & is.finite(df$ratio), ]

# Dot plot with shows on the y axis, ordered by their ratio.
hardnews <- ggplot(blogpost2, aes(x = ratio, y = reorder(show, ratio))) +
  geom_point(size = 5) +
  scale_x_continuous(limits = c(0, 2)) +
  ylab("") +
  xlab('Ratio of positive to negative statements mentioning "Obama"') +
  ggtitle("etcML Hard News Sentiment Classifier") +
  theme_gray(30) # base font size of 30
ggsave("hardnews_sentiment_obama_iarchive.svg", hardnews)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment