# etcML blogpost demo code
# Gist rjweiss/7596650, created November 22, 2013 08:26.
# Note: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. To review, open
# the file in an editor that reveals hidden Unicode characters.
library(ggplot2)
library(stringr)
library(scales)
library(plyr)
library(car)

# Predictions exported by etcML (job 831), read as is.
predictions <- read.csv("job831_predictions.tsv", sep = "\t")

# Metadata from the archive. The file has no header row, so its columns are
# auto-named V1, V2, ...
# NOTE(review): the file is named .tsv but is read with read.csv()'s default
# comma separator -- confirm the actual delimiter.
metadata <- read.csv("etcml_iarchive_0106_full.tsv", header = FALSE,
                     stringsAsFactors = FALSE)

# The two tables are joined purely by row position, so they must line up
# one-to-one. Fail loudly here instead of letting the column assignments
# below recycle values or error with a less obvious message.
stopifnot(nrow(predictions) == nrow(metadata))
predictions$channel <- metadata$V3
predictions$show <- metadata$V4
predictions$date <- metadata$V5
predictions$time <- metadata$V6

# For now, only look at CNN, C-SPAN, Fox News and MSNBC.
predictions <- predictions[
  predictions$channel %in% c("CNN", "FOXNEWS", "MSNBC", "CSPAN"),
]
# Relabel each row by its negative-class probability:
#   [0, 0.3] -> positive, (0.3, 0.7] -> neutral, (0.7, 1] -> negative.
# include.lowest = TRUE keeps a probability of exactly 0 in the 'positive'
# bin; without it cut() leaves the lowest break outside every interval and
# returns NA for that value.
predictions$prediction3 <- cut(
  predictions$neg_probability,
  breaks = c(0, 0.3, 0.7, 1),
  labels = c("positive", "neutral", "negative"),
  include.lowest = TRUE
)

# Drop neutral rows for now, then rebuild the factor so the unused 'neutral'
# level disappears (level order stays positive, negative).
predictions <- predictions[!predictions$prediction3 %in% "neutral", ]
predictions$prediction3 <- factor(predictions$prediction3)
# Keep statements whose text contains 'obama' or 'president'.
# Both words go into one regular expression on purpose: str_detect() is
# vectorised over `pattern`, so passing c('obama', 'president') would recycle
# the two patterns along the rows and test each row against only one of them.
# NOTE(review): matching is case-sensitive; this assumes the text column is
# lower-cased -- confirm against the data.
obama_predictions <- predictions[
  str_detect(predictions$text, "obama|president"),
]
# Shows to include in the blog post figures, written exactly as they appear
# in the metadata `show` column.
shows <- c(
  "Anderson_Cooper_360",
  "CNN_Newsroom",
  "FOX_and_Friends",
  "Hannity",
  "Larry_King_Live",
  "Hardball_With_Chris_Matthews",
  "The_Rachel_Maddow_Show",
  "Morning_Joe",
  "The_OReilly_Factor",
  "U.S._House_of_Representatives",
  "Countdown_With_Keith_Olbermann"
)
# Per-show ratio of positive to negative statements (epic classifier).
#
# Statements are first counted per channel/show/class, then each show's
# counts are summed per class and divided. The classes are picked by label
# rather than by row position, so the result does not depend on the factor
# level order and stays correct when a show appears on more than one channel.
# The ratio of counts equals the ratio of within-show shares, so no separate
# proportion step is needed.
#
# Result: a data frame with columns `show` and `V1` (the ratio). V1 is NA for
# a show that lacks positive or negative statements.
df <- count(obama_predictions, vars = c("channel", "show", "prediction3"))
df <- ddply(df, .(show), function(x) {
  n_positive <- sum(x$freq[x$prediction3 %in% "positive"])
  n_negative <- sum(x$freq[x$prediction3 %in% "negative"])
  if (n_positive == 0 || n_negative == 0) {
    return(NA_real_)
  }
  n_positive / n_negative
})
# Keep only the shows selected for the blog post.
blogpost <- df[df$show %in% shows, ]

# Dot plot with one point per show: ratio on the x axis, shows on the y axis
# ordered by their ratio. The x axis is limited to [0, 2].
epic <- ggplot(blogpost, aes(x = V1, y = reorder(show, V1))) +
  geom_point(size = 5) +
  scale_x_continuous(limits = c(0, 2)) +
  labs(
    x = 'Ratio of positive to negative statements mentioning "Obama"',
    y = "",
    title = "etcML Epic Sentiment Classifier"
  ) +
  theme_gray(base_size = 30)  # large base font size

# Write the figure to disk as SVG.
ggsave("epic_sentiment_obama_iarchive.svg", plot = epic)
# Hard news classifier: predictions exported by etcML (job 836).
predictions2 <- read.csv("job836_predictions.tsv", sep = "\t")

# Map the numeric class codes to sentiment labels.
predictions2$label <- car::recode(
  predictions2$prediction,
  recodes = "3='negative'; 2='neutral'; 1='positive'"
)

# Text and metadata are attached by row position from `metadata` (loaded
# earlier in the script), so both tables must have the same number of rows.
# Fail loudly here instead of letting the assignments below recycle or error.
stopifnot(nrow(predictions2) == nrow(metadata))
predictions2$text <- metadata$V2
predictions2$channel <- metadata$V3
predictions2$show <- metadata$V4
predictions2$date <- metadata$V5
predictions2$time <- metadata$V6

# Same channel subset as for the first classifier.
predictions2 <- predictions2[
  predictions2$channel %in% c("CNN", "FOXNEWS", "MSNBC", "CSPAN"),
]

# Drop neutral rows and turn the remaining labels into a factor.
predictions2 <- predictions2[!predictions2$label %in% "neutral", ]
predictions2$label <- factor(predictions2$label)
# Keep statements whose text contains 'obama' or 'president'.
# A single regular expression is used because str_detect() is vectorised over
# `pattern`: c('obama', 'president') would be recycled along the rows, so each
# row would be tested against only one of the two words.
# NOTE(review): matching is case-sensitive; this assumes the text column is
# lower-cased -- confirm against the data.
obama_predictions2 <- predictions2[
  str_detect(predictions2$text, "obama|president"),
]
# Per-show ratio of positive to negative statements (hard news classifier).
#
# The classes are selected by label, not by row position. `label` is built
# from character values, so its factor levels sort alphabetically
# (negative before positive); taking "first row / second row" would therefore
# give negative / positive, the inverse of what the plot axis says.
# Summing per class also keeps the result correct when a show appears on more
# than one channel. The ratio of counts equals the ratio of within-show
# shares, so no separate proportion step is needed.
#
# Result: a data frame with columns `show` and `V1` (the ratio). V1 is NA for
# a show that lacks positive or negative statements.
df <- count(obama_predictions2, vars = c("channel", "show", "label"))
df <- ddply(df, .(show), function(x) {
  n_positive <- sum(x$freq[x$label %in% "positive"])
  n_negative <- sum(x$freq[x$label %in% "negative"])
  if (n_positive == 0 || n_negative == 0) {
    return(NA_real_)
  }
  n_positive / n_negative
})
# Keep only the shows selected for the blog post.
blogpost2 <- df[df$show %in% shows, ]

# Dot plot with one point per show: ratio on the x axis, shows on the y axis
# ordered by their ratio. The x axis is limited to [0, 2].
# (A coord_flip() call was tried here and left disabled.)
hardnews <- ggplot(blogpost2, aes(x = V1, y = reorder(show, V1))) +
  geom_point(size = 5) +
  scale_x_continuous(limits = c(0, 2)) +
  labs(
    x = 'Ratio of positive to negative statements mentioning "Obama"',
    y = "",
    title = "etcML Hard News Sentiment Classifier"
  ) +
  theme_gray(base_size = 30)  # large base font size

# Write the figure to disk as SVG.
ggsave("hardnews_sentiment_obama_iarchive.svg", plot = hardnews)
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment.")