# rjweiss/etcML

## etcML
library(ggplot2)
library(stringr)
library(scales)
library(plyr)
library(car)

# Load the etcML job output and the archive metadata, then attach the
# metadata columns to the predictions by row position.
predictions <- read.csv('job831_predictions.tsv', sep = "\t") # the job file from etcml as is
# NOTE(review): the metadata file is named .tsv but is read with read.csv's
# default comma separator -- confirm the delimiter matches the file.
metadata <- read.csv('etcml_iarchive_0106_full.tsv', header = FALSE,
                     stringsAsFactors = FALSE) # my metadata from the archive

# The two tables are matched purely by row order, so they must have the same
# number of rows; fail loudly here rather than risk a misaligned (or
# recycled) metadata column further down.
stopifnot(nrow(predictions) == nrow(metadata))

# this is rather sloppy but i'm lazy
predictions$channel <- metadata$V3
predictions$show <- metadata$V4
predictions$date <- metadata$V5
predictions$time <- metadata$V6

# Restrict to the four channels of interest, bucket the negative-class
# probability into three sentiment labels, drop the neutral bucket, and keep
# only statements mentioning 'obama' or 'president'.

#for now, just look at cnn, cspan, foxnews, and msnbc
predictions <- predictions[predictions$channel %in% c('CNN','FOXNEWS','MSNBC','CSPAN'), ]

#relabel as positive, neutral, and negative
# include.lowest = TRUE keeps a probability of exactly 0 in the 'positive'
# bucket; with cut()'s default right-closed intervals, (0, 0.3] excludes 0
# and such rows would be labelled NA.
predictions$prediction3 <- cut(predictions$neg_probability,
                               breaks = c(0, 0.3, 0.7, 1),
                               labels = c('positive','neutral','negative'),
                               include.lowest = TRUE)

#let's drop neutral comments for now
predictions <- predictions[!predictions$prediction3 %in% "neutral", ]
# Re-create the factor so the now-unused 'neutral' level is dropped.
predictions$prediction3 <- factor(predictions$prediction3)

#evaluate statements that contain 'obama' or 'president'
# Use a single alternation pattern so every statement is tested against both
# terms. Passing c('obama','president') pairs the patterns with the rows
# element-wise, so each row was only checked against one of the two terms
# (and newer stringr releases reject the length mismatch).
# NOTE(review): the match is case-sensitive -- confirm the text is lowercased.
obama_predictions <- predictions[str_detect(predictions$text, 'obama|president'), ]

# Shows to keep for the blog-post plots.
shows <- c(
  "Anderson_Cooper_360",
  "CNN_Newsroom",
  "FOX_and_Friends",
  "Hannity",
  "Larry_King_Live",
  "Hardball_With_Chris_Matthews",
  "The_Rachel_Maddow_Show",
  "Morning_Joe",
  "The_OReilly_Factor",
  "U.S._House_of_Representatives",
  "Countdown_With_Keith_Olbermann"
)

# Per-show ratio of positive to negative statements.
# count() tallies rows per channel/show/label, the first ddply() turns the
# tallies into within-show proportions `p`, and the second ddply() collapses
# each show to a single ratio, returned in a column named V1.
df <- count(obama_predictions, vars = c('channel','show','prediction3'))
df <- ddply(df, .(show), transform, p = freq/sum(freq))
df <- ddply(df, .(show), function(x) {
  # Pick the proportions by label rather than by row position: p[1] / p[2]
  # is only positive/negative when the rows happen to arrive in that order
  # and the show appears under a single channel.
  positive <- sum(x$p[x$prediction3 %in% 'positive'])
  negative <- sum(x$p[x$prediction3 %in% 'negative'])
  # A show missing either label has no usable ratio.
  if (positive == 0 || negative == 0) {
    return(NA_real_)
  }
  positive / negative
})

# Keep only the selected shows, then draw one point per show at its ratio,
# with shows ordered along the y axis by that ratio, and save the figure.
blogpost <- df[is.element(df$show, shows), ]

epic <- ggplot(blogpost, aes(x = V1, y = reorder(show, V1)))
epic <- epic +
  geom_point(size = 5) +
  scale_x_continuous(limits = c(0, 2)) +
  labs(
    x = 'Ratio of positive to negative statements mentioning "Obama"',
    y = "",
    title = "etcML Epic Sentiment Classifier"
  ) +
  theme_gray(base_size = 30) # ratio adjustment

ggsave(filename = "epic_sentiment_obama_iarchive.svg", plot = epic)

#hard news classifier
# Load the hard-news job output, map its class codes to sentiment labels,
# attach the archive metadata by row position, then apply the channel,
# neutral and keyword filters.

predictions2 <- read.csv('job836_predictions.tsv', sep = "\t")
predictions2$label <- car::recode(predictions2$prediction, recodes = "3='negative'; 2='neutral'; 1='positive'")

predictions2$text <- metadata$V2
predictions2$channel <- metadata$V3
predictions2$show <- metadata$V4
predictions2$date <- metadata$V5
predictions2$time <- metadata$V6

predictions2 <- predictions2[predictions2$channel %in% c('CNN','FOXNEWS','MSNBC','CSPAN'), ]
predictions2 <- predictions2[!predictions2$label %in% "neutral", ]
# Put 'positive' before 'negative' explicitly. factor() on plain strings
# sorts the levels alphabetically (negative, positive), which would make a
# position-based p[1] / p[2] the inverse of a positive-to-negative ratio.
# Any other label values present are kept as trailing levels.
label_levels <- union(c('positive', 'negative'), predictions2$label)
predictions2$label <- factor(predictions2$label, levels = label_levels)

# Use a single alternation pattern so every statement is tested against both
# terms. Passing c('obama','president') pairs the patterns with the rows
# element-wise, so each row was only checked against one of the two terms
# (and newer stringr releases reject the length mismatch).
# NOTE(review): the match is case-sensitive -- confirm the text is lowercased.
obama_predictions2 <- predictions2[str_detect(predictions2$text, 'obama|president'), ]

# Per-show ratio of positive to negative statements.
# count() tallies rows per channel/show/label, the first ddply() turns the
# tallies into within-show proportions `p`, and the second ddply() collapses
# each show to a single ratio, returned in a column named V1.
df <- count(obama_predictions2, vars = c('channel','show','label'))
df <- ddply(df, .(show), transform, p = freq/sum(freq))
df <- ddply(df, .(show), function(x) {
  # Pick the proportions by label rather than by row position. With labels
  # that sort as negative, positive, p[1] / p[2] is negative/positive -- the
  # inverse of the positive-to-negative ratio this value is plotted as.
  positive <- sum(x$p[x$label %in% 'positive'])
  negative <- sum(x$p[x$label %in% 'negative'])
  # A show missing either label has no usable ratio.
  if (positive == 0 || negative == 0) {
    return(NA_real_)
  }
  positive / negative
})

# Keep only the selected shows, then draw one point per show at its ratio,
# with shows ordered along the y axis by that ratio, and save the figure.
blogpost2 <- df[is.element(df$show, shows), ]

hardnews <- ggplot(blogpost2, aes(x = V1, y = reorder(show, V1)))
hardnews <- hardnews +
  geom_point(size = 5) +
  scale_x_continuous(limits = c(0, 2)) +
  #coord_flip()
  labs(
    x = 'Ratio of positive to negative statements mentioning "Obama"',
    y = "",
    title = "etcML Hard News Sentiment Classifier"
  ) +
  theme_gray(base_size = 30) # ratio adjustment

ggsave(filename = "hardnews_sentiment_obama_iarchive.svg", plot = hardnews)
	library(ggplot2)
	library(stringr)
	library(scales)
	library(plyr)
	library(car)

	# NOTE(review): this tab-indented section repeats the epic-classifier
	# pipeline that already appears earlier in the file, so it runs a second
	# time -- confirm whether the repetition is a paste artifact.
	# Load the etcML job output and the archive metadata, label and filter the
	# statements, compute a per-show positive/negative ratio, and plot it.
	predictions <- read.csv('job831_predictions.tsv', sep = "\t") # the job file from etcml as is
	# NOTE(review): the metadata file is named .tsv but is read with read.csv's
	# default comma separator -- confirm the delimiter matches the file.
	metadata <- read.csv('etcml_iarchive_0106_full.tsv', header = FALSE,
	                     stringsAsFactors = FALSE) # my metadata from the archive

	# The two tables are matched purely by row order, so they must have the
	# same number of rows; fail loudly rather than risk misaligned metadata.
	stopifnot(nrow(predictions) == nrow(metadata))

	# this is rather sloppy but i'm lazy
	predictions$channel <- metadata$V3
	predictions$show <- metadata$V4
	predictions$date <- metadata$V5
	predictions$time <- metadata$V6

	#for now, just look at cnn, cspan, foxnews, and msnbc
	predictions <- predictions[predictions$channel %in% c('CNN','FOXNEWS','MSNBC','CSPAN'), ]

	#relabel as positive, neutral, and negative
	# include.lowest = TRUE keeps a probability of exactly 0 in the 'positive'
	# bucket; cut()'s default right-closed interval (0, 0.3] excludes 0.
	predictions$prediction3 <- cut(predictions$neg_probability,
	                               breaks = c(0, 0.3, 0.7, 1),
	                               labels = c('positive','neutral','negative'),
	                               include.lowest = TRUE)

	#let's drop neutral comments for now
	predictions <- predictions[!predictions$prediction3 %in% "neutral", ]
	# Re-create the factor so the now-unused 'neutral' level is dropped.
	predictions$prediction3 <- factor(predictions$prediction3)

	#evaluate statements that contain 'obama' or 'president'
	# A single alternation pattern tests every statement against both terms;
	# c('obama','president') pairs the patterns with the rows element-wise, so
	# each row was only checked against one of the two terms.
	# NOTE(review): the match is case-sensitive -- confirm the text is lowercased.
	obama_predictions <- predictions[str_detect(predictions$text, 'obama|president'), ]

	#shows i want to look at
	shows <- c('Anderson_Cooper_360', 'CNN_Newsroom', 'FOX_and_Friends',
	           'Hannity', 'Larry_King_Live', 'Hardball_With_Chris_Matthews',
	           'The_Rachel_Maddow_Show', 'Morning_Joe', 'The_OReilly_Factor',
	           'U.S._House_of_Representatives',
	           'Countdown_With_Keith_Olbermann')

	# count() tallies rows per channel/show/label, the first ddply() turns the
	# tallies into within-show proportions `p`, and the second ddply() collapses
	# each show to a single ratio, returned in a column named V1.
	df <- count(obama_predictions, vars = c('channel','show','prediction3'))
	df <- ddply(df, .(show), transform, p = freq/sum(freq))
	df <- ddply(df, .(show), function(x) {
	  # Pick the proportions by label rather than by row position: p[1] / p[2]
	  # is only positive/negative when the rows happen to arrive in that order
	  # and the show appears under a single channel.
	  positive <- sum(x$p[x$prediction3 %in% 'positive'])
	  negative <- sum(x$p[x$prediction3 %in% 'negative'])
	  # A show missing either label has no usable ratio.
	  if (positive == 0 || negative == 0) {
	    return(NA_real_)
	  }
	  positive / negative
	})

	# Keep only the selected shows for the plot.
	blogpost <- df[df$show %in% shows, ]

	# One point per show at its ratio; shows ordered on the y axis by ratio.
	epic <- ggplot(blogpost, aes(
	  x = V1,
	  y = reorder(show, V1))) +
	  geom_point(size = 5) +
	  scale_x_continuous(limits = c(0, 2)) +
	  ylab('') +
	  xlab('Ratio of positive to negative statements mentioning "Obama"') +
	  ggtitle('etcML Epic Sentiment Classifier') +
	  theme_gray(30) # ratio adjustment
	ggsave('epic_sentiment_obama_iarchive.svg', epic)

	#hard news classifier
	# NOTE(review): this tab-indented section repeats the hard-news pipeline
	# that already appears earlier in the file, so it runs a second time --
	# confirm whether the repetition is a paste artifact.
	# Load the hard-news job output, map its class codes to sentiment labels,
	# attach metadata by row position, filter, compute a per-show
	# positive/negative ratio, and plot it.

	predictions2 <- read.csv('job836_predictions.tsv', sep = "\t")
	predictions2$label <- car::recode(predictions2$prediction, recodes = "3='negative'; 2='neutral'; 1='positive'")

	predictions2$text <- metadata$V2
	predictions2$channel <- metadata$V3
	predictions2$show <- metadata$V4
	predictions2$date <- metadata$V5
	predictions2$time <- metadata$V6

	predictions2 <- predictions2[predictions2$channel %in% c('CNN','FOXNEWS','MSNBC','CSPAN'), ]
	predictions2 <- predictions2[!predictions2$label %in% "neutral", ]
	# Put 'positive' before 'negative' explicitly; factor() on plain strings
	# sorts the levels alphabetically (negative, positive). Any other label
	# values present are kept as trailing levels.
	label_levels <- union(c('positive', 'negative'), predictions2$label)
	predictions2$label <- factor(predictions2$label, levels = label_levels)

	# A single alternation pattern tests every statement against both terms;
	# c('obama','president') pairs the patterns with the rows element-wise, so
	# each row was only checked against one of the two terms.
	# NOTE(review): the match is case-sensitive -- confirm the text is lowercased.
	obama_predictions2 <- predictions2[str_detect(predictions2$text, 'obama|president'), ]

	# count() tallies rows per channel/show/label, the first ddply() turns the
	# tallies into within-show proportions `p`, and the second ddply() collapses
	# each show to a single ratio, returned in a column named V1.
	df <- count(obama_predictions2, vars = c('channel','show','label'))
	df <- ddply(df, .(show), transform, p = freq/sum(freq))
	df <- ddply(df, .(show), function(x) {
	  # Pick the proportions by label rather than by row position. With labels
	  # that sort as negative, positive, p[1] / p[2] is negative/positive --
	  # the inverse of the ratio named on the plot's x axis.
	  positive <- sum(x$p[x$label %in% 'positive'])
	  negative <- sum(x$p[x$label %in% 'negative'])
	  # A show missing either label has no usable ratio.
	  if (positive == 0 || negative == 0) {
	    return(NA_real_)
	  }
	  positive / negative
	})

	# Keep only the selected shows for the plot.
	blogpost2 <- df[df$show %in% shows, ]

	# One point per show at its ratio; shows ordered on the y axis by ratio.
	hardnews <- ggplot(blogpost2, aes(
	  x = V1,
	  y = reorder(show, V1))) +
	  geom_point(size = 5) +
	  scale_x_continuous(limits = c(0, 2)) +
	  #coord_flip()
	  ylab('') +
	  xlab('Ratio of positive to negative statements mentioning "Obama"') +
	  ggtitle('etcML Hard News Sentiment Classifier') +
	  theme_gray(30) # ratio adjustment
	ggsave('hardnews_sentiment_obama_iarchive.svg', hardnews)