Skip to content

Instantly share code, notes, and snippets.

@mcku
Forked from VikParuchuri/sentiment_plot.R
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mcku/b7f96047c28228055136 to your computer and use it in GitHub Desktop.
Save mcku/b7f96047c28228055136 to your computer and use it in GitHub Desktop.
# Terms whose score is pulled out of every score frame.
term <- c("egypt", "jordan", "israel", "saudi")

# Build one row per element of all_score_frames: the matching date_max_list
# entry followed by the rounded score of each term. A term that is missing
# from a frame, or whose score is NA, contributes 0.
#
# vapply() with a fixed-length result is used so the shape is always
# (length(term) + 1) x length(all_score_frames), including when there are
# zero frames or only one frame.
term_vec <- as.data.frame(t(vapply(
  seq_along(all_score_frames),
  function(i) {
    frame_i <- all_score_frames[[i]]
    score_row <- vapply(
      term,
      function(one_term) {
        sel_score <- frame_i[frame_i$term == one_term, "score"]
        sel_score[is.na(sel_score)] <- 0
        if (length(sel_score) == 0) {
          return(0)
        }
        # If a term is listed more than once, use the first match so the
        # result is always a single number.
        round(sel_score[1], 5)
      },
      numeric(1),
      USE.NAMES = FALSE
    )
    as.numeric(c(date_max_list[i], score_row))
  },
  numeric(length(term) + 1)
)))
names(term_vec) <- c("year", term)
# Long form of term_vec: one row per (year, term) pair, with the term name in
# `variable` and its score in `value`.
term_df <- melt(term_vec, id.vars = "year")

# Mean of the score column of each score frame; vapply() guarantees a numeric
# vector even when all_score_frames is empty.
term_means <- vapply(all_score_frames, function(x) mean(x$score), numeric(1))

# Single font size shared by every text element of the plot.
text_size <- 40

# The mean line gets its own data frame so its length does not have to match
# term_df. Assumes date_max_list has one entry per score frame.
mean_df <- data.frame(year = as.numeric(date_max_list), value = term_means)

# One coloured line per term plus a thicker black line for the per-frame mean.
# theme()/element_text()/ggtitle() replace opts()/theme_text(), which are no
# longer available in ggplot2.
ggplot(data = term_df, aes(x = year, y = value)) +
  geom_line(aes(colour = variable), size = 1) +
  geom_line(data = mean_df, colour = "black", size = 1.5) +
  ylab("sentiment") +
  ggtitle("US Sentiment (+/-) Over Time") +
  theme(
    legend.text = element_text(size = text_size),
    # The original requested a size-0 legend title; hide it outright.
    legend.title = element_blank(),
    plot.title = element_text(size = text_size),
    axis.text.y = element_text(size = text_size),
    axis.text.x = element_text(size = text_size),
    axis.title.y = element_text(size = text_size, angle = 90),
    axis.title.x = element_text(size = text_size),
    legend.key.size = grid::unit(2, "cm")
  )
# Sizing constants for the scoring run: ri_cols is the column count of the
# per-window term matrix, max_cables_to_sample caps how many cables are drawn
# from each date window.
ri_cols <- 30000
max_cables_to_sample <- 15000

# Filled with one score frame per date window by the loop that follows.
all_score_frames <- list()

# load_or_install() is defined outside this view; presumably it attaches the
# listed packages, installing any that are missing -- confirm.
load_or_install(
  c(
    "RODBC", "corpora", "ggplot2", "tm", "foreach",
    "RColorBrewer", "wordcloud", "lsa", "MASS", "openNLP"
  )
)

# ODBC connection to the data source named by db_name (set elsewhere), opened
# with empty credentials.
channel <- odbcConnect(db_name, uid = "", pwd = "")
# For every date window (date_min_list[z], date_max_list[z]) this loop:
#   1. pulls the window's cables from the database and samples at most
#      max_cables_to_sample of them,
#   2. splits their lower-cased text into sentences and tokens,
#   3. gives each sentence a vector with five randomly placed ones and adds
#      it to the row of every retained token of that sentence,
#   4. scores each remaining term by cosine similarity to the summed rows of
#      the positive (score > 2) and negative (score < -2) afinn_list words,
#   5. stores the frame, sorted by score, in all_score_frames[[z]].

# Result stored for a window that yields nothing to score, so that
# all_score_frames keeps one data frame per window.
empty_score_frame <- data.frame(
  term = character(0),
  pos_scores = numeric(0),
  neg_scores = numeric(0),
  score = numeric(0),
  stringsAsFactors = FALSE
)

for (z in seq_along(date_min_list)) {
  date_min <- paste0(date_min_list[z], "-01-01")
  date_max <- paste0(date_max_list[z], "-01-01")
  print(date_min)

  cable_frame <- sqlQuery(
    channel,
    paste0(
      "SELECT * from cable WHERE date > '", date_min,
      "' AND date <'", date_max, "'"
    ),
    stringsAsFactors = FALSE,
    errors = TRUE
  )
  # With errors = TRUE a failed query comes back as a character vector of
  # messages rather than a data frame; fail loudly instead of crashing later.
  if (!is.data.frame(cable_frame)) {
    stop(
      "Cable query failed for window starting ", date_min, ": ",
      paste(cable_frame, collapse = "; "),
      call. = FALSE
    )
  }

  combined <- character(0)
  if (nrow(cable_frame) > 0) {
    ppatterns <- c("\\n", "\\r")
    sampled_indices <- sample(
      seq_len(nrow(cable_frame)),
      min(max_cables_to_sample, nrow(cable_frame))
    )
    # Remove line-break characters, lower-case, then split into sentences.
    combined <- tolower(gsub(
      paste0("(", paste(ppatterns, collapse = "|"), ")"),
      "",
      cable_frame$content[sampled_indices]
    ))
    combined <- sentDetect(combined)
    combined <- combined[!is.na(combined)]
    combined <- combined[nchar(combined) > 5]
  }

  if (length(combined) == 0) {
    warning("No usable sentences for window starting ", date_min, call. = FALSE)
    all_score_frames[[z]] <- empty_score_frame
    next
  }

  tokenized_combined <- lapply(combined, scan_tokenizer)

  # One row per known term, ri_cols columns, all zero to start with.
  ri_mat <- matrix(0, length(full_term_list), ri_cols)
  rownames(ri_mat) <- full_term_list
  gc()

  for (i in seq_along(combined)) {
    if (i %% 10000 == 0) {
      print(i)
    }
    tokens <- tokenized_combined[[i]]
    tokens <- tokens[nchar(tokens) > 4 & nchar(tokens) < 20]
    # unique(): a term repeated within one sentence is counted once, which is
    # what the original whole-row assignment also did.
    tokens <- unique(tokens[tokens %in% full_term_list])

    # Seeding with the sentence index makes each sentence's five columns
    # reproducible. Note this resets the global RNG on every iteration.
    set.seed(i)
    s_inds <- sample(seq_len(ri_cols), 5)

    if (length(tokens) > 0) {
      # Add 1 in the sentence's five columns for every token row. The
      # original added a full-length vector to a multi-row slice, which R
      # recycles column-major, so rows received shifted patterns instead of
      # the same sentence vector.
      ri_mat[tokens, s_inds] <- ri_mat[tokens, s_inds] + 1
    }
  }

  # drop = FALSE throughout: a single surviving row must stay a matrix for
  # colSums() and apply() to work.
  ri_mat <- ri_mat[rowSums(ri_mat) > 0, , drop = FALSE]
  gc()

  neg_words <- afinn_list$word[afinn_list$score < -2]
  pos_words <- afinn_list$word[afinn_list$score > 2]
  neg_vec <- colSums(ri_mat[rownames(ri_mat) %in% neg_words, , drop = FALSE])
  pos_vec <- colSums(ri_mat[rownames(ri_mat) %in% pos_words, , drop = FALSE])

  # The afinn_list words themselves are not scored.
  ri_mat <- ri_mat[!rownames(ri_mat) %in% afinn_list$word, , drop = FALSE]

  if (nrow(ri_mat) == 0) {
    warning("No terms left to score for window starting ", date_min, call. = FALSE)
    all_score_frames[[z]] <- empty_score_frame
  } else {
    neg_scores <- apply(ri_mat, 1, function(x) cosine(x, neg_vec))
    pos_scores <- apply(ri_mat, 1, function(x) cosine(x, pos_vec))
    score_frame <- data.frame(
      term = rownames(ri_mat),
      pos_scores,
      neg_scores,
      score = pos_scores - neg_scores
    )
    sorted_score_frame <- score_frame[order(score_frame$score), ]
    all_score_frames[[z]] <- sorted_score_frame
  }

  # Free the large matrix before the next window allocates a new one.
  rm(ri_mat)
  gc()
}

# All queries are done; release the ODBC connection.
odbcClose(channel)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment