Skip to content

Instantly share code, notes, and snippets.

@rheimann
Created November 18, 2013 01:30
Show Gist options
  • Save rheimann/7520974 to your computer and use it in GitHub Desktop.
sentimentRAW
## LOAD LEXICONS & SENTIMENT FUNCTIONS ##
# Pull the sentiment scoring function straight into this R session over HTTPS.
# (https_function is assumed to be defined earlier in the session — TODO
# confirm; otherwise go to https://github.com/SocialMediaMininginR and
# download sentiment_function by hand.)
https_function("https://raw.github.com/rheimann/sentiment_function/master/sentiment.R")
# Lexicon sources:
#   http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
#   http://www3.nd.edu/~mcdonald/Word_Lists.html
# Fetch the general positive lexicon from the SocialMediaMininginR GitHub
# account into the local data directory.
download.file(
  url      = "https://raw.github.com/SocialMediaMininginR/pos_words/master/positive-words.txt",
  destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/pos_words.txt",
  method   = "curl"
)
# Fetch the Loughran-McDonald financial positive lexicon.
download.file(
  url      = "https://raw.github.com/SocialMediaMininginR/pos_words/master/LoughranMcDonald_pos.csv",
  destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/LoughranMcDonald_pos.txt",
  method   = "curl"
)
# Read the general positive word list back in; lines starting with ";" are
# lexicon header comments and are skipped.
pos <- scan(
  file.path("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/", "pos_words.txt"),
  what = "character", comment.char = ";"
)
# Read the financial positive word list the same way.
pos_finance <- scan(
  file.path("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/", "LoughranMcDonald_pos.txt"),
  what = "character", comment.char = ";"
)
# One combined positive word vector for scoring.
pos_all <- c(pos, pos_finance)
# Download the general negative lexicon from the SocialMediaMininginR account.
# FIX: this download was missing, yet neg_words.txt is scanned below — it
# mirrors the positive-words.txt download above. (TODO: confirm the file is
# named negative-words.txt in the neg_words repository.)
download.file("https://raw.github.com/SocialMediaMininginR/neg_words/master/negative-words.txt",
destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/neg_words.txt", method = "curl")
# Download the Loughran-McDonald financial negative lexicon.
download.file("https://raw.github.com/SocialMediaMininginR/neg_words/master/LoughranMcDonald_neg.csv",
destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/LoughranMcDonald_neg.txt", method = "curl")
# import negative lexicons from your local directory defined in earlier step
neg <- scan(file.path("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/", 'neg_words.txt'), what='character', comment.char=';')
# import financial negative lexicon from your local directory defined in earlier step
# FIX: was reading LoughranMcDonald_pos.txt (the POSITIVE financial list);
# the negative list downloaded above is the intended input here.
neg_finance <- scan(file.path("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/", 'LoughranMcDonald_neg.txt'), what='character', comment.char=';')
# combine both files into one negative word vector for scoring
neg_all <- c(neg, neg_finance)
### Define and Design ###
# Download the Beige Book corpus from GitHub and read it into a data frame.
# FIX: the original repeated this identical download + read.csv pair twice
# back to back; once is sufficient.
download.file("https://raw.github.com/SocialMediaMininginR/beigebook/master/bb_full.csv",
destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/bb_full.csv", method = "curl")
# bb_full.csv is pipe-delimited, hence sep = "|".
BB <- read.csv("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/bb_full.csv", header=TRUE,
sep="|")
# Logical matrix flagging every NA cell; printing BB[bad] shows the missing
# values (sum(bad) would count them).
bad <- is.na(BB)
BB[bad]
# We can check for missing data (year~month) using H. Wickham's reshape
# package. We can see that there seems to be some systematic missing data
# May and December are missing data in all three years of data collection.
# NOTE(review): cast() comes from the reshape package; library(reshape) is not
# visible in this chunk — confirm it is loaded earlier in the session.
cast(BB, year ~ month, length)
# Recorded output of the cast() call above (counts per year/month cell):
# year 1 2 3 4 6 7 8 9 10 11
# 1 2011 1 0 1 1 1 1 0 1 1 1
# 2 2012 1 1 0 1 1 1 1 0 1 1
# 3 2013 1 0 1 1 1 1 0 0 0 0
# Cleaning of the data
# List the column names so we know which fields BB contains.
colnames(BB)
# explain regular expressions and the purpose of cleaning the data!
# Peek at the first rows (opens the data viewer) before cleaning.
View(head(BB))
# Example row before cleaning:
# "The manufacturing sector continued to recover across all Districts." (2011,1)
# Three sequential passes over the text column, each replacing matches with a
# single space: punctuation, control characters, then runs of digits.
BB$text <- gsub('[[:punct:]]', ' ', BB$text)
BB$text <- gsub('[[:cntrl:]]', ' ', BB$text)
BB$text <- gsub('\\d+', ' ', BB$text)
# Same example row after cleaning (trailing period now gone):
# "The manufacturing sector continued to recover across all Districts" (2011,1)
View(head(BB))
# Keep the first three metadata columns (presumably year, month, and an id /
# district field — TODO confirm against colnames(BB) output above).
BB.keeps <- BB[1:3]
# Score each document against the combined lexicons.
# FIX: the original wrote score.sentiment(cbind(BB.keeps(BB$text, pos.words,
# neg.words, ...))) — calling the BB.keeps data frame as a function and
# passing pos.words/neg.words, which are never defined. score.sentiment()
# takes the text vector plus the positive and negative word lists built
# earlier (pos_all, neg_all).
BB.score <- score.sentiment(BB$text, pos_all, neg_all, .progress = 'text')
# score.sentiment() returns a data frame including a 'score' column; attach
# the kept metadata columns to it.
BB.sentiment <- cbind(BB.keeps, BB.score)
# Build one proper Date per observation from its year and month.
# FIX: as.Date(month, "%m/") and as.Date(year, month) are not valid date
# constructions; assemble "YYYY-MM-01" strings instead.
BB.sentiment$date <- as.Date(paste(BB.sentiment$year, BB.sentiment$month, 1, sep = "-"),
                             format = "%Y-%m-%d")
# Center the raw scores on the corpus mean so sign indicates relative tone.
BB.mean <- mean(BB.sentiment$score)
BB.sum <- BB.sentiment$score
BB.sentiment$centered <- BB.sum - BB.mean
# Binary indicator: 1 = above-average sentiment, 0 = below-average.
# NOTE(review): rows with centered exactly 0 are left NA by construction.
BB.sentiment$pos[BB.sentiment$centered > 0] <- 1
BB.sentiment$pos[BB.sentiment$centered < 0] <- 0
# Histogram of centered sentiment scores.
# FIX: was BB.text$centered — no BB.text object exists; the centered scores
# live in BB.sentiment.
BB.hist <- hist(BB.sentiment$centered, main = "Sentiment Histogram",
                xlab = "Score", ylab = "Frequency")
# Boxplot of centered sentiment by year, with recession windows shaded.
# Bare column names are used inside aes(); BB.sentiment$col inside aes()
# defeats ggplot2's data masking and breaks faceting.
BB.boxplot <- ggplot(BB.sentiment, aes(x = year, y = centered, group = year))
BB.boxplot <- BB.boxplot + geom_boxplot(outlier.colour = "black", outlier.shape = 16, outlier.size = 2)
# NOTE(review): rect2001, rect2007 and recessions.df (recession-window data
# frames with xmin/xmax resp. Peak/Trough columns) are not defined in this
# chunk — confirm they are created earlier in the session.
BB.boxplot <- BB.boxplot + geom_rect(data = rect2001, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.boxplot <- BB.boxplot + geom_rect(data = rect2007, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.boxplot <- BB.boxplot + xlab("Year") + ylab("Sentiment (Centered)") + ggtitle("Economic Sentiment - Beige Book (1996-2010)")
BB.boxplot
# Time plot of centered sentiment coloured by raw score, with a smoother.
# FIX: folded the stray trailing `group=BB.sentiment$year` line into aes(),
# and added inherit.aes = FALSE to the recession layer so it does not inherit
# the x/y/colour mappings (matching the boxplot rect layers above).
BB.plot <- ggplot(BB.sentiment, aes(x = year, y = centered, color = score, group = year))
BB.plot <- BB.plot + geom_rect(data = recessions.df, aes(xmin = Peak, xmax = Trough, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.plot <- BB.plot + xlab("Year") + ylab("Sentiment (Centered)") + ggtitle("Economic Sentiment - Beige Book (1996-2013)")
BB.plot + geom_smooth()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment