Created
November 18, 2013 01:30
-
-
Save rheimann/7520974 to your computer and use it in GitHub Desktop.
sentimentRAW
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## LOAD LEXICONS & SENTIMENT FUNCTIONS ##

# Load the sentiment function. https_function() sources the R file directly
# into this session. Alternately you can go to
# https://github.com/SocialMediaMininginR and download sentiment_function.
https_function("https://raw.github.com/rheimann/sentiment_function/master/sentiment.R")

# Lexicon sources:
#   http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
#   http://www3.nd.edu/~mcdonald/Word_Lists.html

# Local data directory used for every download/import below
# (was repeated inline on each call).
data_dir <- "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/"

# Download positive lexicons from the SocialMediaMininginR GitHub account.
download.file("https://raw.github.com/SocialMediaMininginR/pos_words/master/positive-words.txt",
              destfile = file.path(data_dir, "pos_words.txt"), method = "curl")
download.file("https://raw.github.com/SocialMediaMininginR/pos_words/master/LoughranMcDonald_pos.csv",
              destfile = file.path(data_dir, "LoughranMcDonald_pos.txt"), method = "curl")

# Import the general positive lexicon (Hu & Liu opinion lexicon).
pos <- scan(file.path(data_dir, "pos_words.txt"), what = "character", comment.char = ";")
# Import the financial positive lexicon (Loughran-McDonald).
pos_finance <- scan(file.path(data_dir, "LoughranMcDonald_pos.txt"), what = "character", comment.char = ";")
# Combine both positive word lists into one vector.
pos_all <- c(pos, pos_finance)

# Download negative lexicons.
# BUG FIX: the original scanned 'neg_words.txt' without ever downloading it;
# fetch it here, mirroring the positive-lexicon step above.
download.file("https://raw.github.com/SocialMediaMininginR/neg_words/master/negative-words.txt",
              destfile = file.path(data_dir, "neg_words.txt"), method = "curl")
download.file("https://raw.github.com/SocialMediaMininginR/neg_words/master/LoughranMcDonald_neg.csv",
              destfile = file.path(data_dir, "LoughranMcDonald_neg.txt"), method = "curl")

# Import the general negative lexicon.
neg <- scan(file.path(data_dir, "neg_words.txt"), what = "character", comment.char = ";")
# Import the financial negative lexicon.
# BUG FIX: the original read LoughranMcDonald_pos.txt here, so neg_finance
# duplicated the POSITIVE financial lexicon.
neg_finance <- scan(file.path(data_dir, "LoughranMcDonald_neg.txt"), what = "character", comment.char = ";")
# Combine both negative word lists into one vector.
neg_all <- c(neg, neg_finance)
### Define and Design ###

# Import the Beige Book corpus from GitHub and create a data frame.
# FIX: the original downloaded and read this exact file twice in a row;
# one download + one read.csv is sufficient.
download.file("https://raw.github.com/SocialMediaMininginR/beigebook/master/bb_full.csv",
              destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/bb_full.csv",
              method = "curl")
# The CSV is pipe-delimited, hence sep = "|".
BB <- read.csv("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/bb_full.csv",
               header = TRUE, sep = "|")

# Inspect any missing values (prints the NA cells, if any).
bad <- is.na(BB)
BB[bad]

# Check for missing data (year ~ month) using H. Wickham's reshape package.
# There appears to be systematic missingness: May and December are missing
# in all three years of data collection.
cast(BB, year ~ month, length)
#   year 1 2 3 4 6 7 8 9 10 11
# 1 2011 1 0 1 1 1 1 0 1 1  1
# 2 2012 1 1 0 1 1 1 1 0 1  1
# 3 2013 1 0 1 1 1 1 0 0 0  0
# Cleaning of the data ---------------------------------------------------
# Strip punctuation, control characters, and runs of digits from the raw
# text so lexicon matching is not confused by tokens like "Districts."
# versus "Districts". Each pattern is replaced by a space, preserving word
# boundaries.
colnames(BB)
View(head(BB))
# Before: "The manufacturing sector continued to recover across all Districts." (2011,1)
for (noise in c("[[:punct:]]", "[[:cntrl:]]", "\\d+")) {
  BB$text <- gsub(noise, " ", BB$text)
}
# After:  "The manufacturing sector continued to recover across all Districts" (2011,1)
View(head(BB))
# Score sentiment ---------------------------------------------------------
# Keep the first three metadata columns (presumably year/month identifiers
# -- confirm against bb_full.csv) to rejoin with the scores.
BB.keeps <- BB[1:3]
# BUG FIX: the original called BB.keeps(...) as if the data frame were a
# function, and referenced undefined pos.words/neg.words. score.sentiment()
# (sourced above) takes the text vector plus the positive and negative
# lexicons built earlier (pos_all / neg_all).
BB.score <- score.sentiment(BB$text, pos_all, neg_all, .progress = 'text')
BB.sentiment <- cbind(BB.keeps, BB.score)

# Build a proper Date from the numeric year and month columns, pinning the
# day to the 1st. NOTE(review): the original as.Date(month, "%m/") and
# as.Date(year, month) calls could not yield valid dates.
BB.sentiment$date <- as.Date(paste(BB.sentiment$year, BB.sentiment$month, 1, sep = "-"))

# Center the scores on the corpus mean so that 0 = average sentiment.
BB.mean <- mean(BB.sentiment$score)
BB.sum <- BB.sentiment$score
BB.sentiment$centered <- BB.sum - BB.mean
# Binary indicator: 1 = above-average sentiment, 0 = below-average.
# (A score exactly at the mean stays NA, as in the original.)
BB.sentiment$pos[BB.sentiment$centered > 0] <- 1
BB.sentiment$pos[BB.sentiment$centered < 0] <- 0
# Histogram of centered sentiment scores.
# BUG FIX: the original referenced BB.text$centered, but the centered
# scores live in BB.sentiment (BB.text is never defined).
BB.hist <- hist(BB.sentiment$centered, main = "Sentiment Histogram",
                xlab = "Score", ylab = "Frequency")
# Boxplot of centered sentiment by year -----------------------------------
# FIX: use bare column names inside aes() instead of BB.sentiment$col --
# the df$col form evaluates the full vector regardless of layer data and
# breaks layers that supply their own data (the geom_rect layers below).
BB.boxplot <- ggplot(BB.sentiment, aes(x = year, y = centered, group = year))
BB.boxplot <- BB.boxplot + geom_boxplot(outlier.colour = "black", outlier.shape = 16, outlier.size = 2)
# NOTE(review): rect2001 / rect2007 (recession shading spans with xmin/xmax
# columns) are defined elsewhere in the project -- confirm they exist.
BB.boxplot <- BB.boxplot + geom_rect(data = rect2001, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.boxplot <- BB.boxplot + geom_rect(data = rect2007, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.boxplot <- BB.boxplot + xlab("Year") + ylab("Sentiment (Centered)") + ggtitle("Economic Sentiment - Beige Book (1996-2010)")
BB.boxplot

# Centered sentiment over time with a smoother -----------------------------
# NOTE(review): recessions.df (Peak/Trough columns) is defined elsewhere.
# BUG FIX: this geom_rect must not inherit the plot aesthetics (x/y/color
# refer to BB.sentiment columns that recessions.df lacks), so set
# inherit.aes = FALSE.
BB.plot <- ggplot(BB.sentiment, aes(x = year, y = centered, color = score))
BB.plot <- BB.plot + geom_rect(data = recessions.df, aes(xmin = Peak, xmax = Trough, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.plot <- BB.plot + xlab("Year") + ylab("Sentiment (Centered)") + ggtitle("Economic Sentiment - Beige Book (1996-2013)")
BB.plot + geom_smooth()
# (Removed the stray trailing fragment "group=BB.sentiment$year", which was
# not a complete R statement.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment