Created
November 18, 2013 01:30
-
-
Save rheimann/7520974 to your computer and use it in GitHub Desktop.
sentimentRAW
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## LOAD LEXICONS & SENTIMENT FUNCTIONS ##

# Load the sentiment function. https_function() sources the R file directly
# into this session. Alternately you can go to
# https://github.com/SocialMediaMininginR and download sentiment_function.
https_function("https://raw.github.com/rheimann/sentiment_function/master/sentiment.R")

# Lexicon sources:
#   http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
#   http://www3.nd.edu/~mcdonald/Word_Lists.html

# Local data directory used for every download/import below
# (was repeated inline on each call).
data_dir <- "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/data/"

# Download positive lexicons from the SocialMediaMininginR GitHub account.
download.file("https://raw.github.com/SocialMediaMininginR/pos_words/master/positive-words.txt",
              destfile = file.path(data_dir, "pos_words.txt"), method = "curl")
download.file("https://raw.github.com/SocialMediaMininginR/pos_words/master/LoughranMcDonald_pos.csv",
              destfile = file.path(data_dir, "LoughranMcDonald_pos.txt"), method = "curl")

# Import the general positive lexicon (Hu & Liu opinion lexicon).
pos <- scan(file.path(data_dir, "pos_words.txt"), what = "character", comment.char = ";")
# Import the financial positive lexicon (Loughran-McDonald).
pos_finance <- scan(file.path(data_dir, "LoughranMcDonald_pos.txt"), what = "character", comment.char = ";")
# Combine both positive word lists into one vector.
pos_all <- c(pos, pos_finance)

# Download negative lexicons.
# BUG FIX: the original scanned 'neg_words.txt' without ever downloading it;
# fetch it here, mirroring the positive-lexicon step above.
download.file("https://raw.github.com/SocialMediaMininginR/neg_words/master/negative-words.txt",
              destfile = file.path(data_dir, "neg_words.txt"), method = "curl")
download.file("https://raw.github.com/SocialMediaMininginR/neg_words/master/LoughranMcDonald_neg.csv",
              destfile = file.path(data_dir, "LoughranMcDonald_neg.txt"), method = "curl")

# Import the general negative lexicon.
neg <- scan(file.path(data_dir, "neg_words.txt"), what = "character", comment.char = ";")
# Import the financial negative lexicon.
# BUG FIX: the original read LoughranMcDonald_pos.txt here, so neg_finance
# duplicated the POSITIVE financial lexicon.
neg_finance <- scan(file.path(data_dir, "LoughranMcDonald_neg.txt"), what = "character", comment.char = ";")
# Combine both negative word lists into one vector.
neg_all <- c(neg, neg_finance)
### Define and Design ###

# Import the Beige Book corpus from GitHub and create a data frame.
# FIX: the original downloaded and read this exact file twice in a row;
# one download + one read.csv is sufficient.
download.file("https://raw.github.com/SocialMediaMininginR/beigebook/master/bb_full.csv",
              destfile = "/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/bb_full.csv",
              method = "curl")
# The CSV is pipe-delimited, hence sep = "|".
BB <- read.csv("/Users/heimannrichard/Google Drive/SocialMediaMiningR/twitter_sentiment/bb_full.csv",
               header = TRUE, sep = "|")

# Inspect any missing values (prints the NA cells, if any).
bad <- is.na(BB)
BB[bad]

# Check for missing data (year ~ month) using H. Wickham's reshape package.
# There appears to be systematic missingness: May and December are missing
# in all three years of data collection.
cast(BB, year ~ month, length)
#   year 1 2 3 4 6 7 8 9 10 11
# 1 2011 1 0 1 1 1 1 0 1 1  1
# 2 2012 1 1 0 1 1 1 1 0 1  1
# 3 2013 1 0 1 1 1 1 0 0 0  0
# Cleaning of the data ---------------------------------------------------
# Strip punctuation, control characters, and runs of digits from the raw
# text so lexicon matching is not confused by tokens like "Districts."
# versus "Districts". Each pattern is replaced by a space, preserving word
# boundaries.
colnames(BB)
View(head(BB))
# Before: "The manufacturing sector continued to recover across all Districts." (2011,1)
for (noise in c("[[:punct:]]", "[[:cntrl:]]", "\\d+")) {
  BB$text <- gsub(noise, " ", BB$text)
}
# After:  "The manufacturing sector continued to recover across all Districts" (2011,1)
View(head(BB))
# Score sentiment ---------------------------------------------------------
# Keep the first three metadata columns (presumably year/month identifiers
# -- confirm against bb_full.csv) to rejoin with the scores.
BB.keeps <- BB[1:3]
# BUG FIX: the original called BB.keeps(...) as if the data frame were a
# function, and referenced undefined pos.words/neg.words. score.sentiment()
# (sourced above) takes the text vector plus the positive and negative
# lexicons built earlier (pos_all / neg_all).
BB.score <- score.sentiment(BB$text, pos_all, neg_all, .progress = 'text')
BB.sentiment <- cbind(BB.keeps, BB.score)

# Build a proper Date from the numeric year and month columns, pinning the
# day to the 1st. NOTE(review): the original as.Date(month, "%m/") and
# as.Date(year, month) calls could not yield valid dates.
BB.sentiment$date <- as.Date(paste(BB.sentiment$year, BB.sentiment$month, 1, sep = "-"))

# Center the scores on the corpus mean so that 0 = average sentiment.
BB.mean <- mean(BB.sentiment$score)
BB.sum <- BB.sentiment$score
BB.sentiment$centered <- BB.sum - BB.mean
# Binary indicator: 1 = above-average sentiment, 0 = below-average.
# (A score exactly at the mean stays NA, as in the original.)
BB.sentiment$pos[BB.sentiment$centered > 0] <- 1
BB.sentiment$pos[BB.sentiment$centered < 0] <- 0
# Histogram of centered sentiment scores.
# BUG FIX: the original referenced BB.text$centered, but the centered
# scores live in BB.sentiment (BB.text is never defined).
BB.hist <- hist(BB.sentiment$centered, main = "Sentiment Histogram",
                xlab = "Score", ylab = "Frequency")
# Boxplot of centered sentiment by year -----------------------------------
# FIX: use bare column names inside aes() instead of BB.sentiment$col --
# the df$col form evaluates the full vector regardless of layer data and
# breaks layers that supply their own data (the geom_rect layers below).
BB.boxplot <- ggplot(BB.sentiment, aes(x = year, y = centered, group = year))
BB.boxplot <- BB.boxplot + geom_boxplot(outlier.colour = "black", outlier.shape = 16, outlier.size = 2)
# NOTE(review): rect2001 / rect2007 (recession shading spans with xmin/xmax
# columns) are defined elsewhere in the project -- confirm they exist.
BB.boxplot <- BB.boxplot + geom_rect(data = rect2001, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.boxplot <- BB.boxplot + geom_rect(data = rect2007, aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.boxplot <- BB.boxplot + xlab("Year") + ylab("Sentiment (Centered)") + ggtitle("Economic Sentiment - Beige Book (1996-2010)")
BB.boxplot

# Centered sentiment over time with a smoother -----------------------------
# NOTE(review): recessions.df (Peak/Trough columns) is defined elsewhere.
# BUG FIX: this geom_rect must not inherit the plot aesthetics (x/y/color
# refer to BB.sentiment columns that recessions.df lacks), so set
# inherit.aes = FALSE.
BB.plot <- ggplot(BB.sentiment, aes(x = year, y = centered, color = score))
BB.plot <- BB.plot + geom_rect(data = recessions.df, aes(xmin = Peak, xmax = Trough, ymin = -Inf, ymax = +Inf), fill = 'pink', alpha = 0.2, inherit.aes = FALSE)
BB.plot <- BB.plot + xlab("Year") + ylab("Sentiment (Centered)") + ggtitle("Economic Sentiment - Beige Book (1996-2013)")
BB.plot + geom_smooth()
# (Removed the stray trailing fragment "group=BB.sentiment$year", which was
# not a complete R statement.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment