@smc-dta
Forked from Inpirical-Coder/sentiment_score_simple.R
Last active August 29, 2015 14:09
# Code to fetch news streams from 5 live sources, process the text, and
# apply a simple sentiment scoring algorithm.
#
# A writeup of the analysis can be found here:
# https://www.linkedin.com/pulse/article/20141109035942-34768479-r-sentiment-scoring-hsbc-w-harvard-general-inquirer
# Define the packages we want to load:
packs = c(
  "tm",                  # Text mining
  "tm.plugin.webmining", # Web-source plugin for text mining
  "SnowballC",           # Stemmer
  "RColorBrewer",        # Colors for visualisation
  "ggplot2",             # Plotting
  "wordcloud",           # Draw wordclouds
  "openNLP"              # Split text into sentences
)
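# Optional helper (illustrative sketch, not part of the original script):
# install any of the packages above that are missing before loading them.
# Assumes a CRAN mirror is configured.
# new.packs = packs[!(packs %in% installed.packages()[, "Package"])]
# if(length(new.packs) > 0) install.packages(new.packs)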
sapply(packs, require, character.only=TRUE) # Load the packages.
# Download the corpora and insert them in a named list.
corpora = list(
  googlefinance = WebCorpus(GoogleFinanceSource("NYSE:HSBC")),
  googlenews    = WebCorpus(GoogleNewsSource("HSBC")),
  yahoofinance  = WebCorpus(YahooFinanceSource("HSBC")),
  yahooinplay   = WebCorpus(YahooInplaySource()),
  yahoonews     = WebCorpus(YahooNewsSource("HSBC"))
)
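# Note (illustrative addition): the save() call below assumes a "data/"
# sub-directory exists in the working directory; create it first if needed.
# if(!dir.exists("data")) dir.create("data")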
# Save the corpora list.
save(corpora, file="data/corpora.Rdat")
#load('data/corpora.Rdat')
# Break the corpora down into sentences and define some functions to do so.
ToSentences = function(text, language = "en") {
  # Splits text into sentences using an Apache OpenNLP sentence detector.
  # Arguments:
  #   "text"     the text to be processed (character)
  #   "language" ISO-639 code of the language of the text (character)
  # Returns:
  #   the sentences of the text (character vector)
  if(length(text) == 0) { return("") }
  if(nchar(text) == 0)  { return("") } # Cover special case of 0-character text.
  # Convert text to a String object; this allows splitting by index.
  text = as.String(text)
  # Discover the sentence boundaries in the text (specify NLP as the source of
  # annotate() because there is also an annotate function in ggplot2).
  markers = NLP::annotate(
    text,
    Maxent_Sent_Token_Annotator(language = language) # Annotator from openNLP
  )
  # Return the sentences by splitting the text at the boundaries.
  text[markers]
}
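# Quick illustrative check of ToSentences() on made-up text (not from the
# news feeds); it should return a character vector with one element per
# sentence, e.g.:
# ToSentences("HSBC shares rose on Monday. Analysts remained cautious.")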
CorpusToSentences = function(corpus) {
  # Split every document in the corpus into sentences and return a new corpus
  # with all the sentences as individual documents.
  # Extract the text from each document in the corpus.
  text = lapply(corpus, "[[", "content")
  # Split the text of each document into individual sentences.
  docs = lapply(text, ToSentences)
  docs = as.vector(unlist(docs))
  # Return a corpus with the sentences as documents.
  Corpus(VectorSource(docs))
}
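# For example (illustrative), one of the downloaded corpora can be re-shaped
# into a sentence-level corpus like this:
# sentence.corpus = CorpusToSentences(corpora$googlenews)
# length(sentence.corpus) # one document per sentence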
# Create a new corpus which merges existing corpora after splitting them
# into sentences.
corpus = Reduce(c, lapply(corpora, CorpusToSentences))
# Process the corpus contents.
corpus = tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, removeNumbers)
corpus = tm_map(corpus, stripWhitespace)
#toString = content_transformer(function(x, from, to) gsub(from, to, x))
#corpus = tm_map(corpus, toString, "hsbc", "hsbc")
# Stemming
# corpus = tm_map(corpus, stemDocument)
# Create a document term matrix from the corpus.
dtm = DocumentTermMatrix(corpus)
# Subset the DTM to include only documents including the term "hsbc".
dtm = dtm[rowSums(as.matrix(dtm[ , "hsbc"])) > 0, ]
# Remove terms which are not contained in any of the documents.
dtm = dtm[ , colSums(as.matrix(dtm)) > 0]
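# Sanity check (illustrative): the dimensions now give the number of
# sentences mentioning "hsbc" and the number of remaining terms.
# dim(dtm)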
# ACQUIRING AND PROCESSING THE LEXICON.
# Load the sentiment lexicon (saved in the working directory as a
# comma-separated value file).
lex = read.csv("inquirerbasic.csv", stringsAsFactors=FALSE)
# Collapse words with multiple entries into one entry. These are marked
# with a trailing #1, #2, ...
# Remove #1 tags.
lex$Entry = gsub("#1", "", lex$Entry)
# Remove entries that are still numbered (i.e. #2 or higher).
lex = lex[!grepl("#", lex$Entry), ]
# Extract the positive and negative words from the lexicon.
neg.lex = tolower(lex$Entry[lex$Negativ != ""])
pos.lex = tolower(lex$Entry[lex$Positiv != ""])
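# Illustrative check of how many positive and negative lexicon words
# were loaded:
# length(pos.lex); length(neg.lex)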
terms = colnames(dtm)
# Find the positive and negative terms using the lexicons.
neg.terms = terms[terms %in% neg.lex]
pos.terms = terms[terms %in% pos.lex]
# Specify an adjusted set of positive terms, excluding some which may be
# questionable in this context (note: not used in the scoring below).
pos.terms.adj = setdiff(pos.terms, c("equity", "share", "consensus"))
# Calculate the negative and positive sentence scores ("document scores").
neg.scores = rowSums(as.matrix(dtm[ , neg.terms]))
pos.scores = rowSums(as.matrix(dtm[ , pos.terms]))
document.scores = pos.scores - neg.scores
# Calculate the document signs ("sentence signs").
document.signs = sign(document.scores)
# Calculate the sentiment score: the share of positive sentences among all
# non-neutral sentences.
sentiment.score = sum(document.signs == 1) / sum(document.signs != 0)
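# Worked example (illustrative numbers): if document.signs were
# c(1, 1, -1, 0, 1), then sum(document.signs == 1) is 3 and
# sum(document.signs != 0) is 4, giving a score of 3/4 = 0.75, i.e. the
# share of positive sentences among all non-neutral sentences.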
## Visualisation:
# Generate word clouds (positive and negative).
PosCloud = function() {
  wordcloud(
    pos.terms,
    colSums(as.matrix(dtm[ , pos.terms])),
    min.freq = 1,
    scale    = c(4, 0.7),
    colors   = brewer.pal(n = 9, "Blues")[6:9]
  )
}
NegCloud = function() {
  wordcloud(
    neg.terms,
    colSums(as.matrix(dtm[ , neg.terms])),
    min.freq = 1,
    scale    = c(4, 0.7),
    colors   = brewer.pal(n = 9, "Reds")[6:9]
  )
}
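# Illustrative usage (not in the original script): draw the clouds on the
# active graphics device, or write them to files, e.g.:
# PosCloud()
# NegCloud()
# png("pos_cloud.png"); PosCloud(); dev.off()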