Created
June 26, 2012 13:15
-
-
Save tts/2995732 to your computer and use it in GitHub Desktop.
Text mining tweets from a list of Top100 Finns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text mining suomitop100 list tweets | |
# https://twitter.com/#!/niku_hooli/ylen-suomitop100-lista/ | |
# | |
# Tuija Sonkkila | |
# 2012-06-26 | |
# | |
# Mining code is based on | |
# http://heuristically.wordpress.com/2011/04/08/text-data-mining-twitter-r/ | |
# | |
# I am a total newbie in text mining, but some remarks are fairly obvious. | |
# | |
# The text corpus is very sparse because of the small sample (521 tweets). | |
# In addition, Finns tweet in different languages, mostly Finnish and English, | |
# which adds heterogeneity. | |
# | |
# Based on this exercise, there is no answer to "What does Finland tweet about?". | |
# If anything, the result may say something about tweeting habits, e.g. | |
# quotes ('quote') and quoting ('via') seem to be rather popular. | |
# | |
# However, like the blog remarks, removal of punctuation deletes also the @ sign, | |
# transforming screen names to plain words. elinalappalaine, nikuhooli, raesmaa, | |
# saarikko, tuija, tuomasenbuske and winninghelix are all Twitter screen names. | |
# | |
# About the graph: you'll break your neck while looking at the plot (I do).
# Whether the dendrogram could be rotated, is still unsure: | |
# http://r.789695.n4.nabble.com/rotate-dendrogram-td2288537.html | |
library(RCurl) | |
library(RJSONIO) | |
library(twitteR) | |
library(tm) | |
user <- "niku_hooli" | |
list.id <- "ylen-suomitop100-lista" | |
# http://lists.hexdump.org/pipermail/twitter-users-hexdump.org/2011-December/000015.html | |
# http://twitterapi.pbworks.com/w/page/22554716/Twitter%20REST%20API%20Method%3A%20GET%20list%20statuses | |
# Fetch one page of statuses from a Twitter list.
#
# Args:
#   user:    screen name owning the list, e.g. "niku_hooli"
#   list.id: list slug, e.g. "ylen-suomitop100-lista"
#   page:    1-based page number (up to 200 tweets per page)
#
# Returns: a character vector of tweet texts, one element per status
#          (length 0 when the page is empty).
#
# NOTE(review): this targets the long-retired, unauthenticated Twitter
# REST API v1; the endpoint no longer exists — verify against the
# current API before reuse.
get_list_statuses <- function(user, list.id, page) {
  # paste0 instead of paste(sep = ""); also drop the stray "&" that
  # followed "?" in the original query string
  u <- paste0("https://api.twitter.com/1/", user, "/lists/", list.id,
              "/statuses.json?", "per_page=200&page=", page)
  json <- getURL(u)
  dat <- fromJSON(json)
  # vapply (not sapply) guarantees a character vector even when a page
  # comes back empty or malformed
  vapply(dat, function(d) d$text, character(1))
}
# Fetch tweets page by page and combine them into one character vector.
# Collecting pages with lapply avoids the O(n^2) cost of growing a
# vector with c() inside a loop.
pages <- lapply(seq_len(20), function(page) {
  get_list_statuses(user, list.id, page)
})
# The original loop prepended each new page (tw.all <- c(tw, tw.all)),
# so later pages end up first; rev() reproduces that final ordering.
tw.all <- unlist(rev(pages), use.names = FALSE)
# How many tweets have we got?
length(tw.all)
# 521
# Preprocessing pipeline adapted from
# http://heuristically.wordpress.com/2011/04/08/text-data-mining-twitter-r/
#
# Build a corpus from the raw tweet texts, then normalise it:
# lowercase everything, strip punctuation, and drop both English and
# Finnish stopwords (the sample mixes the two languages).
mydata.corpus <- Corpus(VectorSource(tw.all))
# NOTE(review): with tm >= 0.6 plain functions must be wrapped in
# content_transformer(); this form matches the tm version of 2012.
mydata.corpus <- tm_map(mydata.corpus, tolower)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation)
drop_words <- c(stopwords("english"), stopwords("finnish"))
mydata.corpus <- tm_map(mydata.corpus, removeWords, drop_words)
# Term-document matrix over the cleaned corpus: one row per term,
# one column per tweet.
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
# Print a summary of the matrix
mydata.dtm
#
# A term-document matrix (3574 terms, 521 documents)
#
# Non-/sparse entries: 4511/1857543
# Sparsity           : 100%
# Maximal term length: 31
# Weighting          : term frequency (tf)
# Inspect the most popular words (terms occurring in >= 10 tweets)
findFreqTerms(mydata.dtm, lowfreq = 10)
#
# [1] "amp" "facebook" "nikuhooli" "ocl4ed" "quote" "saarikko"
# [7] "tuija" "tulevaisuus2030" "veikkausliiga" "via" "vielä" "winninghelix"
# Which words are associated with a popular term
findAssocs(mydata.dtm, 'quote', 0.20)
# quote worth comes henry leadership thomas love person success
# 1.00 0.27 0.23 0.23 0.23 0.23 0.22 0.22 0.20
# Remove sparse terms to simplify the cluster plot.
# Note: tweak the sparse parameter to determine the number of words.
# About 10-30 words is good.
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse = 0.99)
# Convert the sparse term-document matrix to a standard data frame.
# Fix: inspect() is meant for printing (it dumps the whole matrix to
# the console); as.matrix() is the documented coercion and yields the
# same dense matrix without the side effect.
mydata.df <- as.data.frame(as.matrix(mydata.dtm2))
# Inspect dimensions of the data frame (terms x tweets)
nrow(mydata.df) # 32
ncol(mydata.df) # 521
# Hierarchical clustering of the frequent terms, written to tweets.png.
png("tweets.png")
# Standardise each row (term) so distances are not dominated by the
# raw counts of the most frequent words
mydata.df.scale <- scale(mydata.df)
d <- dist(mydata.df.scale, method = "euclidean") # distance matrix
# Fix: hclust's "ward" option was renamed — modern R requires
# "ward.D", which is the algorithm the old "ward" actually ran
fit <- hclust(d, method = "ward.D")
plot(fit)
groups <- cutree(fit, k = 8) # cut tree into k clusters
# Draw dendrogram with red borders around the k clusters
rect.hclust(fit, k = 8, border = "red")
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment