Created
June 26, 2012 13:15
-
-
Save tts/2995732 to your computer and use it in GitHub Desktop.
Text mining tweets from a list of Top100 Finns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text mining suomitop100 list tweets | |
# https://twitter.com/#!/niku_hooli/ylen-suomitop100-lista/ | |
# | |
# Tuija Sonkkila | |
# 2012-06-26 | |
# | |
# Mining code is based on | |
# http://heuristically.wordpress.com/2011/04/08/text-data-mining-twitter-r/ | |
# | |
# I am a total newbie in text mining, but some remarks are fairly obvious. | |
# | |
# The text corpus is very sparse because of the small sample (521 tweets). | |
# In addition, Finns tweet in different languages, mostly Finnish and English, | |
# which adds heterogeneity. | |
# | |
# Based on this exercise, there is no answer to "What does Finland tweet about?". | |
# If anything, the result may say something about tweeting habits, e.g. | |
# quotes ('quote') and quoting ('via') seem to be rather popular. | |
# | |
# However, like the blog remarks, removal of punctuation deletes also the @ sign, | |
# transforming screen names to plain words. elinalappalaine, nikuhooli, raesmaa, | |
# saarikko, tuija, tuomasenbuske and winninghelix are all Twitter screen names. | |
# | |
# About the graph: you'll break your neck while looking at the plot (I do).
# Whether the dendrogram could be rotated, is still unsure: | |
# http://r.789695.n4.nabble.com/rotate-dendrogram-td2288537.html | |
library(RCurl) | |
library(RJSONIO) | |
library(twitteR) | |
library(tm) | |
user <- "niku_hooli" | |
list.id <- "ylen-suomitop100-lista" | |
# http://lists.hexdump.org/pipermail/twitter-users-hexdump.org/2011-December/000015.html | |
# http://twitterapi.pbworks.com/w/page/22554716/Twitter%20REST%20API%20Method%3A%20GET%20list%20statuses | |
# Fetch one page of statuses from a Twitter list.
#
# Args:
#   user:    screen name owning the list, e.g. "niku_hooli"
#   list.id: list slug, e.g. "ylen-suomitop100-lista"
#   page:    1-based page number (up to 200 tweets per page)
#
# Returns: a character vector of tweet texts, one element per status
#          (length 0 when the page is empty).
#
# NOTE(review): this targets the long-retired, unauthenticated Twitter
# REST API v1; the endpoint no longer exists — verify against the
# current API before reuse.
get_list_statuses <- function(user, list.id, page) {
  # paste0 instead of paste(sep = ""); also drop the stray "&" that
  # followed "?" in the original query string
  u <- paste0("https://api.twitter.com/1/", user, "/lists/", list.id,
              "/statuses.json?", "per_page=200&page=", page)
  json <- getURL(u)
  dat <- fromJSON(json)
  # vapply (not sapply) guarantees a character vector even when a page
  # comes back empty or malformed
  vapply(dat, function(d) d$text, character(1))
}
# Fetch tweets page by page and combine them into one character vector.
# Collecting pages with lapply avoids the O(n^2) cost of growing a
# vector with c() inside a loop.
pages <- lapply(seq_len(20), function(page) {
  get_list_statuses(user, list.id, page)
})
# The original loop prepended each new page (tw.all <- c(tw, tw.all)),
# so later pages end up first; rev() reproduces that final ordering.
tw.all <- unlist(rev(pages), use.names = FALSE)
# How many tweets have we got?
length(tw.all)
# 521
# Preprocessing pipeline adapted from
# http://heuristically.wordpress.com/2011/04/08/text-data-mining-twitter-r/
#
# Build a corpus from the raw tweet texts, then normalise it:
# lowercase everything, strip punctuation, and drop both English and
# Finnish stopwords (the sample mixes the two languages).
mydata.corpus <- Corpus(VectorSource(tw.all))
# NOTE(review): with tm >= 0.6 plain functions must be wrapped in
# content_transformer(); this form matches the tm version of 2012.
mydata.corpus <- tm_map(mydata.corpus, tolower)
mydata.corpus <- tm_map(mydata.corpus, removePunctuation)
drop_words <- c(stopwords("english"), stopwords("finnish"))
mydata.corpus <- tm_map(mydata.corpus, removeWords, drop_words)
# Term-document matrix over the cleaned corpus: one row per term,
# one column per tweet.
mydata.dtm <- TermDocumentMatrix(mydata.corpus)
# Print a summary of the matrix
mydata.dtm
#
# A term-document matrix (3574 terms, 521 documents)
#
# Non-/sparse entries: 4511/1857543
# Sparsity           : 100%
# Maximal term length: 31
# Weighting          : term frequency (tf)
# Inspect the most popular words (terms occurring in >= 10 tweets)
findFreqTerms(mydata.dtm, lowfreq = 10)
#
# [1] "amp" "facebook" "nikuhooli" "ocl4ed" "quote" "saarikko"
# [7] "tuija" "tulevaisuus2030" "veikkausliiga" "via" "vielä" "winninghelix"
# Which words are associated with a popular term
findAssocs(mydata.dtm, 'quote', 0.20)
# quote worth comes henry leadership thomas love person success
# 1.00 0.27 0.23 0.23 0.23 0.23 0.22 0.22 0.20
# Remove sparse terms to simplify the cluster plot.
# Note: tweak the sparse parameter to determine the number of words.
# About 10-30 words is good.
mydata.dtm2 <- removeSparseTerms(mydata.dtm, sparse = 0.99)
# Convert the sparse term-document matrix to a standard data frame.
# Fix: inspect() is meant for printing (it dumps the whole matrix to
# the console); as.matrix() is the documented coercion and yields the
# same dense matrix without the side effect.
mydata.df <- as.data.frame(as.matrix(mydata.dtm2))
# Inspect dimensions of the data frame (terms x tweets)
nrow(mydata.df) # 32
ncol(mydata.df) # 521
# Hierarchical clustering of the frequent terms, written to tweets.png.
png("tweets.png")
# Standardise each row (term) so distances are not dominated by the
# raw counts of the most frequent words
mydata.df.scale <- scale(mydata.df)
d <- dist(mydata.df.scale, method = "euclidean") # distance matrix
# Fix: hclust's "ward" option was renamed — modern R requires
# "ward.D", which is the algorithm the old "ward" actually ran
fit <- hclust(d, method = "ward.D")
plot(fit)
groups <- cutree(fit, k = 8) # cut tree into k clusters
# Draw dendrogram with red borders around the k clusters
rect.hclust(fit, k = 8, border = "red")
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment