# tts/gist:4065001

## gistfile1.r
###############################################################
#
# Word cloud from Aalto People topics of interest
#
# Data from Linked Open Aalto Data Service
# http://data.aalto.fi/
#
# R code below is only slightly adapted from the first example at
# http://onertipaday.blogspot.fi/2011/07/word-cloud-in-r.html
#
# Tuija Sonkkila 13.11.2012, 18.5.2013
#
#
# Note that only a fraction of Aalto University staff
# has an open Aalto People profile. Again, from those that have,
# not everyone has mentioned a topic of interest.
#
# SPARQL 1.0 query for finding out how many haven't mentioned any interest:
# (see e.g. Bob DuCharme, Learning SPARQL, p. 57-58)
#
# SELECT (COUNT(DISTINCT ?nimi) AS ?NoTopics)
#  WHERE {
#   GRAPH <http://data.aalto.fi/id/people/>
#   {
#    ?person <http://xmlns.com/foaf/0.1/name> ?nimi .
#    OPTIONAL {  ?person <http://xmlns.com/foaf/0.1/topic_interest> ?topic . }
#    FILTER (!bound(?topic))
#   }
#  }
#
# And SPARQL 1.1
#
# SELECT (COUNT(DISTINCT ?nimi) AS ?NoTopics)
# WHERE {
#   GRAPH <http://data.aalto.fi/id/people/>
#  {
#    ?person foaf:name ?nimi .
#    FILTER NOT EXISTS { ?person foaf:topic_interest ?topic }
#  }
# }
#
###############################################################

library(tm)
library(wordcloud)
library(RColorBrewer)
library(SPARQL)

endpoint <- "http://data.aalto.fi/sparql"

# Select the rdfs:label of every topic of interest linked to a named person
# in the Aalto People graph, ordered alphabetically.
q <- "SELECT ?interest
WHERE {
  GRAPH <http://data.aalto.fi/id/people/>
  {
   ?person <http://xmlns.com/foaf/0.1/name> ?nimi .
   ?person <http://xmlns.com/foaf/0.1/topic_interest> ?topic .
   ?topic  <http://www.w3.org/2000/01/rdf-schema#label> ?interest .
  }
}
ORDER BY ?interest"

res <- SPARQL(url = endpoint, query = q)$results

# The res data frame was in wide format with a variable/column for every topic
# (the corpus was then built from res[1:987]).
# EDIT 18.5.2013 Now, the dimension has changed
dim(res)

# Fail early with a readable message instead of an obscure error further down.
if (NROW(res) == 0L || NCOL(res) == 0L) {
  stop("SPARQL query returned no topics of interest", call. = FALSE)
}

# One document per result row, taken from the first column of the result.
# VectorSource is used rather than DataframeSource because recent tm releases
# only accept data frames with "doc_id" and "text" columns there.
res.corpus <- VCorpus(VectorSource(as.character(res[[1]])))
res.corpus <- tm_map(res.corpus, removePunctuation)
# Plain character functions must be wrapped so the documents stay tm documents;
# otherwise TermDocumentMatrix() rejects the corpus.
res.corpus <- tm_map(res.corpus, content_transformer(tolower))
res.corpus <- tm_map(res.corpus, removeWords, stopwords("english"))

# Term frequencies over the whole corpus, most frequent first.
tdm <- TermDocumentMatrix(res.corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)

# Drop the two lightest shades of the palette.
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:2)]

png("wordcloud.png", width = 1280, height = 800)
# Close the device even when plotting fails, so the file handle is not leaked.
tryCatch(
  wordcloud(
    d$word, d$freq,
    scale = c(8, .3), min.freq = 2, max.words = 100,
    random.order = TRUE, rot.per = .15, colors = pal,
    vfont = c("sans serif", "plain")
  ),
  finally = dev.off()
)
	###############################################################
	#
	# Word cloud from Aalto People topics of interest
	#
	# Data from Linked Open Aalto Data Service
	# http://data.aalto.fi/
	#
	# R code below is only slightly adapted from the first example at
	# http://onertipaday.blogspot.fi/2011/07/word-cloud-in-r.html
	#
	# Tuija Sonkkila 13.11.2012, 18.5.2013
	#
	#
	# Note that only a fraction of Aalto University staff
	# has an open Aalto People profile. Again, from those that have,
	# not everyone has mentioned a topic of interest.
	#
	# SPARQL 1.0 query for finding out how many haven't mentioned any interest:
	# (see e.g. Bob DuCharme, Learning SPARQL, p. 57-58)
	#
	# SELECT (COUNT(DISTINCT ?nimi) AS ?NoTopics)
	# WHERE {
	# GRAPH <http://data.aalto.fi/id/people/>
	# {
	# ?person <http://xmlns.com/foaf/0.1/name> ?nimi .
	# OPTIONAL { ?person <http://xmlns.com/foaf/0.1/topic_interest> ?topic . }
	# FILTER (!bound(?topic))
	# }
	# }
	#
	# And SPARQL 1.1
	#
	# SELECT (COUNT(DISTINCT ?nimi) AS ?NoTopics)
	# WHERE {
	# GRAPH <http://data.aalto.fi/id/people/>
	# {
	# ?person foaf:name ?nimi .
	# FILTER NOT EXISTS { ?person foaf:topic_interest ?topic }
	# }
	# }
	#
	###############################################################

	library(tm)
	library(wordcloud)
	library(RColorBrewer)
	library(SPARQL)

	endpoint <- "http://data.aalto.fi/sparql"

	# Select the rdfs:label of every topic of interest linked to a named person
	# in the Aalto People graph, ordered alphabetically.
	q <- "SELECT ?interest
	WHERE {
	GRAPH <http://data.aalto.fi/id/people/>
	{
	?person <http://xmlns.com/foaf/0.1/name> ?nimi .
	?person <http://xmlns.com/foaf/0.1/topic_interest> ?topic .
	?topic <http://www.w3.org/2000/01/rdf-schema#label> ?interest .
	}
	}
	ORDER BY ?interest"

	res <- SPARQL(url = endpoint, query = q)$results

	# The res data frame was in wide format with a variable/column for every topic
	# (the corpus was then built from res[1:987]).
	# EDIT 18.5.2013 Now, the dimension has changed
	dim(res)

	# Fail early with a readable message instead of an obscure error further down.
	if (NROW(res) == 0L || NCOL(res) == 0L) {
	  stop("SPARQL query returned no topics of interest", call. = FALSE)
	}

	# One document per result row, taken from the first column of the result.
	# VectorSource is used rather than DataframeSource because recent tm releases
	# only accept data frames with "doc_id" and "text" columns there.
	res.corpus <- VCorpus(VectorSource(as.character(res[[1]])))
	res.corpus <- tm_map(res.corpus, removePunctuation)
	# Plain character functions must be wrapped so the documents stay tm documents;
	# otherwise TermDocumentMatrix() rejects the corpus.
	res.corpus <- tm_map(res.corpus, content_transformer(tolower))
	res.corpus <- tm_map(res.corpus, removeWords, stopwords("english"))

	# Term frequencies over the whole corpus, most frequent first.
	tdm <- TermDocumentMatrix(res.corpus)
	m <- as.matrix(tdm)
	v <- sort(rowSums(m), decreasing = TRUE)
	d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)

	# Drop the two lightest shades of the palette.
	pal <- brewer.pal(9, "BuGn")
	pal <- pal[-(1:2)]

	png("wordcloud.png", width = 1280, height = 800)
	# Close the device even when plotting fails, so the file handle is not leaked.
	tryCatch(
	  wordcloud(
	    d$word, d$freq,
	    scale = c(8, .3), min.freq = 2, max.words = 100,
	    random.order = TRUE, rot.per = .15, colors = pal,
	    vfont = c("sans serif", "plain")
	  ),
	  finally = dev.off()
	)