tts/gist:3053494

## gistfile1.r
########################################################################################
#
#  Make a list of Twitter screen names from
#  http://yle.fi/extrem/artikel/musiknoje/44988-Har-ar-svenskfinlands-basta-twittrare
#
#  AFAIK there is no Twitter list, so we need HTML parsing
#
#  Tuija Sonkkila
#  2012-07-05

library(RCurl)
library(RJSONIO)
library(XML)

# Page that lists the recommended Twitter accounts
url <- "http://yle.fi/extrem/artikel/musiknoje/44988-Har-ar-svenskfinlands-basta-twittrare"

# Download the page as a character string and parse it as HTML.
# asText = TRUE states explicitly that `d` holds the markup itself, instead of
# leaving htmlParse() to guess whether its argument is content or a file name/URL.
d <- getURL(url)
doc <- htmlParse(d, asText = TRUE)

# Common prefix of the Twitter profile links we are interested in
twitter_prefix <- "https://twitter.com/"

# Store into a list (src) all href attributes of those link elements which refer to Twitter
# (there are no extra, unrelated links on the page)
src <- xpathApply(
  doc,
  sprintf("//a[starts-with(@href, '%s')]", twitter_prefix),
  xmlGetAttr,
  "href"
)

# Strip the prefix from every list element, to get the screen names only.
# fixed = TRUE matches the prefix literally, so "." is not treated as a regex wildcard.
ppl <- lapply(src, function(x) gsub(twitter_prefix, "", x, fixed = TRUE))

# During the text mining phase later on, I got errors that I assumed were related to
# the character encoding of the corpus (latin1). This conversion helped:
#
# tweets.utf8 <- iconv(tweets, "latin1", "UTF-8")
	########################################################################################
	#
	# Make a list of Twitter screen names from
	# http://yle.fi/extrem/artikel/musiknoje/44988-Har-ar-svenskfinlands-basta-twittrare
	#
	# AFAIK there is no Twitter list, so we need HTML parsing
	#
	# Tuija Sonkkila
	# 2012-07-05

	library(RCurl)
	library(RJSONIO)
	library(XML)

	# Page that lists the recommended Twitter accounts
	url <- "http://yle.fi/extrem/artikel/musiknoje/44988-Har-ar-svenskfinlands-basta-twittrare"

	# Download the page as a character string and parse it as HTML.
	# asText = TRUE states explicitly that `d` holds the markup itself, instead of
	# leaving htmlParse() to guess whether its argument is content or a file name/URL.
	d <- getURL(url)
	doc <- htmlParse(d, asText = TRUE)

	# Common prefix of the Twitter profile links we are interested in
	twitter_prefix <- "https://twitter.com/"

	# Store into a list (src) all href attributes of those link elements which refer to Twitter
	# (there are no extra, unrelated links on the page)
	src <- xpathApply(
		doc,
		sprintf("//a[starts-with(@href, '%s')]", twitter_prefix),
		xmlGetAttr,
		"href"
	)

	# Strip the prefix from every list element, to get the screen names only.
	# fixed = TRUE matches the prefix literally, so "." is not treated as a regex wildcard.
	ppl <- lapply(src, function(x) gsub(twitter_prefix, "", x, fixed = TRUE))

	# During the text mining phase later on, I got errors that I assumed were related to
	# the character encoding of the corpus (latin1). This conversion helped:
	#
	# tweets.utf8 <- iconv(tweets, "latin1", "UTF-8")