# evanemolo/list-compare.R

## list-compare.R
# Write code to find the 5 most popular words used in the descriptions of our users

# Split every description into words on runs of non-alphanumeric characters,
# flatten the per-description results into one vector with unlist(), and
# lower-case everything so that e.g. "News" and "news" are counted together.
# as.character() guards against tweets$description being a factor, which
# strsplit() refuses to process.
split.description <- tolower(unlist(
  strsplit(as.character(tweets$description), split = "[^a-zA-Z0-9]+")
))
# count the number of occurrences of each word, and pull the top five;
# head() (unlike [1:5]) does not pad the result with NA entries when there
# are fewer than five distinct words
head(rev(sort(table(split.description))), 5)
# wait...get rid of the stopwords
# load up the .csv of stopwords (no header row; keep strings as characters)
stopwords <- read.csv(
  "/Users/evanemolo/Dropbox/_ITP/_Fall_2012/DWB/HW/WK3/english-stopwords.csv",
  as.is = TRUE, header = FALSE
)
# flatten the data frame into a plain vector, then append the extra tokens to
# drop: the empty string (produced when a description starts with a
# separator), a few punctuation marks, and "http".
# NOTE(review): the split pattern above already strips "&", "-" and "|", so
# those three entries are not expected to match anything.
stopwords <- c(unlist(stopwords, use.names = FALSE), "", "&", "-", "|", "http")
# use %in% to remove the stopwords from split.description
split.desc.clean <- split.description[!(split.description %in% stopwords)]
# reverse sort the counts, and keep (at most) the first five
head(rev(sort(table(split.desc.clean))), 5)

#  news   conservative    world   love    follow
#  317        150          145     135     126
	# Write code to find the 5 most popular words used in the descriptions of our users

	# NOTE(review): this tab-indented section repeats the analysis that appears
	# earlier in the file; consider keeping only one copy.
	# Split every description into words on runs of non-alphanumeric characters,
	# flatten the per-description results into one vector with unlist(), and
	# lower-case everything so that e.g. "News" and "news" are counted together.
	# as.character() guards against tweets$description being a factor, which
	# strsplit() refuses to process.
	split.description <- tolower(unlist(
	  strsplit(as.character(tweets$description), split = "[^a-zA-Z0-9]+")
	))
	# count the number of occurrences of each word, and pull the top five;
	# head() (unlike [1:5]) does not pad the result with NA entries when there
	# are fewer than five distinct words
	head(rev(sort(table(split.description))), 5)
	# wait...get rid of the stopwords
	# load up the .csv of stopwords (no header row; keep strings as characters)
	stopwords <- read.csv(
	  "/Users/evanemolo/Dropbox/_ITP/_Fall_2012/DWB/HW/WK3/english-stopwords.csv",
	  as.is = TRUE, header = FALSE
	)
	# flatten the data frame into a plain vector, then append the extra tokens to
	# drop: the empty string (produced when a description starts with a
	# separator), a few punctuation marks, and "http".
	# The pipe is written as "|": a backslash before it is not a valid escape in
	# an R string constant and makes the whole file fail to parse.
	# NOTE(review): the split pattern above already strips "&", "-" and "|", so
	# those three entries are not expected to match anything.
	stopwords <- c(unlist(stopwords, use.names = FALSE), "", "&", "-", "|", "http")
	# use %in% to remove the stopwords from split.description
	split.desc.clean <- split.description[!(split.description %in% stopwords)]
	# reverse sort the counts, and keep (at most) the first five
	head(rev(sort(table(split.desc.clean))), 5)

	# news conservative world love follow
	# 317 150 145 135 126