# timabe/yelp_hclust

## yelp_hclust
# Hierarchical clustering of Yelp business categories.
#
# Input : busCats.Rdata (from github.com/timabe), which provides `bus.cats`,
#         a data frame with one row per business and one numeric column per
#         category (presumably 0/1 indicators -- confirm against the data).
# Output: `catSums` (sorted category totals), the filtered `bus.cats`,
#         `n` (prettified category names), `d` (distance matrix between
#         categories) and `h` (the hclust object).
load('busCats.Rdata')

# Per-category totals, ordered from most to least frequent ahead of plotting.
catSums <- colSums(bus.cats)
catSums <- catSums[order(-catSums)]
plot(log(catSums))
# The plot shows a skewed set. Many of the categories will be useless in the
# hierarchical clustering, as there are only a handful of observations of
# them; it is unlikely they would produce an interesting cluster membership.
# Let's concentrate on the top 60, since there is a slight kink there, and
# for similarity purposes we only want businesses that have multiple
# categories.

# Keep businesses with at least two categories. The count is taken over ALL
# categories, i.e. before the restriction to the most frequent ones below.
bus.cats$bizSums <- rowSums(bus.cats)
bus.cats <- subset(bus.cats, bizSums >= 2, select = -c(bizSums))

# Restrict to the 60 most frequent categories. head() caps the selection at
# the number of categories actually present, so a smaller data set does not
# fail with NA column names the way `catSums[1:60]` would.
bus.cats <- bus.cats[, head(names(catSums), 60), drop = FALSE]

# All the category columns are prefixed with "category_".
# Dropping everything up to and including the first underscore gives us
# prettier names, and keeps the full remainder even if a category name
# itself contains an underscore.
n <- sub('^[^_]*_', '', colnames(bus.cats))
colnames(bus.cats) <- n

# Now create a distance matrix between categories, using 'binary' as the
# (dis)similarity measure. t() puts categories in rows, which is what dist()
# compares. Squaring does not change which entries are zero, and zero versus
# non-zero is all the 'binary' method looks at.
d <- dist(t(bus.cats)^2, method = 'binary')

# And creating the hierarchical clustering object is as easy as this!
h <- hclust(d, method = 'single')