# dovahcrow/dbscan_kmeans.jl

## dbscan_kmeans.jl
using Distributions
using Gadfly
using RegERMs
using DataFrames
using Clustering
using Match
using Distances
using Iterators

# Generate a noisy sine/cosine data set.
#
# x runs from 0 to 5.2 in steps of 0.01. Every y value is the curve value plus
# a draw from a Gaussian(0, 1) divided by 7.
#   x <= 1.57        -> one :sin point
#   1.57 < x < 3.2   -> one :sin point followed by one :cos point
#   x >= 3.2         -> one :cos point
#
# Returns a DataFrame with columns :x, :y and :tag.
function gensin()
  noise = Distributions.Gaussian(0, 1)
  # Evaluate curve `f` at `x` and perturb it with scaled Gaussian noise.
  jitter(f, x) = f(x) + rand(noise) / 7

  samples = DataFrame([Float64, Float64, Symbol],[:x, :y, :tag], 0)
  for x in [0:0.01:5.2]
    if x <= 1.57
      push!(samples, [x, jitter(sin, x), :sin])
    elseif x < 3.2
      # Overlap region: both curves contribute a point (sine first).
      push!(samples, [x, jitter(sin, x), :sin])
      push!(samples, [x, jitter(cos, x), :cos])
    else
      push!(samples, [x, jitter(cos, x), :cos])
    end
  end
  samples
end

# Generate points drawn from several 2-D normal distributions.
#
# Each positional argument is indexed as c[1], c[2], c[3]: the first two
# entries form the mean vector and the third is passed to MvNormal as its
# second argument. `sample_per_class` points are drawn per argument.
#
# Returns a DataFrame with columns :x, :y and :tag, where :tag is
# :Class0, :Class1, ... following the order of the arguments.
function genmdf(class...; sample_per_class = 100)
  points = DataFrame([Float64, Float64, Symbol], [:x, :y, :tag], 0)

  # Draw all samples up front, one vector of samples per class.
  batches = collect(map(class) do c
    [rand(MvNormal([c[1], c[2]], c[3])) for i in 1:sample_per_class]
  end)

  # Walk the samples in class order; the running count determines the label.
  seen = 0
  for pt in chain(batches...)
    label = symbol("Class$(int(floor(seen / sample_per_class)))")
    push!(points, [pt[1], pt[2], label])
    seen += 1
  end
  points
end

# Generate two noisy concentric circles (radii 3 and 7) centred on the origin.
#
# For each x from -20 to 20 in steps of 0.1, the squared height
# r^2 - x^2 + 5 * noise is computed per circle; when it is non-negative the
# points (x, +sqrt) and (x, -sqrt) are added, tagged :Circle1 or :Circle2.
#
# Returns a DataFrame with columns :x, :y and :tag.
function gencir()
  # The noise distribution must be defined here: the `gauss` used by gensin
  # is local to that function and is not visible in this scope.
  gauss = Distributions.Gaussian(0, 1)
  r1 = 3
  r2 = 7

  circles = DataFrame([Float64, Float64, Symbol],[:x, :y, :tag], 0)
  for idx in [-20:0.1:20]
    x12 = r1^2 - idx^2 + 5*rand(gauss)
    x22 = r2^2 - idx^2 + 5*rand(gauss)
    # Negative values mean x lies outside the (noisy) circle: no point there.
    if x12 >= 0
      push!(circles, [idx, sqrt(x12), :Circle1])
      push!(circles, [idx, -sqrt(x12), :Circle1])
    end
    if x22 >= 0
      push!(circles, [idx, sqrt(x22), :Circle2])
      push!(circles, [idx, -sqrt(x22), :Circle2])
    end
  end
  circles
end

# Cluster the :x/:y columns of `df` with k-means and plot the result.
#
# df    - data frame with :x and :y columns (it is copied, not modified)
# ks    - number of clusters passed to kmeans
# title - suffix appended to the "KMeans-" plot title
#
# Returns the Gadfly plot, coloured by the assigned cluster.
function km(df, ks = 2; title = "kmeans")
  labelled = copy(df)
  # kmeans receives the transposed matrix, i.e. one column per row of df.
  features = array(labelled[[:x,:y]])'
  result = kmeans(features, ks)
  labelled[:cluster] = [symbol("Cluster$id") for id in result.assignments]
  plot(labelled, x=:x, y=:y, color=:cluster, Geom.point, Guide.title("KMeans-$title"))
end

# Cluster the :x/:y columns of `df` with DBSCAN and plot the result.
#
# df     - data frame with :x and :y columns (it is copied, not modified)
# eps    - second argument passed to dbscan
# minpts - third argument passed to dbscan
# title  - suffix appended to the "Dbscan-" plot title
#
# Returns the Gadfly plot, coloured by the assigned cluster.
function dbs(df, eps = 0.1, minpts = 1; title = "dbscan")
  labelled = copy(df)
  n = nrow(labelled)

  # Symmetric matrix of pairwise Euclidean distances; the diagonal stays 0.
  dist = zeros(Float64, n, n)
  for i in 1:n, j in (i + 1):n
    d = euclidean(vec(array(labelled[i,[:x,:y]])), vec(array(labelled[j, [:x,:y]])))
    dist[i, j] = d
    dist[j, i] = d
  end

  result = dbscan(dist, eps, minpts)
  labelled[:cluster] = [symbol("Cluster$id") for id in result.assignments]
  plot(labelled, x=:x, y=:y, color=:cluster, Geom.point, Guide.title("Dbscan-$title"))
end

# sin
############################################
# The generated frame is bound to `sdf`; every call below must use that name
# (the previous code referenced an undefined `df`).
sdf = gensin()
plot(sdf, x=:x, y=:y, color=:tag, Geom.point)
km(sdf, 2)
dbs(sdf, 1.5, 1)

# circle
############################################
cdf = gencir()
plot(cdf, x=:x, y=:y)
km(cdf, 4)
dbs(cdf, 1, 1)

# multi gaussian
############################################
mdf = genmdf((0., 0., 1), (6., 4., 2), (-1., 7., 1), (10., -3., 2))
plot(mdf, x=:x, y=:y)
km(mdf, 4)
dbs(mdf, 1, 1)
	using Distributions
	using Gadfly
	using RegERMs
	using DataFrames
	using Clustering
	using Match
	using Distances
	using Iterators

	# Generate a noisy sine/cosine data set.
	# NOTE(review): a definition of gensin also appears earlier in this file;
	# this later one replaces it when the file is loaded.
	#
	# x runs from 0 to 5.2 in steps of 0.01. Every y value is the curve value
	# plus a draw from a Gaussian(0, 1) divided by 7.
	#   x <= 1.57        -> one :sin point
	#   1.57 < x < 3.2   -> one :sin point followed by one :cos point
	#   x >= 3.2         -> one :cos point
	#
	# Returns a DataFrame with columns :x, :y and :tag.
	function gensin()
		noise = Distributions.Gaussian(0, 1)
		# Evaluate curve `f` at `x` and perturb it with scaled Gaussian noise.
		jitter(f, x) = f(x) + rand(noise) / 7

		samples = DataFrame([Float64, Float64, Symbol],[:x, :y, :tag], 0)
		for x in [0:0.01:5.2]
			if x <= 1.57
				push!(samples, [x, jitter(sin, x), :sin])
			elseif x < 3.2
				# Overlap region: both curves contribute a point (sine first).
				push!(samples, [x, jitter(sin, x), :sin])
				push!(samples, [x, jitter(cos, x), :cos])
			else
				push!(samples, [x, jitter(cos, x), :cos])
			end
		end
		samples
	end

	# Generate points drawn from several 2-D normal distributions.
	#
	# Each positional argument is indexed as c[1], c[2], c[3]: the first two
	# entries form the mean vector and the third is passed to MvNormal as its
	# second argument. `sample_per_class` points are drawn per argument.
	#
	# Returns a DataFrame with columns :x, :y and :tag, where :tag is
	# :Class0, :Class1, ... following the order of the arguments.
	function genmdf(class...; sample_per_class = 100)
		points = DataFrame([Float64, Float64, Symbol], [:x, :y, :tag], 0)

		# Draw all samples up front, one vector of samples per class.
		batches = collect(map(class) do c
			[rand(MvNormal([c[1], c[2]], c[3])) for i in 1:sample_per_class]
		end)

		# Walk the samples in class order; the running count determines the label.
		seen = 0
		for pt in chain(batches...)
			label = symbol("Class$(int(floor(seen / sample_per_class)))")
			push!(points, [pt[1], pt[2], label])
			seen += 1
		end
		points
	end

	# Generate two noisy concentric circles (radii 3 and 7) centred on the origin.
	#
	# For each x from -20 to 20 in steps of 0.1, the squared height
	# r^2 - x^2 + 5 * noise is computed per circle; when it is non-negative the
	# points (x, +sqrt) and (x, -sqrt) are added, tagged :Circle1 or :Circle2.
	#
	# Returns a DataFrame with columns :x, :y and :tag.
	function gencir()
		# The noise distribution must be defined here: the `gauss` used by gensin
		# is local to that function and is not visible in this scope.
		gauss = Distributions.Gaussian(0, 1)
		r1 = 3
		r2 = 7

		circles = DataFrame([Float64, Float64, Symbol],[:x, :y, :tag], 0)
		for idx in [-20:0.1:20]
			x12 = r1^2 - idx^2 + 5*rand(gauss)
			x22 = r2^2 - idx^2 + 5*rand(gauss)
			# Negative values mean x lies outside the (noisy) circle: no point there.
			if x12 >= 0
				push!(circles, [idx, sqrt(x12), :Circle1])
				push!(circles, [idx, -sqrt(x12), :Circle1])
			end
			if x22 >= 0
				push!(circles, [idx, sqrt(x22), :Circle2])
				push!(circles, [idx, -sqrt(x22), :Circle2])
			end
		end
		circles
	end

	# Cluster the :x/:y columns of `df` with k-means and plot the result.
	#
	# df    - data frame with :x and :y columns (it is copied, not modified)
	# ks    - number of clusters passed to kmeans
	# title - suffix appended to the "KMeans-" plot title
	#
	# Returns the Gadfly plot, coloured by the assigned cluster.
	function km(df, ks = 2; title = "kmeans")
		labelled = copy(df)
		# kmeans receives the transposed matrix, i.e. one column per row of df.
		features = array(labelled[[:x,:y]])'
		result = kmeans(features, ks)
		labelled[:cluster] = [symbol("Cluster$id") for id in result.assignments]
		plot(labelled, x=:x, y=:y, color=:cluster, Geom.point, Guide.title("KMeans-$title"))
	end

	# Cluster the :x/:y columns of `df` with DBSCAN and plot the result.
	#
	# df     - data frame with :x and :y columns (it is copied, not modified)
	# eps    - second argument passed to dbscan
	# minpts - third argument passed to dbscan
	# title  - suffix appended to the "Dbscan-" plot title
	#
	# Returns the Gadfly plot, coloured by the assigned cluster.
	function dbs(df, eps = 0.1, minpts = 1; title = "dbscan")
		labelled = copy(df)
		n = nrow(labelled)

		# Symmetric matrix of pairwise Euclidean distances; the diagonal stays 0.
		dist = zeros(Float64, n, n)
		for i in 1:n, j in (i + 1):n
			d = euclidean(vec(array(labelled[i,[:x,:y]])), vec(array(labelled[j, [:x,:y]])))
			dist[i, j] = d
			dist[j, i] = d
		end

		result = dbscan(dist, eps, minpts)
		labelled[:cluster] = [symbol("Cluster$id") for id in result.assignments]
		plot(labelled, x=:x, y=:y, color=:cluster, Geom.point, Guide.title("Dbscan-$title"))
	end

	# sin
	############################################
	# The generated frame is bound to `sdf`; every call below must use that name
	# (the previous code referenced an undefined `df`).
	sdf = gensin()
	plot(sdf, x=:x, y=:y, color=:tag, Geom.point)
	km(sdf, 2)
	dbs(sdf, 1.5, 1)

	# circle
	############################################
	cdf = gencir()
	plot(cdf, x=:x, y=:y)
	km(cdf, 4)
	dbs(cdf, 1, 1)

	# multi gaussian
	############################################
	mdf = genmdf((0., 0., 1), (6., 4., 2), (-1., 7., 1), (10., -3., 2))
	plot(mdf, x=:x, y=:y)
	km(mdf, 4)
	dbs(mdf, 1, 1)