Skip to content

Instantly share code, notes, and snippets.

@dovahcrow
Created May 3, 2015 16:58
Show Gist options
  • Save dovahcrow/d414594810924fbb569e to your computer and use it in GitHub Desktop.
Save dovahcrow/d414594810924fbb569e to your computer and use it in GitHub Desktop.
dbscan vs kmeans
using Distributions
using Gadfly
using RegERMs
using DataFrames
using Clustering
using Match
using Distances
using Iterators
# Generate a noisy sine/cosine data set over x = 0:0.01:5.2.
#
# Every y value is sin(x) or cos(x) plus a standard-normal draw divided by 7.
#   * x <= 1.57        -> one :sin point
#   * 1.57 < x < 3.2   -> one :sin point and one :cos point (the curves overlap)
#   * x >= 3.2         -> one :cos point
#
# Returns a DataFrame with columns :x (Float64), :y (Float64) and :tag (Symbol).
gensin() = begin
    noise = Distributions.Gaussian(0, 1)
    jitter() = rand(noise) / 7

    samples = DataFrame([Float64, Float64, Symbol], [:x, :y, :tag], 0)
    for x in 0:0.01:5.2
        if x <= 1.57
            push!(samples, [x, sin(x) + jitter(), :sin])
        elseif x < 3.2
            # Overlap region: the sine point is drawn before the cosine point.
            push!(samples, [x, sin(x) + jitter(), :sin])
            push!(samples, [x, cos(x) + jitter(), :cos])
        else
            push!(samples, [x, cos(x) + jitter(), :cos])
        end
    end
    samples
end
# Generate a labelled mixture of 2-D Gaussian blobs.
#
# Each positional argument `c` describes one class: `c[1]` and `c[2]` form the
# mean vector and `c[3]` is passed to `MvNormal` as its second argument.
# `sample_per_class` points are drawn for every class, in argument order.
#
# Returns a DataFrame with columns :x, :y and :tag, where the tag of the k-th
# class (1-based) is the symbol `Class<k-1>`, i.e. Class0, Class1, ...
genmdf(class... ;sample_per_class = 100) = begin
    points = DataFrame([Float64, Float64, Symbol], [:x, :y, :tag], 0)
    for (k, c) in enumerate(class)
        label = symbol("Class$(k - 1)")
        for i in 1:sample_per_class
            p = rand(MvNormal([c[1], c[2]], c[3]))
            push!(points, [p[1], p[2], label])
        end
    end
    points
end
# Generate two noisy concentric circles (radii 3 and 7) centred on the origin.
#
# For every x in -20:0.1:20 the squared height r^2 - x^2 is perturbed by
# 5 times a standard-normal draw. When the perturbed value is non-negative,
# both the upper (+sqrt) and lower (-sqrt) points are emitted; otherwise that
# x contributes no point for that circle.
#
# Returns a DataFrame with columns :x, :y and :tag (:Circle1 or :Circle2).
gencir() = begin
    # The noise distribution must be defined here: the `gauss` inside `gensin`
    # is local to that function and is not visible from this one.
    gauss = Distributions.Gaussian(0, 1)
    r1 = 3
    r2 = 7

    df = DataFrame([Float64, Float64, Symbol], [:x, :y, :tag], 0)
    for idx in -20:0.1:20
        # Inner circle is drawn before the outer circle for each x.
        x12 = r1^2 - idx^2 + 5 * rand(gauss)
        x22 = r2^2 - idx^2 + 5 * rand(gauss)
        if x12 >= 0
            push!(df, [idx, sqrt(x12), :Circle1])
            push!(df, [idx, -sqrt(x12), :Circle1])
        end
        if x22 >= 0
            push!(df, [idx, sqrt(x22), :Circle2])
            push!(df, [idx, -sqrt(x22), :Circle2])
        end
    end
    df
end
# Run k-means on the :x/:y columns of `df` and plot the resulting clusters.
#
# `ks` is forwarded to `kmeans` as its second argument (presumably the number
# of clusters). `title` is appended to "KMeans-" in the plot title.
# The input `df` is not modified: the :cluster column is added to a copy.
km(df, ks = 2; title = "kmeans") = begin
    labelled = copy(df)
    # Transpose so that each column of `coords` is one (x, y) observation.
    coords = array(labelled[[:x, :y]])'
    result = kmeans(coords, ks)
    labelled[:cluster] = [symbol("Cluster$a") for a in result.assignments]
    plot(labelled, x=:x, y=:y, color=:cluster, Geom.point, Guide.title("KMeans-$title"))
end
# Run DBSCAN on the :x/:y columns of `df` and plot the resulting clusters.
#
# A full symmetric Euclidean distance matrix is built and handed to `dbscan`
# together with `eps` and `minpts` (presumably the neighbourhood radius and
# the minimum neighbourhood size). `title` is appended to "Dbscan-" in the
# plot title. The input `df` is not modified: the :cluster column is added to
# a copy.
dbs(df, eps = 0.1, minpts = 1; title = "dbscan") = begin
    clustered = copy(df)
    n = nrow(clustered)

    # Extract the coordinates once (one observation per column) instead of
    # building two one-row DataFrames for every pair of points.
    pts = array(clustered[[:x, :y]])'

    D = zeros(Float64, n, n)
    # Only the upper triangle is computed; the diagonal stays zero and the
    # lower triangle is mirrored.
    for i in 1:n, j in (i + 1):n
        dis = euclidean(pts[:, i], pts[:, j])
        D[i, j] = dis
        D[j, i] = dis
    end

    dbres = dbscan(D, eps, minpts)
    clustered[:cluster] = [symbol("Cluster$t") for t in dbres.assignments]
    plot(clustered, x=:x, y=:y, color=:cluster, Geom.point, Guide.title("Dbscan-$title"))
end
# sin: overlapping noisy sine and cosine branches
############################################
sdf = gensin()
# The data set is bound to `sdf`; no variable named `df` exists at top level.
plot(sdf, x=:x, y=:y, color=:tag, Geom.point)
km(sdf, 2)
dbs(sdf, 1.5, 1)
# circle: two noisy concentric circles
cdf = gencir()
plot(cdf, x=:x, y=:y,)
km(cdf, 4)
dbs(cdf, 1, 1)
# multi gaussian: four Gaussian blobs
################################
mdf = genmdf((0., 0., 1), (6., 4., 2), (-1., 7., 1), (10., -3., 2))
plot(mdf, x=:x, y=:y,)
km(mdf, 4)
dbs(mdf, 1, 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment