Skip to content

Instantly share code, notes, and snippets.

@mgadzhi
Created March 9, 2021 22:11
Show Gist options
  • Save mgadzhi/cd513f7a9ad3feb45240f8732320e06f to your computer and use it in GitHub Desktop.
Save mgadzhi/cd513f7a9ad3feb45240f8732320e06f to your computer and use it in GitHub Desktop.
First version of DBSCAN in Kotlin
import kotlin.math.*
// First, we need to define, what a 'distance' is
interface Distance<T> {
fun T.distance(to: T): Double
}
// Now let's define few concrete data classes that will be used to store the data
data class Location(val lat: Double, val lon: Double)
data class Point(val x: Double, val y: Double)
// It is possible to define different distance functions for different data classes
// It is also possible to define different distance functions for the same data class
// For example, we could define full blown 'geodesic' distance for locations
// I tried to reproduce something like 'Typeclass' pattern which I would use in Scala/Haskell
val euclideanDistance = object : Distance<Point> {
override fun Point.distance(to: Point): Double {
return sqrt((x - to.x).pow(2) + (y - to.y).pow(2))
}
}
val haversineDistance = object : Distance<Location> {
override fun Location.distance(to: Location): Double {
val R = 6371000.0
val lat1 = Math.toRadians(lat)
val lat2 = Math.toRadians(to.lat)
val lon1 = Math.toRadians(lon)
val lon2 = Math.toRadians(to.lon)
return 2 * R * asin(sqrt(((lat1 - lat2) / 2).pow(2) + cos(lat1) * cos(lat2) * sin((lon1 - lon2) / 2).pow(2)))
}
}
// We will only require a 'cluster' to have an id for now
// And let's say we use integers as identifiers
data class Cluster(val id: Int)
// Now we can try to define what types can be 'clustered'
// 'We know how to compute a distance between 2 objects of a type' => 'We can cluster a list of such objects'
interface ClusteringAlgorithm<T, D> {
fun<D: Distance<T>> fit_transform(xs: List<T>, dist: D): List<Cluster>
}
// I first implemented a minimalistic version of a Matrix to hold pairwise distances,
// but I didn't really use everything from here after all.
// On a bright side, it's easy to add a method to ClusteringAlgorithm that would get
// an arbitrary pairwise distance matrix and skip its own distance calculations
// as it's done in sklearn
interface DummyMatrix<T> {
fun get(i: Int, j: Int): T
fun bitMask(f: (item: T) -> Boolean): DummyMatrix<Boolean>
}
open class DummySquareMatrix<T>(val entries: List<T>, val dim: Int): DummyMatrix<T> {
override fun get(i: Int, j: Int): T {
return entries[i * dim + j]
}
override fun bitMask(f: (item: T) -> Boolean): DummyMatrix<Boolean> {
return DummySquareMatrix(entries.map(f), dim)
}
}
class PairwiseDistanceMatrix private constructor(entries: List<Double>, dim: Int): DummySquareMatrix<Double>(entries, dim) {
companion object {
operator fun <T, D: Distance<T>> invoke(locs: List<T>, dist: D): PairwiseDistanceMatrix {
val n = locs.size
val distances = mutableListOf<Double>()
for (i in 0 until n) {
for (j in 0 until n) {
distances.add(dist.run { locs[i].distance(locs[j]) })
}
}
return PairwiseDistanceMatrix(distances, n)
}
}
}
// Simplified implementation of DBSCAN
// It doesn't take into account edge points
class DBSCAN(val eps: Double, val minPts: Int): ClusteringAlgorithm<Location, Distance<Location>> {
override fun <D : Distance<Location>> fit_transform(xs: List<Location>, dist: D): List<Cluster> {
val result = MutableList(xs.size) {Cluster(-1)}
var c = 0
val distances = PairwiseDistanceMatrix(xs, dist)
var neighbors: MutableList<Int>
for (i in xs.indices) {
if (result[i].id != -1) {
continue
}
neighbors = mutableListOf()
for (j in xs.indices) {
if (distances.get(i, j) < eps) {
neighbors.add(j)
}
}
if (neighbors.size < minPts) {
continue
}
c += 1
for (j in neighbors) {
if(result[j].id == -1) {
result[j] = Cluster(c)
}
}
}
return result
}
}
fun main() {
val a = Point(1.0, 1.0)
val b = Point(0.0, 0.0)
val d = euclideanDistance.run { a.distance(b) }
println(d)
val loc1 = Location(-34.83333, -58.5166646)
val loc2 = Location(49.0083899664, 2.53844117956)
val d2 = haversineDistance.run{ loc1.distance(loc2) } / 1000
val d3 = haversineDistance.run{ loc2.distance(loc1) } / 1000
val d4 = haversineDistance.run{ loc1.distance(loc1) } / 1000
println(d2)
println(d3)
println(d4)
val dataset = listOf<Location>(
// Groenplaats
Location(51.219227, 4.401766),
Location(51.218729, 4.401970),
Location(51.219162, 4.401155),
// Sentiance
Location(51.196732, 4.407988),
Location(51.196743, 4.408003),
Location(51.196677, 4.408226),
// Stadspark
Location(51.211933, 4.413849),
Location(51.212721, 4.414198),
Location(51.212113, 4.414257),
)
val dbscan = DBSCAN(150.0, 2)
val dbscanClusters = dbscan.fit_transform(dataset, haversineDistance)
println(dbscanClusters.map { it.id })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment