Skip to content

Instantly share code, notes, and snippets.

@hhbyyh
Last active February 9, 2022 05:19
Show Gist options
  • Save hhbyyh/346467373014943a7f20df208caeb19b to your computer and use it in GitHub Desktop.
Save hhbyyh/346467373014943a7f20df208caeb19b to your computer and use it in GitHub Desktop.
package org.apache.spark.ml.feature
import org.apache.spark.ml.linalg.BLAS.axpy
import org.apache.spark.ml.linalg._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.util.Random
/**
 * SMOTE-style synthetic sample generator: new points are produced by interpolating between
 * an input point and one of its nearest neighbours.
 *
 * Created by yuhao on 12/1/16.
 */
object SmoteSampler {

  /**
   * Generates synthetic samples for every input point.
   *
   * For each point the (up to) `k` nearest other points are found by brute force over the
   * cartesian product of `data` with itself, so the cost is quadratic in the number of points.
   * Each synthetic sample is `vec + r * (neighbour - vec)` with `r` drawn uniformly from
   * [0, 1) and `neighbour` drawn uniformly from the nearest neighbours.
   *
   * @param data (id, feature vector) pairs; points are grouped by id, so ids are expected to be
   *             unique, and all vectors must have the same size
   * @param k    number of nearest neighbours to sample from; must be positive. When fewer than
   *             `k` other points exist, all available neighbours are used
   * @param N    number of synthetic samples to generate per input point; a non-positive value
   *             yields no samples
   * @return the synthetic samples (the original points are not included)
   */
  def generateSamples(data: RDD[(Long, Vector)], k: Int, N: Int): RDD[Vector] = {
    require(k > 0, s"k must be positive but got $k")

    val neighbourhoods = data.cartesian(data)
      .map { case ((id1, vec1), (_, vec2)) => (id1, vec1, vec2) }
      .groupBy(_._1)
      .map { case (_, pairs) =>
        val arr = pairs.toArray
        // The point itself is part of the cartesian product at distance 0, so the closest
        // entry is skipped; entries 1..k are the nearest neighbours (fewer if data is small).
        val nearest = arr.sortBy(t => Vectors.sqdist(t._2, t._3)).slice(1, k + 1).map(_._3)
        (arr(0)._2, nearest)
      }

    neighbourhoods.flatMap { case (vec, neighbours) =>
      if (neighbours.isEmpty) {
        // A lone point has nothing to interpolate towards.
        Iterator.empty
      } else {
        (1 to N).iterator.map { _ =>
          // Bound the index by the actual neighbour count, which may be smaller than k.
          val rn = neighbours(Random.nextInt(neighbours.length))
          // axpy only accumulates into a dense vector; cloning the backing array gives a
          // private dense buffer for both dense and sparse inputs.
          val diff: Vector = new DenseVector(rn.toArray.clone())
          axpy(-1.0, vec, diff) // diff = rn - vec
          val newVec: Vector = new DenseVector(vec.toArray.clone())
          axpy(Random.nextDouble(), diff, newVec) // newVec = vec + r * diff
          newVec
        }
      }
    }
  }
}
// NOTE: everything below belongs in a separate source file (e.g. SmoteTest.scala).
package org.apache.spark.ml.feature
import org.apache.spark.ml.linalg.BLAS.axpy
import org.apache.spark.ml.linalg._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.util.Random
/**
 * Minimal local driver that runs [[SmoteSampler.generateSamples]] on a three-point data set
 * and prints the generated samples.
 */
object SmoteTest {

  /**
   * Starts a local Spark session, generates `N` synthetic samples per point using `k`
   * nearest neighbours, prints them, and stops the session.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[2]")
      .appName("smote example")
      .getOrCreate()

    // Stop the session even if a job fails, so the local Spark context is always released.
    try {
      val df = spark.createDataFrame(Seq(
        (0L, Vectors.dense(1, 2)),
        (1L, Vectors.dense(3, 4)),
        (2L, Vectors.dense(5, 6))
      )).toDF("id", "features")

      val k = 2
      val N = 3

      val data = df.rdd.map(r => (r.getLong(0), r.getAs[Vector](1)))
      val newSamples = SmoteSampler.generateSamples(data, k, N)
      newSamples.collect().foreach(println)
    } finally {
      spark.stop()
    }
  }
}
@linhanwang
Copy link

Your implementation may be too slow.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment