@pavlov99
Created February 22, 2017 02:53
Disjoint (deduplicated) object counts with Apache Spark.
// This method uses a Window function to eliminate double counting of objects
// that belong to multiple groups.
// `groups` is a Dataset with two columns: `id` and `group`. The first column
// identifies the object, the second is a group name.
// One use of this method is customer segmentation.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{countDistinct, dense_rank}
import spark.implicits._  // enables the $"..." column syntax

// Rank each object's groups alphabetically and keep only the first,
// so every object is assigned to exactly one group.
val disjointGroups = groups
  .withColumn("_rank", dense_rank().over(Window.partitionBy("id").orderBy("group")))
  .filter($"_rank" === 1)
  .drop("_rank")
// Show the disjoint groups with their distinct object counts.
disjointGroups
  .groupBy("group")
  .agg(countDistinct("id") as "number_records")
  .orderBy("group")
  .show()
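The same deduplication logic can be sketched on plain Scala collections, without a Spark cluster, to show what the window function computes. This is a minimal illustration with hypothetical sample data (objects "a" and "b", where "a" belongs to two groups); the names `Row`, `disjoint`, and `counts` are made up for the example.

```scala
case class Row(id: String, group: String)

// Hypothetical input: object "a" belongs to two groups, "b" to one.
val groups = Seq(
  Row("a", "premium"),
  Row("a", "trial"),
  Row("b", "trial")
)

// Keep the alphabetically first group per id, mirroring
// dense_rank().over(Window.partitionBy("id").orderBy("group")) === 1.
val disjoint: Seq[Row] = groups
  .groupBy(_.id)
  .map { case (_, rows) => rows.minBy(_.group) }
  .toSeq

// Count distinct ids per group, mirroring groupBy("group") + countDistinct("id").
val counts: Map[String, Int] = disjoint
  .groupBy(_.group)
  .map { case (g, rows) => g -> rows.map(_.id).distinct.size }

// "a" is counted only once, under "premium"; without the window step,
// summing per-group counts would count it twice.
println(counts)  // Map(premium -> 1, trial -> 1)
```

Note that `dense_rank` keeps every row tied for the first group; `countDistinct("id")` in the aggregation absorbs such duplicates, so the final counts are still correct.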