@mkolod
Created July 6, 2015 18:11
Optimized Inner Join in Spark
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// Note: this method references a SparkContext named "sc", which is assumed to be in scope
// in the enclosing class or object.

/** Hive/Pig/Cascading/Scalding-style inner join which performs a map-side/replicated/broadcast
 * join if the "small" relation has fewer than maxNumRows rows, and a reduce-side join otherwise.
 * @param big the large relation
 * @param small the small relation
 * @param maxNumRows the maximum number of rows that the small relation can have to be a
 * candidate for a map-side/replicated/broadcast join
 * @return a joined RDD with a common key and a tuple of values from the two
 * relations (the big relation value first, followed by the small one)
 */
private def optimizedInnerJoin[A : ClassTag, B : ClassTag, C : ClassTag]
    (big: RDD[(A, B)], small: RDD[(A, C)], maxNumRows: Long): RDD[(A, (B, C))] = {
  /* Caching is needed for efficiency's sake, since the choice between
   * map- and reduce-side joins is based on the row count of the
   * smaller relation. The count will materialize the small relation.
   * If it's too big for a map-side join, it will already be cached
   * for the reduce-side join. Caching is idempotent, so nothing
   * happens if the dataset is already cached.
   */
  small.cache()
  val joined =
    if (small.count() <= maxNumRows) {
      /* An alternative would have been "small.collectAsMap()"
       * (http://ampcamp.berkeley.edu/wp-content/uploads/2012/06/matei-zaharia-amp-camp-2012-advanced-spark.pdf),
       * but that gives incorrect results: the map deduplicates entries with identical keys,
       * even though duplicate keys are a normal occurrence in MapReduce frameworks (that's the
       * rationale for grouping entries by key in the reduce stage). The simpler solution therefore
       * gives incorrect results in such cases, which constitute the vast majority of key-value
       * RDD use cases.
       */
      val grouped: Map[A, Array[C]] =
        small.
          collect().
          groupBy { case (key, _) => key }.
          map { case (key, kv) => (key, kv.map { case (_, v) => v }) }
      /* Broadcast the map representing the small relation to all nodes.
       * Joining against the big dataset will be done locally on each node
       * for all partitions at the map stage. This is called a map-side join
       * in Hadoop-land, or a replicated join in distributed relational
       * databases. In the Spark context, we can also call it a broadcast join.
       */
      val smallBc = sc.broadcast(grouped)
      big.flatMap { case (a, b) =>
        // Emit one output row per matching value in the small relation; keys absent
        // from the broadcast map produce no output rows (inner-join semantics).
        smallBc.value.getOrElse(a, Array.empty[C]).iterator.map(c => (a, (b, c)))
      }
    } else {
      // The "small" dataset is too big for a broadcast - do a regular reduce-side join using the RDD API
      big.join(small)
    }
  // Because RDDs are lazy, unpersisting here means the small relation may be recomputed
  // from its lineage if "joined" has not been materialized yet.
  small.unpersist(blocking = false)
  joined
}
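
For illustration, here is a minimal usage sketch that is not part of the original gist: the RDD contents, the maxNumRows threshold of 1000, and the assumption that this code runs inside the same class that defines optimizedInnerJoin and holds the SparkContext "sc" are all hypothetical. It also shows that duplicate keys in the small relation are preserved, which is exactly what the collectAsMap() approach would get wrong.

// Hypothetical usage - assumes optimizedInnerJoin and "sc" are visible in this scope.
val big: RDD[(Int, String)]   = sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c")))
val small: RDD[(Int, String)] = sc.parallelize(Seq((1, "x"), (1, "y"), (2, "z"))) // duplicate key 1

// The small relation has 3 rows, well under the threshold, so a broadcast (map-side) join is used.
val joined: RDD[(Int, (String, String))] = optimizedInnerJoin(big, small, maxNumRows = 1000L)

// Prints (1,(a,x)), (1,(a,y)), (2,(b,z)) in some order; key 3 has no match and is dropped.
joined.collect().foreach(println)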