
@rweeks
Created October 11, 2016 02:10
Not-quite-pseudocode-not-quite-code showing how to go from a Transpose table to a Data table in Spark
package variantspark

import scala.collection.JavaConverters._
import com.google.common.primitives.UnsignedBytes
import org.apache.accumulo.core.client.{IteratorSetting, ZooKeeperInstance}
import org.apache.accumulo.core.client.mapred.AccumuloInputFormat
import org.apache.accumulo.core.client.mapreduce.impl.DelegationTokenStub
import org.apache.accumulo.core.client.mapreduce.lib.impl.ConfiguratorBase._
import org.apache.accumulo.core.client.mapreduce.lib.impl.InputConfigurator
import org.apache.accumulo.core.data.{Key, Value, Range => ARange}
import org.apache.hadoop.conf.{Configuration => HadoopConfiguration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.security.{Credentials, UserGroupInformation}
import org.apache.spark.{HashPartitioner, SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
// Serializable so the partially applied dereference function can be shipped to executors by mapPartitions.
class VariantDereferencer extends Serializable {

  /**
   * This function is meant to be passed to mapPartitions, where the partition that we're iterating over consists of
   * key/value pairs from the Variant transpose table. The input row id references are mapped to rows from the
   * Variant table, encoded using the WholeRowIterator.
   */
  def dereferenceVariantIds(batchSize: Int, numDereferenceThreads: Int,
                            wConf: SerializableWritable[HadoopConfiguration],
                            wCreds: SerializableWritable[Credentials])
                           (eIdPart: Iterator[(Array[Byte], Key)]): Iterator[(Key, Value)] = {
    val (conf, creds) = (wConf.value, wCreds.value)
    // Rebuild an Accumulo connection on the executor from the serialized job configuration and credentials.
    val principal = getPrincipal(classOf[AccumuloInputFormat], conf)
    val delTokenStub = getAuthenticationToken(classOf[AccumuloInputFormat], conf)
    UserGroupInformation.getCurrentUser.addCredentials(creds)
    val clientConfig = getClientConfiguration(classOf[AccumuloInputFormat], conf)
    val scanAuths = InputConfigurator.getScanAuthorizations(classOf[AccumuloInputFormat], conf)
    val delToken = unwrapAuthenticationToken(Job.getInstance(conf), delTokenStub)
    val conn = new ZooKeeperInstance(clientConfig).getConnector(principal, delToken)
    // VariantTable is the Variant (data) table name, a constant defined elsewhere in the codebase.
    val scanner = conn.createBatchScanner(VariantTable, scanAuths, numDereferenceThreads)
    scanner.addScanIterator(new IteratorSetting(
      40, "WholeRowIterator", classOf[org.apache.accumulo.core.iterators.user.WholeRowIterator])
    )
    // Reuse one BatchScanner for the whole partition, resetting its ranges once per batch of row ids.
    val kvIterIter = for (keyGroup <- eIdPart.grouped(batchSize)) yield {
      scanner.setRanges(keyGroup.map(ek => ARange.exact(new Text(ek._1))).asJava)
      for (kvPair <- scanner.asScala) yield (kvPair.getKey, kvPair.getValue)
    }
    kvIterIter.flatten
  }
  /**
   * This is meant to be called from a Spark driver. It applies a query to the Variant transpose table and
   * dereferences the results to the Variant table. A short usage sketch follows the class.
   */
  def findEntities(sc: SparkContext,
                   query: String,
                   batchSize: Int = 1000,
                   numDereferenceThreads: Int = 2): RDD[(Key, Value)] = {
    // jobBuilder, VariantTransposeTable, AttrQueryParser and extractVariantId come from the surrounding codebase.
    val job = jobBuilder.prepareJob()
      .forTable(VariantTransposeTable)
      .forRanges(new AttrQueryParser().newRangeFromQuery(query).toList)
    val hadoopJob = job.create()
    val jc = hadoopJob.getConfiguration
    val creds = hadoopJob.getCredentials
    // Sort the extracted Variant row ids byte-lexicographically so each partition scans them in order.
    implicit val arrayOrdering: Ordering[Array[Byte]] =
      Ordering.comparatorToOrdering(UnsignedBytes.lexicographicalComparator())
    val variantRows = job.createRDD(sc)
      .map(x => (extractVariantId(x._1), x._1))
      .repartitionAndSortWithinPartitions(new HashPartitioner(sc.defaultParallelism))
      .mapPartitions(dereferenceVariantIds(
        batchSize, numDereferenceThreads,
        new SerializableWritable(jc),
        new SerializableWritable(creds)
      ))
    variantRows
  }
}
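
For reference, here is a minimal driver-side sketch of how the class above might be used, assuming an existing SparkContext and the same variantspark package. VariantQueryDriver and the query string are purely illustrative (the real query syntax is whatever AttrQueryParser accepts) and are not part of the original code. The sketch also shows how the WholeRowIterator-encoded results can be unpacked with WholeRowIterator.decodeRow.

package variantspark

import scala.collection.JavaConverters._

import org.apache.accumulo.core.iterators.user.WholeRowIterator
import org.apache.spark.SparkContext

object VariantQueryDriver {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext() // settings supplied by spark-submit in this sketch

    // Query the transpose table and dereference the hits into Variant-table rows.
    val variantRows = new VariantDereferencer().findEntities(sc, query = "chrom = 1 AND pos < 1000000")

    // Each (Key, Value) pair is a whole Variant-table row packed by the WholeRowIterator;
    // decodeRow unpacks it back into the row's individual cells.
    val cellCounts = variantRows.map { case (rowKey, packedValue) =>
      val cells = WholeRowIterator.decodeRow(rowKey, packedValue).asScala
      (rowKey.getRow.toString, cells.size)
    }

    cellCounts.take(10).foreach { case (rowId, n) => println(s"$rowId -> $n cells") }
    sc.stop()
  }
}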