Skip to content

Instantly share code, notes, and snippets.

@RobColeman
Created August 24, 2016 05:35
Show Gist options
  • Save RobColeman/1dcc8a9bd27317b89fdde2c22c049100 to your computer and use it in GitHub Desktop.
Save RobColeman/1dcc8a9bd27317b89fdde2c22c049100 to your computer and use it in GitHub Desktop.
package com.chartboost.adrel.preprocessing.featureComputers
import com.chartboost.adrel.dataModels.EcpmPredictionRequest
import com.chartboost.adrel.util.JsonSaving
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors => SparkVectors}
import scala.util.hashing.MurmurHash3
/**
  * Metadata describing where one feature's block lives inside the overall feature vector.
  * Kept as a flat case class so it can be saved and loaded alongside models.
  *
  * @param featureName     the name of the feature
  * @param blockNumber     ordinal position of this block within the vector
  * @param offset          vector index at which this block starts
  * @param length          number of indices in this block; the block therefore ends at vector
  *                        index (offset + length)
  * @param totalVectorSize total size of the feature vector (possibly redundant, may be removed).
  *                        Defaults to -1 -- presumably a "not set" marker; confirm with callers.
  */
case class FeatureComputerParams(
  featureName: String,
  blockNumber: Int,
  offset: Int,
  length: Int,
  totalVectorSize: Int = -1
) extends FeatureBlock
  with JsonSaving
/**
  * A contiguous range of indices within the feature vector.
  */
abstract class FeatureBlock {

  /** Ordinal position of this block within the vector. */
  val blockNumber: Int

  /** Vector index at which this block starts. */
  val offset: Int

  /** Number of indices in this block; the block ends at vector index (offset + length). */
  val length: Int

  /** Total size of the feature vector (possibly redundant, may be removed). */
  val totalVectorSize: Int
}
/**
  * Selects a feature computer by name and applies it, and converts finished feature maps into
  * Spark vectors.
  */
object FeatureComputerRouter {

  /**
    * Apply the feature computer selected by `computerParams.featureName`.
    *
    * @param predictionRequest the request and campaign data from which to compute the feature
    * @param computedFeatures  features computed so far, keyed by vector index; may be empty
    * @param computerParams    metadata naming the feature and locating its block in the vector
    * @return `computedFeatures` with the selected feature added, or `computedFeatures` unchanged
    *         when the feature name is not recognized
    */
  def apply(
    predictionRequest: EcpmPredictionRequest,
    computedFeatures: Map[Int, Double],
    computerParams: FeatureComputerParams
  ): Map[Int, Double] = {
    computerParams.featureName match {
      case "PubAppFourthDegreeHashedFeature" =>
        PubAppFourthDegreeHashedFeature.compute(predictionRequest, computedFeatures, computerParams)
      case "ModelFourthDegreeHashedFeature" =>
        ModelFourthDegreeHashedFeature.compute(predictionRequest, computedFeatures, computerParams)
      case "ReachabilityFourthDegreeHashedFeature" =>
        ReachabilityFourthDegreeHashedFeature.compute(predictionRequest, computedFeatures, computerParams)
      case "CountryFourthDegreeHashedFeature" =>
        CountryFourthDegreeHashedFeature.compute(predictionRequest, computedFeatures, computerParams)
      // Unrecognized feature names are skipped rather than treated as an error.
      case _ => computedFeatures
    }
  }

  /**
    * Convert an index -> value feature map into a sparse Spark vector.
    *
    * @param computedFeatures the computed features, keyed by vector index
    * @param numFeatures      the size of the resulting vector
    * @return a sparse vector of size `numFeatures` holding the map's entries
    */
  def toSparkVector(computedFeatures: Map[Int, Double], numFeatures: Int): SparkVector = {
    // A Map iterates in no particular index order, while the (size, indices, values) overload is
    // documented to require strictly increasing indices. The (size, elements) overload accepts
    // unordered (index, value) pairs, and passing pairs keeps each index with its value.
    SparkVectors.sparse(numFeatures, computedFeatures.toSeq)
  }
}
/**
  * Base class for feature computers: each one turns a prediction request into an (index, value)
  * pair and merges it into a map of already-computed features.
  */
abstract class FeatureComputer {

  /** Kind of value this feature produces, e.g. categorical, count, real-valued. */
  val valueType: String // use enumeration

  /** The feature name. */
  val name: String

  /**
    * Compute this feature and merge it into the supplied feature map.
    *
    * @param predictionRequest the request and campaign data from which to compute the feature
    * @param computedFeatures  features computed before this one; may be an empty map
    * @param computerParams    metadata locating this feature's block in the vector
    * @return the feature map with this feature merged in
    */
  def compute(
    predictionRequest: EcpmPredictionRequest,
    computedFeatures: Map[Int, Double],
    computerParams: FeatureComputerParams
  ): Map[Int, Double] =
    insertIntoFeatureMap(
      computeIdx(predictionRequest, computerParams),
      computeValue(predictionRequest, computerParams),
      computedFeatures,
      computerParams
    )

  /**
    * Determine the index, relative to the start of this feature's block, where the feature goes.
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block
    * @return the feature index within this feature block
    */
  def computeIdx(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): Int

  /**
    * Determine the feature's value from the request and campaign data.
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block
    * @return the value to insert into, or add to, the feature vector at the computed index
    */
  def computeValue(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): Double

  /**
    * Merge a block-relative (index, value) pair into the feature map.
    *
    * @param idx              the feature index within this feature block
    * @param value            the computed feature value
    * @param computedFeatures features computed before this one; may be an empty map
    * @param computerParams   supplies the block offset used to shift `idx`
    * @return the feature map with `value` added to whatever was already stored at the shifted index
    */
  protected def insertIntoFeatureMap(
    idx: Int,
    value: Double,
    computedFeatures: Map[Int, Double],
    computerParams: FeatureComputerParams
  ): Map[Int, Double] = {
    // Shift the block-relative index by the block's offset.
    val slot = computerParams.offset + idx
    // Values landing on the same slot accumulate rather than overwrite.
    val previous = computedFeatures.getOrElse(slot, 0.0)
    computedFeatures.updated(slot, previous + value)
  }
}
/**
  * Feature computers that choose the index within their block by hashing a generated feature name.
  */
abstract class HashingFeatureComputer extends FeatureComputer {

  /**
    * Build the feature name that is hashed to obtain the index.
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block
    * @return the feature name to hash
    */
  def genFeatureName(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): String

  /**
    * Hash the generated feature name with MurmurHash3 and reduce it modulo the block length.
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block; its `length` is the modulus
    * @return the feature index within this feature block
    */
  def computeIdx(predictionRequest: EcpmPredictionRequest, computerParams: FeatureComputerParams): Int = {
    val hashed = MurmurHash3.stringHash(genFeatureName(predictionRequest, computerParams))
    val blockLength = computerParams.length
    // `%` keeps the sign of the hash, so a negative remainder is shifted up by the block length.
    hashed % blockLength match {
      case negative if negative < 0 => negative + blockLength
      case inRange => inRange
    }
  }
}
/**
  * Hashing feature computer whose value is always 1.0: the hashed index marks category
  * membership, and categories that hash to the same index share it.
  */
abstract class OneHotHashingFeatureComputer extends HashingFeatureComputer {

  /**
    * Membership in a category is represented by a 1.0 at the index assigned to that category.
    *
    * @param predictionRequest the request and campaign data (not consulted)
    * @param computerParams    metadata for this feature's block (not consulted)
    * @return always 1.0
    */
  override def computeValue(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): Double = 1.0
}
/**
  * One-hot hashed feature whose hashed name combines four parts: the advertiser campaign id,
  * the advertiser condition set, the ad type, and one subclass-specific category.
  */
abstract class CategoricalFourthDegreeHashedFeature extends OneHotHashingFeatureComputer {

  /**
    * The feature name to hash; delegates to [[genFourthDegreeFeatureName]].
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block
    * @return the four-part feature name
    */
  override def genFeatureName(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String =
    genFourthDegreeFeatureName(predictionRequest, computerParams)

  /**
    * The subclass-specific category that forms the fourth part of the feature name.
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block
    * @return the category as a string
    */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String

  /**
    * Join campaign id, condition set, ad type and the subclass category with '#'.
    *
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block
    * @return the '#'-separated four-part feature name
    */
  def genFourthDegreeFeatureName(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    val campaign = predictionRequest.advCampaignId
    val conditionSet = predictionRequest.advConditionSet
    val adType = predictionRequest.adType
    val category = retrieveThisCategory(predictionRequest, computerParams)
    s"$campaign#$conditionSet#$adType#$category"
  }
}
/**
  * Fourth-degree hashed categorical feature whose category is the request's publisher app.
  */
object PubAppFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  override val valueType: String = "Categorical"

  override val name: String = "PubAppFourthDegreeHashedFeature"

  /**
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block (not consulted)
    * @return the request's `publisherApp`
    */
  override def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    predictionRequest.publisherApp
  }
}
/**
  * Fourth-degree hashed categorical feature whose category is the request's `model` field.
  */
object ModelFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  override val valueType: String = "Categorical"

  override val name: String = "ModelFourthDegreeHashedFeature"

  /**
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block (not consulted)
    * @return the request's `model`
    */
  override def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    predictionRequest.model
  }
}
/**
  * Fourth-degree hashed categorical feature whose category is the request's reachability,
  * rendered as a string.
  */
object ReachabilityFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  override val valueType: String = "Categorical"

  override val name: String = "ReachabilityFourthDegreeHashedFeature"

  /**
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block (not consulted)
    * @return the request's `reachability` converted with `toString`
    */
  override def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = {
    predictionRequest.reachability.toString
  }
}
/**
  * Fourth-degree hashed categorical feature whose category is the request's country.
  */
object CountryFourthDegreeHashedFeature extends CategoricalFourthDegreeHashedFeature {

  val valueType: String = "Categorical"

  // The name must identify this computer uniquely; it matches the object's own name.
  val name: String = "CountryFourthDegreeHashedFeature"

  /**
    * @param predictionRequest the request and campaign data
    * @param computerParams    metadata for this feature's block (not consulted)
    * @return the request's `country`
    */
  def retrieveThisCategory(
    predictionRequest: EcpmPredictionRequest,
    computerParams: FeatureComputerParams
  ): String = predictionRequest.country
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment