Skip to content

Instantly share code, notes, and snippets.

/**
 * Collects the distinct, non-null string values of a single column.
 *
 * @param df  input DataFrame
 * @param col name of the (string-typed) column to read
 * @return array of unique non-null values found in `col`
 */
def getUniqueString(
df : DataFrame,
col : String
) : Array[String] = {
  // Drop rows where the selected column is null, then deduplicate before collecting.
  val nonNull = df.select(col).na.drop
  val unique = nonNull.dropDuplicates
  unique.map(row => row.getString(0)).collect
}
// Produces `numSplits` stratified random splits of `data`: rows are grouped by the
// value in `labelCol` and each group is split with equal weights, so every split
// preserves the original class proportions.
// NOTE(review): this fragment is truncated — the lambda opened for `dataFrames`
// is never closed in the visible source; the rest of the class is not shown.
class StratifiedSplits (data : DataFrame, labelCol : String, numSplits : Int) extends Serializable {
// Distinct label values in `labelCol` (assumes the column holds Doubles — getDouble(0)).
private val labels : Seq[Double] = data.select(labelCol).distinct.map(row => row.getDouble(0)).collect
// For each label, randomly partition that label's rows into `numSplits` equal fractions.
private val dataFrames : Seq[Array[DataFrame]] = labels.map(label => {
val newData = data.filter(data(labelCol) === label)
// Equal weights: each of the numSplits partitions gets weight 1/numSplits.
val splits : Array[Double] = (0 until numSplits).map(k => 1 / numSplits.toDouble).toArray
newData.randomSplit(splits)
// Computes raw per-class scores for a document: the document is vectorized and each
// score is an intercept plus the inner product of the class weights with the vector.
// NOTE(review): fragment is truncated — the normalization step implied by the comment
// below and the closing brace are not visible here.
private def getScores(doc: String): Array[Double] = {
// Helper function used to normalize probability scores.
// Returns an object of type Array[Double]
// Vectorize query,
val x: Vector = pd.transform(doc)
// scoreArray presumably holds (intercept, weightVector) pairs per class — TODO confirm.
val z = scoreArray
.map(e => innerProduct(e._2, x.toArray) + e._1)
# 1. Load packages. Install any packages if necessary.
library(ggplot2)
library(MCMCpack)
library(rootSolve)
# 2. Extract data.
# Read the first column of the customer-value CSV (file has no header row).
cv = read.csv("~/customerValue.csv", header = F)[, 1]
# Keep only strictly positive customer values.
pcv = cv[which(cv > 0)]
# Wrap the positive values in a one-column data frame named "pcv"
# (convenient for ggplot2 / modeling functions that expect a data frame).
pcvDF = data.frame(pcv)
names(pcvDF) = c("pcv")
package conversions
import scala.math.max
import scala.math.min
import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.UserDefinedFunction
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.clustering.DistributedLDAModel
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions
import org.apache.spark.sql.Row
import org.apache.spark.sql.UserDefinedFunction
// Clustering model combining LDA-derived features with logistic regression.
// NOTE(review): this fragment is truncated — the transform body and closing braces
// are not visible here.
class LogRegLDAModel (params : LogRegLDAParams) extends ClusteringModel {
// Scores `data` using logistic regression over `featuresCol`; presumably writes
// assignments into `clusterCol` — TODO confirm from the missing method body.
def transform(data : Data, featuresCol : String, clusterCol : String) : DataFrame = {
// Define LR Model.
// Outcome columns this model targets — verify against callers.
val labels = Array("conversion", "churnThreeMonths", "churnSixMonths", "churnNineMonths")
val lr : LogisticRegression = new LogisticRegression()
.setFeaturesCol(featuresCol)
.setRegParam(params.regParam)
# 1. Load packages. Install any packages if necessary.
library(ggplot2)
library(MCMCpack)
library(rootSolve)
# 2. Extract data.
# Read the first column of the customer-value CSV (no header row).
# NOTE(review): <FILE_PATH> is a placeholder — substitute the real directory before running.
cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1]
# Keep only strictly positive customer values.
pcv = cv[which(cv > 0)]
# One-column data frame named "pcv" for downstream plotting/modeling.
pcvDF = data.frame(pcv)
names(pcvDF) = c("pcv")
### BOOTSTRAP FUNCTIONS ###
# Install required packages if necessary.
#install.packages(c("MCMCpack", "rootSolve"))
# Load required packages.
library(ggplot2)
# General function for computing bootstrap samples for a numerical statistic.
# Input: input data, number of bootstrap samples B, bootstrap sample size n,
### Regression Example
# True regression function: 0.1 * cos(x)^2 + 0.25 * sin(x)^3.
f = function(x) {
  cos_term <- 0.1 * cos(x) ^ 2
  sin_term <- 0.25 * sin(x) ^ 3
  cos_term + sin_term
}
# Simulate 5000 (x, y) pairs from y = f(x) + Gaussian noise.
# BUG FIX: previously `sd = 5` was passed to data.frame() — creating a constant
# column literally named `sd` — instead of to rnorm(), so x was drawn with the
# default sd = 1. Pass it to rnorm() so x has the intended spread.
regressionData = data.frame(x = rnorm(5000, sd = 5))
# Noisy response: true curve f(x) plus noise with sd = 0.1.
regressionData[, 2] = f(regressionData[, 1]) + rnorm(5000, sd = 0.1)
names(regressionData) = c("x", "y")