### REGRESSION EXAMPLE ###
f = function(x) {
  0.1 * cos(x)^2 + 0.25 * sin(x)^3
}
regressionData = data.frame(rnorm(5000, sd = 5))
regressionData[, 2] = f(regressionData[, 1]) + rnorm(5000, sd = 0.1)
names(regressionData) = c("x", "y")
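# The plot below is a hedged sketch, not part of the original gist: it simply
# visualizes the simulated data so the signal-plus-noise structure of
# y = f(x) + eps is visible.
library(ggplot2)
ggplot(regressionData, aes(x = x, y = y)) +
  geom_point(alpha = 0.2) +
  geom_smooth()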
### BOOTSTRAP FUNCTIONS ###
# Install required packages if necessary.
# install.packages(c("ggplot2", "MCMCpack", "rootSolve"))
# Load required packages.
library(ggplot2)
# General function for computing bootstrap samples for a numerical statistic.
# Input: input data, number of bootstrap samples B, bootstrap sample size n,
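# The function body is not shown in this snippet; a minimal sketch follows,
# assuming it draws B resamples of size n with replacement and applies a
# user-supplied statistic to each. The name bootstrapStatistic and the
# statistic argument are assumptions, not taken from the original gist.
bootstrapStatistic = function(data, B, n, statistic) {
  sapply(1:B, function(b) statistic(sample(data, size = n, replace = TRUE)))
}
# Example usage: bootstrap distribution of the sample mean.
# bootstrapStatistic(rnorm(100), B = 1000, n = 100, statistic = mean)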
### QUANTILE FUNCTIONS ###
# Load required packages.
library(MCMCpack)
library(rootSolve)
# Define desired quantile.
p = 0.9
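# The quantile functions themselves are not shown here; the sketch below is an
# assumption about what one such function might look like: an inverse-ECDF
# (type 1) empirical quantile. The name empiricalQuantile is not from the
# original gist.
empiricalQuantile = function(data, p) {
  sorted = sort(data)
  sorted[ceiling(p * length(sorted))]
}
# Example usage:
# empiricalQuantile(rnorm(1000), p)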
### EMPIRICAL QUANTILE ESTIMATION ###
# 1. Load packages. Install any packages if necessary.
library(ggplot2)
library(MCMCpack)
library(rootSolve)
# 2. Extract data.
cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1]
pcv = cv[which(cv > 0)]
pcvDF = data.frame(pcv)
names(pcvDF) = c("pcv")
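# 3. (Sketch only; this step is not part of the original snippet.) Estimate the
#    empirical p-quantile of positive customer value (p = 0.9 as above) and
#    plot the distribution with the estimate marked.
pcvQuantile = quantile(pcv, probs = 0.9)
ggplot(pcvDF, aes(x = pcv)) +
  geom_histogram(bins = 50) +
  geom_vline(xintercept = pcvQuantile)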
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.clustering.DistributedLDAModel
import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions
import org.apache.spark.sql.Row
import org.apache.spark.sql.UserDefinedFunction

class LogRegLDAModel (params : LogRegLDAParams) extends ClusteringModel {
  def transform(data : Data, featuresCol : String, clusterCol : String) : DataFrame = {
    // Define LR model.
    val labels = Array("conversion", "churnThreeMonths", "churnSixMonths", "churnNineMonths")
    val lr : LogisticRegression = new LogisticRegression()
      .setFeaturesCol(featuresCol)
      .setRegParam(params.regParam)

package conversions
import scala.math.max
import scala.math.min
import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.UserDefinedFunction
private def getScores(doc: String): Array[Double] = {
  // Helper function used to normalize probability scores.
  // Returns an object of type Array[Double].
  // Vectorize query.
  val x: Vector = pd.transform(doc)
  val z = scoreArray
    .map(e => innerProduct(e._2, x.toArray) + e._1)
class StratifiedSplits (data : DataFrame, labelCol : String, numSplits : Int) extends Serializable {
  private val labels : Seq[Double] = data.select(labelCol).distinct.map(row => row.getDouble(0)).collect
  private val dataFrames : Seq[Array[DataFrame]] = labels.map(label => {
    val newData = data.filter(data(labelCol) === label)
    val splits : Array[Double] = (0 until numSplits).map(k => 1 / numSplits.toDouble).toArray
    newData.randomSplit(splits)