srnghn

## ANOVA_Spark_2.0.py
from pyspark.sql.functions import *

# Implementation of ANOVA function: calculates the degrees of freedom, F-value, eta squared and omega squared values.
# Expects 'categoryData' to have two columns: the first is the categorical independent variable and the second is the scale dependent variable.

def getAnovaStats(categoryData) :
    """Prepare per-row squared terms for an ANOVA over ``categoryData``.

    The input's columns are renamed to ``cat`` and ``value``, the result is
    registered as the temporary view ``df``, and a SQL query derives two extra
    columns: ``valueSq`` (the value squared) and ``diffSq`` (the squared
    difference between the value and the average value of its category).

    :param categoryData: object exposing ``toDF``; presumably a two-column
        Spark DataFrame (category, numeric value) -- TODO confirm with callers.

    NOTE(review): only the opening statements of this function are visible
    here; no ``return`` and no computation of the statistics themselves is
    shown, so the final result cannot be documented from this excerpt.
    """
    # Rename the columns so the SQL below can use fixed column names.
    cat_val = categoryData.toDF("cat","value")
    # Make the renamed data queryable from SQL under the view name "df".
    cat_val.createOrReplaceTempView("df")
    # Join every row (A) with the per-category average (subquery B) to compute
    # the squared value and the squared deviation from the category average.
    # NOTE(review): `spark` is not defined in this block; presumably a
    # SparkSession provided by the enclosing scope -- confirm.
    newdf = spark.sql("select A.cat, A.value, cast((A.value * A.value) as double) as valueSq, ((A.value - B.avg) * (A.value - B.avg)) as diffSq from df A join (select cat, avg(value) as avg from df group by cat) B where A.cat = B.cat")
    # Group the derived rows by category; the aggregation that uses this
    # grouping is not visible in this excerpt.
    grouped = newdf.groupBy("cat")

## Pearsons_R_Correlation_Spark_2.0.scala
/**
 * Named pair of values handed to the Pearson's R function, so that its
 * columns can be referred to as `var1` and `var2`.
 */
final case class ScaleTuple(
  var1: Double,
  var2: Double
)

// Ordered column names applied when converting data to ScaleTuple.
val colnames: Seq[String] = Seq("var1", "var2")

/**
 * Implementation of Pearson's R function: calculates r, the measurement of linear dependence between two variables
 * Utilizes DataSet's 'agg' function
 **/

## ANOVA_Spark_2.0.scala
/**
 * Row type passed to the ANOVA function, so that its columns can be referred
 * to by the specific names `cat` and `value`.
 */
final case class CatTuple(
  cat: String,
  value: Double
)

/**
 * Result type returned from the ANOVA function, so that its outputs can be
 * selected and referred to by name.
 *
 * NOTE(review): field names suggest degrees of freedom (`dfb`, `dfw`), the
 * F-value, eta squared and omega squared -- confirm against the ANOVA
 * implementation, which is not visible here.
 */
final case class ANOVAStats(
  dfb: Long,
  dfw: Double,
  F_value: Double,
  etaSq: Double,
  omegaSq: Double
)

// Ordered column names applied when converting data to CatTuple.
val colnames: Seq[String] = Seq("cat", "value")
	from pyspark.sql.functions import *

	# Implementation of ANOVA function: calculates the degrees of freedom, F-value, eta squared and omega squared values.
	# Expects 'categoryData' to have two columns: the first is the categorical independent variable and the second is the scale dependent variable.

	def getAnovaStats(categoryData):
		"""Prepare per-row squared terms for an ANOVA over ``categoryData``.

		The input's columns are renamed to ``cat`` and ``value``, the result is
		registered as the temporary view ``df``, and a SQL query derives two
		extra columns: ``valueSq`` (the value squared) and ``diffSq`` (the
		squared difference between the value and its category's average).

		:param categoryData: object exposing ``toDF``; presumably a two-column
			Spark DataFrame (category, numeric value) -- TODO confirm.

		NOTE(review): only the opening statements of this function are visible
		here; no ``return`` is shown, so the final result is not documented.
		"""
		# Rename the columns so the SQL below can use fixed column names.
		cat_val = categoryData.toDF("cat","value")
		# Make the renamed data queryable from SQL under the view name "df".
		cat_val.createOrReplaceTempView("df")
		# Join every row (A) with the per-category average (subquery B).
		# NOTE(review): `spark` is not defined in this block; presumably a
		# SparkSession provided by the enclosing scope -- confirm.
		newdf = spark.sql("select A.cat, A.value, cast((A.value * A.value) as double) as valueSq, ((A.value - B.avg) * (A.value - B.avg)) as diffSq from df A join (select cat, avg(value) as avg from df group by cat) B where A.cat = B.cat")
		# Group the derived rows by category for the aggregation that follows.
		grouped = newdf.groupBy("cat")
	/**
	 * Named pair of values handed to the Pearson's R function, so that its
	 * columns can be referred to as `var1` and `var2`.
	 */
	final case class ScaleTuple(
		var1: Double,
		var2: Double
	)

	// Ordered column names applied when converting data to ScaleTuple.
	val colnames: Seq[String] = Seq("var1", "var2")

	/**
	* Implementation of Pearson's R function: calculates r, the measurement of linear dependence between two variables
	* Utilizes DataSet's 'agg' function
	**/
	/**
	 * Row type passed to the ANOVA function, so that its columns can be
	 * referred to by the specific names `cat` and `value`.
	 */
	final case class CatTuple(
		cat: String,
		value: Double
	)

	/**
	 * Result type returned from the ANOVA function, so that its outputs can be
	 * selected and referred to by name.
	 *
	 * NOTE(review): field names suggest degrees of freedom (`dfb`, `dfw`), the
	 * F-value, eta squared and omega squared -- confirm against the ANOVA
	 * implementation, which is not visible here.
	 */
	final case class ANOVAStats(
		dfb: Long,
		dfw: Double,
		F_value: Double,
		etaSq: Double,
		omegaSq: Double
	)

	// Ordered column names applied when converting data to CatTuple.
	val colnames: Seq[String] = Seq("cat", "value")