# monocongo/compute_correlation_matrix.py

## compute_correlation_matrix.py
from pyspark.mllib.stat import Statistics
import pandas as pd

# The returned pandas DataFrame can be passed to seaborn's heatmap.
def compute_correlation_matrix(df, method='pearson'):
    """Build a labelled correlation matrix for every column of ``df``.

    Thin wrapper around ``pyspark.mllib.stat.Statistics.corr``, based on
    https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html

    Args:
        df: Object exposing ``.rdd`` and ``.columns`` (a Spark DataFrame).
            NOTE(review): presumably every column must be numeric for
            MLlib to accept the rows -- confirm against callers.
        method: Correlation method name, forwarded unchanged to
            ``Statistics.corr``. Defaults to ``'pearson'``.

    Returns:
        A pandas DataFrame whose row index and column labels are both
        ``df.columns`` and whose values are the matrix returned by
        ``Statistics.corr``.
    """
    # Slice every row from index 0 so each record becomes a sequence of
    # all its field values before it is handed to MLlib.
    value_rows = df.rdd.map(lambda record: record[0:])
    coefficients = Statistics.corr(value_rows, method=method)

    # Label both axes with the source column names.
    return pd.DataFrame(coefficients, columns=df.columns, index=df.columns)
	from pyspark.mllib.stat import Statistics
	import pandas as pd

	# The returned pandas DataFrame can be passed to seaborn's heatmap.
	def compute_correlation_matrix(df, method='pearson'):
		"""Return the column-by-column correlation matrix of ``df``.

		Wrapper around ``pyspark.mllib.stat.Statistics.corr``, following
		https://forums.databricks.com/questions/3092/how-to-calculate-correlation-matrix-with-all-colum.html

		Args:
			df: Object exposing ``.rdd`` and ``.columns`` (a Spark DataFrame).
				NOTE(review): presumably all columns must be numeric --
				confirm against callers.
			method: Correlation method name passed straight through to
				``Statistics.corr``. Defaults to ``'pearson'``.

		Returns:
			A pandas DataFrame indexed and labelled by ``df.columns`` that
			holds the matrix produced by ``Statistics.corr``.
		"""
		# Slice each row from index 0 so MLlib receives all field values
		# of the row as one sequence.
		df_rdd = df.rdd.map(lambda row: row[0:])
		corr_mat = Statistics.corr(df_rdd, method=method)
		# Use the source column names on both axes of the result.
		corr_mat_df = pd.DataFrame(
			corr_mat,
			columns=df.columns,
			index=df.columns,
		)
		return corr_mat_df