Skip to content

Instantly share code, notes, and snippets.

View shaheeng's full-sized avatar

Shaheen shaheeng

View GitHub Profile
@shaheeng
shaheeng / pyspark_dataprofile
Created July 16, 2019 19:10
Pyspark utility function for profiling data
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col
def dataprofile(data_all_df,data_cols):
data_df = data_all_df.select(data_cols)
columns2Bprofiled = data_df.columns
global schema_name, table_name
if not 'schema_name' in globals():
schema_name = 'schema_name'
# Purpose: Build classification models to predict wine quality
# Use three different classification algorithms and compare their accuracies
# Author : Shaheen Gauher - Data Scientist at Microsoft
# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105
##download data from
#https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
################################################################################
## Decision forest modeling
################################################################################
#Decision Forest
#using rxDForest() to build ML model
DForest_model <- rxDForest(formula = formula,
data = trainingdata,
seed = 10,
cp = 0.01,
nTree = 50,
#=======================================================
# Compute the accuracy of the trained models and how they perform on the test data
#=======================================================
#Function to compute accuracy of the trained model on the given data
computeaccuracy <- function(ML_model,scoredata){
if(file.exists("modelout_xdf.xdf") ) { file.remove("modelout_xdf.xdf") }
modelout_xdf = RxXdfData("modelout_xdf.xdf") #initialise xdf object
rxPredict(ML_model, data = scoredata, outData = modelout_xdf, overwrite = TRUE,
writeModelVars = TRUE, reportProgress = 0)
# Collect the names of the feature columns: every column of data_classi
# except the label column ('LabelsCol') and the split marker ('splitcol').
allfeatures <- setdiff(names(data_classi), c('LabelsCol', 'splitcol'))

# Build the modelling formula: LabelsCol ~ feature1 + feature2 + ...
formula <- as.formula(
  paste('LabelsCol', paste(allfeatures, collapse = ' + '), sep = ' ~ ')
)
formula # displays the formula when the script is run at top level

# Use rxDataStep() to add a factor column 'splitcol' used for splitting.
# Each row gets a Bernoulli draw with success probability 0.8; the draw is
# turned into a factor labelled 'test' (0) / 'train' (1).
rxDataStep(
  inData = data_classi,
  outFile = data_classi,
  transforms = list(splitcol = factor(rbinom(.rxNumRows, 1, 0.8), labels = c('test', 'train'))),
  overwrite = TRUE
)

# Split on the column "splitcol".
# rxSplit() splits an input '.xdf' file or data frame into multiple '.xdf'
# files or a list of data frames.
# NOTE(review): the suffixes are given as c("Train", "Test") while the factor
# labels are ordered c('test', 'train'); if suffixes are matched to levels by
# position the output file names may be swapped -- confirm against rxSplit docs.
listofxdfs <- rxSplit(
  data_classi,
  outFileBase = 'data_classi_split',
  outFileSuffixes = c("Train", "Test"),
  splitByFactor = "splitcol",
  overwrite = TRUE
)

# Element 2 is used as the training set and element 1 as the test set;
# presumably rxSplit returns the pieces in factor-level order -- TODO confirm.
trainingdata <- listofxdfs[[2]]
testdata <- listofxdfs[[1]]
# Make the label categorical: rxFactors() derives a new factor column
# 'factorQuality' from the existing column 'quality', writing the result
# back to data_classi.
rxFactors(
  inData = data_classi,
  outFile = data_classi,
  overwrite = TRUE,
  factorInfo = list(factorQuality = list(varName = "quality")),
  reportProgress = 0
)

# The original 'quality' column can be dropped now: keep every other column
# and write the reduced data set to 'data_classi_temp.xdf'.
ColsToKeep <- setdiff(names(data_classi), 'quality')
data_classi <- rxDataStep(
  inData = data_classi,
  outFile = 'data_classi_temp.xdf',
  varsToKeep = ColsToKeep,
  overwrite = TRUE
)
#rename the label col 'factorQuality' as 'LabelsCol'
##download data from
#https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
# Read the red-wine quality data directly from the UCI repository.
# The file has a header row and uses ';' as the field separator; the literal
# string "NA" is treated as a missing value.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data_wine <- read.table(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
  header = TRUE,
  sep = ";",
  na.strings = "NA"
) # 1599 12 (original author's note; presumably rows x columns)
class(data_wine) # "data.frame"
#Relabel quality ratings as follows
#3,4,5 as Low
#6 as Med
#7,8 as High
# Purpose: Evaluate your classification model against some baseline metrics
# Establish that your model is not just making lucky guesses but is significantly better than a random model
# Author : Shaheen Gauher - Data Scientist at Microsoft
# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105
#download data from
#https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data