# ajdamico/machine learning foundations

## machine learning foundations


# support vector machines #
# Fit a linear-kernel SVM that predicts rounded earthquake magnitude from the
# other columns of quakes, then measure how often the fitted model reproduces
# the labels of the rows it was fitted to.
data(quakes)

# plot the whole data frame, coloured by rounded magnitude
plot(quakes, col = as.factor(round(quakes$mag)))

# working copy whose response is the rounded magnitude stored as a factor
this_df <- quakes
this_df$mag <- as.factor(round(this_df$mag))

library(e1071)

fit_linear <- svm(mag ~ ., data = this_df, kernel = "linear")

# plot the fitted model against stations and depth
plot(fit_linear, data = this_df, stations ~ depth)

# proportion of rows whose predicted class matches the observed class
ypred <- predict(fit_linear, this_df)
mean(ypred == this_df$mag)

# Fit one svm() per kernel type on identical inputs.
#
# ... : arguments forwarded unchanged to every svm() call (this script passes
#       a formula followed by `data =`)
#
# returns a named list of the four fits, with elements
# linear, sigmoid, radial and polynomial (fitted in that order)
myfun <- function(...) {
  linear_fit <- svm(..., kernel = "linear")
  sigmoid_fit <- svm(..., kernel = "sigmoid")
  radial_fit <- svm(..., kernel = "radial")
  polynomial_fit <- svm(..., kernel = "polynomial")

  list(
    linear = linear_fit,
    sigmoid = sigmoid_fit,
    radial = radial_fit,
    polynomial = polynomial_fit
  )
}

# Fit all four kernels twice -- once on every predictor, once on depth and
# stations only -- then score each fit against the rows it was fitted to.

# every column other than mag as a predictor
full_models <- myfun(mag ~ ., data = this_df)

# one vector of predicted classes per kernel
full_model_predictions <- lapply(
  full_models,
  function(fit) predict(fit, this_df)
)

# proportion of rows classified correctly, per kernel
full_model_accuracy <- lapply(
  full_model_predictions,
  function(predicted) mean(predicted == this_df$mag)
)

# predicted-by-observed cross-tabulation, per kernel
full_model_tables <- lapply(
  full_model_predictions,
  function(predicted) table(predict = predicted, this_df$mag)
)


# depth and stations as the only predictors
partial_models <- myfun(mag ~ depth + stations, data = this_df)

# one vector of predicted classes per kernel
partial_model_predictions <- lapply(
  partial_models,
  function(fit) predict(fit, this_df)
)

# proportion of rows classified correctly, per kernel
partial_model_accuracy <- lapply(
  partial_model_predictions,
  function(predicted) mean(predicted == this_df$mag)
)

# predicted-by-observed cross-tabulation, per kernel
partial_model_tables <- lapply(
  partial_model_predictions,
  function(predicted) table(predict = predicted, this_df$mag)
)


# resampling methods for classification testing & training #
# Fit a radial-kernel SVM on a random 70% of the rows and compare its
# accuracy on those rows with its accuracy on the remaining 30%.
data(quakes)
this_df <- quakes
this_df$mag <- as.factor(round(this_df$mag))

# fixed seed so the same rows are drawn on every run
set.seed(2023)

# row numbers of the training sample, drawn without replacement
training_vals <- sample(
  seq_len(nrow(this_df)),
  size = round(0.7 * nrow(this_df))
)

training_df <- this_df[training_vals, ]
testing_df <- this_df[-training_vals, ]

library(e1071)
fit_radial <- svm(mag ~ ., data = training_df, kernel = "radial")


# performance on the rows used for fitting
predicted_train <- predict(fit_radial, training_df)
table(predicted_train, training_df$mag)
mean(predicted_train == training_df$mag)


# performance on the held-out rows
predicted_test <- predict(fit_radial, testing_df)
table(predicted_test, testing_df$mag)
mean(predicted_test == testing_df$mag)


# cross-validation #
# Choose gamma for a radial-kernel SVM by 10-fold cross-validation on the
# training rows, then report the accuracy of the selected model on the
# training rows and on the held-out rows.

library(e1071)
data(quakes)
this_df <- quakes
this_df[ , 'mag' ] <- as.factor( round( this_df[ , 'mag' ] ) )

# fixed seed for a reproducible 70/30 split
set.seed(2023)
training_vals <-
	sample(
		seq( nrow( this_df ) ) ,
		round( nrow( this_df ) * 0.7 ) ,
		replace = FALSE
	)

training_df <- this_df[ training_vals , ]
testing_df <- this_df[ -training_vals , ]


# 10-fold cross-validation
tc <- tune.control( cross = 10 )

tune_out <-
	tune(
		svm ,
		mag ~ . ,
		data = training_df ,
		kernel = 'radial' ,
		# the first candidate is meant to be svm()'s documented default gamma,
		# 1 / (number of predictors).  ncol( training_df ) also counts the
		# response column mag, so one is subtracted to get the predictor count
		ranges = list( gamma = c( 1 / ( ncol( training_df ) - 1 ) , 0.3 , 0.5 , 1 , 2 , 5 ) ) ,
		tunecontrol = tc
	)

# accuracy of the selected model on the rows used for tuning
predicted_training <- predict( tune_out$best.model , training_df )
mean( predicted_training == training_df$mag )


# accuracy of the selected model on the held-out rows
predicted_testing <- predict( tune_out$best.model , testing_df )
mean( predicted_testing == testing_df$mag )


# trees #
# Fit an rpart regression tree (numeric magnitude) and an rpart
# classification tree (rounded magnitude as a factor) on the same 70/30
# split, reporting training and testing performance for each.
library(rpart)
data(quakes)

set.seed(2023)

# regression
this_df <- quakes

# row numbers of the training sample; reused by the classification tree below
training_records <- sample(
  seq_len(nrow(this_df)),
  size = round(0.7 * nrow(this_df))
)

training_df <- this_df[training_records, ]
testing_df <- this_df[-training_records, ]

fit_anova <- rpart(mag ~ ., data = training_df, method = "anova")

# R-squared on the training rows: 1 - residual SS / total SS
training_anova <- predict(fit_anova, training_df)
rss <- sum((training_df[["mag"]] - training_anova)^2)
tss <- sum((training_df[["mag"]] - mean(training_df[["mag"]]))^2)
1 - (rss / tss)

# same statistic on the held-out rows, centred on the held-out mean
testing_anova <- predict(fit_anova, testing_df)
rss <- sum((testing_df[["mag"]] - testing_anova)^2)
tss <- sum((testing_df[["mag"]] - mean(testing_df[["mag"]]))^2)
1 - (rss / tss)

# classification
this_df <- quakes
this_df$mag <- as.factor(round(this_df$mag))

training_df <- this_df[training_records, ]
testing_df <- this_df[-training_records, ]

fit_class <- rpart(mag ~ ., data = training_df, method = "class")

# share of training rows classified correctly
training_class <- predict(fit_class, training_df, type = "class")
mean(training_class == training_df$mag)

# share of held-out rows classified correctly
testing_class <- predict(fit_class, testing_df, type = "class")
mean(testing_class == testing_df$mag)


# random forests #
# Fit a randomForest classifier for rounded magnitude on a random 70% of the
# rows and report accuracy on the training rows and on the held-out rows.
library(randomForest)
data(quakes)

set.seed(2023)

this_df <- quakes

# row numbers of the training sample, drawn without replacement
training_records <- sample(
  seq_len(nrow(this_df)),
  size = round(0.7 * nrow(this_df))
)

# recode the response as a factor of rounded magnitudes
this_df$mag <- as.factor(round(this_df$mag))

training_df <- this_df[training_records, ]
testing_df <- this_df[-training_records, ]

fit_class <- randomForest(mag ~ ., data = training_df)

# share of training rows classified correctly
training_class <- predict(fit_class, training_df)
mean(training_class == training_df$mag)

# share of held-out rows classified correctly
testing_class <- predict(fit_class, testing_df)
mean(testing_class == testing_df$mag)


	# support vector machines #
	# Fit a linear-kernel SVM that predicts rounded earthquake magnitude from
	# the other columns of quakes, then measure how often the fitted model
	# reproduces the labels of the rows it was fitted to.
	data(quakes)

	# plot the whole data frame, coloured by rounded magnitude
	plot(quakes, col = as.factor(round(quakes$mag)))

	# working copy whose response is the rounded magnitude stored as a factor
	this_df <- quakes
	this_df$mag <- as.factor(round(this_df$mag))

	library(e1071)

	fit_linear <- svm(mag ~ ., data = this_df, kernel = "linear")

	# plot the fitted model against stations and depth
	plot(fit_linear, data = this_df, stations ~ depth)

	# proportion of rows whose predicted class matches the observed class
	ypred <- predict(fit_linear, this_df)
	mean(ypred == this_df$mag)

	# Fit one svm() per kernel type on identical inputs.
	#
	# ... : arguments forwarded unchanged to every svm() call (this script
	#       passes a formula followed by `data =`)
	#
	# returns a named list of the four fits, with elements
	# linear, sigmoid, radial and polynomial (fitted in that order)
	myfun <- function(...) {
	  linear_fit <- svm(..., kernel = "linear")
	  sigmoid_fit <- svm(..., kernel = "sigmoid")
	  radial_fit <- svm(..., kernel = "radial")
	  polynomial_fit <- svm(..., kernel = "polynomial")

	  list(
	    linear = linear_fit,
	    sigmoid = sigmoid_fit,
	    radial = radial_fit,
	    polynomial = polynomial_fit
	  )
	}

	# Fit all four kernels twice -- once on every predictor, once on depth and
	# stations only -- then score each fit against the rows it was fitted to.

	# every column other than mag as a predictor
	full_models <- myfun(mag ~ ., data = this_df)

	# one vector of predicted classes per kernel
	full_model_predictions <- lapply(
	  full_models,
	  function(fit) predict(fit, this_df)
	)

	# proportion of rows classified correctly, per kernel
	full_model_accuracy <- lapply(
	  full_model_predictions,
	  function(predicted) mean(predicted == this_df$mag)
	)

	# predicted-by-observed cross-tabulation, per kernel
	full_model_tables <- lapply(
	  full_model_predictions,
	  function(predicted) table(predict = predicted, this_df$mag)
	)


	# depth and stations as the only predictors
	partial_models <- myfun(mag ~ depth + stations, data = this_df)

	# one vector of predicted classes per kernel
	partial_model_predictions <- lapply(
	  partial_models,
	  function(fit) predict(fit, this_df)
	)

	# proportion of rows classified correctly, per kernel
	partial_model_accuracy <- lapply(
	  partial_model_predictions,
	  function(predicted) mean(predicted == this_df$mag)
	)

	# predicted-by-observed cross-tabulation, per kernel
	partial_model_tables <- lapply(
	  partial_model_predictions,
	  function(predicted) table(predict = predicted, this_df$mag)
	)




	# resampling methods for classification testing & training #
	# Fit a radial-kernel SVM on a random 70% of the rows and compare its
	# accuracy on those rows with its accuracy on the remaining 30%.
	data(quakes)
	this_df <- quakes
	this_df$mag <- as.factor(round(this_df$mag))

	# fixed seed so the same rows are drawn on every run
	set.seed(2023)

	# row numbers of the training sample, drawn without replacement
	training_vals <- sample(
	  seq_len(nrow(this_df)),
	  size = round(0.7 * nrow(this_df))
	)

	training_df <- this_df[training_vals, ]
	testing_df <- this_df[-training_vals, ]

	library(e1071)
	fit_radial <- svm(mag ~ ., data = training_df, kernel = "radial")


	# performance on the rows used for fitting
	predicted_train <- predict(fit_radial, training_df)
	table(predicted_train, training_df$mag)
	mean(predicted_train == training_df$mag)


	# performance on the held-out rows
	predicted_test <- predict(fit_radial, testing_df)
	table(predicted_test, testing_df$mag)
	mean(predicted_test == testing_df$mag)


	# cross-validation #
	# Choose gamma for a radial-kernel SVM by 10-fold cross-validation on the
	# training rows, then report the accuracy of the selected model on the
	# training rows and on the held-out rows.

	library(e1071)
	data(quakes)
	this_df <- quakes
	this_df[ , 'mag' ] <- as.factor( round( this_df[ , 'mag' ] ) )

	# fixed seed for a reproducible 70/30 split
	set.seed(2023)
	training_vals <-
		sample(
			seq( nrow( this_df ) ) ,
			round( nrow( this_df ) * 0.7 ) ,
			replace = FALSE
		)

	training_df <- this_df[ training_vals , ]
	testing_df <- this_df[ -training_vals , ]



	# 10-fold cross-validation
	tc <- tune.control( cross = 10 )

	tune_out <-
		tune(
			svm ,
			mag ~ . ,
			data = training_df ,
			kernel = 'radial' ,
			# the first candidate is meant to be svm()'s documented default
			# gamma, 1 / (number of predictors).  ncol( training_df ) also
			# counts the response column mag, so one is subtracted to get the
			# predictor count
			ranges = list( gamma = c( 1 / ( ncol( training_df ) - 1 ) , 0.3 , 0.5 , 1 , 2 , 5 ) ) ,
			tunecontrol = tc
		)

	# accuracy of the selected model on the rows used for tuning
	predicted_training <- predict( tune_out$best.model , training_df )
	mean( predicted_training == training_df$mag )



	# accuracy of the selected model on the held-out rows
	predicted_testing <- predict( tune_out$best.model , testing_df )
	mean( predicted_testing == testing_df$mag )





	# trees #
	# Fit an rpart regression tree (numeric magnitude) and an rpart
	# classification tree (rounded magnitude as a factor) on the same 70/30
	# split, reporting training and testing performance for each.
	library(rpart)
	data(quakes)

	set.seed(2023)

	# regression
	this_df <- quakes

	# row numbers of the training sample; reused by the classification tree below
	training_records <- sample(
	  seq_len(nrow(this_df)),
	  size = round(0.7 * nrow(this_df))
	)

	training_df <- this_df[training_records, ]
	testing_df <- this_df[-training_records, ]

	fit_anova <- rpart(mag ~ ., data = training_df, method = "anova")

	# R-squared on the training rows: 1 - residual SS / total SS
	training_anova <- predict(fit_anova, training_df)
	rss <- sum((training_df[["mag"]] - training_anova)^2)
	tss <- sum((training_df[["mag"]] - mean(training_df[["mag"]]))^2)
	1 - (rss / tss)

	# same statistic on the held-out rows, centred on the held-out mean
	testing_anova <- predict(fit_anova, testing_df)
	rss <- sum((testing_df[["mag"]] - testing_anova)^2)
	tss <- sum((testing_df[["mag"]] - mean(testing_df[["mag"]]))^2)
	1 - (rss / tss)

	# classification
	this_df <- quakes
	this_df$mag <- as.factor(round(this_df$mag))

	training_df <- this_df[training_records, ]
	testing_df <- this_df[-training_records, ]

	fit_class <- rpart(mag ~ ., data = training_df, method = "class")

	# share of training rows classified correctly
	training_class <- predict(fit_class, training_df, type = "class")
	mean(training_class == training_df$mag)

	# share of held-out rows classified correctly
	testing_class <- predict(fit_class, testing_df, type = "class")
	mean(testing_class == testing_df$mag)



	# random forests #
	# Fit a randomForest classifier for rounded magnitude on a random 70% of
	# the rows and report accuracy on the training rows and on the held-out rows.
	library(randomForest)
	data(quakes)

	set.seed(2023)

	this_df <- quakes

	# row numbers of the training sample, drawn without replacement
	training_records <- sample(
	  seq_len(nrow(this_df)),
	  size = round(0.7 * nrow(this_df))
	)

	# recode the response as a factor of rounded magnitudes
	this_df$mag <- as.factor(round(this_df$mag))

	training_df <- this_df[training_records, ]
	testing_df <- this_df[-training_records, ]

	fit_class <- randomForest(mag ~ ., data = training_df)

	# share of training rows classified correctly
	training_class <- predict(fit_class, training_df)
	mean(training_class == training_df$mag)

	# share of held-out rows classified correctly
	testing_class <- predict(fit_class, testing_df)
	mean(testing_class == testing_df$mag)