@bmmalone
Created April 15, 2021 08:41
Pseudocode for cross-validation with embedding models
given labeled_training_indices (e.g., maybe there are 20 labeled training instances)
given labeled_test_indices (there are always ~3000 of these due to the split created by Harutyunyan et al.)

train_fold, val_fold <- stratified_split(labeled_training_indices, train=70%, "test"=30%)  # "test" is really the validation set here
# for example, if we have 20 labeled training instances, then we have 14 instances for training and 6 for validation
# ... so we really don't have a lot when the number of labeled training instances is small

hp_grid = ParameterGrid({
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, ...],
    'embedding_epoch': [1, 11, 21, ...],
    ... other hyperparameters ...
})

best_model <- None

for each hp in hp_grid:
    load embeddings for 'embedding_epoch'
    train logistic regression model on train_fold using embeddings and other hps
    evaluate model on val_fold
    if model is better than best_model:  # "model" includes the embedding epoch
        best_model <- model

evaluate best_model (including the embedding epoch) on labeled_test_indices
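Below is a minimal runnable sketch of the pseudocode above, assuming scikit-learn. The `load_embeddings` helper, the `labels` array, the AUROC selection metric, the random seed, and the concrete `C` / `embedding_epoch` values are placeholders for illustration, not part of the original setup; swap in the real embedding checkpoints, labels, and evaluation metric.

```python
import numpy as np
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


def load_embeddings(epoch, indices):
    """Hypothetical helper: return the embedding matrix saved at the given
    checkpoint epoch, restricted to the given instance indices."""
    raise NotImplementedError


def run_hp_search(labeled_training_indices, labeled_test_indices, labels):
    # stratified 70/30 split of the (few) labeled training instances
    train_fold, val_fold = train_test_split(
        labeled_training_indices,
        test_size=0.3,
        stratify=labels[labeled_training_indices],
        random_state=0,  # placeholder seed
    )

    hp_grid = ParameterGrid({
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1.0],
        'embedding_epoch': [1, 11, 21],
    })

    best_score, best_model, best_hp = -np.inf, None, None

    for hp in hp_grid:
        # the embedding epoch is itself a hyperparameter, so reload the features
        X_train = load_embeddings(hp['embedding_epoch'], train_fold)
        X_val = load_embeddings(hp['embedding_epoch'], val_fold)

        model = LogisticRegression(
            penalty=hp['penalty'], C=hp['C'], solver='liblinear'
        )
        model.fit(X_train, labels[train_fold])

        # model selection on the validation fold (AUROC used here as an example)
        score = roc_auc_score(labels[val_fold], model.predict_proba(X_val)[:, 1])
        if score > best_score:
            best_score, best_model, best_hp = score, model, hp

    # final evaluation on the held-out Harutyunyan et al. test split,
    # using the embedding epoch selected on the validation fold
    X_test = load_embeddings(best_hp['embedding_epoch'], labeled_test_indices)
    test_score = roc_auc_score(
        labels[labeled_test_indices], best_model.predict_proba(X_test)[:, 1]
    )
    return best_hp, test_score
```

Because the embedding epoch is treated as just another hyperparameter, the checkpoint is chosen on the validation fold alongside the regularization settings and only the selected model ever touches the ~3000 test instances.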
@bmmalone
Author

~3000 of these due to the split created by Harutyunyan et al.

This comment is specifically related to the MIMIC-III dataset splits defined in this paper: Multitask learning and benchmarking with clinical time series data by Harutyunyan et al.
