# douglaspsteen/ssl_scrub.py

## ssl_scrub.py
# Shuffle the data
# A fixed random_state keeps the shuffle, and therefore the split, reproducible.

df = df.sample(frac=1, random_state=15).reset_index(drop=True)


# Generate indices for splits
# Target proportions: 25% test, 1% train, and the remainder (~74%) unlabeled.
# The unlabeled boundary is the end of the frame instead of
# train_ind + round(len(df)*0.74): the three independently rounded counts can
# add up to one less than len(df) (e.g. 50 rows -> 12 + 0 + 37 = 49), which
# silently left the last shuffled row out of every set.

test_ind = round(len(df)*0.25)
train_ind = test_ind + round(len(df)*0.01)
unlabeled_ind = len(df)


# Partition the data
# Contiguous, non-overlapping slices of the shuffled frame that together
# cover every row.

test = df.iloc[:test_ind]
train = df.iloc[test_ind:train_ind]
unlabeled = df.iloc[train_ind:unlabeled_ind]


# Assign data to train, test, and unlabeled sets
# 'complication' is the target column; it is split off for train/test and
# discarded for the unlabeled set.

X_train = train.drop('complication', axis=1)
y_train = train['complication']

X_unlabeled = unlabeled.drop('complication', axis=1)

X_test = test.drop('complication', axis=1)
y_test = test['complication']


# Check dimensions of data after splitting

print(f"X_train dimensions: {X_train.shape}")
print(f"y_train dimensions: {y_train.shape}\n")

print(f"X_test dimensions: {X_test.shape}")
print(f"y_test dimensions: {y_test.shape}\n")

print(f"X_unlabeled dimensions: {X_unlabeled.shape}")
	# NOTE(review): this tab-indented section repeats the top-level split at
	# the start of the file; running both would shuffle df a second time, and
	# no enclosing statement for this indentation is visible here. Confirm
	# whether this copy is intentional.

	# Shuffle the data
	# A fixed random_state keeps the shuffle, and therefore the split, reproducible.

	df = df.sample(frac=1, random_state=15).reset_index(drop=True)


	# Generate indices for splits
	# Target proportions: 25% test, 1% train, and the remainder (~74%) unlabeled.
	# The unlabeled boundary is the end of the frame instead of
	# train_ind + round(len(df)*0.74): the three independently rounded counts can
	# add up to one less than len(df) (e.g. 50 rows -> 12 + 0 + 37 = 49), which
	# silently left the last shuffled row out of every set.

	test_ind = round(len(df)*0.25)
	train_ind = test_ind + round(len(df)*0.01)
	unlabeled_ind = len(df)


	# Partition the data
	# Contiguous, non-overlapping slices of the shuffled frame that together
	# cover every row.

	test = df.iloc[:test_ind]
	train = df.iloc[test_ind:train_ind]
	unlabeled = df.iloc[train_ind:unlabeled_ind]


	# Assign data to train, test, and unlabeled sets
	# 'complication' is the target column; it is split off for train/test and
	# discarded for the unlabeled set.

	X_train = train.drop('complication', axis=1)
	y_train = train['complication']

	X_unlabeled = unlabeled.drop('complication', axis=1)

	X_test = test.drop('complication', axis=1)
	y_test = test['complication']


	# Check dimensions of data after splitting

	print(f"X_train dimensions: {X_train.shape}")
	print(f"y_train dimensions: {y_train.shape}\n")

	print(f"X_test dimensions: {X_test.shape}")
	print(f"y_test dimensions: {y_test.shape}\n")

	print(f"X_unlabeled dimensions: {X_unlabeled.shape}")