# Active Learning for Fast Data Set Labeling
# Source: GitHub gist emuccino/960fdf978abdd979edd25c7a7a23d16a
# (last active September 7, 2021)
import numpy as np
from numpy.random import choice, normal
from tensorflow import set_random_seed
from tensorflow.python.keras.layers import Input, BatchNormalization,Conv2D, MaxPooling2D
from tensorflow.python.keras.layers import Dropout, Reshape
from tensorflow.python.keras.layers import Dense, Flatten
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.callbacks import EarlyStopping
from keras.datasets import mnist
#load the MNIST digits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#reshape the training images to (n, 28, 28, 1) and divide pixel values by 255
x_train = x_train.reshape(-1,28,28,1) / 255
n_samples = len(x_train)

#hyperparameters for auto labeling
#init_label: fraction of the data that is hand labeled before the first iteration
init_label = 0.01
#high_confidence_threshold: minimum classification probability the classifier..
#..must predict for a sample to be auto labeled
high_confidence_threshold = 0.9999
#low_confidence_percent: fraction of the data with the lowest classification..
#..probabilities that is hand labeled each iteration
low_confidence_percent = 0.01

#index separating the initial hand labeled samples from the unlabeled rest
split_at = round(len(x_train)*init_label)

#the first samples act as the initial hand labeled set
x_labeled, y_labeled = x_train[:split_at], y_train[:split_at]
#everything after the split point starts out unlabeled
x_unlabeled, y_unlabeled = x_train[split_at:], y_train[split_at:]

#empty arrays collecting auto labels and the matching correct labels,..
#..kept side by side so auto labeling accuracy can be measured
auto_labels = np.array([])
correct_labels = np.array([])

#running count of samples labeled by hand
n_hand_labeled = len(x_labeled)
#set up new classifier
def compile_model():
    """Build and compile a fresh CNN classifier for 28x28x1 images.

    The network is two 3x3 convolutions (32 then 64 filters), 2x2 max
    pooling, 25% dropout, a 128-unit dense layer and a 10-way softmax output.

    Returns:
        A compiled Keras ``Model`` with newly initialised weights, ready for
        ``fit`` / ``predict``.
    """
    inputs = Input(shape=(28, 28, 1))
    net = Conv2D(32, 3, activation='relu')(inputs)
    net = Conv2D(64, 3, activation='relu')(net)
    net = MaxPooling2D(pool_size=(2, 2))(net)
    net = Dropout(0.25)(net)
    net = Flatten()(net)
    net = Dense(128, activation='relu')(net)
    outputs = Dense(10, activation='softmax')(net)
    model = Model(inputs=inputs, outputs=outputs)
    # The model must be compiled before it can be fitted. Labels are integer
    # class ids, hence the sparse loss; the metric is registered as 'acc' so
    # that training reports 'val_acc', which the EarlyStopping callback in
    # the labeling loop monitors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  metrics=['acc'])
    return model
run = True
iteration = 0
#algorithm loops until less than 1% of samples are unlabeled
while run:
    iteration += 1
    #train a fresh classifier every iteration on the current labeled set
    model = compile_model()
    #early stopping when validation accuracy stops increasing for 10 epochs
    earlyStop = EarlyStopping(monitor='val_acc', min_delta=0, patience=10,
                              verbose=0, mode='max', baseline=None,
                              restore_best_weights=True)
    #fit model on labeled data using 20% of labeled data as validation
    model.fit(x=x_labeled, y=y_labeled, batch_size=32, epochs=99999, verbose=0,
              callbacks=[earlyStop], validation_split=0.2)

    #use model to predict labels for all unlabeled data
    predictions = model.predict(x_unlabeled)
    confidence = predictions.max(axis=1)
    #prediction indices sorted by classification probability (lowest first)
    sorted_indices = np.argsort(confidence)

    #indices of unlabeled data that are going to be hand labeled..
    #..and added to labeled data set
    low_confidence = sorted_indices[:round(n_samples*low_confidence_percent)]
    x_labeled = np.concatenate([x_labeled, x_unlabeled[low_confidence]])
    y_labeled = np.concatenate([y_labeled, y_unlabeled[low_confidence]])

    #indices of unlabeled data that are going to be auto labeled: confident..
    #..predictions that were not already picked for hand labeling
    high_confidence = np.setdiff1d(
        np.flatnonzero(confidence > high_confidence_threshold),
        low_confidence)
    if len(high_confidence) > 0:
        predicted_labels = predictions[high_confidence].argmax(axis=1)
        x_labeled = np.concatenate([x_labeled, x_unlabeled[high_confidence]])
        y_labeled = np.concatenate([y_labeled, predicted_labels])
        #keep the true labels next to the auto labels to measure accuracy
        correct_labels = np.concatenate([correct_labels,
                                         y_unlabeled[high_confidence]])
        auto_labels = np.concatenate([auto_labels, predicted_labels])

    #remove labeled data from unlabeled data set; both index sets refer to..
    #..the same array, so they are deleted in one call (deleting one set..
    #..first would shift the positions the other set points at)
    newly_labeled = np.concatenate([low_confidence, high_confidence])
    x_unlabeled = np.delete(x_unlabeled, newly_labeled, 0)
    y_unlabeled = np.delete(y_unlabeled, newly_labeled, 0)

    total_auto_labeled = len(correct_labels)
    total_incorrect_labels = (total_auto_labeled -
                              np.count_nonzero(correct_labels == auto_labels))
    n_hand_labeled += len(low_confidence)

    print('after iteration '+str(iteration)+':',len(x_labeled),
          'labeled samples,',len(x_unlabeled),'unlabeled samples')
    print(str(len(low_confidence))+' hand labeled')
    print(str(len(high_confidence))+' auto labeled')
    #scale the fractions by 100 so the printed values really are percentages
    print(str(100*len(y_unlabeled)/n_samples)+'% unlabeled')
    if total_auto_labeled > 0:
        print(str(100*total_incorrect_labels/total_auto_labeled)+
              '% incorrectly labeled')

    if len(y_unlabeled)/n_samples < 0.01:
        #NOTE(review): the closing statements were truncated in this copy of
        #the script. The surviving fragments count the remaining unlabeled
        #samples as auto labeled, so they are scored here with the final
        #model's predictions - confirm against the original.
        remaining_incorrect = 0
        if len(x_unlabeled) > 0:
            remaining_labels = model.predict(x_unlabeled).argmax(axis=1)
            remaining_incorrect = np.count_nonzero(
                remaining_labels != y_unlabeled)
        final_auto_labeled = total_auto_labeled+len(y_unlabeled)
        print('total hand labeled:',n_hand_labeled)
        print('total auto labeled:',final_auto_labeled)
        if final_auto_labeled > 0:
            print(str(100*(total_incorrect_labels+remaining_incorrect)/
                      final_auto_labeled)+'% incorrectly labeled')
        #stop once fewer than 1% of the samples remain unlabeled
        run = False
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment