Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Load and pre-process data
# Load the hypothyroid dataset from CSV using pandas
dataset = pd.read_csv('data/hypothyroid.csv')

# Rename the first column to "target" and encode it as
# 0 (negative) / 1 (hypothyroid); any other label becomes NaN via map()
dataset = dataset.rename(columns={dataset.columns[0]: "target"})
dataset["target"] = dataset["target"].map({"negative": 0, "hypothyroid": 1})

# Replace the categorical flag values with binary values
dataset = dataset.replace({'f': 0, 't': 1, 'y': 1, 'n': 0, 'M': 0, 'F': 1})

# Replace the '?' placeholder with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
dataset = dataset.replace('?', np.nan)

# Null counts per column can be inspected with dataset.isnull().sum().
# Drop the TBG column as it contains an extremely high number of null values
dataset = dataset.drop(columns='TBG')

# Select the columns still typed as 'object' and convert them to numeric;
# values that cannot be parsed are coerced to NaN
columns = dataset.columns[dataset.dtypes.eq('object')]
dataset[columns] = dataset[columns].apply(pd.to_numeric, errors='coerce')

# Impute the remaining null values column by column. A single
# DataFrame.fillna(dict) call is used instead of chained
# dataset[col].fillna(..., inplace=True), which operates on a temporary
# object and does not update the frame under pandas Copy-on-Write.
fill_values = {
    # Filled with the column mean
    'Age': dataset['Age'].mean(),
    'T4U': dataset['T4U'].mean(),
    # NOTE(review): the original comment grouped TSH with the median-filled
    # columns, but the code has always used the mean -- kept as-is; confirm
    # which statistic is intended.
    'TSH': dataset['TSH'].mean(),
    # Filled with the column median
    'T3': dataset['T3'].median(),
    'TT4': dataset['TT4'].median(),
    'FTI': dataset['FTI'].median(),
    # The gender data looks imbalanced, with fewer 0 values than 1;
    # missing values are filled with 0
    'Gender': 0,
}
dataset = dataset.fillna(fill_values)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment