Load and pre-process data
# Load the hypothyroid dataset and turn it into an all-numeric frame with no
# missing values: encode the target and the categorical flags, mark '?' as
# missing, drop the TBG column, coerce the remaining text columns to numbers,
# then impute the remaining nulls column by column.

# Load dataset from csv using pandas
dataset = pd.read_csv('data/hypothyroid.csv')
dataset.head()

# Rename the first column to "target" and encode it as 0/1.
# Labels other than "negative"/"hypothyroid" are mapped to NaN by Series.map.
dataset = dataset.rename(columns={dataset.columns[0]: "target"})
dataset["target"] = dataset["target"].map({"negative": 0, "hypothyroid": 1})

# Replace the categorical flag values with binary values (applied frame-wide)
dataset = dataset.replace({'f': 0, 't': 1, 'y': 1, 'n': 0, 'M': 0, 'F': 1})

# Replace the '?' placeholder with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
dataset = dataset.replace('?', np.nan)

# Count the number of null values per column
dataset.isnull().sum()

# Drop the TBG column (per the original author's note it contains an
# extremely high number of null values)
dataset = dataset.drop(columns='TBG')

# Select the columns that are still of dtype 'object' and convert them to
# numeric; values that cannot be parsed become NaN (errors='coerce')
columns = dataset.columns[dataset.dtypes.eq('object')]
dataset[columns] = dataset[columns].apply(pd.to_numeric, errors='coerce')

# Impute the remaining nulls. Every fill value is computed from the column's
# own non-null entries, so building them up front is equivalent to filling
# the columns one at a time.
# A single DataFrame.fillna with a per-column dict replaces the former
# `dataset[col].fillna(..., inplace=True)` calls: that chained in-place form
# is deprecated and does not modify `dataset` under pandas Copy-on-Write.
fill_values = {
    # Replaced by the column mean
    'Age': dataset['Age'].mean(),
    'T4U': dataset['T4U'].mean(),
    # NOTE(review): TSH was listed under the "median" comment originally but
    # the code has always used the mean; the mean is kept here so results do
    # not change -- confirm which statistic was intended.
    'TSH': dataset['TSH'].mean(),
    # Replaced by the column median
    'T3': dataset['T3'].median(),
    'TT4': dataset['TT4'].median(),
    'FTI': dataset['FTI'].median(),
    # The gender data looks to be imbalanced with 0 less frequent than 1
    # (author's observation), so nulls are filled with 0
    'Gender': 0,
}
dataset = dataset.fillna(value=fill_values)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment