Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save toshihiroryuu/afc2dcbd80f877c8144e9f4d5de8058f to your computer and use it in GitHub Desktop.
Save toshihiroryuu/afc2dcbd80f877c8144e9f4d5de8058f to your computer and use it in GitHub Desktop.
Handle missing data in pandas: mark, count, replace (impute), or remove rows with missing values using dropna() / fillna() from the pandas library, or a scikit-learn imputer (sklearn.preprocessing.Imputer in old releases; sklearn.impute.SimpleImputer in current ones).
import pandas as pd
import numpy as np
# Location of the headerless CSV this script cleans.
DATA_PATH = "/home/pima_indians_diabetes.csv"

# Columns in which a literal 0 is treated as "value not recorded".
# NOTE(review): this assumes the usual headerless Pima Indians Diabetes layout
# (0 = pregnancies, 1 = glucose, 2 = blood pressure, 3 = skin thickness,
# 4 = insulin, 5 = BMI, 6 = pedigree, 7 = age, 8 = 0/1 class label) -- confirm
# against your copy of the file.  Columns 0 and 8 are deliberately excluded:
# a 0 there is a legitimate value, and turning the class label's zeros into NaN
# would make mean-imputation rewrite every label to 1.0.
ZERO_AS_MISSING_COLUMNS = [1, 2, 3, 4, 5]


def mark_zeros_as_missing(frame, columns):
    """Return a copy of *frame* with every 0 in *columns* replaced by NaN.

    Args:
        frame: DataFrame to clean; it is not modified.
        columns: Iterable of column labels in which 0 means "missing".

    Returns:
        A new DataFrame. The listed columns are re-assigned from
        ``DataFrame.replace`` output; in practice integer columns come back
        as float, because NaN has no integer representation.
    """
    columns = list(columns)
    cleaned = frame.copy()
    if columns:
        cleaned[columns] = cleaned[columns].replace(0, np.nan)
    return cleaned


def impute_column_means(frame):
    """Return a NumPy array of *frame*'s values with NaNs filled column-wise.

    Each NaN is replaced by the mean of the non-missing values of its column.
    """
    # ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22;
    # ``SimpleImputer`` is its replacement and always works per column (the
    # old ``axis=0`` behaviour), so no ``axis`` argument exists any more.
    from sklearn.impute import SimpleImputer

    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    return imputer.fit_transform(frame.values)


if __name__ == "__main__":
    df = pd.read_csv(DATA_PATH, header=None)

    # Useful exploratory checks before cleaning:
    #   df.describe()  -- a minimum of 0 in a column hints at hidden missing data
    #   (df[ZERO_AS_MISSING_COLUMNS] == 0).sum()  -- zeros per column
    df = mark_zeros_as_missing(df, ZERO_AS_MISSING_COLUMNS)

    # After marking, df.isnull().sum() reports the NaN count per column.
    # Pure-pandas alternatives to the imputer below:
    #   df.dropna(inplace=True)             -- drop every row containing a NaN
    #   df.fillna(df.mean(), inplace=True)  -- fill NaNs with the column means
    transformed = impute_column_means(df)
    print(transformed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment