toshihiroryuu/Filling,Removing or processing missing data from tabular dataset

## Filling, removing, or processing missing data from a tabular dataset
import pandas as pd
import numpy as np

# Load the CSV without a header row, so pandas labels the columns 0..N-1.
df = pd.read_csv("/home/pima_indians_diabetes.csv", header=None)

# # to get a statistical summary of all columns, so that we can identify
# # missing data by checking the min value
# print(df.describe())

# print(df.head(20))

# Columns in which a 0 is treated as a missing-value marker.
# NOTE(review): this covers every listed column, including any column where 0
# may be a legitimate value (e.g. a count or a class label) -- confirm against
# the dataset description before relying on the imputed output.
columns = [0, 1, 2, 3, 4, 5, 6, 7, 8]

# # to find the number of zero (missing) entries in each column
# print((df[columns] == 0).sum())

# Replace the zero markers with NaN so pandas/scikit-learn recognise them as
# missing. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df[columns] = df[columns].replace(0, np.nan)


# # count the number of NaN values in each column to check the replacement
# print(df[columns].isnull().sum())
# print(df.head(20))

# # drop rows with NaN values using dropna()
# print("Original dataset shape is {}".format(df.shape))

# # df.dropna(inplace=True)
# print("Reduced dataset shape after removing rows with NaN is {}".format(df.shape))

# # using pandas fillna() to impute or add missing values
# df.fillna(df.mean(), inplace=True)
# print(df.head(20))

# # count the number of NaN values in each column
# print(df.isnull().sum())

# Impute the missing values with scikit-learn.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and always imputes column-wise (the old axis=0 behaviour).
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
values = df.values

# Fit the per-column means and fill every NaN with the mean of its column.
transformrd = imputer.fit_transform(values)
print(transformrd)
	import pandas as pd
	import numpy as np

	# Load the CSV without a header row, so pandas labels the columns 0..N-1.
	df = pd.read_csv("/home/pima_indians_diabetes.csv", header=None)

	# # to get a statistical summary of all columns, so that we can identify
	# # missing data by checking the min value
	# print(df.describe())

	# print(df.head(20))

	# Columns in which a 0 is treated as a missing-value marker.
	# NOTE(review): this covers every listed column, including any column where 0
	# may be a legitimate value (e.g. a count or a class label) -- confirm against
	# the dataset description before relying on the imputed output.
	columns = [0, 1, 2, 3, 4, 5, 6, 7, 8]

	# # to find the number of zero (missing) entries in each column
	# print((df[columns] == 0).sum())

	# Replace the zero markers with NaN so pandas/scikit-learn recognise them as
	# missing. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
	df[columns] = df[columns].replace(0, np.nan)


	# # count the number of NaN values in each column to check the replacement
	# print(df[columns].isnull().sum())
	# print(df.head(20))

	# # drop rows with NaN values using dropna()
	# print("Original dataset shape is {}".format(df.shape))

	# # df.dropna(inplace=True)
	# print("Reduced dataset shape after removing rows with NaN is {}".format(df.shape))

	# # using pandas fillna() to impute or add missing values
	# df.fillna(df.mean(), inplace=True)
	# print(df.head(20))

	# # count the number of NaN values in each column
	# print(df.isnull().sum())

	# Impute the missing values with scikit-learn.
	# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
	# is its replacement and always imputes column-wise (the old axis=0 behaviour).
	from sklearn.impute import SimpleImputer

	imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
	values = df.values

	# Fit the per-column means and fill every NaN with the mean of its column.
	transformrd = imputer.fit_transform(values)
	print(transformrd)