vernal-inertia tanveer-sayyed

## NaN-13.py
In [23]:

# Arithmetic mean of each variant of the sample, printed one value per line.
print(
    *(
        sum(sample) / len(sample)
        for sample in (original, replaced_by_mean, replaced_by_proportion)
    ),
    sep='\n',
)

Out [23]:
1.8
1.8000000000000003
1.8

## NaN-12.py
In [24]:
# The ten observed values that remain once the NaNs are removed.
original = [1] * 4 + [2] * 4 + [3] * 2
# Same observations plus five gaps filled with the observed mean (1.8).
replaced_by_mean = original + [1.8] * 5
# Same observations plus five gaps filled in the observed 2:2:1 proportion.
replaced_by_proportion = original + [1, 1, 2, 2, 3]

# One 10x6 figure laid out as three stacked panels (3 rows x 1 column).
plt.figure(figsize=(10,6))
# Panel 1: horizontal boxplot of the original sample with its mean marked.
plt.subplot(3, 1, 1)
plt.boxplot(original, showmeans=True, vert=False)
plt.title('original', color='green')
# Select panel 2; the calls that draw into it are not shown in this excerpt.
plt.subplot(3, 1, 2)

## NaN-11.py
In [63]:
# Remove column c2 from new_df itself (nothing is returned), then display the frame.
new_df.drop(labels=['c2'], axis='columns', inplace=True)

new_df

Out[63]:
	    c0 	  c1 	  c3
index
i0 	  1.0 	2.0 	3.0
i1 	  5.5 	7.0 	10.0

## NaN-10.py
In [62]:
# Fill each column's NaNs with its own summary statistic, using three
# equivalent techniques. Results are assigned back to the column rather than
# calling the method with inplace=True on new_df[...]: that form operates on an
# intermediate Series, emits a warning on recent pandas 2.x releases and leaves
# new_df unchanged once Copy-on-Write is in effect (pandas 3.0).

# Usually the .fillna() method is used to replace NaNs
new_df['c0'] = new_df['c0'].fillna(mean_c0)

# We can do the same with the .replace() method
new_df['c1'] = new_df['c1'].replace(to_replace=np.nan, value=median_c1)

# Also by vectorised indexing: collect the row labels where c3 is null, then
# overwrite exactly those cells with the first (most frequent) mode value.
null_indexes_in_c3 = new_df.index[new_df['c3'].isnull()]
new_df.loc[null_indexes_in_c3, 'c3'] = mode_c3[0]

## NaN-9.py
In [61]:
# Summary statistics that will later stand in for each column's missing values.
mode_c3 = new_df['c3'].mode()      # a Series, since a column can have several modes
median_c1 = new_df['c1'].median()  # a single float
mean_c0 = new_df['c0'].mean()      # a single float

# Report the three values, one per line; for the mode only element 0 is shown.
print(
    *(
        f'{label} {value}'
        for label, value in (
            ('mean_c0 = ', mean_c0),
            ('median_c1 = ', median_c1),
            ('mode_c3[0] = ', mode_c3[0]),
        )
    ),
    sep='\n',
)

Out [61]:

## NaN-8.py
In [59]:
# Show each column's dtype before any clean-up.
print('Data types:\n', new_df.dtypes)

# Overwrite the cell at row 'i2', column 'c3' with 10. Hand-picking a
# replacement like this is not sound data cleaning; it is done only so that
# mean/median/mode imputation can be demonstrated on this frame.
new_df.loc['i2', 'c3'] = 10

# Cast c3 to float64. Original author's note: if this cast is skipped a warning
# is raised, because the column previously held the string '#$%'.
new_df['c3'] = new_df['c3'].astype('float64')


## NaN-7.py
In [58]:
# dropna() with default arguments keeps only the rows that contain no NaN at all.
# The result is not assigned, so new_df itself is left untouched.
new_df.dropna()
# Oops! The output below has no rows left.
# NOTE(review): presumably because c2 is missing in every row — confirm.

Out[58]:
	    c0 	c1 	c2 	c3
index

## NaN-6.py
In [27]:
# Write df to a file named 'Df_with_NaNs' (no extension), leaving the row index out.
df.to_csv('Df_with_NaNs', index=False)

In [57]:
# Read the file back, additionally treating these three strings as missing values.
new_df = pd.read_csv('Df_with_NaNs', na_values= ['missing', 'not available', 'NA'])
# '#$%' is deliberately left out of na_values: a list given here applies to
# every column, and a value that is invalid in one column can be perfectly
# valid in another (e.g. 'Male' is wrong in an Age column but right in Gender).

# Setting new index with the same name
# (the statement that does this is not shown in this excerpt)

## NaN-5.py
In [26]:
# Boolean mask of the missing entries in c3. Per the output below only i1 is
# flagged: the placeholder strings '#$%' (i2) and "not available" (i5) are
# ordinary strings, so they are not reported as null.
df['c3'].isnull()

Out[26]:
i0    False
i1     True
i2    False
i3    False
i4    False
i5    False

## NaN-4.py
In [25]:
# Toy frame with six rows (i0..i5) and four columns (c0..c3). It mixes genuine
# missing values with placeholder strings ('NA', '#$%', "missing",
# "not available") so the NaN-handling examples have something to clean.
df = pd.DataFrame(data=[[1,         2,     np.nan,        3],
                        # Notice three ways of writing a missing value here:
                        # np.nan, None and float('nan'). The np.NaN / np.NAN
                        # aliases were removed in NumPy 2.0 and must not be used.
                        [np.nan,    None,  float('nan'),  np.nan],
                        [4,         5,     'NA',          '#$%'],
                        [6,         7,     np.nan,        8],
                        ["missing", 9,     np.nan,        10],
                        [11,        12,    np.nan,        "not available"]],
                  index='i0,i1,i2,i3,i4,i5'.split(','),
                  columns='c0,c1,c2,c3'.split(','))
	In [23]:

	print(sum(original)/len(original))
	print(sum(replaced_by_mean)/len(replaced_by_mean))
	print(sum(replaced_by_proportion)/len(replaced_by_proportion))

	Out [23]:
	1.8
	1.8000000000000003
	1.8
	In [24]:
	original = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3] # removing NaNs
	replaced_by_mean = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 1.8, 1.8, 1.8, 1.8, 1.8]
	replaced_by_proportion = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 1, 1, 2, 2, 3]

	plt.figure(figsize=(10,6))
	plt.subplot(3, 1, 1)
	plt.boxplot(original, showmeans=True, vert=False)
	plt.title('original', color='green')
	plt.subplot(3, 1, 2)
	In [63]:
	new_df.drop(columns=['c2'], inplace=True)

	new_df

	Out[63]:
	c0 c1 c3
	index
	i0 1.0 2.0 3.0
	i1 5.5 7.0 10.0
	In [62]:
	# Fill each column's NaNs with its own summary statistic, using three
	# equivalent techniques. Results are assigned back to the column rather than
	# calling the method with inplace=True on new_df[...]: that form operates on an
	# intermediate Series, emits a warning on recent pandas 2.x releases and leaves
	# new_df unchanged once Copy-on-Write is in effect (pandas 3.0).

	# Usually the .fillna() method is used to replace NaNs
	new_df['c0'] = new_df['c0'].fillna(mean_c0)

	# We can do the same with the .replace() method
	new_df['c1'] = new_df['c1'].replace(to_replace=np.nan, value=median_c1)

	# Also by vectorised indexing: collect the row labels where c3 is null, then
	# overwrite exactly those cells with the first (most frequent) mode value.
	null_indexes_in_c3 = new_df.index[new_df['c3'].isnull()]
	new_df.loc[null_indexes_in_c3, 'c3'] = mode_c3[0]
	In [61]:
	mean_c0 = new_df['c0'].mean() # returns a single float
	median_c1 = new_df['c1'].median() # returns a single float
	mode_c3 = new_df['c3'].mode() # returns a series

	print('mean_c0 = ', mean_c0)
	print('median_c1 = ', median_c1)
	print('mode_c3[0] = ', mode_c3[0]) # 0th element: the most repeated term

	Out [61]:
	In [59]:
	print('Data types:\n', new_df.dtypes)

	# (Although such an assignment is not correct, but the purpose here is to
	# demonstrate use of mean/median/mode academically so let's continue with it)
	new_df.loc['i2', 'c3'] = 10

	new_df['c3'] = new_df['c3'].astype('float64') # if skipped a warning would be thrown
	# as '#$%' was a string.
	In [27]:
	df.to_csv('Df_with_NaNs', index=False)

	In [57]:
	new_df = pd.read_csv('Df_with_NaNs', na_values= ['missing', 'not available', 'NA'])
	# we should avoid adding the '#$%' value here as a value might be inappropriate
	# in one column but not in another.
	# eg: 'Male' entered as a value in Age column instead of Gender.

	# Setting new index with the same name
	In [26]:
	df['c3'].isnull()

	Out[26]:
	i0 False
	i1 True
	i2 False
	i3 False
	i4 False
	i5 False
	In [25]:
	# Toy frame with six rows (i0..i5) and four columns (c0..c3). It mixes genuine
	# missing values with placeholder strings ('NA', '#$%', "missing",
	# "not available") so the NaN-handling examples have something to clean.
	df = pd.DataFrame(data=[[1, 2, np.nan, 3],
	# Notice three ways of writing a missing value here: np.nan, None and
	# float('nan'). The np.NaN / np.NAN aliases were removed in NumPy 2.0.
	[np.nan, None, float('nan'), np.nan],
	[4, 5, 'NA', '#$%'],
	[6, 7, np.nan, 8],
	["missing", 9, np.nan, 10],
	[11, 12, np.nan, "not available"]],
	index='i0,i1,i2,i3,i4,i5'.split(','),
	columns='c0,c1,c2,c3'.split(','))