vernal-inertia tanveer-sayyed

## NaN-1.py
In [12]:
# NaN and None are different objects, so they never compare equal.
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.)
print(np.nan == None)

# NaN is not even equal to itself, so an equality test can never "catch" it
print(np.nan == np.nan)

# None, however, can be matched with == just like the placeholder strings
# ["missing", "not available", "NA"]
print(None == None)

## NaN-2.py
# Two demo series: one built only from missing markers (None and NaN), and one
# that alternates real numbers with NaN.
series_with_all_missing = pd.Series(data=[None] * 2 + [np.nan] * 2)
series_with_2_missing = pd.Series(data=[1, np.nan, 2, np.nan])

## NaN-3.py
In [21]:
# NaN propagates through arithmetic: every position has a missing operand on
# at least one side, so each element of the sum is NaN.
series_with_all_missing + series_with_2_missing

Out[21]:
0   NaN
1   NaN
2   NaN
3   NaN
dtype: float64

## NaN-4.py
In [25]:
# Demo frame mixing genuine missing values with placeholder strings.
# Row i1 deliberately spells "missing" in three different ways: np.nan, None
# and float('nan'). (The np.NaN / np.NAN aliases were removed in NumPy 2.0 and
# raise AttributeError there, so they are not used.)
df = pd.DataFrame(
    data=[
        [1,         2,    np.nan,        3],
        [np.nan,    None, float('nan'),  np.nan],  # notice 3 different NaNs here
        [4,         5,    'NA',          '#$%'],   # placeholder strings, not real NaNs
        [6,         7,    np.nan,        8],
        ["missing", 9,    np.nan,        10],
        [11,        12,   np.nan,        "not available"],
    ],
    index=['i0', 'i1', 'i2', 'i3', 'i4', 'i5'],
    columns=['c0', 'c1', 'c2', 'c3'],
)

## NaN-5.py
In [26]:
# isnull() flags only true missing values (NaN/None); placeholder strings such
# as '#$%' or "not available" are ordinary values and come back False.
df['c3'].isnull()

Out[26]:
i0    False
i1     True
i2    False
i3    False
i4    False
i5    False

## NaN-6.py
In [27]:
# Write the frame to disk. index=False leaves the row labels (i0..i5) out of
# the file, so they are not available when the file is read back.
df.to_csv('Df_with_NaNs', index=False)

In [57]:
# Read the file back, telling pandas which placeholder strings mean "missing".
new_df = pd.read_csv('Df_with_NaNs', na_values=['missing', 'not available', 'NA'])
# We should avoid adding the '#$%' value here, as a value might be
# inappropriate in one column but not in another.
# e.g. 'Male' entered as a value in the Age column instead of Gender.

# Setting new index with the same name.
# The CSV was written with index=False, so the reloaded frame only has a
# default 0..5 integer index. Restore the original row labels so that later
# label-based access such as new_df.loc['i2', 'c3'] addresses the existing row
# instead of appending a new one. The index is named 'index' to match the
# displayed output.
new_df.index = df.index.rename('index')

## NaN-7.py
In [58]:
new_df.dropna()
# Oops! Every row contains at least one NaN (column c2 is missing in all
# rows), so dropping the rows that have any NaN leaves an empty frame.

Out[58]:
	    c0 	c1 	c2 	c3
index

## NaN-8.py
In [59]:
# Show the dtype pandas inferred for each column after reading the CSV.
print('Data types:\n', new_df.dtypes)

# Overwrite row i2 of column c3 with a number.
# (Inventing a value like this is not sound practice; it is done here only so
# that mean/median/mode can be demonstrated on numeric columns.)
new_df.loc['i2', 'c3'] = 10

new_df['c3'] = new_df['c3'].astype('float64') # if skipped, a warning would be raised
                                              # because the column held the string '#$%'.


## NaN-9.py
In [61]:
# Summary statistics that are used afterwards to fill the gaps in each column.
mean_c0, median_c1 = new_df['c0'].mean(), new_df['c1'].median()  # two scalars
mode_c3 = new_df['c3'].mode()  # a Series, because several values can tie

print('mean_c0 = ', mean_c0)
print('median_c1 = ', median_c1)
# Element 0 of the mode Series is the value reported as "the" mode.
print('mode_c3[0] = ', mode_c3[0])

Out [61]:

## NaN-10.py
In [62]:
# Usually the .fillna() method is used to replace NaNs.
# Assign the result back instead of calling inplace=True on new_df['c0']:
# that selection is a temporary object, so under pandas Copy-on-Write an
# in-place fill never reaches new_df (pandas 2.2 already warns about it).
new_df['c0'] = new_df['c0'].fillna(mean_c0)

# We can do the same with the .replace() method (again assigning back)
new_df['c1'] = new_df['c1'].replace(to_replace=np.nan, value=median_c1)

# Also by vectorised indexing: collect the labels of the rows where c3 is
# missing, then assign to exactly those cells through .loc
null_indexes_in_c3 = new_df.index[new_df['c3'].isnull()]
new_df.loc[null_indexes_in_c3, 'c3'] = mode_c3[0]
	In [12]:
	# NaN and None are different objects, so they never compare equal.
	# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
	# and raises AttributeError there.)
	print(np.nan == None)

	# NaN is not even equal to itself, so an equality test can never "catch" it
	print(np.nan == np.nan)

	# None, however, can be matched with == just like the placeholder strings
	# ["missing", "not available", "NA"]
	print(None == None)
	# Two demo series: one built only from missing markers (None and NaN), and one
	# that alternates real numbers with NaN.
	series_with_all_missing = pd.Series(data=[None] * 2 + [np.nan] * 2)
	series_with_2_missing = pd.Series(data=[1, np.nan, 2, np.nan])
	In [21]:
	# NaN propagates through arithmetic: every position has a missing operand on
	# at least one side, so each element of the sum is NaN.
	series_with_all_missing + series_with_2_missing

	Out[21]:
	0 NaN
	1 NaN
	2 NaN
	3 NaN
	dtype: float64
	In [25]:
	# Demo frame mixing genuine missing values with placeholder strings.
	# Row i1 deliberately spells "missing" in three different ways: np.nan, None
	# and float('nan'). (The np.NaN / np.NAN aliases were removed in NumPy 2.0 and
	# raise AttributeError there, so they are not used.)
	df = pd.DataFrame(
	    data=[
	        [1, 2, np.nan, 3],
	        [np.nan, None, float('nan'), np.nan],  # notice 3 different NaNs here
	        [4, 5, 'NA', '#$%'],  # placeholder strings, not real NaNs
	        [6, 7, np.nan, 8],
	        ["missing", 9, np.nan, 10],
	        [11, 12, np.nan, "not available"],
	    ],
	    index=['i0', 'i1', 'i2', 'i3', 'i4', 'i5'],
	    columns=['c0', 'c1', 'c2', 'c3'],
	)
	In [26]:
	# isnull() flags only true missing values (NaN/None); placeholder strings such
	# as '#$%' or "not available" are ordinary values and come back False.
	df['c3'].isnull()

	Out[26]:
	i0 False
	i1 True
	i2 False
	i3 False
	i4 False
	i5 False
	In [27]:
	# Write the frame to disk. index=False leaves the row labels (i0..i5) out of
	# the file, so they are not available when the file is read back.
	df.to_csv('Df_with_NaNs', index=False)

	In [57]:
	# Read the file back, telling pandas which placeholder strings mean "missing".
	new_df = pd.read_csv('Df_with_NaNs', na_values=['missing', 'not available', 'NA'])
	# We should avoid adding the '#$%' value here, as a value might be
	# inappropriate in one column but not in another.
	# e.g. 'Male' entered as a value in the Age column instead of Gender.

	# Setting new index with the same name.
	# The CSV was written with index=False, so the reloaded frame only has a
	# default 0..5 integer index. Restore the original row labels so that later
	# label-based access such as new_df.loc['i2', 'c3'] addresses the existing row
	# instead of appending a new one.
	new_df.index = df.index.rename('index')
	In [59]:
	# Show the dtype pandas inferred for each column after reading the CSV.
	print('Data types:\n', new_df.dtypes)

	# Overwrite row i2 of column c3 with a number.
	# (Inventing a value like this is not sound practice; it is done here only so
	# that mean/median/mode can be demonstrated on numeric columns.)
	new_df.loc['i2', 'c3'] = 10

	new_df['c3'] = new_df['c3'].astype('float64') # if skipped, a warning would be raised
	# because the column held the string '#$%'.
	In [61]:
	# Summary statistics that are used afterwards to fill the gaps in each column.
	mean_c0, median_c1 = new_df['c0'].mean(), new_df['c1'].median()  # two scalars
	mode_c3 = new_df['c3'].mode()  # a Series, because several values can tie

	print('mean_c0 = ', mean_c0)
	print('median_c1 = ', median_c1)
	# Element 0 of the mode Series is the value reported as "the" mode.
	print('mode_c3[0] = ', mode_c3[0])

	Out [61]:
	In [62]:
	# Usually the .fillna() method is used to replace NaNs.
	# Assign the result back instead of calling inplace=True on new_df['c0']:
	# that selection is a temporary object, so under pandas Copy-on-Write an
	# in-place fill never reaches new_df (pandas 2.2 already warns about it).
	new_df['c0'] = new_df['c0'].fillna(mean_c0)

	# We can do the same with the .replace() method (again assigning back)
	new_df['c1'] = new_df['c1'].replace(to_replace=np.nan, value=median_c1)

	# Also by vectorised indexing: collect the labels of the rows where c3 is
	# missing, then assign to exactly those cells through .loc
	null_indexes_in_c3 = new_df.index[new_df['c3'].isnull()]
	new_df.loc[null_indexes_in_c3, 'c3'] = mode_c3[0]