Skip to content

Instantly share code, notes, and snippets.

View tanveer-sayyed's full-sized avatar

vernal-inertia tanveer-sayyed

  • Spherex
  • Mumbai
View GitHub Profile
In [23]:
# Print the arithmetic mean (sum / count) of each of the three samples,
# in the order: original, mean-filled, proportion-filled.
for sample in (original, replaced_by_mean, replaced_by_proportion):
    print(sum(sample) / len(sample))
Out [23]:
1.8
1.8000000000000003
1.8
In [24]:
# Observed values only (NaNs removed): four 1s, four 2s and two 3s.
original = [1] * 4 + [2] * 4 + [3] * 2
# Same observations followed by five entries filled with the value 1.8.
replaced_by_mean = original + [1.8] * 5
# Same observations followed by five entries in the same 2:2:1 ratio
# as the observed 1s, 2s and 3s.
replaced_by_proportion = original + [1, 1, 2, 2, 3]
# Start a new 10 x 6 figure that will hold the stacked comparison panels.
plt.figure(figsize=(10,6))
# Panel 1 of a 3-row, 1-column grid: horizontal box plot of the observed
# values, with the mean drawn in addition to the median line.
plt.subplot(3, 1, 1)
plt.boxplot(original, showmeans=True, vert=False)
plt.title('original', color='green')
# Make panel 2 of the grid the active axes for the plotting calls that follow.
plt.subplot(3, 1, 2)
In [63]:
# Remove column 'c2' from new_df itself (no copy is returned), then show
# the resulting frame.
new_df.drop(labels='c2', axis='columns', inplace=True)
new_df
Out[63]:
c0 c1 c3
index
i0 1.0 2.0 3.0
i1 5.5 7.0 10.0
In [62]:
# Three equivalent ways of replacing NaNs, one per column.
#
# 1) .fillna() is the usual tool.  The result is assigned back to the column
#    instead of calling .fillna(..., inplace=True) on new_df['c0']: the
#    selected column can be a temporary object, so an in-place fill on it is
#    not guaranteed to reach new_df (pandas warns about this pattern, and
#    with Copy-on-Write enabled it never modifies new_df).
new_df['c0'] = new_df['c0'].fillna(mean_c0)
# 2) .replace() can do the same; again assign the result back rather than
#    mutating the selected column in place.
new_df['c1'] = new_df['c1'].replace(to_replace=np.nan, value=median_c1)
# 3) Label-based indexing: collect the index labels whose 'c3' is null and
#    write the most frequent value (first entry of the mode series) there.
#    A single .loc[rows, col] assignment acts on new_df directly.
null_indexes_in_c3 = new_df.index[new_df['c3'].isnull()]
new_df.loc[null_indexes_in_c3, 'c3'] = mode_c3[0]
In [61]:
# Central-tendency statistics used later to fill the missing values.
# .mean() and .median() each give one float; .mode() gives a Series because
# several values can tie for "most frequent".
mean_c0, median_c1 = new_df['c0'].mean(), new_df['c1'].median()
mode_c3 = new_df['c3'].mode()

# Report the three statistics; for the mode, show its first entry.
for label, value in (
    ('mean_c0 = ', mean_c0),
    ('median_c1 = ', median_c1),
    ('mode_c3[0] = ', mode_c3[0]),
):
    print(label, value)
Out [61]:
In [59]:
# Show the dtype pandas inferred for each column.
print('Data types:\n', new_df.dtypes)

# Hand-editing a cell like this is not a sound way to treat real data; it is
# done here purely so that mean/median/mode can be demonstrated on 'c3'.
new_df.loc['i2', 'c3'] = 10

# Cast 'c3' to float64 explicitly.  Per the original author's note, skipping
# this cast leads to a warning, because the column used to hold the string
# '#$%'.
new_df['c3'] = new_df['c3'].astype(float)
In [58]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
new_df.dropna(axis='index', how='any')
# Oops! Nothing is left: each row has a NaN somewhere, so all rows are dropped.
Out[58]:
c0 c1 c2 c3
index
In [27]:
# Write df to the file 'Df_with_NaNs' without the index column.
df.to_csv(path_or_buf='Df_with_NaNs', index=False)
In [57]:
# Read the file back, additionally treating these strings as NaN.
new_df = pd.read_csv(
    'Df_with_NaNs',
    na_values=['missing', 'not available', 'NA'],
)
# '#$%' is deliberately NOT listed above: a token that is invalid in one
# column may be a perfectly good value in another.
# e.g. 'Male' is wrong under an Age column but valid under Gender.
# Setting new index with the same name
In [26]:
# Boolean mask over column 'c3': True where the value is missing.
df['c3'].isna()
Out[26]:
i0 False
i1 True
i2 False
i3 False
i4 False
i5 False
In [25]:
# Demo frame (rows i0..i5, columns c0..c3) mixing real numbers with several
# representations of "missing": NaN floats, None, and ad-hoc strings such as
# 'NA', 'missing', 'not available' and the junk token '#$%'.
df = pd.DataFrame(
    data=[
        [1, 2, np.nan, 3],
        # Notice 3 different spellings of a missing value in this row:
        # float('nan'), None and np.nan.  (The np.NaN / np.NAN aliases were
        # removed in NumPy 2.0 and raise AttributeError there, so they are
        # not used.)
        [float('nan'), None, np.nan, np.nan],
        [4, 5, 'NA', '#$%'],
        [6, 7, np.nan, 8],
        ["missing", 9, np.nan, 10],
        [11, 12, np.nan, "not available"],
    ],
    index='i0,i1,i2,i3,i4,i5'.split(','),
    columns='c0,c1,c2,c3'.split(','),
)