Skip to content

Instantly share code, notes, and snippets.

@ahmedshahriar
Last active November 16, 2021 19:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahmedshahriar/30225c6df46529941f6a7b02f98886bc to your computer and use it in GitHub Desktop.
Save ahmedshahriar/30225c6df46529941f6a7b02f98886bc to your computer and use it in GitHub Desktop.
A simple function to check for missing values and percentage
# data - pandas dataframe
def missing_value_describe(data):
    """Print a missing-value summary for a DataFrame and chart affected columns.

    Reports how many rows contain at least one missing value and how many
    columns contain missing values. When at least one column is affected, a
    table of per-column counts ("Total") and percentages ("Percent"), ordered
    by count from highest to lowest, is shown, and the counts are drawn as a
    bar chart ordered from lowest to highest.

    Args:
        data: pandas DataFrame to inspect. It is not modified.

    Returns:
        None. All output is printed, displayed, or plotted.
    """
    # `display` only exists as a global inside IPython/Jupyter sessions.
    # Import it explicitly and fall back to print so the report also works
    # in a plain interpreter or script.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    # Scan the frame once and derive every statistic from the same mask.
    null_mask = data.isna()
    null_counts = null_mask.sum()
    # Keep only affected columns and sort explicitly, so the table order does
    # not depend on how pandas aligns differently-ordered indexes.
    null_counts = null_counts[null_counts > 0].sort_values(ascending=False)
    missing_value_col_count = len(null_counts)

    print("Number of rows with at least 1 missing values:", null_mask.any(axis=1).sum())
    print("Number of columns with missing values:", missing_value_col_count)

    if missing_value_col_count == 0:
        print("No missing data!!!")
        return

    # Both series share one index, so concat lines them up row for row.
    # Affected columns imply len(data) > 0, so the division is safe.
    missing_data = pd.concat(
        [null_counts, null_counts / len(data) * 100],
        axis=1,
        keys=['Total', 'Percent'],
    )
    # print out column names with missing value percentage
    print("\nMissing percentage (desceding):")
    display(missing_data)

    # plot missing values, smallest count first
    try:
        null_counts.sort_values().plot.bar()
    except ImportError:
        # pandas raises ImportError when its plotting backend is missing;
        # the textual report above is still complete without the chart.
        print("Bar chart skipped: plotting backend could not be imported.")
# Example usage: print the missing-value report for an existing DataFrame.
# NOTE(review): `df` is not defined in this snippet — it is assumed to be a
# pandas DataFrame created earlier; confirm before running.
missing_value_describe(df)
# Remove duplicate rows, keeping the first occurrence of each
def remove_duplicate(data):
    """Drop duplicate rows from ``data`` in place, keeping first occurrences.

    Args:
        data: pandas DataFrame to de-duplicate. It is mutated directly; the
            surviving rows keep their original index labels.

    Returns:
        The fixed status string "Checked Duplicates".
    """
    status_message = "Checked Duplicates"
    # inplace=True mutates the caller's frame instead of returning a copy.
    data.drop_duplicates(inplace=True, keep="first")
    return status_message
# Example usage: drop duplicate rows from `train` in place; the returned
# status string is discarded here.
# NOTE(review): `train` is not defined in this snippet — it is assumed to be a
# pandas DataFrame created earlier; confirm before running.
remove_duplicate(train)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment