Skip to content

Instantly share code, notes, and snippets.

@SahilFruitwala
Last active November 19, 2023 13:02
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save SahilFruitwala/2c65b649965cccc0881c6c15b876f577 to your computer and use it in GitHub Desktop.
Save SahilFruitwala/2c65b649965cccc0881c6c15b876f577 to your computer and use it in GitHub Desktop.
Data Cleaning Code
import pandas as pd
"""
Data Loading
"""
# df = pd.read_csv('my_file.csv')
# df = pd.read_csv('my_file.csv', delimiter=',')
# df = pd.read_csv('my_file.csv', delimiter=',', header=None)
df = pd.read_csv('my_file.csv', delimiter=',', header=0, names=['Id','Name', 'Type', 'Price'])
#--------------------------------------------------------------------------------------------------------
"""
Removing Duplicates
"""
# 1. Removes duplicate and returns a copy of dataframe
df = df.drop_duplicates()
# 2. Removes duplicates in place
df = df.drop_duplicates(inplace=True)
# 3. Drops duplicates and keep first/last occurance
df = df.drop_duplicates(inplace=True, keep='last')
# 4. Consider only certain columns for identigying duplicates
df = df.drop_duplicates(subset=['Id', 'Price'], inplace=True, keep='last')
#-----------------------------------------------------------------------------------------------------------
"""
Remove emojis
"""
df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
#-----------------------------------------------------------------------------------------------------------
"""
Convert to lowercase
"""
df['Type'] = df['Type'].str.lower()
df['Name'] = df['Name'].str.lower()
#-----------------------------------------------------------------------------------------------------------
"""
Remove multiple whitespaces, tabs and newlines
"""
df['Type'] = df['Type'].str.replace('\n', '')
df['Type'] = df['Type'].str.replace('\t', ' ')
df['Type'] = df['Type'].str.replace(' {2,}', ' ', regex=True)
df['Type'] = df['Type'].str.strip()
#-----------------------------------------------------------------------------------------------------------
"""
Remove URLs
"""
df['Type'] = df['Type'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
#-----------------------------------------------------------------------------------------------------------
"""
Drop Empty Rows
"""
df.dropna()
df['Type'].astype(bool)
df = df[df['Type'].astype(bool)]
#-----------------------------------------------------------------------------------------------------------
"""
More Data Processing
"""
import numpy as np
df = df.drop(['Id', 'Name'], axis=1)
df = df[df['Type'].str.contains('frozen') | df['Type'].str.contains('green')]
def detect_price(row):
if row['Price'] > 15.50:
return 'High'
elif row['Price'] > 5.50 and row['Price'] <= 15.50:
return 'Medium'
elif row['Price'] > 0.0 and row['Price'] <= 5.50:
return 'Low'
else:
return np.NaN
df['Range'] = df.apply (lambda row: detect_price(row), axis=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment