SahilFruitwala/Data_Cleaning.py

## Data_Cleaning.py
import pandas as pd

"""
Data Loading
"""
# df = pd.read_csv('my_file.csv')
# df = pd.read_csv('my_file.csv', delimiter=',')
# df = pd.read_csv('my_file.csv', delimiter=',', header=None)
df = pd.read_csv('my_file.csv', delimiter=',', header=0, names=['Id','Name', 'Type', 'Price'])

#--------------------------------------------------------------------------------------------------------
"""
Removing Duplicates
"""
# 1. Removes duplicate and returns a copy of dataframe
df = df.drop_duplicates()

# 2. Removes duplicates in place
df = df.drop_duplicates(inplace=True)

# 3. Drops duplicates and keep first/last occurance
df = df.drop_duplicates(inplace=True, keep='last')

# 4. Consider only certain columns for identigying duplicates
df = df.drop_duplicates(subset=['Id', 'Price'], inplace=True, keep='last')

#-----------------------------------------------------------------------------------------------------------
"""
Remove emojis
"""
df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

#-----------------------------------------------------------------------------------------------------------
"""
Convert to lowercase
"""
df['Type'] = df['Type'].str.lower()
df['Name'] = df['Name'].str.lower()

#-----------------------------------------------------------------------------------------------------------
"""
Remove multiple whitespaces, tabs and newlines
"""
df['Type'] = df['Type'].str.replace('\n', '')
df['Type'] = df['Type'].str.replace('\t', ' ')
df['Type'] = df['Type'].str.replace(' {2,}', ' ', regex=True)
df['Type'] = df['Type'].str.strip()

#-----------------------------------------------------------------------------------------------------------
"""
Remove URLs
"""
df['Type'] = df['Type'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

#-----------------------------------------------------------------------------------------------------------
"""
Drop Empty Rows
"""
df.dropna()

df['Type'].astype(bool)
df = df[df['Type'].astype(bool)]

#-----------------------------------------------------------------------------------------------------------
"""
More Data Processing
"""
import numpy as np

df = df.drop(['Id', 'Name'], axis=1)
df = df[df['Type'].str.contains('frozen') | df['Type'].str.contains('green')]

def detect_price(row):
	if row['Price'] > 15.50:
		return 'High'
	elif row['Price'] > 5.50 and row['Price'] <= 15.50:
		return 'Medium'
	elif row['Price'] > 0.0 and row['Price'] <= 5.50:
		return 'Low'
	else:
		return np.NaN

df['Range'] = df.apply (lambda row: detect_price(row), axis=1)
	import pandas as pd

	"""
	Data Loading
	"""
	# df = pd.read_csv('my_file.csv')
	# df = pd.read_csv('my_file.csv', delimiter=',')
	# df = pd.read_csv('my_file.csv', delimiter=',', header=None)
	df = pd.read_csv('my_file.csv', delimiter=',', header=0, names=['Id','Name', 'Type', 'Price'])

	#--------------------------------------------------------------------------------------------------------
	"""
	Removing Duplicates
	"""
	# 1. Removes duplicate and returns a copy of dataframe
	df = df.drop_duplicates()

	# 2. Removes duplicates in place
	df = df.drop_duplicates(inplace=True)

	# 3. Drops duplicates and keep first/last occurance
	df = df.drop_duplicates(inplace=True, keep='last')

	# 4. Consider only certain columns for identigying duplicates
	df = df.drop_duplicates(subset=['Id', 'Price'], inplace=True, keep='last')

	#-----------------------------------------------------------------------------------------------------------
	"""
	Remove emojis
	"""
	df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

	#-----------------------------------------------------------------------------------------------------------
	"""
	Convert to lowercase
	"""
	df['Type'] = df['Type'].str.lower()
	df['Name'] = df['Name'].str.lower()

	#-----------------------------------------------------------------------------------------------------------
	"""
	Remove multiple whitespaces, tabs and newlines
	"""
	df['Type'] = df['Type'].str.replace('\n', '')
	df['Type'] = df['Type'].str.replace('\t', ' ')
	df['Type'] = df['Type'].str.replace(' {2,}', ' ', regex=True)
	df['Type'] = df['Type'].str.strip()

	#-----------------------------------------------------------------------------------------------------------
	"""
	Remove URLs
	"""
	df['Type'] = df['Type'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

	#-----------------------------------------------------------------------------------------------------------
	"""
	Drop Empty Rows
	"""
	df.dropna()

	df['Type'].astype(bool)
	df = df[df['Type'].astype(bool)]

	#-----------------------------------------------------------------------------------------------------------
	"""
	More Data Processing
	"""
	import numpy as np

	df = df.drop(['Id', 'Name'], axis=1)
	df = df[df['Type'].str.contains('frozen') \| df['Type'].str.contains('green')]

	def detect_price(row):
	if row['Price'] > 15.50:
	return 'High'
	elif row['Price'] > 5.50 and row['Price'] <= 15.50:
	return 'Medium'
	elif row['Price'] > 0.0 and row['Price'] <= 5.50:
	return 'Low'
	else:
	return np.NaN

	df['Range'] = df.apply (lambda row: detect_price(row), axis=1)