makispl/data_cleaning.py

## data_cleaning.py
# Read in the initial dataset
df_init = pd.read_csv('whiskey_data.csv')
display(df_init.shape)
df_init.head()

# Check for nulls
df_init.info()

# Check and drop redundant columns
for col in df_init.columns:
    if len(df_init[col].unique()) < 3:
            display(df_init[col].unique())
df_init.drop(columns=['Unnamed: 0', 'currency'], inplace=True)

# Find all the rows that the 'price' column contains alphabetical characters
pattern = r'[A-Za-z]'
df_init[df_init['price'].str.contains(pattern)]['price']

# Replace the 33rd row with the 15.000 dollars unit price
df_init.iloc[33,3] = '15000'

# Remove ','
df_init['price'].replace(',', '', inplace = True, regex = True)

# Split on the '/' character and keep the 1st element
df_init['price'] = df_init['price'].map(lambda x: x.split('/')[0])

# Convert to float type
df_init['price'] = df_init['price'].astype('float')
df_init.info()

# Check for duplicated liquors
df_init.duplicated('name').sum()
df_init[df_init.duplicated(subset=['name'], keep=False)].sort_values('name')

# For each duplicate group, calculate the mean and set it on a new column
df_init['m_rating'] = df_init.groupby('name')['rating'].transform('mean')
df_init['m_price'] = df_init.groupby('name')['price'].transform('mean')

# Drop the duplicate columns (keep the first each time)
df_init.drop_duplicates(subset=['name'], keep='first', inplace=True)

# Drop the old columns
df_init.drop(columns=['price', 'rating'], inplace=True)
df_init.shape

# Rename the affected columns
df_init.rename(columns={"m_rating": "rating", "m_price": "price"}, inplace=True)
df_init = df_init[['name', 'category', 'rating', 'price', 'description']]
df_init.head()

# Switch to the final df
df = df_init.copy()

# Extract new features
df['name'] = df['name'].str.replace(' ABV ', '')
df['alcohol'] = df['name'].str.extract(r"([\(\,\,\'\"\’\”\$] ? ?\d+(\.\d+)?%)")[0]
df['alcohol'] = df['alcohol'].str.replace("[^\d\.]", "").astype(float)

df['age'] = df['name'].str.extract(r'(\d+) year')[0].astype(float)

df = df[['name', 'category', 'rating', 'alcohol', 'age', 'price', 'description']]
display(df.shape)
df.head()
	# Read in the initial dataset
	df_init = pd.read_csv('whiskey_data.csv')
	display(df_init.shape)
	df_init.head()

	# Check for nulls
	df_init.info()

	# Check and drop redundant columns
	for col in df_init.columns:
	if len(df_init[col].unique()) < 3:
	display(df_init[col].unique())
	df_init.drop(columns=['Unnamed: 0', 'currency'], inplace=True)

	# Find all the rows that the 'price' column contains alphabetical characters
	pattern = r'[A-Za-z]'
	df_init[df_init['price'].str.contains(pattern)]['price']

	# Replace the 33rd row with the 15.000 dollars unit price
	df_init.iloc[33,3] = '15000'

	# Remove ','
	df_init['price'].replace(',', '', inplace = True, regex = True)

	# Split on the '/' character and keep the 1st element
	df_init['price'] = df_init['price'].map(lambda x: x.split('/')[0])

	# Convert to float type
	df_init['price'] = df_init['price'].astype('float')
	df_init.info()

	# Check for duplicated liquors
	df_init.duplicated('name').sum()
	df_init[df_init.duplicated(subset=['name'], keep=False)].sort_values('name')

	# For each duplicate group, calculate the mean and set it on a new column
	df_init['m_rating'] = df_init.groupby('name')['rating'].transform('mean')
	df_init['m_price'] = df_init.groupby('name')['price'].transform('mean')

	# Drop the duplicate columns (keep the first each time)
	df_init.drop_duplicates(subset=['name'], keep='first', inplace=True)

	# Drop the old columns
	df_init.drop(columns=['price', 'rating'], inplace=True)
	df_init.shape

	# Rename the affected columns
	df_init.rename(columns={"m_rating": "rating", "m_price": "price"}, inplace=True)
	df_init = df_init[['name', 'category', 'rating', 'price', 'description']]
	df_init.head()

	# Switch to the final df
	df = df_init.copy()

	# Extract new features
	df['name'] = df['name'].str.replace(' ABV ', '')
	df['alcohol'] = df['name'].str.extract(r"([\(\,\,\'\"\’\”\$] ? ?\d+(\.\d+)?%)")[0]
	df['alcohol'] = df['alcohol'].str.replace("[^\d\.]", "").astype(float)

	df['age'] = df['name'].str.extract(r'(\d+) year')[0].astype(float)

	df = df[['name', 'category', 'rating', 'alcohol', 'age', 'price', 'description']]
	display(df.shape)
	df.head()