Skip to content

Instantly share code, notes, and snippets.

@makispl
Last active February 22, 2021 05:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save makispl/a89a3043adb439f6548ebfdaed89d323 to your computer and use it in GitHub Desktop.
Save makispl/a89a3043adb439f6548ebfdaed89d323 to your computer and use it in GitHub Desktop.
# Read in the initial dataset
df_init = pd.read_csv('whiskey_data.csv')
display(df_init.shape)
df_init.head()
# Check for nulls
df_init.info()
# Check and drop redundant columns
for col in df_init.columns:
if len(df_init[col].unique()) < 3:
display(df_init[col].unique())
df_init.drop(columns=['Unnamed: 0', 'currency'], inplace=True)
# Find all the rows that the 'price' column contains alphabetical characters
pattern = r'[A-Za-z]'
df_init[df_init['price'].str.contains(pattern)]['price']
# Replace the 33rd row with the 15.000 dollars unit price
df_init.iloc[33,3] = '15000'
# Remove ','
df_init['price'].replace(',', '', inplace = True, regex = True)
# Split on the '/' character and keep the 1st element
df_init['price'] = df_init['price'].map(lambda x: x.split('/')[0])
# Convert to float type
df_init['price'] = df_init['price'].astype('float')
df_init.info()
# Check for duplicated liquors
df_init.duplicated('name').sum()
df_init[df_init.duplicated(subset=['name'], keep=False)].sort_values('name')
# For each duplicate group, calculate the mean and set it on a new column
df_init['m_rating'] = df_init.groupby('name')['rating'].transform('mean')
df_init['m_price'] = df_init.groupby('name')['price'].transform('mean')
# Drop the duplicate columns (keep the first each time)
df_init.drop_duplicates(subset=['name'], keep='first', inplace=True)
# Drop the old columns
df_init.drop(columns=['price', 'rating'], inplace=True)
df_init.shape
# Rename the affected columns
df_init.rename(columns={"m_rating": "rating", "m_price": "price"}, inplace=True)
df_init = df_init[['name', 'category', 'rating', 'price', 'description']]
df_init.head()
# Switch to the final df
df = df_init.copy()
# Extract new features
df['name'] = df['name'].str.replace(' ABV ', '')
df['alcohol'] = df['name'].str.extract(r"([\(\,\,\'\"\’\”\$] ? ?\d+(\.\d+)?%)")[0]
df['alcohol'] = df['alcohol'].str.replace("[^\d\.]", "").astype(float)
df['age'] = df['name'].str.extract(r'(\d+) year')[0].astype(float)
df = df[['name', 'category', 'rating', 'alcohol', 'age', 'price', 'description']]
display(df.shape)
df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment