Feature Engineering for Machine Learning in Python - DataCamp
Selecting specific data types
# Keep only the columns of the survey data whose dtype is integer or float
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in so_numeric_df
print(so_numeric_df.columns)
One-hot encoding and dummy variables
# Convert the Country column to a one-hot encoded DataFrame; the generated
# indicator columns carry the 'OH' prefix
one_hot_encoded = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH')
# Print the column names of the one-hot encoded frame
print(one_hot_encoded.columns)
# Create dummy variables for the Country column; drop_first=True leaves out
# the indicator of the first category, and the rest carry the 'DM' prefix
dummy = pd.get_dummies(so_survey_df, columns=['Country'], drop_first=True, prefix='DM')
# Print the column names of the dummy encoded frame
print(dummy.columns)
Dealing with uncommon categories
# Create a series out of the Country column. An explicit copy is taken so the
# relabelling below acts on this series only, instead of relying on whether
# pandas returns a view of so_survey_df (chained assignment)
countries = so_survey_df['Country'].copy()
# Get the counts of each category
country_counts = countries.value_counts()
# Print the count values for each category
print(country_counts)
# Create a mask for only categories that occur less than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)
# Print the top 5 rows in the mask series
print(mask.head())
# Label all the rare categories as Other
countries[mask] = 'Other'
# Print the updated category counts
print(countries.value_counts())
Binarizing columns
# Paid_Job is a 0/1 flag: 1 where ConvertedSalary is greater than zero and
# 0 on every other row (a missing salary compares as False, so it gets 0)
so_survey_df['Paid_Job'] = (so_survey_df['ConvertedSalary'] > 0).astype('int64')
# Show the flag next to the salary it was derived from, first five rows
print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head())
Binning values
# Cut ConvertedSalary into 5 bins chosen by pandas
so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], bins=5)
# Show the bin assigned to each of the first five salaries
print(so_survey_df[['equal_binned', 'ConvertedSalary']].head())
# numpy provides the infinite outer edges used below
import numpy as np
# Explicit bin edges, open-ended at both extremes
bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]
# One label for each interval between consecutive edges
labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']
# Cut ConvertedSalary again, this time at the edges above, naming each bin
so_survey_df['boundary_binned'] = pd.cut(
    so_survey_df['ConvertedSalary'],
    bins=bins,
    labels=labels,
)
# Show the label assigned to each of the first five salaries
print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head())
Finding the missing values
Listwise deletion
# Create a new DataFrame dropping all incomplete rows
no_missing_values_rows = so_survey_df.dropna(how='any')
# Print the shape of the new DataFrame
print(no_missing_values_rows.shape)
# Create a new DataFrame dropping all columns that contain a missing value
no_missing_values_cols = so_survey_df.dropna(how='any', axis=1)
# Print the shape of the new DataFrame
print(no_missing_values_cols.shape)
# Drop all rows where Gender is missing
no_gender = so_survey_df.dropna(subset=['Gender'])
# Print the shape of the new DataFrame
print(no_gender.shape)
Replacing missing values with constants
# Replace missing Gender values with an explicit placeholder category.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selected from the frame is chained
# assignment, which newer pandas warns about and which does not update
# so_survey_df once copy-on-write is active
so_survey_df['Gender'] = so_survey_df['Gender'].fillna(value='Not Given')
# Print the count of each value
print(so_survey_df['Gender'].value_counts())
Filling continuous missing values
# Fill missing values with the column mean. The result is assigned back to
# the frame because fillna(inplace=True) on a selected column is chained
# assignment and is not guaranteed to modify so_survey_df
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].fillna(
    so_survey_df['StackOverflowJobsRecommend'].mean()
)
# Print the first five rows of StackOverflowJobsRecommend column
print(so_survey_df['StackOverflowJobsRecommend'].head())
# Round the StackOverflowJobsRecommend values to whole numbers
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].round()
# Print the top 5 rows
print(so_survey_df['StackOverflowJobsRecommend'].head())
Dealing with stray characters (I)
# Remove the commas from RawSalary
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(',', '', regex=False)
# Remove the dollar signs. regex=False makes '$' a literal character; when
# the pattern is interpreted as a regular expression, '$' is the
# end-of-string anchor and no dollar sign would be removed
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('$', '', regex=False)
Dealing with stray characters (II)
# Attempt to convert the column to numeric values; entries that cannot be
# parsed become NaN instead of raising
numeric_vals = pd.to_numeric(so_survey_df['RawSalary'], errors='coerce')
# Boolean mask of the rows whose conversion produced a missing value
idx = numeric_vals.isna()
# Print the relevant rows
print(so_survey_df.loc[idx, 'RawSalary'])
# Replace the offending characters (treated as a literal, not a regex)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('£', '', regex=False)
# Convert the column to float
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].astype('float')
# Print the column
print(so_survey_df['RawSalary'])
Method chaining
# Use method chaining: strip the commas and currency symbols, then convert
# to float, in a single expression. Parentheses replace the backslash
# continuations (the original chain ended on a dangling backslash), and
# regex=False keeps '$' a literal character rather than a regex anchor
so_survey_df['RawSalary'] = (
    so_survey_df['RawSalary']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.replace('£', '', regex=False)
    .astype('float')
)
# Print the RawSalary column
print(so_survey_df['RawSalary'])
What does your data look like? (I)
# NOTE: these notes list a histogram step here, but record no statement for it
# Box plots of the Age and Years Experience columns
so_numeric_df.loc[:, ['Age', 'Years Experience']].boxplot()
# NOTE: a box plot of ConvertedSalary is listed next, also without a statement
What does your data look like? (II)
# Import the plotting packages: matplotlib's pyplot interface and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
# Plot pairwise relationships
# NOTE(review): no plotting statement is recorded here; presumably a seaborn
# pair plot of the numeric columns was intended - confirm and restore
# Show plot
# NOTE(review): the call that displays the figure is likewise missing
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Instantiate MinMaxScaler
MM_scaler = MinMaxScaler()
# Fit MM_scaler to the Age column; transform() on a scaler that has not been
# fitted raises NotFittedError
MM_scaler.fit(so_numeric_df[['Age']])
# Transform the data using the fitted scaler
so_numeric_df['Age_MM'] = MM_scaler.transform(so_numeric_df[['Age']])
# Compare the original and transformed column
print(so_numeric_df[['Age_MM', 'Age']].head())
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Instantiate StandardScaler
SS_scaler = StandardScaler()
# Fit SS_scaler to the Age column; transform() on a scaler that has not been
# fitted raises NotFittedError
SS_scaler.fit(so_numeric_df[['Age']])
# Transform the data using the fitted scaler
so_numeric_df['Age_SS'] = SS_scaler.transform(so_numeric_df[['Age']])
# Compare the original and transformed column
print(so_numeric_df[['Age_SS', 'Age']].head())
Log transformation
# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer
# Instantiate PowerTransformer
pow_trans = PowerTransformer()
# Train the transform on the ConvertedSalary column; transform() on an
# unfitted transformer raises NotFittedError
pow_trans.fit(so_numeric_df[['ConvertedSalary']])
# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(so_numeric_df[['ConvertedSalary']])
# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
Percentage based outlier removal
# Cutoff: the 0.95 quantile of ConvertedSalary
quantile = so_numeric_df['ConvertedSalary'].quantile(q=0.95)
# Keep only the rows whose salary is strictly below the cutoff
trimmed_df = so_numeric_df.loc[so_numeric_df['ConvertedSalary'] < quantile]
# NOTE: these notes list histograms of the original and of the trimmed data
# at this point, but record no plotting statements for them
Statistical outlier removal
# Centre and spread of ConvertedSalary
mean = so_numeric_df['ConvertedSalary'].mean()
std = so_numeric_df['ConvertedSalary'].std()
# Allowed distance from the mean: three standard deviations
cut_off = 3 * std
lower = mean - cut_off
upper = mean + cut_off
# Keep only the rows strictly inside the (lower, upper) band
trimmed_df = so_numeric_df.loc[
    (so_numeric_df['ConvertedSalary'] > lower)
    & (so_numeric_df['ConvertedSalary'] < upper)
]
# NOTE: these notes list a box plot of the trimmed data at this point, but
# record no plotting statement for it
Train and testing transformations (I)
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Apply a standard scaler to the data
SS_scaler = StandardScaler()
# Fit the standard scaler on the training Age column only, so the scaling
# parameters come from the training data; transform() on an unfitted scaler
# raises NotFittedError
SS_scaler.fit(so_train_numeric[['Age']])
# Transform the test data using the fitted scaler
so_test_numeric['Age_ss'] = SS_scaler.transform(so_test_numeric[['Age']])
# Compare the original and scaled test column
print(so_test_numeric[['Age', 'Age_ss']].head())
Train and testing transformations (II)
# Mean and standard deviation of ConvertedSalary, taken from the training set
train_mean = so_train_numeric['ConvertedSalary'].mean()
train_std = so_train_numeric['ConvertedSalary'].std()
# Allowed distance from the training mean: three standard deviations
cut_off = 3 * train_std
train_lower = train_mean - cut_off
train_upper = train_mean + cut_off
# Trim the test DataFrame with the bounds derived from the training set:
# keep only rows strictly inside (train_lower, train_upper)
trimmed_df = so_test_numeric.loc[
    (so_test_numeric['ConvertedSalary'] > train_lower)
    & (so_test_numeric['ConvertedSalary'] < train_upper)
]
Cleaning up your text
# Replace all non letter characters with a whitespace. regex=True is passed
# explicitly: the pattern is a character class, and pandas versions whose
# str.replace defaults to literal matching would otherwise leave the text
# unchanged
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()
# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())
High level text features
# Number of characters in each cleaned text
speech_df['char_cnt'] = speech_df['text_clean'].str.len()
# Number of whitespace-separated tokens in each cleaned text
speech_df['word_cnt'] = speech_df['text_clean'].str.split(pat=None).str.len()
# Characters per word: character count divided by word count
speech_df['avg_word_length'] = speech_df['char_cnt'].div(speech_df['word_cnt'])
# Display the cleaned text next to its three derived features (every row)
print(speech_df.loc[:, ['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']])
