Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save vidit0210/c3ca8454dc1f3d7c65309cc0015b288d to your computer and use it in GitHub Desktop.
Feature Engineering for Machine Learning in Python-dataCamp
----
Selecting specific data types
----
# Create subset of only the numeric columns
# (select_dtypes filters by dtype; 'int'/'float' cover the numeric survey fields)
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in so_survey_df_num
print(so_numeric_df.columns)
---
One-hot encoding and dummy variables
----
# One-hot encode the Country column: one 'OH'-prefixed indicator
# column per country value
one_hot_encoded = pd.get_dummies(so_survey_df, prefix='OH',
                                 columns=['Country'])

# Print the column names
print(one_hot_encoded.columns)
---
# Dummy-encode Country: n-1 'DM'-prefixed indicator columns
# (the first level is dropped as the reference category)
dummy = pd.get_dummies(so_survey_df, prefix='DM', columns=['Country'],
                       drop_first=True)

# Print the column names
print(dummy.columns)
---
Dealing with uncommon categories
----
# Pull the Country column out as a standalone series
countries = so_survey_df['Country']

# Frequency of every category, most common first
country_counts = countries.value_counts()

# Show the per-category counts
print(country_counts)
-----
# Work on the Country column as a series
countries = so_survey_df['Country']
country_counts = countries.value_counts()

# Categories seen fewer than 10 times count as "rare"
rare_categories = country_counts[country_counts < 10].index

# Boolean mask: True where the row's country is rare
mask = countries.isin(rare_categories)

# Inspect the first few mask entries
print(mask.head())
----
# Create a series out of the Country column.
# .copy() keeps the relabelling local to this series instead of writing
# through a view of so_survey_df (avoids SettingWithCopyWarning and
# accidental mutation of the source DataFrame).
countries = so_survey_df['Country'].copy()

# Get the counts of each category
country_counts = countries.value_counts()

# Create a mask for only categories that occur less than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)

# Collapse all rare categories into a single 'Other' bucket
countries[mask] = 'Other'

# Print the updated category counts.
# pd.value_counts() is deprecated (removed in pandas 3.0) — call the
# Series method instead.
print(countries.value_counts())
-----
Binarizing columns
-----
# Binarise salary: Paid_Job is 1 when ConvertedSalary is positive,
# 0 otherwise (missing salaries compare as False, so they become 0)
so_survey_df['Paid_Job'] = (so_survey_df['ConvertedSalary'] > 0).astype(int)

# Print the first five rows of the columns
print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head())
-----
Binning values
-----
# Cut ConvertedSalary into 5 equal-width bins
equal_bins = pd.cut(so_survey_df['ConvertedSalary'], 5)
so_survey_df['equal_binned'] = equal_bins

# Print the first 5 rows of the equal_binned column
print(so_survey_df[['equal_binned', 'ConvertedSalary']].head())
------
# Import numpy
import numpy as np

# Hand-picked salary boundaries (open-ended at both extremes) and the
# label attached to each resulting interval
salary_bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]
salary_labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']

# Bin ConvertedSalary using the explicit boundaries
so_survey_df['boundary_binned'] = pd.cut(
    so_survey_df['ConvertedSalary'], bins=salary_bins, labels=salary_labels
)

# Print the first 5 rows of the boundary_binned column
print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head())
-------
Finding the missing values
-----
# Boolean frame: True marks a missing cell in the first 10 rows
print(sub_df.head(10).isnull())
--
Listwise deletion
----
# Listwise deletion: drop every row containing at least one missing
# value (how='any' is the dropna default)
no_missing_values_rows = so_survey_df.dropna()

# Print the shape of the new DataFrame
print(no_missing_values_rows.shape)
----
# Drop every column that contains at least one missing value
# (how='any' is the default)
no_missing_values_cols = so_survey_df.dropna(axis=1)

# Print the shape of the new DataFrame
print(no_missing_values_cols.shape)
----
# Keep only rows where Gender is present; other columns may still
# contain missing values
no_gender = so_survey_df.dropna(axis=0, subset=['Gender'])

# Print the shape of the new DataFrame
print(no_gender.shape)
----
Replacing missing values with constants
-----
# Replace missing Gender values with an explicit 'Not Given' category.
# Assign the result back instead of using inplace=True on a column
# selection: inplace on a slice relies on chained assignment, which is
# deprecated and silently stops working under pandas Copy-on-Write.
so_survey_df['Gender'] = so_survey_df['Gender'].fillna('Not Given')

# Print the count of each value
print(so_survey_df['Gender'].value_counts())
----
Filling continuous missing values
-----
# Impute missing StackOverflowJobsRecommend values with the column
# mean. Assign back instead of inplace=True on a column selection —
# inplace on a slice is chained assignment, deprecated and unreliable
# under pandas Copy-on-Write.
mean_rec = so_survey_df['StackOverflowJobsRecommend'].mean()
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].fillna(mean_rec)
)

# Print the first five rows of StackOverflowJobsRecommend column
print(so_survey_df['StackOverflowJobsRecommend'].head())
----
# Fill missing values with the column mean; assign the result back
# rather than using inplace=True on a column selection (chained
# assignment — deprecated and a no-op under pandas Copy-on-Write).
mean_rec = so_survey_df['StackOverflowJobsRecommend'].mean()
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].fillna(mean_rec)
)

# Round the imputed values to whole numbers; Series.round is the
# vectorised pandas equivalent of the builtin round()
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].round()
)

# Print the top 5 rows
print(so_survey_df['StackOverflowJobsRecommend'].head())
-----
Dealing with stray characters (I)
----
# Strip the thousands separators so the salary can later be parsed as a
# number (regex=False: plain literal replacement)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(',', '', regex=False)
----
# Remove dollar signs. regex=False is essential here: under the legacy
# regex=True default, '$' is the end-of-string anchor, so the literal
# dollar characters were never removed.
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('$', '', regex=False)
----
Dealing with stray characters (II)
------
# Coerce RawSalary to numbers; anything unparseable becomes NaN
numeric_vals = pd.to_numeric(so_survey_df['RawSalary'], errors='coerce')

# Rows that failed the conversion show up as NaN in numeric_vals
idx = numeric_vals.isna()

# Inspect the original strings that could not be parsed
print(so_survey_df.loc[idx, 'RawSalary'])
----
# Drop the pound signs (a plain literal replacement)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('£', '', regex=False)

# Cast the cleaned strings to floating point
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].astype(float)

# Print the column
print(so_survey_df['RawSalary'])
----
Method chaining
----
# Strip thousands separators and currency symbols, then cast to float,
# in one method chain. regex=False treats every pattern literally —
# without it '$' is a regex end-of-string anchor and the dollar signs
# would never be removed.
so_survey_df['RawSalary'] = so_survey_df['RawSalary']\
    .str.replace(',', '', regex=False)\
    .str.replace('$', '', regex=False)\
    .str.replace('£', '', regex=False)\
    .astype('float')

# Print the RawSalary column
print(so_survey_df['RawSalary'])
-----
What does your data look like? (I)
----
# Plot a histogram for every numeric column to eyeball each distribution
# (assumes matplotlib.pyplot is already imported as plt)
so_numeric_df.hist()
plt.show()
---
# Side-by-side boxplots of Age and Years Experience to compare spread
# and spot outliers
so_numeric_df[['Age', 'Years Experience']].boxplot()
plt.show()
----
# Boxplot of ConvertedSalary; extreme salaries show up as flier points
so_numeric_df[['ConvertedSalary']].boxplot()
plt.show()
-----
What does your data look like? (II)
----
# Import packages
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise relationships: a scatter plot for each pair of numeric
# columns, with each column's distribution on the diagonal
sns.pairplot(so_numeric_df)
# Show plot
plt.show()
----
Normalization
----
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Fit and apply the scaler in one step: Age is rescaled into [0, 1]
MM_scaler = MinMaxScaler()
so_numeric_df['Age_MM'] = MM_scaler.fit_transform(so_numeric_df[['Age']])

# Compare the original and transformed column
print(so_numeric_df[['Age_MM', 'Age']].head())
------
Standardization
------
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Standardise Age to zero mean / unit variance; fit_transform combines
# the separate fit and transform calls
SS_scaler = StandardScaler()
so_numeric_df['Age_SS'] = SS_scaler.fit_transform(so_numeric_df[['Age']])

# Compare the original and transformed column
print(so_numeric_df[['Age_SS', 'Age']].head())
----
Log transformation
----
# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Learn and apply a power transform (Yeo-Johnson by default) to reduce
# the skew of the salary distribution
pow_trans = PowerTransformer()
so_numeric_df['ConvertedSalary_LG'] = pow_trans.fit_transform(
    so_numeric_df[['ConvertedSalary']]
)

# Histograms of the column before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
-----
Percentage based outlier removal
----
# 95th percentile of salary — everything at or above is treated as an
# outlier
salary_cap = so_numeric_df['ConvertedSalary'].quantile(0.95)

# Keep only the rows strictly below the cap
trimmed_df = so_numeric_df[so_numeric_df['ConvertedSalary'] < salary_cap]

# The original histogram
so_numeric_df[['ConvertedSalary']].hist()
plt.show()
plt.clf()

# The trimmed histogram
trimmed_df[['ConvertedSalary']].hist()
plt.show()
-----
Statistical outlier removal
-----
# Mean and standard deviation of the salary column
mean = so_numeric_df['ConvertedSalary'].mean()
std = so_numeric_df['ConvertedSalary'].std()

# Keep values strictly within three standard deviations of the mean
cut_off = 3 * std
lower, upper = mean - cut_off, mean + cut_off
within_bounds = (so_numeric_df['ConvertedSalary'] > lower) & \
                (so_numeric_df['ConvertedSalary'] < upper)
trimmed_df = so_numeric_df[within_bounds]

# The trimmed box plot
trimmed_df[['ConvertedSalary']].boxplot()
plt.show()
------
Train and testing transformations (I)
-----
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the TRAINING data only, then reuse those learned
# parameters on the test data — never fit on the test set
SS_scaler = StandardScaler().fit(so_train_numeric[['Age']])
so_test_numeric['Age_ss'] = SS_scaler.transform(so_test_numeric[['Age']])
print(so_test_numeric[['Age', 'Age_ss']].head())
------
Train and testing transformations (II)
------
# Outlier bounds are derived from the TRAINING distribution only
train_mean = so_train_numeric['ConvertedSalary'].mean()
train_std = so_train_numeric['ConvertedSalary'].std()
cut_off = 3 * train_std
train_lower = train_mean - cut_off
train_upper = train_mean + cut_off

# Apply the train-derived bounds when trimming the test DataFrame
in_range = (so_test_numeric['ConvertedSalary'] > train_lower) & \
           (so_test_numeric['ConvertedSalary'] < train_upper)
trimmed_df = so_test_numeric[in_range]
-------
Cleaning up your text
------
# Replace every non-letter character with a space. regex=True must be
# explicit: pandas >= 2.0 treats str.replace patterns literally by
# default, which would leave the character class '[^a-zA-Z]' unmatched.
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)

# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()

# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())
----
High level text features
----
# Character count of each cleaned speech
speech_df['char_cnt'] = speech_df['text_clean'].str.len()

# Word count: split on whitespace and count the tokens
speech_df['word_cnt'] = speech_df['text_clean'].str.split().str.len()

# Average word length (characters per word, whitespace included)
speech_df['avg_word_length'] = speech_df['char_cnt'] / speech_df['word_cnt']

# Print the first 5 rows of these columns — the original printed the
# whole frame despite the comment, so take .head() explicitly
print(speech_df[['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']].head())
-----
Note: the last remaining section of the course covers text-based features and is not transcribed here.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment