Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save vidit0210/c3ca8454dc1f3d7c65309cc0015b288d to your computer and use it in GitHub Desktop.
Feature Engineering for Machine Learning in Python-dataCamp
----
Selecting specific data types
----
# Create subset of only the numeric columns
# (select_dtypes filters by dtype; 'int'/'float' cover the numeric survey fields)
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in so_survey_df_num
print(so_numeric_df.columns)
---
One-hot encoding and dummy variables
----
# One-hot encode the Country column: one 'OH'-prefixed indicator
# column per country value
one_hot_encoded = pd.get_dummies(so_survey_df, prefix='OH',
                                 columns=['Country'])

# Print the column names
print(one_hot_encoded.columns)
---
# Dummy-encode Country: n-1 'DM'-prefixed indicator columns
# (the first level is dropped as the reference category)
dummy = pd.get_dummies(so_survey_df, prefix='DM', columns=['Country'],
                       drop_first=True)

# Print the column names
print(dummy.columns)
---
Dealing with uncommon categories
----
# Pull the Country column out as a standalone series
countries = so_survey_df['Country']

# Frequency of every category, most common first
country_counts = countries.value_counts()

# Show the per-category counts
print(country_counts)
-----
# Work on the Country column as a series
countries = so_survey_df['Country']
country_counts = countries.value_counts()

# Categories seen fewer than 10 times count as "rare"
rare_categories = country_counts[country_counts < 10].index

# Boolean mask: True where the row's country is rare
mask = countries.isin(rare_categories)

# Inspect the first few mask entries
print(mask.head())
----
# Create a series out of the Country column.
# .copy() keeps the relabelling local to this series instead of writing
# through a view of so_survey_df (avoids SettingWithCopyWarning and
# accidental mutation of the source DataFrame).
countries = so_survey_df['Country'].copy()

# Get the counts of each category
country_counts = countries.value_counts()

# Create a mask for only categories that occur less than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)

# Collapse all rare categories into a single 'Other' bucket
countries[mask] = 'Other'

# Print the updated category counts.
# pd.value_counts() is deprecated (removed in pandas 3.0) — call the
# Series method instead.
print(countries.value_counts())
-----
Binarizing columns
-----
# Binarise salary: Paid_Job is 1 when ConvertedSalary is positive,
# 0 otherwise (missing salaries compare as False, so they become 0)
so_survey_df['Paid_Job'] = (so_survey_df['ConvertedSalary'] > 0).astype(int)

# Print the first five rows of the columns
print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head())
-----
Binning values
-----
# Cut ConvertedSalary into 5 equal-width bins
equal_bins = pd.cut(so_survey_df['ConvertedSalary'], 5)
so_survey_df['equal_binned'] = equal_bins

# Print the first 5 rows of the equal_binned column
print(so_survey_df[['equal_binned', 'ConvertedSalary']].head())
------
# Import numpy
import numpy as np

# Hand-picked salary boundaries (open-ended at both extremes) and the
# label attached to each resulting interval
salary_bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]
salary_labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']

# Bin ConvertedSalary using the explicit boundaries
so_survey_df['boundary_binned'] = pd.cut(
    so_survey_df['ConvertedSalary'], bins=salary_bins, labels=salary_labels
)

# Print the first 5 rows of the boundary_binned column
print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head())
-------
Finding the missing values
-----
# Boolean frame: True marks a missing cell in the first 10 rows
print(sub_df.head(10).isnull())
--
Listwise deletion
----
# Listwise deletion: drop every row containing at least one missing
# value (how='any' is the dropna default)
no_missing_values_rows = so_survey_df.dropna()

# Print the shape of the new DataFrame
print(no_missing_values_rows.shape)
----
# Drop every column that contains at least one missing value
# (how='any' is the default)
no_missing_values_cols = so_survey_df.dropna(axis=1)

# Print the shape of the new DataFrame
print(no_missing_values_cols.shape)
----
# Keep only rows where Gender is present; other columns may still
# contain missing values
no_gender = so_survey_df.dropna(axis=0, subset=['Gender'])

# Print the shape of the new DataFrame
print(no_gender.shape)
----
Replacing missing values with constants
-----
# Replace missing Gender values with an explicit 'Not Given' category.
# Assign the result back instead of using inplace=True on a column
# selection: inplace on a slice relies on chained assignment, which is
# deprecated and silently stops working under pandas Copy-on-Write.
so_survey_df['Gender'] = so_survey_df['Gender'].fillna('Not Given')

# Print the count of each value
print(so_survey_df['Gender'].value_counts())
----
Filling continuous missing values
-----
# Impute missing StackOverflowJobsRecommend values with the column
# mean. Assign back instead of inplace=True on a column selection —
# inplace on a slice is chained assignment, deprecated and unreliable
# under pandas Copy-on-Write.
mean_rec = so_survey_df['StackOverflowJobsRecommend'].mean()
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].fillna(mean_rec)
)

# Print the first five rows of StackOverflowJobsRecommend column
print(so_survey_df['StackOverflowJobsRecommend'].head())
----
# Fill missing values with the column mean; assign the result back
# rather than using inplace=True on a column selection (chained
# assignment — deprecated and a no-op under pandas Copy-on-Write).
mean_rec = so_survey_df['StackOverflowJobsRecommend'].mean()
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].fillna(mean_rec)
)

# Round the imputed values to whole numbers; Series.round is the
# vectorised pandas equivalent of the builtin round()
so_survey_df['StackOverflowJobsRecommend'] = (
    so_survey_df['StackOverflowJobsRecommend'].round()
)

# Print the top 5 rows
print(so_survey_df['StackOverflowJobsRecommend'].head())
-----
Dealing with stray characters (I)
----
# Strip the thousands separators so the salary can later be parsed as a
# number (regex=False: plain literal replacement)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(',', '', regex=False)
----
# Remove dollar signs. regex=False is essential here: under the legacy
# regex=True default, '$' is the end-of-string anchor, so the literal
# dollar characters were never removed.
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('$', '', regex=False)
----
Dealing with stray characters (II)
------
# Coerce RawSalary to numbers; anything unparseable becomes NaN
numeric_vals = pd.to_numeric(so_survey_df['RawSalary'], errors='coerce')

# Rows that failed the conversion show up as NaN in numeric_vals
idx = numeric_vals.isna()

# Inspect the original strings that could not be parsed
print(so_survey_df.loc[idx, 'RawSalary'])
----
# Drop the pound signs (a plain literal replacement)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('£', '', regex=False)

# Cast the cleaned strings to floating point
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].astype(float)

# Print the column
print(so_survey_df['RawSalary'])
----
Method chaining
----
# Strip thousands separators and currency symbols, then cast to float,
# in one method chain. regex=False treats every pattern literally —
# without it '$' is a regex end-of-string anchor and the dollar signs
# would never be removed.
so_survey_df['RawSalary'] = so_survey_df['RawSalary']\
    .str.replace(',', '', regex=False)\
    .str.replace('$', '', regex=False)\
    .str.replace('£', '', regex=False)\
    .astype('float')

# Print the RawSalary column
print(so_survey_df['RawSalary'])
-----
What does your data look like? (I)
----
# Plot a histogram for every numeric column to eyeball each distribution
# (assumes matplotlib.pyplot is already imported as plt)
so_numeric_df.hist()
plt.show()
---
# Side-by-side boxplots of Age and Years Experience to compare spread
# and spot outliers
so_numeric_df[['Age', 'Years Experience']].boxplot()
plt.show()
----
# Boxplot of ConvertedSalary; extreme salaries show up as flier points
so_numeric_df[['ConvertedSalary']].boxplot()
plt.show()
-----
What does your data look like? (II)
----
# Import packages
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise relationships: a scatter plot for each pair of numeric
# columns, with each column's distribution on the diagonal
sns.pairplot(so_numeric_df)
# Show plot
plt.show()
----
Normalization
----
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Fit and apply the scaler in one step: Age is rescaled into [0, 1]
MM_scaler = MinMaxScaler()
so_numeric_df['Age_MM'] = MM_scaler.fit_transform(so_numeric_df[['Age']])

# Compare the original and transformed column
print(so_numeric_df[['Age_MM', 'Age']].head())
------
Standardization
------
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Standardise Age to zero mean / unit variance; fit_transform combines
# the separate fit and transform calls
SS_scaler = StandardScaler()
so_numeric_df['Age_SS'] = SS_scaler.fit_transform(so_numeric_df[['Age']])

# Compare the original and transformed column
print(so_numeric_df[['Age_SS', 'Age']].head())
----
Log transformation
----
# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer

# Learn and apply a power transform (Yeo-Johnson by default) to reduce
# the skew of the salary distribution
pow_trans = PowerTransformer()
so_numeric_df['ConvertedSalary_LG'] = pow_trans.fit_transform(
    so_numeric_df[['ConvertedSalary']]
)

# Histograms of the column before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
-----
Percentage based outlier removal
----
# 95th percentile of salary — everything at or above is treated as an
# outlier
salary_cap = so_numeric_df['ConvertedSalary'].quantile(0.95)

# Keep only the rows strictly below the cap
trimmed_df = so_numeric_df[so_numeric_df['ConvertedSalary'] < salary_cap]

# The original histogram
so_numeric_df[['ConvertedSalary']].hist()
plt.show()
plt.clf()

# The trimmed histogram
trimmed_df[['ConvertedSalary']].hist()
plt.show()
-----
Statistical outlier removal
-----
# Mean and standard deviation of the salary column
mean = so_numeric_df['ConvertedSalary'].mean()
std = so_numeric_df['ConvertedSalary'].std()

# Keep values strictly within three standard deviations of the mean
cut_off = 3 * std
lower, upper = mean - cut_off, mean + cut_off
within_bounds = (so_numeric_df['ConvertedSalary'] > lower) & \
                (so_numeric_df['ConvertedSalary'] < upper)
trimmed_df = so_numeric_df[within_bounds]

# The trimmed box plot
trimmed_df[['ConvertedSalary']].boxplot()
plt.show()
------
Train and testing transformations (I)
-----
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the TRAINING data only, then reuse those learned
# parameters on the test data — never fit on the test set
SS_scaler = StandardScaler().fit(so_train_numeric[['Age']])
so_test_numeric['Age_ss'] = SS_scaler.transform(so_test_numeric[['Age']])
print(so_test_numeric[['Age', 'Age_ss']].head())
------
Train and testing transformations (II)
------
# Outlier bounds are derived from the TRAINING distribution only
train_mean = so_train_numeric['ConvertedSalary'].mean()
train_std = so_train_numeric['ConvertedSalary'].std()
cut_off = 3 * train_std
train_lower = train_mean - cut_off
train_upper = train_mean + cut_off

# Apply the train-derived bounds when trimming the test DataFrame
in_range = (so_test_numeric['ConvertedSalary'] > train_lower) & \
           (so_test_numeric['ConvertedSalary'] < train_upper)
trimmed_df = so_test_numeric[in_range]
-------
Cleaning up your text
------
# Replace every non-letter character with a space. regex=True must be
# explicit: pandas >= 2.0 treats str.replace patterns literally by
# default, which would leave the character class '[^a-zA-Z]' unmatched.
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)

# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()

# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())
----
High level text features
----
# Character count of each cleaned speech
speech_df['char_cnt'] = speech_df['text_clean'].str.len()

# Word count: split on whitespace and count the tokens
speech_df['word_cnt'] = speech_df['text_clean'].str.split().str.len()

# Average word length (characters per word, whitespace included)
speech_df['avg_word_length'] = speech_df['char_cnt'] / speech_df['word_cnt']

# Print the first 5 rows of these columns — the original printed the
# whole frame despite the comment, so take .head() explicitly
print(speech_df[['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']].head())
-----
Note: the last remaining section of the course covers text-based features and is not transcribed here.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment