# WillKoehrsen/feature_engineering.py

## feature_engineering.py
# Build `features` from `data`: every numeric column, the natural log of each
# numeric column except 'score', and one-hot encodings of the two categorical
# columns.
#
# Reads:   data, np, pd
# Defines: numeric_subset, categorical_subset, features

# Select the numeric columns. Take an explicit copy so the log columns added
# below are written to an independent frame rather than a possible view.
numeric_subset = data.select_dtypes('number').copy()

# Create columns with log of numeric columns. The column list is snapshotted
# up front so the loop visits only the columns that existed before any
# 'log_' column was added.
for col in list(numeric_subset.columns):
    # Skip the Energy Star Score column
    if col == 'score':
        continue
    # NOTE: np.log gives -inf for zeros and NaN for negative values; those
    # results are stored unchanged here.
    numeric_subset['log_' + col] = np.log(numeric_subset[col])

# Select the categorical columns
categorical_subset = data[['Borough', 'Largest Property Use Type']]

# One hot encode
categorical_subset = pd.get_dummies(categorical_subset)

# Join the two dataframes using concat
# Make sure to use axis = 1 to perform a column bind
features = pd.concat([numeric_subset, categorical_subset], axis=1)