Feature/attribute/input/predictor extraction from a given name string.
def extract_feature(name: str) -> dict:
    """Extract suffix-based features from a name for gender classification.

    Args:
        name: A non-empty name string; case is normalized to upper-case.

    Returns:
        dict with keys 'last_1', 'last_2', 'last_3' (upper-cased suffixes;
        names shorter than the slice just yield the whole name) and
        'last_is_vowel' (True when the final letter is one of AEIOUY).
    """
    name = name.upper()
    return {
        'last_1': name[-1],
        'last_2': name[-2:],
        'last_3': name[-3:],
        'last_is_vowel': name[-1] in 'AEIOUY',
    }
Applying a probability distribution, since one name has two possible outcomes:
male_probability = total_male_count / (total_male_count + total_female_count)
female_probability = 1 - male_probability
No model should assign a probability of exactly 1.0, so it is clamped to 0.99.
def get_probability_distribution(name_tuple):
male_counts = name_tuple[1]
female_counts = name_tuple[2]
male_prob = (male_counts * 1.0) / sum([male_counts, female_counts])
if male_prob == 1.0:
male_prob = 0.99
elif male_prob == 0.00:
male_prob = 0.01
female_prob = 1.0 - male_prob
return male_prob, female_prob
Preparing the feature matrix (X) and response vector (y) for a supervised learning model.
def prepare_data_set():
    """Build a shuffled supervised data set of (features, gender) pairs.

    Loads and splits the raw name records, extracts suffix features for each
    name, attaches the empirical gender probabilities as extra features, and
    shuffles the result so the two classes are interleaved.

    Returns:
        list of (feature_dict, gender) tuples, where gender is 'M' or 'F'.
    """
    feature_set = []
    male_names, female_names = split_names(load_names())
    # .items() yields label and name list together — avoids re-indexing the dict.
    for gender, name_list in {'M': male_names, 'F': female_names}.items():
        for name in name_list:
            # name is a tuple: (name_string, male_count, female_count) —
            # presumably, based on get_probability_distribution's indexing.
            features = extract_feature(name[0])
            male_prob, female_prob = get_probability_distribution(name)
            features['m_prob'] = male_prob
            features['f_prob'] = female_prob
            feature_set.append((features, gender))
    random.shuffle(feature_set)
    return feature_set
If you know pandas, you can get a quick snapshot of the data using the following function.
def validate_data_set(feature_set: list):
    """Print the shape of the feature matrix built from *feature_set*.

    Args:
        feature_set: list of (feature_dict, gender) tuples, as produced by
            prepare_data_set().
    """
    # Imported locally so pandas is only required when this helper is used.
    import pandas as pd
    rows = [{**features, 'gender': gender} for features, gender in feature_set]
    frame = pd.DataFrame(rows)
    print('Feature matrix shape - ', frame.shape)