Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save stephenleo/f0af3c46661c86e36872a252b97fb942 to your computer and use it in GitHub Desktop.
Save stephenleo/f0af3c46661c86e36872a252b97fb942 to your computer and use it in GitHub Desktop.
[Medium] Boy or Girl? A Machine Learning Web App to Detect Gender from Name

Code for the Medium post

numpy==1.19.5
tensorflow==2.4.1
scikit-learn==0.24.2
from google.cloud import bigquery
# Create the BigQuery client (no explicit project or credentials are passed).
client = bigquery.Client()

# Aggregate the public USA names table to one row per (name, gender) pair.
# NOTE(review): COUNT(name) counts the rows in each (name, gender) group; if
# the table holds several rows per name this is not a total of occurrences --
# confirm that this is the statistic wanted for num_names.
sql = """
SELECT
name,
gender,
COUNT(name) AS num_names
FROM
`bigquery-public-data.usa_names.usa_1910_current`
GROUP BY
name,
gender
"""

# Submit the query, then materialise its result set as a DataFrame.
query_job = client.query(sql)
names_df = query_job.to_dataframe()

# Quick sanity check: dimensions first, then a preview of the leading rows.
print(names_df.shape)
names_df.head()
def preprocess(names_df, train=True, name_length=50):
    """Encode names (and optionally genders) as numbers for the model.

    Each name is lowercased, split into characters, right-padded with spaces
    (or truncated) to exactly ``name_length`` characters, and every character
    is mapped to a float index: ``'a'``-``'z'`` become ``1.0``-``26.0`` and
    every other character (padding, digits, punctuation, accented or
    non-Latin letters) becomes ``0.0``. Keeping all indices within 0-26
    guarantees they fit a 27-entry embedding table.

    Args:
        names_df: DataFrame with a ``name`` column of strings and, when
            ``train`` is true, a ``gender`` column.
        train: When true, the ``gender`` column is also encoded: ``'F'``
            becomes ``0.0`` and any other value becomes ``1.0``.
        name_length: Fixed number of characters each name is padded or
            truncated to.

    Returns:
        The same ``names_df`` object; its columns are overwritten in place.
    """

    def _encode(name):
        # Pad generously, then slice, so short and long names end up equal.
        chars = (list(name) + [' '] * name_length)[:name_length]
        return [
            float(ord(char) - 96) if 'a' <= char <= 'z' else 0.0
            for char in chars
        ]

    # Lowercase first so that 'A' and 'a' share the same index.
    names_df['name'] = [
        _encode(name) for name in names_df['name'].str.lower()
    ]

    if train:
        names_df['gender'] = [
            0.0 if gender == 'F' else 1.0
            for gender in names_df['gender']
        ]

    return names_df
# Encode the training frame; train defaults to True, so the gender column is
# converted to numbers as well.
names_df = preprocess(names_df)
# Preview the first rows of the encoded frame.
names_df.head()
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam
def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
    """Build and compile the character-level bidirectional LSTM classifier.

    Args:
        num_alphabets: Number of distinct character indices the embedding
            layer accepts.
        name_length: Number of character indices per input name (passed to
            the embedding layer as ``input_length``).
        embedding_dim: Size of each character embedding vector.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit,
        trained with binary cross-entropy and reporting accuracy.
    """
    network = Sequential()

    # Character index -> dense vector.
    network.add(
        Embedding(num_alphabets, embedding_dim, input_length=name_length)
    )

    # Read the name in both directions; dropout on inputs and recurrent state.
    recurrent_cell = LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)
    network.add(Bidirectional(recurrent_cell))

    # Single probability-like output in [0, 1].
    network.add(Dense(1, activation="sigmoid"))

    network.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
# Step 1: Instantiate the model
model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)

# Step 2: Split Training and Test Data
# The encoded names become the feature array, the encoded genders the labels.
X = np.asarray(names_df['name'].values.tolist())
y = np.asarray(names_df['gender'].values.tolist())
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Step 3: Train the model
# Stop once validation accuracy has not improved by at least 1e-3 for five
# consecutive epochs, and roll back to the best weights seen.
# NOTE(review): the held-out split doubles as the early-stopping validation
# set, so the reported val accuracy is not an independent test estimate.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=1e-3,
    patience=5,
    mode='max',
    restore_best_weights=True,
    verbose=1,
)
callbacks = [early_stopping]
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)

# Step 4: Save the model
model.save('boyorgirl.h5')

# Step 5: Plot accuracies
for metric_name, curve_label in (('accuracy', 'train'), ('val_accuracy', 'val')):
    plt.plot(history.history[metric_name], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
# Load the model file written by the training step.
pred_model = load_model('boyorgirl.h5')

# Input names
names = ['Joe', 'Biden', 'Kamala', 'Harris']

# Wrap the names in a dataframe and encode them; no gender column exists at
# prediction time, hence train=False.
pred_df = preprocess(pd.DataFrame({'name': names}), train=False)

# Predictions: the model output has a length-1 second axis, which is
# squeezed away to leave one score per name.
encoded_names = np.asarray(pred_df['name'].values.tolist())
result = pred_model.predict(encoded_names).squeeze(axis=1)

# A score above 0.5 is labelled 'Boy', anything else 'Girl'; the reported
# probability is the score of whichever label was chosen.
labels = []
confidences = []
for score in result:
    if score > 0.5:
        labels.append('Boy')
        confidences.append(score)
    else:
        labels.append('Girl')
        confidences.append(1.0 - score)
pred_df['Boy or Girl?'] = labels
pred_df['Probability'] = confidences

# Format the output
# Put the readable names back over the encoded column before presenting.
pred_df['name'] = names
pred_df.rename(columns={'name': 'Name'}, inplace=True)
pred_df['Probability'] = pred_df['Probability'].round(2)
pred_df.drop_duplicates(inplace=True)
pred_df.head()
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
# Get user inputs
names = st.text_input(
    "Names", help="Input the names you'd like to check separated with spaces or commas"
)

# Add a submit button
if st.button("Submit"):
    # Nothing to predict for blank input; avoid a pointless API call.
    if not names.strip():
        st.warning("Please enter at least one name.")
        st.stop()

    # Code to post the user inputs to the API and get the predictions
    # Paste the URL to your API here!
    api_url = "https://name-gender1.p.rapidapi.com/predict"
    headers = {
        "content-type": "application/json",
        "X-RapidAPI-Key": st.secrets["RAPID_API_KEY"],  # Enter your RAPID API Key here
        "X-RapidAPI-Host": "name-gender1.p.rapidapi.com",
    }

    with st.spinner("🥁 Drumroll..."):
        try:
            # A timeout keeps the app from hanging forever on a stalled API.
            response = requests.post(
                api_url, json=[names], headers=headers, timeout=30
            )
            # Surface HTTP errors (bad key, quota, outage) instead of failing
            # later on a missing "response" key.
            response.raise_for_status()
            predictions = response.json()["response"]
        except requests.RequestException as err:
            st.error(f"Could not get predictions from the API: {err}")
            st.stop()
        except (ValueError, KeyError, TypeError):
            # Body was not JSON, or did not have the expected structure.
            st.error("The prediction API returned an unexpected response.")
            st.stop()

        predictions_df = pd.DataFrame(predictions)
        predictions_df.columns = ["Name", "Boy or Girl?", "Probability"]
        # Title-case the text columns for display; numeric columns pass through.
        predictions_df = predictions_df.apply(
            lambda x: x.str.title() if x.dtype == "object" else x
        )

        fig = px.bar(
            predictions_df,
            x="Probability",
            y="Name",
            color="Boy or Girl?",
            orientation="h",
            color_discrete_map={"Boy": "dodgerblue", "Girl": "lightcoral"},
        )
        fig.update_layout(
            title={"text": "Confidence in Prediction", "x": 0.5},
            # Keep the bars in the same top-to-bottom order as the table.
            yaxis={
                "categoryorder": "array",
                "categoryarray": predictions_df["Name"].values.tolist(),
                "autorange": "reversed",
            },
            xaxis={"range": [0, 1]},
            font={"size": 14},
        )

    st.write("Predictions")
    st.dataframe(predictions_df)
    st.plotly_chart(fig, use_container_width=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment