Code for the medium post
numpy==1.19.5
tensorflow==2.4.1
scikit-learn==0.24.2
Code for the medium post
numpy==1.19.5
tensorflow==2.4.1
scikit-learn==0.24.2
from google.cloud import bigquery | |
# Connect to BigQuery using the environment's default credentials.
client = bigquery.Client()

# One row per distinct (name, gender) pair found in the public USA names
# table, together with how many table rows carry that pair.
sql = """
    SELECT
        name,
        gender,
        COUNT(name) AS num_names
    FROM
        `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY
        name,
        gender
"""

# Run the query, then download the full result set as a pandas DataFrame.
query_job = client.query(sql)
names_df = query_job.to_dataframe()

print(names_df.shape)
names_df.head()
def _encode_name(name, name_length):
    """Return `name` as exactly `name_length` float character codes."""
    codes = []
    for char in name[:name_length]:
        code = ord(char) - 96
        # Only 'a'..'z' get a code of their own (1..26). Everything else
        # (spaces, digits, punctuation, non-ASCII letters) shares code 0 so
        # that no value can fall outside a 27-symbol vocabulary.
        codes.append(float(code) if 1 <= code <= 26 else 0.0)
    # Right-pad short names with the same code a space would receive.
    codes.extend([0.0] * (name_length - len(codes)))
    return codes


def preprocess(names_df, train=True, name_length=50):
    """Encode names (and optionally genders) as numbers.

    Each name is lowercased, truncated or right-padded to `name_length`
    characters, and turned into a list of floats: 'a'..'z' become 1.0..26.0
    and every other character becomes 0.0.

    Args:
        names_df: DataFrame with a string 'name' column and, when `train`
            is true, a 'gender' column.
        train: When true, also encode 'gender' as 0.0 for 'F' and 1.0 for
            any other value.
        name_length: Number of character codes produced per name.

    Returns:
        A new DataFrame with the encoded columns. The DataFrame passed in
        is not modified.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    names_df = names_df.copy()

    # Steps 1-4: lowercase, split into characters, pad/truncate, encode.
    names_df['name'] = [
        _encode_name(name, name_length)
        for name in names_df['name'].str.lower()
    ]

    if train:
        # Step 5: encode gender to numbers.
        names_df['gender'] = [
            0.0 if gender == 'F' else 1.0
            for gender in names_df['gender']
        ]

    return names_df
# Replace the raw query result with its encoded form (names and genders).
names_df = preprocess(names_df, train=True)
names_df.head()
from tensorflow.keras import Sequential | |
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense | |
from tensorflow.keras.optimizers import Adam | |
def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
    """Build and compile the bidirectional-LSTM name classifier.

    Args:
        num_alphabets: Size of the embedding vocabulary.
        name_length: Number of character codes in each input sequence.
        embedding_dim: Width of the embedding vector for each character.

    Returns:
        A compiled Keras `Sequential` model with a single sigmoid output.
    """
    model = Sequential()

    # Character codes -> dense vectors.
    model.add(Embedding(num_alphabets, embedding_dim, input_length=name_length))

    # Read the sequence in both directions, with dropout on the inputs and
    # on the recurrent state.
    recurrent = LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)
    model.add(Bidirectional(recurrent))

    # One sigmoid unit for the binary label.
    model.add(Dense(1, activation="sigmoid"))

    optimizer = Adam(learning_rate=0.001)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    return model
import numpy as np | |
from matplotlib import pyplot as plt | |
from sklearn.model_selection import train_test_split | |
from tensorflow.keras.callbacks import EarlyStopping | |
# Step 1: Build a freshly compiled network
model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)

# Step 2: Turn the encoded columns into arrays and hold out 20% of the rows
X = np.asarray(names_df['name'].values.tolist())
y = np.asarray(names_df['gender'].values.tolist())
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Step 3: Train. The held-out split doubles as the validation data that
# early stopping monitors: training stops once validation accuracy has not
# improved by at least 1e-3 for 5 epochs, and the best weights are restored.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=1e-3,
    patience=5,
    mode='max',
    restore_best_weights=True,
    verbose=1,
)
callbacks = [early_stopping]
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)

# Step 4: Persist the trained model to disk
model.save('boyorgirl.h5')

# Step 5: Plot training and validation accuracy per epoch
for metric, label in (('accuracy', 'train'), ('val_accuracy', 'val')):
    plt.plot(history.history[metric], label=label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
from tensorflow.keras.models import load_model | |
import pandas as pd | |
import numpy as np | |
# Load the model saved by the training step.
pred_model = load_model('boyorgirl.h5')

# Names to classify
names = ['Joe', 'Biden', 'Kamala', 'Harris']

# Encode the names the same way as the training data (no gender column here).
pred_df = preprocess(pd.DataFrame({'name': names}), train=False)

# The model returns one value per name in a column; flatten it to 1-D.
encoded_names = np.asarray(pred_df['name'].values.tolist())
result = pred_model.predict(encoded_names).squeeze(axis=1)

# Outputs above 0.5 are labelled 'Boy', the rest 'Girl'. The reported
# probability is always the one belonging to the chosen label.
labels = []
confidences = []
for prob in result:
    if prob > 0.5:
        labels.append('Boy')
        confidences.append(prob)
    else:
        labels.append('Girl')
        confidences.append(1.0 - prob)
pred_df['Boy or Girl?'] = labels
pred_df['Probability'] = confidences

# Format the output: put the readable names back, tidy the column name,
# round the probabilities and drop repeated rows.
pred_df['name'] = names
pred_df = pred_df.rename(columns={'name': 'Name'})
pred_df['Probability'] = pred_df['Probability'].round(2)
pred_df = pred_df.drop_duplicates()
pred_df.head()
import pandas as pd | |
import plotly.express as px | |
import requests | |
import streamlit as st | |
# Get user inputs
names = st.text_input(
    "Names", help="Input the names you'd like to check separated with spaces or commas"
)

# Add a submit button
if st.button("Submit"):
    if not names.strip():
        # Nothing to classify: do not spend an API call on empty input.
        st.warning("Please enter at least one name.")
    else:
        # Paste the URL to your API here!
        api_url = "https://name-gender1.p.rapidapi.com/predict"
        headers = {
            "content-type": "application/json",
            "X-RapidAPI-Key": st.secrets["RAPID_API_KEY"],  # Enter your RAPID API Key here
            "X-RapidAPI-Host": "name-gender1.p.rapidapi.com",
        }

        try:
            with st.spinner("🥁 Drumroll..."):
                # The timeout keeps the app from hanging forever if the API
                # never answers; raise_for_status turns HTTP error codes
                # into exceptions instead of parsing an error body.
                response = requests.post(
                    api_url, json=[names], headers=headers, timeout=30
                )
                response.raise_for_status()
                predictions = response.json()["response"]
        except (requests.RequestException, KeyError, TypeError, ValueError) as err:
            # Network failure, HTTP error, non-JSON body or a payload without
            # the "response" key: report it instead of showing a traceback.
            st.error(f"Could not get predictions from the API: {err}")
        else:
            predictions_df = pd.DataFrame(predictions)
            predictions_df.columns = ["Name", "Boy or Girl?", "Probability"]
            # Title-case the text columns; leave non-text columns untouched.
            predictions_df = predictions_df.apply(
                lambda x: x.str.title() if x.dtype == "object" else x
            )

            # Horizontal bar per name, coloured by the predicted label.
            fig = px.bar(
                predictions_df,
                x="Probability",
                y="Name",
                color="Boy or Girl?",
                orientation="h",
                color_discrete_map={"Boy": "dodgerblue", "Girl": "lightcoral"},
            )
            fig.update_layout(
                title={"text": "Confidence in Prediction", "x": 0.5},
                # Keep the bars in the same top-to-bottom order as the table.
                yaxis={
                    "categoryorder": "array",
                    "categoryarray": predictions_df["Name"].values.tolist(),
                    "autorange": "reversed",
                },
                xaxis={"range": [0, 1]},
                font={"size": 14},
            )

            st.write("Predictions")
            st.dataframe(predictions_df)
            st.plotly_chart(fig, use_container_width=True)