stephenleo/00_Introducing Boy or Girl - a Machine Learning Web App to Detect Gender fromName.md

## 00_Introducing Boy or Girl - a Machine Learning Web App to Detect Gender fromName.md

      
    Raw
  

              00_Introducing Boy or Girl - a Machine Learning Web App to Detect Gender fromName.md
            
          
    Code for the Medium post
numpy==1.19.5
tensorflow==2.4.1
scikit-learn==0.24.2


## 01_data_query.py
from google.cloud import bigquery
# Create the BigQuery client used to run the query below.
client = bigquery.Client()

# One result row per distinct (name, gender) pair; num_names counts how many
# rows of the public table share that pair.
sql = """
SELECT
  name,
  gender,
  COUNT(name) AS num_names
FROM
  `bigquery-public-data.usa_names.usa_1910_current`
GROUP BY
  name,
  gender
"""

# Submit the query, then pull the result set into a pandas DataFrame.
query_job = client.query(sql)
names_df = query_job.to_dataframe()

# Sanity check: report the dimensions and peek at the first rows.
print(names_df.shape)
names_df.head()

## 02_preprocessing.py
def preprocess(names_df, train=True, name_length=50):
    """Convert raw names (and optionally genders) into numeric model inputs.

    The ``name`` column is lowercased, truncated or right-padded with spaces
    to ``name_length`` characters, and every character is replaced by a float
    code: ``'a'``-``'z'`` become ``1.0``-``26.0`` and any other character
    (padding, digits, punctuation, accented letters, ...) becomes ``0.0``.

    Args:
        names_df: DataFrame with a string ``name`` column and, when ``train``
            is true, a ``gender`` column. It is modified in place.
        train: When true, also encode ``gender`` as ``0.0`` for ``'F'`` and
            ``1.0`` for any other value.
        name_length: Fixed number of character codes produced per name.

    Returns:
        The same ``names_df`` object, with the encoded column(s).
    """
    # Steps 1-4: lowercase, cut/pad to a fixed width, then encode.
    # Only a-z receive a non-zero code. A bare ``ord(char) - 96`` would yield
    # codes above 26 for characters past 'z' (e.g. '{' or 'é'), which do not
    # fit a 27-symbol vocabulary (0 = other, 1-26 = letters).
    names_df['name'] = [
        [
            float(ord(char) - 96) if 'a' <= char <= 'z' else 0.0
            for char in name[:name_length].ljust(name_length)
        ]
        for name in names_df['name'].str.lower()
    ]

    if train:
        # Step 5: Encode Gender to Numbers
        names_df['gender'] = [
            0.0 if gender == 'F' else 1.0
            for gender in names_df['gender']
        ]

    return names_df

# Encode the queried names and genders (train=True by default), then preview
# the first rows of the result.
names_df = preprocess(names_df)
names_df.head()

## 03_model.py
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
    """Build and compile the bidirectional-LSTM binary classifier.

    Args:
        num_alphabets: Size of the embedding vocabulary, i.e. the number of
            distinct character codes the model accepts.
        name_length: Number of character codes in each input name.
        embedding_dim: Width of each character embedding vector.

    Returns:
        A compiled Keras ``Sequential`` model with one sigmoid output unit,
        using binary cross-entropy loss, Adam (learning rate 0.001) and the
        accuracy metric.
    """
    model = Sequential()

    # Character codes -> dense vectors.
    model.add(
        Embedding(
            input_dim=num_alphabets,
            output_dim=embedding_dim,
            input_length=name_length,
        )
    )

    # Read each name in both directions with dropout on inputs and state.
    recurrent = LSTM(units=128, dropout=0.2, recurrent_dropout=0.2)
    model.add(Bidirectional(recurrent))

    # Single probability-like output.
    model.add(Dense(units=1, activation="sigmoid"))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

## 04_train.py
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Build a freshly compiled model
model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)

# Step 2: Turn the encoded columns into arrays and hold out 20% of the rows
X = np.asarray(names_df['name'].values.tolist())
y = np.asarray(names_df['gender'].values.tolist())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Step 3: Fit, stopping once validation accuracy has not improved by at least
# 1e-3 for 5 epochs and rolling back to the best weights seen.
# NOTE: the held-out split above is also what is passed as validation data.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=1e-3,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [early_stopping]

history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)

# Step 4: Persist the trained model to disk
model.save('boyorgirl.h5')

# Step 5: Plot training and validation accuracy per epoch
for history_key, curve_label in (('accuracy', 'train'), ('val_accuracy', 'val')):
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

## 05_inference.py
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np

# Load the model saved by the training step
pred_model = load_model('boyorgirl.h5')

# Names to classify
names = ['Joe', 'Biden', 'Kamala', 'Harris']

# Wrap them in a dataframe and encode them (no gender column at inference)
pred_df = pd.DataFrame({'name': names})
pred_df = preprocess(pred_df, train=False)

# One sigmoid score per name
encoded_names = np.asarray(pred_df['name'].values.tolist())
result = pred_model.predict(encoded_names).squeeze(axis=1)

# Scores above 0.5 are labelled 'Boy', the rest 'Girl'; the reported
# probability is always that of the chosen label.
boy_or_girl = []
probability = []
for score in result:
    if score > 0.5:
        boy_or_girl.append('Boy')
        probability.append(score)
    else:
        boy_or_girl.append('Girl')
        probability.append(1.0 - score)

pred_df['Boy or Girl?'] = boy_or_girl
pred_df['Probability'] = probability

# Format the output: put the readable names back (preprocess replaced them
# with character codes), retitle the column, round, and drop repeated rows.
pred_df['name'] = names
pred_df = pred_df.rename(columns={'name': 'Name'})
pred_df['Probability'] = pred_df['Probability'].round(2)
pred_df = pred_df.drop_duplicates()

pred_df.head()

## 06_streamlit.py
import pandas as pd
import plotly.express as px
import requests
import streamlit as st

# Get user inputs
names = st.text_input(
    "Names", help="Input the names you'd like to check separated with spaces or commas"
)

# Add a submit button
if st.button("Submit"):
    # Nothing to classify: tell the user instead of calling the API.
    if not names.strip():
        st.warning("Please enter at least one name.")
        st.stop()

    # Code to post the user inputs to the API and get the predictions
    # Paste the URL to your API here!
    api_url = "https://name-gender1.p.rapidapi.com/predict"

    headers = {
        "content-type": "application/json",
        "X-RapidAPI-Key": st.secrets["RAPID_API_KEY"],  # Enter your RAPID API Key here
        "X-RapidAPI-Host": "name-gender1.p.rapidapi.com",
    }

    try:
        with st.spinner("🥁 Drumroll..."):
            # The raw text is sent as a single-element list; presumably the
            # API splits it into individual names (verify against the API).
            # The timeout keeps the app from hanging on an unresponsive API.
            response = requests.post(
                api_url, json=[names], headers=headers, timeout=30
            )
        # Surface HTTP errors (bad key, quota, outage) rather than failing
        # later on an unexpected payload.
        response.raise_for_status()
        predictions = response.json()["response"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        st.error(f"Could not get predictions from the API: {err}")
        st.stop()

    predictions_df = pd.DataFrame(predictions)
    predictions_df.columns = ["Name", "Boy or Girl?", "Probability"]
    # Title-case every text column so names and labels display consistently.
    predictions_df = predictions_df.apply(
        lambda x: x.str.title() if x.dtype == "object" else x
    )

    # Horizontal bar per name, coloured by predicted label.
    fig = px.bar(
        predictions_df,
        x="Probability",
        y="Name",
        color="Boy or Girl?",
        orientation="h",
        color_discrete_map={"Boy": "dodgerblue", "Girl": "lightcoral"},
    )

    fig.update_layout(
        title={"text": "Confidence in Prediction", "x": 0.5},
        # Keep the bars in the order the names were returned, top to bottom.
        yaxis={
            "categoryorder": "array",
            "categoryarray": predictions_df["Name"].values.tolist(),
            "autorange": "reversed",
        },
        xaxis={"range": [0, 1]},
        font={"size": 14},
        # width=700
    )

    st.write("Predictions")
    st.dataframe(predictions_df)
    st.plotly_chart(fig, use_container_width=True)
	from google.cloud import bigquery
	# Create the BigQuery client used to run the query below.
	client = bigquery.Client()

	# One result row per distinct (name, gender) pair; num_names counts how many
	# rows of the public table share that pair.
	sql = """
	SELECT
	name,
	gender,
	COUNT(name) AS num_names
	FROM
	`bigquery-public-data.usa_names.usa_1910_current`
	GROUP BY
	name,
	gender
	"""

	# Submit the query, then pull the result set into a pandas DataFrame.
	query_job = client.query(sql)
	names_df = query_job.to_dataframe()

	# Sanity check: report the dimensions and peek at the first rows.
	print(names_df.shape)
	names_df.head()
	def preprocess(names_df, train=True, name_length=50):
	    """Convert raw names (and optionally genders) into numeric model inputs.

	    The ``name`` column is lowercased, truncated or right-padded with spaces
	    to ``name_length`` characters, and every character is replaced by a float
	    code: ``'a'``-``'z'`` become ``1.0``-``26.0`` and any other character
	    (padding, digits, punctuation, accented letters, ...) becomes ``0.0``.

	    Args:
	        names_df: DataFrame with a string ``name`` column and, when ``train``
	            is true, a ``gender`` column. It is modified in place.
	        train: When true, also encode ``gender`` as ``0.0`` for ``'F'`` and
	            ``1.0`` for any other value.
	        name_length: Fixed number of character codes produced per name.

	    Returns:
	        The same ``names_df`` object, with the encoded column(s).
	    """
	    # Steps 1-4: lowercase, cut/pad to a fixed width, then encode.
	    # Only a-z receive a non-zero code. A bare ``ord(char) - 96`` would yield
	    # codes above 26 for characters past 'z' (e.g. '{' or 'é'), which do not
	    # fit a 27-symbol vocabulary (0 = other, 1-26 = letters).
	    names_df['name'] = [
	        [
	            float(ord(char) - 96) if 'a' <= char <= 'z' else 0.0
	            for char in name[:name_length].ljust(name_length)
	        ]
	        for name in names_df['name'].str.lower()
	    ]

	    if train:
	        # Step 5: Encode Gender to Numbers
	        names_df['gender'] = [
	            0.0 if gender == 'F' else 1.0
	            for gender in names_df['gender']
	        ]

	    return names_df

	# Encode the queried names and genders (train=True by default), then preview
	# the first rows of the result.
	names_df = preprocess(names_df)
	names_df.head()
	from tensorflow.keras import Sequential
	from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
	from tensorflow.keras.optimizers import Adam

	def lstm_model(num_alphabets=27, name_length=50, embedding_dim=256):
	    """Build and compile the bidirectional-LSTM binary classifier.

	    Args:
	        num_alphabets: Size of the embedding vocabulary, i.e. the number of
	            distinct character codes the model accepts.
	        name_length: Number of character codes in each input name.
	        embedding_dim: Width of each character embedding vector.

	    Returns:
	        A compiled Keras ``Sequential`` model with one sigmoid output unit,
	        using binary cross-entropy loss, Adam (learning rate 0.001) and the
	        accuracy metric.
	    """
	    model = Sequential([
	        # Character codes -> dense vectors.
	        Embedding(num_alphabets, embedding_dim, input_length=name_length),
	        # Read each name in both directions with dropout on inputs and state.
	        Bidirectional(LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)),
	        # Single probability-like output.
	        Dense(1, activation="sigmoid")
	    ])

	    model.compile(loss='binary_crossentropy',
	                  optimizer=Adam(learning_rate=0.001),
	                  metrics=['accuracy'])

	    return model
	import numpy as np
	from matplotlib import pyplot as plt
	from sklearn.model_selection import train_test_split
	from tensorflow.keras.callbacks import EarlyStopping

	# Step 1: Build a freshly compiled model
	model = lstm_model(num_alphabets=27, name_length=50, embedding_dim=256)

	# Step 2: Turn the encoded columns into arrays and hold out 20% of the rows
	X = np.asarray(names_df['name'].values.tolist())
	y = np.asarray(names_df['gender'].values.tolist())

	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, random_state=0)

	# Step 3: Fit, stopping once validation accuracy has not improved by at least
	# 1e-3 for 5 epochs and rolling back to the best weights seen.
	# NOTE: the held-out split above is also what is passed as validation data.
	early_stopping = EarlyStopping(
	    monitor='val_accuracy',
	    mode='max',
	    min_delta=1e-3,
	    patience=5,
	    restore_best_weights=True,
	    verbose=1,
	)
	callbacks = [early_stopping]

	history = model.fit(
	    X_train,
	    y_train,
	    batch_size=64,
	    epochs=50,
	    validation_data=(X_test, y_test),
	    callbacks=callbacks,
	)

	# Step 4: Persist the trained model to disk
	model.save('boyorgirl.h5')

	# Step 5: Plot training and validation accuracy per epoch
	for history_key, curve_label in (('accuracy', 'train'), ('val_accuracy', 'val')):
	    plt.plot(history.history[history_key], label=curve_label)
	plt.xlabel('Epochs')
	plt.ylabel('Accuracy')
	plt.legend()
	from tensorflow.keras.models import load_model
	import pandas as pd
	import numpy as np

	# Load the model saved by the training step
	pred_model = load_model('boyorgirl.h5')

	# Names to classify
	names = ['Joe', 'Biden', 'Kamala', 'Harris']

	# Wrap them in a dataframe and encode them (no gender column at inference)
	pred_df = pd.DataFrame({'name': names})
	pred_df = preprocess(pred_df, train=False)

	# One sigmoid score per name
	encoded_names = np.asarray(pred_df['name'].values.tolist())
	result = pred_model.predict(encoded_names).squeeze(axis=1)

	# Scores above 0.5 are labelled 'Boy', the rest 'Girl'; the reported
	# probability is always that of the chosen label.
	boy_or_girl = []
	probability = []
	for score in result:
	    if score > 0.5:
	        boy_or_girl.append('Boy')
	        probability.append(score)
	    else:
	        boy_or_girl.append('Girl')
	        probability.append(1.0 - score)

	pred_df['Boy or Girl?'] = boy_or_girl
	pred_df['Probability'] = probability

	# Format the output: put the readable names back (preprocess replaced them
	# with character codes), retitle the column, round, and drop repeated rows.
	pred_df['name'] = names
	pred_df = pred_df.rename(columns={'name': 'Name'})
	pred_df['Probability'] = pred_df['Probability'].round(2)
	pred_df = pred_df.drop_duplicates()

	pred_df.head()
	import pandas as pd
	import plotly.express as px
	import requests
	import streamlit as st

	# Get user inputs
	names = st.text_input(
	    "Names", help="Input the names you'd like to check separated with spaces or commas"
	)

	# Add a submit button
	if st.button("Submit"):
	    # Nothing to classify: tell the user instead of calling the API.
	    if not names.strip():
	        st.warning("Please enter at least one name.")
	        st.stop()

	    # Code to post the user inputs to the API and get the predictions
	    # Paste the URL to your API here!
	    api_url = "https://name-gender1.p.rapidapi.com/predict"

	    headers = {
	        "content-type": "application/json",
	        "X-RapidAPI-Key": st.secrets["RAPID_API_KEY"],  # Enter your RAPID API Key here
	        "X-RapidAPI-Host": "name-gender1.p.rapidapi.com",
	    }

	    try:
	        with st.spinner("🥁 Drumroll..."):
	            # The raw text is sent as a single-element list; presumably the
	            # API splits it into individual names (verify against the API).
	            # The timeout keeps the app from hanging on an unresponsive API.
	            response = requests.post(
	                api_url, json=[names], headers=headers, timeout=30
	            )
	        # Surface HTTP errors (bad key, quota, outage) rather than failing
	        # later on an unexpected payload.
	        response.raise_for_status()
	        predictions = response.json()["response"]
	    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
	        st.error(f"Could not get predictions from the API: {err}")
	        st.stop()

	    predictions_df = pd.DataFrame(predictions)
	    predictions_df.columns = ["Name", "Boy or Girl?", "Probability"]
	    # Title-case every text column so names and labels display consistently.
	    predictions_df = predictions_df.apply(
	        lambda x: x.str.title() if x.dtype == "object" else x
	    )

	    # Horizontal bar per name, coloured by predicted label.
	    fig = px.bar(
	        predictions_df,
	        x="Probability",
	        y="Name",
	        color="Boy or Girl?",
	        orientation="h",
	        color_discrete_map={"Boy": "dodgerblue", "Girl": "lightcoral"},
	    )

	    fig.update_layout(
	        title={"text": "Confidence in Prediction", "x": 0.5},
	        # Keep the bars in the order the names were returned, top to bottom.
	        yaxis={
	            "categoryorder": "array",
	            "categoryarray": predictions_df["Name"].values.tolist(),
	            "autorange": "reversed",
	        },
	        xaxis={"range": [0, 1]},
	        font={"size": 14},
	        # width=700
	    )

	    st.write("Predictions")
	    st.dataframe(predictions_df)
	    st.plotly_chart(fig, use_container_width=True)