Skip to content

Instantly share code, notes, and snippets.

@nithinivi
Last active March 27, 2019 10:29
Show Gist options
  • Save nithinivi/79071b3512c5f5b5b4f902fec3035099 to your computer and use it in GitHub Desktop.
Save nithinivi/79071b3512c5f5b5b4f902fec3035099 to your computer and use it in GitHub Desktop.
Gender Classifier Generator using TPOT
import time
import numpy as np
import sklearn.metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from tpot import TPOTClassifier
def name_count(name):
    """Encode a lowercase name as a fixed-length numeric feature vector.

    Layout of the returned array (length ``52 + 26 * 26`` == 728):

    * ``[0, 26)``   -- occurrences of each letter ``a``..``z``.
    * ``[26, 52)``  -- sum of the 1-based positions at which each letter occurs.
    * ``[52, 728)`` -- occurrences of each ordered letter pair, stored at
      ``52 + 26 * first + second``.
    * The final three slots are then overwritten with the 1-based alphabet
      index of the last letter, the 1-based alphabet index of the
      second-to-last letter, and the length of the name.  These slots coincide
      with the pair slots for "zx", "zy" and "zz"; the vector length is kept
      at 728 so the output shape stays the same for existing callers.

    Args:
        name: The name as ``bytes`` (decoded as UTF-8) or ``str``.  Only the
            characters ``a``..``z`` contribute to the letter and pair counts;
            any other character is skipped instead of being written to an
            unrelated slot.

    Returns:
        A 1-D ``numpy.ndarray`` of floats with 728 entries.

    Raises:
        TypeError: If ``name`` is neither ``bytes``/``bytearray`` nor ``str``.
    """
    alphabet = 26
    pair_offset = 2 * alphabet

    if isinstance(name, (bytes, bytearray)):
        name = name.decode("utf-8")
    elif not isinstance(name, str):
        raise TypeError(
            "name must be bytes or str, got {}".format(type(name).__name__))

    arr = np.zeros(pair_offset + alphabet * alphabet)
    # Alphabet offset of every character; values outside 0..25 are non-letters.
    codes = [ord(ch) - ord('a') for ch in name]

    def is_letter(code):
        return 0 <= code < alphabet

    # Per-letter count and position-weighted count.
    for position, code in enumerate(codes, start=1):
        if is_letter(code):
            arr[code] += 1
            arr[code + alphabet] += position

    # Adjacent letter pairs: row = first letter, column = second letter.
    for first, second in zip(codes, codes[1:]):
        if is_letter(first) and is_letter(second):
            arr[pair_offset + first * alphabet + second] += 1

    def tail_letter(offset_from_end):
        # 1-based alphabet index of the letter at that distance from the end,
        # or 0 when the name is too short or the character is not a letter.
        if len(codes) < offset_from_end:
            return 0
        code = codes[-offset_from_end]
        return code + 1 if is_letter(code) else 0

    arr[-3] = tail_letter(1)
    arr[-2] = tail_letter(2)
    arr[-1] = len(name)
    return arr
if __name__ == '__main__':
    # Trains a TPOT pipeline that predicts the gender column from name
    # features, then reports timing, test score and the winning pipeline.
    # data => https://www.ssa.gov/oact/babynames/names.zip
    # TODO: read all the data from the names
    my_data = np.genfromtxt(
        "data/yob2014.txt",
        delimiter=",",
        dtype=[('name', "S50"), ('gender', 'S1'), ('count', 'i4')],
        converters={0: lambda s: s.lower()})

    # Keep only rows whose count column is above 20.
    my_data = my_data[my_data['count'] > 20]

    # One feature row per name.
    X = np.array([name_count(name) for name in my_data['name']])
    y = my_data['gender']

    tpot = TPOTClassifier(verbosity=3,
                          scoring="balanced_accuracy",
                          random_state=23,
                          periodic_checkpoint_folder="classifiers_",
                          n_jobs=-1,
                          generations=10,
                          population_size=100)

    # Result accumulators; these must exist before the appends below,
    # otherwise the run fails with NameError right after the fit finishes.
    times = []
    winning_pipes = []
    scores = []

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    start_time = time.time()
    tpot.fit(X_train, y_train)
    elapsed = time.time() - start_time

    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_pipeline.py')

    # Report durations in minutes (loop variable no longer reuses the name
    # of the ``time`` module).
    times = [seconds / 60 for seconds in times]
    print('Times:', times)
    print('Scores:', scores)
    print('Winning pipelines:', winning_pipes)
# Download the names archive (-c resumes a partial download).
wget --header="Host: www.ssa.gov" --header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3" --header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,ml;q=0.7" "https://www.ssa.gov/oact/babynames/names.zip" -O "names.zip" -c
unzip names.zip
# -p: do not fail when data/ already exists from an earlier run.
mkdir -p data/
mv *.txt data/
# Remove the PDF that was unpacked alongside the .txt files.
rm -f NationalReadMe.pdf
pip install numpy tpot scikit-learn xgboost
python main.py
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment