Last active
March 27, 2019 10:29
-
-
Save nithinivi/79071b3512c5f5b5b4f902fec3035099 to your computer and use it in GitHub Desktop.
Gender Classifier Generator using TPOT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import numpy as np | |
import sklearn.metrics | |
from sklearn.model_selection import train_test_split, cross_val_score | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn import svm | |
from tpot import TPOTClassifier | |
def name_count(name):
    """Encode a name as a fixed-length numeric feature vector.

    The returned array has ``52 + 26 * 26`` (728) float elements:

    * ``[0, 26)``   -- number of occurrences of each letter ``a``..``z``;
    * ``[26, 52)``  -- sum of the 1-based positions at which each letter
      occurs;
    * ``[52, 728)`` -- number of occurrences of each adjacent letter pair,
      stored at ``52 + 26 * first + second``;
    * the last three elements are finally overwritten with the 1-based
      alphabet index of the last letter, of the second-to-last letter, and
      the length of the name.

    NOTE(review): the three trailing values share their slots with the pair
    counts for "zx", "zy" and "zz". The vector length is kept as it was so
    existing consumers see the same shape.

    The indexing assumes every character is a lower-case ASCII letter;
    other characters are not filtered out.

    Args:
        name: The name, either UTF-8 encoded ``bytes`` (as produced by a
            NumPy ``S`` dtype column) or ``str``.

    Returns:
        A 1-D ``numpy.ndarray`` of 728 floats.

    Raises:
        TypeError: If an element of ``name`` cannot be passed to ``ord``.
    """
    letters = 26
    base = ord('a')
    arr = np.zeros(2 * letters + letters * letters)

    # NumPy byte strings need decoding; a plain str is used as-is.
    if isinstance(name, bytes):
        name = name.decode("utf-8")

    try:
        codes = [ord(ch) - base for ch in name]
    except TypeError as err:
        raise TypeError("bytearray from numpy on iteration ") from err

    # Per-letter counts and position sums.
    for pos, code in enumerate(codes, start=1):
        arr[code] += 1
        arr[code + letters] += pos

    # Adjacent pairs. The second letter is ADDED to the row offset; the
    # previous subtraction made pairs collide with each other and with the
    # per-letter slots.
    for first, second in zip(codes, codes[1:]):
        arr[first * letters + second + 2 * letters] += 1

    # Trailing summary features; names shorter than two letters leave the
    # missing entries at zero instead of raising IndexError.
    if codes:
        arr[-3] = codes[-1] + 1
    if len(codes) > 1:
        arr[-2] = codes[-2] + 1
    arr[-1] = len(codes)
    return arr
if __name__ == '__main__':
    # Train a TPOT pipeline that predicts the gender column from name
    # features, then report the timing, held-out score and winning pipeline.
    # data => https://www.ssa.gov/oact/babynames/names.zip
    # TODO: read all the data from the names
    my_data = np.genfromtxt(
        "data/yob2014.txt",
        delimiter=",",
        dtype=[('name', "S50"), ('gender', 'S1'), ('count', 'i4')],
        converters={0: lambda s: s.lower()})
    # Keep only the rows whose count exceeds 20.
    my_data = my_data[my_data['count'] > 20]

    # Turn every name into its feature vector and stack them into a matrix.
    name_map = np.vectorize(name_count, otypes=[np.ndarray])
    Xlist = name_map(my_data['name'].view("S50"))
    X = np.array(Xlist.tolist())
    y = my_data['gender']

    tpot = TPOTClassifier(verbosity=3,
                          scoring="balanced_accuracy",
                          random_state=23,
                          periodic_checkpoint_folder="classifiers_",
                          n_jobs=-1,
                          generations=10,
                          population_size=100)

    # Result accumulators. They were previously appended to without ever
    # being created, which raised NameError right after the fit finished.
    times = []
    scores = []
    winning_pipes = []

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    start_time = time.time()
    tpot.fit(X_train, y_train)
    elapsed = time.time() - start_time
    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_pipeline.py')

    # Report elapsed time in minutes; the loop variable no longer shadows
    # the ``time`` module.
    times = [seconds / 60 for seconds in times]
    print('Times:', times)
    print('Scores:', scores)
    print('Winning pipelines:', winning_pipes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the SSA baby-names archive (-c resumes a partial download).
wget --header="Host: www.ssa.gov" --header="User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3" --header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8,ml;q=0.7" "https://www.ssa.gov/oact/babynames/names.zip" -O "names.zip" -c
unzip names.zip
# -p keeps a re-run from failing when data/ already exists.
mkdir -p data/
mv *.txt data/
# Drop the bundled read-me (the stray extra "rm" operand was a typo).
rm -f NationalReadMe.pdf
# tpot was listed twice; once is enough.
pip install numpy tpot scikit-learn xgboost
python main.py
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment