-
-
Save AkiyonKS/a8a44650a2a9d739ed05613b94dcac89 to your computer and use it in GitHub Desktop.
train model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import numpy as np | |
import pandas as pd | |
import os | |
import glob | |
import cv2 | |
import datetime | |
import time | |
import itertools | |
from imblearn.under_sampling import RandomUnderSampler | |
from imblearn.over_sampling import RandomOverSampler | |
from tensorflow import keras | |
from keras.layers import Dense, Dropout, Flatten, Input | |
from keras.applications.vgg16 import VGG16 | |
from keras.models import Model, Sequential | |
from keras import optimizers | |
from keras.utils import to_categorical | |
from keras.utils import Sequence | |
# ------------------------------- | |
# ここから関数定義 | |
# データフレームの行を辞書型に変換 | |
def convert_df_row_to_dic(df_row):
    """Copy one DataFrame row into a plain dictionary.

    Args:
        df_row: Object exposing ``keys()`` and item access by key, for
            example the ``pandas.Series`` returned by ``DataFrame.iloc[i]``.

    Returns:
        dict: A new dictionary mapping every key of ``df_row`` to its value.
    """
    return {name: df_row[name] for name in df_row.keys()}
# モデル訓練に使う画像の枚数情報を取得 | |
def fetch_n_photos(params):
    """Split the requested photo count into train and test counts.

    Args:
        params: Mapping with ``"n_all"`` (requested number of photos) and
            ``"train_test_ratio"`` (fraction assigned to training).

    Returns:
        dict: ``{'all': n_all, 'train': int(n_all * ratio), 'test': rest}``.
        The training count is truncated towards zero, so any remainder
        goes to the test count.
    """
    total = params["n_all"]
    n_train = int(total * params["train_test_ratio"])
    return {'all': total, 'train': n_train, 'test': total - n_train}
# モデルの訓練に使用したパラメータをcsvファイルに保存 | |
def write_params_to_model_scores(params):
    """Append the hyper-parameters of the current run to ``model_scores.csv``.

    The CSV at ``../csv/model_results/model_scores.csv`` is read, one row
    describing this run is appended, the integer-like parameter columns are
    rendered without decimals, and the file is written back.

    Args:
        params: Mapping providing ``"datetimenow"``, ``"dense_repeat"``,
            ``"dense_units"``, ``"img_size"``, ``"n_all"``, ``"batch_size"``
            and ``"epochs"``.

    Returns:
        None. The CSV file is rewritten in place.
    """
    file_path = "../csv/model_results/model_scores.csv"
    df = pd.read_csv(file_path)
    new_row = pd.Series({
        'datetime': params["datetimenow"],
        'dense_repeat': params['dense_repeat'],
        "dense_units": params["dense_units"],
        'img_size': params["img_size"],
        'n_photos': params["n_all"],
        'batch_size': params["batch_size"],
        'epochs': params["epochs"],
    })
    # ignore_index keeps the row labels unique; without it the appended row
    # re-uses label 0 and every later label-aligned operation has to cope
    # with duplicate index entries.
    df = pd.concat([df, pd.DataFrame(new_row).T], ignore_index=True)
    # Render the integer-like columns as plain integers ("300", not "300.0").
    # Per-column Series.map replaces the deprecated DataFrame.applymap and
    # the dtype-changing DataFrame.update.
    for column in ("dense_units", "img_size", "n_photos", "batch_size", "epochs"):
        df[column] = df[column].map("{:.0f}".format)
    df.to_csv(file_path, index=False)
# 訓練に使用する画像の枚数を揃えるためのimblearnの関数に渡すパラメータsampling_strategyを取得 | |
def fetch_strategy(params):
    """Build the ``sampling_strategy`` dictionaries used for resampling.

    Args:
        params: Mapping with ``"df_labels_for_model"`` (a frame whose ``y``
            column lists the class labels) and ``"n_photos"`` (a mapping
            with ``'train'`` and ``'test'`` target counts).

    Returns:
        dict: ``{'train': {label: n_train, ...}, 'test': {label: n_test, ...}}``
        where every label is assigned the same target count for its split.
    """
    labels = params["df_labels_for_model"].y
    return {
        split: {label: params["n_photos"][split] for label in labels}
        for split in ('train', 'test')
    }
# ディレクトリの確認 指定したpathがない場合に作成する | |
def my_makedirs(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True removes the window between "check" and "create" in which
    # another process could create the directory and make makedirs fail.
    os.makedirs(path, exist_ok=True)
# モデルの訓練や検証に使用するphoto_idとラベルyのデータフレームをcsvファイルに保存 | |
def save_df_photo_ids_to_csv(df_photo_ids, datetimenow):
    """Write the 'all', 'train' and 'test' photo-id tables to CSV files.

    Each table is converted to integers, ordered by ``y`` then ``photo_id``,
    re-indexed from zero and written to
    ``../csv/model_results/<datetimenow>/data_<name>.csv`` (index included).

    Args:
        df_photo_ids: Mapping holding DataFrames under the keys ``'all'``,
            ``'train'`` and ``'test'``; each has ``photo_id`` and ``y`` columns.
        datetimenow: Name of the run directory below ``../csv/model_results/``.

    Returns:
        None.
    """
    # Shallow copy: the caller's mapping keeps its original frames.
    frames = df_photo_ids.copy()
    out_dir = "../csv/model_results/" + datetimenow
    for split in ('all', 'train', 'test'):
        as_int = frames[split].applymap(int)
        ordered = as_int.sort_values(['y', 'photo_id'])
        frames[split] = ordered.reset_index().drop(columns="index")
        frames[split].to_csv(out_dir + "/data_" + split + ".csv")
# モデルの訓練や検証時に用いるラベルとphoto_idのデータフレームを取得 | |
# df_photo_ids(辞書型)に以下のインデックスをつけてデータフレームを保存 | |
# all: モデル学習に適した手持ちデータの全て(種類で数にばらつき) | |
# 訓練用と検証用のデータ数の比率からallをtrain_allとtest_allに分配 | |
# train_all: 訓練用データ全て(種類で数にばらつき) | |
# test_all: 検証用データ全て(種類で数にばらつき) | |
# train: 種類で数を揃えた訓練用データ | |
# test: 種類で数を揃えた検証用データ | |
def fetch_df_photo_ids(params):
    """Build the photo-id / label tables used for training and validation.

    Reads ``../csv/collections_label.csv`` and ``../csv/remove_photo_ids.csv``,
    collects the usable photo ids per label, splits them into train and test
    parts and resamples both parts so every label has the same count.

    Args:
        params: Mapping providing ``"df_labels_for_model"`` (columns ``y`` and
            ``label`` are read), ``"train_test_ratio"``, ``"n_photos"``,
            ``"batch_size"`` and ``"datetimenow"``.

    Returns:
        tuple: ``(df_photo_ids, params)``. ``df_photo_ids`` is a dict of
        DataFrames with ``photo_id`` and ``y`` columns under the keys
        ``'all'``, ``'train_all'``, ``'test_all'``, ``'train'`` and ``'test'``.
        ``params`` is the same mapping with ``'len_iter'`` added.

    Side effects:
        Prints progress information, creates
        ``../csv/model_results/<datetimenow>`` and writes
        ``data_value_counts.csv`` plus the per-split CSV files there.
    """
    # Load collections_label.csv, which maps each label to its photo ids.
    df_collections_label = pd.read_csv("../csv/collections_label.csv")
    # Load the photo ids that must not be used for training or validation.
    remove_photo_ids = pd.read_csv("../csv/remove_photo_ids.csv").photo_id.values
    photo_ids = []
    photo_labels = []
    # params["df_labels_for_model"].y holds the y labels used by the model.
    for i in params["df_labels_for_model"].y:
        # NOTE(review): the label value i is used as a positional slice, so
        # this presumably relies on y being 0..n-1 in row order - confirm.
        label_row0 = params["df_labels_for_model"][i:i+1]
        label_row = label_row0.iloc[0]
        # Take the last row whose value matches the label and read its photo ids.
        row = df_collections_label.loc[df_collections_label.value == label_row.label].iloc[-1]
        print(row)
        # photo_id is a comma-separated string of integer ids.
        s_photo_ids_all = list(map(int, row.photo_id.split(',')))
        # Drop the photo ids excluded from training and validation.
        s_photo_ids = [photo_id for photo_id in s_photo_ids_all if photo_id not in remove_photo_ids]
        # Append to the overall id list.
        photo_ids = photo_ids + s_photo_ids
        # Append the matching y labels.
        s_photo_labels = [i] * len(s_photo_ids)
        photo_labels = photo_labels + s_photo_labels
    df_photo_ids = {}
    # Shuffle the rows randomly.
    df_photo_ids['all'] = pd.DataFrame({'photo_id': photo_ids, 'y': photo_labels}).sample(frac=1).reset_index().drop(columns="index")
    # Convert every value to int.
    df_photo_ids['all'] = df_photo_ids['all'].applymap(int)
    # Prepare empty frames for the unbalanced train / test splits.
    for w in ['train', 'test']:
        df_photo_ids[w + "_all"] = pd.DataFrame({'photo_id': [], 'y': []})
    # Split the photo ids of each label into train and test parts.
    for i in params["df_labels_for_model"].y:
        sub_df = df_photo_ids['all'].loc[df_photo_ids['all'].y == i]
        # Number of training samples derived from the available samples.
        j = int(len(sub_df) * params["train_test_ratio"])
        # Cap j at the configured number of training samples.
        if j > params["n_photos"]['train']:
            j = params["n_photos"]['train']
        # First j rows go to train_all, the next n_photos['test'] rows to test_all.
        s_df = {}
        s_df["train_all"] = sub_df.iloc[0:j, ]
        s_df["test_all"] = sub_df.iloc[j:(j+params["n_photos"]['test']), ]
        # Append this label's rows to the accumulated frames.
        for w in ['train', 'test']:
            s = w + '_all'
            df_photo_ids[s] = pd.concat([df_photo_ids[s], s_df[s]], axis=0)
    # Count the rows per y label in all, train_all and test_all.
    vc = {}
    for w in ['all', 'train_all', 'test_all']:
        vc[w] = df_photo_ids[w].y.value_counts()
    # From here on imblearn equalises the number of rows per label.
    # Build the sampling_strategy passed to RandomUnderSampler / RandomOverSampler.
    strategy = fetch_strategy(params)
    # Choose the sampler for each split.
    for w in ['train', 'test']:
        w_all = w + "_all"
        # Every label has more rows than requested: under-sample.
        if min(vc[w_all].values) > params["n_photos"][w]:
            print(w, "data: RandomUnderSampler")
            rus = RandomUnderSampler(random_state = 0, sampling_strategy = strategy[w])
        # Otherwise (the smallest label count is at or below the request): over-sample.
        else:
            print(w, "data: RandomOverSampler")
            rus = RandomOverSampler(random_state = 0, sampling_strategy = strategy[w])
        # Run fit_resample; photo ids are passed as a single-column feature matrix.
        photo_ids, y = rus.fit_resample(np.array(df_photo_ids[w_all].photo_id).reshape(-1,1), np.array(df_photo_ids[w_all].y).reshape(-1,1))
        # Flatten both results back into plain lists.
        photo_ids, y = list(map(lambda x: x.reshape(1,-1)[0].tolist(), [photo_ids, y]))
        # Convert to a DataFrame.
        df_photo_ids[w] = pd.DataFrame({'photo_id': photo_ids, 'y': y})
    # Count the rows per label in the balanced train / test splits.
    for w in ['train', 'test']:
        vc[w] = df_photo_ids[w].y.value_counts()
    # Turn the dict of value counts into one DataFrame (one column per split).
    df_vc = pd.concat(vc.values(), axis=1)
    df_vc = df_vc.sort_index().reset_index()
    df_vc.columns = ['y', *list(vc.keys())]
    df_vc.y = list(map(int, df_vc.y))
    df_vc = pd.merge(params["df_labels_for_model"], df_vc, on="y", how="outer")
    print(df_vc)
    # Save the sample-count summary.
    save_file_path = '../csv/model_results/' + params["datetimenow"]
    my_makedirs(save_file_path)
    df_vc.to_csv(save_file_path + "/data_value_counts.csv", index=False)
    # Save the y label / photo_id tables.
    save_df_photo_ids_to_csv(df_photo_ids, params["datetimenow"])
    # Number of batches per epoch, from batch size and training-set size (used during training).
    params['len_iter'] = math.ceil(len(df_photo_ids['train'])/params["batch_size"])
    return df_photo_ids, params
# パラメータを読み込み、モデルを作成する | |
def create_model(params):
    """Build and compile the VGG16-based transfer-learning classifier.

    The convolutional part of VGG16 (ImageNet weights, no top) is followed by
    a small fully connected head. The first 19 layers of the combined model
    are frozen, the model is compiled with SGD, and its summary is written to
    ``../csv/model_results/<datetimenow>/model_summary.txt``.

    Args:
        params: Mapping providing ``"img_size"`` (edge length of the square
            input), ``"dense_units"``, ``"dense_repeat"``, ``"len_labels"``
            (number of output classes) and ``"datetimenow"``.

    Returns:
        The compiled Keras ``Model``.
    """
    edge = params["img_size"]
    # Square RGB input feeding the VGG16 feature extractor.
    input_tensor = Input(shape=(edge, edge, 3))
    # include_top=False keeps only the feature-extraction part of VGG16.
    vgg16 = VGG16(include_top=False, weights='imagenet', input_tensor=input_tensor)

    # Custom classification head placed on top of the VGG16 features.
    head = Sequential()
    head.add(Flatten(input_shape=vgg16.output_shape[1:]))
    units = params["dense_units"]
    for _ in range(params["dense_repeat"]):
        head.add(Dense(units, activation='relu'))
        head.add(Dropout(0.5))
        # Each further dense layer has half the units of the previous one.
        units = int(units / 2)
    head.add(Dense(params["len_labels"], activation='softmax'))

    # Input is the VGG16 input; output is the head applied to the VGG16 output.
    model = Model(inputs=vgg16.input, outputs=head(vgg16.output))

    # The first 19 layers belong to VGG16; keep their weights fixed.
    for frozen_layer in model.layers[:19]:
        frozen_layer.trainable = False

    sgd = optimizers.gradient_descent_v2.SGD(learning_rate=1e-4, momentum=0.9)
    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Save the model summary next to the other results of this run.
    summary_path = '../csv/model_results/' + params["datetimenow"] + '/model_summary.txt'
    with open(summary_path, "w") as fp:
        def write_line(line):
            fp.write(line + "\r\n")

        model.summary(print_fn=write_line)
    return model
# yラベルとphoto_idの情報を持つデータフレームに、ジェネレータを利用する場合に必要なバッチナンバーを付加 | |
def add_batch_number_to_df(df, params):
    """Attach a ``batch`` column assigning every row to a generator batch.

    Rows are shuffled and then ordered by ``y``; batch numbers
    ``0 .. len_iter-1`` are dealt out cyclically so each batch receives rows
    from across the label range. When there are fewer rows than
    ``len_iter * batch_size``, rows are duplicated label by label until the
    table is full, and any surplus is cut off.

    Args:
        df: DataFrame with at least a ``y`` column (labels ``0 .. len_labels-1``).
        params: Mapping providing ``"len_iter"``, ``"batch_size"`` and
            ``"len_labels"``.

    Returns:
        DataFrame with ``len_iter * batch_size`` rows ordered by ``y`` and an
        added ``batch`` column.
    """
    df = df.sample(frac=1).sort_values(["y"]).reset_index(drop=True)
    batch_numbers = list(range(params["len_iter"])) * params["batch_size"]
    n_slots = len(batch_numbers)
    print("add_batch_number_to_df. len(df): {0}, len(tmp): {1}".format(len(df), n_slots))
    if n_slots > len(df):
        round_no = 0
        while n_slots > len(df):
            # One pass duplicates the round_no-th row of every label.
            for label in range(params["len_labels"]):
                picked = df.loc[df.y == label].iloc[round_no]
                df = pd.concat([df, pd.DataFrame(picked).T])
            round_no += 1
        # A pass may overshoot; keep exactly n_slots rows, then re-order.
        df = df.iloc[0:n_slots].sort_values(["y"]).reset_index(drop=True)
        print("len(tmp): {0}, len(df): {1}".format(n_slots, len(df)))
    df["batch"] = batch_numbers
    return df
# モデルの訓練時に利用するジェネレータ(メモリ節約できる) | |
class generate_fit_data(Sequence):
    """Keras ``Sequence`` that loads one training batch of images at a time.

    Loading images per batch keeps memory usage low compared with loading the
    whole training set up front.

    Args:
        df: DataFrame with ``photo_id``, ``y`` and ``batch`` columns; rows
            sharing a ``batch`` value form one batch.
        params: Mapping providing ``"batch_size"``, ``"img_size"`` and
            ``"df_labels_for_model"``.
    """

    def __init__(self, df, params):
        # Let the Keras base class set up its own state first; recent Keras
        # releases are understood to warn when a subclass skips this call.
        super().__init__()
        self.df = df
        self.params = params

    def __len__(self):
        """Return the number of batches per epoch."""
        return int(np.ceil(len(self.df) / float(self.params["batch_size"])))

    def __getitem__(self, idx):
        """Load and return ``(images, one_hot_labels)`` for batch ``idx``."""
        s_df = self.df.loc[self.df.batch == idx]
        batch_x, batch_y = fetch_fit_data(s_df.photo_id, s_df.y, self.params['img_size'], self.params["df_labels_for_model"])
        return batch_x, batch_y
# 画像の前処理 元画像のアスペクト比を維持したまま、縦横の大きさを揃えて、余白部分は黒にする | |
def preprocess(img):
    """Pad an image with black borders so that it becomes square.

    The aspect ratio of the content is preserved: the shorter dimension is
    padded on both sides (the extra pixel of an odd difference goes to the
    bottom or right) until it matches the longer one.

    Args:
        img: Image array whose ``shape`` is ``(height, width, channels)``.

    Returns:
        The padded square image returned by ``cv2.copyMakeBorder``.
    """
    height, width, _ = img.shape
    side = max(height, width)
    # At least one of the two differences is zero, so only the shorter
    # dimension actually receives a border.
    pad_h = side - height
    pad_w = side - width
    top = pad_h // 2
    bottom = pad_h - top
    left = pad_w // 2
    right = pad_w - left
    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=[0, 0, 0])
# photo_idから画像のpathを取得 | |
def fetch_img_path_by_photo_id(photo_id):
    """Return the path of the JPEG file belonging to ``photo_id``.

    The file is searched in every direct sub-directory of
    ``../img/raillab/original/``; the first match reported by ``glob`` is
    returned.

    Args:
        photo_id: Photo identifier; anything accepted by ``int()``.

    Returns:
        str: Path of the matching ``<photo_id>.jpg`` file.

    Raises:
        FileNotFoundError: If no file matches. Previously this surfaced as a
            bare ``IndexError`` without any hint about the missing file.
    """
    pattern = "../img/raillab/original/*/" + str(int(photo_id)) + ".jpg"
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError("no image file found for photo_id {0} (pattern: {1})".format(int(photo_id), pattern))
    return matches[0]
# photo_idから画像を取得 (OpenCVを利用) | |
def fetch_img_by_photo_id(photo_id):
    """Load the image belonging to ``photo_id`` with OpenCV.

    Args:
        photo_id: Photo identifier passed on to ``fetch_img_path_by_photo_id``.

    Returns:
        Whatever ``cv2.imread`` returns for the resolved path.
        NOTE(review): OpenCV is understood to return ``None`` instead of
        raising when a file cannot be decoded - confirm and handle if needed.
    """
    return cv2.imread(fetch_img_path_by_photo_id(photo_id))
# photo_idとimg_sizeを指定して前処理済みの画像を取得 | |
def fetch_img_by_photo_id_and_resize(photo_id, img_size):
    """Load a photo, pad it to a square and resize it.

    Args:
        photo_id: Photo identifier; anything accepted by ``int()``.
        img_size: Target size handed to ``cv2.resize`` as its ``dsize``
            argument (a width/height pair).

    Returns:
        The padded and resized image.
    """
    normalized_id = str(int(photo_id))
    squared = preprocess(fetch_img_by_photo_id(normalized_id))
    return cv2.resize(squared, img_size)
# X (list_x), y(list_y), img_sizeを指定してモデルの訓練または検証用のデータ(画像とyラベル)を取得 | |
def fetch_fit_data(list_x, list_y, img_size, df_labels_for_model):
    """Load images and one-hot labels for training or validation.

    Args:
        list_x: Iterable of photo ids.
        list_y: Iterable of integer class labels, one per photo id. Labels
            are expected to lie in ``0 .. len(df_labels_for_model) - 1``.
        img_size: Edge length of the square images to produce.
        df_labels_for_model: Table with one row per class; only its length
            (the number of classes) is used.

    Returns:
        tuple: ``(x, y)`` where ``x`` is an array of the resized images and
        ``y`` is the one-hot label matrix with one column per class.
    """
    target_size = (img_size, img_size)
    x = np.array([fetch_img_by_photo_id_and_resize(photo_id, target_size) for photo_id in list_x])
    # Fix the width of the one-hot matrix to the number of classes. The old
    # code appended every label and truncated again *before* encoding, which
    # had no effect: a batch lacking the highest label produced too few
    # columns and no longer matched the model output.
    y = to_categorical(np.array(list(list_y)), num_classes=len(df_labels_for_model))
    return x, y
# モデル訓練用と検証用のデータ(画像とyラベル)を取得 | |
def fetch_train_test_data(df_photo_ids, img_size, df_labels_for_model):
    """Load images and one-hot labels for both the train and the test split.

    Args:
        df_photo_ids: Mapping with ``'train'`` and ``'test'`` DataFrames, each
            having ``photo_id`` and ``y`` columns.
        img_size: Edge length of the square images to produce.
        df_labels_for_model: Table of class labels, forwarded to
            ``fetch_fit_data``.

    Returns:
        tuple: ``(X, y)``, two dicts keyed by ``'train'`` and ``'test'``.
    """
    X, y = {}, {}
    for split in ('train', 'test'):
        frame = df_photo_ids[split]
        X[split], y[split] = fetch_fit_data(frame.photo_id, frame.y, img_size, df_labels_for_model)
    return X, y
# モデルの訓練結果をcsvファイルに保存 | |
def save_history_to_csv(df_history, datetimenow, filename_appendix=''):
    """Write the training history to ``history<appendix>.csv``.

    The metric columns that are present (``loss``, ``accuracy``, ``val_loss``,
    ``val_accuracy``) are rendered with five decimals; all other columns are
    written unchanged. As before, ``df_history`` itself is modified: its
    metric columns hold the formatted strings afterwards.

    Args:
        df_history: DataFrame with one row per epoch.
        datetimenow: Name of the run directory below ``../csv/model_results/``.
        filename_appendix: Optional suffix inserted before ``.csv``.

    Returns:
        None.
    """
    metric_columns = [name for name in ('loss', 'accuracy', 'val_loss', 'val_accuracy')
                      if name in df_history.columns]
    # Whole-column assignment replaces the deprecated DataFrame.applymap and
    # avoids DataFrame.update writing strings into float columns.
    for column in metric_columns:
        df_history[column] = df_history[column].map("{:.5f}".format)
    save_path = '../csv/model_results/' + datetimenow + '/history' + filename_appendix + '.csv'
    df_history.to_csv(save_path, index=False)
# モデルの訓練 | |
def model_fit(model, df_photo_ids, params):
    """Train the model, save its history as CSV and the model as an .h5 file.

    Args:
        model: Compiled Keras model.
        df_photo_ids: Mapping with ``'train'`` and ``'test'`` DataFrames
            (``photo_id`` and ``y`` columns).
        params: Mapping providing ``"train_mode"``, ``"img_size"``,
            ``"batch_size"``, ``"epochs"``, ``"df_labels_for_model"`` and
            ``"datetimenow"`` (generator mode additionally reads the keys
            used by ``add_batch_number_to_df``).

    Returns:
        tuple: ``(model, params)``; the trained model and the unchanged mapping.
    """
    use_generator = params["train_mode"] == "model_fit_by_generator"
    if use_generator:
        # Generator mode: images are loaded batch by batch to save memory.
        print("model_fit_by_generator")
        batched_train = add_batch_number_to_df(df_photo_ids['train'], params)
        gen = generate_fit_data(batched_train, params)
        test_frame = df_photo_ids['test']
        X_test, y_test = fetch_fit_data(test_frame.photo_id, test_frame.y, params["img_size"], params["df_labels_for_model"])
        # NOTE(review): batch_size is passed together with a Sequence input;
        # the batches themselves are defined by the Sequence - confirm the
        # argument is still accepted by the Keras version in use.
        history = model.fit(x=gen, batch_size=params["batch_size"], epochs=params["epochs"], validation_data=(X_test, y_test))
    else:
        # In-memory mode: the whole train and test sets are loaded up front.
        print("model_fit")
        X, y = fetch_train_test_data(df_photo_ids, params["img_size"], params["df_labels_for_model"])
        history = model.fit(X['train'], y['train'], batch_size=params["batch_size"], epochs=params["epochs"], validation_data=(X['test'], y['test']))

    # Collect the per-epoch metrics, number the epochs from 1 and save them.
    df_history = pd.DataFrame(history.history)
    n_epochs = len(df_history)
    df_history.insert(loc=0, column='epoch', value=range(1, n_epochs + 1))
    print(df_history)
    save_history_to_csv(df_history, params["datetimenow"])

    # Save the trained model as an h5 file.
    model.save('../models/model_train_' + params["datetimenow"] + '.h5')
    return model, params
# モデル訓練時に使用したパラメータやモデル精度(スコア)などをcsvファイルに保存 | |
def write_model_fit_scores(scores, params):
    """Store training time and evaluation scores of the current run.

    The row of ``../csv/model_results/model_scores.csv`` whose ``datetime``
    equals ``params["datetimenow"]`` receives the training time, loss and
    accuracy (plus ``best_epochs``, ``best_batch`` and ``best_epochs2`` when
    present in ``params``). Numeric columns are then rendered with a fixed
    number of decimals and the file is written back.

    Args:
        scores: Sequence whose first element is the loss and whose second
            element is the accuracy.
        params: Mapping providing ``"datetimenow"`` and ``"time_train"``.

    Returns:
        None. The CSV file is rewritten in place.

    Raises:
        IndexError: If the CSV has no row for ``params["datetimenow"]``.
    """
    file_path = "../csv/model_results/model_scores.csv"
    df = pd.read_csv(file_path)
    matching_rows = df.index[df["datetime"] == params["datetimenow"]]
    row_index = matching_rows[0]
    df.loc[row_index, "time"] = params["time_train"]
    df.loc[row_index, "loss"] = scores[0]
    df.loc[row_index, "accuracy"] = scores[1]
    for w in ['best_epochs', 'best_batch', 'best_epochs2']:
        if w in params:
            df.loc[row_index, w] = str(int(params[w]))
    print("params.keys()", list(params.keys()))
    # One format per column. Whole-column assignment replaces the deprecated
    # DataFrame.applymap and avoids DataFrame.update writing strings into
    # numeric columns.
    column_formats = {
        "img_size": "{:.0f}",
        "n_photos": "{:.0f}",
        "batch_size": "{:.0f}",
        "epochs": "{:.0f}",
        "loss": "{:.5f}",
        "accuracy": "{:.5f}",
        "time": "{:.2f}",
    }
    for column, spec in column_formats.items():
        df[column] = df[column].map(spec.format)
    df.to_csv(file_path, index=False)
# 学習済みモデルから予測 | |
def pred_train(model, labels, photo_id, y_value, img_size):
    """Predict the class of one photo with a trained model.

    Args:
        model: Trained model exposing ``predict``.
        labels: Sequence of label names indexed by class number.
        photo_id: Identifier of the photo to classify.
        y_value: True class number of the photo.
        img_size: Target size forwarded to ``fetch_img_by_photo_id_and_resize``.

    Returns:
        list: ``[photo_id, is_correct, y_value, predicted_class, true_label,
        predicted_label, *scores]`` where ``scores`` are the model outputs
        formatted as strings with five decimals.
    """
    img = fetch_img_by_photo_id_and_resize(photo_id, img_size)
    true_label = labels[int(y_value)]
    # The model expects a batch, so wrap the single image.
    raw_scores = model.predict(np.array([img]))
    formatted_scores = ["{:.5f}".format(score) for score in raw_scores.reshape(1, -1)[0].tolist()]
    pred = np.argmax(raw_scores)
    predicted_label = labels[pred]
    return [photo_id, true_label == predicted_label, y_value, pred, true_label, predicted_label, *formatted_scores]
# 学習済みモデルから予測して結果をcsvファイルに保存 | |
def pred_train_and_save_to_csv(model, params, df_photo_ids_for_test):
    """Predict every test photo and save the results as ``pred_results.csv``.

    Photos are processed in random order. The result table is sorted by
    ``match``, ``y`` and ``photo_id`` and written to
    ``../csv/model_results/<datetimenow>/pred_results.csv``.

    Args:
        model: Trained model exposing ``predict``.
        params: Mapping providing ``"df_labels_for_model"`` (its ``label``
            column and index are used), ``"img_size"`` and ``"datetimenow"``.
        df_photo_ids_for_test: DataFrame with ``photo_id`` and ``y`` columns.

    Returns:
        None.
    """
    labels = params["df_labels_for_model"].label.values
    target_size = [params["img_size"]] * 2
    rand_index = np.random.permutation(np.arange(len(df_photo_ids_for_test)))
    total = len(rand_index)
    res_arr = []
    for j, i in enumerate(rand_index):
        row = df_photo_ids_for_test.iloc[i]
        print("{0}/{1}".format(j, total))
        res = pred_train(model, labels, row.photo_id, row.y, target_size)
        print(res)
        res_arr.append(res)
    df = pd.DataFrame(res_arr)
    # One "#<n>" score column per row of the label table.
    score_columns = ["#" + str(x) for x in params["df_labels_for_model"].index.values]
    df.columns = ['photo_id', 'match', 'y', 'pred_y', 'label', 'pred_label'] + score_columns
    df = df.sort_values(['match', 'y', 'photo_id']).reset_index(drop=True)
    # Render ids and class numbers without decimals. Whole-column assignment
    # replaces the deprecated DataFrame.applymap and the dtype-changing
    # DataFrame.update.
    for column in ('photo_id', 'y', 'pred_y'):
        df[column] = df[column].map("{:.0f}".format)
    file_path = '../csv/model_results/' + params["datetimenow"] + '/pred_results.csv'
    df.to_csv(file_path, index=False)
# モデルの訓練と予測を行い、結果を保存 | |
def train_model(params):
    """Run one complete training cycle and store all of its results.

    Steps: record the run parameters, build the photo-id tables, create and
    train the model, evaluate it on the balanced test split, and save
    per-photo predictions for the unbalanced test split.

    Args:
        params: Mapping with the run configuration. It is updated in place
            with ``"time_i"``, ``"datetimenow"``, ``"n_photos"``,
            ``"time_f"`` and ``"time_train"`` (plus the keys added by the
            helpers called here).

    Returns:
        None.
    """
    print("train_model")
    # Set or derive the parameters needed to record this run.
    params["time_i"] = time.perf_counter()
    # NOTE(review): nine hours are added to the local clock; presumably the
    # host runs on UTC and the run name should be in JST - confirm.
    shifted_now = datetime.datetime.now() + datetime.timedelta(hours=9)
    params["datetimenow"] = shifted_now.strftime('%Y%m%d_%H%M%S')
    params["n_photos"] = fetch_n_photos(params)
    print("train_model. params\n", params)

    # Save the starting parameters to the scores CSV.
    write_params_to_model_scores(params)

    # Get the y labels and photo ids used for training and validation.
    df_photo_ids, params = fetch_df_photo_ids(params)

    # Create and train the model; the elapsed time covers everything so far.
    model = create_model(params)
    model, params = model_fit(model, df_photo_ids, params)
    params["time_f"] = time.perf_counter()
    params["time_train"] = params["time_f"] - params["time_i"]

    # Evaluate the trained model on the balanced test split and save the scores.
    balanced_test = df_photo_ids['test']
    X, y = fetch_fit_data(balanced_test.photo_id, balanced_test.y, params["img_size"], params["df_labels_for_model"])
    scores = model.evaluate(X, y, batch_size=params["batch_size"], verbose=1)
    write_model_fit_scores(scores, params)

    # Predict every photo of the unbalanced test split and save the results.
    pred_train_and_save_to_csv(model, params, df_photo_ids['test_all'])
# 関数定義ここまで | |
# ------------------------------- | |
# set parameters | |
params = {
    # train_mode is either "model_fit" or "model_fit_by_generator".
    "train_mode": "model_fit_by_generator",
    "train_test_ratio": 0.8,
}

# Some parameter sizes caused errors; only the parameters of the finally
# adopted model are listed here. Every list may hold several candidates.
dic_all_params = {
    "dense_repeat": [1],
    "dense_units": [1024],
    'img_size': [300],
    "n_all": [300],
    "epochs": [8],
    "batch_size": [64],
}
repeat_len = 3  # number of repetitions under identical conditions

# Build every parameter combination and convert it to a DataFrame.
all_params = list(itertools.product(*dic_all_params.values()))
df_all_params = pd.DataFrame(all_params, columns=dic_all_params.keys())
print(df_all_params)

params["df_labels_for_model"] = pd.read_csv("../csv/labels_for_model.csv")
params["len_labels"] = len(params["df_labels_for_model"])
print("")
print(params["df_labels_for_model"])

row_nums = df_all_params.index.values
len_paramset = len(row_nums)

# Train every parameter combination repeat_len times.
for j in range(repeat_len):
    print("j{0}/{1}".format(j, repeat_len))
    for i in row_nums:
        print("\n-------------------------------------------------------")
        print("fit model: {0}/{1}".format(i, len_paramset))
        params.update(convert_df_row_to_dic(df_all_params.iloc[i]))
        train_model(params)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment