# AkiyonKS/train_model_for_blog.py (secret gist)

## train_model_for_blog.py
import math
import numpy as np
import pandas as pd
import os
import glob
import cv2
import datetime
import time
import itertools
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from tensorflow import keras
from keras.layers import Dense, Dropout, Flatten, Input
from keras.applications.vgg16 import VGG16
from keras.models import Model, Sequential
from keras import optimizers
from keras.utils import to_categorical
from keras.utils import Sequence

# -------------------------------
# ここから関数定義

# Convert one DataFrame row into a plain dict.
def convert_df_row_to_dic(df_row):
    """Return a new ``dict`` with every key/value pair of ``df_row``.

    ``df_row`` only needs ``keys()`` and item access; the caller in this
    file passes the Series returned by ``DataFrame.iloc[i]``.
    """
    return {name: df_row[name] for name in list(df_row.keys())}

# Work out how many photos are used overall, for training and for validation.
def fetch_n_photos(params):
    """Split ``params["n_all"]`` into train/test counts.

    The train count is ``n_all * train_test_ratio`` truncated to an int;
    the test count is whatever remains.

    Returns:
        dict with the keys ``'all'``, ``'train'`` and ``'test'``.
    """
    total = params["n_all"]
    n_train = int(total * params["train_test_ratio"])
    return {'all': total, 'train': n_train, 'test': total - n_train}

# Append the parameters of the run that is about to start to the score log CSV.
def write_params_to_model_scores(params):
    """Add one row describing this run to ``model_scores.csv``.

    The CSV is read, a row built from ``params`` is appended, the integer
    hyper-parameter columns are rendered without decimals, and the file is
    written back in place.

    Args:
        params: run settings; reads ``datetimenow``, ``dense_repeat``,
            ``dense_units``, ``img_size``, ``n_all``, ``batch_size`` and
            ``epochs``.
    """
    file_path = "../csv/model_results/model_scores.csv"
    df = pd.read_csv(file_path)
    dic_scores = {
        'datetime': params["datetimenow"],
        'dense_repeat': params['dense_repeat'],
        "dense_units": params["dense_units"],
        'img_size': params["img_size"],
        'n_photos': params["n_all"],
        'batch_size': params["batch_size"],
        'epochs': params["epochs"],
    }

    new_row = pd.DataFrame(pd.Series(list(dic_scores.values()), index=list(dic_scores.keys()))).T
    # ignore_index keeps the row labels unique; without it the appended row
    # reuses label 0 and every later label-aligned operation sees duplicates.
    df = pd.concat([df, new_row], ignore_index=True)

    # Render the integer columns as plain digit strings ("300", not "300.0").
    # Whole-column assignment replaces the column instead of writing strings
    # into an existing numeric column element by element.
    for col in ["dense_units", "img_size", "n_photos", "batch_size", "epochs"]:
        df[col] = df[col].map("{:.0f}".format)
    df.to_csv(file_path, index=False)

# Build the sampling_strategy dicts handed to the imblearn samplers so that
# every label ends up with the same number of photos.
def fetch_strategy(params):
    """Return ``{'train': {label: n, ...}, 'test': {label: n, ...}}``.

    Each label found in ``params["df_labels_for_model"].y`` is mapped to the
    target count ``params["n_photos"][split]`` of its split.
    """
    return {
        split: {
            label: params["n_photos"][split]
            for label in params["df_labels_for_model"].y
        }
        for split in ['train', 'test']
    }

# Make sure a directory exists, creating it (and any parents) when missing.
def my_makedirs(path):
    """Create ``path`` as a directory if it does not exist yet.

    ``exist_ok=True`` makes the call safe to repeat and removes the gap
    between "check" and "create" that a separate ``isdir`` test leaves open.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# Save the photo_id / label (y) DataFrames used for training and validation as CSV files.
def save_df_photo_ids_to_csv(df_photo_ids, datetimenow):
    """Write the ``'all'``, ``'train'`` and ``'test'`` frames to disk.

    Each frame is cast to int, sorted by ``y`` then ``photo_id``, re-indexed
    and written to ``../csv/model_results/<datetimenow>/data_<key>.csv``.
    The dict passed in is shallow-copied first, so its entries are not
    rebound.
    """
    frames = df_photo_ids.copy()
    out_dir = "../csv/model_results/" + datetimenow
    for split in ['all', 'train', 'test']:
        as_int = frames[split].applymap(int)
        ordered = as_int.sort_values(['y', 'photo_id'])
        frames[split] = ordered.reset_index().drop(columns="index")
        frames[split].to_csv(out_dir + "/data_" + split + ".csv")

# Build the DataFrames of labels and photo_ids used for training and validation.
# The frames are stored in the dict df_photo_ids under these keys:
# all: every usable photo on hand (counts differ between classes)
# 'all' is divided into train_all and test_all using the train/test ratio
# train_all: all training data (counts differ between classes)
# test_all: all validation data (counts differ between classes)
# train: training data with the per-class count equalized
# test: validation data with the per-class count equalized
def fetch_df_photo_ids(params):
    """Select, split and balance the photo ids for one training run.

    Reads ``../csv/collections_label.csv`` and ``../csv/remove_photo_ids.csv``,
    builds the frames described above, writes the per-label counts and the
    selected ids under ``../csv/model_results/<datetimenow>/`` and stores
    ``params['len_iter']`` (number of training batches).

    Args:
        params: run settings; reads ``df_labels_for_model``,
            ``train_test_ratio``, ``n_photos``, ``datetimenow`` and
            ``batch_size``.

    Returns:
        Tuple ``(df_photo_ids, params)``: the dict of DataFrames and the
        same ``params`` dict with ``len_iter`` added.
    """
    # Load collections_label.csv, which maps each label to its photo_ids
    df_collections_label = pd.read_csv("../csv/collections_label.csv")
    # Load the photo_ids that must not be used for training or validation
    remove_photo_ids = pd.read_csv("../csv/remove_photo_ids.csv").photo_id.values

    photo_ids = []
    photo_labels = []
    # params["df_labels_for_model"].y holds the y labels used by the model
    # (integers 0-9 according to the original author's note)
    for i in params["df_labels_for_model"].y:
        # NOTE(review): the slice is positional, so this assumes the row at
        # position i is the row whose y equals i -- confirm against the CSV.
        label_row0 = params["df_labels_for_model"][i:i+1]
        label_row = label_row0.iloc[0]

        # Take the last row whose value matches the label and read its photo_ids
        row = df_collections_label.loc[df_collections_label.value == label_row.label].iloc[-1]
        print(row)
        s_photo_ids_all = list(map(int, row.photo_id.split(',')))
        # Drop the photo_ids excluded from training and validation
        s_photo_ids = [photo_id for photo_id in s_photo_ids_all if photo_id not in remove_photo_ids]
        # Append to photo_ids
        photo_ids = photo_ids + s_photo_ids
        # Append the matching y labels
        s_photo_labels = [i] * len(s_photo_ids)
        photo_labels = photo_labels + s_photo_labels

    df_photo_ids = {}
    # Shuffle the rows randomly
    df_photo_ids['all'] = pd.DataFrame({'photo_id': photo_ids, 'y': photo_labels}).sample(frac=1).reset_index().drop(columns="index")
    # Convert every value to int
    df_photo_ids['all'] = df_photo_ids['all'].applymap(int)

    # Prepare empty DataFrames for train and test
    for w in ['train', 'test']:
        df_photo_ids[w + "_all"] = pd.DataFrame({'photo_id': [], 'y': []})

    # Split the photo_ids of each y label and store them
    for i in params["df_labels_for_model"].y:
        sub_df = df_photo_ids['all'].loc[df_photo_ids['all'].y == i]
        # Derive the number of training samples from the samples on hand
        j = int(len(sub_df) * params["train_test_ratio"])
        # Cap j at the configured number of training samples
        if j > params["n_photos"]['train']:
            j = params["n_photos"]['train']

        # Use j to split the frame into train_all and test_all; the test
        # slice holds at most n_photos['test'] rows
        s_df = {}
        s_df["train_all"] = sub_df.iloc[0:j, ]
        s_df["test_all"] = sub_df.iloc[j:(j+params["n_photos"]['test']), ]
        # Concatenate onto the accumulated frames
        for w in ['train', 'test']:
            s = w + '_all'
            df_photo_ids[s] = pd.concat([df_photo_ids[s], s_df[s]], axis=0)

    # Count the y labels in all, train_all and test_all
    vc = {}
    for w in ['all', 'train_all', 'test_all']:
        vc[w] = df_photo_ids[w].y.value_counts()


    # From here on imblearn is used to equalize the count per y
    # Get the sampling_strategy passed to RandomUnderSampler / RandomOverSampler
    strategy = fetch_strategy(params)

    # Choose the sampler (kept in the variable rus in both branches)
    for w in ['train', 'test']:
        w_all = w + "_all"
        # Every y exceeds the configured count: equalize with RandomUnderSampler
        # NOTE(review): the split above caps each label at the configured
        # count, so this branch looks unreachable -- confirm.
        if min(vc[w_all].values) > params["n_photos"][w]:
            print(w, "data: RandomUnderSampler")
            rus = RandomUnderSampler(random_state = 0, sampling_strategy = strategy[w])
        # Otherwise (equal to or below the configured count): equalize with RandomOverSampler
        else:
            print(w, "data: RandomOverSampler")
            rus = RandomOverSampler(random_state = 0, sampling_strategy = strategy[w])

        # Run fit_resample and collect the result
        photo_ids, y = rus.fit_resample(np.array(df_photo_ids[w_all].photo_id).reshape(-1,1), np.array(df_photo_ids[w_all].y).reshape(-1,1))
        # Flatten both results into plain lists
        photo_ids, y = list(map(lambda x: x.reshape(1,-1)[0].tolist(), [photo_ids, y]))
        # Convert to a DataFrame
        df_photo_ids[w] = pd.DataFrame({'photo_id': photo_ids, 'y': y})

    # Count the samples in train and test
    for w in ['train', 'test']:
        vc[w] = df_photo_ids[w].y.value_counts()

    # Turn the dict vc into a DataFrame (one column per key)
    df_vc = pd.concat(vc.values(), axis=1)
    df_vc = df_vc.sort_index().reset_index()
    df_vc.columns = ['y', *list(vc.keys())]
    df_vc.y = list(map(int, df_vc.y))
    df_vc = pd.merge(params["df_labels_for_model"], df_vc, on="y", how="outer")
    print(df_vc)

    # Save the sample-count table
    save_file_path = '../csv/model_results/' + params["datetimenow"]
    my_makedirs(save_file_path)
    df_vc.to_csv(save_file_path + "/data_value_counts.csv", index=False)

    # Save the label y / photo_id DataFrames
    save_df_photo_ids_to_csv(df_photo_ids, params["datetimenow"])
    # Number of iterations per epoch, from batch size and training sample count (used during training)
    params['len_iter'] = math.ceil(len(df_photo_ids['train'])/params["batch_size"])

    return df_photo_ids, params

# Read the parameters and build the model.
def create_model(params):
    """Build and compile a VGG16-based classifier and save its summary.

    The ImageNet-weighted VGG16 feature extractor (``include_top=False``) is
    followed by a custom head: ``Flatten``, then ``dense_repeat`` pairs of
    ``Dense`` + ``Dropout(0.5)`` whose width halves each time, then a softmax
    layer with ``len_labels`` outputs.

    Args:
        params: reads ``img_size``, ``dense_units``, ``dense_repeat``,
            ``len_labels`` and ``datetimenow``.

    Returns:
        The compiled Keras ``Model``. Its summary is written to
        ``../csv/model_results/<datetimenow>/model_summary.txt``.
    """
    side = params["img_size"]

    # Transfer learning: square RGB input feeding the VGG16 feature extractor.
    image_input = Input(shape=(side, side, 3))
    vgg16 = VGG16(include_top=False, weights='imagenet', input_tensor=image_input)

    # Custom classification head stacked on the VGG16 output.
    head = Sequential()
    head.add(Flatten(input_shape=vgg16.output_shape[1:]))

    units = params["dense_units"]
    for _ in range(params["dense_repeat"]):
        head.add(Dense(units, activation='relu'))
        head.add(Dropout(0.5))
        units = int(units / 2)

    head.add(Dense(params["len_labels"], activation='softmax'))

    # Input is the VGG16 input; output is the head applied to the VGG16 output.
    model = Model(inputs=vgg16.input, outputs=head(vgg16.output))

    # Freeze the first 19 layers (the VGG16 part, per the original comment).
    for frozen_layer in model.layers[:19]:
        frozen_layer.trainable = False

    sgd = optimizers.gradient_descent_v2.SGD(learning_rate=1e-4, momentum=0.9)
    model.compile(optimizer=sgd,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Persist the model summary next to the other results of this run.
    summary_path = '../csv/model_results/' + params["datetimenow"] + '/model_summary.txt'
    with open(summary_path, "w") as fp:
        def write_line(text):
            fp.write(text + "\r\n")

        model.summary(print_fn=write_line)

    return model

# Attach the batch number needed by the generator to the y / photo_id DataFrame.
def add_batch_number_to_df(df, params):
    """Return a shuffled, label-sorted copy of ``df`` with a ``batch`` column.

    Batch numbers ``0 .. len_iter-1`` are dealt out cyclically over the rows
    sorted by ``y``, so the frame must hold exactly ``len_iter * batch_size``
    rows. When it is shorter, rows are duplicated label by label (the j-th
    row of each label in turn) until it is long enough, then trimmed.

    Args:
        df: frame with at least the columns ``photo_id`` and ``y``.
        params: reads ``len_iter``, ``batch_size`` and ``len_labels``.
    """
    shuffled = df.sample(frac=1)
    df = shuffled.sort_values(["y"]).reset_index().drop(columns="index")
    batch_numbers = list(range(params["len_iter"])) * params["batch_size"]
    wanted = len(batch_numbers)
    print("add_batch_number_to_df. len(df): {0}, len(tmp): {1}".format(len(df), wanted))

    pick = 0
    padded = False
    while wanted > len(df):
        padded = True
        # One pass adds the pick-th row of every label.
        for label in range(params["len_labels"]):
            extra = df.loc[df.y == label].iloc[pick]
            df = pd.concat([df, pd.DataFrame(extra).T])
        pick += 1
    if padded:
        # A pass can overshoot, so cut back to the exact length and re-sort.
        trimmed = df.iloc[0:wanted]
        df = trimmed.sort_values(["y"]).reset_index().drop(columns="index")

    print("len(tmp): {0}, len(df): {1}".format(wanted, len(df)))
    df["batch"] = batch_numbers
    return df

# Batch generator used while training the model (saves memory).
class generate_fit_data(Sequence):
    """Keras ``Sequence`` that loads the images of one batch at a time.

    ``df`` must carry the columns ``photo_id``, ``y`` and ``batch``; batch
    ``idx`` consists of the rows whose ``batch`` value equals ``idx``.
    """

    def __init__(self, df, params):
        # Let the Keras base class initialize itself; the original skipped
        # this call.
        super().__init__()
        self.df = df
        self.params = params

    def __len__(self):
        """Return the number of batches per epoch (row count / batch size, rounded up)."""
        return int(np.ceil(len(self.df) / float(self.params["batch_size"])))

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y)`` for batch number ``idx``."""
        rows = self.df.loc[self.df.batch == idx]
        batch_x, batch_y = fetch_fit_data(
            rows.photo_id,
            rows.y,
            self.params['img_size'],
            self.params["df_labels_for_model"],
        )
        return batch_x, batch_y

# Image pre-processing: keep the aspect ratio, pad the shorter side so the
# image becomes square, and fill the padding with black.
def preprocess(img):
    """Return ``img`` padded to a square with black borders.

    The padding is split between the two sides of the shorter axis; when
    the difference is odd, the bottom/right side gets the extra pixel.

    Args:
        img: array whose ``shape`` unpacks as ``(height, width, channels)``.
    """
    height, width, _ = img.shape
    side = max(height, width)

    # At most one of these is non-zero because side equals height or width.
    pad_vertical = side - height
    pad_horizontal = side - width

    top = pad_vertical // 2
    bottom = pad_vertical - top
    left = pad_horizontal // 2
    right = pad_horizontal - left

    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=[0, 0, 0])

# Look up the path of an image from its photo_id.
def fetch_img_path_by_photo_id(photo_id):
    """Return the path of ``<photo_id>.jpg`` below ``../img/raillab/original/*/``.

    Args:
        photo_id: anything ``int()`` accepts (e.g. ``12``, ``12.0``, ``"12"``).

    Returns:
        The first path reported by ``glob`` for the pattern.

    Raises:
        FileNotFoundError: if no file matches. The original failed here with
            a bare ``IndexError`` that did not name the missing file.
    """
    pattern = "../img/raillab/original/*/" + str(int(photo_id)) + ".jpg"
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError("no image file matches " + pattern)
    return matches[0]

# Load an image from its photo_id (using OpenCV).
def fetch_img_by_photo_id(photo_id):
    """Return the image for ``photo_id`` as read by ``cv2.imread``.

    Raises:
        OSError: if OpenCV cannot read the file. ``cv2.imread`` signals
            failure by returning ``None`` instead of raising, which
            previously surfaced later as an unrelated error.
    """
    img_path = fetch_img_path_by_photo_id(photo_id)
    img = cv2.imread(img_path)
    if img is None:
        raise OSError("cv2.imread could not read " + img_path)
    return img

# Load the pre-processed image for a photo_id at the requested size.
def fetch_img_by_photo_id_and_resize(photo_id, img_size):
    """Return the image for ``photo_id``, squared by ``preprocess`` and resized.

    Args:
        photo_id: anything ``int()`` accepts.
        img_size: size argument handed to ``cv2.resize`` unchanged.
    """
    normalized_id = str(int(photo_id))
    squared = preprocess(fetch_img_by_photo_id(normalized_id))
    return cv2.resize(squared, img_size)

# Load the data (images and y labels) used to train or validate the model
# from X (list_x), y (list_y) and img_size.
def fetch_fit_data(list_x, list_y, img_size, df_labels_for_model):
    """Return ``(x, y)`` arrays ready for ``model.fit`` / ``model.evaluate``.

    Args:
        list_x: iterable of photo_ids.
        list_y: iterable of integer class labels, aligned with ``list_x``.
        img_size: edge length; every image is resized to
            ``(img_size, img_size)``.
        df_labels_for_model: label table; its length is the number of
            classes.

    Returns:
        ``x``: array of the loaded images, ``y``: one-hot labels with exactly
        ``len(df_labels_for_model)`` columns.
    """
    target_size = (img_size, img_size)
    x = np.array([fetch_img_by_photo_id_and_resize(photo_id, target_size) for photo_id in list_x])
    # Fix the one-hot width explicitly. The original appended every class
    # index and then sliced the list back to its old length before encoding,
    # so the padding had no effect and a batch lacking the highest label
    # produced too few columns.
    y = to_categorical(np.array(list(list_y)), num_classes=len(df_labels_for_model))
    return x, y

# Load the training and validation data (images and y labels).
def fetch_train_test_data(df_photo_ids, img_size, df_labels_for_model):
    """Return ``(X, y)``, two dicts keyed by ``'train'`` and ``'test'``.

    Each entry is what ``fetch_fit_data`` produces for the matching frame
    in ``df_photo_ids``.
    """
    X, y = {}, {}
    for split in ['train', 'test']:
        frame = df_photo_ids[split]
        images, labels = fetch_fit_data(frame.photo_id, frame.y, img_size, df_labels_for_model)
        X[split] = images
        y[split] = labels
    return X, y

# Save the training history as a CSV file.
def save_history_to_csv(df_history, datetimenow, filename_appendix=''):
    """Write ``df_history`` to ``history<filename_appendix>.csv``.

    The metric columns ``loss``, ``accuracy``, ``val_loss`` and
    ``val_accuracy`` (those that are present) are rendered with five
    decimals; all other columns are written unchanged.

    The formatting is applied to a copy, so the caller's frame keeps its
    numeric values. The original overwrote them with strings through
    ``DataFrame.update``.

    Args:
        df_history: per-epoch history table.
        datetimenow: run id; names the result directory.
        filename_appendix: optional suffix for the file name.
    """
    metric_cols = [
        name
        for name in ['loss', 'accuracy', 'val_loss', 'val_accuracy']
        if name in df_history.columns
    ]
    out = df_history.copy()
    for col in metric_cols:
        # Whole-column assignment: the column simply becomes a string column.
        out[col] = out[col].map("{:.5f}".format)
    save_path = '../csv/model_results/' + datetimenow + '/history' + filename_appendix + '.csv'
    out.to_csv(save_path, index=False)

# Train the model.
def model_fit(model, df_photo_ids, params):
    """Fit ``model``, then save the history CSV and the trained model.

    With ``params["train_mode"] == "model_fit_by_generator"`` the training
    images are streamed batch by batch through ``generate_fit_data``;
    otherwise all images are loaded into memory first. Validation data is
    loaded fully in both modes.

    Args:
        model: compiled Keras model.
        df_photo_ids: dict with at least the ``'train'`` and ``'test'``
            frames.
        params: reads ``train_mode``, ``img_size``, ``df_labels_for_model``,
            ``batch_size``, ``epochs`` and ``datetimenow`` (plus ``len_iter``
            and ``len_labels`` in generator mode).

    Returns:
        Tuple ``(model, params)``.
    """
    # Generator path (saves memory; the author notes it may still fail
    # depending on parameters and environment).
    if params["train_mode"] == "model_fit_by_generator":
        print("model_fit_by_generator")
        list_df = add_batch_number_to_df(df_photo_ids['train'], params)
        gen = generate_fit_data(list_df, params)
        X_test, y_test = fetch_fit_data(df_photo_ids['test'].photo_id, df_photo_ids['test'].y, params["img_size"], params["df_labels_for_model"])
        # The Sequence already yields complete batches, so batch_size is not
        # passed: the Keras docs say not to specify it for Sequence input.
        history = model.fit(x=gen, epochs=params["epochs"], validation_data=(X_test, y_test))
    # In-memory path (the author notes it can fail depending on parameters
    # and environment).
    else:
        print("model_fit")
        X, y = fetch_train_test_data(df_photo_ids, params["img_size"], params["df_labels_for_model"])
        history = model.fit(X['train'], y['train'], batch_size=params["batch_size"], epochs=params["epochs"], validation_data=(X['test'], y['test']))

    # Collect the training history and save it as CSV, with 1-based epochs.
    df_history = pd.DataFrame(history.history)
    df_history.insert(loc=0, column='epoch', value=range(1, len(df_history) + 1))
    print(df_history)
    save_history_to_csv(df_history, params["datetimenow"])

    # Save the model as an h5 file.
    model_filename = '../models/model_train_' + params["datetimenow"] + '.h5'
    model.save(model_filename)

    return model, params

# Save the training parameters and the model scores to the score log CSV.
def write_model_fit_scores(scores, params):
    """Fill in the results of the current run in ``model_scores.csv``.

    The row whose ``datetime`` equals ``params["datetimenow"]`` (written
    earlier by ``write_params_to_model_scores``) receives the elapsed time,
    loss and accuracy, and the file is rewritten with fixed number formats.

    Args:
        scores: sequence whose first two items are loss and accuracy.
        params: reads ``datetimenow`` and ``time_train``; the optional keys
            ``best_epochs``, ``best_batch`` and ``best_epochs2`` are stored
            when present.

    Raises:
        IndexError: if the CSV holds no row for this run.
    """
    file_path = "../csv/model_results/model_scores.csv"
    df = pd.read_csv(file_path)

    run_rows = df.index[df.datetime == params["datetimenow"]]
    if len(run_rows) == 0:
        raise IndexError(
            "no row with datetime " + str(params["datetimenow"]) + " in " + file_path
        )
    row_index = run_rows[0]

    df.loc[row_index, "time"] = params["time_train"]
    df.loc[row_index, "loss"] = scores[0]
    df.loc[row_index, "accuracy"] = scores[1]
    for w in ['best_epochs', 'best_batch', 'best_epochs2']:
        if w in params:
            df.loc[row_index, w] = str(int(params[w]))
    print("params.keys()", list(params.keys()))

    # Format whole columns by assignment rather than writing strings into
    # numeric columns through DataFrame.update.
    column_formats = [
        (["img_size", "n_photos", "batch_size", "epochs"], "{:.0f}"),
        (["loss", "accuracy"], "{:.5f}"),
        (["time"], "{:.2f}"),
    ]
    for columns, template in column_formats:
        for col in columns:
            df[col] = df[col].map(template.format)
    df.to_csv(file_path, index=False)

# Predict one photo with the trained model.
def pred_train(model, labels, photo_id, y_value, img_size):
    """Return one result row for ``photo_id``.

    Args:
        model: object with a Keras-style ``predict`` method.
        labels: sequence mapping a class index to its label.
        photo_id: id of the photo to classify.
        y_value: true class index of the photo.
        img_size: size handed to ``fetch_img_by_photo_id_and_resize``.

    Returns:
        ``[photo_id, match, y_value, predicted index, true label,
        predicted label, *probabilities]``, each probability formatted
        with five decimals.
    """
    img = fetch_img_by_photo_id_and_resize(photo_id, img_size)
    true_label = labels[int(y_value)]

    probabilities = model.predict(np.array([img]))
    formatted = ["{:.5f}".format(p) for p in probabilities.reshape(1, -1)[0].tolist()]

    pred = np.argmax(probabilities)
    pred_label = labels[pred]
    return [photo_id, true_label == pred_label, y_value, pred, true_label, pred_label, *formatted]

# Predict with the trained model and save the results as a CSV file.
def pred_train_and_save_to_csv(model, params, df_photo_ids_for_test):
    """Classify every row of ``df_photo_ids_for_test`` and write ``pred_results.csv``.

    Photos are visited in random order; the table is then sorted by
    ``match``, ``y`` and ``photo_id`` and saved under
    ``../csv/model_results/<datetimenow>/``. There is one ``#<index>``
    probability column per row of the label table.

    Args:
        model: trained model handed to ``pred_train``.
        params: reads ``df_labels_for_model``, ``img_size`` and
            ``datetimenow``.
        df_photo_ids_for_test: frame with the columns ``photo_id`` and ``y``.
    """
    labels_df = params["df_labels_for_model"]
    rand_index = np.random.permutation(np.arange(len(df_photo_ids_for_test)))
    res_arr = []
    for j, i in enumerate(rand_index):
        row = df_photo_ids_for_test.iloc[i]
        print("{0}/{1}".format(j, len(rand_index)))
        res = pred_train(model, labels_df.label.values, row.photo_id, row.y, [params["img_size"]] * 2)
        print(res)
        res_arr.append(res)

    df = pd.DataFrame(res_arr)
    probability_cols = ["#" + str(x) for x in labels_df.index.values]
    df.columns = ['photo_id', 'match', 'y', 'pred_y', 'label', 'pred_label'] + probability_cols
    df = df.sort_values(['match', 'y', 'photo_id']).reset_index(drop=True)

    # Write ids and class indices without decimals. Whole-column assignment
    # avoids pushing strings into numeric columns via DataFrame.update.
    for col in ['photo_id', 'y', 'pred_y']:
        df[col] = df[col].map("{:.0f}".format)

    file_path = '../csv/model_results/' + params["datetimenow"] + '/pred_results.csv'
    df.to_csv(file_path, index=False)

# Train the model, run predictions and save all results.
def train_model(params):
    """Run one complete training/evaluation cycle.

    Steps: record the start time and run id, log the parameters, select the
    photo ids, build and fit the model, evaluate it on the balanced test set,
    log the scores, and save per-photo predictions for ``test_all``.

    ``params`` is updated in place with ``time_i``, ``datetimenow``,
    ``n_photos``, ``time_f`` and ``time_train``.
    """
    print("train_model")

    # Set up the values needed to record this run.
    params["time_i"] = time.perf_counter()
    # The run id is a JST (UTC+9) timestamp. Asking for that zone directly
    # gives the right value whatever the host's local timezone is; adding
    # nine hours to the naive local time was only correct on a UTC host.
    jst = datetime.timezone(datetime.timedelta(hours=9))
    params["datetimenow"] = datetime.datetime.now(jst).strftime('%Y%m%d_%H%M%S')
    params["n_photos"] = fetch_n_photos(params)
    print("train_model. params\n", params)
    # Log the starting parameters to the CSV file.
    write_params_to_model_scores(params)

    # Get the y labels and photo_ids used for training and validation.
    df_photo_ids, params = fetch_df_photo_ids(params)

    # Build the model.
    model = create_model(params)
    # Train the model.
    model, params = model_fit(model, df_photo_ids, params)

    params["time_f"] = time.perf_counter()
    params["time_train"] = params["time_f"] - params["time_i"]

    # Evaluate the trained model and save the scores to the CSV file.
    X, y = fetch_fit_data(df_photo_ids['test'].photo_id, df_photo_ids['test'].y, params["img_size"], params["df_labels_for_model"])
    scores = model.evaluate(X, y, batch_size=params["batch_size"], verbose=1)
    write_model_fit_scores(scores, params)

    # Predict with the trained model and save the results to a CSV file.
    pred_train_and_save_to_csv(model, params, df_photo_ids['test_all'])

# 関数定義ここまで
# -------------------------------

# Set parameters.
# train_mode is either "model_fit" or "model_fit_by_generator".
params = {
    "train_mode": "model_fit_by_generator",
    "train_test_ratio": 0.8,
}

# Some parameter sizes caused errors, so only the values used for the
# finally adopted model are listed here.
dic_all_params = {
    "dense_repeat": [1],
    "dense_units": [1024],
    'img_size': [300],
    "n_all": [300],
    "epochs": [8],
    "batch_size": [64],
}

repeat_len = 3  # number of repetitions under identical conditions

# Build every parameter combination and turn the result into a DataFrame.
all_params = list(itertools.product(*dic_all_params.values()))
df_all_params = pd.DataFrame(all_params, columns=dic_all_params.keys())
print(df_all_params)

params["df_labels_for_model"] = pd.read_csv("../csv/labels_for_model.csv")
params["len_labels"] = len(params["df_labels_for_model"])
print("")
print(params["df_labels_for_model"])

row_nums = df_all_params.index.values
len_paramset = len(row_nums)

# Train once per parameter set, repeated repeat_len times.
for j in range(repeat_len):
    print("j{0}/{1}".format(j, repeat_len))
    for i in row_nums:
        print("\n-------------------------------------------------------")
        print("fit model: {0}/{1}".format(i, len_paramset))
        current_row = df_all_params.iloc[i]
        params.update(convert_df_row_to_dic(current_row))
        train_model(params)
	import math
	import numpy as np
	import pandas as pd
	import os
	import glob
	import cv2
	import datetime
	import time
	import itertools
	from imblearn.under_sampling import RandomUnderSampler
	from imblearn.over_sampling import RandomOverSampler

	from tensorflow import keras
	from keras.layers import Dense, Dropout, Flatten, Input
	from keras.applications.vgg16 import VGG16
	from keras.models import Model, Sequential
	from keras import optimizers
	from keras.utils import to_categorical
	from keras.utils import Sequence

	# -------------------------------
	# ここから関数定義

	# データフレームの行を辞書型に変換
	def convert_df_row_to_dic(df_row):
	params = {}
	for key in list(df_row.keys()):
	params[key] = df_row[key]

	return params

	# モデル訓練に使う画像の枚数情報を取得
	def fetch_n_photos(params):
	d = {}
	d['all'] = params["n_all"]
	d['train'] = int(params["n_all"] * params["train_test_ratio"])
	d['test'] = d['all'] - d['train']
	return d

	# モデルの訓練に使用したパラメータをcsvファイルに保存
	def write_params_to_model_scores(params):
	file_path = "../csv/model_results/model_scores.csv"
	df = pd.read_csv(file_path)
	dic_scores = {
	'datetime': params["datetimenow"],
	'dense_repeat': params['dense_repeat'],
	"dense_units": params["dense_units"],
	'img_size': params["img_size"],
	'n_photos': params["n_all"],
	'batch_size': params["batch_size"],
	'epochs': params["epochs"]
	}

	s = pd.Series(list(dic_scores.values()), index=list(dic_scores.keys()))
	df = pd.concat([df, pd.DataFrame(s).T])
	s_df = df.loc[:, ["dense_units", "img_size", "n_photos", "batch_size", "epochs"]]
	s_df = s_df.applymap(lambda x: "{:.0f}".format(x))
	df.update(s_df)
	df.to_csv(file_path, index=False)

	# 訓練に使用する画像の枚数を揃えるためのimblearnの関数に渡すパラメータsampling_strategyを取得
	def fetch_strategy(params):
	d = {}
	for w in ['train', 'test']:
	d[w] = {}
	for i in params["df_labels_for_model"].y:
	d[w][i] = params["n_photos"][w]

	return d

	# ディレクトリの確認　指定したpathがない場合に作成する
	def my_makedirs(path):
	if not os.path.isdir(path):
	os.makedirs(path)

	# モデルの訓練や検証に使用するphoto_idとラベルyのデータフレームをcsvファイルに保存
	def save_df_photo_ids_to_csv(df_photo_ids, datetimenow):
	df = df_photo_ids.copy()
	for w in ['all', 'train', 'test']:
	df[w] = df[w].applymap(int).sort_values(['y', 'photo_id']).reset_index().drop(columns="index")
	df[w].to_csv("../csv/model_results/" + datetimenow + "/data_" + w + ".csv")

	# モデルの訓練や検証時に用いるラベルとphoto_idのデータフレームを取得
	# df_photo_ids（辞書型）に以下のインデックスをつけてデータフレームを保存
	# all: モデル学習に適した手持ちデータの全て(種類で数にばらつき)
	# 訓練用と検証用のデータ数の比率からallをtrain_allとtest_allに分配
	# train_all: 訓練用データ全て(種類で数にばらつき)
	# test_all: 検証用データ全て(種類で数にばらつき)
	# train: 種類で数を揃えた訓練用データ
	# test: 種類で数を揃えた検証用データ
	def fetch_df_photo_ids(params):
	# labelとphoto_idsの対応を記載したcollections_label.csvを読み込み
	df_collections_label = pd.read_csv("../csv/collections_label.csv")
	# モデルの訓練、検証に使用したくないphoto_idの情報を記載したcsvを読み込み
	remove_photo_ids = pd.read_csv("../csv/remove_photo_ids.csv").photo_id.values

	photo_ids = []
	photo_labels = []
	# params["df_labels_for_model"].yはモデル学習時のyラベル(0 ~ 9の整数)
	for i in params["df_labels_for_model"].y:
	label_row0 = params["df_labels_for_model"][i:i+1]
	label_row = label_row0.iloc[0]

	# ラベルが一致する行を抽出し、photo_idsを取得
	row = df_collections_label.loc[df_collections_label.value == label_row.label].iloc[-1]
	print(row)
	s_photo_ids_all = list(map(int, row.photo_id.split(',')))
	# モデルの訓練、検証に使用したくないphoto_idを省く
	s_photo_ids = [photo_id for photo_id in s_photo_ids_all if photo_id not in remove_photo_ids]
	# photo_idsに追加
	photo_ids = photo_ids + s_photo_ids
	# yラベルを追加
	s_photo_labels = [i] * len(s_photo_ids)
	photo_labels = photo_labels + s_photo_labels

	df_photo_ids = {}
	# データをランダムに並べ替える
	df_photo_ids['all'] = pd.DataFrame({'photo_id': photo_ids, 'y': photo_labels}).sample(frac=1).reset_index().drop(columns="index")
	# データフレームの値をintに変換
	df_photo_ids['all'] = df_photo_ids['all'].applymap(int)

	# train, test用に空のデータフレームを準備
	for w in ['train', 'test']:
	df_photo_ids[w + "_all"] = pd.DataFrame({'photo_id': [], 'y': []})

	# ラベルyの値ごとにphoto_idを分割してデータフレームに保存
	for i in params["df_labels_for_model"].y:
	sub_df = df_photo_ids['all'].loc[df_photo_ids['all'].y == i]
	# 訓練用のサンプル数を手持ちサンプル数から算出
	j = int(len(sub_df) * params["train_test_ratio"])
	# jが設定した訓練用サンプル数を超える場合はjの値を置き換える
	if j > params["n_photos"]['train']:
	j = params["n_photos"]['train']

	# jの値を利用してデータフレームを分割してtrain_all, test_allに保存
	s_df = {}
	s_df["train_all"] = sub_df.iloc[0:j, ]
	s_df["test_all"] = sub_df.iloc[j:(j+params["n_photos"]['test']), ]
	# データフレームを結合
	for w in ['train', 'test']:
	s = w + '_all'
	df_photo_ids[s] = pd.concat([df_photo_ids[s], s_df[s]], axis=0)

	# all, train_all, test_allにおけるyラベルの数をカウント
	vc = {}
	for w in ['all', 'train_all', 'test_all']:
	vc[w] = df_photo_ids[w].y.value_counts()


	# ここからimblearnを利用してyの種類で数を揃える作業
	# RandomUnderSamplerやRandomOverSamplerに渡すパラメータ sampling_strategyの取得
	strategy = fetch_strategy(params)

	# rusを定義
	for w in ['train', 'test']:
	w_all = w + "_all"
	# 全てのyで設定したサンプル数を上回る場合はRandomUnderSamplerを利用して揃える
	if min(vc[w_all].values) > params["n_photos"][w]:
	print(w, "data: RandomUnderSampler")
	rus = RandomUnderSampler(random_state = 0, sampling_strategy = strategy[w])
	# 全てのyで設定したサンプル数と同数または下回る場合はRandomOverSamplerを利用して揃える
	else:
	print(w, "data: RandomOverSampler")
	rus = RandomOverSampler(random_state = 0, sampling_strategy = strategy[w])

	# fit_resampleを実行して結果を取得
	photo_ids, y = rus.fit_resample(np.array(df_photo_ids[w_all].photo_id).reshape(-1,1), np.array(df_photo_ids[w_all].y).reshape(-1,1))
	photo_ids, y = list(map(lambda x: x.reshape(1,-1)[0].tolist(), [photo_ids, y]))
	# データフレームに変換
	df_photo_ids[w] = pd.DataFrame({'photo_id': photo_ids, 'y': y})

	# train, testのサンプル数を取得
	for w in ['train', 'test']:
	vc[w] = df_photo_ids[w].y.value_counts()

	# 辞書型のvcをデータフレームに変換
	df_vc = pd.concat(vc.values(), axis=1)
	df_vc = df_vc.sort_index().reset_index()
	df_vc.columns = ['y', *list(vc.keys())]
	df_vc.y = list(map(int, df_vc.y))
	df_vc = pd.merge(params["df_labels_for_model"], df_vc, on="y", how="outer")
	print(df_vc)

	# サンプル数の情報を保存
	save_file_path = '../csv/model_results/' + params["datetimenow"]
	my_makedirs(save_file_path)
	df_vc.to_csv(save_file_path + "/data_value_counts.csv", index=False)

	# ラベルyとphoto_idのデータフレームを保存
	save_df_photo_ids_to_csv(df_photo_ids, params["datetimenow"])
	# バッチサイズと訓練用サンプル数から繰り返し数を算出（訓練時に使用）
	params['len_iter'] = math.ceil(len(df_photo_ids['train'])/params["batch_size"])

	return df_photo_ids, params

	# パラメータを読み込み、モデルを作成する
	def create_model(params):
	# 画像サイズ
	arr_img_size = [params["img_size"]] * 2

	# VGG16による転移学習
	# 入力の形を定義
	input_tensor = Input(shape=(*arr_img_size,3))
	# VGGモデルのインスタンス(前半部分)を作成
	# include_topをFalseにし、VGGの特徴抽出部分のみを利用
	vgg16 = VGG16(include_top=False, weights='imagenet', input_tensor=input_tensor)

	# 自作モデル(後半部分)の作成
	top_model = Sequential()
	top_model.add(Flatten(input_shape=vgg16.output_shape[1:]))

	dense_units = 0 + params["dense_units"]
	for _ in range(params["dense_repeat"]):
	top_model.add(Dense(dense_units, activation='relu'))
	top_model.add(Dropout(0.5))
	dense_units = int(dense_units/2)

	top_model.add(Dense(params["len_labels"], activation='softmax'))

	#入力はvgg.input, 出力は, top_modelにvgg16の出力を入れたもの
	model = Model(inputs=vgg16.input, outputs=top_model(vgg16.output))
	# modelの19層目までがvggのモデル
	for layer in model.layers[:19]:
	layer.trainable = False

	# モデルのコンパイル
	model.compile(optimizer=optimizers.gradient_descent_v2.SGD(learning_rate=1e-4, momentum=0.9),
	loss='categorical_crossentropy',
	metrics=['accuracy'])

	# モデルSummaryの保存
	summary_path = '../csv/model_results/' + params["datetimenow"] + '/model_summary.txt'
	with open(summary_path, "w") as fp:
	model.summary(print_fn=lambda x: fp.write(x + "\r\n"))

	return model

	# yラベルとphoto_idの情報を持つデータフレームに、ジェネレータを利用する場合に必要なバッチナンバーを付加
	def add_batch_number_to_df(df, params):
	df = df.sample(frac=1).sort_values(["y"]).reset_index().drop(columns="index")
	tmp = list(range(params["len_iter"])) * params["batch_size"]
	print("add_batch_number_to_df. len(df): {0}, len(tmp): {1}".format(len(df), len(tmp)))
	if len(tmp) > len(df):
	j = 0
	while len(tmp) > len(df):
	for y in range(params["len_labels"]):
	s = df.loc[df.y == y].iloc[j]
	df = pd.concat([df, pd.DataFrame(s).T])
	j += 1
	df = df.iloc[0:len(tmp)].sort_values(["y"]).reset_index().drop(columns="index")
	print("len(tmp): {0}, len(df): {1}".format(len(tmp), len(df)))
	df["batch"] = tmp
	return df

	# モデルの訓練時に利用するジェネレータ(メモリ節約できる)
	class generate_fit_data(Sequence):
	def __init__(self, df, params):
	self.df = df
	self.params = params

	def __len__(self):
	return int(np.ceil(len(self.df) / float(self.params["batch_size"])))

	def __getitem__(self, idx):
	s_df = self.df.loc[self.df.batch == idx]
	batch_x, batch_y = fetch_fit_data(s_df.photo_id, s_df.y, self.params['img_size'], self.params["df_labels_for_model"])

	return batch_x, batch_y

	# 画像の前処理　元画像のアスペクト比を維持したまま、縦横の大きさを揃えて、余白部分は黒にする
	def preprocess(img):
	h, w, _ = img.shape
	longest_edge = max(h, w)
	top = 0
	bottom = 0
	left = 0
	right = 0
	if h < longest_edge:
	diff_h = longest_edge - h
	top = diff_h // 2
	bottom = diff_h - top
	elif w < longest_edge:
	diff_w = longest_edge - w
	left = diff_w // 2
	right = diff_w - left
	else:
	pass

	img = cv2.copyMakeBorder(img, top, bottom, left, right,
	cv2.BORDER_CONSTANT, value=[0, 0, 0])
	return img

	# photo_idから画像のpathを取得
	def fetch_img_path_by_photo_id(photo_id):
	path = "../img/raillab/original/*/" + str(int(photo_id)) + ".jpg"
	list_path = glob.glob(path)
	return list_path[0]

	# photo_idから画像を取得 (OpenCVを利用)
	def fetch_img_by_photo_id(photo_id):
	dir = fetch_img_path_by_photo_id(photo_id)
	img = cv2.imread(dir)
	return img

	# photo_idとimg_sizeを指定して前処理済みの画像を取得
	def fetch_img_by_photo_id_and_resize(photo_id, img_size):
	photo_id = str(int(photo_id))
	img = fetch_img_by_photo_id(photo_id)
	img = preprocess(img)
	img = cv2.resize(img, img_size)
	return img

	# X (list_x), y(list_y), img_sizeを指定してモデルの訓練または検証用のデータ(画像とyラベル)を取得
	def fetch_fit_data(list_x, list_y, img_size, df_labels_for_model):
	x = np.array(list(map(lambda photo_id: fetch_img_by_photo_id_and_resize(photo_id, (img_size, img_size)), list_x)))
	len_y = len(list_y)
	tmp = list(list_y) + list(range(len(df_labels_for_model)))
	tmp = tmp[0:len_y]
	y = to_categorical(np.array(tmp))
	return x, y

	# モデル訓練用と検証用のデータ(画像とyラベル)を取得
	def fetch_train_test_data(df_photo_ids, img_size, df_labels_for_model):
	X = {}
	y = {}
	for w in ['train', 'test']:
	X[w], y[w] = fetch_fit_data(df_photo_ids[w].photo_id, df_photo_ids[w].y, img_size, df_labels_for_model)
	return X, y

	# モデルの訓練結果をcsvファイルに保存
	def save_history_to_csv(df_history, datetimenow, filename_appendix=''):
	arr0 = ['loss', 'accuracy', 'val_loss', 'val_accuracy']
	arr = [w for w in arr0 if w in df_history.columns]
	df = df_history.loc[:, arr]
	df = df.applymap(lambda x: "{:.5f}".format(x))
	df_history.update(df)
	save_path = '../csv/model_results/' + datetimenow + '/history' + filename_appendix + '.csv'
	df_history.to_csv(save_path, index=False)

	# モデルの訓練
	def model_fit(model, df_photo_ids, params):
	# ジェネレータを使う場合（メモリ節約、パラメータ、実行環境によってはバグらないとは言い切れない）
	if params["train_mode"] == "model_fit_by_generator":
	print("model_fit_by_generator")
	list_df = add_batch_number_to_df(df_photo_ids['train'], params)
	gen = generate_fit_data(list_df, params)
	X_test, y_test = fetch_fit_data(df_photo_ids['test'].photo_id, df_photo_ids['test'].y, params["img_size"], params["df_labels_for_model"])
	history = model.fit(x = gen, batch_size=params["batch_size"], epochs=params["epochs"], validation_data=(X_test, y_test))
	# ジェネレータを使わない場合（パラメータ、実行環境によってはバグる可能性がある）
	else:
	print("model_fit")
	X, y = fetch_train_test_data(df_photo_ids, params["img_size"], params["df_labels_for_model"])
	history = model.fit(X['train'], y['train'], batch_size=params["batch_size"], epochs=params["epochs"], validation_data=(X['test'], y['test']))

	# モデルの訓練結果を取得してcsvファイルに保存
	df_history = pd.DataFrame(history.history)
	df_history.insert(loc=0, column='epoch', value=range(len(df_history) + 1)[1:(len(df_history)+1)])
	print(df_history)
	save_history_to_csv(df_history, params["datetimenow"])

	# モデルをh5ファイルで保存
	model_filename = '../models/model_train_' + params["datetimenow"] + '.h5'
	model.save(model_filename)

	return model, params

	# モデル訓練時に使用したパラメータやモデル精度（スコア）などをcsvファイルに保存
	def write_model_fit_scores(scores, params):
	file_path = "../csv/model_results/model_scores.csv"
	df = pd.read_csv(file_path)
	r = df.loc[df.datetime == params["datetimenow"]]
	row_index = list(r.index)[0]
	df.loc[row_index, "time"] = params["time_train"]
	df.loc[row_index, "loss"] = scores[0]
	df.loc[row_index, "accuracy"] = scores[1]
	for w in ['best_epochs', 'best_batch', 'best_epochs2']:
	if w in list(params.keys()):
	df.loc[row_index, w] = str(int(params[w]))
	print("params.keys()", list(params.keys()))
	s_df = df.loc[:, ["img_size", "n_photos", "batch_size", "epochs"]]
	s_df = s_df.applymap(lambda x: "{:.0f}".format(x))
	df.update(s_df)
	s_df = df.loc[:, ["loss", "accuracy"]]
	s_df = s_df.applymap(lambda x: "{:.5f}".format(x))
	df.update(s_df)
	s_df = df.loc[:, ["time"]]
	s_df = s_df.applymap(lambda x: "{:.2f}".format(x))
	df.update(s_df)
	df.to_csv(file_path, index=False)

	# 学習済みモデルから予測
	def pred_train(model, labels, photo_id, y_value, img_size):
	img = fetch_img_by_photo_id_and_resize(photo_id, img_size)
	photo_label = labels[int(y_value)]
	pred_all = model.predict(np.array([img]))
	pred_all2 = list(map(lambda x: "{:.5f}".format(x), pred_all.reshape(1,-1)[0].tolist()))
	pred = np.argmax(pred_all)
	res = [photo_id, photo_label == labels[pred], y_value, pred, photo_label, labels[pred], *pred_all2]
	return res

	# 学習済みモデルから予測して結果をcsvファイルに保存
	def pred_train_and_save_to_csv(model, params, df_photo_ids_for_test):
	rand_index = np.random.permutation(np.arange(len(df_photo_ids_for_test)))
	res_arr = []
	for j, i in enumerate(rand_index):
	row = df_photo_ids_for_test.iloc[i]
	photo_id = row.photo_id
	y_value = row.y
	print("{0}/{1}".format(j, len(rand_index)))
	res = pred_train(model, params["df_labels_for_model"].label.values, photo_id, y_value, [params["img_size"]] * 2)
	print(res)
	res_arr.append(res)

	df = pd.DataFrame(res_arr)
	col_names = ['photo_id', 'match', 'y', 'pred_y', 'label', 'pred_label'] + list(map(lambda x: "#" + str(x), params["df_labels_for_model"].index.values))
	df.columns = col_names
	df = df.sort_values(['match', 'y', 'photo_id']).reset_index().drop(columns="index")
	df_r = df.loc[:, ['photo_id', 'y', 'pred_y']]
	df_r = df_r.applymap(lambda x: ("{:.0f}".format(x)))
	df.update(df_r)
	file_path = '../csv/model_results/' + params["datetimenow"] + '/pred_results.csv'
	df.to_csv(file_path, index=False)

	# モデルの訓練と予測を行い、結果を保存
	def train_model(params):
	print("train_model")

	# モデル訓練の記録に必要な各パラメータを設定または取得
	params["time_i"] = time.perf_counter()
	params["datetimenow"] = (datetime.datetime.now() + datetime.timedelta(hours=9)).strftime('%Y%m%d_%H%M%S')
	params["n_photos"] = fetch_n_photos(params)
	print("train_model. params\n", params)
	# 開始時のパラメータをcsvファイルに保存
	write_params_to_model_scores(params)

	# 訓練、検証に用いるyラベルとphoto_idを取得
	df_photo_ids, params = fetch_df_photo_ids(params)

	# モデルの作成
	model = create_model(params)
	# モデルの訓練
	model, params = model_fit(model, df_photo_ids, params)

	params["time_f"] = time.perf_counter()
	params["time_train"] = params["time_f"] - params["time_i"]

	# 訓練済みモデルの精度を評価して結果をcsvファイルに保存
	X, y = fetch_fit_data(df_photo_ids['test'].photo_id, df_photo_ids['test'].y, params["img_size"], params["df_labels_for_model"])
	scores = model.evaluate(X, y, batch_size=params["batch_size"], verbose=1)
	write_model_fit_scores(scores, params)

	# 訓練済みモデルから予測して結果をcsvファイルに保存
	pred_train_and_save_to_csv(model, params, df_photo_ids['test_all'])

	# 関数定義ここまで
	# -------------------------------

	# set parameters
	params = {}
	# train_modeは"model_fit"、または"model_fit_by_generator"
	params["train_mode"] = "model_fit_by_generator"
	params["train_test_ratio"] = 0.8

	# パラメータのサイズによってはエラーがでた
	# 最終的に採用したモデルに適用したパラメータのみを記載
	dic_all_params = {
	"dense_repeat": [1],
	"dense_units": [1024],
	'img_size': [300],
	"n_all": [300],
	"epochs": [8],
	"batch_size": [64]
	}

	repeat_len = 3 # 同条件での繰り返し数

	# パラメータの組み合わせを作成してデータフレームに変換
	all_params = list(itertools.product(*list(dic_all_params.values())))
	df_all_params = pd.DataFrame(all_params, columns=dic_all_params.keys())
	print(df_all_params)

	params["df_labels_for_model"] = pd.read_csv("../csv/labels_for_model.csv")
	params["len_labels"] = len(params["df_labels_for_model"])
	print("")
	print(params["df_labels_for_model"])

	len_paramset = len(df_all_params.index.values)
	row_nums = df_all_params.index.values

	for j in range(repeat_len):
	print("j{0}/{1}".format(j, repeat_len))
	for i in row_nums:
	print("\n-------------------------------------------------------")
	print("fit model: {0}/{1}".format(i, len_paramset))
	params.update(convert_df_row_to_dic(df_all_params.iloc[i]))
	train_model(params)