Intent classifier for a chatbot built on top of BERT
# coding: utf-8
"""
Trainer for a chatbot intent classifier: a neural network on top of BERT.

13.07.2019 first implementation
13.07.2019 added grid search for tuning the network hyperparameters
20.07.2019 reworked to consume nlu.md directly
26.07.2019 f1_weighted is now used as the cross-validation metric
"""
from __future__ import print_function

import argparse
import io
import json
import os

import numpy as np
import pandas as pd

import sklearn.metrics
from sklearn.model_selection import cross_val_score

from keras.callbacks import EarlyStopping
from keras.layers import Input
from keras.layers import Dropout
from keras.layers.core import Dense
from keras.models import Model
from keras.models import model_from_json
from keras.wrappers.scikit_learn import KerasClassifier

from bert_embedder2 import BERTEmbedder
NFOLD = 8


def get_params_str(model_params):
    return ' '.join('{}={}'.format(k, v) for (k, v) in model_params.items())
def prepare_phrase(phrase):
    # Put spaces around punctuation so that it is tokenized separately.
    for delim in u'?,!«»"()':
        phrase = phrase.replace(delim, ' ' + delim + ' ')

    if phrase.endswith('.'):
        phrase = phrase[:-1]

    # Collapse the double spaces introduced by the padding above.
    phrase = phrase.replace('  ', ' ').strip()
    return phrase
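
# Illustrative sanity check (not part of the original gist), showing the
# effect of prepare_phrase on two made-up inputs:
#   prepare_phrase(u'Как дела?')  -> u'Как дела ?'
#   prepare_phrase(u'Привет.')    -> u'Привет'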
def load_data(dataset_path, embedder):
    samples = set()
    with io.open(dataset_path, 'r', encoding='utf-8') as rdr:
        current_intent = None
        for iline, line in enumerate(rdr):
            if line.startswith('#'):
                if line.startswith('##'):
                    if 'intent:' in line:
                        current_intent = line.split(':')[1].strip()
                    else:
                        raise RuntimeError(u'Unexpected section header in line #{}: {}'.format(iline, line.strip()))
                else:
                    # Skip comment lines.
                    continue
            else:
                line = line.strip()
                if line.startswith('-'):  # in RASA files the sample lines start with "-"
                    line = line[1:]

                if line:
                    if current_intent:
                        phrase = prepare_phrase(line)
                        samples.add((phrase, current_intent))
                    else:
                        print('line #{}: Current intent is "None"!'.format(iline))
                        exit(1)

    df = pd.DataFrame(list(samples), columns=['phrase', 'intent'])

    labels = df['intent'].values
    phrases = df['phrase'].values

    label2index = dict((label, i) for (i, label) in enumerate(set(labels)))

    # One-hot encode the intent labels.
    y_data = np.zeros((len(phrases), len(label2index)))
    for i, label in enumerate(labels):
        y_data[i, label2index[label]] = 1

    X_data = embedder(phrases)

    return X_data, y_data, label2index
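
# Illustrative sketch of the dataset layout that load_data expects (RASA
# markdown nlu format); the intent names and utterances are made-up examples:
#
#   ## intent:greeting
#   - привет
#   - здравствуйте
#
#   ## intent:goodbye
#   - пока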
def scorer(estimator, X, y):
    # KerasClassifier.predict returns class indices, while y is one-hot
    # encoded, hence the argmax over the true labels.
    y_pred = estimator.predict(X)
    #return sklearn.metrics.accuracy_score(y_true=np.argmax(y, axis=-1), y_pred=y_pred)
    return sklearn.metrics.f1_score(y_true=np.argmax(y, axis=-1), y_pred=y_pred, average='weighted')
def create_model(x_dim, nb_labels, model_params):
    input = Input(shape=(x_dim,), dtype='float32', name='input')
    net = input

    optimizer = model_params['optimizer']
    units1 = model_params['units1']
    units2 = model_params['units2']
    activ1 = model_params['activ1']
    dropout_rate = model_params['dropout_rate']

    # Up to two dense blocks on top of the BERT sentence embedding, each
    # optionally followed by dropout; a block is skipped when its size is 0.
    if units1 > 0:
        net = Dense(units=units1, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    if units2 > 0:
        net = Dense(units=units2, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    net = Dense(units=nb_labels, activation='softmax')(net)

    model = Model(inputs=[input], outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
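
# Illustrative call with hypothetical values; 768 is the hidden size of the
# BERT-base models referenced below (…_H-768_…), so it is the expected x_dim:
#
#   model = create_model(x_dim=768, nb_labels=10,
#                        model_params={'optimizer': 'nadam', 'units1': 390, 'units2': 0,
#                                      'activ1': 'sigmoid', 'dropout_rate': 0.1})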
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_mode', type=str, default='gridsearch', choices='train query gridsearch'.split())
    parser.add_argument('--tmp', type=str, default='../../../tmp')
    parser.add_argument('--dataset', default='../../../data/intents.txt')

    args = parser.parse_args()
    tmp_dir = args.tmp
    run_mode = args.run_mode
    dataset_path = args.dataset

    weights_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.weights'))
    arch_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.arch'))
    config_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.config'))

    max_seq_len = 40

    # The pretrained BERT model archive is assumed to be downloaded and unpacked.
    # Google's multilingual model:
    # bert_path = '/home/inkoziev/polygon/BERT_multilingual/model/multi_cased_L-12_H-768_A-12'
    # DeepPavlov ruBERT:
    bert_path = '/mnt/7383b08e-ace3-49d3-8991-5b9aa07d2596/EmbeddingModels/BERT_multilingual/model/rubert_cased_L-12_H-768_A-12_v1'

    embedder = BERTEmbedder(model_path=bert_path, seq_len=max_seq_len)
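    # bert_embedder2.BERTEmbedder is a local helper module that is not shown in
    # this gist. From its usage here it is expected to be callable on a list of
    # phrases and return a 2D feature matrix of shape (num_phrases, embedding_dim).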
    if run_mode in ('gridsearch', 'train'):
        X_data, y_data, label2index = load_data(dataset_path, embedder)

    if run_mode == 'gridsearch':
        best_params = None
        best_score = 0.0

        for epochs in [8, 10, 20]:
            for batch_size in [20, 50, 100]:  # 100, 50,
                for optimizer in ['nadam']:  # 'rmsprop', 'adam',
                    for units1 in [200, 390, 500]:
                        for units2 in [0]:
                            for activ1 in ['sigmoid']:
                                for dropout_rate in [0.0, 0.1]:
                                    sk_params = {'epochs': epochs, 'batch_size': batch_size, 'verbose': 0,
                                                 #'validation_split': 0.2,
                                                 #'callbacks': [EarlyStopping(monitor='val_loss', patience=10, mode='auto')],
                                                 }

                                    model_params = sk_params.copy()
                                    model_params['optimizer'] = optimizer
                                    model_params['units1'] = units1
                                    model_params['units2'] = units2
                                    model_params['activ1'] = activ1
                                    model_params['dropout_rate'] = dropout_rate

                                    estimator = KerasClassifier(build_fn=lambda: create_model(X_data.shape[1],
                                                                                              len(label2index),
                                                                                              model_params),
                                                                **sk_params)

                                    cv_res = cross_val_score(estimator, X_data, y_data,
                                                             scoring=scorer, cv=NFOLD, n_jobs=1,
                                                             verbose=1)
                                    cv_score = np.mean(cv_res)

                                    print('{} ==> cv score={}'.format(get_params_str(model_params), cv_score))
                                    if cv_score > best_score:
                                        print('!!! NEW BEST !!! score={} for {}'.format(cv_score, get_params_str(model_params)))
                                        best_score = cv_score
                                        best_params = model_params
                                    else:
                                        print('No improvement over current best_score={}'.format(best_score))

        print('best_score={} params={}'.format(best_score, get_params_str(best_params)))
    elif run_mode == 'train':
        # epochs=50 batch_size=20 verbose=0 optimizer=nadam units1=390 units2=0 activ1=sigmoid dropout_rate=0.1
        model_params = {}
        model_params['optimizer'] = 'nadam'
        model_params['units1'] = 390
        model_params['units2'] = 0
        model_params['activ1'] = 'sigmoid'
        model_params['dropout_rate'] = 0.1
        epochs = 50
        batch_size = 20

        model = create_model(X_data.shape[1], len(label2index), model_params)
        with open(arch_file, 'w') as f:
            f.write(model.to_json())

        model.fit(X_data, y_data, epochs=epochs, batch_size=batch_size, verbose=2)
        model.save_weights(weights_file)

        config = {'max_seq_len': max_seq_len,
                  'bert_path': bert_path,
                  'label2index': label2index,
                  'weights': weights_file,
                  'arch': arch_file}
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=4)
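
        # The saved config file will look roughly like this (paths and labels
        # depend on your setup; made-up values shown):
        #
        #   {
        #       "max_seq_len": 40,
        #       "bert_path": "/path/to/rubert_cased_L-12_H-768_A-12_v1",
        #       "label2index": {"greeting": 0, "goodbye": 1},
        #       "weights": "/path/to/tmp/intent_classifier_bert.weights",
        #       "arch": "/path/to/tmp/intent_classifier_bert.arch"
        #   }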
    elif run_mode == 'query':
        with open(config_file, 'r') as f:
            model_config = json.load(f)

        max_seq_len = model_config['max_seq_len']
        label2index = model_config['label2index']
        index2label = dict((i, l) for (l, i) in label2index.items())

        with open(arch_file, 'r') as f:
            model = model_from_json(f.read())

        model.load_weights(weights_file)

        while True:
            phrase = input(':> ').strip()
            phrase = prepare_phrase(phrase)
            X_data = embedder([phrase])
            y_pred = model.predict(X_data, verbose=0)
            label = index2label[np.argmax(y_pred[0])]
            print(u'label={}'.format(label))
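
# Typical invocations, assuming the gist is saved under a hypothetical file
# name such as train_intent_classifier.py:
#
#   python train_intent_classifier.py --run_mode gridsearch --dataset ../../../data/intents.txt
#   python train_intent_classifier.py --run_mode train
#   python train_intent_classifier.py --run_mode query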