@Koziev · Created September 6, 2019 07:57
Intent classifier for a chatbot, built on top of BERT
# coding: utf-8
"""
Trainer for the chatbot intent classifier - a neural net on top of BERT.
13.07.2019 first implementation
13.07.2019 added gridsearch for tuning the network hyperparameters
20.07.2019 reworked to consume nlu.md directly
26.07.2019 f1_weighted is now used as the cross-validation metric
"""
from __future__ import print_function

import argparse
import io
import itertools
import json
import os

import numpy as np
import pandas as pd

import sklearn.metrics
from sklearn.model_selection import cross_val_score

from keras.callbacks import EarlyStopping  # referenced by the commented-out callbacks option below
from keras.layers import Input, Dense, Dropout
from keras.models import Model, model_from_json
from keras.wrappers.scikit_learn import KerasClassifier

from bert_embedder2 import BERTEmbedder

NFOLD = 8

def get_params_str(model_params):
    return ' '.join('{}={}'.format(k, v) for (k, v) in model_params.items())


def prepare_phrase(phrase):
    # Put spaces around punctuation so the tokenizer sees them as separate tokens.
    for delim in u'?,!«»"()':
        phrase = phrase.replace(delim, ' ' + delim + ' ')
    if phrase and phrase[-1] == '.':
        phrase = phrase[:-1]
    phrase = phrase.replace('  ', ' ').strip()
    return phrase

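
# For reference, the dataset format that load_data() below expects is the RASA
# markdown (nlu.md) intent layout. Inferred from the parser, a file looks
# roughly like this (the intent names and phrases here are illustrative):
#
#   ## intent:greeting
#   - hello
#   - good afternoon
#
#   ## intent:goodbye
#   - bye
#
# Lines starting with a single '#' are treated as comments and skipped.
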

def load_data(dataset_path, embedder):
    samples = set()
    with io.open(dataset_path, 'r', encoding='utf-8') as rdr:
        current_intent = None
        for iline, line in enumerate(rdr):
            if line.startswith('#'):
                if line.startswith('##'):
                    if 'intent:' in line:
                        current_intent = line.split(':')[1].strip()
                    else:
                        raise RuntimeError('Unexpected section header in line #{}: {}'.format(iline, line.strip()))
                else:
                    # Skip comment lines.
                    continue
            else:
                line = line.strip()
                if line.startswith('-'):  # in RASA files the sample lines start with '-'
                    line = line[1:]
                if line:
                    if current_intent:
                        phrase = prepare_phrase(line)
                        samples.add((phrase, current_intent))
                    else:
                        print('line #{}: Current intent is "None"!'.format(iline))
                        exit(1)

    samples = list(samples)
    df = pd.DataFrame(samples, columns=['phrase', 'intent'])
    labels = df['intent'].values
    phrases = df['phrase'].values

    # Build a one-hot target matrix over the set of intent labels.
    label2index = dict((label, i) for (i, label) in enumerate(set(labels)))
    y_data = np.zeros((len(phrases), len(label2index)))
    for i, label in enumerate(labels):
        y_data[i, label2index[label]] = 1

    X_data = embedder(phrases)
    return X_data, y_data, label2index


def scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    #return sklearn.metrics.accuracy_score(y_true=np.argmax(y, axis=-1), y_pred=y_pred)
    return sklearn.metrics.f1_score(y_true=np.argmax(y, axis=-1), y_pred=y_pred, average='weighted')

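
# Note on the scorer above: KerasClassifier.predict() returns integer class
# indices, while y arrives one-hot encoded, hence the argmax before computing
# the weighted F1. Illustrative example: y=[[1, 0], [0, 1]] becomes y_true=[0, 1].
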

def create_model(x_dim, nb_labels, model_params):
    # A simple feed-forward classifier on top of precomputed BERT sentence
    # embeddings: up to two dense layers (units1, units2; 0 disables a layer),
    # optional dropout after each, and a softmax output over the intent labels.
    input_layer = Input(shape=(x_dim,), dtype='float32', name='input')
    net = input_layer

    optimizer = model_params['optimizer']
    units1 = model_params['units1']
    units2 = model_params['units2']
    activ1 = model_params['activ1']
    dropout_rate = model_params['dropout_rate']

    if units1 > 0:
        net = Dense(units=units1, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    if units2 > 0:
        net = Dense(units=units2, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    net = Dense(units=nb_labels, activation='softmax')(net)

    model = Model(inputs=[input_layer], outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

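
# A minimal standalone sketch of building the network (the 768 input dimension
# matches the BERT base hidden size; the label count of 10 is hypothetical):
#
#   model = create_model(x_dim=768, nb_labels=10,
#                        model_params={'optimizer': 'nadam', 'units1': 390,
#                                      'units2': 0, 'activ1': 'sigmoid',
#                                      'dropout_rate': 0.1})
#   model.summary()
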

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_mode', type=str, default='gridsearch', choices='train query gridsearch'.split())
    parser.add_argument('--tmp', type=str, default='../../../tmp')
    parser.add_argument('--dataset', default='../../../data/intents.txt')
    args = parser.parse_args()

    tmp_dir = args.tmp
    run_mode = args.run_mode
    dataset_path = args.dataset

    weights_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.weights'))
    arch_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.arch'))
    config_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.config'))

    max_seq_len = 40

    # It is assumed that the pretrained BERT model archive has already been
    # downloaded and unpacked.
    # Google's multilingual model:
    # bert_path = '/home/inkoziev/polygon/BERT_multilingual/model/multi_cased_L-12_H-768_A-12'
    # DeepPavlov ruBERT:
    bert_path = '/mnt/7383b08e-ace3-49d3-8991-5b9aa07d2596/EmbeddingModels/BERT_multilingual/model/rubert_cased_L-12_H-768_A-12_v1'

    embedder = BERTEmbedder(model_path=bert_path, seq_len=max_seq_len)
    if run_mode in ('gridsearch', 'train'):
        X_data, y_data, label2index = load_data(dataset_path, embedder)

    if run_mode == 'gridsearch':
        best_params = None
        best_score = 0.0

        # Exhaustive search over the hyperparameter grid.
        param_grid = itertools.product([8, 10, 20],      # epochs
                                       [20, 50, 100],    # batch_size
                                       ['nadam'],        # optimizer; also tried: 'rmsprop', 'adam'
                                       [200, 390, 500],  # units1
                                       [0],              # units2
                                       ['sigmoid'],      # activ1
                                       [0.0, 0.1])       # dropout_rate

        for epochs, batch_size, optimizer, units1, units2, activ1, dropout_rate in param_grid:
            sk_params = {'epochs': epochs, 'batch_size': batch_size, 'verbose': 0,
                         # 'validation_split': 0.2,
                         # 'callbacks': [EarlyStopping(monitor='val_loss', patience=10, mode='auto')],
                         }
            model_params = sk_params.copy()
            model_params['optimizer'] = optimizer
            model_params['units1'] = units1
            model_params['units2'] = units2
            model_params['activ1'] = activ1
            model_params['dropout_rate'] = dropout_rate

            estimator = KerasClassifier(build_fn=lambda: create_model(X_data.shape[1],
                                                                      len(label2index),
                                                                      model_params),
                                        **sk_params)
            cv_res = cross_val_score(estimator, X_data, y_data,
                                     scoring=scorer, cv=NFOLD, n_jobs=1,
                                     verbose=1)
            cv_score = np.mean(cv_res)
            print('{} ==> cv score={}'.format(get_params_str(model_params), cv_score))
            if cv_score > best_score:
                print('!!! NEW BEST !!! score={} for {}'.format(cv_score, get_params_str(model_params)))
                best_score = cv_score
                best_params = model_params
            else:
                print('No improvement over current best_score={}'.format(best_score))

        print('best_score={} params={}'.format(best_score, get_params_str(best_params)))
    elif run_mode == 'train':
        # Best hyperparameters found by gridsearch:
        # epochs=50 batch_size=20 verbose=0 optimizer=nadam units1=390 units2=0 activ1=sigmoid dropout_rate=0.1
        model_params = {'optimizer': 'nadam',
                        'units1': 390,
                        'units2': 0,
                        'activ1': 'sigmoid',
                        'dropout_rate': 0.1}
        epochs = 50
        batch_size = 20

        model = create_model(X_data.shape[1], len(label2index), model_params)
        with open(arch_file, 'w') as f:
            f.write(model.to_json())

        model.fit(X_data, y_data, epochs=epochs, batch_size=batch_size, verbose=2)
        model.save_weights(weights_file)

        config = {'max_seq_len': max_seq_len,
                  'bert_path': bert_path,
                  'label2index': label2index,
                  'weights': weights_file,
                  'arch': arch_file}
        with open(config_file, 'w') as f:
            json.dump(config, f, indent=4)
    elif run_mode == 'query':
        with open(config_file, 'r') as f:
            model_config = json.load(f)

        max_seq_len = model_config['max_seq_len']
        label2index = model_config['label2index']
        index2label = dict((i, l) for (l, i) in label2index.items())

        with open(arch_file, 'r') as f:
            model = model_from_json(f.read())
        model.load_weights(weights_file)

        while True:
            phrase = input(':> ').strip()
            phrase = prepare_phrase(phrase)
            X_data = embedder([phrase])
            y_pred = model.predict(X_data, verbose=0)
            label = index2label[np.argmax(y_pred[0])]
            print(u'label={}'.format(label))
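
# Typical invocations (the script file name is illustrative; the flags are
# those defined by the argparse setup above):
#   python bert_intent_classifier.py --run_mode gridsearch --dataset ../../../data/intents.txt
#   python bert_intent_classifier.py --run_mode train
#   python bert_intent_classifier.py --run_mode query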