Created
September 6, 2019 08:03
-
-
Save Koziev/f136361b8563db863637faac01f13760 to your computer and use it in GitHub Desktop.
Intent classifier using ELMo embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
""" | |
Тренер классификатора интентов для чатбота - нейросетка поверх ELMO. | |
05.09.2019 первая реализация, за основу взят код train_intent_classifier_bert.py | |
""" | |
from __future__ import print_function

import argparse
import csv
import io
import itertools
import json
import os
import platform

import numpy as np
import pandas as pd
import tqdm
from scipy.sparse import lil_matrix
import sklearn.metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

import keras.callbacks
import keras.regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import Lambda
from keras.layers import recurrent
from keras.layers.core import Dense
from keras.layers.merge import concatenate, add, multiply
from keras.layers.normalization import BatchNormalization
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras.models import model_from_json
from keras.wrappers.scikit_learn import KerasClassifier

from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
import rutokenizer
NFOLD = 8 | |
def get_params_str(model_params):
    """Render a parameter dict as a single 'key=value key=value ...' string."""
    parts = []
    for key, value in model_params.items():
        parts.append('{}={}'.format(key, value))
    return ' '.join(parts)
def prepare_phrase(phrase):
    """Normalize one phrase for tokenization.

    Pads selected punctuation with spaces, removes a single trailing
    period, and collapses runs of whitespace.

    :param phrase: raw phrase text (may be empty)
    :return: normalized phrase string
    """
    for delim in u'?,!«»"()':
        phrase = phrase.replace(delim, ' ' + delim + ' ')
    # endswith() is safe on an empty string; the original phrase[-1]
    # raised IndexError for empty input.
    if phrase.endswith('.'):
        phrase = phrase[:-1]
    # Collapse the runs of spaces introduced by the delimiter padding
    # (the original single-space replace() was a no-op).
    phrase = ' '.join(phrase.split())
    return phrase
def load_data(dataset_path, embedder):
    """Load a RASA-style intents file and vectorize the phrases with ELMo.

    The file format: '## intent:NAME' headers start a section, '#' lines
    are comments, and sample lines (optionally prefixed with '-') belong
    to the current intent.

    :param dataset_path: path to the intents text file (utf-8)
    :param embedder: callable mapping [token_list] -> embedding matrix
        (assumes shape (1, dim) per phrase — TODO confirm against ELMoEmbedder)
    :return: (X_data, y_data, label2index) — float32 phrase embeddings,
        one-hot label matrix, and intent-name -> column-index mapping
    :raises RuntimeError: on malformed headers or samples before any header
    """
    samples = set()
    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()
    with io.open(dataset_path, 'r', encoding='utf-8') as rdr:
        current_intent = None
        for iline, line in enumerate(rdr):
            if line.startswith('#'):
                if line.startswith('##'):
                    if 'intent:' in line:
                        current_intent = line.split(':')[1].strip()
                    else:
                        # was a bare RuntimeError() with no context
                        raise RuntimeError('line #{}: "##" header without "intent:" marker'.format(iline))
                else:
                    # plain comment line — skip
                    continue
            else:
                line = line.strip()
                if line.startswith('-'):  # RASA sample lines start with '-'
                    line = line[1:]
                if line:
                    if current_intent is None:
                        # was exit(0): an error path must not exit with success status
                        raise RuntimeError('line #{}: sample occurs before any "## intent:" header'.format(iline))
                    samples.add((prepare_phrase(line), current_intent))

    samples = list(samples)
    # Build the frame in one shot; row-by-row DataFrame.append is
    # deprecated and removed in modern pandas, and was O(n^2) anyway.
    df = pd.DataFrame(samples, columns=['phrase', 'intent'])
    labels = df['intent'].values
    phrases = df['phrase'].values

    # sorted() makes the label -> index mapping deterministic across runs
    # (plain set iteration order is not stable between processes).
    label2index = dict((label, i) for (i, label) in enumerate(sorted(set(labels))))
    y_data = np.zeros((len(phrases), len(label2index)))
    for i, label in enumerate(labels):
        y_data[i, label2index[label]] = 1

    # Probe one phrase to discover the embedding dimensionality.
    x0 = embedder([tokenizer.tokenize(phrases[0])])
    xdim = x0.shape[1]

    X_data = np.zeros((len(phrases), xdim), dtype=np.float32)
    for i, phrase in tqdm.tqdm(enumerate(phrases), total=len(phrases), desc='ELMO vectorization'):
        X_data[i] = embedder([tokenizer.tokenize(phrase)])[0]

    return X_data, y_data, label2index
def scorer(estimator, X, y):
    """Weighted-F1 scorer for cross_val_score.

    y is one-hot encoded; the estimator predicts class indices, so the
    true labels are recovered with argmax before scoring.
    """
    predicted = estimator.predict(X)
    true_labels = np.argmax(y, axis=-1)
    return sklearn.metrics.f1_score(y_true=true_labels, y_pred=predicted, average='weighted')
def create_model(x_dim, nb_labels, model_params):
    """Build and compile a softmax MLP classifier over fixed-size ELMo vectors.

    model_params supplies: 'optimizer', 'units1', 'units2', 'activ1',
    'dropout_rate'. A hidden layer is added only when its unit count is
    positive; dropout follows each hidden layer only when the rate is
    positive. Both hidden layers share the 'activ1' activation.
    """
    inp = Input(shape=(x_dim,), dtype='float32', name='input')
    net = inp

    optimizer = model_params['optimizer']
    activation = model_params['activ1']
    dropout_rate = model_params['dropout_rate']

    # The two optional hidden layers are built identically, so fold them
    # into a loop over their parameter keys.
    for units_key in ('units1', 'units2'):
        units = model_params[units_key]
        if units > 0:
            net = Dense(units=units, activation=activation)(net)
            if dropout_rate > 0.0:
                net = Dropout(rate=dropout_rate)(net)

    net = Dense(units=nb_labels, activation='softmax')(net)

    model = Model(inputs=[inp], outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--run_mode', type=str, default='gridsearch', choices='train query gridsearch'.split())
    parser.add_argument('--tmp', type=str, default='../../../tmp')
    parser.add_argument('--dataset', default='../../../data/intents.txt')
    args = parser.parse_args()

    tmp_dir = args.tmp
    run_mode = args.run_mode
    dataset_path = args.dataset

    # Artifact paths (names shared with the BERT-based trainer this script
    # was derived from — see the module docstring).
    weights_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.weights'))
    arch_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.arch'))
    config_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.config'))

    max_seq_len = 40
    num_features = 1024  # ELMo embedding dimensionality

    # model_fn = "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps"
    # model_fn = "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz"
    model_fn = '/mnt/7383b08e-ace3-49d3-8991-5b9aa07d2596/EmbeddingModels/deeppavlov_data/elmo_model'
    embedder = ELMoEmbedder(model_fn, dim=num_features)

    if run_mode in ('gridsearch', 'train'):
        X_data, y_data, label2index = load_data(dataset_path, embedder)

    if run_mode == 'gridsearch':
        best_params = None
        best_score = 0.0
        # itertools.product flattens the original 7-deep nested loops.
        grid = itertools.product([10, 20, 40],      # epochs
                                 [50, 100, 200],    # batch_size
                                 ['nadam'],         # optimizer ('rmsprop', 'adam' tried earlier)
                                 [200, 390, 500],   # units1
                                 [0],               # units2
                                 ['sigmoid'],       # activ1
                                 [0.0, 0.1])        # dropout_rate
        for epochs, batch_size, optimizer, units1, units2, activ1, dropout_rate in grid:
            sk_params = {'epochs': epochs, 'batch_size': batch_size, 'verbose': 0}
            model_params = sk_params.copy()
            model_params['optimizer'] = optimizer
            model_params['units1'] = units1
            model_params['units2'] = units2
            model_params['activ1'] = activ1
            model_params['dropout_rate'] = dropout_rate

            # Bind model_params as a default argument: a bare closure would
            # late-bind the loop variable.
            estimator = KerasClassifier(build_fn=lambda mp=model_params: create_model(X_data.shape[1],
                                                                                      len(label2index),
                                                                                      mp),
                                        **sk_params)
            cv_res = cross_val_score(estimator, X_data, y_data,
                                     scoring=scorer, cv=NFOLD, n_jobs=1,
                                     verbose=1)
            cv_score = np.mean(cv_res)
            print('{} ==> cv score={}'.format(get_params_str(model_params), cv_score))
            if cv_score > best_score:
                print('!!! NEW BEST !!! score={} for {}'.format(cv_score, get_params_str(model_params)))
                best_score = cv_score
                best_params = model_params
            else:
                print('No improvement over current best_score={}'.format(best_score))

        # Guard: best_params stays None when no cell ever beats 0.0; the
        # original unconditionally formatted it and crashed with TypeError.
        if best_params is None:
            print('Grid search finished without any positive score')
        else:
            print('best_score={} params={}'.format(best_score, get_params_str(best_params)))
    else:
        # 'train' and 'query' are accepted by the CLI but not implemented
        # in this script yet; say so instead of exiting silently.
        print('run_mode "{}" is not implemented'.format(run_mode))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment