Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Intent classifier using ELMo embeddings
# coding: utf-8
"""
Тренер классификатора интентов для чатбота - нейросетка поверх ELMO.
05.09.2019 первая реализация, за основу взят код train_intent_classifier_bert.py
"""
from __future__ import print_function
import numpy as np
import argparse
import platform
import io
import pandas as pd
import csv
import os
import json
import tqdm
from scipy.sparse import lil_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics
import keras.callbacks
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, AveragePooling1D
from keras.layers import Input
from keras.layers import Lambda
from keras.layers import recurrent
from keras.layers import Dropout
from keras.layers.core import Dense
from keras.layers.merge import concatenate, add, multiply
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras.models import model_from_json
from keras.layers.normalization import BatchNormalization
from keras.layers import Flatten
import keras.regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
import rutokenizer
# Number of cross-validation folds passed as `cv` to cross_val_score
# when each grid-search candidate is evaluated.
NFOLD = 8
def get_params_str(model_params):
    """Render a parameter mapping as one ``key=value key=value`` line.

    Pairs are emitted in the mapping's own iteration order and separated
    by a single space; an empty mapping yields an empty string.
    """
    rendered = []
    for name, value in model_params.items():
        rendered.append('{}={}'.format(name, value))
    return ' '.join(rendered)
def prepare_phrase(phrase):
    """Normalize a raw sample phrase before it is tokenized.

    Steps, in order:
      1. Each of the characters ``? , ! « » " ( )`` is padded with a space
         on both sides so it stands apart from the neighbouring words.
      2. A single trailing ``.`` is dropped.
      3. Runs of whitespace are collapsed to one space and the result is
         stripped.

    Args:
        phrase: the text to normalize; may be empty.

    Returns:
        The normalized text (an empty string for empty/blank input).
    """
    for delim in u'?,!«»"()':
        phrase = phrase.replace(delim, ' ' + delim + ' ')

    # endswith() instead of phrase[-1]: indexing raised IndexError on ''.
    if phrase.endswith('.'):
        phrase = phrase[:-1]

    # Padding the delimiters produces doubled spaces ("a, b" -> "a ,  b").
    # The previous single-space-to-single-space replace() was a no-op, so
    # they were never collapsed; split/join removes every whitespace run
    # and also strips both ends.
    return ' '.join(phrase.split())
def load_data(dataset_path, embedder):
    """Load the intent dataset and vectorize every phrase with the embedder.

    The file is parsed line by line:
      * a line starting with ``##`` must contain ``intent:``; the text
        between the first and second ``:`` becomes the current intent;
      * a line starting with a single ``#`` is a comment and is skipped;
      * any other non-empty line is a sample phrase of the current intent
        (a leading ``-``, as used in RASA files, is removed first).

    Duplicate (phrase, intent) pairs are stored only once.

    Args:
        dataset_path: path to the UTF-8 encoded dataset file.
        embedder: callable that accepts a list of token lists and returns a
            2-D array with one row per token list.

    Returns:
        A tuple ``(X_data, y_data, label2index)``:
          * ``X_data`` - float32 array, one embedding row per phrase;
          * ``y_data`` - one-hot array, one row per phrase and one column
            per intent;
          * ``label2index`` - dict mapping intent name to its column in
            ``y_data``.

    Raises:
        RuntimeError: on a ``##`` header without ``intent:`` or when the
            file yields no samples at all.
        SystemExit: (status 1) when a phrase appears before any intent
            header.
    """
    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    samples = set()
    current_intent = None
    with io.open(dataset_path, 'r', encoding='utf-8') as rdr:
        for iline, line in enumerate(rdr):
            if line.startswith('##'):
                if 'intent:' not in line:
                    raise RuntimeError(
                        'line #{}: unsupported section header "{}"'.format(iline, line.strip()))
                current_intent = line.split(':')[1].strip()
                continue

            if line.startswith('#'):
                # Plain comments are skipped.
                continue

            line = line.strip()
            if line.startswith('-'):  # RASA files prefix each sample with "-"
                line = line[1:]
            if not line:
                continue

            if not current_intent:
                print('line #{}: Current intent is "None"!'.format(iline))
                # A malformed dataset is a failure: exit with a non-zero
                # status (this used to be exit(0)).
                raise SystemExit(1)

            phrase = prepare_phrase(line)
            if phrase:
                # Lines that normalize to nothing (e.g. "- .") carry no
                # tokens to embed, so they are not kept as samples.
                samples.add((phrase, current_intent))

    if not samples:
        raise RuntimeError('No samples found in "{}"'.format(dataset_path))

    # Plain lists replace the former row-by-row DataFrame.append, which was
    # quadratic and no longer exists in pandas 2.x.
    samples = list(samples)
    phrases = [phrase for (phrase, _) in samples]
    labels = [intent for (_, intent) in samples]

    # Sort the labels so the label -> column mapping is the same on every
    # run; iterating a set of strings gives no stable order.
    label2index = dict((label, i) for (i, label) in enumerate(sorted(set(labels))))

    y_data = np.zeros((len(phrases), len(label2index)))
    for i, label in enumerate(labels):
        y_data[i, label2index[label]] = 1

    # Embed one phrase up front just to learn the embedding width.
    x0 = embedder([tokenizer.tokenize(phrases[0])])
    xdim = x0.shape[1]

    X_data = np.zeros((len(phrases), xdim), dtype=np.float32)
    for i, phrase in tqdm.tqdm(enumerate(phrases), total=len(phrases), desc='ELMO vectorization'):
        X_data[i] = embedder([tokenizer.tokenize(phrase)])[0]

    return X_data, y_data, label2index
def scorer(estimator, X, y):
    """Return the weighted F1 score of ``estimator`` on ``(X, y)``.

    The reference class of every sample is taken as the argmax over the
    last axis of ``y``; it is compared with ``estimator.predict(X)``.
    Has the ``(estimator, X, y)`` signature expected for the ``scoring``
    argument of ``cross_val_score``.
    """
    predicted = estimator.predict(X)
    expected = np.argmax(y, axis=-1)
    return sklearn.metrics.f1_score(y_true=expected, y_pred=predicted, average='weighted')
def create_model(x_dim, nb_labels, model_params):
    """Build and compile the feed-forward classifier over phrase vectors.

    Architecture: input of width ``x_dim`` -> up to two optional hidden
    Dense layers (each followed by Dropout when the rate is positive) ->
    softmax Dense layer with ``nb_labels`` outputs.

    Args:
        x_dim: width of the input vector.
        nb_labels: number of output classes.
        model_params: dict with the keys ``optimizer``, ``units1``,
            ``units2``, ``activ1`` and ``dropout_rate``. A hidden layer is
            created only when its ``unitsN`` value is positive; both hidden
            layers use the ``activ1`` activation.

    Returns:
        The Keras ``Model`` compiled with categorical cross-entropy loss.
    """
    inputs = Input(shape=(x_dim,), dtype='float32', name='input')

    optimizer = model_params['optimizer']
    hidden_sizes = (model_params['units1'], model_params['units2'])
    activ1 = model_params['activ1']
    dropout_rate = model_params['dropout_rate']

    net = inputs
    for units in hidden_sizes:
        if units <= 0:
            # A non-positive size switches this hidden layer off.
            continue
        net = Dense(units=units, activation=activ1)(net)
        if dropout_rate > 0.0:
            net = Dropout(rate=dropout_rate)(net)

    net = Dense(units=nb_labels, activation='softmax')(net)

    model = Model(inputs=[inputs], outputs=net)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
if __name__ == '__main__':
    # Script entry point: parse the command line, build the ELMo embedder,
    # load the dataset and (in 'gridsearch' mode) cross-validate every
    # hyperparameter combination, reporting the best one.

    # Imported here because only the entry point needs it.
    import itertools

    parser = argparse.ArgumentParser()
    parser.add_argument('--run_mode', type=str, default='gridsearch', choices='train query gridsearch'.split())
    parser.add_argument('--tmp', type=str, default='../../../tmp')
    parser.add_argument('--dataset', default='../../../data/intents.txt')
    # The ELMo model location used to be hard-coded; the old value is kept
    # as the default so existing invocations behave the same.
    # Public alternatives:
    #   http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps
    #   http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-wiki_600k_steps.tar.gz
    parser.add_argument('--elmo_model', type=str,
                        default='/mnt/7383b08e-ace3-49d3-8991-5b9aa07d2596/EmbeddingModels/deeppavlov_data/elmo_model',
                        help='path or URL of the ELMo model passed to ELMoEmbedder')
    args = parser.parse_args()

    tmp_dir = args.tmp
    run_mode = args.run_mode
    dataset_path = args.dataset

    # NOTE(review): these names keep the "bert" stem, presumably inherited
    # from the script this one was derived from; they are left unchanged so
    # that any existing artifacts keep resolving. None of them is used by
    # the 'gridsearch' branch below.
    weights_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.weights'))
    arch_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.arch'))
    config_file = os.path.abspath(os.path.join(tmp_dir, 'intent_classifier_bert.config'))

    max_seq_len = 40
    num_features = 1024

    model_fn = args.elmo_model
    embedder = ELMoEmbedder(model_fn, dim=num_features)

    if run_mode in ('gridsearch', 'train'):
        X_data, y_data, label2index = load_data(dataset_path, embedder)

    if run_mode == 'gridsearch':
        best_params = None
        best_score = 0.0

        # itertools.product walks the grid in exactly the order the former
        # nested loops did (rightmost factor varies fastest).
        param_grid = itertools.product(
            [10, 20, 40],      # epochs
            [50, 100, 200],    # batch_size
            ['nadam'],         # optimizer ('rmsprop' and 'adam' were tried before)
            [200, 390, 500],   # units1
            [0],               # units2 (0 disables the second hidden layer)
            ['sigmoid'],       # activ1
            [0.0, 0.1],        # dropout_rate
        )

        for epochs, batch_size, optimizer, units1, units2, activ1, dropout_rate in param_grid:
            # Arguments forwarded by KerasClassifier to the Keras fit call.
            sk_params = {'epochs': epochs, 'batch_size': batch_size, 'verbose': 0}

            # Full description of the candidate: fit settings + architecture.
            model_params = sk_params.copy()
            model_params['optimizer'] = optimizer
            model_params['units1'] = units1
            model_params['units2'] = units2
            model_params['activ1'] = activ1
            model_params['dropout_rate'] = dropout_rate

            # The lambda is only invoked inside cross_val_score below, i.e.
            # within this iteration, so it sees this candidate's model_params.
            estimator = KerasClassifier(build_fn=lambda: create_model(X_data.shape[1],
                                                                      len(label2index),
                                                                      model_params), **sk_params)
            cv_res = cross_val_score(estimator, X_data, y_data,
                                     scoring=scorer, cv=NFOLD, n_jobs=1,
                                     verbose=1)
            cv_score = np.mean(cv_res)
            print('{} ==> cv score={}'.format(get_params_str(model_params), cv_score))

            if cv_score > best_score:
                print('!!! NEW BEST !!! score={} for {}'.format(cv_score, get_params_str(model_params)))
                best_score = cv_score
                best_params = model_params
            else:
                print('No improvement over current best_score={}'.format(best_score))

        if best_params is None:
            # No candidate beat the initial 0.0; formatting None used to
            # crash here with AttributeError.
            print('No parameter combination scored above {}'.format(best_score))
        else:
            print('best_score={} params={}'.format(best_score, get_params_str(best_params)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment