Классификатор на базе XGBoost для генерации форм деепричастия из инфинитивных форм
# -*- coding: utf-8 -*-
Генерация словарных статей для деепричастий по статьям инфинитивов.
from __future__ import division # for python2 compatibility
from __future__ import print_function
import codecs
import collections
import itertools
import os
import random
import glob
import re
import numpy as np
from sklearn.model_selection import train_test_split
import gc
import xgboost
import scipy.sparse
# Load the dataset of known (adverbial participle, infinitive) pairs.
# The model that derives adverbial participles for new infinitives is
# trained on these pairs.
#
# Each input line holds three tab-separated fields:
#   adverbial participle, infinitive, aspect.
# For every pair a "transducer" is derived: a tuple
#   (number of trailing characters to cut from the infinitive, new ending)
# which later serves as the classification target.
samples = []
max_inf_len = 0
all_chars = set()
all_transducers = collections.Counter()
# NOTE(review): this literal contains the vowel u'о' and omits u'й' - it looks
# like a run of the alphabet was typed in; left unchanged, confirm the intent.
consonants = set(u'бвгджзклмнопрстфхцчшщ')

with codecs.open('existing_adverbial_participles_and_infinitives.tsv', 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing with IndexError below.
            continue

        tx = tuple(line.split(u'\t'))
        ap = tx[0]  # adverbial participle
        inf = tx[1]  # infinitive
        aspect = tx[2]

        # Length of the common prefix = index of the FIRST mismatch.
        # Stays 0 when the words do not differ within the shorter of the two;
        # such pairs are skipped below.
        common_prefix_len = 0
        for i, (c1, c2) in enumerate(zip(ap, inf)):
            if c1 != c2:
                common_prefix_len = i
                break

        # The ending that gets cut off must start with a consonant: shrink the
        # prefix to the NEAREST position whose last character is a consonant.
        for j in range(common_prefix_len, 1, -1):
            if ap[j-1] in consonants:
                common_prefix_len = j
                break

        if common_prefix_len > 0:
            # Both endings start at the last character of the prefix.
            old_ending = inf[common_prefix_len-1:]
            new_ending = ap[common_prefix_len-1:]
            transducer = (len(old_ending), new_ending)
            all_transducers[transducer] += 1
            samples.append((ap, inf, aspect, transducer))
            if len(samples) < 10:
                # Show the first few samples as a sanity check.
                print(u'inf={}\tap={}\tprefix={}'.format(inf, ap, inf[:common_prefix_len]))

            max_inf_len = max(max_inf_len, len(inf))
            # Collect the alphabet of the infinitives; the character index
            # used for vectorization is built from this set.
            all_chars.update(inf)

# The space is given a fixed index when the character index is built, so it is
# dropped from the set here. discard() does not fail when no infinitive
# contains a space.
all_chars.discard(u' ')
# Character -> feature index. The space always gets index 0; the remaining
# characters follow in sorted order so that the feature layout is the same
# from run to run (plain set iteration order is not stable for strings).
char2index = {c: i for i, c in enumerate(itertools.chain([u' '], sorted(all_chars)))}
nb_chars = len(char2index)

# Report the 20 most frequent transducers: rule, its share of all samples (%)
# and the cumulative share (%).
total_freq = sum(all_transducers.values())
cumul_freq = 0
for rule, freq in all_transducers.most_common(20):
    cumul_freq += freq
    print(u'{:10}\t\t{:5.2f}\t{:5.2f}'.format(str(rule[0])+u':'+rule[1], 100.0*float(freq)/total_freq, 100.0*float(cumul_freq)/total_freq))

print('{} samples in dataset'.format(len(samples)))

# Only the 30 most frequent transducers become target classes.
transducer2id = {t: i for i, (t, _freq) in enumerate(all_transducers.most_common(30))}
# items() works on both Python 2 and 3 (iteritems() exists only on Python 2).
index2transducer = {i: t for t, i in transducer2id.items()}
nb_transducers = len(transducer2id)
print('{} transducers'.format(nb_transducers))

# Drop the samples whose transducer did not make it into the class list.
samples = [s for s in samples if s[3] in transducer2id]
nb_samples = len(samples)
# Load the infinitives for which adverbial participles have to be generated.
# Each line holds two tab-separated fields: infinitive, aspect.
samples2 = []
with codecs.open('adverbial_participles.tsv', 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing with IndexError below.
            continue

        tx = line.split(u'\t')
        inf = tx[0]
        aspect = tx[1]
        samples2.append((inf, aspect))
        # The feature matrix must be wide enough for these infinitives too.
        max_inf_len = max(max_inf_len, len(inf))
# Vectorize the training samples.
# Feature layout of one row:
#   [0, nb_chars*max_inf_len)                  one-hot chars of the reversed infinitive
#   [nb_chars*max_inf_len, 2*nb_chars*max_inf_len)  one-hot chars of the infinitive
#   last column                                aspect flag
input_dim = max_inf_len*nb_chars*2 + 1
# np.bool_ is used because the np.bool alias was removed from recent NumPy.
X_data = np.zeros((nb_samples, input_dim), dtype=np.bool_)
y_data = np.zeros((nb_samples), dtype=np.int32)

forward_xpos = nb_chars*max_inf_len
aspect_xpos = nb_chars*max_inf_len*2

for isample, sample in enumerate(samples):
    inf = sample[1]
    aspect = sample[2]
    transducer = sample[3]

    # target class is the transducer id
    y_data[isample] = transducer2id[transducer]

    # infinitive read from its end towards its start
    for i, c in enumerate(inf[::-1]):
        X_data[isample, i*nb_chars + char2index[c]] = True

    # infinitive read from its start towards its end
    for i, c in enumerate(inf):
        X_data[isample, forward_xpos + i*nb_chars + char2index[c]] = True

    # aspect flag
    X_data[isample, aspect_xpos] = aspect == u'СОВЕРШ'
# Split into training and validation sets: 30% of the samples are held out,
# with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=123456
)

# Train the classifier: first wrap both parts into XGBoost DMatrix objects.
print("Prepare DMatrix'es...")
D_train = xgboost.DMatrix(X_train, label=y_train)
D_val = xgboost.DMatrix(X_val, label=y_val)
# Booster parameters; one class per transducer.
xgb_params = {
    'eta': 0.2,
    'max_depth': 5,
    'subsample': 0.85,
    'min_child_weight': 3,
    'gamma': 0.1,
    'colsample_bytree': 1.00,
    'colsample_bylevel': 1.00,
    'objective': 'multi:softprob',
    'eval_metric': ['merror', 'mlogloss'],
    'num_class': nb_transducers,
    'seed': 123456,
    'silent': True,
}

# Named datasets for evaluation during training.
watchlist = [(D_train, 'train'), (D_val, 'valid')]
# Choose the number of boosting rounds and train the final model.
# NOTE(review): this section is damaged as it stands - the indentation is
# missing and two statements are truncated (see the notes below). The missing
# arguments cannot be recovered from this file; restore them from the original
# script or version control before running.
if True:
print('Compute best number of estimators using')
# NOTE(review): incomplete statement - the right-hand side is missing.
# Presumably a cross-validation call whose result has one row per boosting
# round (nbrounds is read from cvres.shape[0] below) - TODO confirm.
cvres =,
# verbose=True,
# print_every_n=50,
# The number of rows in the CV result is taken as the number of rounds.
nbrounds = cvres.shape[0]
print('CV finished, nbrounds={}'.format(nbrounds))
# NOTE(review): as written this assignment replaces the value obtained from CV
# above; it may originally have been the fallback branch of the `if` - TODO confirm.
nbrounds = 320 # <-- 320
print('Start training...')
# NOTE(review): the argument list looks truncated - only params and
# verbose_eval are visible; D_train, nbrounds and watchlist are defined
# earlier in the file but are not passed here - TODO confirm.
model = xgboost.train( params=xgb_params,
verbose_eval=10 )
# Apply the trained classifier to new infinitives: vectorize them first,
# using the same feature layout as for the training samples
# (reversed infinitive, then forward infinitive, then the aspect flag).
nb_samples = len(samples2)
# np.bool_ is used because the np.bool alias was removed from recent NumPy.
X_data = np.zeros((nb_samples, input_dim), dtype=np.bool_)

forward_xpos = nb_chars*max_inf_len
aspect_xpos = nb_chars*max_inf_len*2

for isample, sample in enumerate(samples2):
    inf = sample[0]
    aspect = sample[1]

    # infinitive read from its end towards its start
    for i, c in enumerate(inf[::-1]):
        X_data[isample, i*nb_chars + char2index[c]] = True

    # infinitive read from its start towards its end
    for i, c in enumerate(inf):
        X_data[isample, forward_xpos + i*nb_chars + char2index[c]] = True

    # aspect flag
    X_data[isample, aspect_xpos] = aspect == u'СОВЕРШ'
# Predict a transducer for every new infinitive and write the generated
# dictionary entries.
D_probe = xgboost.DMatrix(X_data)
y_probe = model.predict(D_probe)

with codecs.open('rus_new_adverbial_participles.sol', 'w', 'utf-8') as wrt:
    for sample, y in zip(samples2, y_probe):
        inf = sample[0]
        aspect = sample[1]

        # The predicted class is the position of the highest score in the row.
        transducer = index2transducer[int(np.argmax(y))]

        # Cut the old ending and append the new one. The stem length is clamped
        # at 0: a negative slice bound would otherwise count from the end of
        # the string when the transducer cuts more characters than the
        # infinitive has.
        stem_len = max(0, len(inf) - transducer[0])
        new_entry = inf[:stem_len] + transducer[1]
        wrt.write(u'дп( {}, {}, {} )\n'.format(new_entry, aspect, inf))
