Created
April 11, 2018 06:21
-
-
Save Koziev/3db175f51010c1a7753d642c15aa0986 to your computer and use it in GitHub Desktop.
Классификатор на базе XGBoost для генерации форм деепричастия из инфинитивных форм
We can make this file beautiful and searchable if this error is corrected: No tabs found in this TSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Генерация словарных статей для деепричастий по статьям инфинитивов. | |
""" | |
from __future__ import division  # for python2 compatibility
from __future__ import print_function | |
import codecs | |
import collections | |
import itertools | |
import os | |
import random | |
import glob | |
import re | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
import gc | |
import xgboost | |
import scipy.sparse | |
# Load the dataset of known (adverbial participle, infinitive) pairs.
# A model trained on these pairs is later used to build adverbial
# participles for new infinitives.
samples = []  # (participle, infinitive, aspect, transducer) tuples
max_inf_len = 0  # length of the longest accepted infinitive
all_chars = set()  # characters occurring in the accepted infinitives
all_transducers = collections.Counter()  # transducer -> number of pairs
# NOTE(review): this set contains the vowel u'о' and lacks u'й', which looks
# like an alphabetical-run typo. It is kept unchanged because editing it
# would alter the extracted rules - confirm the intent.
consonants = set(u'бвгджзклмнопрстфхцчшщ')

with codecs.open('existing_adverbial_participles_and_infinitives.tsv', 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) holds no pair and
            # would otherwise fail with IndexError below.
            continue

        tx = tuple(line.split(u'\t'))
        ap = tx[0]  # adverbial participle
        inf = tx[1]  # infinitive
        aspect = tx[2]

        # Length of the common prefix of the two word forms.
        for i, (c1, c2) in enumerate(zip(ap, inf)):
            if c1 != c2:
                common_prefix_len = i
                break
        else:
            # No mismatch was found: one form is a prefix of the other,
            # so the common prefix is the whole shorter form.
            common_prefix_len = min(len(ap), len(inf))

        # The ending that gets cut off must start with a consonant, so move
        # the split point left until the character before it is a consonant.
        for j in range(common_prefix_len, 1, -1):
            if ap[j-1] in consonants:
                common_prefix_len = j
                break

        if common_prefix_len > 0:
            old_ending = inf[common_prefix_len-1:]
            new_ending = ap[common_prefix_len-1:]
            # A transducer says: drop this many trailing characters of the
            # infinitive, then append the new ending.
            transducer = (len(old_ending), new_ending)
            all_transducers[transducer] += 1
            samples.append((ap, inf, aspect, transducer))
            if len(samples)<10:
                # Show the first few accepted pairs for a quick sanity check.
                print(u'inf={}\tap={}\tprefix={}'.format(inf, ap, inf[:common_prefix_len]))

            max_inf_len = max(max_inf_len, len(inf))
            all_chars.update(inf)

# The space is re-inserted explicitly at index 0 below; discard() (unlike
# remove()) does not fail when no infinitive contains a space.
all_chars.discard(u' ')
# Sorting makes the character -> column mapping reproducible between runs
# (set iteration order of strings is not stable across processes).
char2index = {c: i for (i, c) in enumerate(itertools.chain([u' '], sorted(all_chars)))}
nb_chars = len(char2index)
# Print the 20 most frequent transducers with their share of the dataset
# and the cumulative share, both in percent.
total_freq = sum(all_transducers.values())
cumul_freq = 0
print('rule\t\tshare\tcumulative')
for rule, freq in all_transducers.most_common(20):
    cumul_freq += freq
    print(u'{:10}\t\t{:5.2f}\t{:5.2f}'.format(str(rule[0])+u':'+rule[1], 100.0*float(freq)/total_freq, 100.0*float(cumul_freq)/total_freq))

print('{} samples in dataset'.format(len(samples)))

# Only the 30 most frequent transducers become target classes.
transducer2id = {t: i for (i, (t, _)) in enumerate(all_transducers.most_common(30))}
# items() works on both Python 2 and 3; iteritems() exists on Python 2 only.
index2transducer = {i: t for (t, i) in transducer2id.items()}
nb_transducers = len(transducer2id)
print('{} transducers'.format(nb_transducers))

# Drop the samples whose transducer did not make it into the class list.
samples = [z for z in samples if z[3] in transducer2id]
nb_samples = len(samples)
# Load the (infinitive, aspect) records the trained model will be applied to.
samples2 = []
with codecs.open('adverbial_participles.tsv', 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        if not line:
            # A blank line holds no record and would otherwise fail with
            # IndexError on tx[1].
            continue

        tx = line.split(u'\t')
        inf = tx[0]
        aspect = tx[1]
        samples2.append((inf, aspect))
        # The feature vector must be wide enough for these infinitives too.
        max_inf_len = max(max_inf_len, len(inf))
# Vectorize the training samples.
# Row layout: one-hot characters of the reversed infinitive, then one-hot
# characters of the infinitive in natural order, then a single aspect flag.
input_dim = max_inf_len*nb_chars*2 + 1
# The builtin bool is used because the np.bool alias was removed from NumPy.
X_data = np.zeros((nb_samples, input_dim), dtype=bool)
y_data = np.zeros((nb_samples), dtype=np.int32)

forward_offset = nb_chars*max_inf_len  # first column of the forward part
aspect_column = nb_chars*max_inf_len*2  # the last column of a row

for isample, (_ap, inf, aspect, transducer) in enumerate(samples):
    # The target class is the index of the transducer.
    y_data[isample] = transducer2id[transducer]

    # The infinitive read from its end to its start.
    for i, c in enumerate(inf[::-1]):
        X_data[isample, i*nb_chars + char2index[c]] = True

    # The infinitive read from its start to its end.
    for i, c in enumerate(inf):
        X_data[isample, forward_offset + i*nb_chars + char2index[c]] = True

    # Aspect flag: set when the aspect field equals this literal.
    X_data[isample, aspect_column] = aspect == u'СОВЕРШ'
# Split the vectorized data into training and validation parts (70/30).
X_train, X_val, y_train, y_val = train_test_split(
    X_data, y_data, test_size=0.3, random_state=123456)

# Train the classifier.
print("Prepare DMatrix'es...")
D_train = xgboost.DMatrix(X_train, y_train)
D_val = xgboost.DMatrix(X_val, y_val)
gc.collect()

# NOTE(review): newer XGBoost releases replaced 'silent' with 'verbosity' -
# confirm against the installed version.
xgb_params = {
    'eta': 0.2,
    'max_depth': 5,
    'subsample': 0.85,
    'min_child_weight': 3,
    'gamma': 0.1,
    'colsample_bytree': 1.00,
    'colsample_bylevel': 1.00,
    'objective': 'multi:softprob',
    'eval_metric': ['merror', 'mlogloss'],
    'num_class': nb_transducers,  # one class per retained transducer
    'seed': 123456,
    'silent': True,
}

watchlist = [(D_train, 'train'), (D_val, 'valid')]

# When enabled, the number of boosting rounds is picked by cross-validation
# with early stopping; otherwise a fixed number of rounds is used.
use_cv = True
if not use_cv:
    nbrounds = 320
else:
    print('Compute best number of estimators using xgboost.cv')
    cvres = xgboost.cv(
        xgb_params,
        D_train,
        num_boost_round=1000,
        nfold=5,
        early_stopping_rounds=10,
        metrics=['merror'],
        seed=123456,
        verbose_eval=10,
    )
    # The number of rows in the CV result is taken as the round count.
    nbrounds = cvres.shape[0]
    print('CV finished, nbrounds={}'.format(nbrounds))

print('Start training...')
model = xgboost.train(
    params=xgb_params,
    dtrain=D_train,
    num_boost_round=nbrounds,
    evals=watchlist,
    verbose_eval=10,
)
# Apply the trained classifier to build adverbial participles for the new
# infinitives, using the same feature layout as for training.
nb_samples = len(samples2)
# The builtin bool is used because the np.bool alias was removed from NumPy.
X_data = np.zeros((nb_samples, input_dim), dtype=bool)

forward_offset = nb_chars*max_inf_len  # first column of the forward part
aspect_column = nb_chars*max_inf_len*2  # the last column of a row

for isample, (inf, aspect) in enumerate(samples2):
    # The infinitive read from its end to its start.
    for i, c in enumerate(inf[::-1]):
        char_index = char2index.get(c)
        # A character absent from the training alphabet has no column; it is
        # left unset instead of aborting the run with KeyError after training.
        if char_index is not None:
            X_data[isample, i*nb_chars + char_index] = True

    # The infinitive read from its start to its end.
    for i, c in enumerate(inf):
        char_index = char2index.get(c)
        if char_index is not None:
            X_data[isample, forward_offset + i*nb_chars + char_index] = True

    # Aspect flag: set when the aspect field equals this literal.
    X_data[isample, aspect_column] = aspect == u'СОВЕРШ'

D_probe = xgboost.DMatrix(X_data)
y_probe = model.predict(D_probe)

with codecs.open('rus_new_adverbial_participles.sol', 'w', 'utf-8') as wrt:
    for (inf, aspect), y in zip(samples2, y_probe):
        # Take the most probable class and apply its transducer: cut the old
        # ending off the infinitive and append the new one.
        transducer = index2transducer[np.argmax(y)]
        new_entry = inf[0:len(inf)-transducer[0]] + transducer[1]
        wrt.write(u'дп( {}, {}, {} )\n'.format(new_entry, aspect, inf))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment