Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Koziev/3db175f51010c1a7753d642c15aa0986 to your computer and use it in GitHub Desktop.
Save Koziev/3db175f51010c1a7753d642c15aa0986 to your computer and use it in GitHub Desktop.
Классификатор на базе XGBoost для генерации форм деепричастия из инфинитивных форм
We can make this file beautiful and searchable if this error is corrected: No tabs found in this TSV file in line 0.
# -*- coding: utf-8 -*-
"""
Генерация словарных статей для деепричастий по статьям инфинитивов.
"""
from __future__ import division # for python2 compatability
from __future__ import print_function
import codecs
import collections
import itertools
import os
import random
import glob
import re
import numpy as np
from sklearn.model_selection import train_test_split
import gc
import xgboost
import scipy.sparse
# Load the dataset of known (adverbial participle, infinitive) pairs.
# The model trained on it is later used to build adverbial participles
# for infinitives that are not in this dataset.
#
# Each pair is reduced to a "transducer" rule
#     (number of trailing infinitive characters to cut, ending to append)
# and the rule becomes the class label of the sample.
samples = []
max_inf_len = 0
all_chars = set()
all_transducers = collections.Counter()

# Russian consonant letters. The vowel u'о' used to be listed here as well
# (it sits inside the alphabet run "клмнопрст"), which contradicted the
# "ending must start on a consonant" rule applied below.
consonants = set(u'бвгджзклмнпрстфхцчшщ')

with codecs.open('existing_adverbial_participles_and_infinitives.tsv', 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        tx = tuple(line.split(u'\t'))
        ap = tx[0]  # adverbial participle
        inf = tx[1]  # infinitive
        aspect = tx[2]

        # Length of the common prefix of the two word forms. When one form
        # is a prefix of the other no mismatch is ever seen, so start from
        # the length of the shorter one instead of 0.
        common_prefix_len = min(len(ap), len(inf))
        for i, (c1, c2) in enumerate(zip(ap, inf)):
            if c1 != c2:
                common_prefix_len = i
                break

        # The ending that gets cut off must start on a consonant: move the
        # split point left until the character just before it is one.
        for j in range(common_prefix_len, 1, -1):
            if ap[j - 1] in consonants:
                common_prefix_len = j
                break

        if common_prefix_len > 0:
            # Both endings start one character before the split point, so
            # they include the consonant found above.
            old_ending = inf[common_prefix_len - 1:]
            new_ending = ap[common_prefix_len - 1:]
            transducer = (len(old_ending), new_ending)
            all_transducers[transducer] += 1
            samples.append((ap, inf, aspect, transducer))

            # Show the first few parsed pairs as a sanity check.
            if len(samples) < 10:
                print(u'inf={}\tap={}\tprefix={}'.format(inf, ap, inf[:common_prefix_len]))

        max_inf_len = max(max_inf_len, len(inf))
        all_chars.update(inf)

# The space gets a dedicated index later on; discard() does not fail when
# no infinitive contains a space (remove() raised KeyError in that case).
all_chars.discard(u' ')
# Character -> feature index. Index 0 is reserved for the space; the other
# characters are sorted so that the feature layout is the same on every run
# (iteration order of a set is not guaranteed to be stable between runs).
char2index = {c: i for i, c in enumerate(itertools.chain([u' '], sorted(all_chars)))}
nb_chars = len(char2index)

# Report the most frequent rules with their share and cumulative share (in %).
total_freq = sum(all_transducers.values())
cumul_freq = 0
print('rule\t\tshare\tcumulative')
for rule, freq in all_transducers.most_common(20):
    cumul_freq += freq
    rule_label = str(rule[0]) + u':' + rule[1]
    print(u'{:10}\t\t{:5.2f}\t{:5.2f}'.format(rule_label,
                                            100.0 * float(freq) / total_freq,
                                            100.0 * float(cumul_freq) / total_freq))

print('{} samples in dataset'.format(len(samples)))

# Only the 30 most frequent rules become target classes.
transducer2id = {t: i for i, (t, _) in enumerate(all_transducers.most_common(30))}
# .items() instead of .iteritems(): the latter does not exist on Python 3.
index2transducer = {i: t for t, i in transducer2id.items()}
nb_transducers = len(transducer2id)
print('{} transducers'.format(nb_transducers))

# Drop the samples whose rule did not make it into the class list.
samples = [sample for sample in samples if sample[3] in transducer2id]
nb_samples = len(samples)
# Load the (infinitive, aspect) pairs for which adverbial participles have
# to be generated. Their lengths also count towards max_inf_len so that the
# feature matrix is wide enough for them.
samples2 = []
with codecs.open('adverbial_participles.tsv', 'r', 'utf-8') as rdr:
    for line in rdr:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file);
            # they used to fail with IndexError on tx[1].
            continue

        tx = line.split(u'\t')
        inf = tx[0]
        aspect = tx[1]
        samples2.append((inf, aspect))
        max_inf_len = max(max_inf_len, len(inf))
# Vectorize the training samples.
#
# Feature layout of one row (all features are boolean):
#   [0, max_inf_len*nb_chars)                  one-hot characters of the
#                                              infinitive read right-to-left
#   [max_inf_len*nb_chars, 2*max_inf_len*nb_chars)
#                                              one-hot characters of the
#                                              infinitive read left-to-right
#   2*max_inf_len*nb_chars                     aspect flag
input_dim = max_inf_len*nb_chars*2 + 1
# np.bool_ instead of np.bool: the np.bool alias was removed in NumPy 1.24.
X_data = np.zeros((nb_samples, input_dim), dtype=np.bool_)
y_data = np.zeros((nb_samples), dtype=np.int32)

forward_offset = nb_chars * max_inf_len  # start of the left-to-right part
aspect_pos = forward_offset * 2          # position of the aspect flag

for isample, (_, inf, aspect, transducer) in enumerate(samples):
    # target class is the id of the transducer rule
    y_data[isample] = transducer2id[transducer]

    # infinitive from its end towards its start
    for i, c in enumerate(reversed(inf)):
        X_data[isample, i*nb_chars + char2index[c]] = True

    # infinitive from its start towards its end
    for i, c in enumerate(inf):
        X_data[isample, forward_offset + i*nb_chars + char2index[c]] = True

    # aspect flag: set when the aspect field equals u'СОВЕРШ'
    X_data[isample, aspect_pos] = aspect == u'СОВЕРШ'
# Split the vectorized data into training and validation subsets.
X_train, X_val, y_train, y_val = train_test_split(
    X_data, y_data, test_size=0.3, random_state=123456)

# Train the classifier.
print("Prepare DMatrix'es...")
D_train = xgboost.DMatrix(X_train, y_train)
D_val = xgboost.DMatrix(X_val, y_val)
gc.collect()

# Multi-class booster: one class per transducer rule, class probabilities
# as output.
xgb_params = {
    'eta': 0.2,
    'max_depth': 5,
    'subsample': 0.85,
    'min_child_weight': 3,
    'gamma': 0.1,
    'colsample_bytree': 1.00,
    'colsample_bylevel': 1.00,
    'objective': 'multi:softprob',
    'eval_metric': ['merror', 'mlogloss'],
    'num_class': nb_transducers,
    'seed': 123456,
    'silent': True,
}

watchlist = [(D_train, 'train'), (D_val, 'valid')]

# The number of boosting rounds is taken from cross-validation with early
# stopping; set the flag to False to use the fixed value instead.
use_cv = True
if use_cv:
    print('Compute best number of estimators using xgboost.cv')
    cvres = xgboost.cv(xgb_params,
                       D_train,
                       num_boost_round=1000,
                       nfold=5,
                       early_stopping_rounds=10,
                       metrics=['merror'],
                       seed=123456,
                       verbose_eval=10)
    # one row of the CV result per boosting round that was kept
    nbrounds = cvres.shape[0]
    print('CV finished, nbrounds={}'.format(nbrounds))
else:
    nbrounds = 320

print('Start training...')
model = xgboost.train(params=xgb_params,
                      dtrain=D_train,
                      num_boost_round=nbrounds,
                      evals=watchlist,
                      verbose_eval=10)
# Apply the trained classifier to the new infinitives and write out the
# generated adverbial participles.
nb_samples = len(samples2)
# np.bool_ instead of np.bool: the np.bool alias was removed in NumPy 1.24.
X_data = np.zeros((nb_samples, input_dim), dtype=np.bool_)

forward_offset = nb_chars * max_inf_len  # start of the left-to-right part
aspect_pos = forward_offset * 2          # position of the aspect flag

for isample, (inf, aspect) in enumerate(samples2):
    # Characters that never occurred in the training infinitives have no
    # index; their one-hot slot is left empty instead of failing with
    # KeyError.

    # infinitive from its end towards its start
    for i, c in enumerate(reversed(inf)):
        ichar = char2index.get(c)
        if ichar is not None:
            X_data[isample, i*nb_chars + ichar] = True

    # infinitive from its start towards its end
    for i, c in enumerate(inf):
        ichar = char2index.get(c)
        if ichar is not None:
            X_data[isample, forward_offset + i*nb_chars + ichar] = True

    # aspect flag: set when the aspect field equals u'СОВЕРШ'
    X_data[isample, aspect_pos] = aspect == u'СОВЕРШ'

D_probe = xgboost.DMatrix(X_data)
y_probe = model.predict(D_probe)

with codecs.open('rus_new_adverbial_participles.sol', 'w', 'utf-8') as wrt:
    for (inf, aspect), y in zip(samples2, y_probe):
        # y holds one score per rule; apply the best-scoring rule.
        cut_len, new_ending = index2transducer[int(np.argmax(y))]
        # Clamp at 0: a rule that cuts more characters than the infinitive
        # has would otherwise produce a negative slice bound and keep a
        # wrong part of the word.
        stem = inf[:max(len(inf) - cut_len, 0)]
        new_entry = stem + new_ending
        wrt.write(u'дп( {}, {}, {} )\n'.format(new_entry, aspect, inf))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment