# TODO: the randomizer still needs to be reworked; it spoils a lot of the output.
import re
from random import uniform
from collections import defaultdict
from pymongo import MongoClient, ASCENDING
# Tokens that generate_sentence appends without a leading space; '$' is the
# sentence-boundary marker produced by gen_trigrams.
sent_end = '.!?,;:$'
# NOTE(review): not referenced anywhere in the visible code.
comma = ',;:'
# A token is either a run of Latin/Cyrillic letters, digits and hyphens, or a
# run of punctuation marks.
r_alphabet = re.compile(u'[a-zA-Zа-яёА-ЯЁ0-9-]+|[.,:;?!]+')
# The original assignment opened a triple-quoted string that was never closed,
# which turned the rest of the module into string content.
__doc__ = """Trigram-based sentence generator that keeps its statistics in MongoDB."""
class model(object):
    """Trigram store backed by the ``gramms`` MongoDB collection.

    Each document has the shape
    ``{'1': t0, '2': t1, 'sib': [{'3': t2, 'w': weight}, ...]}``: the bigram
    ``(t0, t1)`` together with every token recorded after it and that token's
    weight.

    NOTE(review): ``ensure_index`` and ``update`` belong to the legacy PyMongo
    collection API; PyMongo 4 removed them in favour of ``create_index`` and
    ``update_one`` -- confirm the installed driver version before upgrading.
    """

    def __init__(self, host='localhost', port=27017, db_name='sentences'):
        """Connect to MongoDB and make sure the bigram index exists.

        Args:
            host: MongoDB host name.
            port: MongoDB port.
            db_name: name of the database holding the ``gramms`` collection.
        """
        db = MongoClient(host, port)[db_name]
        self.gramms = db['gramms']
        # At most one index is presumably just the default ``_id`` index, so
        # the unique compound index on the bigram has not been created yet.
        if len(self.gramms.index_information()) <= 1:
            self.gramms.ensure_index(
                [('1', ASCENDING), ('2', ASCENDING)], unique=True)

    def init_sub_model(self, words):
        """Build an in-memory transition table restricted to ``words``.

        Every stored bigram that mentions one of ``words`` (as first token,
        second token or successor) is loaded. A successor is kept for a
        bigram only when the state it leads to was loaded as well.

        Args:
            words: iterable of tokens to look up in the collection.

        Returns:
            dict mapping ``(t0, t1)`` to a list of
            ``{'3': token, 'w': weight}`` successor records.

        Raises:
            Exception: if the start state ``('$', '$')`` was not loaded or no
                loaded successor is a ``sent_end`` token.
        """
        sub_model = {}
        for word in words:
            query = {'$or': [{'1': word}, {'2': word}, {'sib.3': word}]}
            for el in self.gramms.find(query):
                sub_model[(el['1'], el['2'])] = el['sib']

        res_sub_model = {}
        is_good_data = False
        for key, successors in sub_model.items():
            for sib_el in successors:
                following = key[1], sib_el['3']
                if following in sub_model:
                    # The original only assigned when the key was already
                    # present, so nothing was ever stored; collect every
                    # usable successor instead.
                    res_sub_model.setdefault(key, []).append(sib_el)
                # NOTE(review): the source lost its indentation; this check is
                # assumed to run for every successor, not only for the ones
                # kept above -- confirm.
                if sib_el['3'] in sent_end and not is_good_data:
                    is_good_data = True
                    # Give the first sentence-ending token found a
                    # transition to the '$' marker.
                    res_sub_model[sib_el['3'], '$'] = [{'3': '$', 'w': 1}]

        if ('$', '$') not in sub_model or not is_good_data:
            raise Exception('bad train data for this words')
        return res_sub_model

    def get_seq(self, t0, t1):
        """Return the successor records stored for bigram ``(t0, t1)``.

        Returns:
            The document's ``sib`` list, or ``None`` when the bigram is not
            stored.
        """
        res = self.gramms.find_one({'1': t0, '2': t1})
        if res:
            return res['sib']
        return None

    def add_to_seq(self, t0, t1, token, weight):
        """Record ``token`` with ``weight`` as a successor of ``(t0, t1)``.

        A single upsert appends to the bigram's ``sib`` list and creates the
        document when it does not exist yet. This replaces the original
        find-then-write sequence, whose insert branch was syntactically
        broken.
        """
        self.gramms.update(
            {'1': t0, '2': t1},
            {'$push': {'sib': {'3': token, 'w': weight}}},
            upsert=True)
def gen_lines(corpus):
    """Yield the lines of a UTF-8 text file, lower-cased.

    Args:
        corpus: path of the text file to read.

    Yields:
        Each line of the file as a lower-cased unicode string (the line
        terminator is kept).
    """
    import io  # local import: io.open decodes the same way on Python 2 and 3

    # The context manager closes the file once the generator is exhausted or
    # discarded; the original never closed the handle.
    with io.open(corpus, encoding='utf-8') as data:
        for line in data:
            yield line.lower()
def gen_tokens(lines):
    """Yield every token that ``r_alphabet`` matches, line by line.

    Args:
        lines: iterable of text lines.

    Yields:
        The matched substrings, in the order they occur.
    """
    for text in lines:
        for match in r_alphabet.finditer(text):
            yield match.group()
def gen_trigrams(tokens):
    """Yield ``(t0, t1, t2)`` trigrams over a token stream.

    ``'$'`` marks a sentence boundary: every sentence starts from the state
    ``('$', '$')``, and a terminator token additionally emits the two closing
    trigrams ``(t1, t2, '$')`` and ``(t2, '$', '$')``.

    Args:
        tokens: iterable of token strings.

    Yields:
        3-tuples of consecutive tokens.
    """
    t0, t1 = '$', '$'
    for t2 in tokens:
        yield t0, t1, t2
        if t2 in '.!?':
            yield t1, t2, '$'
            yield t2, '$', '$'
            t0, t1 = '$', '$'
        else:
            # Slide the window only inside a sentence. Without this ``else``
            # the reset above was overwritten immediately and the next
            # sentence started from ('$', <terminator>).
            t0, t1 = t1, t2
def train(corpus):
    """Count the trigrams of a corpus and store their weights in a ``model``.

    The weight stored for a trigram is its count divided by the count of its
    leading bigram.

    Args:
        corpus: path of the training text, passed to ``gen_lines``.

    Returns:
        The ``model`` instance the weights were written to.
    """
    bigram_counts = defaultdict(float)
    trigram_counts = defaultdict(float)
    for first, second, third in gen_trigrams(gen_tokens(gen_lines(corpus))):
        bigram_counts[first, second] += 1
        trigram_counts[first, second, third] += 1

    storage = model()
    for trigram, count in trigram_counts.items():
        first, second, third = trigram
        share = count / bigram_counts[first, second]
        storage.add_to_seq(first, second, third, share)
    return storage
def generate_sentence(model):
    """Generate one phrase by walking the transition table from ``('$', '$')``.

    Args:
        model: mapping ``(t0, t1) -> successor records`` in the form
            ``unirand`` accepts.

    Returns:
        The generated phrase, passed through ``str.capitalize``.
    """
    phrase = ''
    t0, t1 = '$', '$'
    while True:  # each iteration draws the next token into t1
        # NOTE(review): the body of this guard was lost in the source; ending
        # the phrase is assumed. Looking the missing state up, as the
        # remaining text did, raises KeyError.
        if (t0, t1) not in model:
            break
        t0, t1 = t1, unirand(model[t0, t1])
        # '$' ends the sentence; None means unirand had nothing to choose.
        if t1 is None or t1 == '$':
            break
        if t1 in sent_end or t0 == '$':
            # Punctuation and the first word are appended without a space.
            phrase += t1
        else:
            phrase += ' ' + t1
    print(phrase)  # kept from the original: echoes the raw phrase
    return phrase.capitalize()
def unirand(seq):
    """Pick a successor token at random, in proportion to its weight.

    Args:
        seq: iterable of ``{'3': token, 'w': weight}`` records.

    Returns:
        The chosen token, or ``None`` when ``seq`` is empty.
    """
    items = list(seq)  # materialise: the records are traversed twice
    if not items:
        return None
    total = sum(el['w'] for el in items)
    rnd = uniform(0, total)
    cumulative = 0
    for el in items:
        cumulative += el['w']
        if rnd < cumulative:
            return el['3']
    # The draw can equal the total (all-zero weights, float rounding); fall
    # back to the last record instead of returning None.
    return items[-1]['3']
if __name__ == '__main__':
    # The collection must already be populated; to (re)build it from a corpus
    # call train(), e.g. train('../data/data.txt').
    # The instance gets its own name: rebinding ``model`` would hide the class
    # from any later model() call, such as the one inside train().
    storage = model()
    words = [u'любовь', u'деньги', u'я', u'шеф']
    sub_model = storage.init_sub_model(words)
    print(generate_sentence(sub_model))
