matsu7874/knj.py

## knj.py
import collections
import random
import bisect

# Sentinel tokens wrapped around the words of every input line:
# BOL is placed before the first word, EOL after the last one.
BOL = '\\^'
EOL = '\\$'

def _count_transitions(lines):
    """Index the words of *lines* and count adjacent word pairs.

    Every line is split on whitespace and wrapped in the BOL/EOL sentinels
    before its neighbouring pairs are counted.

    Returns a tuple ``(word_index_hash, index_word_hash, following,
    preceding)`` where ``word_index_hash`` maps each word to an integer ID
    (BOL is 0, EOL is 1), ``index_word_hash`` is the inverse lookup list,
    ``following[a][b]`` is how often ``b`` came directly after ``a`` and
    ``preceding[b][a]`` holds the same count keyed by the second word.
    """
    word_index_hash = {BOL: 0, EOL: 1}
    index_word_hash = [BOL, EOL]
    following = collections.defaultdict(lambda: collections.defaultdict(int))
    preceding = collections.defaultdict(lambda: collections.defaultdict(int))

    for line in lines:
        indices = []
        for word in [BOL] + line.split() + [EOL]:
            # Assign a unique ID to each word the first time it is seen.
            if word not in word_index_hash:
                word_index_hash[word] = len(index_word_hash)
                index_word_hash.append(word)
            indices.append(word_index_hash[word])

        for wi1, wi2 in zip(indices, indices[1:]):
            following[wi1][wi2] += 1
            preceding[wi2][wi1] += 1

    return word_index_hash, index_word_hash, following, preceding


def _sampling_tables(counts):
    """Turn nested pair counts into per-word cumulative sampling tables.

    Returns a dict mapping a word ID to ``(candidates, cumulative)`` where
    ``cumulative[i]`` is the running total of the counts of
    ``candidates[:i + 1]``.  Each row is scaled by its own total, so forward
    and backward tables are both proper distributions.
    """
    tables = {}
    for wi1, neighbours in counts.items():
        candidates = list(neighbours)
        cumulative = []
        running = 0
        for wi2 in candidates:
            running += neighbours[wi2]
            cumulative.append(running)
        tables[wi1] = (candidates, cumulative)
    return tables


def _random_walk(start, stop, tables):
    """Follow random transitions from *start* until *stop* is reached.

    Returns the list of visited word IDs, beginning with *start*.  The walk
    also ends if the current word has no outgoing transition, which can only
    happen for a sentinel that never occurred in the corpus.
    """
    path = [start]
    while path[-1] != stop:
        if path[-1] not in tables:
            break
        candidates, cumulative = tables[path[-1]]
        # Integer draw in [0, total): picks each candidate with probability
        # proportional to its count, without floating-point rounding.
        pick = random.randrange(cumulative[-1])
        path.append(candidates[bisect.bisect_right(cumulative, pick)])
    return path


def main(corpus_path='tweets.txt', samples=10):
    """Interactively generate sentences around a word typed by the user.

    Builds a word-bigram model from *corpus_path* (one whitespace-tokenised
    sentence per line, UTF-8).  For every word read from standard input that
    occurs in the corpus, prints *samples* random sentences that contain it;
    unknown words are answered with ``……``.  The loop ends when standard
    input reaches end-of-file.

    Args:
        corpus_path: Path of the tokenised corpus file.
        samples: Number of sentences printed per input word.
    """
    with open(corpus_path, 'r', encoding='utf-8') as fin:
        word_index_hash, index_word_hash, following, preceding = (
            _count_transitions(fin))

    word_word = _sampling_tables(following)
    reversed_word_word = _sampling_tables(preceding)
    bol_index = word_index_hash[BOL]
    eol_index = word_index_hash[EOL]

    while True:
        try:
            center = input('> ')
        except EOFError:
            # End of input: finish the prompt line and leave quietly.
            print()
            break
        if center not in word_index_hash:
            print('……')
            continue

        center_index = word_index_hash[center]
        for _ in range(samples):
            # Grow the sentence towards its end (until EOL).
            sentence = _random_walk(center_index, eol_index, word_word)
            # Grow the sentence towards its beginning (until BOL).
            reversed_sentence = _random_walk(
                center_index, bol_index, reversed_word_word)

            # Drop the leading BOL; the centre word comes from this half.
            head = [index_word_hash[wi] for wi in reversed(reversed_sentence)][1:]
            # Drop the centre word (already printed) and the trailing EOL.
            tail = [index_word_hash[wi] for wi in sentence[1:-1]]
            print(''.join(head) + ''.join(tail))

# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

## preprocess.py
import json
import os

import janome.tokenizer

USER_ID = 1  # ID of the account to analyse


def main():
    """Tokenise the account's own tweets and write them to ``tweets.txt``.

    Every file in ``tweets/`` is loaded as a JSON array of tweet objects.
    Tweets are kept only when they were posted by ``USER_ID`` and are not a
    retweet, not a reply and contain no ``@``.  Each kept tweet becomes one
    output line with its tokens separated by single spaces.
    """
    tokenizer = janome.tokenizer.Tokenizer(wakati=True)

    # ``with`` guarantees the output is flushed and closed even on errors.
    with open('tweets.txt', 'w', encoding='utf-8') as fout:
        # os.listdir() returns names in arbitrary order; sort for
        # reproducible output.
        for tweet_file in sorted(os.listdir('tweets/')):
            # Explicit encoding: the platform default may not be UTF-8.
            with open(os.path.join('tweets', tweet_file), 'r',
                      encoding='utf-8') as fin:
                print(tweet_file)
                tweets = json.load(fin)

            for tweet in tweets:
                if (tweet['user']['id'] != USER_ID
                        or 'retweeted_status' in tweet
                        # Test the value, not the key: payloads that carry
                        # the key with a null value are not replies.
                        or tweet.get('in_reply_to_screen_name')
                        or '@' in tweet['text']):
                    continue
                fout.write(' '.join(tokenizer.tokenize(tweet['text'])) + '\n')


# Call main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
	import collections
	import random
	import bisect

	# Sentinel tokens wrapped around the words of every input line:
	# BOL is placed before the first word, EOL after the last one.
	BOL = '\\^'
	EOL = '\\$'

	def _count_transitions(lines):
	    """Index the words of *lines* and count adjacent word pairs.

	    Every line is split on whitespace and wrapped in the BOL/EOL sentinels
	    before its neighbouring pairs are counted.

	    Returns a tuple ``(word_index_hash, index_word_hash, following,
	    preceding)`` where ``word_index_hash`` maps each word to an integer ID
	    (BOL is 0, EOL is 1), ``index_word_hash`` is the inverse lookup list,
	    ``following[a][b]`` is how often ``b`` came directly after ``a`` and
	    ``preceding[b][a]`` holds the same count keyed by the second word.
	    """
	    word_index_hash = {BOL: 0, EOL: 1}
	    index_word_hash = [BOL, EOL]
	    following = collections.defaultdict(lambda: collections.defaultdict(int))
	    preceding = collections.defaultdict(lambda: collections.defaultdict(int))

	    for line in lines:
	        indices = []
	        for word in [BOL] + line.split() + [EOL]:
	            # Assign a unique ID to each word the first time it is seen.
	            if word not in word_index_hash:
	                word_index_hash[word] = len(index_word_hash)
	                index_word_hash.append(word)
	            indices.append(word_index_hash[word])

	        for wi1, wi2 in zip(indices, indices[1:]):
	            following[wi1][wi2] += 1
	            preceding[wi2][wi1] += 1

	    return word_index_hash, index_word_hash, following, preceding


	def _sampling_tables(counts):
	    """Turn nested pair counts into per-word cumulative sampling tables.

	    Returns a dict mapping a word ID to ``(candidates, cumulative)`` where
	    ``cumulative[i]`` is the running total of the counts of
	    ``candidates[:i + 1]``.  Each row is scaled by its own total, so forward
	    and backward tables are both proper distributions.
	    """
	    tables = {}
	    for wi1, neighbours in counts.items():
	        candidates = list(neighbours)
	        cumulative = []
	        running = 0
	        for wi2 in candidates:
	            running += neighbours[wi2]
	            cumulative.append(running)
	        tables[wi1] = (candidates, cumulative)
	    return tables


	def _random_walk(start, stop, tables):
	    """Follow random transitions from *start* until *stop* is reached.

	    Returns the list of visited word IDs, beginning with *start*.  The walk
	    also ends if the current word has no outgoing transition, which can only
	    happen for a sentinel that never occurred in the corpus.
	    """
	    path = [start]
	    while path[-1] != stop:
	        if path[-1] not in tables:
	            break
	        candidates, cumulative = tables[path[-1]]
	        # Integer draw in [0, total): picks each candidate with probability
	        # proportional to its count, without floating-point rounding.
	        pick = random.randrange(cumulative[-1])
	        path.append(candidates[bisect.bisect_right(cumulative, pick)])
	    return path


	def main(corpus_path='tweets.txt', samples=10):
	    """Interactively generate sentences around a word typed by the user.

	    Builds a word-bigram model from *corpus_path* (one whitespace-tokenised
	    sentence per line, UTF-8).  For every word read from standard input that
	    occurs in the corpus, prints *samples* random sentences that contain it;
	    unknown words are answered with ``……``.  The loop ends when standard
	    input reaches end-of-file.

	    Args:
	        corpus_path: Path of the tokenised corpus file.
	        samples: Number of sentences printed per input word.
	    """
	    with open(corpus_path, 'r', encoding='utf-8') as fin:
	        word_index_hash, index_word_hash, following, preceding = (
	            _count_transitions(fin))

	    word_word = _sampling_tables(following)
	    reversed_word_word = _sampling_tables(preceding)
	    bol_index = word_index_hash[BOL]
	    eol_index = word_index_hash[EOL]

	    while True:
	        try:
	            center = input('> ')
	        except EOFError:
	            # End of input: finish the prompt line and leave quietly.
	            print()
	            break
	        if center not in word_index_hash:
	            print('……')
	            continue

	        center_index = word_index_hash[center]
	        for _ in range(samples):
	            # Grow the sentence towards its end (until EOL).
	            sentence = _random_walk(center_index, eol_index, word_word)
	            # Grow the sentence towards its beginning (until BOL).
	            reversed_sentence = _random_walk(
	                center_index, bol_index, reversed_word_word)

	            # Drop the leading BOL; the centre word comes from this half.
	            head = [index_word_hash[wi] for wi in reversed(reversed_sentence)][1:]
	            # Drop the centre word (already printed) and the trailing EOL.
	            tail = [index_word_hash[wi] for wi in sentence[1:-1]]
	            print(''.join(head) + ''.join(tail))

	# Call main() only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()
	import json
	import os

	import janome.tokenizer

	USER_ID = 1 # ID of the account to analyse


	def main():
	    """Tokenise the account's own tweets and write them to ``tweets.txt``.

	    Every file in ``tweets/`` is loaded as a JSON array of tweet objects.
	    Tweets are kept only when they were posted by ``USER_ID`` and are not a
	    retweet, not a reply and contain no ``@``.  Each kept tweet becomes one
	    output line with its tokens separated by single spaces.
	    """
	    tokenizer = janome.tokenizer.Tokenizer(wakati=True)

	    # ``with`` guarantees the output is flushed and closed even on errors.
	    with open('tweets.txt', 'w', encoding='utf-8') as fout:
	        # os.listdir() returns names in arbitrary order; sort for
	        # reproducible output.
	        for tweet_file in sorted(os.listdir('tweets/')):
	            # Explicit encoding: the platform default may not be UTF-8.
	            with open(os.path.join('tweets', tweet_file), 'r',
	                      encoding='utf-8') as fin:
	                print(tweet_file)
	                tweets = json.load(fin)

	            for tweet in tweets:
	                if (tweet['user']['id'] != USER_ID
	                        or 'retweeted_status' in tweet
	                        # Test the value, not the key: payloads that carry
	                        # the key with a null value are not replies.
	                        or tweet.get('in_reply_to_screen_name')
	                        or '@' in tweet['text']):
	                    continue
	                fout.write(' '.join(tokenizer.tokenize(tweet['text'])) + '\n')


	# Call main() only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()