Created
September 9, 2017 05:34
-
-
Save matsu7874/a1aa390cb2345a956dbc016d8439d3b5 to your computer and use it in GitHub Desktop.
Project KNJ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import random | |
import bisect | |
BOL = '\\^'  # sentinel token inserted before the first word of every line
EOL = '\\$'  # sentinel token appended after the last word of every line


def _count_bigrams(lines):
    """Number every distinct word and count adjacent word pairs.

    Each line is split on whitespace and wrapped in BOL/EOL sentinels.

    Returns a tuple ``(word_index_hash, index_word_hash, forward, backward)``:
    ``word_index_hash`` maps word -> id, ``index_word_hash`` is a list mapping
    id -> word, ``forward[a][b]`` is how often ``b`` directly follows ``a``,
    and ``backward[b][a]`` holds the same count keyed by the second word.
    """
    word_index_hash = {BOL: 0, EOL: 1}
    index_word_hash = [BOL, EOL]
    forward = collections.defaultdict(lambda: collections.defaultdict(int))
    backward = collections.defaultdict(lambda: collections.defaultdict(int))
    for line in lines:
        ids = []
        for word in [BOL] + line.split() + [EOL]:
            # Give each previously unseen word the next free id.
            if word not in word_index_hash:
                word_index_hash[word] = len(index_word_hash)
                index_word_hash.append(word)
            ids.append(word_index_hash[word])
        for left, right in zip(ids, ids[1:]):
            forward[left][right] += 1
            backward[right][left] += 1
    return word_index_hash, index_word_hash, forward, backward


def _build_sampler(counts):
    """Convert ``{word: {neighbour: count}}`` into cumulative-count tables.

    The result maps each word id to ``(cumulative, neighbours)`` where
    ``cumulative[i]`` is the running total of the counts of
    ``neighbours[0..i]``.  Keeping raw running totals (instead of dividing
    by a separately tracked word count) means every table is correctly
    scaled by its own total, including the predecessors of EOL.
    """
    table = {}
    for word, neighbour_counts in counts.items():
        neighbours = list(neighbour_counts)
        cumulative = []
        running = 0
        for neighbour in neighbours:
            running += neighbour_counts[neighbour]
            cumulative.append(running)
        table[word] = (cumulative, neighbours)
    return table


def _walk(start, stop, table):
    """Random-walk from ``start`` through ``table`` until ``stop`` is reached.

    Each step picks a neighbour with probability proportional to its count.
    Returns the list of visited word ids, beginning with ``start`` and
    ending with ``stop``.
    """
    path = [start]
    while path[-1] != stop:
        cumulative, neighbours = table[path[-1]]
        point = random.random() * cumulative[-1]
        position = bisect.bisect_right(cumulative, point)
        # Clamp in case float rounding pushes ``point`` up to the total.
        path.append(neighbours[min(position, len(neighbours) - 1)])
    return path


def main(corpus_path='tweets.txt', num_sentences=10):
    """Interactively generate sentences around a word typed by the user.

    Reads whitespace-tokenised lines from ``corpus_path``, builds first-order
    forward and backward transition tables, then repeatedly prompts with
    ``> ``.  For a known word, ``num_sentences`` sentences are printed, each
    grown from that word toward the end of the line and toward its
    beginning.  For an unknown word ``……`` is printed.  The loop ends
    cleanly when standard input reaches end-of-file.
    """
    with open(corpus_path, 'r', encoding='utf-8') as fin:
        word_index_hash, index_word_hash, forward, backward = _count_bigrams(fin)
    forward_table = _build_sampler(forward)
    backward_table = _build_sampler(backward)
    bol = word_index_hash[BOL]
    eol = word_index_hash[EOL]

    while True:
        print('> ', end='')
        try:
            center = input()
        except EOFError:
            # Input exhausted: finish the prompt line and stop.
            print()
            return
        if center not in word_index_hash:
            print('……')
            continue
        center_index = word_index_hash[center]
        for _ in range(num_sentences):
            # Extend the sentence toward its end.
            following = _walk(center_index, eol, forward_table)
            # Extend the sentence toward its beginning.
            preceding = _walk(center_index, bol, backward_table)
            ids = preceding[::-1] + following[1:]
            # The sentinels are bookkeeping only and are never printed.
            print(''.join(index_word_hash[i] for i in ids
                          if i != bol and i != eol))
# Run the interactive generator only when executed as a script.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import janome.tokenizer | |
USER_ID = 1  # ID of the account whose tweets should be analysed
def main():
    """Tokenise one account's original tweets into ``tweets.txt``.

    Every file in the ``tweets/`` directory is parsed as a JSON list of
    tweet objects.  A tweet is skipped when its author id differs from
    ``USER_ID``, when it has a ``retweeted_status`` or
    ``in_reply_to_screen_name`` key, or when its text contains ``@``.
    The remaining texts are split with janome's wakati tokenizer and written
    one tweet per line, tokens separated by single spaces.
    """
    tokenizer = janome.tokenizer.Tokenizer(wakati=True)
    # ``with`` guarantees the output is flushed and closed even on errors.
    with open('tweets.txt', 'w', encoding='utf-8') as fout:
        # Sorted so that the output order is reproducible between runs.
        for tweet_file in sorted(os.listdir('tweets/')):
            path = os.path.join('tweets', tweet_file)
            # JSON is UTF-8; do not depend on the platform default encoding.
            with open(path, 'r', encoding='utf-8') as fin:
                print(tweet_file)
                tweets = json.load(fin)
            for tweet in tweets:
                # NOTE(review): the reply test checks key presence only, not
                # its value -- confirm this matches the export format used.
                if (tweet['user']['id'] != USER_ID
                        or 'retweeted_status' in tweet
                        or 'in_reply_to_screen_name' in tweet
                        or '@' in tweet['text']):
                    continue
                tokens = tokenizer.tokenize(tweet['text'])
                fout.write(' '.join(tokens) + '\n')
# Run the tweet extraction only when executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment