Skip to content

Instantly share code, notes, and snippets.

@matsu7874
Created September 9, 2017 05:34
Show Gist options
  • Save matsu7874/a1aa390cb2345a956dbc016d8439d3b5 to your computer and use it in GitHub Desktop.
Save matsu7874/a1aa390cb2345a956dbc016d8439d3b5 to your computer and use it in GitHub Desktop.
Project KNJ
import collections
import random
import bisect
# Sentinel tokens wrapped around every corpus line. They are ordinary entries
# in the vocabulary (ids 0 and 1) so that "start of line" and "end of line"
# take part in the transition tables like any other word.
BOL = '\\^'
EOL = '\\$'


def _load_bigram_counts(path):
    """Read a whitespace-tokenized corpus and count adjacent word pairs.

    Every line is framed as ``BOL w1 w2 ... EOL`` and each distinct word is
    assigned a unique integer id in order of first appearance.

    Returns a 4-tuple ``(word_index_hash, index_word_hash, forward, backward)``
    where ``forward[a][b]`` is how often word ``b`` directly follows word ``a``
    and ``backward[b][a]`` is the same count keyed by the following word.
    """
    word_index_hash = {BOL: 0, EOL: 1}
    index_word_hash = {0: BOL, 1: EOL}
    forward = collections.defaultdict(lambda: collections.defaultdict(int))
    backward = collections.defaultdict(lambda: collections.defaultdict(int))
    # The context manager closes the corpus even if a line fails to decode.
    with open(path, 'r', encoding='utf-8') as fin:
        for line in fin:
            ids = []
            for word in [BOL] + line.split() + [EOL]:
                if word not in word_index_hash:
                    new_id = len(word_index_hash)
                    word_index_hash[word] = new_id
                    index_word_hash[new_id] = word
                ids.append(word_index_hash[word])
            for wi1, wi2 in zip(ids, ids[1:]):
                forward[wi1][wi2] += 1
                backward[wi2][wi1] += 1
    return word_index_hash, index_word_hash, forward, backward


def _cumulative_table(pair_counts, size):
    """Map each word's neighbours onto sub-intervals of [0, 1).

    ``table[w]`` is a pair ``(thresholds, targets)``: ``thresholds[k]`` is the
    left edge of the interval owned by ``targets[k]``, and the interval width
    is that neighbour's share of the row total.  Each row is normalised by its
    own total, so rows for words that only ever occur as a successor (such as
    EOL in the backward table) are proper distributions too.
    """
    table = []
    for wi1 in range(size):
        # .get() avoids inserting empty rows into the defaultdict.
        row = pair_counts.get(wi1, {})
        total = sum(row.values())
        thresholds = []
        targets = []
        begin = 0.0
        for wi2, cnt in row.items():
            thresholds.append(begin)
            targets.append(wi2)
            begin += cnt / total
        table.append((thresholds, targets))
    return table


def _random_walk(start, stop, table):
    """Follow random transitions in ``table`` from ``start`` until ``stop``.

    Returns the list of visited word ids, including both end points.  Every
    word other than ``stop`` has at least one neighbour in a table built from
    BOL/EOL-framed lines, so the row lookup below never hits an empty row.
    """
    path = [start]
    while path[-1] != stop:
        thresholds, targets = table[path[-1]]
        # thresholds[0] is 0.0 and random() is in [0, 1), so bisect_right is
        # at least 1; max() is kept as a guard against an out-of-range index.
        pos = bisect.bisect_right(thresholds, random.random()) - 1
        path.append(targets[max(0, pos)])
    return path


def main():
    """Interactive Markov-chain sentence generator.

    Builds first-order forward and backward transition tables from
    ``tweets.txt`` (one whitespace-tokenized sentence per line), then
    repeatedly reads a word from standard input and prints ten random
    sentences that contain it.  A word not present in the corpus prints
    ``……``.  End of input (EOF) ends the session cleanly.
    """
    word_index_hash, index_word_hash, forward_cnt, backward_cnt = \
        _load_bigram_counts('tweets.txt')
    uniq_word_size = len(word_index_hash)
    word_word = _cumulative_table(forward_cnt, uniq_word_size)
    reversed_word_word = _cumulative_table(backward_cnt, uniq_word_size)
    bol_index = word_index_hash[BOL]
    eol_index = word_index_hash[EOL]

    while True:
        print('> ', end='')
        try:
            center = input()
        except EOFError:
            # Finish the prompt line and leave instead of dumping a traceback.
            print()
            break
        if center not in word_index_hash:
            print('……')
            continue
        center_index = word_index_hash[center]
        for _ in range(10):
            # Grow the sentence toward its end (center ... EOL).
            sentence = _random_walk(center_index, eol_index, word_word)
            # Grow the sentence toward its beginning (center ... BOL).
            reversed_sentence = _random_walk(
                center_index, bol_index, reversed_word_word)
            # Drop the leading BOL; the center word stays in the head part.
            head = [index_word_hash[wi]
                    for wi in reversed(reversed_sentence)][1:]
            # Drop the center word (already in head) and the trailing EOL.
            tail = [index_word_hash[wi] for wi in sentence[1:-1]]
            print(''.join(head) + ''.join(tail))


if __name__ == '__main__':
    main()
import json
import os
import janome.tokenizer
USER_ID = 1  # ID of the account whose tweets should be analyzed


def main():
    """Dump one user's original tweets to ``tweets.txt`` as tokenized lines.

    Every file in the ``tweets/`` directory is parsed as a JSON array of tweet
    objects.  A tweet is kept only if it was written by ``USER_ID``, is not a
    retweet, is not a reply and contains no ``@``.  The text of each kept
    tweet is split into words with janome and written as one space-separated
    line.
    """
    tokenizer = janome.tokenizer.Tokenizer(wakati=True)
    # Context managers make sure the output is flushed and closed, and that
    # each input file is released even if parsing fails.
    with open('tweets.txt', 'w', encoding='utf-8') as fout:
        for tweet_file in os.listdir('tweets/'):
            # Read as UTF-8 explicitly rather than the platform default,
            # which may not be able to decode non-ASCII tweet text.
            with open(os.path.join('tweets/', tweet_file), 'r',
                      encoding='utf-8') as fin:
                print(tweet_file)
                tweets = json.load(fin)
            for tweet in tweets:
                # NOTE(review): a reply is detected by a non-null
                # 'in_reply_to_screen_name'.  Data that omits the key for
                # non-replies behaves as before; data that carries the key
                # with a null value is no longer discarded wholesale.
                is_reply = tweet.get('in_reply_to_screen_name') is not None
                if (tweet['user']['id'] != USER_ID
                        or 'retweeted_status' in tweet
                        or is_reply
                        or '@' in tweet['text']):
                    continue
                fout.write(' '.join(tokenizer.tokenize(tweet['text'])) + '\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment