@NetBUG
Created May 28, 2017 19:21
Pipeline for word analysis in Twi
import numpy as np
words_file = 'words.txt'
users_file = 'users.txt'
users = [l.strip() for l in open(users_file).readlines()]  # list of user ids, one per line
words = [l.strip() for l in open(words_file).readlines()]  # list of marker words, one per line
def count_words_general(users, words):
    wu = np.zeros((len(users), len(words)))       # matrix of all word counts by user
    wu_self = np.zeros((len(users), len(words)))  # matrix of word counts by user for tweets which are NOT a reply to anyone
    # Boilerplate for iterating over all files (the `tweets` iterable is assumed to be built from them elsewhere)
    for t in tweets:
        if t.user_id not in users:
            continue
        uid = users.index(t.user_id)
        for word in t.text.split(' '):
            if word not in words:
                continue
            # Incrementing the word-user matrix
            word_id = words.index(word)
            wu[uid][word_id] += 1
            if t.in_reply_to is None:
                wu_self[uid][word_id] += 1
        if t.in_reply_to is not None:
            # Adding to an output file to scan again for the thread and build the count
            pass
    return wu, wu_self
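
# A hedged sketch of a tweet loader (illustrative only; the original gist never defines the
# `tweets` iterable used by count_words_general, and the storage format is not specified).
# It assumes a hypothetical tweets.jsonl file with one JSON object per line containing
# `user_id`, `text` and `in_reply_to` fields.
import json
from collections import namedtuple

Tweet = namedtuple('Tweet', ['user_id', 'text', 'in_reply_to'])

def load_tweets(path='tweets.jsonl'):
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            yield Tweet(rec['user_id'], rec['text'], rec.get('in_reply_to'))
# tweets = list(load_tweets())  # would provide the module-level `tweets` used by count_words_general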
def get_conversations(userlist, out_folder = './threads'):
    # Calling get_oleg.py with a restriction on user_ids when outputting threads?
    # Saves threads/USER_ID.txt in JSON format, one record per conversation
    pass
def count_conversation_word_mention(userid, wordlist):
    # Counts mentions of the marker words inside the saved conversations of one user
    # (a possible helper is sketched below)
    pass
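
# A hedged sketch of a possible helper for count_conversation_word_mention (illustrative only;
# the thread file layout is only hinted at in get_conversations above). It assumes each line of
# threads/USER_ID.txt is one JSON record per conversation, with a hypothetical `tweets` list of
# {"text": ...} objects.
import json

def count_thread_word_mentions(userid, wordlist, threads_folder='./threads'):
    counts = np.zeros(len(wordlist))
    with open('%s/%s.txt' % (threads_folder, userid)) as f:
        for line in f:
            conversation = json.loads(line)
            for tweet in conversation.get('tweets', []):
                for word in tweet.get('text', '').split(' '):
                    if word in wordlist:
                        counts[wordlist.index(word)] += 1
    return counts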
if __name__ == '__main__':
    wu, wu_self = count_words_general(users, words)  # build the frequency dictionary of marker words for all users
    get_conversations(users)                         # extract all conversations of the listed users
    # Next, either count all mentions of a word in conversations and subtract that from the total,
    # or take wu_self (mentions of the word outside conversations) and divide it by the mentions
    # inside conversations, whichever you prefer
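    # A hedged sketch of the second option above (illustrative only, not part of the original gist):
    # in-conversation mentions are approximated here as the total counts minus the non-reply counts.
    wu_conv = wu - wu_self                      # approximate per-user, per-word mentions inside conversations
    ratio = wu_self / np.maximum(wu_conv, 1)    # out-of-conversation vs. in-conversation mention ratio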