Making of tweet classifier helper functions
#I had just fetched a lot of tweets on a subject matter, and after cleaning I was left with over fourteen thousand unique tweets. If you ask me that is quite huge, but not too huge I guess.
#This clean extract is to be subjected to sentiment analysis to get more information and find out what views the tweets convey.
#Here is the problem:
#A sentence/message (tweet) has a perceived meaning it conveys, but the choice of words, emojis and emoticons can make it more elaborate.
#So the tasks to prepare the data before subjecting it to the NLP model were:
#Task 1
#Step 1: Identify the emotion words in the tweets and place them in a new column.
#Step 2: Identify the Twitter smileys (emojis) in the tweets and place them in a new column.
#Step 3: Identify the Twitter emoticons in the tweets and place them in a new column.
#Task 2
#Step 1: Replace the emotion words with their emotion categories in a new column.
#Step 2: Replace the Twitter smileys with their emotion categories in a new column.
#Step 3: Replace the Twitter emoticons with their emotion categories in a new column.
#This feels like too much, right?
#First, I set out to get a list of emotion words and their emotion categories (there is a platform for it, including some native languages).
#Second, I set out to get smileys and their emotion categories.
#Lastly, I set out again to get emoticons and their emotion categories.
#The next task is to cross-reference them with each tweet, then extract and replace them respectively in new columns of the CSV.
#OK, the normal thing to do would be to read over fourteen thousand tweets and memorize the emotion words, smileys and emoticons with their categories,
#right? right?
#That would be hell and I wouldn't do it even if science reported my memory could handle it; I don't have the time. I looked at some options like Labelbox, but it's not optimal for what I want to achieve.
#So I resorted to coding the process in Python: mkdir a project folder and put in the extracted tweets file along with the files of smileys, emoticons and emotion words I had gotten earlier.
import pandas as pd
#convert xlsx to csv
# df = pd.read_excel('./files/Xenophobia.xlsx')
# df.to_csv('./files/xenophobia.csv')
# Read the tweets and place them in a dataframe
data_processed = pd.read_csv('./files/xenophobia.csv')
emotion_words = pd.read_csv('./files/emotion_words.csv')
emoji_category = pd.read_csv('./files/emoji_category.csv')
emoticon_category = pd.read_csv('./files/emoticon_category.csv')
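# NOTE: the helpers below assume these column layouts (my reading of the lookups used later;
# adjust the names if your files differ):
#   xenophobia.csv        -> 'Tweet'
#   emotion_words.csv     -> 'Emotion_Word', 'Emotion_Category'
#   emoji_category.csv    -> 'Emoji', 'Category'
#   emoticon_category.csv -> 'Emoticon', 'Emoticon_Category'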
# for each tweet I have to detect the emotion words first by referencing the emotion word file
def get_emotion_words(tweet):
    words = []
    for word in tweet.split():
        if word in emotion_words['Emotion_Word'][0:-1].values:
            words.append(word)
    return words
#then iterate over the tweets, extracting the emotion words
def get_emotion_words_list(tweets):
    words_list = []
    for tweet in tweets['Tweet']:
        words = get_emotion_words(tweet)
        words_list.append(words)
    # append words_list as a new column in the dataframe
    tweets['Emotion_Words'] = words_list
    # tweets.to_csv('./files/data_processed.csv')
    return words_list
get_emotion_words_list(data_processed)
#check for emotion word and replace with emotion category
def replace_emotion_words(tweet):
    categories = []
    for word in tweet.split():
        if word in emotion_words['Emotion_Word'][0:-1].values:
            # look up the word's emotion category
            category = emotion_words[emotion_words['Emotion_Word'] == word]['Emotion_Category'].values[0]
            categories.append(category)
            # swap the emotion word for its category in the tweet text
            tweet = tweet.replace(word, category)
    return categories
#lastly, iterate over the tweets and store the emotion categories in a new column
def replace_emotion_words_list(data_with_emotion):
    words_list = []
    for tweet in data_with_emotion['Tweet']:
        words = replace_emotion_words(tweet)
        words_list.append(words)
    data_with_emotion['Emotional_Category'] = words_list
    # data_with_emotion.to_csv('./files/data_processed.csv')
    return words_list
replace_emotion_words_list(data_processed)
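# Side note: the row-by-row lookups above rescan the emotion_words dataframe for every matched word,
# which gets slow over fourteen thousand tweets. A rough, optional sketch of a faster but equivalent
# lookup (emotion_lookup and get_category are hypothetical helpers, not part of the pipeline above):
emotion_lookup = dict(zip(emotion_words['Emotion_Word'], emotion_words['Emotion_Category']))
def get_category(word):
    # O(1) dict lookup instead of a dataframe scan; returns None for non-emotion words
    return emotion_lookup.get(word)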
#next, for each tweet I have to detect the emojis used
def get_emoji(tweet):
    emojis = []
    for emoji in tweet.split():
        if emoji in emoji_category['Emoji'].values:
            emojis.append(emoji)
    # print(emojis)
    return emojis
#next, iterate over the tweets and get the emojis
def get_emoji_list(tweets):
    emoji_list = []
    for tweet in tweets['Tweet']:
        emojis = get_emoji(tweet)
        emoji_list.append(emojis)
    # append emoji_list as a new column in the dataframe
    tweets['Emoji'] = emoji_list
    # tweets.to_csv('./files/data_processed.csv')
    return emoji_list
get_emoji_list(data_processed)
# check for emoji and replace with emoji category
def replace_emoji(tweet):
    categories = []
    for emoji in tweet.split():
        if emoji in emoji_category['Emoji'][0:-1].values:
            # look up the emoji's category
            category = emoji_category[emoji_category['Emoji'] == emoji]['Category'].values[0]
            categories.append(category)
            # swap the emoji for its category in the tweet text
            tweet = tweet.replace(emoji, category)
    return categories
#lastly, iterate over the tweets and store the emoji categories in a new column
def replace_emoji_list(data_processed):
    emoji_list = []
    for tweet in data_processed['Tweet']:
        emojis = replace_emoji(tweet)
        emoji_list.append(emojis)
    data_processed['Emoji_Category'] = emoji_list
    # data_processed.to_csv('./files/data_processed.csv')
    # print(emoji_list)
    return emoji_list
replace_emoji_list(data_processed)
#here for each tweet detect the emoticon
def get_emoticon(tweet):
    emoticons = []
    for emoticon in tweet.split():
        if emoticon in emoticon_category['Emoticon'][0:-1].values:
            emoticons.append(emoticon)
    # print(emoticons)
    return emoticons
#again, iterate over the tweets and get the emoticons
def get_emoticon_list(tweets):
    emoticon_list = []
    for tweet in tweets['Tweet']:
        emoticons = get_emoticon(tweet)
        emoticon_list.append(emoticons)
    # append emoticon_list as a new column in the dataframe
    tweets['Emoticons'] = emoticon_list
    # tweets.to_csv('./files/data_processed.csv')
    return emoticon_list
get_emoticon_list(data_processed)
# check for emoticons and replace with emoticon category
def replace_emoticon(tweet):
    categories = []
    for emoticon in tweet.split():
        if emoticon in emoticon_category['Emoticon'][0:-1].values:
            # look up the emoticon's category
            category = emoticon_category[emoticon_category['Emoticon'] == emoticon]['Emoticon_Category'].values[0]
            categories.append(category)
            # swap the emoticon for its category in the tweet text
            tweet = tweet.replace(emoticon, category)
    return categories
# iterate over the tweets and store the emoticon categories in a new column
def replace_emoticon_list(data_processed):
    emoticon_list = []
    for tweet in data_processed['Tweet']:
        emoticons = replace_emoticon(tweet)
        emoticon_list.append(emoticons)
    data_processed['Emoticon_Category'] = emoticon_list
    data_processed.to_csv('./files/xenophobic_data_processed.csv')
    # print(emoticon_list)
    return emoticon_list
replace_emoticon_list(data_processed)
# lastly, I have to keep the extracts in the order they appear in each tweet.
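# A minimal sketch of how that last step could work (my assumption, not part of the pipeline above):
# walk each tweet once and tag every token that matches one of the lookup tables, so the extracts
# come out in the order they appear in the tweet. get_extracts_in_order is a hypothetical helper.
def get_extracts_in_order(tweet):
    extracts = []
    for token in tweet.split():
        if token in emotion_words['Emotion_Word'].values:
            extracts.append(('emotion_word', token))
        elif token in emoji_category['Emoji'].values:
            extracts.append(('emoji', token))
        elif token in emoticon_category['Emoticon'].values:
            extracts.append(('emoticon', token))
    return extracts
# e.g. data_processed['Extracts_In_Order'] = data_processed['Tweet'].apply(get_extracts_in_order)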