Last active
July 28, 2022 13:58
-
-
Save nully0x/9000a7b9175a47b7913a071b09ba76aa to your computer and use it in GitHub Desktop.
Making of tweet classifier helper functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#I had just fetched a lot of tweets regarding a subject matter, and after cleaning was left with over fourteen thousand unique tweets. If you ask me that is quite huge, but not too huge I guess.
#These clean extracts are to be subjected to sentiment analysis to get more information and learn what intent their views convey.
#Here is the problem | |
#A sentence/message (tweet) has a perceived meaning it conveys, but the choice of words, emojis and emoticons can make it more elaborate.
#So the task before subjecting to NLP model prepared were: | |
#Task 1 | |
#Step 1: Identify the Emotion words in the tweets and place them in a new column.. | |
#Step 2: Identify Twitter Smileys in the tweets and place them in a new column. | |
#Step3: Identify Twitter Emoticons in the tweets and place them in a new column. | |
#Task 2 | |
#Step 1: Replace the Emotion words with their emotion categories in a new column. | |
#Step 2: Replace the Twitter smileys with their emotion categories in a new column. | |
#Step 3: Replace the Twitter Emoticons with their emotion categories in a new column. | |
#This feels too much right? | |
#First I set out to get a list of emotion words and their emotion categories (there is a platform for it, including some native languages)
#Second I set out to get smileys and Emotion categories | |
#Lastly, I set out again to get Emoticons and their Emotion categories.
#The next task is to cross-reference them with each tweet, then extract and replace respectively in another column in the csv.
#Ok, the normal thing I would do is to read over fourteen thousand tweets, memorizing the emotion words, smileys and emoticons with their #categories respectively
#right? right? | |
#That would be hell and I wouldn't do it even if science reports my memory could manage it; I don't have the time. I looked at some options like Labelbox, but it's not optimal for what I want to achieve.
#So I resorted to coding the process in Python: mkdir and put in the extracted tweets file along with the files of smileys, emoticons and Emotion words #I had gotten earlier.
from os import replace | |
from nltk.corpus.reader import wordnet | |
from numpy.lib.function_base import append, extract | |
import pandas as pd | |
import numpy as np | |
# One-off conversion: the raw export was .xlsx; it was converted to CSV once
# and the lines kept (commented out) for provenance.
# df = pd.read_excel('./files/Xenophobia.xlsx')
# df.to_csv('./files/xenophobia.csv')
# Load the cleaned tweets plus the three lookup tables (emotion words,
# emoji categories, emoticon categories) into DataFrames.
# NOTE(review): paths mix './files/...' and 'files/...' — both resolve the
# same relative to the working directory, but one style should be picked.
data_processed = pd.read_csv('./files/xenophobia.csv')
emotion_words = pd.read_csv('./files/emotion_words.csv')
emoji_category = pd.read_csv('files/emoji_category.csv')
emoticon_category = pd.read_csv('files/emoticon_category.csv')
# for each tweet I have to detect the emotion words first by referencing the emotion word file
def get_emotion_words(tweet, lexicon=None):
    """Return the words of *tweet* that appear in the emotion-word lexicon.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with an 'Emotion_Word' column; defaults to
               the module-level ``emotion_words`` frame loaded from CSV.

    Fixed: the original referenced the undefined name ``emotional_words``
    (the frame is loaded as ``emotion_words``), raising NameError.
    """
    if lexicon is None:
        lexicon = emotion_words
    # NOTE(review): [0:-1] drops the lexicon's last row, as in the original —
    # presumably a trailing junk row in the CSV; confirm it is intentional.
    vocab = set(lexicon['Emotion_Word'][0:-1].values)  # O(1) membership per token
    return [word for word in tweet.split() if word in vocab]
#then iterate over the tweets, extracting the emotion words
def get_emotion_words_list(tweets):
    """Extract the emotion words from every tweet in *tweets* and store them
    in a new 'Emotion_Words' column (the DataFrame is mutated in place).

    tweets -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet emotion-word lists.
    """
    # Comprehension instead of the original manual append loop.
    words_list = [get_emotion_words(tweet) for tweet in tweets['Tweet']]
    tweets['Emotion_Words'] = words_list
    # tweets.to_csv('./files/data_processed.csv')
    return words_list
get_emotion_words_list(data_processed)
#check for emotion word and replace with emotion category | |
def replace_emotion_words(tweet, lexicon=None):
    """Map each emotion word found in *tweet* to its emotion category.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with 'Emotion_Word' and 'Emotion_Category'
               columns; defaults to the module-level ``emotion_words`` frame.
    Returns the list of categories, one per matched word, in order of
    appearance.

    Fixes two defects in the original:
      * it referenced the undefined name ``emotional_words`` (NameError);
      * ``tweet.replace(word, word)`` was a no-op — the original token had
        already been overwritten by its category, so the call replaced the
        category string with itself.
    """
    if lexicon is None:
        lexicon = emotion_words
    # NOTE(review): last lexicon row skipped, mirroring the original slice.
    vocab = set(lexicon['Emotion_Word'][0:-1].values)
    categories = []
    for word in tweet.split():
        if word in vocab:
            category = lexicon[lexicon['Emotion_Word'] == word]['Emotion_Category'].values[0]
            categories.append(category)
            # Substitute the category into the tweet text (local effect only).
            tweet = tweet.replace(word, category)
    return categories
#lastly iterate over the emotional word and replace with emotion category | |
def replace_emotion_words_list(data_with_emotion):
    """Replace each tweet's emotion words with their categories and store the
    result in a new 'Emotional_Category' column (mutated in place).

    data_with_emotion -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet category lists.
    """
    # Comprehension instead of the original manual append loop.
    words_list = [replace_emotion_words(tweet) for tweet in data_with_emotion['Tweet']]
    data_with_emotion['Emotional_Category'] = words_list
    # data_with_emotion.to_csv('./files/data_processed.csv')
    return words_list
replace_emotion_words_list(data_processed)
#next is for each tweet have to detect the emojis used | |
def get_emoji(tweet, lexicon=None):
    """Return the whitespace-separated tokens of *tweet* that are known emojis.

    tweet   -- a whitespace-separated string (one tweet); emojis are only
               detected when surrounded by whitespace, since the tweet is
               tokenised with str.split().
    lexicon -- optional DataFrame with an 'Emoji' column; defaults to the
               module-level ``emoji_category`` frame.

    Removed the per-tweet debug print — it flooded stdout across ~14k tweets.
    NOTE(review): unlike the other lookups, this one uses the whole column
    (no [0:-1] slice) — confirm which is intended.
    """
    if lexicon is None:
        lexicon = emoji_category
    known = set(lexicon['Emoji'].values)  # O(1) membership per token
    return [token for token in tweet.split() if token in known]
#next is to iterate over the tweets and get the emoji | |
def get_emoji_list(emojis):
    """Collect the emojis used in each tweet and store them in a new 'Emoji'
    column on *emojis* (mutated in place).

    emojis -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet emoji lists.

    Fixed: the original ignored its parameter and iterated/mutated the
    module-level ``data_processed`` instead.
    """
    emoji_list = [get_emoji(tweet) for tweet in emojis['Tweet']]
    emojis['Emoji'] = emoji_list
    # emojis.to_csv('./files/data_processed.csv')
    return emoji_list
get_emoji_list(data_processed)
# check for emoji and replace with emoji category | |
def replace_emoji(tweet, lexicon=None):
    """Map each known emoji in *tweet* to its category.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with 'Emoji' and 'Category' columns;
               defaults to the module-level ``emoji_category`` frame.
    Returns the list of categories, one per matched emoji, in order of
    appearance.

    Fixed: ``tweet.replace(emoji, emoji)`` was a no-op self-replace — the
    original token was overwritten by its category before the replace call.
    """
    if lexicon is None:
        lexicon = emoji_category
    # NOTE(review): last row skipped here but not in get_emoji — confirm.
    known = set(lexicon['Emoji'][0:-1].values)
    categories = []
    for token in tweet.split():
        if token in known:
            category = lexicon[lexicon['Emoji'] == token]['Category'].values[0]
            categories.append(category)
            # Substitute the category into the tweet text (local effect only).
            tweet = tweet.replace(token, category)
    return categories
#lastly is to iterate over the emojis and replace with emoji category | |
def replace_emoji_list(data_processed):
    """Replace each tweet's emojis with their categories and store the result
    in a new 'Emoji_Category' column (mutated in place).

    data_processed -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet category lists.

    Removed the trailing debug print of the full result list.
    """
    emoji_list = [replace_emoji(tweet) for tweet in data_processed['Tweet']]
    data_processed['Emoji_Category'] = emoji_list
    # data_processed.to_csv('./files/data_processed.csv')
    return emoji_list
replace_emoji_list(data_processed)
#here for each tweet detect the emoticon | |
def get_emoticon(tweet, lexicon=None):
    """Return the whitespace-separated tokens of *tweet* that are known
    emoticons.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with an 'Emoticon' column; defaults to the
               module-level ``emoticon_category`` frame.

    Removed the per-tweet debug print — it flooded stdout across ~14k tweets.
    NOTE(review): [0:-1] drops the lexicon's last row, as in the original.
    """
    if lexicon is None:
        lexicon = emoticon_category
    known = set(lexicon['Emoticon'][0:-1].values)  # O(1) membership per token
    return [token for token in tweet.split() if token in known]
#again iterate over the tweets and get the emoticons
def get_emoticon_list(tweets):
    """Collect the emoticons used in each tweet and store them in a new
    'Emoticons' column on *tweets* (mutated in place).

    tweets -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet emoticon lists.

    Renamed from the original ``get_emoji_list``, which silently clobbered
    the earlier function of the same name; it also ignored its parameter in
    favour of the module-level ``data_processed``.
    """
    emoticon_list = [get_emoticon(tweet) for tweet in tweets['Tweet']]
    tweets['Emoticons'] = emoticon_list
    # tweets.to_csv('./files/data_processed.csv')
    return emoticon_list
get_emoticon_list(data_processed)
# check for emotion word and replace with emotion category | |
def replace_emoticon(tweet, lexicon=None):
    """Map each known emoticon in *tweet* to its category.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with 'Emoticon' and 'Emoticon_Category'
               columns; defaults to the module-level ``emoticon_category``.
    Returns the list of categories, one per matched emoticon, in order of
    appearance.

    Fixed: ``tweet.replace(emoticon, emoticon)`` was a no-op self-replace —
    the original token was overwritten by its category before the call.
    """
    if lexicon is None:
        lexicon = emoticon_category
    # NOTE(review): last lexicon row skipped, mirroring the original slice.
    known = set(lexicon['Emoticon'][0:-1].values)
    categories = []
    for token in tweet.split():
        if token in known:
            category = lexicon[lexicon['Emoticon'] == token]['Emoticon_Category'].values[0]
            categories.append(category)
            # Substitute the category into the tweet text (local effect only).
            tweet = tweet.replace(token, category)
    return categories
# iterate over the emotional word and replace with emotion category | |
def replace_emoticon_list(data_processed):
    """Replace each tweet's emoticons with their categories, store the result
    in a new 'Emoticon_Category' column, and persist the full DataFrame to
    './files/xenophobic_data_processed.csv'.

    data_processed -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet category lists.

    Removed the trailing debug print of the full result list; the CSV write
    is kept because it is the script's final output.
    """
    emoticon_list = [replace_emoticon(tweet) for tweet in data_processed['Tweet']]
    data_processed['Emoticon_Category'] = emoticon_list
    data_processed.to_csv('./files/xenophobic_data_processed.csv')
    return emoticon_list
replace_emoticon_list(data_processed)
# lastly I have to keep the extracts in the order they appear in each tweet.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment