Last active
July 28, 2022 13:58
-
-
Save nully0x/9000a7b9175a47b7913a071b09ba76aa to your computer and use it in GitHub Desktop.
Making of tweet classifier helper functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#I had just fetched a lot of tweets regarding a subject matter, and after cleaning was left with over fourteen thousand unique tweets. If you ask me that is quite huge, but not too huge I guess.
#These clean extracts are to be subjected to sentiment analysis to get more information and learn what intent their views convey.
#Here is the problem | |
#A sentence/message (tweet) has a perceived meaning it conveys, but the choice of words, emojis and emoticons can make it more elaborate.
#So the task before subjecting to NLP model prepared were: | |
#Task 1 | |
#Step 1: Identify the Emotion words in the tweets and place them in a new column.. | |
#Step 2: Identify Twitter Smileys in the tweets and place them in a new column. | |
#Step3: Identify Twitter Emoticons in the tweets and place them in a new column. | |
#Task 2 | |
#Step 1: Replace the Emotion words with their emotion categories in a new column. | |
#Step 2: Replace the Twitter smileys with their emotion categories in a new column. | |
#Step 3: Replace the Twitter Emoticons with their emotion categories in a new column. | |
#This feels too much right? | |
#First I set out to get a list of emotion words and their emotion categories (there is a platform for it, including some native languages)
#Second I set out to get smileys and Emotion categories | |
#Lastly, I set out again to get Emoticons and their Emotion categories.
#The next task is to cross-reference them with each tweet, then extract and replace respectively in another column in the csv.
#Ok, the normal thing I would do is to read over fourteen thousand tweets, memorizing the emotion words, smileys and emoticons with their #categories respectively
#right? right? | |
#That would be hell and I wouldn't do it even if science reports my memory could manage it; I don't have the time. I looked at some options like Labelbox, but it's not optimal for what I want to achieve.
#So I resorted to coding the process in Python: mkdir and put in the extracted tweets file along with the files of smileys, emoticons and Emotion words #I had gotten earlier.
from os import replace | |
from nltk.corpus.reader import wordnet | |
from numpy.lib.function_base import append, extract | |
import pandas as pd | |
import numpy as np | |
# One-off conversion: the raw export was .xlsx; it was converted to CSV once
# and the lines kept (commented out) for provenance.
# df = pd.read_excel('./files/Xenophobia.xlsx')
# df.to_csv('./files/xenophobia.csv')
# Load the cleaned tweets plus the three lookup tables (emotion words,
# emoji categories, emoticon categories) into DataFrames.
# NOTE(review): paths mix './files/...' and 'files/...' — both resolve the
# same relative to the working directory, but one style should be picked.
data_processed = pd.read_csv('./files/xenophobia.csv')
emotion_words = pd.read_csv('./files/emotion_words.csv')
emoji_category = pd.read_csv('files/emoji_category.csv')
emoticon_category = pd.read_csv('files/emoticon_category.csv')
# for each tweet I have to detect the emotion words first by referencing the emotion word file
def get_emotion_words(tweet, lexicon=None):
    """Return the words of *tweet* that appear in the emotion-word lexicon.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with an 'Emotion_Word' column; defaults to
               the module-level ``emotion_words`` frame loaded from CSV.

    Fixed: the original referenced the undefined name ``emotional_words``
    (the frame is loaded as ``emotion_words``), raising NameError.
    """
    if lexicon is None:
        lexicon = emotion_words
    # NOTE(review): [0:-1] drops the lexicon's last row, as in the original —
    # presumably a trailing junk row in the CSV; confirm it is intentional.
    vocab = set(lexicon['Emotion_Word'][0:-1].values)  # O(1) membership per token
    return [word for word in tweet.split() if word in vocab]
#then iterate over the tweets, extracting the emotion words
def get_emotion_words_list(tweets):
    """Extract the emotion words from every tweet in *tweets* and store them
    in a new 'Emotion_Words' column (the DataFrame is mutated in place).

    tweets -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet emotion-word lists.
    """
    # Comprehension instead of the original manual append loop.
    words_list = [get_emotion_words(tweet) for tweet in tweets['Tweet']]
    tweets['Emotion_Words'] = words_list
    # tweets.to_csv('./files/data_processed.csv')
    return words_list
get_emotion_words_list(data_processed)
#check for emotion word and replace with emotion category | |
def replace_emotion_words(tweet, lexicon=None):
    """Map each emotion word found in *tweet* to its emotion category.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with 'Emotion_Word' and 'Emotion_Category'
               columns; defaults to the module-level ``emotion_words`` frame.
    Returns the list of categories, one per matched word, in order of
    appearance.

    Fixes two defects in the original:
      * it referenced the undefined name ``emotional_words`` (NameError);
      * ``tweet.replace(word, word)`` was a no-op — the original token had
        already been overwritten by its category, so the call replaced the
        category string with itself.
    """
    if lexicon is None:
        lexicon = emotion_words
    # NOTE(review): last lexicon row skipped, mirroring the original slice.
    vocab = set(lexicon['Emotion_Word'][0:-1].values)
    categories = []
    for word in tweet.split():
        if word in vocab:
            category = lexicon[lexicon['Emotion_Word'] == word]['Emotion_Category'].values[0]
            categories.append(category)
            # Substitute the category into the tweet text (local effect only).
            tweet = tweet.replace(word, category)
    return categories
#lastly iterate over the emotional word and replace with emotion category | |
def replace_emotion_words_list(data_with_emotion):
    """Replace each tweet's emotion words with their categories and store the
    result in a new 'Emotional_Category' column (mutated in place).

    data_with_emotion -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet category lists.
    """
    # Comprehension instead of the original manual append loop.
    words_list = [replace_emotion_words(tweet) for tweet in data_with_emotion['Tweet']]
    data_with_emotion['Emotional_Category'] = words_list
    # data_with_emotion.to_csv('./files/data_processed.csv')
    return words_list
replace_emotion_words_list(data_processed)
#next is for each tweet have to detect the emojis used | |
def get_emoji(tweet, lexicon=None):
    """Return the whitespace-separated tokens of *tweet* that are known emojis.

    tweet   -- a whitespace-separated string (one tweet); emojis are only
               detected when surrounded by whitespace, since the tweet is
               tokenised with str.split().
    lexicon -- optional DataFrame with an 'Emoji' column; defaults to the
               module-level ``emoji_category`` frame.

    Removed the per-tweet debug print — it flooded stdout across ~14k tweets.
    NOTE(review): unlike the other lookups, this one uses the whole column
    (no [0:-1] slice) — confirm which is intended.
    """
    if lexicon is None:
        lexicon = emoji_category
    known = set(lexicon['Emoji'].values)  # O(1) membership per token
    return [token for token in tweet.split() if token in known]
#next is to iterate over the tweets and get the emoji | |
def get_emoji_list(emojis):
    """Collect the emojis used in each tweet and store them in a new 'Emoji'
    column on *emojis* (mutated in place).

    emojis -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet emoji lists.

    Fixed: the original ignored its parameter and iterated/mutated the
    module-level ``data_processed`` instead.
    """
    emoji_list = [get_emoji(tweet) for tweet in emojis['Tweet']]
    emojis['Emoji'] = emoji_list
    # emojis.to_csv('./files/data_processed.csv')
    return emoji_list
get_emoji_list(data_processed)
# check for emoji and replace with emoji category | |
def replace_emoji(tweet, lexicon=None):
    """Map each known emoji in *tweet* to its category.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with 'Emoji' and 'Category' columns;
               defaults to the module-level ``emoji_category`` frame.
    Returns the list of categories, one per matched emoji, in order of
    appearance.

    Fixed: ``tweet.replace(emoji, emoji)`` was a no-op self-replace — the
    original token was overwritten by its category before the replace call.
    """
    if lexicon is None:
        lexicon = emoji_category
    # NOTE(review): last row skipped here but not in get_emoji — confirm.
    known = set(lexicon['Emoji'][0:-1].values)
    categories = []
    for token in tweet.split():
        if token in known:
            category = lexicon[lexicon['Emoji'] == token]['Category'].values[0]
            categories.append(category)
            # Substitute the category into the tweet text (local effect only).
            tweet = tweet.replace(token, category)
    return categories
#lastly is to iterate over the emojis and replace with emoji category | |
def replace_emoji_list(data_processed):
    """Replace each tweet's emojis with their categories and store the result
    in a new 'Emoji_Category' column (mutated in place).

    data_processed -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet category lists.

    Removed the trailing debug print of the full result list.
    """
    emoji_list = [replace_emoji(tweet) for tweet in data_processed['Tweet']]
    data_processed['Emoji_Category'] = emoji_list
    # data_processed.to_csv('./files/data_processed.csv')
    return emoji_list
replace_emoji_list(data_processed)
#here for each tweet detect the emoticon | |
def get_emoticon(tweet, lexicon=None):
    """Return the whitespace-separated tokens of *tweet* that are known
    emoticons.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with an 'Emoticon' column; defaults to the
               module-level ``emoticon_category`` frame.

    Removed the per-tweet debug print — it flooded stdout across ~14k tweets.
    NOTE(review): [0:-1] drops the lexicon's last row, as in the original.
    """
    if lexicon is None:
        lexicon = emoticon_category
    known = set(lexicon['Emoticon'][0:-1].values)  # O(1) membership per token
    return [token for token in tweet.split() if token in known]
#again iterate over the tweets and get the emoticons
def get_emoticon_list(tweets):
    """Collect the emoticons used in each tweet and store them in a new
    'Emoticons' column on *tweets* (mutated in place).

    tweets -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet emoticon lists.

    Renamed from the original ``get_emoji_list``, which silently clobbered
    the earlier function of the same name; it also ignored its parameter in
    favour of the module-level ``data_processed``.
    """
    emoticon_list = [get_emoticon(tweet) for tweet in tweets['Tweet']]
    tweets['Emoticons'] = emoticon_list
    # tweets.to_csv('./files/data_processed.csv')
    return emoticon_list
get_emoticon_list(data_processed)
# check for emotion word and replace with emotion category | |
def replace_emoticon(tweet, lexicon=None):
    """Map each known emoticon in *tweet* to its category.

    tweet   -- a whitespace-separated string (one tweet).
    lexicon -- optional DataFrame with 'Emoticon' and 'Emoticon_Category'
               columns; defaults to the module-level ``emoticon_category``.
    Returns the list of categories, one per matched emoticon, in order of
    appearance.

    Fixed: ``tweet.replace(emoticon, emoticon)`` was a no-op self-replace —
    the original token was overwritten by its category before the call.
    """
    if lexicon is None:
        lexicon = emoticon_category
    # NOTE(review): last lexicon row skipped, mirroring the original slice.
    known = set(lexicon['Emoticon'][0:-1].values)
    categories = []
    for token in tweet.split():
        if token in known:
            category = lexicon[lexicon['Emoticon'] == token]['Emoticon_Category'].values[0]
            categories.append(category)
            # Substitute the category into the tweet text (local effect only).
            tweet = tweet.replace(token, category)
    return categories
# iterate over the emotional word and replace with emotion category | |
def replace_emoticon_list(data_processed):
    """Replace each tweet's emoticons with their categories, store the result
    in a new 'Emoticon_Category' column, and persist the full DataFrame to
    './files/xenophobic_data_processed.csv'.

    data_processed -- DataFrame with a 'Tweet' column of strings.
    Returns the list of per-tweet category lists.

    Removed the trailing debug print of the full result list; the CSV write
    is kept because it is the script's final output.
    """
    emoticon_list = [replace_emoticon(tweet) for tweet in data_processed['Tweet']]
    data_processed['Emoticon_Category'] = emoticon_list
    data_processed.to_csv('./files/xenophobic_data_processed.csv')
    return emoticon_list
replace_emoticon_list(data_processed)
# lastly I have to keep the extracts in the order they appear in each tweet.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment