@Vini2
Last active March 11, 2020 07:24
Multinomial NB Classification of Tweets
# NLP imports
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# pandas
import pandas as pd
# python imports
import re
import json
from collections import Counter
from matplotlib import pyplot as plt
# seaborn import
import seaborn as sns
# Preprocessing resources and helper instances
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
tf = TfidfVectorizer()
Y = []
sentences = []
sentences_processed = []
sentences_processed_tw = []
# Reading data and creating sentences from data source
with open("./data/processed_stars_all") as data:
    all_data = data.read().strip().split("\n")
for n, line in enumerate(all_data):
    line_data = line.split(" ")[0:-1]
    sentence = ""
    for words_count in line_data:
        words = words_count.split(":")[0].split("_")
        count = int(words_count.split(":")[1])
        words = list(filter("<num>".__ne__, words))
        words = list(map(lambda x: re.sub(r'\`|\'|,|\.|\"', '', x), words))
        if len(words) == 0:
            continue
        words = words * count
        sentence += " ".join(words) + " "
    sentences.append(sentence)
    Y.append(float(line.split(" ")[-1].split(":")[-1]))
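# Assumed input format (inferred from the parsing above): each line of
# processed_stars_all is expected to hold space-separated "token:count" pairs,
# where a token may be an underscore-joined n-gram (e.g. "not_good:2"),
# followed by a final "#label#:rating" field whose value after the last ":"
# becomes the sentiment score appended to Y.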
# Reading twitter data
with open("data.json") as twd:
    data = twd.read()
tweets = json.loads(data)
for tw in tweets:
    text = tw['full_text']
    # remove RT markers, mentions, hashtags, numbers and punctuation
    text = re.sub(r'RT|@[a-zA-Z0-9_]+|:|#[a-zA-Z0-9_]+|[0-9]+|,|\'|\"|\`|…', '', text)
    # collapse repeated whitespace into single spaces
    text = re.sub(r'\s+', ' ', text.strip())
    sentences_processed_tw.append(text)
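# Assumed structure of data.json (based on the 'full_text' lookup above): a JSON
# array of tweet objects, each carrying a 'full_text' field, e.g. as returned by
# the Twitter API when tweets are fetched in extended mode.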
# At this point we have sentences
for sentence in sentences:
    # Obtain word tokens
    tokenized_words = word_tokenize(sentence)
    # Remove stop words
    filtered_words = list(filter(lambda x: x not in stop_words, tokenized_words))
    # Stem words (lemmatization is better but requires POS tagging)
    stemmed_words = list(map(ps.stem, filtered_words))
    sentences_processed.append(" ".join(stemmed_words))
# Create a dataframe for downstream analysis
data = pd.DataFrame(data={'sentence': sentences_processed, 'sentiment': Y})
# Obtain TF-IDF transformation
text_tf = tf.fit_transform(data['sentence'])
# Split data into Test-Train sets
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, data['sentiment'], test_size=0.001, random_state=1)
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
print("NB Model Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Analysing the twitter data")
# Transform the tweets with the TF-IDF vectorizer fitted on the training corpus
text_tf_tw = tf.transform(sentences_processed_tw)
predicted_tw = clf.predict(text_tf_tw)
print(Counter(predicted_tw))
sns.distplot(predicted_tw, kde=False, norm_hist=False)
plt.savefig("hist.png", dpi=300)
plt.figure()
sns.distplot(predicted_tw)
plt.savefig("hist_kde.png", dpi=300)
plt.show()
@karikalan001

Hi ma'am,
I can't understand this line ( with open("./data/processed_stars_all") as data: ).

Please tell me what you have done with the (http://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_stars.tar.gz) dataset.

That was a nice article (A simple sentiment analysis of 2019 novel coronavirus outbreak using Twitter),
ma'am, and it is very useful for me.
