Skip to content

Instantly share code, notes, and snippets.

@aipi
Created October 27, 2019 17:06
Show Gist options
  • Save aipi/de03e97e03710f43c660f7e08a5e4de6 to your computer and use it in GitHub Desktop.
Save aipi/de03e97e03710f43c660f7e08a5e4de6 to your computer and use it in GitHub Desktop.
import csv
import math
import random
from collections import Counter
class Dataset:
    """Build a token-level training set from a reviews CSV file.

    Every sample is a ``(word, label)`` tuple: the word is one
    whitespace-separated token of column index 2 of a CSV row, and the
    label is the value of that row's last column.
    """

    def remove_stop_words(self, text):
        """Return *text* with every comma and period removed.

        Despite the name, no stop-word list is applied; only these two
        punctuation characters are stripped.
        """
        return text.replace(',', '').replace('.', '')

    def get_portuguese_text(self, row, dataset):
        """Append one ``(word, label)`` tuple per token of *row* to *dataset*.

        The text is taken from ``row[2]``, lower-cased, stripped of commas
        and periods, and split on whitespace. The label is ``row[-1]``.

        Args:
            row: A sequence with at least three items, as produced by
                ``csv.reader``.
            dataset: The list to extend in place.

        Returns:
            The same *dataset* list that was passed in.
        """
        label = row[-1]
        words = self.remove_stop_words(row[2].lower()).split()
        dataset.extend((word, label) for word in words)
        return dataset

    def execute(self, path='imdb-reviews-pt-br.csv', sample_size=100,
                has_header=True):
        """Read the CSV at *path* and tokenize a random sample of its rows.

        Args:
            path: Location of the CSV file.
            sample_size: Maximum number of rows to draw at random. Fewer
                rows are used when the file is smaller.
            has_header: When true, the first line is discarded before
                sampling. NOTE(review): the file is assumed to start with
                a column-name row -- confirm against the real data.

        Returns:
            A list of ``(word, label)`` tuples built from the sampled rows;
            empty when the file holds no data rows.
        """
        # NOTE(review): UTF-8 is assumed for the Portuguese text; an
        # explicit encoding avoids depending on the platform default.
        with open(path, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(
                csvfile,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL
            )
            if has_header:
                # Drop the header before sampling. Removing the first
                # collected token afterwards would delete an arbitrary
                # real sample once the rows have been randomized.
                next(reader, None)
            # csv.reader yields [] for blank lines; those carry no data.
            rows = [row for row in reader if row]

        sampled_rows = random.sample(rows, min(sample_size, len(rows)))

        dataset = []
        for row in sampled_rows:
            self.get_portuguese_text(row, dataset)
        return dataset
class NaiveBaivys:
    """Naive Bayes sentiment classifier over ``(word, label)`` samples.

    The class name keeps its original spelling so existing callers keep
    working. Labels are compared against the strings ``'neg'`` and
    ``'pos'``.

    Attributes are set by ``__init__``:
        dataset: the list of samples; item 0 of each sample is the word
            and item 1 is its sentiment label.
        text: the input text split on whitespace into a list of words.
    """

    def __init__(self, dataset: list, text: str):
        """Store the training samples and tokenize the text to classify."""
        self.dataset = dataset
        self.text = text.split()

    @staticmethod
    def _smoothed_likelihood(word_count, sentiment_total, vocabulary_size):
        """Return the add-one estimate (count + 1) / (total + vocabulary)."""
        return (word_count + 1) / (sentiment_total + vocabulary_size)

    def calculate_total_word_in_sentiment(self, word, sentiment: str):
        """Count the samples whose word is *word* and label is *sentiment*."""
        return sum(
            1 for data in self.dataset
            if data[0] == word and data[1] == sentiment
        )

    def calculate_total_terms_in_sentiment(self, sentiment: str):
        """Count the samples labelled *sentiment*."""
        return sum(1 for data in self.dataset if data[1] == sentiment)

    def calculate_total_terms_in_text(self):
        """Return the number of distinct words in the dataset."""
        # A set gives constant-time de-duplication instead of scanning a
        # growing list for every sample.
        return len({data[0] for data in self.dataset})

    def laplace_smoothing(self, word: str, sentiment: str):
        """Return the add-one smoothed likelihood of *word* given *sentiment*.

        Raises:
            ZeroDivisionError: if the dataset is empty.
        """
        return self._smoothed_likelihood(
            self.calculate_total_word_in_sentiment(word, sentiment),
            self.calculate_total_terms_in_sentiment(sentiment),
            self.calculate_total_terms_in_text(),
        )

    def sentiment_probability(self, sentiment: str):
        """Return the fraction of samples labelled *sentiment*.

        Raises:
            ZeroDivisionError: if the dataset is empty.
        """
        return (
            self.calculate_total_terms_in_sentiment(sentiment) /
            len(self.dataset)
        )

    def execute(self):
        """Classify ``self.text`` and return ``'neg'`` or ``'pos'``.

        Scores are accumulated as sums of logarithms. Multiplying the raw
        probabilities underflows to 0.0 on long texts, which made every
        long input come out as ``'pos'``. Ties still resolve to ``'pos'``.

        Raises:
            ZeroDivisionError: if the dataset is empty.
        """
        # Count everything once per call instead of rescanning the whole
        # dataset for every word of the text.
        pair_counts = Counter((data[0], data[1]) for data in self.dataset)
        label_totals = Counter(data[1] for data in self.dataset)
        vocabulary_size = self.calculate_total_terms_in_text()
        sample_total = len(self.dataset)

        scores = {}
        for sentiment in ('neg', 'pos'):
            prior = label_totals[sentiment] / sample_total
            if prior == 0:
                # log(0) is undefined; a label that never occurs cannot
                # win, which matches a raw product of zero.
                scores[sentiment] = float('-inf')
                continue
            score = math.log(prior)
            for word in self.text:
                score += math.log(self._smoothed_likelihood(
                    pair_counts[(word, sentiment)],
                    label_totals[sentiment],
                    vocabulary_size,
                ))
            scores[sentiment] = score

        return 'neg' if scores['neg'] > scores['pos'] else 'pos'
if __name__ == '__main__':
    # Build the training samples from the CSV, classify an empty text
    # against them, and print the predicted label.
    training_samples = Dataset().execute()
    classifier = NaiveBaivys(dataset=training_samples, text='')
    print(classifier.execute())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment