# aipi/naive-baivys.py

## naive-baivys.py
import csv
import random


class Dataset:
    """Build a bag of ``(word, label)`` pairs from a sample of a review CSV.

    Each sampled row contributes one pair per whitespace-separated word of
    its third column (lower-cased, commas and periods stripped), labelled
    with the value of the row's last column.

    Args:
        path: CSV file to read. Defaults to the file name that used to be
            hard-coded in ``execute``.
        sample_size: Maximum number of rows drawn at random.
        has_header: Whether the first CSV row is a header to discard.
    """

    def __init__(self, path='imdb-reviews-pt-br.csv', sample_size=100,
                 has_header=True):
        self.path = path
        self.sample_size = sample_size
        self.has_header = has_header

    def remove_stop_words(self, text):
        """Return *text* with every comma and period removed.

        Despite the name, no stop-word list is applied; only these two
        punctuation characters are stripped.
        """
        return text.replace(',', '').replace('.', '')

    def get_portuguese_text(self, row, dataset):
        """Append one ``(word, label)`` pair per word of ``row[2]``.

        ``row[2]`` is lower-cased, stripped of commas/periods and split on
        whitespace; ``row[-1]`` is used as the label. *dataset* is mutated
        in place and also returned.
        """
        label = row[-1]
        words = self.remove_stop_words(row[2].lower()).split()
        dataset.extend((word, label) for word in words)
        return dataset

    def execute(self):
        """Read the CSV and return word pairs for a random sample of rows.

        Returns:
            A list of ``(word, label)`` tuples built from up to
            ``sample_size`` randomly chosen data rows. Empty when the file
            holds no usable rows.

        Raises:
            OSError: If the file cannot be opened.
        """
        # Explicit encoding so the result does not depend on the platform's
        # locale; assumes the file is UTF-8 -- TODO confirm.
        with open(self.path, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(
                csvfile,
                delimiter=',',
                quotechar='"',
                quoting=csv.QUOTE_MINIMAL,
            )
            if self.has_header:
                # Drop the header before shuffling. The previous code removed
                # dataset[0] after shuffling, which deleted an arbitrary word
                # pair and left the header row eligible for sampling.
                next(reader, None)
            # Blank lines parse as [] and would break the row[2] lookup.
            rows = [row for row in reader if len(row) >= 3]

        random.shuffle(rows)
        dataset = []
        for row in rows[:self.sample_size]:
            self.get_portuguese_text(row, dataset)
        return dataset


class NaiveBaivys:
    """Naive Bayes classifier choosing between the labels 'neg' and 'pos'.

    The model is estimated from ``dataset``, a list of ``(word, label)``
    pairs, using add-one (Laplace) smoothing for the word likelihoods.

    Args:
        dataset: Training pairs; element 0 is the word, element 1 the label.
        text: Sentence to classify.
    """

    def __init__(self, dataset: list, text: str):
        self.dataset = dataset
        # Normalise the query the same way Dataset normalises training words
        # (lower-case, commas and periods stripped); otherwise "Bom," could
        # never match the stored token "bom".
        self.text = text.lower().replace(',', '').replace('.', '').split()

    def calculate_total_word_in_sentiment(self, word, sentiment: str):
        """Count the pairs whose word is *word* and whose label is *sentiment*."""
        return sum(
            1 for data in self.dataset
            if data[0] == word and data[1] == sentiment
        )

    def calculate_total_terms_in_sentiment(self, sentiment: str):
        """Count every pair labelled *sentiment* (duplicates included)."""
        return sum(1 for data in self.dataset if data[1] == sentiment)

    def calculate_total_terms_in_text(self):
        """Return the number of distinct words in the dataset."""
        # A set gives the same count as the old list-membership scan without
        # its quadratic cost.
        return len({data[0] for data in self.dataset})

    def laplace_smoothing(self, word: str, sentiment: str):
        """Return the add-one smoothed likelihood of *word* given *sentiment*.

        Computed as ``(count(word, sentiment) + 1) /
        (pairs labelled sentiment + distinct words)``.

        Raises:
            ZeroDivisionError: If the dataset is empty.
        """
        word_count = self.calculate_total_word_in_sentiment(word, sentiment)
        class_total = self.calculate_total_terms_in_sentiment(sentiment)
        vocabulary_size = self.calculate_total_terms_in_text()
        return (word_count + 1) / (class_total + vocabulary_size)

    def sentiment_probability(self, sentiment: str):
        """Return the fraction of dataset pairs labelled *sentiment*.

        Raises:
            ZeroDivisionError: If the dataset is empty.
        """
        return (
            self.calculate_total_terms_in_sentiment(sentiment)
            / len(self.dataset)
        )

    def execute(self):
        """Return 'neg' or 'pos', whichever has the higher posterior score.

        Scores are accumulated as sums of logarithms rather than products of
        probabilities, so long texts no longer underflow to 0.0 for both
        labels and collapse into a tie. Ties resolve to 'pos'.

        Raises:
            ZeroDivisionError: If the dataset is empty.
        """
        import math  # local import keeps this block self-contained

        scores = {}
        for sentiment in ('neg', 'pos'):
            prior = self.sentiment_probability(sentiment)
            if prior == 0:
                # log(0) is undefined; a label with no training pairs gets
                # the lowest possible score instead.
                scores[sentiment] = float('-inf')
                continue
            score = math.log(prior)
            for word in self.text:
                score += math.log(
                    self.laplace_smoothing(word=word, sentiment=sentiment)
                )
            scores[sentiment] = score

        return 'neg' if scores['neg'] > scores['pos'] else 'pos'


if __name__ == '__main__':
    # Script entry point: sample the CSV into training pairs, classify the
    # (currently empty) text and print the winning label.
    training_pairs = Dataset().execute()
    classifier = NaiveBaivys(text='', dataset=training_pairs)
    predicted_label = classifier.execute()
    print(predicted_label)
	import csv
	import random


	class Dataset:
	    """Build a bag of ``(word, label)`` pairs from a sample of a review CSV.

	    Each sampled row contributes one pair per whitespace-separated word of
	    its third column (lower-cased, commas and periods stripped), labelled
	    with the value of the row's last column.

	    Args:
	        path: CSV file to read. Defaults to the file name that used to be
	            hard-coded in ``execute``.
	        sample_size: Maximum number of rows drawn at random.
	        has_header: Whether the first CSV row is a header to discard.
	    """

	    def __init__(self, path='imdb-reviews-pt-br.csv', sample_size=100,
	                 has_header=True):
	        self.path = path
	        self.sample_size = sample_size
	        self.has_header = has_header

	    def remove_stop_words(self, text):
	        """Return *text* with every comma and period removed.

	        Despite the name, no stop-word list is applied; only these two
	        punctuation characters are stripped.
	        """
	        return text.replace(',', '').replace('.', '')

	    def get_portuguese_text(self, row, dataset):
	        """Append one ``(word, label)`` pair per word of ``row[2]``.

	        ``row[2]`` is lower-cased, stripped of commas/periods and split on
	        whitespace; ``row[-1]`` is used as the label. *dataset* is mutated
	        in place and also returned.
	        """
	        label = row[-1]
	        words = self.remove_stop_words(row[2].lower()).split()
	        dataset.extend((word, label) for word in words)
	        return dataset

	    def execute(self):
	        """Read the CSV and return word pairs for a random sample of rows.

	        Returns:
	            A list of ``(word, label)`` tuples built from up to
	            ``sample_size`` randomly chosen data rows. Empty when the file
	            holds no usable rows.

	        Raises:
	            OSError: If the file cannot be opened.
	        """
	        # Explicit encoding so the result does not depend on the platform's
	        # locale; assumes the file is UTF-8 -- TODO confirm.
	        with open(self.path, newline='', encoding='utf-8') as csvfile:
	            reader = csv.reader(
	                csvfile,
	                delimiter=',',
	                quotechar='"',
	                quoting=csv.QUOTE_MINIMAL,
	            )
	            if self.has_header:
	                # Drop the header before shuffling. The previous code removed
	                # dataset[0] after shuffling, which deleted an arbitrary word
	                # pair and left the header row eligible for sampling.
	                next(reader, None)
	            # Blank lines parse as [] and would break the row[2] lookup.
	            rows = [row for row in reader if len(row) >= 3]

	        random.shuffle(rows)
	        dataset = []
	        for row in rows[:self.sample_size]:
	            self.get_portuguese_text(row, dataset)
	        return dataset


	class NaiveBaivys:
	    """Naive Bayes classifier choosing between the labels 'neg' and 'pos'.

	    The model is estimated from ``dataset``, a list of ``(word, label)``
	    pairs, using add-one (Laplace) smoothing for the word likelihoods.

	    Args:
	        dataset: Training pairs; element 0 is the word, element 1 the label.
	        text: Sentence to classify.
	    """

	    def __init__(self, dataset: list, text: str):
	        self.dataset = dataset
	        # Normalise the query the same way Dataset normalises training words
	        # (lower-case, commas and periods stripped); otherwise "Bom," could
	        # never match the stored token "bom".
	        self.text = text.lower().replace(',', '').replace('.', '').split()

	    def calculate_total_word_in_sentiment(self, word, sentiment: str):
	        """Count the pairs whose word is *word* and whose label is *sentiment*."""
	        return sum(
	            1 for data in self.dataset
	            if data[0] == word and data[1] == sentiment
	        )

	    def calculate_total_terms_in_sentiment(self, sentiment: str):
	        """Count every pair labelled *sentiment* (duplicates included)."""
	        return sum(1 for data in self.dataset if data[1] == sentiment)

	    def calculate_total_terms_in_text(self):
	        """Return the number of distinct words in the dataset."""
	        # A set gives the same count as the old list-membership scan without
	        # its quadratic cost.
	        return len({data[0] for data in self.dataset})

	    def laplace_smoothing(self, word: str, sentiment: str):
	        """Return the add-one smoothed likelihood of *word* given *sentiment*.

	        Computed as ``(count(word, sentiment) + 1) /
	        (pairs labelled sentiment + distinct words)``.

	        Raises:
	            ZeroDivisionError: If the dataset is empty.
	        """
	        word_count = self.calculate_total_word_in_sentiment(word, sentiment)
	        class_total = self.calculate_total_terms_in_sentiment(sentiment)
	        vocabulary_size = self.calculate_total_terms_in_text()
	        return (word_count + 1) / (class_total + vocabulary_size)

	    def sentiment_probability(self, sentiment: str):
	        """Return the fraction of dataset pairs labelled *sentiment*.

	        Raises:
	            ZeroDivisionError: If the dataset is empty.
	        """
	        return (
	            self.calculate_total_terms_in_sentiment(sentiment)
	            / len(self.dataset)
	        )

	    def execute(self):
	        """Return 'neg' or 'pos', whichever has the higher posterior score.

	        Scores are accumulated as sums of logarithms rather than products of
	        probabilities, so long texts no longer underflow to 0.0 for both
	        labels and collapse into a tie. Ties resolve to 'pos'.

	        Raises:
	            ZeroDivisionError: If the dataset is empty.
	        """
	        import math  # local import keeps this block self-contained

	        scores = {}
	        for sentiment in ('neg', 'pos'):
	            prior = self.sentiment_probability(sentiment)
	            if prior == 0:
	                # log(0) is undefined; a label with no training pairs gets
	                # the lowest possible score instead.
	                scores[sentiment] = float('-inf')
	                continue
	            score = math.log(prior)
	            for word in self.text:
	                score += math.log(
	                    self.laplace_smoothing(word=word, sentiment=sentiment)
	                )
	            scores[sentiment] = score

	        return 'neg' if scores['neg'] > scores['pos'] else 'pos'


	if __name__ == '__main__':
	    # Script entry point: sample the CSV into training pairs, classify the
	    # (currently empty) text and print the winning label.
	    dataset = Dataset().execute()
	    naive_baivys_sport = NaiveBaivys(
	        dataset=dataset,
	        text='',
	    )
	    print(naive_baivys_sport.execute())