MCodrescu/sentiment_analysis.R

## sentiment_analysis.R
# Download Data Zip
download.file(
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    destfile = "data.tar.gz"
)

# Unzip
untar("data.tar.gz")

# Get Files in Directories
pos_train_files <- list.files("aclimdb/train/pos", full.names = TRUE)
neg_train_files <- list.files("aclimdb/train/neg", full.names = TRUE)
pos_test_files <- list.files("aclimdb/test/pos", full.names = TRUE)
neg_test_files <- list.files("aclimdb/test/neg", full.names = TRUE)


# Convert a Directory of Files to a List
create_dataset_from_files <- function(files, label){
    lapply(files, function(x){

        list(
            review = tolower(
               gsub(
                   "[[:punct:]]",
                   " ",
                   readLines(x, warn = FALSE)
                )
            ),
            label = label
        )

    })
}


# Create Train and Test Data
train_data <- append(
    create_dataset_from_files(pos_train_files, label = 1),
    create_dataset_from_files(neg_train_files, label = 0)
)

test_data <- append(
    create_dataset_from_files(pos_test_files, label = 1),
    create_dataset_from_files(neg_test_files, label = 0)
)

# Get Unique Grams
all_reviews <- sapply(
    train_data,
    function(x) x$review
)
unique_grams <- unique(
    unlist(
        strsplit(all_reviews, " ")
    )
)

# Function to Featurize a Review
featurize <- function(review){
    review |>
        strsplit(" ") |>
        unlist() |>
        match(unique_grams) |>
        na.omit() |>
        unique()
}

# A couple of tests
sample(unique_grams, 10)
c("good", "bad", "ugly") %in% unique_grams
featurize("the plot grows thin soon and you find yourself praying for a quick resolution")

# Featurize the Training Data
train_data_features <- lapply(
    train_data,
    function(x) featurize(x$review)
)

# Featurize the Test Data
test_data_features <- lapply(
    test_data,
    function(x) featurize(x$review)
)

# Perceptron Function
perceptron <- function(weights, idx){
    ifelse(sum(weights[idx]) > 0, 1, 0)
}

# A few tests
perceptron(c(0.5, -0.5, -0.5), idx = c(1))
perceptron(c(0.5, -0.5, -0.5), idx = c(2, 3))

# Set Initial Values
weights <- numeric(length(unique_grams))
lr <- 0.01
n <- length(train_data_features)
n_epochs <- seq_len(100)

# Iterate Over Training Data
for (epoch in n_epochs){

    # Shuffle
    set.seed(100)
    rows_shuffled <- sample(seq_len(n), n)

    # Log Accuracy
    epoch_accuracy <- numeric(n)

    for (i in rows_shuffled){

        # Predict with Perceptron
        label <- train_data[[i]]$label
        idx <- train_data_features[[i]]
        output <- perceptron(weights, idx)

        # Update Weights
        if (output != label){
            weights[idx] <- weights[idx] + lr * (label - output)
        }

        # Log Accuracy
        epoch_accuracy[i] <- label == output
    }

    print(mean(epoch_accuracy))

}

# Make Predictions
predictions <- sapply(test_data_features, function(x) perceptron(weights, x))

# Get True Labels
test_labels <- sapply(test_data, function(x) x$label)

# Calculate Accuracy
print(mean(predictions == test_labels))

# Test
test1 <- "I recommend you never go see this movie it was awful"
test2 <- "All around very good! I was thrilled."

idx1 <- featurize(test1)
idx2 <- featurize(test2)

perceptron(weights, idx1)
perceptron(weights, idx2)
	# Download Data Zip
	download.file(
	url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
	destfile = "data.tar.gz"
	)

	# Unzip
	untar("data.tar.gz")

	# Get Files in Directories
	pos_train_files <- list.files("aclimdb/train/pos", full.names = TRUE)
	neg_train_files <- list.files("aclimdb/train/neg", full.names = TRUE)
	pos_test_files <- list.files("aclimdb/test/pos", full.names = TRUE)
	neg_test_files <- list.files("aclimdb/test/neg", full.names = TRUE)


	# Convert a Directory of Files to a List
	create_dataset_from_files <- function(files, label){
	lapply(files, function(x){

	list(
	review = tolower(
	gsub(
	"[[:punct:]]",
	" ",
	readLines(x, warn = FALSE)
	)
	),
	label = label
	)

	})
	}


	# Create Train and Test Data
	train_data <- append(
	create_dataset_from_files(pos_train_files, label = 1),
	create_dataset_from_files(neg_train_files, label = 0)
	)

	test_data <- append(
	create_dataset_from_files(pos_test_files, label = 1),
	create_dataset_from_files(neg_test_files, label = 0)
	)

	# Get Unique Grams
	all_reviews <- sapply(
	train_data,
	function(x) x$review
	)
	unique_grams <- unique(
	unlist(
	strsplit(all_reviews, " ")
	)
	)

	# Function to Featurize a Review
	featurize <- function(review){
	review \|>
	strsplit(" ") \|>
	unlist() \|>
	match(unique_grams) \|>
	na.omit() \|>
	unique()
	}

	# A couple of tests
	sample(unique_grams, 10)
	c("good", "bad", "ugly") %in% unique_grams
	featurize("the plot grows thin soon and you find yourself praying for a quick resolution")

	# Featurize the Training Data
	train_data_features <- lapply(
	train_data,
	function(x) featurize(x$review)
	)

	# Featurize the Test Data
	test_data_features <- lapply(
	test_data,
	function(x) featurize(x$review)
	)

	# Perceptron Function
	perceptron <- function(weights, idx){
	ifelse(sum(weights[idx]) > 0, 1, 0)
	}

	# A few tests
	perceptron(c(0.5, -0.5, -0.5), idx = c(1))
	perceptron(c(0.5, -0.5, -0.5), idx = c(2, 3))

	# Set Initial Values
	weights <- numeric(length(unique_grams))
	lr <- 0.01
	n <- length(train_data_features)
	n_epochs <- seq_len(100)

	# Iterate Over Training Data
	for (epoch in n_epochs){

	# Shuffle
	set.seed(100)
	rows_shuffled <- sample(seq_len(n), n)

	# Log Accuracy
	epoch_accuracy <- numeric(n)

	for (i in rows_shuffled){

	# Predict with Perceptron
	label <- train_data[[i]]$label
	idx <- train_data_features[[i]]
	output <- perceptron(weights, idx)

	# Update Weights
	if (output != label){
	weights[idx] <- weights[idx] + lr * (label - output)
	}

	# Log Accuracy
	epoch_accuracy[i] <- label == output
	}

	print(mean(epoch_accuracy))

	}

	# Make Predictions
	predictions <- sapply(test_data_features, function(x) perceptron(weights, x))

	# Get True Labels
	test_labels <- sapply(test_data, function(x) x$label)

	# Calculate Accuracy
	print(mean(predictions == test_labels))

	# Test
	test1 <- "I recommend you never go see this movie it was awful"
	test2 <- "All around very good! I was thrilled."

	idx1 <- featurize(test1)
	idx2 <- featurize(test2)

	perceptron(weights, idx1)
	perceptron(weights, idx2)