Created
September 8, 2023 18:59
-
-
Save MCodrescu/7a62f3bd4687400900f81f2face4a95c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download Data Zip | |
download.file( | |
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", | |
destfile = "data.tar.gz" | |
) | |
# Unzip | |
untar("data.tar.gz") | |
# Get Files in Directories | |
pos_train_files <- list.files("aclimdb/train/pos", full.names = TRUE) | |
neg_train_files <- list.files("aclimdb/train/neg", full.names = TRUE) | |
pos_test_files <- list.files("aclimdb/test/pos", full.names = TRUE) | |
neg_test_files <- list.files("aclimdb/test/neg", full.names = TRUE) | |
# Convert a Directory of Files to a List | |
create_dataset_from_files <- function(files, label){ | |
lapply(files, function(x){ | |
list( | |
review = tolower( | |
gsub( | |
"[[:punct:]]", | |
" ", | |
readLines(x, warn = FALSE) | |
) | |
), | |
label = label | |
) | |
}) | |
} | |
# Create Train and Test Data | |
train_data <- append( | |
create_dataset_from_files(pos_train_files, label = 1), | |
create_dataset_from_files(neg_train_files, label = 0) | |
) | |
test_data <- append( | |
create_dataset_from_files(pos_test_files, label = 1), | |
create_dataset_from_files(neg_test_files, label = 0) | |
) | |
# Get Unique Grams | |
all_reviews <- sapply( | |
train_data, | |
function(x) x$review | |
) | |
unique_grams <- unique( | |
unlist( | |
strsplit(all_reviews, " ") | |
) | |
) | |
# Function to Featurize a Review | |
featurize <- function(review){ | |
review |> | |
strsplit(" ") |> | |
unlist() |> | |
match(unique_grams) |> | |
na.omit() |> | |
unique() | |
} | |
# A couple of tests | |
sample(unique_grams, 10) | |
c("good", "bad", "ugly") %in% unique_grams | |
featurize("the plot grows thin soon and you find yourself praying for a quick resolution") | |
# Featurize the Training Data | |
train_data_features <- lapply( | |
train_data, | |
function(x) featurize(x$review) | |
) | |
# Featurize the Test Data | |
test_data_features <- lapply( | |
test_data, | |
function(x) featurize(x$review) | |
) | |
# Perceptron Function | |
perceptron <- function(weights, idx){ | |
ifelse(sum(weights[idx]) > 0, 1, 0) | |
} | |
# A few tests | |
perceptron(c(0.5, -0.5, -0.5), idx = c(1)) | |
perceptron(c(0.5, -0.5, -0.5), idx = c(2, 3)) | |
# Set Initial Values | |
weights <- numeric(length(unique_grams)) | |
lr <- 0.01 | |
n <- length(train_data_features) | |
n_epochs <- seq_len(100) | |
# Iterate Over Training Data | |
for (epoch in n_epochs){ | |
# Shuffle | |
set.seed(100) | |
rows_shuffled <- sample(seq_len(n), n) | |
# Log Accuracy | |
epoch_accuracy <- numeric(n) | |
for (i in rows_shuffled){ | |
# Predict with Perceptron | |
label <- train_data[[i]]$label | |
idx <- train_data_features[[i]] | |
output <- perceptron(weights, idx) | |
# Update Weights | |
if (output != label){ | |
weights[idx] <- weights[idx] + lr * (label - output) | |
} | |
# Log Accuracy | |
epoch_accuracy[i] <- label == output | |
} | |
print(mean(epoch_accuracy)) | |
} | |
# Make Predictions | |
predictions <- sapply(test_data_features, function(x) perceptron(weights, x)) | |
# Get True Labels | |
test_labels <- sapply(test_data, function(x) x$label) | |
# Calculate Accuracy | |
print(mean(predictions == test_labels)) | |
# Test | |
test1 <- "I recommend you never go see this movie it was awful" | |
test2 <- "All around very good! I was thrilled." | |
idx1 <- featurize(test1) | |
idx2 <- featurize(test2) | |
perceptron(weights, idx1) | |
perceptron(weights, idx2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment