Skip to content

Instantly share code, notes, and snippets.

@MCodrescu
Created September 8, 2023 18:59
Show Gist options
  • Save MCodrescu/7a62f3bd4687400900f81f2face4a95c to your computer and use it in GitHub Desktop.
Save MCodrescu/7a62f3bd4687400900f81f2face4a95c to your computer and use it in GitHub Desktop.
# Download Data Zip
download.file(
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
destfile = "data.tar.gz"
)
# Unzip
untar("data.tar.gz")
# Get Files in Directories
pos_train_files <- list.files("aclimdb/train/pos", full.names = TRUE)
neg_train_files <- list.files("aclimdb/train/neg", full.names = TRUE)
pos_test_files <- list.files("aclimdb/test/pos", full.names = TRUE)
neg_test_files <- list.files("aclimdb/test/neg", full.names = TRUE)
# Convert a Directory of Files to a List
create_dataset_from_files <- function(files, label){
lapply(files, function(x){
list(
review = tolower(
gsub(
"[[:punct:]]",
" ",
readLines(x, warn = FALSE)
)
),
label = label
)
})
}
# Create Train and Test Data
train_data <- append(
create_dataset_from_files(pos_train_files, label = 1),
create_dataset_from_files(neg_train_files, label = 0)
)
test_data <- append(
create_dataset_from_files(pos_test_files, label = 1),
create_dataset_from_files(neg_test_files, label = 0)
)
# Get Unique Grams
all_reviews <- sapply(
train_data,
function(x) x$review
)
unique_grams <- unique(
unlist(
strsplit(all_reviews, " ")
)
)
# Function to Featurize a Review
featurize <- function(review){
review |>
strsplit(" ") |>
unlist() |>
match(unique_grams) |>
na.omit() |>
unique()
}
# A couple of tests
sample(unique_grams, 10)
c("good", "bad", "ugly") %in% unique_grams
featurize("the plot grows thin soon and you find yourself praying for a quick resolution")
# Featurize the Training Data
train_data_features <- lapply(
train_data,
function(x) featurize(x$review)
)
# Featurize the Test Data
test_data_features <- lapply(
test_data,
function(x) featurize(x$review)
)
# Perceptron Function
perceptron <- function(weights, idx){
ifelse(sum(weights[idx]) > 0, 1, 0)
}
# A few tests
perceptron(c(0.5, -0.5, -0.5), idx = c(1))
perceptron(c(0.5, -0.5, -0.5), idx = c(2, 3))
# Set Initial Values
weights <- numeric(length(unique_grams))
lr <- 0.01
n <- length(train_data_features)
n_epochs <- seq_len(100)
# Iterate Over Training Data
for (epoch in n_epochs){
# Shuffle
set.seed(100)
rows_shuffled <- sample(seq_len(n), n)
# Log Accuracy
epoch_accuracy <- numeric(n)
for (i in rows_shuffled){
# Predict with Perceptron
label <- train_data[[i]]$label
idx <- train_data_features[[i]]
output <- perceptron(weights, idx)
# Update Weights
if (output != label){
weights[idx] <- weights[idx] + lr * (label - output)
}
# Log Accuracy
epoch_accuracy[i] <- label == output
}
print(mean(epoch_accuracy))
}
# Make Predictions
predictions <- sapply(test_data_features, function(x) perceptron(weights, x))
# Get True Labels
test_labels <- sapply(test_data, function(x) x$label)
# Calculate Accuracy
print(mean(predictions == test_labels))
# Test
test1 <- "I recommend you never go see this movie it was awful"
test2 <- "All around very good! I was thrilled."
idx1 <- featurize(test1)
idx2 <- featurize(test2)
perceptron(weights, idx1)
perceptron(weights, idx2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment