Skip to content

Instantly share code, notes, and snippets.

@Studentenfutter
Forked from PolMine/sentiws.R
Last active March 5, 2022 13:59
Show Gist options
  • Save Studentenfutter/b03a5f03bcee00f721287f627eb95c2c to your computer and use it in GitHub Desktop.
Save Studentenfutter/b03a5f03bcee00f721287f627eb95c2c to your computer and use it in GitHub Desktop.
Import SentiWS dictionary for sentiment analysis into R as data.table
# The get_sentiws function will download the zip-file with the SentiWS dictionary,
# unzip it and return a data.table.
library(data.table)
# Download the SentiWS sentiment dictionary and return it as one data.table.
#
# The zip archive is downloaded into a "sentiws" sub-directory of the session's
# tempdir(), unpacked there, and the positive and negative word lists are
# parsed and stacked.
#
# Arguments:
#   sentiws_url  URL of the SentiWS zip archive. Defaults to the location the
#                original script used; pass another URL if the archive moved.
#
# Returns a data.table with one row per word form and the columns:
#   id      row number of the entry within its source file
#   word    the word form (the lemma itself or one of its inflections)
#   base    TRUE for the lemma row, FALSE for the listed inflected forms
#   lemma   the lemma the word form belongs to
#   pos     the upper-case tag that follows the "|" in the entry
#   weight  the numeric sentiment weight of the entry
#
# Stops with an error if the download reports a non-zero status or if the
# expected text files are not present after unzipping.
get_sentiws <- function(
    sentiws_url = "http://pcai056.informatik.uni-leipzig.de/downloads/etc/SentiWS/SentiWS_v2.0.zip") {
  sentiws_tmp_dir <- file.path(tempdir(), "sentiws")
  if (!dir.exists(sentiws_tmp_dir)) {
    dir.create(sentiws_tmp_dir, recursive = TRUE)
  }
  sentiws_zipfile <- file.path(sentiws_tmp_dir, "SentiWS_v2.0c.zip")

  # mode = "wb": the archive is binary and must not be written in text mode.
  status <- download.file(
    url = sentiws_url,
    destfile = sentiws_zipfile,
    mode = "wb"
  )
  if (status != 0L) {
    stop("Download of ", sentiws_url, " failed (status ", status, ").",
         call. = FALSE)
  }
  unzip(zipfile = sentiws_zipfile, exdir = sentiws_tmp_dir)

  sentiws_files <- c(
    positive = "SentiWS_v2.0_Positive.txt",
    negative = "SentiWS_v2.0_Negative.txt"
  )
  not_found <- sentiws_files[
    !file.exists(file.path(sentiws_tmp_dir, sentiws_files))
  ]
  if (length(not_found) > 0L) {
    stop("Missing after unzipping: ", paste(not_found, collapse = ", "),
         call. = FALSE)
  }

  # Expand one dictionary entry into its word forms. `entry` is the text after
  # the "|": an upper-case tag, whitespace, the weight, and optionally a
  # comma-separated list of inflected forms.
  unfold_entry <- function(lemma, entry) {
    pos <- gsub("^([A-Z]+)\\s+.*$", "\\1", entry)
    weight <- as.numeric(
      gsub("^[A-Z]+\\s+(-?\\d\\.\\d+).*$", "\\1", entry)
    )
    forms <- gsub("^[A-Z]+\\s+-?\\d\\.\\d+\\s*(.*?)\\s*$", "\\1", entry)
    forms <- if (grepl("^\\s*$", forms)) {
      character(0)
    } else {
      strsplit(x = forms, split = ",")[[1]]
    }
    # Length-1 elements are recycled by data.table to the length of `word`.
    list(
      word = c(lemma, forms),
      base = c(TRUE, rep(FALSE, times = length(forms))),
      lemma = lemma,
      pos = pos,
      weight = weight
    )
  }

  dts <- lapply(
    sentiws_files,
    function(filename) {
      # header = FALSE: the files carry no header row, and fread's automatic
      # header detection takes an all-character first line as column names,
      # which would drop the first dictionary entry.
      dt <- fread(
        file.path(sentiws_tmp_dir, filename),
        sep = "|",
        header = FALSE,
        col.names = c("word", "data"),
        encoding = "UTF-8"
      )
      # seq_len(.N) stays empty for an empty table, unlike 1L:nrow(dt).
      dt[, "id" := seq_len(.N)]
      dt[, unfold_entry(word[1L], data[1L]), by = "id"]
    }
  )
  rbindlist(dts)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment