Skip to content

Instantly share code, notes, and snippets.

@PolMine
Last active May 30, 2022 12:37
Show Gist options
  • Save PolMine/70eeb095328070c18bd00ee087272adf to your computer and use it in GitHub Desktop.
Save PolMine/70eeb095328070c18bd00ee087272adf to your computer and use it in GitHub Desktop.
Import the SentiWS dictionary for sentiment analysis into R as a data.table
# The get_sentiws function will download the zip-file with the SentiWS dictionary,
# unzip it and return a data.table.
library(data.table)
# Download the SentiWS v1.8c sentiment dictionary and return it as one
# data.table.
#
# The zip archive is fetched into a "sentiws" folder below tempdir(), unpacked
# there, and the positive and negative word lists are parsed and stacked.
#
# Returns a data.table with one row per word form and the columns
#   id     - row index of the entry within its source file
#   word   - the word form (the lemma itself or one of its inflections)
#   base   - TRUE for the lemma row, FALSE for inflected forms
#   lemma  - the lemma the word form belongs to
#   pos    - part-of-speech tag of the entry
#   weight - numeric sentiment weight of the entry
#
# Stops with an error if the download fails or an expected file is missing
# from the archive.
get_sentiws <- function() {
  sentiws_tmp_dir <- file.path(tempdir(), "sentiws")
  if (!dir.exists(sentiws_tmp_dir)) {
    dir.create(sentiws_tmp_dir, recursive = TRUE)
  }
  sentiws_zipfile <- file.path(sentiws_tmp_dir, "SentiWS_v1.8c.zip")
  sentiws_url <- "http://pcai056.informatik.uni-leipzig.de/downloads/etc/SentiWS/SentiWS_v1.8c.zip"

  # mode = "wb": the archive is binary; a text-mode transfer corrupts it on
  # Windows.
  status <- download.file(
    url = sentiws_url, destfile = sentiws_zipfile, mode = "wb"
  )
  if (status != 0L) {
    stop("Download of SentiWS failed: ", sentiws_url, call. = FALSE)
  }
  unzip(zipfile = sentiws_zipfile, exdir = sentiws_tmp_dir)

  # Expand one dictionary entry (a one-row subset with columns "word" and
  # "data") into one row per word form. "data" looks like
  # "<POS> <weight> <comma-separated inflections>"; inflections are optional.
  .unfold <- function(entry) {
    lemma <- entry[["word"]][1]
    raw <- entry[["data"]][1]
    pos <- gsub("^([A-Z]+)\\s+.*$", "\\1", raw)
    weight <- as.numeric(gsub("^[A-Z]+\\s+(-?\\d\\.\\d+).*$", "\\1", raw))
    words <- gsub("^[A-Z]+\\s+-?\\d\\.\\d+\\s*(.*?)\\s*$", "\\1", raw)
    words <- if (!grepl("^\\s*$", words)) {
      strsplit(x = words, split = ",")[[1]]
    } else {
      NULL
    }
    list(
      word = c(lemma, words),
      base = c(TRUE, rep(FALSE, times = length(words))),
      lemma = lemma,
      pos = pos,
      weight = weight
    )
  }

  dts <- lapply(
    c(positive = "SentiWS_v1.8c_Positive.txt", negative = "SentiWS_v1.8c_Negative.txt"),
    function(filename) {
      path <- file.path(sentiws_tmp_dir, filename)
      if (!file.exists(path)) {
        stop("Expected file not found after unzip: ", path, call. = FALSE)
      }
      # sep = "|": the lemma is separated from the rest of the entry by a
      # pipe, which is what the regular expressions in .unfold() rely on.
      # header = FALSE: every field is text, so header auto-detection could
      # otherwise swallow the first dictionary entry as column names.
      # encoding = "UTF-8": the word lists contain umlauts.
      dt <- fread(
        path,
        sep = "|",
        header = FALSE,
        col.names = c("word", "data"),
        encoding = "UTF-8"
      )
      # seq_len() stays valid for an empty table, unlike 1L:nrow(dt).
      dt[, "id" := seq_len(nrow(dt))]
      dt[, .unfold(.SD), by = c("id")]
    }
  )
  rbindlist(dts)
}
@Studentenfutter
Copy link

I adjusted the code for SentiWS 2.0 - maybe you want to update your gist.

library(data.table)
# Download the SentiWS v2.0 sentiment dictionary and return it as one
# data.table.
#
# The zip archive is fetched into a "sentiws" folder below tempdir(), unpacked
# there, and the positive and negative word lists are parsed and stacked.
#
# Returns a data.table with one row per word form and the columns
#   id     - row index of the entry within its source file
#   word   - the word form (the lemma itself or one of its inflections)
#   base   - TRUE for the lemma row, FALSE for inflected forms
#   lemma  - the lemma the word form belongs to
#   pos    - part-of-speech tag of the entry
#   weight - numeric sentiment weight of the entry
#
# Stops with an error if the download fails or an expected file is missing
# from the archive.
get_sentiws <- function() {
  sentiws_tmp_dir <- file.path(tempdir(), "sentiws")
  if (!dir.exists(sentiws_tmp_dir)) {
    dir.create(sentiws_tmp_dir, recursive = TRUE)
  }
  sentiws_zipfile <- file.path(sentiws_tmp_dir, "SentiWS_v2.0c.zip")
  sentiws_url <- "http://pcai056.informatik.uni-leipzig.de/downloads/etc/SentiWS/SentiWS_v2.0.zip"

  # mode = "wb": the archive is binary; a text-mode transfer corrupts it on
  # Windows.
  status <- download.file(
    url = sentiws_url, destfile = sentiws_zipfile, mode = "wb"
  )
  if (status != 0L) {
    stop("Download of SentiWS failed: ", sentiws_url, call. = FALSE)
  }
  unzip(zipfile = sentiws_zipfile, exdir = sentiws_tmp_dir)

  # Expand one dictionary entry (a one-row subset with columns "word" and
  # "data") into one row per word form. "data" looks like
  # "<POS> <weight> <comma-separated inflections>"; inflections are optional.
  .unfold <- function(entry) {
    lemma <- entry[["word"]][1]
    raw <- entry[["data"]][1]
    pos <- gsub("^([A-Z]+)\\s+.*$", "\\1", raw)
    weight <- as.numeric(gsub("^[A-Z]+\\s+(-?\\d\\.\\d+).*$", "\\1", raw))
    words <- gsub("^[A-Z]+\\s+-?\\d\\.\\d+\\s*(.*?)\\s*$", "\\1", raw)
    words <- if (!grepl("^\\s*$", words)) {
      strsplit(x = words, split = ",")[[1]]
    } else {
      NULL
    }
    list(
      word = c(lemma, words),
      base = c(TRUE, rep(FALSE, times = length(words))),
      lemma = lemma,
      pos = pos,
      weight = weight
    )
  }

  dts <- lapply(
    c(positive = "SentiWS_v2.0_Positive.txt", negative = "SentiWS_v2.0_Negative.txt"),
    function(filename) {
      path <- file.path(sentiws_tmp_dir, filename)
      if (!file.exists(path)) {
        stop("Expected file not found after unzip: ", path, call. = FALSE)
      }
      # header = FALSE: every field is text, so header auto-detection could
      # otherwise swallow the first dictionary entry as column names.
      # encoding = "UTF-8": the word lists contain umlauts.
      dt <- fread(
        path,
        sep = "|",
        header = FALSE,
        col.names = c("word", "data"),
        encoding = "UTF-8"
      )
      # seq_len() stays valid for an empty table, unlike 1L:nrow(dt).
      dt[, "id" := seq_len(nrow(dt))]
      dt[, .unfold(.SD), by = c("id")]
    }
  )
  rbindlist(dts)
}

# Download and parse the dictionary once; all_sentiments holds the stacked
# positive and negative word lists as a single data.table.
all_sentiments <- get_sentiws()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment