Skip to content

Instantly share code, notes, and snippets.

@syu-id
Last active December 5, 2015 06:26
Show Gist options
  • Save syu-id/bd644f656adbc5c6747c to your computer and use it in GitHub Desktop.
Save syu-id/bd644f656adbc5c6747c to your computer and use it in GitHub Desktop.
library(tidyr)
library(dplyr)
library(stringr)
# preparations ----
# List the entries directly under the corpus root, then the files inside them.
dirs <- list.files('NICE/NICE_3.0.1b', full.names = TRUE)
files <- list.files(dirs, full.names = TRUE)
# Derive the writer ids from `files` itself so ids and paths can never fall
# out of step: list.files() sorts on the full path when full.names = TRUE but
# on the bare file name otherwise, so two separate listings spanning several
# directories are not guaranteed to come back in the same order.
writers <- files %>%
  basename() %>%
  str_replace('\\.txt$', '')
# One character vector of essay sentences per file.
data_raw <- lapply(files, function(path) {
  # read the file line by line, decoding it from cp932
  lines <- scan(
    path,
    what = character(),
    sep = '\n',
    fileEncoding = 'cp932'
  )
  # keep only what follows a leading '*', an optional 'JPN'/'NS', three
  # digits, ':' and a tab; lines without that prefix yield NA
  sentences <- str_extract(lines, '(?<=^\\*(JPN|NS)?\\d{3}:\\t).*$')
  sentences[!is.na(sentences)]
})
# the number of times each writer id should repeat in the data.frame below;
# lengths() always returns an integer vector, even when data_raw is empty
writer_times <- lengths(data_raw)
# essays ----
# One row per essay sentence: native language, writer id, sentence number
# within that writer, and the sentence text.
essays <- data_raw %>%
  unlist() %>%
  # keep the text as character regardless of the R version's default
  data.frame(text = ., stringsAsFactors = FALSE) %>%
  mutate(
    # the writer label of every sentence: each of the writers x writer_times
    writer = rep(writers, writer_times),
    # split the label into native language and id: the id is the last three
    # characters, the language code is everything before them. str_sub() is
    # used instead of separate(sep = -4) because tidyr 1.0.0 changed how
    # negative positions are counted, which would move the split point.
    native = str_sub(writer, 1, -4),
    id = str_sub(writer, -3)
  ) %>%
  mutate(
    native = plyr::revalue(native, c(JPN = 'ja', NS = 'en'))
  ) %>%
  # number each sentence within its writer
  group_by(native, id) %>%
  mutate(
    sen_no = row_number()
  ) %>%
  ungroup() %>%
  # reorder columns (this also drops the helper column `writer`)
  select(native, id, sen_no, text)
# raw tokens ----
# One row per whitespace-separated token of every sentence in `essays`.
tokens_raw <- essays %>%
  # split each sentence on runs of spaces into a list column ...
  mutate(
    token = str_split(text, ' +')
  ) %>%
  # ... and expand it to one token per row; this replaces the superseded
  # rowwise() %>% do(data.frame(...)) pattern, which built a separate
  # data.frame for every sentence
  unnest(token) %>%
  # number each token in each sentence
  group_by(native, id, sen_no) %>%
  mutate(
    token_no = row_number()
  ) %>%
  ungroup() %>%
  # reorder columns (this also drops the sentence text)
  select(native, id, sen_no, token_no, token)
# cleaned tokens ----
# Normalise every raw token, then drop the ones that end up empty.
clean_token <- function(raw) {
  # every maximal run of non-word characters collapses to a single space
  spaced <- str_replace_all(raw, '\\W+', ' ')
  # strip the spaces left at either end and fold to lower case
  tolower(str_trim(spaced))
}
tokens <- tokens_raw %>%
  mutate(token = clean_token(token)) %>%
  # filter out empty tokens
  filter(token != '')
# save to csv ----
# write.csv() does not create missing directories, so make sure the output
# directory exists first; otherwise the script would fail only here, after
# all the processing above has already run.
dir.create('data', showWarnings = FALSE, recursive = TRUE)
write.csv(essays, file = 'data/essays.csv', row.names = FALSE)
write.csv(tokens_raw, file = 'data/tokens_raw.csv', row.names = FALSE)
write.csv(tokens, file = 'data/tokens.csv', row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment