Skip to content

Instantly share code, notes, and snippets.

@syu-id
Last active November 27, 2015 02:02
Show Gist options
  • Save syu-id/71cede379f43b401396f to your computer and use it in GitHub Desktop.
Save syu-id/71cede379f43b401396f to your computer and use it in GitHub Desktop.
# 学習者コーパス論宿題 (learner corpus course homework) 2015.11.20
library(tidyr)
library(dplyr)
library(stringr)
# Corpus directories whose files are read below.
# NOTE(review): presumably NNS = learner essays and NS = native-speaker
# essays -- confirm against the corpus layout.
dirs <- c('NICE3.0/NICE-NNS', 'NICE3.0/NICE-NS')
# Raw essay lines collected from every file found under `dirs`.
# Argument names are spelled out in full (full.names, fileEncoding) instead of
# relying on R's partial argument matching.
data_raw <- dirs %>%
  # vector of file paths (one listing per directory, flattened)
  lapply(list.files, full.names = TRUE) %>%
  unlist() %>%
  # vector of file contents: each file is read line by line as character
  # data, decoded from cp932
  lapply(scan, what = character(), sep = '\n', fileEncoding = 'cp932') %>%
  unlist() %>%
  # keep only the lines that start with '*', followed by a label, ':' and a tab
  str_subset('^\\*.+:\\t')
# Table of essays: one essay line per row, with columns
# native, id, line, text.
essays <- data_raw %>%
  # keep the text as character (consistent with the other data.frame() call
  # in this script, and independent of the R version's default)
  data.frame(text = ., stringsAsFactors = FALSE) %>%
  # split each line at the tab into its "*<label>:" prefix and the essay text
  separate(
    text,
    into = c('id', 'text'),
    sep = '\\t'
  ) %>%
  # drop the '*' and ':' markers from the label
  mutate(
    id = str_replace_all(id, '[*:]', '')
  ) %>%
  # split the label into a prefix (native) and its last three characters (id).
  # A regex states this explicitly and does not depend on how separate()
  # interprets a negative position; labels shorter than three characters
  # yield NA in both columns.
  tidyr::extract(
    id,
    into = c('native', 'id'),
    regex = '^(.*)(.{3})$'
  ) %>%
  # recode the prefix: JPN -> ja, NS -> en (other values are left unchanged)
  mutate(
    native = plyr::revalue(native, c(JPN = 'ja', NS = 'en'))
  ) %>%
  # number the lines within each essay; seq_len() is safe for empty groups,
  # where 1:n() would count down from 1 to 0
  group_by(native, id) %>%
  mutate(
    line = seq_len(n())
  ) %>%
  ungroup() %>%
  # reorder columns
  select(native, id, line, text)
# Table of raw tokens: one token per row, with columns native, id, token.
# Each essay line is split on single spaces in one vectorised str_split() call
# and the resulting list column is expanded with unnest(), instead of building
# a separate data frame for every row via rowwise() + do().
tokens_raw <- essays %>%
  mutate(token = str_split(text, ' ')) %>%
  # keep only the columns the per-row version produced
  select(native, id, token) %>%
  unnest(token)
# Cleaned tokens: surrounding whitespace trimmed, leading/trailing punctuation
# removed, lower-cased; tokens that end up empty are discarded.
tokens <- tokens_raw %>%
  mutate(
    token = tolower(
      str_replace_all(str_trim(token), '^[[:punct:]]+|[[:punct:]]+$', '')
    )
  ) %>%
  # drop tokens that were nothing but whitespace/punctuation
  filter(token != '')
# Per-essay summary: number of tokens, number of distinct tokens (types),
# and the type/token ratio.
result <- tokens %>%
  group_by(native, id) %>%
  summarise(
    n_token = n(),
    n_type = n_distinct(token),
    type_per_token = n_type / n_token
  ) %>%
  ungroup()

# Print the summary as a plain data frame with the ratio rounded to 3 digits;
# `result` itself keeps the unrounded values.
as.data.frame(
  mutate(result, type_per_token = round(type_per_token, 3))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment