Skip to content

Instantly share code, notes, and snippets.

@syu-id
Last active November 26, 2015 13:54
Show Gist options
  • Save syu-id/394335fbef8845c4345c to your computer and use it in GitHub Desktop.
Save syu-id/394335fbef8845c4345c to your computer and use it in GitHub Desktop.
学習者コーパス論 2015.11.13 宿題
library(magrittr)
library(stringr)
# read in all the files
# `nns` / `ns` are lists with one element per file; each element is the
# character vector of that file's lines (scan() with sep = '\n' returns one
# string per line and, by default, skips blank lines).
# Arguments are spelled out in full (`full.names`, `fileEncoding`) instead of
# relying on partial argument matching.

# List the files of one corpus directory with their paths; stop early when
# nothing is found, because every later count would silently be zero.
list_corpus_files <- function(dir) {
  paths <- list.files(dir, full.names = TRUE)
  if (length(paths) == 0L) {
    stop('no files found in directory: ', dir, call. = FALSE)
  }
  paths
}

# Read one cp932-encoded file as a character vector of lines.
read_nice_file <- function(path) {
  scan(path, what = character(), sep = '\n', fileEncoding = 'cp932')
}

files_nns <- list_corpus_files('NICE3.0/NICE-NNS')
files_ns <- list_corpus_files('NICE3.0/NICE-NS')
nns <- lapply(files_nns, read_nice_file)
ns <- lapply(files_ns, read_nice_file)
# extract the data
# For each group: pool the lines of all files into one character vector, keep
# only the text that follows a speaker tag at the start of a line (`*JPN` or
# `*NS`, three digits, a colon and a tab), and drop the lines that carry no
# such tag (str_extract() returns NA for them).
nns_essay <- na.omit(
  str_extract(unlist(nns), '(?<=^\\*JPN\\d{3}:\\t).*')
)
ns_essay <- na.omit(
  str_extract(unlist(ns), '(?<=^\\*NS\\d{3}:\\t).*')
)
# combine the two data
# One named list so that every later count runs over both groups at once.
essays <- list(NNS = nns_essay, NS = ns_essay)
# count of words
# Total number of tokens per group, where a token is a maximal run of
# non-whitespace characters (punctuation attached to a word is part of it).
# vapply() pins the result to exactly one integer per group, so the output is
# always a named integer vector regardless of the input.
count_words <- vapply(
  essays,
  function(x) sum(str_count(x, '\\S+')),
  integer(1)
)
# count of genitives
# Both counts are named integer vectors with one total per group.

# of-genitive candidates: every stand-alone "of" / "Of".
count_of <- vapply(
  essays,
  function(x) sum(str_count(x, '\\b[Oo]f\\b')),
  integer(1)
)

# s-genitive candidates: "'s" at the end of a word, or "s'" that is not
# followed by a word character.
# The earlier pattern "('s|s')\\b" put the word boundary after the apostrophe
# of "s'", which only holds when a word character follows it, so plural
# possessives followed by a space or punctuation ("students' books") were
# never counted.
# NOTE(review): the pattern also matches contractions such as "it's" and a
# closing single quote after a word ending in "s"; confirm whether these
# should be excluded.
count_s <- vapply(
  essays,
  function(x) sum(str_count(x, "'s\\b|s'(?!\\w)")),
  integer(1)
)
# result
# One row per group with the raw counts, then two derived ratios:
#   gen_per_word - genitives (of + s) per word
#   s_per_gen    - share of s-genitives among all genitives
result <- data.frame(words = count_words, of = count_of, s = count_s)
result[['gen_per_word']] <-
  (result[['of']] + result[['s']]) / result[['words']]
result[['s_per_gen']] <-
  result[['s']] / (result[['of']] + result[['s']])
# Show the table rounded to three decimal places.
round(result, 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment