Last active
November 26, 2015 13:54
-
-
Save syu-id/394335fbef8845c4345c to your computer and use it in GitHub Desktop.
学習者コーパス論 2015.11.13 宿題
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(magrittr) | |
library(stringr) | |
# Read in all the files ----

# Read each path into a character vector with one element per line.
#
# `scan()` is used with `sep = "\n"` so that every line becomes one item;
# with scan's default `blank.lines.skip = TRUE`, empty lines are dropped.
# Argument names are spelled out in full (`fileEncoding`, not `fileEnc`) so
# the call does not depend on partial argument matching.
#
# paths:    character vector of file paths.
# encoding: encoding the files are stored in (the corpus files are read as
#           cp932).
# Returns a list with one character vector per path.
read_corpus_files <- function(paths, encoding = "cp932") {
  lapply(paths, scan, what = character(), sep = "\n", fileEncoding = encoding)
}

# NOTE(review): NNS / NS are presumably the non-native and native speaker
# sub-corpora of NICE 3.0 -- confirm against the corpus documentation.
files_nns <- list.files("NICE3.0/NICE-NNS", full.names = TRUE)
files_ns <- list.files("NICE3.0/NICE-NS", full.names = TRUE)
nns <- read_corpus_files(files_nns)
ns <- read_corpus_files(files_ns)
# Extract the data ----
# Flatten the per-file line vectors into one character vector per group, then
# keep only the text that follows a speaker tag at the start of a line
# (`*JPN` or `*NS`, three digits, a colon and a tab). Lines without such a
# tag yield NA from str_extract() and are removed by na.omit().
nns_essay <- na.omit(
  str_extract(unlist(nns, use.names = FALSE), '(?<=^\\*JPN\\d{3}:\\t).*')
)
ns_essay <- na.omit(
  str_extract(unlist(ns, use.names = FALSE), '(?<=^\\*NS\\d{3}:\\t).*')
)

# Combine the two data sets into one named list so that the counting steps
# can iterate over both groups uniformly.
essays <- list(NNS = nns_essay, NS = ns_essay)
# Count of words ----
# Total number of whitespace-delimited tokens (`\S+`) in each group.
# vapply() with an `integer(1)` template is used instead of sapply() so the
# result is always a named integer vector (one value per element of
# `essays`), rather than a type that depends on the input.
count_words <- vapply(
  essays,
  function(lines) sum(str_count(lines, "\\S+")),
  integer(1)
)
# Count of genitives ----
# Both counts are surface-pattern approximations and return a named integer
# vector with one value per element of `essays`.

# "of"-genitives: every stand-alone "of" / "Of".
count_of <- vapply(
  essays,
  function(lines) sum(str_count(lines, "\\b[Oo]f\\b")),
  integer(1)
)

# "s"-genitives: "'s" followed by a word boundary, or "s'" NOT followed by a
# word character.
#
# The previous pattern "('s|s')\\b" put `\b` after the apostrophe of "s'".
# An apostrophe is a non-word character, so that boundary only exists when a
# word character follows it; plural genitives such as "students' books" or
# an "s'" at the end of a line were therefore never counted. The negative
# lookahead `(?!\\w)` expresses the intended "end of token" condition.
#
# NOTE(review): like the original, this also counts contractions ("it's"),
# and "s'" can match a closing single quote after a word ending in "s".
count_s <- vapply(
  essays,
  function(lines) sum(str_count(lines, "'s\\b|s'(?!\\w)")),
  integer(1)
)
# Result ----
# One row per group (row names come from the names of the count vectors).
result <- data.frame(words = count_words, of = count_of, s = count_s)

# Genitives (of + 's) per word.
result[["gen_per_word"]] <-
  (result[["of"]] + result[["s"]]) / result[["words"]]

# Share of s-genitives among all counted genitives.
result[["s_per_gen"]] <-
  result[["s"]] / (result[["of"]] + result[["s"]])

round(result, 3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment