# syu-id/LC-2015.11.20.R

## LC-2015.11.20.R
library(tidyr)
library(dplyr)
library(stringr)

# directories whose files are read
dirs <- c(
    'NICE3.0/NICE-NNS',
    'NICE3.0/NICE-NS'
    )

# character vector of the lines, from every file under `dirs`, that match
# `*<something>:<TAB>...`
# Argument names are written out in full (`full.names`, `fileEncoding`)
# instead of relying on partial argument matching.
data_raw <- dirs %>%
    # vector of file paths
    lapply(list.files, full.names = TRUE) %>%
    unlist() %>%
    # vector of file contents, one element per line, files declared as cp932
    lapply(scan, what = 'char', sep = '\n', fileEncoding = 'cp932') %>%
    unlist() %>%
    # keep lines that start with '*' and contain ':' followed by a tab
    str_subset('^\\*.+:\\t')


# table of essays
# one row per matched line, with columns native, id, line, text
essays <- data_raw %>%
    data.frame(text = .) %>%
    # split each line at the tab: speaker label on the left, text on the right
    separate(text,
        into = c('id', 'text'),
        sep  = '\\t'
        ) %>%
    # drop the '*' and ':' characters from the label
    mutate(
        id = str_replace_all(id, '[*:]', '')
        ) %>%
    # split the label by position into a group code and an essay id
    separate(id,
        into = c('native', 'id'),
        sep  = -4
        ) %>%
    # recode the group code: JPN -> ja, NS -> en
    mutate(
        native = plyr::revalue(native, c(JPN = 'ja', NS = 'en'))
    ) %>%
    # number each line within its essay
    # seq_len() rather than 1:n(): 1:0 would give c(1, 0) on empty input
    group_by(native, id) %>%
    mutate(
        line = seq_len(n())
    ) %>%
    ungroup() %>%
    # reorder columns
    select(native, id, line, text)


# table of raw tokens
# one token per row, with columns native, id, token
# The text of every line is split on single spaces in one vectorised call and
# the resulting list-column is unnested, instead of building a separate
# data frame per row with rowwise() + do().
tokens_raw <- essays %>%
    mutate(token = str_split(text, ' ')) %>%
    select(native, id, token) %>%
    unnest(token)


# table of cleaned tokens
# each raw token is whitespace-trimmed, stripped of leading and trailing
# punctuation, and lower-cased; tokens that end up empty are dropped
tokens <- tokens_raw %>%
    mutate(token = str_trim(token)) %>%
    mutate(
        token = str_replace_all(token, '^[[:punct:]]+|[[:punct:]]+$', '')
    ) %>%
    mutate(token = tolower(token)) %>%
    filter(token != '')


# result
# per essay (native, id): number of tokens, number of distinct tokens (types)
# and the type/token ratio
result <- tokens %>%
    group_by(native, id) %>%
    summarise(
        n_token = length(token),
        n_type  = length(unique(token))
    ) %>%
    ungroup() %>%
    mutate(type_per_token = n_type / n_token)

# display as a plain data frame with the ratio rounded to 3 decimals
as.data.frame(
    mutate(result, type_per_token = round(type_per_token, 3))
)
	library(tidyr)
	library(dplyr)
	library(stringr)

	# NOTE(review): this tab-indented section repeats the pipeline that appears
	# earlier in the file -- confirm whether the duplicate is intentional.
	# directories whose files are read
	dirs <- c(
	    'NICE3.0/NICE-NNS',
	    'NICE3.0/NICE-NS'
	)

	# character vector of the lines, from every file under `dirs`, that match
	# `*<something>:<TAB>...`
	# Argument names are written out in full (`full.names`, `fileEncoding`)
	# instead of relying on partial argument matching.
	data_raw <- dirs %>%
	    # vector of file paths
	    lapply(list.files, full.names = TRUE) %>%
	    unlist() %>%
	    # vector of file contents, one element per line, files declared as cp932
	    lapply(scan, what = 'char', sep = '\n', fileEncoding = 'cp932') %>%
	    unlist() %>%
	    # keep lines that start with '*' and contain ':' followed by a tab
	    str_subset('^\\*.+:\\t')


	# table of essays
	# one row per matched line, with columns native, id, line, text
	essays <- data_raw %>%
	    data.frame(text = .) %>%
	    # split each line at the tab: speaker label on the left, text on the right
	    separate(text,
	        into = c('id', 'text'),
	        sep = '\\t'
	    ) %>%
	    # drop the '*' and ':' characters from the label
	    mutate(
	        id = str_replace_all(id, '[*:]', '')
	    ) %>%
	    # split the label by position into a group code and an essay id
	    separate(id,
	        into = c('native', 'id'),
	        sep = -4
	    ) %>%
	    # recode the group code: JPN -> ja, NS -> en
	    mutate(
	        native = plyr::revalue(native, c(JPN = 'ja', NS = 'en'))
	    ) %>%
	    # number each line within its essay
	    # seq_len() rather than 1:n(): 1:0 would give c(1, 0) on empty input
	    group_by(native, id) %>%
	    mutate(
	        line = seq_len(n())
	    ) %>%
	    ungroup() %>%
	    # reorder columns
	    select(native, id, line, text)


	# table of raw tokens
	# one token per row, with columns native, id, token
	# The text of every line is split on single spaces in one vectorised call and
	# the resulting list-column is unnested, instead of building a separate
	# data frame per row with rowwise() + do().
	tokens_raw <- essays %>%
	    mutate(token = str_split(text, ' ')) %>%
	    select(native, id, token) %>%
	    unnest(token)


	# table of cleaned tokens
	# each raw token is whitespace-trimmed, stripped of leading and trailing
	# punctuation, and lower-cased; tokens that end up empty are dropped
	tokens <- tokens_raw %>%
	    mutate(
	        token = token %>%
	            str_trim() %>%
	            # plain '|' alternation: '\|' is not a valid escape in an R string
	            # literal and stops the file from parsing
	            str_replace_all('^[[:punct:]]+|[[:punct:]]+$', '') %>%
	            tolower()
	    ) %>%
	    filter(
	        str_length(token) > 0
	    )


	# result
	# per essay (native, id): number of tokens, number of distinct tokens (types)
	# and the type/token ratio
	result <- tokens %>%
	    group_by(native, id) %>%
	    summarise(
	        n_token = length(token),
	        n_type = length(unique(token))
	    ) %>%
	    ungroup() %>%
	    mutate(type_per_token = n_type / n_token)

	# display as a plain data frame with the ratio rounded to 3 decimals
	as.data.frame(
	    mutate(result, type_per_token = round(type_per_token, 3))
	)