Shaoyun Yu syu-id

## mattr.R
# An alternative implementation of the MATTR algorithm in R
# author: Shaoyun YU <eric.rongmu@gmail.com>
# ref: Covington & McFall (2010) Cutting the Gordian Knot: The Moving-Average Type-Token Ratio
# usage: mattr(vector_of_tokens, window_size)

window_types <- function(i_start, win_size, data) {
  i_end <- i_start + win_size - 1

  win <- data[i_start:i_end]

## LC-2015.12.11-graph.R
library(dplyr)
library(ggplot2)
library(scales)

df <- read.csv('out/2015.12.04-awl_asl.csv')


df_plot <- df %>%
    mutate(
        native = plyr::revalue(native, c(en = 'L1 English ', ja='L1 Japanese'))

## LC-2015.12.04-graph.R
library(dplyr)
library(ggplot2)

df <- read.csv('out/2015.12.04-awl_asl.csv')


# Replace the `en` / `ja` codes in `native` with descriptive labels.
# The label strings (including the trailing space in 'L1 English ') are kept
# exactly as written.
df_plot <- mutate(
    df,
    native = plyr::revalue(native, c(en = 'L1 English ', ja = 'L1 Japanese'))
)

## 2015.12.04-learner_corpus-2
トークン数と TTR の関係を少し数学的に考えてみると、プロットの結果は当然のものだとわかります。

トークン数とタイプ数は線形的な関係にあると仮定します。

$$
type =  \beta_0 + \beta_1 \cdot token
$$

$token = 1$ の場合、$type=1$ になるので、$\beta_0$ と $\beta_1$ の和が $1$ に決まっています。

## LC-2015.12.04-awl_asl.R
library(dplyr)

tokens <- read.csv('data/tokens.csv', as.is = TRUE)


result <- tokens %>%
    group_by(native, id) %>%
    summarise(
        n_token = n(),
        n_type  = n_distinct(token),

## LC-2015.11.27-ttr_gi.csv

          
            native
            id
            n_token
            n_type
            ttr
            gi

            
              591
              365
              144
              0.394520547945205
              7.53730448529908

            
              en
              501
              738
              361
              0.489159891598916
              13.2885936376732

            
              en
              502
              636
              340
              0.534591194968553
              13.4818769572084

            
              en
              503
              834
              353
              0.42326139088729
              12.2233903227874

            
              en
              504
              824
              336
              0.407766990291262
              11.7051146400992

            
              en
              505
              898
              393
              0.437639198218263
              13.1145798598269

            
              en
              506
              834
              341
              0.408872901678657
              11.8078643061488

            
              en
              507
              600
              264
              0.44
              10.777754868246

            
              en
              508
              848
              332
              0.391509433962264
              11.4009268077412

## LC-2015.11.27-ttr_gi.R
library(dplyr)

tokens <- read.csv('data/tokens.csv')

result <- tokens %>%
    group_by(native, id) %>%
    summarise(
        n_token = length(token),
        n_type  = length(unique(token)),
        ttr     = n_type / n_token,

## LC-2015-prepare_data.R
library(tidyr)
library(dplyr)
library(stringr)

# preparations ----
# Entries directly under the corpus root, returned as paths that include the
# root. `full.names` is spelled out instead of relying on partial matching
# of `full`.
dirs    <- list.files('NICE/NICE_3.0.1b', full.names = TRUE)

# Paths of every entry found inside those directories.
files   <- dirs %>% list.files(full.names = TRUE)

# Writer ids: the file names with a trailing ".txt" removed.
# Derived from `files` so that writers[i] always names files[i]. list.files()
# sorts its combined result (on the full path only when full.names = TRUE), so
# a second listing without full names can come back in a different order when
# several directories are involved.
# NOTE(review): assumes `files` and `writers` are used as parallel vectors
# later in the script -- confirm.
writers <- files %>% basename() %>% str_replace('\\.txt$', '')

## 2015.11.30-nishizaka
## 効果量について

### 分散分析の効果量

被験者内計画の分散分析の効果量である「一般化オメガ二乗（$\omega_G^2$）」と「一般化イータ二乗（$\eta_G^2$）」の解釈は、水本篤・竹内理 (2008)「研究論文における効果量の報告のために―基本的概念と注意点」『英語教育研究』31, 57–66. (http://www.mizumot.com/files/EffectSize_KELES31.pdf ) の表1（p. 62）に従いました。後の説明の部分では、$\omega_G^2$ を使いました。

サイズ | $\eta_G^2$ | $\omega_G^2$
------|------------|-------------
小    | $.01$ | $.01$
中    | $.06$ | $.09$

## LC-2015.11.20.R
library(tidyr)
library(dplyr)
library(stringr)

# Corpus directories to read: the NICE-NNS and NICE-NS folders.
dirs <- c('NICE3.0/NICE-NNS', 'NICE3.0/NICE-NS')

data_raw <- dirs %>%
	# An alternative implementation of the MATTR algorithm in R
	# author: Shaoyun YU <eric.rongmu@gmail.com>
	# ref: Covington & McFall (2010) Cutting the Gordian Knot: The Moving-Average Type-Token Ratio
	# usage: mattr(vector_of_tokens, window_size)

	window_types <- function(i_start, win_size, data) {
	i_end <- i_start + win_size - 1

	win <- data[i_start:i_end]
	library(dplyr)
	library(ggplot2)
	library(scales)

	df <- read.csv('out/2015.12.04-awl_asl.csv')


	df_plot <- df %>%
	mutate(
	native = plyr::revalue(native, c(en = 'L1 English ', ja='L1 Japanese'))
	トークン数と TTR の関係を少し数学的に考えてみると、プロットの結果は当然のものだとわかります。

	トークン数とタイプ数は線形的な関係にあると仮定します。

	$$
	type = \beta_0 + \beta_1 \cdot token
	$$

	$token = 1$ の場合、$type=1$ になるので、$\beta_0$ と $\beta_1$ の和が $1$ に決まっています。
	library(dplyr)

	tokens <- read.csv('data/tokens.csv', as.is = TRUE)


	result <- tokens %>%
	group_by(native, id) %>%
	summarise(
	n_token = n(),
	n_type = n_distinct(token),
native	id	n_token	n_type	ttr	gi
	591	365	144	0.394520547945205	7.53730448529908
en	501	738	361	0.489159891598916	13.2885936376732
en	502	636	340	0.534591194968553	13.4818769572084
en	503	834	353	0.42326139088729	12.2233903227874
en	504	824	336	0.407766990291262	11.7051146400992
en	505	898	393	0.437639198218263	13.1145798598269
en	506	834	341	0.408872901678657	11.8078643061488
en	507	600	264	0.44	10.777754868246
en	508	848	332	0.391509433962264	11.4009268077412
	library(dplyr)

	tokens <- read.csv('data/tokens.csv')

	result <- tokens %>%
	group_by(native, id) %>%
	summarise(
	n_token = length(token),
	n_type = length(unique(token)),
	ttr = n_type / n_token,
	library(tidyr)
	library(dplyr)
	library(stringr)

	# preparations ----
	dirs <- list.files('NICE/NICE_3.0.1b', full = TRUE)

	files <- dirs %>% list.files(full = TRUE)
	writers <- dirs %>% list.files() %>% str_replace('\\.txt$', '')
	## 効果量について

	### 分散分析の効果量

	被験者内計画の分散分析の効果量である「一般化オメガ二乗（$\omega_G^2$）」と「一般化イータ二乗（$\eta_G^2$）」の解釈は、水本篤・竹内理 (2008)「研究論文における効果量の報告のために―基本的概念と注意点」『英語教育研究』31, 57–66. (http://www.mizumot.com/files/EffectSize_KELES31.pdf ) の表1（p. 62）に従いました。後の説明の部分では、$\omega_G^2$ を使いました。

	サイズ \| $\eta_G^2$ \| $\omega_G^2$
	------\|------------\|-------------
	小 \| $.01$ \| $.01$
	中 \| $.06$ \| $.09$