Skip to content

Instantly share code, notes, and snippets.

@hotoku
Created October 14, 2014 12:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hotoku/74d31ee036d075362392 to your computer and use it in GitHub Desktop.
Save hotoku/74d31ee036d075362392 to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
## 4.4節:母集団分散の区間推定のシミュレーション
library(MASS) # truehist
## 真の平均・分散
mu <- 10
sigma <- 3
## n個の正規乱数を発生させる実験をN回繰り返す。
## 結果をn行N列の行列Xに格納。
## Xの1列が、1回の実験
n <- 10
N <- 100000
X <- matrix(rnorm(n * N, m=mu, s=sigma), nrow=n)
## 各実験について、平均値を計算
## 具体的には、各列の平均値を計算
## 後の計算の為に、その結果をn段積み重ねておく
heikin <- matrix(apply(X, 2, mean), nrow=n, ncol=N, byrow=T)
## 各実験について、s^2の値を計算
## s2の分布が、自由度n-1のカイ2乗分布になる
s2 <- apply((X-heikin)^2, 2, sum)/(sigma^2)
## s2のヒストグラムとカイ2乗分布の密度関数を比較
truehist(s2)
index <- seq(min(s2), max(s2), len=100)
lines(index, dchisq(index, df=n-1))
#!/usr/bin/env Rscript
## 4.5節:母集団比率の区間推定のシミュレーション
library(MASS) ## truehist
## 当たり確率pのくじをn回引く実験をN回繰り返す
n <- 1200
N <- 10000
p <- 0.054
dat <- rbinom(N, n, p)
## pの推定量phatを計算する
phat <- dat/n
## phatを正規化してzを作る。zの分布が、大体、標準正規分布
z <- (phat - p)/sqrt(p*(1-p)/n)
## zのヒストグラムとN(0,1)の密度関数を描画
truehist(z)
index <- seq(min(z), max(z), len=100)
lines(index, dnorm(index))
## zが離散的な値で、ヒストグラムのbinの切り方が上手くないので、
## 累積分布関数で比較
plot(ecdf(z))
lines(index, pnorm(index))
#!/usr/bin/env Rscript
## 4.6節:回帰係数の区間推定のシミュレーション
library(MASS)
## 真の回帰係数と残差分散
alpha <- 2
beta <- 1.5
sigma <- 0.7
## nサンプルの実験をN回シミュレーション
# n <- 50
n <- 5
N <- 1000
## xは毎回固定にする
x <- matrix(1:n, nrow=n, ncol=N)
## yを発生させて、結果を行列に保存
y <- matrix(alpha + beta * x + rnorm(n*N, sd=sigma),
nrow=n, ncol=N)
## 回帰係数betaの推定値を計算
Txx <- apply(x, 2, function(a)sum((a-mean(a))^2))
## applyは1つの行列にしか使えないので、
## x,yを縦に積んで、関数の中で上半分と下半分に分ける
Txy <- apply(rbind(x,y), 2,
function(a){
x <- a[1:n]
y <- a[(n+1):(2*n)]
sum((x-mean(x)) * (y-mean(y)))
})
beta.hat <- Txy/Txx
## alphaの推定値も計算
y.bar <- apply(y, 2, mean)
x.bar <- apply(x, 2, mean)
alpha.hat <- y.bar - beta.hat * x.bar
## yの推定値を計算
y.hat <- matrix(alpha.hat, nrow=n, ncol=N, byrow=T) +
matrix(beta.hat, nrow=n, ncol=N, byrow=T) * x
## 真のsigmaで調整すると、正規分布
z <- (beta.hat-beta)*sqrt(Txx)/sigma
truehist(z)
curve(dnorm, from=-3, to=3, add=T)
## sigmaの推定値sで調整すると、自由度n-2のt分布
curve.dt <- function(f, t, df, ...){
## curve関数で上手くdtの曲線が引けないので、自前定義
x <- seq(f, t, len=100)
y <- dt(x, df=df)
lines(x,y,...)
}
s2 <- apply((y-y.hat)^2, 2, sum)/(n-2)
s <- sqrt(s2)
student.t <- (beta.hat-beta)*sqrt(Txx)/s
truehist(student.t)
curve.dt(min(student.t), max(student.t), n-2)
curve(dnorm, min(student.t), max(student.t), col=2, add=T)
## 黒線が自由度n-2のt分布
## 赤線が標準正規分布
## 赤線がべったり0に張り付いている部分にも、student.tの値が
## 発生している事が分かる
## nの値が小さい時ほど、違いが顕著で、
## nの値を大きくする(50とか)と、殆ど違いが無くなる
\documentclass[25pt, a4paper, landscape]{foils}
\usepackage[usenames]{color}
\usepackage{layout}
\usepackage[dvipdfmx]{graphicx}
\title{{\small 統計勉強会} \\
統計学基礎 $\S 4.4$ -- $\S 4.7$}
\author{\includegraphics[width=3cm]{twitter-face-2.png} \\
@hotoku}
\date{2014--10--14}
\newcommand{\myhead}[2][0in]{\foilhead[#1]{\textcolor{BurntOrange}{#2}}}
\renewcommand{\labelitemi}{{\color{RoyalPurple}\textbullet}}
\renewcommand{\labelitemii}{{\color{RoyalPurple}\normalfont\bfseries \textendash}}
\newcommand{\myd}{\mathrm{d}}
\newcommand{\mye}{\mathrm{e}}
\newcommand{\myparagraph}[1]{\noindent{\color{OliveGreen}\underline{#1\hspace{2cm}}}}
\newcommand{\ampeq}[1][=]{&#1&}
\newcommand{\myemph}[1]{{\color{Red}\bf #1}}
\newcommand{\myvec}[1]{\mbox{\boldmath$#1$}}
\definecolor{MyGreen}{rgb}{.3, .5, .4}
\newcommand{\normal}{\mathrm{N}}
\newcommand{\prob}{\mathrm{Pr}}
\newcommand{\iid}{\sim\mathrm{i.i.d.}}
\newcommand{\E}{\mathrm{E}}
\newcommand{\V}{\mathrm{V}}
\renewcommand{\labelenumi}{(\arabic{enumi})}
\setlength{\voffset}{-0.7in}
\setlength{\textheight}{470pt}
\LogoOff
\begin{document}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{agenda}
\begin{itemize}
\item $\S 4.4$ 母集団分散の区間推定
\item $\S 4.5$ 母集団比率の推定
\item $\S 4.6$ 回帰係数の区間推定
\item $\S 4.7$ 母集団相関係数の区間推定
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{$\S 4.4$ 母集団分散の区間推定}
\begin{itemize}
\item $X_{i}\ \iid\ \normal (\mu, \sigma^{2}),\ i=1, \ldots, n$
\item $\displaystyle\Rightarrow \chi^{2} := \frac{\sum (X_{i} - \bar{X})^{2}}{\sigma^{2}} = \frac{(n-1)s^2}{\sigma^{2}}\sim \chi^{2}_{n-1} $
\item i.e. $\displaystyle \prob\left\{X^{2}_{L} \leq \frac{(n-1)s^2}{\sigma^{2}} \leq X^{2}_{U}\right\} = 1-\alpha$
\item $\Leftrightarrow \prob\left\{\frac{(n-1)s^2}{X^{2}_{U}} \leq \sigma^{2} \leq \frac{(n-1)s^2}{X^{2}_{L}}\right\} = 1-\alpha$
\end{itemize}
但し、$X_{L}, X_{U}$は、自由度$n-1$の$\chi^{2}$分布の下・上側$100 \alpha / 2$\% 点.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{自由度?なにそれ?おいしいの?}
\begin{itemize}
\item ピタゴラスの定理
\item 斜め上から見る
\item 線形代数大事
\item ホワイトボードでやります
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{平均の性質}
\begin{itemize}
\item $\displaystyle\min_{a} \sum(x_{i}-a)^{2}$
\item $\displaystyle\frac{\myd}{\myd a}\sum(x_{i}-a)^{2}=\frac{\myd}{\myd a}\sum (x_{i}^{2} - 2ax_{i} + a^{2})=0$
\item $\Rightarrow\displaystyle a=\bar{x}$
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{$\S$ 4.5母集団比率の推定}
\myparagraph{適用場面(5)}
\begin{itemize}
\item 大都市の全世帯から、1,200世帯を無作為抽出
\item そのうち、$5.4\% = 65/1200$に要介護者がいた
\item この都市全体で、$5.4\%$に要介護者が居ると考えて良いか?
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead[-1cm]{復元抽出と非復元抽出}
\begin{itemize}
\item $R$個の赤球と$B$個の黒球が入った箱。
\begin{itemize}
\item $N:=R+B$
\item $p:=R/N$
\end{itemize}
\item 箱から球を取り出す事を繰り返す
\item 箱から球を取り出して…
\begin{itemize}
\item 元に戻す→毎回、赤の出る確率は$p$
\item 元に戻さない→赤の出る確率は、変動する
\item 1回目が…
\begin{itemize}
\item 赤→$\frac{R-1}{N-1}$
\item 黒→$\frac{R}{N-1}$
\end{itemize}
\end{itemize}
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{こまけぇこたぁいいんだよ}
\begin{itemize}
\item 大都市→人口数十万人(1,200世帯の数千倍の世帯)
\item 一世帯を抽出した後に、戻しても戻さなくても大して変わらない
\item $p:=\mbox{その都市の要介護者世帯の割合}$
\item 1,200世帯の抽出全体を通じて、要介護者世帯に当たる
確率=$p$としてしまいましょう
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{母集団比率の区間推定}
\begin{itemize}
\item 1,200回試行して、65回が当たりだった。
\item 一定の確率$p$のくじを$n$回引いて$x$回当たりが出る→2項分布
\item $\E[x]=np, \V[x]=np(1-p)$
\item 2項分布は左右対称なので、割りと小さい$n$でも中心極限定理で充分な近似になる
\item $\displaystyle z:=\frac{x/n-p}{\sqrt{p(1-p)/n}}$が、大体$\normal(0,1)$
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{有限修正}
\begin{itemize}
\item 母集団の数が小さい or くじを引きまくる場合
\item 超幾何分布
\item $\displaystyle\E[x]=np, \V[x]=\frac{N-n}{N-1}np(1-p)$
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{連続修正}
\begin{itemize}
\item 0.5引け
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{回帰係数の推定}
\begin{itemize}
\item $y_{i} = \alpha + \beta x_{i} + \epsilon_{i}, i=1, \ldots, n, \epsilon_{i}\sim \normal(0,\sigma^{2})$
\item $T_{xy}:=\sum(x_{i}-\bar{x})(y_{i}-\bar{y}), T_{xx}:=\sum(x_{i}-\bar{x})^{2}$
\item $\displaystyle\hat{\beta}=\frac{T_{xy}}{T_{xx}}$
\item $T_{xy}=\sum(x_{i}-\bar{x})y_{i} - \bar{y}\sum(x_{i}-\bar{x}) = \sum(x_{i}-\bar{x})y_{i}$
\item $\sum(x_{i}-\bar{x})=0$だからね
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{回帰係数の推定量の分布}
\begin{itemize}
\item $\displaystyle\hat{\beta}=\frac{T_{xy}}{T_{xx}}$ (再掲)
\item $T_{xx}$は所詮、定数。$T_{xy}$は$y_{i}$の線形結合$\Rightarrow\epsilon_{i}\sim\normal(0,\sigma^{2})$の線形結合。
\item よって、$\hat{\beta}\sim\normal$
\item 計算すると、
\begin{itemize}
\item $\E[\hat{\beta}]=\beta$
\item $\V[\hat{\beta}]=\frac{\sigma^{2}}{T_{xx}}$
\end{itemize}
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{残差分散の推定}
\begin{itemize}
\item $\displaystyle\hat{\sigma^{2}}=s^{2}:=\frac{1}{n-2}\sum(x_{i}-\bar{x})^{2}$
\item $\alpha, \beta$の2つを推定しているので、$n-2$で割れば、不偏推定量。という事だと思います(要出典)。
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{回帰係数の区間推定}
\begin{itemize}
\item $\displaystyle\hat{\beta}\sim\normal\left(\beta, \frac{\sigma^{2}}{T_{xx}}\right)\Rightarrow\frac{\hat{\beta}-\beta}{\sqrt{T_{xx}}/\sigma}\sim\normal(0,1)$
\item i.e. $\displaystyle\prob\left\{q_{L}\leq\frac{\hat{\beta}-\beta}{\sqrt{T_{xx}}/\sigma}\leq q_{U}\right\}=1-\alpha$
\item これを$\beta$について整理すれば、信頼区間を作れそうだけど、$\sigma$が未知の値なので、代わりに$\sigma$の推定量$s$を入れる。
\item Studentの$t$-分布。統計学とGuinessビールの関係。
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{$\S 4.7$ 母集団相関係数の区間推定}
\begin{itemize}
\item この章の内容はクソ難しいので、黙って結果を暗記しましょう。
\item 標本相関係数を$r$として、$r$の分布は非対称で扱いづらいけど、Fisherの$z$-変換をしたら、対称な分布になるよ。
\item $\displaystyle z:=\frac{1}{2}\log \frac{1+r}{1-r}$
\item 竹内啓, {\bfseries 数理統計学}, 東洋経済新報社, 1963.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{練習問題 4.1}
\begin{enumerate}
\item 体重は、右に裾の長い分布なので(たまに体重が100kg超える人とか。凄い人だと、300kgとか)。こういう時は、Gamma分布とかが使われます。
身長は、正規分布でも良いと言われます(体重と違って、身長が3m超える人とかは居ない)。他には、年収とかも、やはり右に裾の長い分布なので、
同様のケアが必要(年収100億とかね)。
\item $n=10$個の和を取ると、分布の歪みは気にしなくて良くなるので、正規分布で近似すれば良い。
\item 上と同じ
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{練習問題 4.2}
\begin{enumerate}
\item この場合は、平均の推定値は正規分布になる。
\item この場合は、$t-$分布を使う。
\end{enumerate}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{練習問題 4.3}
\begin{enumerate}
\item $n=2500$なら正規近似で良い
\item $n=25$だと、正規近似じゃ駄目。
\end{enumerate}
以下略。すみません。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^
\myhead{参考文献}
\begin{itemize}
\item 竹内啓, {\bfseries 数理統計学}, 東洋経済新報社, 1963.
\item 稲垣宣夫, {\bfseries 数理統計学 改訂版}, 裳華房, 2003.
\item 竹村彰通, {\bfseries 現代数理統計学}, 創文社, 1991.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$
\end{document}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment