Created
October 14, 2014 12:25
-
-
Save hotoku/74d31ee036d075362392 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
## 4.4節:母集団分散の区間推定のシミュレーション | |
library(MASS) # truehist | |
## 真の平均・分散 | |
mu <- 10 | |
sigma <- 3 | |
## n個の正規乱数を発生させる実験をN回繰り返す。 | |
## 結果をn行N列の行列Xに格納。 | |
## Xの1列が、1回の実験 | |
n <- 10 | |
N <- 100000 | |
X <- matrix(rnorm(n * N, m=mu, s=sigma), nrow=n) | |
## 各実験について、平均値を計算 | |
## 具体的には、各列の平均値を計算 | |
## 後の計算の為に、その結果をn段積み重ねておく | |
heikin <- matrix(apply(X, 2, mean), nrow=n, ncol=N, byrow=T) | |
## 各実験について、s^2の値を計算 | |
## s2の分布が、自由度n-1のカイ2乗分布になる | |
s2 <- apply((X-heikin)^2, 2, sum)/(sigma^2) | |
## s2のヒストグラムとカイ2乗分布の密度関数を比較 | |
truehist(s2) | |
index <- seq(min(s2), max(s2), len=100) | |
lines(index, dchisq(index, df=n-1)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
## 4.5節:母集団比率の区間推定のシミュレーション | |
library(MASS) ## truehist | |
## 当たり確率pのくじをn回引く実験をN回繰り返す | |
n <- 1200 | |
N <- 10000 | |
p <- 0.054 | |
dat <- rbinom(N, n, p) | |
## pの推定量phatを計算する | |
phat <- dat/n | |
## phatを正規化してzを作る。zの分布が、大体、標準正規分布 | |
z <- (phat - p)/sqrt(p*(1-p)/n) | |
## zのヒストグラムとN(0,1)の密度関数を描画 | |
truehist(z) | |
index <- seq(min(z), max(z), len=100) | |
lines(index, dnorm(index)) | |
## zが離散的な値で、ヒストグラムのbinの切り方が上手くないので、 | |
## 累積分布関数で比較 | |
plot(ecdf(z)) | |
lines(index, pnorm(index)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
## 4.6節:回帰係数の区間推定のシミュレーション | |
library(MASS) | |
## 真の回帰係数と残差分散 | |
alpha <- 2 | |
beta <- 1.5 | |
sigma <- 0.7 | |
## nサンプルの実験をN回シミュレーション | |
# n <- 50 | |
n <- 5 | |
N <- 1000 | |
## xは毎回固定にする | |
x <- matrix(1:n, nrow=n, ncol=N) | |
## yを発生させて、結果を行列に保存 | |
y <- matrix(alpha + beta * x + rnorm(n*N, sd=sigma), | |
nrow=n, ncol=N) | |
## 回帰係数betaの推定値を計算 | |
Txx <- apply(x, 2, function(a)sum((a-mean(a))^2)) | |
## applyは1つの行列にしか使えないので、 | |
## x,yを縦に積んで、関数の中で上半分と下半分に分ける | |
Txy <- apply(rbind(x,y), 2, | |
function(a){ | |
x <- a[1:n] | |
y <- a[(n+1):(2*n)] | |
sum((x-mean(x)) * (y-mean(y))) | |
}) | |
beta.hat <- Txy/Txx | |
## alphaの推定値も計算 | |
y.bar <- apply(y, 2, mean) | |
x.bar <- apply(x, 2, mean) | |
alpha.hat <- y.bar - beta.hat * x.bar | |
## yの推定値を計算 | |
y.hat <- matrix(alpha.hat, nrow=n, ncol=N, byrow=T) + | |
matrix(beta.hat, nrow=n, ncol=N, byrow=T) * x | |
## 真のsigmaで調整すると、正規分布 | |
z <- (beta.hat-beta)*sqrt(Txx)/sigma | |
truehist(z) | |
curve(dnorm, from=-3, to=3, add=T) | |
## sigmaの推定値sで調整すると、自由度n-2のt分布 | |
curve.dt <- function(f, t, df, ...){ | |
## curve関数で上手くdtの曲線が引けないので、自前定義 | |
x <- seq(f, t, len=100) | |
y <- dt(x, df=df) | |
lines(x,y,...) | |
} | |
s2 <- apply((y-y.hat)^2, 2, sum)/(n-2) | |
s <- sqrt(s2) | |
student.t <- (beta.hat-beta)*sqrt(Txx)/s | |
truehist(student.t) | |
curve.dt(min(student.t), max(student.t), n-2) | |
curve(dnorm, min(student.t), max(student.t), col=2, add=T) | |
## 黒線が自由度n-2のt分布 | |
## 赤線が標準正規分布 | |
## 赤線がべったり0に張り付いている部分にも、student.tの値が | |
## 発生している事が分かる | |
## nの値が小さい時ほど、違いが顕著で、 | |
## nの値を大きくする(50とか)と、殆ど違いが無くなる |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
\documentclass[25pt, a4paper, landscape]{foils} | |
\usepackage[usenames]{color} | |
\usepackage{layout} | |
\usepackage[dvipdfmx]{graphicx} | |
\title{{\small 統計勉強会} \\ | |
統計学基礎 $\S 4.4$ -- $\S 4.7$} | |
\author{\includegraphics[width=3cm]{twitter-face-2.png} \\ | |
@hotoku} | |
\date{2014--10--14} | |
\newcommand{\myhead}[2][0in]{\foilhead[#1]{\textcolor{BurntOrange}{#2}}} | |
\renewcommand{\labelitemi}{{\color{RoyalPurple}\textbullet}} | |
\renewcommand{\labelitemii}{{\color{RoyalPurple}\normalfont\bfseries \textendash}} | |
\newcommand{\myd}{\mathrm{d}} | |
\newcommand{\mye}{\mathrm{e}} | |
\newcommand{\myparagraph}[1]{\noindent{\color{OliveGreen}\underline{#1\hspace{2cm}}}} | |
\newcommand{\ampeq}[1][=]{&} | |
\newcommand{\myemph}[1]{{\color{Red}\bf #1}} | |
\newcommand{\myvec}[1]{\mbox{\boldmath$#1$}} | |
\definecolor{MyGreen}{rgb}{.3, .5, .4} | |
\newcommand{\normal}{\mathrm{N}} | |
\newcommand{\prob}{\mathrm{Pr}} | |
\newcommand{\iid}{\sim\mathrm{i.i.d.}} | |
\newcommand{\E}{\mathrm{E}} | |
\newcommand{\V}{\mathrm{V}} | |
\renewcommand{\labelenumi}{(\arabic{enumi})} | |
\setlength{\voffset}{-0.7in} | |
\setlength{\textheight}{470pt} | |
\LogoOff | |
\begin{document} | |
\maketitle | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{agenda} | |
\begin{itemize} | |
\item $\S 4.4$ 母集団分散の区間推定 | |
\item $\S 4.5$ 母集団比率の推定 | |
\item $\S 4.6$ 回帰係数の区間推定 | |
\item $\S 4.7$ 母集団相関係数の区間推定 | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{$\S 4.4$ 母集団分散の区間推定} | |
\begin{itemize} | |
\item $X_{i}\ \iid\ \normal (\mu, \sigma^{2}),\ i=1, \ldots, n$ | |
\item $\displaystyle\Rightarrow \chi^{2} := \frac{\sum (X_{i} - \bar{X})^{2}}{\sigma^{2}} = \frac{(n-1)s^2}{\sigma^{2}}\sim \chi^{2}_{n-1} $ | |
\item i.e. $\displaystyle \prob\left\{X^{2}_{L} \leq \frac{(n-1)s^2}{\sigma^{2}} \leq X^{2}_{U}\right\} = 1-\alpha$ | |
\item $\Leftrightarrow \prob\left\{\frac{(n-1)s^2}{X^{2}_{U}} \leq \sigma^{2} \leq \frac{(n-1)s^2}{X^{2}_{L}}\right\} = 1-\alpha$ | |
\end{itemize} | |
但し、$X_{L}, X_{U}$は、自由度$n-1$の$\chi^{2}$分布の下・上側$100 \alpha / 2$\% 点. | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{自由度?なにそれ?おいしいの?} | |
\begin{itemize} | |
\item ピタゴラスの定理 | |
\item 斜め上から見る | |
\item 線形代数大事 | |
\item ホワイトボードでやります | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{平均の性質} | |
\begin{itemize} | |
\item $\displaystyle\min_{a} \sum(x_{i}-a)^{2}$ | |
\item $\displaystyle\frac{\myd}{\myd a}\sum(x_{i}-a)^{2}=\frac{\myd}{\myd a}\sum (x_{i}^{2} - 2ax_{i} + a^{2})=0$ | |
\item $\Rightarrow\displaystyle a=\bar{x}$ | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{$\S$ 4.5母集団比率の推定} | |
\myparagraph{適用場面(5)} | |
\begin{itemize} | |
\item 大都市の全世帯から、1,200世帯を無作為抽出 | |
\item そのうち、$5.4\% = 65/1200$に要介護者がいた | |
\item この都市全体で、$5.4\%$に要介護者が居ると考えて良いか? | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead[-1cm]{復元抽出と非復元抽出} | |
\begin{itemize} | |
\item $R$個の赤球と$B$個の黒球が入った箱。 | |
\begin{itemize} | |
\item $N:=R+B$ | |
\item $p:=R/N$ | |
\end{itemize} | |
\item 箱から球を取り出す事を繰り返す | |
\item 箱から球を取り出して… | |
\begin{itemize} | |
\item 元に戻す→毎回、赤の出る確率は$p$ | |
\item 元に戻さない→赤の出る確率は、変動する | |
\item 1回目が… | |
\begin{itemize} | |
\item 赤→$\frac{R-1}{N-1}$ | |
\item 黒→$\frac{R}{N-1}$ | |
\end{itemize} | |
\end{itemize} | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{こまけぇこたぁいいんだよ} | |
\begin{itemize} | |
\item 大都市→人口数十万人(1,200世帯の数千倍の世帯) | |
\item 一世帯を抽出した後に、戻しても戻さなくても大して変わらない | |
\item $p:=\mbox{その都市の要介護者世帯の割合}$ | |
\item 1,200世帯の抽出全体を通じて、要介護者世帯に当たる | |
確率=$p$としてしまいましょう | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{母集団比率の区間推定} | |
\begin{itemize} | |
\item 1,200回試行して、65回が当たりだった。 | |
\item 一定の確率$p$のくじを$n$回引いて$x$回当たりが出る→2項分布 | |
\item $\E[x]=np, \V[x]=np(1-p)$ | |
\item 2項分布は左右対称なので、割りと小さい$n$でも中心極限定理で充分な近似になる | |
\item $\displaystyle z:=\frac{x/n-p}{\sqrt{p(1-p)/n}}$が、大体$\normal(0,1)$ | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{有限修正} | |
\begin{itemize} | |
\item 母集団の数が小さい or くじを引きまくる場合 | |
\item 超幾何分布 | |
\item $\displaystyle\E[x]=np, \V[x]=\frac{N-n}{N-1}np(1-p)$ | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{連続修正} | |
\begin{itemize} | |
\item 0.5引け | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{回帰係数の推定} | |
\begin{itemize} | |
\item $y_{i} = \alpha + \beta x_{i} + \epsilon_{i}, i=1, \ldots, n, \epsilon_{i}\sim \normal(0,\sigma^{2})$ | |
\item $T_{xy}:=\sum(x_{i}-\bar{x})(y_{i}-\bar{y}), T_{xx}:=\sum(x_{i}-\bar{x})^{2}$ | |
\item $\displaystyle\hat{\beta}=\frac{T_{xy}}{T_{xx}}$ | |
\item $T_{xy}=\sum(x_{i}-\bar{x})y_{i} - \bar{y}\sum(x_{i}-\bar{x}) = \sum(x_{i}-\bar{x})y_{i}$ | |
\item $\sum(x_{i}-\bar{x})=0$だからね | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{回帰係数の推定量の分布} | |
\begin{itemize} | |
\item $\displaystyle\hat{\beta}=\frac{T_{xy}}{T_{xx}}$ (再掲) | |
\item $T_{xx}$は所詮、定数。$T_{xy}$は$y_{i}$の線形結合$\Rightarrow\epsilon_{i}\sim\normal(0,\sigma^{2})$の線形結合。 | |
\item よって、$\hat{\beta}\sim\normal$ | |
\item 計算すると、 | |
\begin{itemize} | |
\item $\E[\hat{\beta}]=\beta$ | |
\item $\V[\hat{\beta}]=\frac{\sigma^{2}}{T_{xx}}$ | |
\end{itemize} | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{残差分散の推定} | |
\begin{itemize} | |
\item $\displaystyle\hat{\sigma^{2}}=s^{2}:=\frac{1}{n-2}\sum(x_{i}-\bar{x})^{2}$ | |
\item $\alpha, \beta$の2つを推定しているので、$n-2$で割れば、不偏推定量。という事だと思います(要出典)。 | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{回帰係数の区間推定} | |
\begin{itemize} | |
\item $\displaystyle\hat{\beta}\sim\normal\left(\beta, \frac{\sigma^{2}}{T_{xx}}\right)\Rightarrow\frac{\hat{\beta}-\beta}{\sqrt{T_{xx}}/\sigma}\sim\normal(0,1)$ | |
\item i.e. $\displaystyle\prob\left\{q_{L}\leq\frac{\hat{\beta}-\beta}{\sqrt{T_{xx}}/\sigma}\leq q_{U}\right\}=1-\alpha$ | |
\item これを$\beta$について整理すれば、信頼区間を作れそうだけど、$\sigma$が未知の値なので、代わりに$\sigma$の推定量$s$を入れる。 | |
\item Studentの$t$-分布。統計学とGuinessビールの関係。 | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{$\S 4.7$ 母集団相関係数の区間推定} | |
\begin{itemize} | |
\item この章の内容はクソ難しいので、黙って結果を暗記しましょう。 | |
\item 標本相関係数を$r$として、$r$の分布は非対称で扱いづらいけど、Fisherの$z$-変換をしたら、対称な分布になるよ。 | |
\item $\displaystyle z:=\frac{1}{2}\log \frac{1+r}{1-r}$ | |
\item 竹内啓, {\bfseries 数理統計学}, 東洋経済新報社, 1963. | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{練習問題 4.1} | |
\begin{enumerate} | |
\item 体重は、右に裾の長い分布なので(たまに体重が100kg超える人とか。凄い人だと、300kgとか)。こういう時は、Gamma分布とかが使われます。 | |
身長は、正規分布でも良いと言われます(体重と違って、身長が3m超える人とかは居ない)。他には、年収とかも、やはり右に裾の長い分布なので、 | |
同様のケアが必要(年収100億とかね)。 | |
\item $n=10$個の和を取ると、分布の歪みは気にしなくて良くなるので、正規分布で近似すれば良い。 | |
\item 上と同じ | |
\end{enumerate} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{練習問題 4.2} | |
\begin{enumerate} | |
\item この場合は、平均の推定値は正規分布になる。 | |
\item この場合は、$t-$分布を使う。 | |
\end{enumerate} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{練習問題 4.3} | |
\begin{enumerate} | |
\item $n=2500$なら正規近似で良い | |
\item $n=25$だと、正規近似じゃ駄目。 | |
\end{enumerate} | |
以下略。すみません。 | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%^ | |
\myhead{参考文献} | |
\begin{itemize} | |
\item 竹内啓, {\bfseries 数理統計学}, 東洋経済新報社, 1963. | |
\item 稲垣宣夫, {\bfseries 数理統計学 改訂版}, 裳華房, 2003. | |
\item 竹村彰通, {\bfseries 現代数理統計学}, 創文社, 1991. | |
\end{itemize} | |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%$ | |
\end{document} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment