whilo/gist:ed2db43d8bc0aa7849bb

## gistfile1.r

click1 <- function(){
    x <- runif(1);y <- runif(1)
    plot(x = x, y = y, xlim = c(0, 1), ylim = c(0, 1),
         main = "Bitte auf den Punkt klicken",
         xlab = '', ylab = '',
         axes = FALSE, frame.plot = TRUE)
    clicktime <- system.time(xyclick <- locator(1))
    c(timestamp = Sys.time(),
      x = x, y = y,
      xclick = xyclick$x, yclick = xyclick$y,
      tclick = clicktime[3])
}

# 1 click wegwerfen, 4-5 einfuehrungsclicks
# lm modell, laufende nummer des clicks muss aufgenommen werden

# besserer vergleich als qqplot ist linearisieren
# vergleich gerade
click <- function(count) {
    # 5 einfuehrungsklicks
    initClicks <- Map(function(i) {c(index=i, click1())}, 0:5)

    #clicks <- Map(function(i) { c(i, click1())}, 1:count)
    clicks <- list()
    lastClick <- tail(initClicks,1)[[1]] # truely elegant ...
    prevX <- lastClick["x"]
    prevY <- lastClick["y"]
    for (i in 1:count) {
        click <- c(index=i, click1())
        xclick <- click["xclick"]
        yclick <- click["yclick"]
        click <- c(click, dist=(click["x"] - xclick)**2 + (click["y"] - yclick)**2)
        click <- c(click, way=(xclick-prevX)**2 + (yclick - prevY)**2)
        clicks <- append(clicks, list(click))

        prevX <<- xclick
        prevY <<- yclick
    }
    clicks <- do.call(rbind.data.frame, clicks)
    colnames(clicks) <- c("index", "timestamp", "x", "y", "xclick", "yclick", "tclick", "dist", "way")
    clicks
}

# wie viele clicks sinnvoll fuer lm? ermuedung, test auf signifikanz?
# rerun experiment
write.table(click(20), "click20r")
write.table(click(20), "click20l")

write.table(click(20), "click20r2")
write.table(click(20), "click20l2")

# read stored data
right <- rbind(read.table("click20r"), read.table("click20r2"))
left <- rbind(read.table("click20l"), read.table("click20l2"))

# not what is difference between left and right?

# 0-hypothesis: are identical distributed

# alternative
# shift? in linear model
# x_i = mu + e_i  i=1,...,n1=n_x
# y_i = mu + a + e_i i=1,.., n_2=n_y
# t-test detects shift alternative

# alternative
# different distribution function
# KS test between ecdfs, continuous cdfs necessary (?)


# appending of lists and dataframes
stack(list(left=left$tclick, right=right$tclick))

# idea, compare time / way, should be normal distributed? t-test for significance

?sample
sample(1:12) # permutation

wilcox.test( right$tclick, left$tclick ) # stetigkeitskorrektur

require(coin)

wilcox_test(right$tclick, left$tclick )


hist(right$tclick / right$way)

# empirical cdf, shift between left and right clearly visible
plot(ecdf(left$tclick), col="green")
lines(ecdf(right$tclick), col="red")
legend(x=1,y=1,legend=c("left", "right"), fill=c("green", "red"))

ks.test(left$tclick, right$tclick)
# p-value: 0.001 (that dists are the same)


# t-test:
# empirical estimator for mean of two groups
# simple case, both same variance, estimate variance
t.test(left$tclick, right$tclick)

t.test(left$tclick, right$tclick, var.equal=TRUE)
# more degrees of freedom

# linear model: (only if normal distributed)


# qqplot comparison of time to click
qqplot(right$tclick, left$tclick)

ppplot(right$tclick, distf=rnorm)


rlm <- lm(formula= right$tclick ~ right$way + right$index + right$dist)
llm <- lm(formula= left$tclick ~ left$way + left$index + left$dist)

require(graphics)

par(mfrow = c(2,2))
plot(rlm)

par(mfrow = c(2,2))
plot(llm)

# monte carlo confidence bounds for theoretical distribution


# qqplot 300 beobachtungen

# genauigkeit, laenge des weges, anzahl der clicks


unif50 <- runif(50)
unif100 <- runif(100)
norm50 <- rnorm(50)
norm100 <- rnorm(100)
lognorm50 <- exp(rnorm(50))
lognorm100 <- exp( rnorm(100))

oldpar <- par(mfrow = c(2, 3))
plot(ecdf(unif50), pch = "[")
plot(ecdf(norm50), pch = "[")
plot(ecdf(lognorm50), pch = "[")
plot(ecdf(unif100), pch = "[")
plot(ecdf(norm100), pch = "[")
plot(ecdf(lognorm100), pch = "[")
par(oldpar)

oldpar <- par(mfrow = c(2, 3))
qqnorm(unif50, main = "Normal Q-Q unif50")
qqnorm(norm50, main = "Normal Q-Q norm50")
qqnorm(lognorm50, main = "Normal Q-Q lognorm50")
qqnorm(unif100, main = "Normal Q-Q unif100")
qqnorm(norm100, main = "Normal Q-Q norm100")
qqnorm(lognorm100, main = "Normal Q-Q lognorm100")
par(oldpar)


tq99 <- function(n, df=1) {
    t100 <- rt(n, df)
    q <- quantile(t100, probs = seq(0.01, 0.99, 0.98))
    st100 <- sort(t100)
    ust100 <- st100[q["1%"]<st100]
    cst100 <- ust100[q["99%"]>ust100]
    cst100
}


# TODO why 19? Aufgabe 1.10: abstract P(T(X0) > T(Xi))
ppplot <- function(x, y=NULL, samps = 19, distf=rnorm) {
    if(is.null(y)) {
        y <- sort(distf(length(x))) #(1:length(x))/length(x)
    } else {
        y <- sort(y)
    }
    plot(sort(x), y, xlab = substitute(x), ylab = expression(F[n]),
         main = "Verteilungsfunktion mit Monte-Carlo-Band",
         type = "s")
    mtext(paste(samps, "Monte-Carlo-Stichproben"), side = 3)
    samples <- matrix(distf(length(x)* samps), nrow = length(x), ncol = samps)
    samples <- apply(samples, 2, sort)
    envelope <- t(apply(samples, 1, range))
    lines(envelope[, 1], y, type = "s", col = "red");
    lines(envelope[, 2], y, type = "s", col = "red")
}


ppplot(norm100, tq99(100))

ppplot(rnorm(300), rnorm(300))

ks.test(rnorm(300), runif(300))


# chisq vs. ks
# https://stackoverflow.com/questions/21655653/r-chi-square-goodness-of-fit-for-random-numbers-generated

set.seed(1)
df <- data.frame(y = rexp(1000),
                 z = rcauchy(1000, 100, 100))

ks.test(df$y,"pexp")
# One-sample Kolmogorov-Smirnov test
#
# data:  df$y
# D = 0.0387, p-value = 0.1001
# alternative hypothesis: two-sided

ks.test(df$z,"pcauchy",100,100)
#  One-sample Kolmogorov-Smirnov test
#
# data:  df$z
# D = 0.0296, p-value = 0.3455
# alternative hypothesis: two-sided

breaks <- c(seq(0,10,by=1))
O <- table(cut(df$y,breaks=breaks))
p <- diff(pexp(breaks))
chisq.test(O,p=p, rescale.p=T)

#   Chi-squared test for given probabilities
#
# data:  O
# X-squared = 7.9911, df = 9, p-value = 0.535


par(mfrow=c(1,2))
plot(qexp(seq(0,1,0.01)),quantile(df$y,seq(0,1,0.01)),
     main="Q-Q Plot",ylab="df$Y", xlab="Exponential",
     xlim=c(0,5),ylim=c(0,5))
plot(qcauchy(seq(0,.99,0.01),100,100),quantile(df$z,seq(0,.99,0.01)),
     main="Q-Q Plot",ylab="df$Z",xlab="Cauchy",
     xlim=c(-1000,1000),ylim=c(-1000,1000))


# lm

x <- 1:100
err <- rnorm(100, mean = 0, sd = 10)
y <- 2.5*x + err

lm(y ~ x)

lmres <- lm(y ~ x)
plot(x, y)
abline(lmres)


attach(hills)
scotlm<-lm(time ~ dist + climb)
attach(scotlm)


oldpar <- par(mfrow=c(1,2))
plot(fitted.values, residuals,xlim=c(0,200), main="Scottish Hill Races", pch=19, cex.lab = 1.5)
text.id(fitted.values, residuals, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
sres<-stdres(scotlm)
plot(fitted.values,sres,xlim=c(0,200), main="Scottish Hill Races",
ylab="MASS::stdres", pch=19, cex.lab = 1.5)
text.id(fitted.values,sres, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
par(oldpar)


oldpar <- par(mfrow=c(1,2))

scotdffits <-dffits(scotlm)

plot(fitted.values, scotdffits,ylab="DFFITS",xlim=c(0,200),
main="Scottish Hill Races", pch=19, cex.lab = 1.5)
text.id(fitted.values, scotdffits, c(7,11, 18,31,33,35), row.names(hills), cex.id=1.2)
scotcooks <-cooks.distance(scotlm)
plot(fitted.values, scotcooks,ylab="Cooks",xlim=c(0,200),
main="Scottish Hill Races", pch=19, cex.lab = 1.5)
text.id(fitted.values, scotcooks, c(7,11, 18), row.names(hills), cex.id=1.2)
par(oldpar)


library(forward)
fwdhills <- fwdlm(time ~dist + climb, data=hills)


summary(hills[,1:3])


# Guete Simulation p.126
# task view
foo <- Map(function(e) { t.test(exp(rnorm(1000)), exp(rnorm(1000)))$p.value }, 1:1000)


length(foo[foo<0.05])

	click1 <- function(){
	x <- runif(1);y <- runif(1)
	plot(x = x, y = y, xlim = c(0, 1), ylim = c(0, 1),
	main = "Bitte auf den Punkt klicken",
	xlab = '', ylab = '',
	axes = FALSE, frame.plot = TRUE)
	clicktime <- system.time(xyclick <- locator(1))
	c(timestamp = Sys.time(),
	x = x, y = y,
	xclick = xyclick$x, yclick = xyclick$y,
	tclick = clicktime[3])
	}

	# 1 click wegwerfen, 4-5 einfuehrungsclicks
	# lm modell, laufende nummer des clicks muss aufgenommen werden

	# besserer vergleich als qqplot ist linearisieren
	# vergleich gerade
	click <- function(count) {
	# 5 einfuehrungsklicks
	initClicks <- Map(function(i) {c(index=i, click1())}, 0:5)

	#clicks <- Map(function(i) { c(i, click1())}, 1:count)
	clicks <- list()
	lastClick <- tail(initClicks,1)[[1]] # truely elegant ...
	prevX <- lastClick["x"]
	prevY <- lastClick["y"]
	for (i in 1:count) {
	click <- c(index=i, click1())
	xclick <- click["xclick"]
	yclick <- click["yclick"]
	click <- c(click, dist=(click["x"] - xclick)2 + (click["y"] - yclick)2)
	click <- c(click, way=(xclick-prevX)2 + (yclick - prevY)2)
	clicks <- append(clicks, list(click))

	prevX <<- xclick
	prevY <<- yclick
	}
	clicks <- do.call(rbind.data.frame, clicks)
	colnames(clicks) <- c("index", "timestamp", "x", "y", "xclick", "yclick", "tclick", "dist", "way")
	clicks
	}

	# wie viele clicks sinnvoll fuer lm? ermuedung, test auf signifikanz?
	# rerun experiment
	write.table(click(20), "click20r")
	write.table(click(20), "click20l")

	write.table(click(20), "click20r2")
	write.table(click(20), "click20l2")

	# read stored data
	right <- rbind(read.table("click20r"), read.table("click20r2"))
	left <- rbind(read.table("click20l"), read.table("click20l2"))

	# not what is difference between left and right?

	# 0-hypothesis: are identical distributed

	# alternative
	# shift? in linear model
	# x_i = mu + e_i i=1,...,n1=n_x
	# y_i = mu + a + e_i i=1,.., n_2=n_y
	# t-test detects shift alternative

	# alternative
	# different distribution function
	# KS test between ecdfs, continuous cdfs necessary (?)


	# appending of lists and dataframes
	stack(list(left=left$tclick, right=right$tclick))

	# idea, compare time / way, should be normal distributed? t-test for significance

	?sample
	sample(1:12) # permutation

	wilcox.test( right$tclick, left$tclick ) # stetigkeitskorrektur

	require(coin)

	wilcox_test(right$tclick, left$tclick )



	hist(right$tclick / right$way)

	# empirical cdf, shift between left and right clearly visible
	plot(ecdf(left$tclick), col="green")
	lines(ecdf(right$tclick), col="red")
	legend(x=1,y=1,legend=c("left", "right"), fill=c("green", "red"))

	ks.test(left$tclick, right$tclick)
	# p-value: 0.001 (that dists are the same)


	# t-test:
	# empirical estimator for mean of two groups
	# simple case, both same variance, estimate variance
	t.test(left$tclick, right$tclick)

	t.test(left$tclick, right$tclick, var.equal=TRUE)
	# more degrees of freedom

	# linear model: (only if normal distributed)


	# qqplot comparison of time to click
	qqplot(right$tclick, left$tclick)

	ppplot(right$tclick, distf=rnorm)



	rlm <- lm(formula= right$tclick ~ right$way + right$index + right$dist)
	llm <- lm(formula= left$tclick ~ left$way + left$index + left$dist)

	require(graphics)

	par(mfrow = c(2,2))
	plot(rlm)

	par(mfrow = c(2,2))
	plot(llm)

	# monte carlo confidence bounds for theoretical distribution


	# qqplot 300 beobachtungen

	# genauigkeit, laenge des weges, anzahl der clicks


	unif50 <- runif(50)
	unif100 <- runif(100)
	norm50 <- rnorm(50)
	norm100 <- rnorm(100)
	lognorm50 <- exp(rnorm(50))
	lognorm100 <- exp( rnorm(100))

	oldpar <- par(mfrow = c(2, 3))
	plot(ecdf(unif50), pch = "[")
	plot(ecdf(norm50), pch = "[")
	plot(ecdf(lognorm50), pch = "[")
	plot(ecdf(unif100), pch = "[")
	plot(ecdf(norm100), pch = "[")
	plot(ecdf(lognorm100), pch = "[")
	par(oldpar)

	oldpar <- par(mfrow = c(2, 3))
	qqnorm(unif50, main = "Normal Q-Q unif50")
	qqnorm(norm50, main = "Normal Q-Q norm50")
	qqnorm(lognorm50, main = "Normal Q-Q lognorm50")
	qqnorm(unif100, main = "Normal Q-Q unif100")
	qqnorm(norm100, main = "Normal Q-Q norm100")
	qqnorm(lognorm100, main = "Normal Q-Q lognorm100")
	par(oldpar)




	tq99 <- function(n, df=1) {
	t100 <- rt(n, df)
	q <- quantile(t100, probs = seq(0.01, 0.99, 0.98))
	st100 <- sort(t100)
	ust100 <- st100[q["1%"]<st100]
	cst100 <- ust100[q["99%"]>ust100]
	cst100
	}




	# TODO why 19? Aufgabe 1.10: abstract P(T(X0) > T(Xi))
	ppplot <- function(x, y=NULL, samps = 19, distf=rnorm) {
	if(is.null(y)) {
	y <- sort(distf(length(x))) #(1:length(x))/length(x)
	} else {
	y <- sort(y)
	}
	plot(sort(x), y, xlab = substitute(x), ylab = expression(F[n]),
	main = "Verteilungsfunktion mit Monte-Carlo-Band",
	type = "s")
	mtext(paste(samps, "Monte-Carlo-Stichproben"), side = 3)
	samples <- matrix(distf(length(x)* samps), nrow = length(x), ncol = samps)
	samples <- apply(samples, 2, sort)
	envelope <- t(apply(samples, 1, range))
	lines(envelope[, 1], y, type = "s", col = "red");
	lines(envelope[, 2], y, type = "s", col = "red")
	}



	ppplot(norm100, tq99(100))

	ppplot(rnorm(300), rnorm(300))

	ks.test(rnorm(300), runif(300))














	# chisq vs. ks
	# https://stackoverflow.com/questions/21655653/r-chi-square-goodness-of-fit-for-random-numbers-generated

	set.seed(1)
	df <- data.frame(y = rexp(1000),
	z = rcauchy(1000, 100, 100))

	ks.test(df$y,"pexp")
	# One-sample Kolmogorov-Smirnov test
	#
	# data: df$y
	# D = 0.0387, p-value = 0.1001
	# alternative hypothesis: two-sided

	ks.test(df$z,"pcauchy",100,100)
	# One-sample Kolmogorov-Smirnov test
	#
	# data: df$z
	# D = 0.0296, p-value = 0.3455
	# alternative hypothesis: two-sided

	breaks <- c(seq(0,10,by=1))
	O <- table(cut(df$y,breaks=breaks))
	p <- diff(pexp(breaks))
	chisq.test(O,p=p, rescale.p=T)

	# Chi-squared test for given probabilities
	#
	# data: O
	# X-squared = 7.9911, df = 9, p-value = 0.535


	par(mfrow=c(1,2))
	plot(qexp(seq(0,1,0.01)),quantile(df$y,seq(0,1,0.01)),
	main="Q-Q Plot",ylab="df$Y", xlab="Exponential",
	xlim=c(0,5),ylim=c(0,5))
	plot(qcauchy(seq(0,.99,0.01),100,100),quantile(df$z,seq(0,.99,0.01)),
	main="Q-Q Plot",ylab="df$Z",xlab="Cauchy",
	xlim=c(-1000,1000),ylim=c(-1000,1000))


	# lm

	x <- 1:100
	err <- rnorm(100, mean = 0, sd = 10)
	y <- 2.5*x + err

	lm(y ~ x)

	lmres <- lm(y ~ x)
	plot(x, y)
	abline(lmres)



	attach(hills)
	scotlm<-lm(time ~ dist + climb)
	attach(scotlm)


	oldpar <- par(mfrow=c(1,2))
	plot(fitted.values, residuals,xlim=c(0,200), main="Scottish Hill Races", pch=19, cex.lab = 1.5)
	text.id(fitted.values, residuals, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
	sres<-stdres(scotlm)
	plot(fitted.values,sres,xlim=c(0,200), main="Scottish Hill Races",
	ylab="MASS::stdres", pch=19, cex.lab = 1.5)
	text.id(fitted.values,sres, c(7,11,18,31,33,35, 16,14), row.names(hills), cex.id=1.0)
	par(oldpar)


	oldpar <- par(mfrow=c(1,2))

	scotdffits <-dffits(scotlm)

	plot(fitted.values, scotdffits,ylab="DFFITS",xlim=c(0,200),
	main="Scottish Hill Races", pch=19, cex.lab = 1.5)
	text.id(fitted.values, scotdffits, c(7,11, 18,31,33,35), row.names(hills), cex.id=1.2)
	scotcooks <-cooks.distance(scotlm)
	plot(fitted.values, scotcooks,ylab="Cooks",xlim=c(0,200),
	main="Scottish Hill Races", pch=19, cex.lab = 1.5)
	text.id(fitted.values, scotcooks, c(7,11, 18), row.names(hills), cex.id=1.2)
	par(oldpar)


	library(forward)
	fwdhills <- fwdlm(time ~dist + climb, data=hills)


	summary(hills[,1:3])


	# Guete Simulation p.126
	# task view
	foo <- Map(function(e) { t.test(exp(rnorm(1000)), exp(rnorm(1000)))$p.value }, 1:1000)


	length(foo[foo<0.05])