Created
July 27, 2013 03:48
-
-
Save kzfm/6093635 to your computer and use it in GitHub Desktop.
今日のコード
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(gridExtra) | |
### | |
set.seed(1) | |
x <-seq(-10, 10, by=0.01) | |
y <- 1 - x ^ 2 + rnorm(length(x), 0, 5) | |
g1 <- ggplot(data.frame(X=x,Y=y), aes(x=X, y=Y))+ geom_point() + geom_smooth(se = FALSE) | |
g2 <- ggplot(data.frame(X=x,Y=y), aes(x=X, y=Y))+ geom_point() + geom_smooth(method='lm', se=FALSE) | |
x.squared <- x ^ 2 | |
g3 <- ggplot(data.frame(X=x.squared,Y=y), aes(x=X, y=Y))+ geom_point() + geom_smooth(method='lm', se = FALSE) | |
summary(lm(y ~ x))$r.squared | |
summary(lm(y ~ x.squared))$r.squared | |
grid.arrange(g1, g2, g3, ncol=2, nrow=2) | |
### | |
set.seed(1) | |
x <- seq(0, 1, by=0.01) | |
y <- sin(2 * pi * x) + rnorm(length(x), 0, 0.1) | |
df <- data.frame(X=x, Y=y) | |
ggplot(df, aes(x=X, y=Y)) + geom_point() | |
ggplot(df, aes(x=X, y=Y)) + geom_point() + geom_smooth(method='lm', se=FALSE) | |
ggplot(df, aes(x=X, y=Y)) + geom_point() + geom_smooth(se=FALSE) | |
poly.fit1 <- lm(Y ~ poly(X, degree = 1), data=df) | |
df1 <- transform(df, PredictedY = predict(poly.fit1)) | |
gg1 <- ggplot(df1, aes(x=X, y=PredictedY)) + geom_point() + geom_line() | |
poly.fit3 <- lm(Y ~ poly(X, degree = 3), data=df) | |
df3 <- transform(df, PredictedY = predict(poly.fit3)) | |
gg3 <- ggplot(df3, aes(x=X, y=PredictedY)) + geom_point() + geom_line() | |
poly.fit5 <- lm(Y ~ poly(X, degree = 5), data=df) | |
df5 <- transform(df, PredictedY = predict(poly.fit5)) | |
gg5 <- ggplot(df5, aes(x=X, y=PredictedY)) + geom_point() + geom_line() | |
poly.fit25 <- lm(Y ~ poly(X, degree = 25), data=df) | |
df25 <- transform(df, PredictedY = predict(poly.fit25)) | |
gg25 <- ggplot(df25, aes(x=X, y=PredictedY)) + geom_point() + geom_line() | |
grid.arrange(gg1, gg3, gg5, gg25, ncol=2, nrow=2) | |
### | |
set.seed(1) | |
x <- seq(0, 1, by=0.01) | |
y <- sin(2 * pi * x) + rnorm(length(x), 0 , 0.1) | |
n <- length(x) | |
indices <- sort(sample(1:n, round(0.5 * n))) | |
training.x <- x[indices] | |
training.y <- y[indices] | |
test.x <- x[-indices] | |
test.y <- y[-indices] | |
training.df <- data.frame(X=training.x, Y=training.y) | |
test.df <- data.frame(X=test.x, Y=test.y) | |
rmse <- function(y, h) { return(sqrt(mean((y-h)^2)))} | |
performance <- data.frame() | |
for (d in 1:12) { | |
poly.fit <- lm(Y ~ poly(X, degree=d), data=training.df) | |
performance <- rbind(performance, | |
data.frame(Degree=d, | |
Data='Training', | |
RMSE=rmse(training.y, predict(poly.fit)))) | |
performance <- rbind(performance, | |
data.frame(Degree=d, | |
Data='Test', | |
RMSE=rmse(test.y, predict(poly.fit, newdata=test.df)))) | |
} | |
ggplot(performance, aes(x=Degree, y=RMSE, linetype=Data)) + geom_point() + geom_line() | |
### 6.2.1 | |
set.seed(1) | |
x <- seq(0, 1, by=0.01) | |
y <- sin(2 * pi * x) + rnorm(length(x), 0 , 0.1) | |
x <- matrix(x) | |
library(glmnet) | |
glmnet(x, y) | |
n <- length(x) | |
indices <- sort(sample(1:n, round(0.5 * n))) | |
training.x <- x[indices] | |
training.y <- y[indices] | |
test.x <- x[-indices] | |
test.y <- y[-indices] | |
training.df <- data.frame(X=training.x, Y=training.y) | |
test.df <- data.frame(X=test.x, Y=test.y) | |
glmnet.fit <- with(training.df, glmnet(poly(X, degree=10), Y)) | |
lambdas <- glmnet.fit$lambda | |
performance <- data.frame() | |
for (lambda in lambdas) { | |
performance <- rbind(performance, | |
data.frame(Lambda=lambda, | |
RMSE=rmse(test.y, with(test.df, predict(glmnet.fit, | |
poly(X, | |
degree=10),s=lambda))))) | |
} | |
ggplot(performance, aes(x=Lambda, y=RMSE)) + geom_point() + geom_line() | |
best.lambda <- with(performance, Lambda[which(RMSE == min(RMSE))]) | |
glmnet.fit <- with(df, glmnet(poly(X, degree=10), Y)) | |
coef(glmnet.fit, s=best.lambda) | |
### | |
# setwd("/Users//kzfm/lang/rcode/ML_for_Hackers//06-Regularization") | |
ranks <- read.csv("data/oreilly.csv", stringsAsFactors = FALSE) | |
library(tm) | |
documents <- data.frame(Text = ranks$Long.Desc.) | |
row.names(documents) <- 1:nrow(documents) | |
corpus <- Corpus(DataframeSource(documents)) | |
corpus <- tm_map(corpus, tolower) | |
corpus <- tm_map(corpus, stripWhitespace) | |
corpus <- tm_map(corpus, removeWords, stopwords('english')) | |
dtm <- DocumentTermMatrix(corpus) | |
x <- as.matrix(dtm) | |
y <- rev(1:100) | |
set.seed(1) | |
library(glmnet) | |
performance <- data.frame() | |
for (lambda in c(0.1, 0.25, 0.5, 1, 2, 5)) | |
{ | |
for (i in 1:50) | |
{ | |
indices <- sample(1:100, 80) | |
training.x <- x[indices, ] | |
training.y <- y[indices] | |
test.x <- x[-indices, ] | |
test.y <- y[-indices] | |
glm.fit <- glmnet(training.x, training.y) | |
predicted.y <- predict(glm.fit, test.x, s = lambda) | |
rmse <- sqrt(mean((predicted.y - test.y) ^ 2)) | |
performance <- rbind(performance, | |
data.frame(Lambda = lambda, | |
Iteration = i, | |
RMSE = rmse)) | |
} | |
} | |
ggplot(performance, aes(x = Lambda, y = RMSE)) + | |
stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar') + | |
stat_summary(fun.data = 'mean_cl_boot', geom = 'point') | |
### | |
y <- rep(c(1, 0), each = 50) | |
regularized.fit <- glmnet(x, y, family = 'binomial') | |
set.seed(1) | |
performance <- data.frame() | |
for (i in 1:250) | |
{ | |
indices <- sample(1:100, 80) | |
training.x <- x[indices, ] | |
training.y <- y[indices] | |
test.x <- x[-indices, ] | |
test.y <- y[-indices] | |
for (lambda in c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.5, 0.1)) | |
{ | |
glm.fit <- glmnet(training.x, training.y, family = 'binomial') | |
predicted.y <- ifelse(predict(glm.fit, test.x, s = lambda) > 0, 1, 0) | |
error.rate <- mean(predicted.y != test.y) | |
performance <- rbind(performance, | |
data.frame(Lambda = lambda, | |
Iteration = i, | |
ErrorRate = error.rate)) | |
} | |
} | |
ggplot(performance, aes(x = Lambda, y = ErrorRate)) + | |
stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar') + | |
stat_summary(fun.data = 'mean_cl_boot', geom = 'point') + | |
scale_x_log10() | |
# chapter 7 | |
heights.weights <- read.csv("data/01_heights_weights_genders.csv") | |
height.to.weight <- function(height, a, b){ return(a + b * height)} | |
ridge.error <- function(heights.weights, a, b, lambda){ | |
predictions <- with(heights.weights, height.to.weight(Height, a, b)) | |
errors <- with(heights.weights, Weight - predictions) | |
return(sum(errors ^ 2) + lambda * (a^2+b^2)) | |
} | |
lambda <- 1 | |
optim(c(0,0), | |
function (x) {ridge.error(heights.weights, x[1], x[2], lambda)} | |
) | |
#$par | |
#[1] -340.434108 7.562524 | |
lm(Weight ~ Height, data=heights.weights) | |
#Call: | |
# lm(formula = Weight ~ Height, data = heights.weights) | |
# | |
#Coefficients: | |
# (Intercept) Height | |
#-350.737 7.717 | |
a.ridge.error <- function(a, lambda) {return(ridge.error(heights.weights, a, 0, lambda))} | |
curve(sapply(x, function (a) {a.ridge.error(a, lambda)}), from = -1000, to = 1000) | |
b.ridge.error <- function(b, lambda) {return(ridge.error(heights.weights, 0, b, lambda))} | |
curve(sapply(x, function (b) {b.ridge.error(b, lambda)}), from = -1000, to = 1000) | |
### copy & paste :-) | |
english.letters <- c('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', | |
'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', | |
'w', 'x', 'y', 'z') | |
caesar.cipher <- list() | |
inverse.caesar.cipher <- list() | |
for (index in 1:length(english.letters)) | |
{ | |
caesar.cipher[[english.letters[index]]] <- english.letters[index %% 26 + 1] | |
inverse.caesar.cipher[[english.letters[index %% 26 + 1]]] <- english.letters[index] | |
} | |
print(caesar.cipher) | |
# Fifteenth code snippet | |
apply.cipher.to.string <- function(string, cipher) | |
{ | |
output <- '' | |
for (i in 1:nchar(string)) | |
{ | |
output <- paste(output, cipher[[substr(string, i, i)]], sep = '') | |
} | |
return(output) | |
} | |
apply.cipher.to.text <- function(text, cipher) | |
{ | |
output <- c() | |
for (string in text) | |
{ | |
output <- c(output, apply.cipher.to.string(string, cipher)) | |
} | |
return(output) | |
} | |
apply.cipher.to.text(c('sample', 'text'), caesar.cipher) | |
# Sixteenth code snippet | |
generate.random.cipher <- function() | |
{ | |
cipher <- list() | |
inputs <- english.letters | |
outputs <- english.letters[sample(1:length(english.letters), length(english.letters))] | |
for (index in 1:length(english.letters)) | |
{ | |
cipher[[inputs[index]]] <- outputs[index] | |
} | |
return(cipher) | |
} | |
modify.cipher <- function(cipher, input, output) | |
{ | |
new.cipher <- cipher | |
new.cipher[[input]] <- output | |
old.output <- cipher[[input]] | |
collateral.input <- names(which(sapply(names(cipher), | |
function (key) {cipher[[key]]}) == output)) | |
new.cipher[[collateral.input]] <- old.output | |
return(new.cipher) | |
} | |
propose.modified.cipher <- function(cipher) | |
{ | |
input <- sample(names(cipher), 1) | |
output <- sample(english.letters, 1) | |
return(modify.cipher(cipher, input, output)) | |
} | |
# Seventeenth code snippet | |
load(file.path('data', 'lexical_database.Rdata')) | |
# Eighteength code snippet | |
lexical.database[['a']] | |
lexical.database[['the']] | |
lexical.database[['he']] | |
lexical.database[['she']] | |
lexical.database[['data']] | |
# Nineteenth code snippet | |
one.gram.probability <- function(one.gram, lexical.database = list()) | |
{ | |
lexical.probability <- lexical.database[[one.gram]] | |
if (is.null(lexical.probability) || is.na(lexical.probability)) | |
{ | |
return(.Machine$double.eps) | |
} | |
else | |
{ | |
return(lexical.probability) | |
} | |
} | |
# Twentieth code snippet | |
log.probability.of.text <- function(text, cipher, lexical.database = list()) | |
{ | |
log.probability <- 0.0 | |
for (string in text) | |
{ | |
decrypted.string <- apply.cipher.to.string(string, cipher) | |
log.probability <- log.probability + | |
log(one.gram.probability(decrypted.string, lexical.database)) | |
} | |
return(log.probability) | |
} | |
# Twenty-first code snippet | |
metropolis.step <- function(text, cipher, lexical.database = list()) | |
{ | |
proposed.cipher <- propose.modified.cipher(cipher) | |
lp1 <- log.probability.of.text(text, cipher, lexical.database) | |
lp2 <- log.probability.of.text(text, proposed.cipher, lexical.database) | |
if (lp2 > lp1) | |
{ | |
return(proposed.cipher) | |
} | |
else | |
{ | |
a <- exp(lp2 - lp1) | |
x <- runif(1) | |
if (x < a) | |
{ | |
return(proposed.cipher) | |
} | |
else | |
{ | |
return(cipher) | |
} | |
} | |
} | |
# Twenty-second code snippet | |
decrypted.text <- c('here', 'is', 'some', 'sample', 'text') | |
# Twenty-third code snippet | |
encrypted.text <- apply.cipher.to.text(decrypted.text, caesar.cipher) | |
# Twenty-fourth code snippet | |
set.seed(1) | |
cipher <- generate.random.cipher() | |
results <- data.frame() | |
number.of.iterations <- 50000 | |
for (iteration in 1:number.of.iterations) | |
{ | |
log.probability <- log.probability.of.text(encrypted.text, | |
cipher, | |
lexical.database) | |
current.decrypted.text <- paste(apply.cipher.to.text(encrypted.text, | |
cipher), | |
collapse = ' ') | |
correct.text <- as.numeric(current.decrypted.text == paste(decrypted.text, | |
collapse = ' ')) | |
results <- rbind(results, | |
data.frame(Iteration = iteration, | |
LogProbability = log.probability, | |
CurrentDecryptedText = current.decrypted.text, | |
CorrectText = correct.text)) | |
cipher <- metropolis.step(encrypted.text, cipher, lexical.database) | |
} | |
write.table(results, | |
file = file.path('data/results.tsv'), | |
row.names = FALSE, | |
sep = '\t') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment