Instantly share code, notes, and snippets.

@kzfm /ML130727
Created Jul 27, 2013

Embed
What would you like to do?
今日のコード
library(ggplot2)
library(gridExtra)
###
set.seed(1)
x <-seq(-10, 10, by=0.01)
y <- 1 - x ^ 2 + rnorm(length(x), 0, 5)
g1 <- ggplot(data.frame(X=x,Y=y), aes(x=X, y=Y))+ geom_point() + geom_smooth(se = FALSE)
g2 <- ggplot(data.frame(X=x,Y=y), aes(x=X, y=Y))+ geom_point() + geom_smooth(method='lm', se=FALSE)
x.squared <- x ^ 2
g3 <- ggplot(data.frame(X=x.squared,Y=y), aes(x=X, y=Y))+ geom_point() + geom_smooth(method='lm', se = FALSE)
summary(lm(y ~ x))$r.squared
summary(lm(y ~ x.squared))$r.squared
grid.arrange(g1, g2, g3, ncol=2, nrow=2)
###
set.seed(1)
x <- seq(0, 1, by=0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0, 0.1)
df <- data.frame(X=x, Y=y)
ggplot(df, aes(x=X, y=Y)) + geom_point()
ggplot(df, aes(x=X, y=Y)) + geom_point() + geom_smooth(method='lm', se=FALSE)
ggplot(df, aes(x=X, y=Y)) + geom_point() + geom_smooth(se=FALSE)
poly.fit1 <- lm(Y ~ poly(X, degree = 1), data=df)
df1 <- transform(df, PredictedY = predict(poly.fit1))
gg1 <- ggplot(df1, aes(x=X, y=PredictedY)) + geom_point() + geom_line()
poly.fit3 <- lm(Y ~ poly(X, degree = 3), data=df)
df3 <- transform(df, PredictedY = predict(poly.fit3))
gg3 <- ggplot(df3, aes(x=X, y=PredictedY)) + geom_point() + geom_line()
poly.fit5 <- lm(Y ~ poly(X, degree = 5), data=df)
df5 <- transform(df, PredictedY = predict(poly.fit5))
gg5 <- ggplot(df5, aes(x=X, y=PredictedY)) + geom_point() + geom_line()
poly.fit25 <- lm(Y ~ poly(X, degree = 25), data=df)
df25 <- transform(df, PredictedY = predict(poly.fit25))
gg25 <- ggplot(df25, aes(x=X, y=PredictedY)) + geom_point() + geom_line()
grid.arrange(gg1, gg3, gg5, gg25, ncol=2, nrow=2)
###
set.seed(1)
x <- seq(0, 1, by=0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0 , 0.1)
n <- length(x)
indices <- sort(sample(1:n, round(0.5 * n)))
training.x <- x[indices]
training.y <- y[indices]
test.x <- x[-indices]
test.y <- y[-indices]
training.df <- data.frame(X=training.x, Y=training.y)
test.df <- data.frame(X=test.x, Y=test.y)
rmse <- function(y, h) { return(sqrt(mean((y-h)^2)))}
performance <- data.frame()
for (d in 1:12) {
poly.fit <- lm(Y ~ poly(X, degree=d), data=training.df)
performance <- rbind(performance,
data.frame(Degree=d,
Data='Training',
RMSE=rmse(training.y, predict(poly.fit))))
performance <- rbind(performance,
data.frame(Degree=d,
Data='Test',
RMSE=rmse(test.y, predict(poly.fit, newdata=test.df))))
}
ggplot(performance, aes(x=Degree, y=RMSE, linetype=Data)) + geom_point() + geom_line()
### 6.2.1
set.seed(1)
x <- seq(0, 1, by=0.01)
y <- sin(2 * pi * x) + rnorm(length(x), 0 , 0.1)
x <- matrix(x)
library(glmnet)
glmnet(x, y)
n <- length(x)
indices <- sort(sample(1:n, round(0.5 * n)))
training.x <- x[indices]
training.y <- y[indices]
test.x <- x[-indices]
test.y <- y[-indices]
training.df <- data.frame(X=training.x, Y=training.y)
test.df <- data.frame(X=test.x, Y=test.y)
glmnet.fit <- with(training.df, glmnet(poly(X, degree=10), Y))
lambdas <- glmnet.fit$lambda
performance <- data.frame()
for (lambda in lambdas) {
performance <- rbind(performance,
data.frame(Lambda=lambda,
RMSE=rmse(test.y, with(test.df, predict(glmnet.fit,
poly(X,
degree=10),s=lambda)))))
}
ggplot(performance, aes(x=Lambda, y=RMSE)) + geom_point() + geom_line()
best.lambda <- with(performance, Lambda[which(RMSE == min(RMSE))])
glmnet.fit <- with(df, glmnet(poly(X, degree=10), Y))
coef(glmnet.fit, s=best.lambda)
###
# setwd("/Users//kzfm/lang/rcode/ML_for_Hackers//06-Regularization")
ranks <- read.csv("data/oreilly.csv", stringsAsFactors = FALSE)
library(tm)
documents <- data.frame(Text = ranks$Long.Desc.)
row.names(documents) <- 1:nrow(documents)
corpus <- Corpus(DataframeSource(documents))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
dtm <- DocumentTermMatrix(corpus)
x <- as.matrix(dtm)
y <- rev(1:100)
set.seed(1)
library(glmnet)
performance <- data.frame()
for (lambda in c(0.1, 0.25, 0.5, 1, 2, 5))
{
for (i in 1:50)
{
indices <- sample(1:100, 80)
training.x <- x[indices, ]
training.y <- y[indices]
test.x <- x[-indices, ]
test.y <- y[-indices]
glm.fit <- glmnet(training.x, training.y)
predicted.y <- predict(glm.fit, test.x, s = lambda)
rmse <- sqrt(mean((predicted.y - test.y) ^ 2))
performance <- rbind(performance,
data.frame(Lambda = lambda,
Iteration = i,
RMSE = rmse))
}
}
ggplot(performance, aes(x = Lambda, y = RMSE)) +
stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar') +
stat_summary(fun.data = 'mean_cl_boot', geom = 'point')
###
y <- rep(c(1, 0), each = 50)
regularized.fit <- glmnet(x, y, family = 'binomial')
set.seed(1)
performance <- data.frame()
for (i in 1:250)
{
indices <- sample(1:100, 80)
training.x <- x[indices, ]
training.y <- y[indices]
test.x <- x[-indices, ]
test.y <- y[-indices]
for (lambda in c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.5, 0.1))
{
glm.fit <- glmnet(training.x, training.y, family = 'binomial')
predicted.y <- ifelse(predict(glm.fit, test.x, s = lambda) > 0, 1, 0)
error.rate <- mean(predicted.y != test.y)
performance <- rbind(performance,
data.frame(Lambda = lambda,
Iteration = i,
ErrorRate = error.rate))
}
}
ggplot(performance, aes(x = Lambda, y = ErrorRate)) +
stat_summary(fun.data = 'mean_cl_boot', geom = 'errorbar') +
stat_summary(fun.data = 'mean_cl_boot', geom = 'point') +
scale_x_log10()
# chapter 7
heights.weights <- read.csv("data/01_heights_weights_genders.csv")
height.to.weight <- function(height, a, b){ return(a + b * height)}
ridge.error <- function(heights.weights, a, b, lambda){
predictions <- with(heights.weights, height.to.weight(Height, a, b))
errors <- with(heights.weights, Weight - predictions)
return(sum(errors ^ 2) + lambda * (a^2+b^2))
}
lambda <- 1
optim(c(0,0),
function (x) {ridge.error(heights.weights, x[1], x[2], lambda)}
)
#$par
#[1] -340.434108 7.562524
lm(Weight ~ Height, data=heights.weights)
#Call:
# lm(formula = Weight ~ Height, data = heights.weights)
#
#Coefficients:
# (Intercept) Height
#-350.737 7.717
a.ridge.error <- function(a, lambda) {return(ridge.error(heights.weights, a, 0, lambda))}
curve(sapply(x, function (a) {a.ridge.error(a, lambda)}), from = -1000, to = 1000)
b.ridge.error <- function(b, lambda) {return(ridge.error(heights.weights, 0, b, lambda))}
curve(sapply(x, function (b) {b.ridge.error(b, lambda)}), from = -1000, to = 1000)
### copy & paste :-)
english.letters <- c('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
'w', 'x', 'y', 'z')
caesar.cipher <- list()
inverse.caesar.cipher <- list()
for (index in 1:length(english.letters))
{
caesar.cipher[[english.letters[index]]] <- english.letters[index %% 26 + 1]
inverse.caesar.cipher[[english.letters[index %% 26 + 1]]] <- english.letters[index]
}
print(caesar.cipher)
# Fifteenth code snippet
apply.cipher.to.string <- function(string, cipher)
{
output <- ''
for (i in 1:nchar(string))
{
output <- paste(output, cipher[[substr(string, i, i)]], sep = '')
}
return(output)
}
apply.cipher.to.text <- function(text, cipher)
{
output <- c()
for (string in text)
{
output <- c(output, apply.cipher.to.string(string, cipher))
}
return(output)
}
apply.cipher.to.text(c('sample', 'text'), caesar.cipher)
# Sixteenth code snippet
generate.random.cipher <- function()
{
cipher <- list()
inputs <- english.letters
outputs <- english.letters[sample(1:length(english.letters), length(english.letters))]
for (index in 1:length(english.letters))
{
cipher[[inputs[index]]] <- outputs[index]
}
return(cipher)
}
modify.cipher <- function(cipher, input, output)
{
new.cipher <- cipher
new.cipher[[input]] <- output
old.output <- cipher[[input]]
collateral.input <- names(which(sapply(names(cipher),
function (key) {cipher[[key]]}) == output))
new.cipher[[collateral.input]] <- old.output
return(new.cipher)
}
propose.modified.cipher <- function(cipher)
{
input <- sample(names(cipher), 1)
output <- sample(english.letters, 1)
return(modify.cipher(cipher, input, output))
}
# Seventeenth code snippet
load(file.path('data', 'lexical_database.Rdata'))
# Eighteength code snippet
lexical.database[['a']]
lexical.database[['the']]
lexical.database[['he']]
lexical.database[['she']]
lexical.database[['data']]
# Nineteenth code snippet
one.gram.probability <- function(one.gram, lexical.database = list())
{
lexical.probability <- lexical.database[[one.gram]]
if (is.null(lexical.probability) || is.na(lexical.probability))
{
return(.Machine$double.eps)
}
else
{
return(lexical.probability)
}
}
# Twentieth code snippet
log.probability.of.text <- function(text, cipher, lexical.database = list())
{
log.probability <- 0.0
for (string in text)
{
decrypted.string <- apply.cipher.to.string(string, cipher)
log.probability <- log.probability +
log(one.gram.probability(decrypted.string, lexical.database))
}
return(log.probability)
}
# Twenty-first code snippet
metropolis.step <- function(text, cipher, lexical.database = list())
{
proposed.cipher <- propose.modified.cipher(cipher)
lp1 <- log.probability.of.text(text, cipher, lexical.database)
lp2 <- log.probability.of.text(text, proposed.cipher, lexical.database)
if (lp2 > lp1)
{
return(proposed.cipher)
}
else
{
a <- exp(lp2 - lp1)
x <- runif(1)
if (x < a)
{
return(proposed.cipher)
}
else
{
return(cipher)
}
}
}
# Twenty-second code snippet
decrypted.text <- c('here', 'is', 'some', 'sample', 'text')
# Twenty-third code snippet
encrypted.text <- apply.cipher.to.text(decrypted.text, caesar.cipher)
# Twenty-fourth code snippet
set.seed(1)
cipher <- generate.random.cipher()
results <- data.frame()
number.of.iterations <- 50000
for (iteration in 1:number.of.iterations)
{
log.probability <- log.probability.of.text(encrypted.text,
cipher,
lexical.database)
current.decrypted.text <- paste(apply.cipher.to.text(encrypted.text,
cipher),
collapse = ' ')
correct.text <- as.numeric(current.decrypted.text == paste(decrypted.text,
collapse = ' '))
results <- rbind(results,
data.frame(Iteration = iteration,
LogProbability = log.probability,
CurrentDecryptedText = current.decrypted.text,
CorrectText = correct.text))
cipher <- metropolis.step(encrypted.text, cipher, lexical.database)
}
write.table(results,
file = file.path('data/results.tsv'),
row.names = FALSE,
sep = '\t')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment