# oscardelama/rgb-noise: selecting few samples.r

## rgb-noise: selecting few samples.r
# 1) Read and digest the channels with valid values
#    (min.raw / max.raw are presumably the accepted raw-value limits; max.raw
#    has one entry per channel -- TODO confirm against the vvm documentation)
vvm.all <- vvm$new(has.RGGB.pattern = TRUE)
vvm.all$digest(file.name.from = '_ODL0387s4',
               file.name.to   = '_ODL1671s6',
               file.name.ext  = '.pgm',
               file.path      = 'ISO100/crops',
               min.raw        = 4,
               max.raw        = c(16379, 15774, 15774, 16379))

# 2) Fit a robust, weighted, quadratic model
vvm.all$fit.model(model.name   = 'weighted',
                  model.family = 'lmrob',
                  weights      = 1/mean^2)

# 3) Get the var and mean values for the average green channel,
#    sorted by ascending mean
avg.green.means <- subset(vvm.all$var.df, channel == 'Green Avg')
avg.green.means <- avg.green.means[order(avg.green.means$mean), ]

# 4) Get the model predictions of var at the average green mean values
avg.green.var.preds <- vvm.all$get.model.predictions(
    model.name = 'weighted',
    select     = 'Green Avg',
    x          = avg.green.means$mean)

# 5) Compute the relative error between real and predicted average green var:
#    the residual (observed - predicted) divided by the distance from the
#    prediction to the 'upl' column of the predictions
avg.green.means$avg.green.rel.err <-
    (avg.green.means$var - avg.green.var.preds$var) /
    (avg.green.var.preds$upl - avg.green.var.preds$var)

# Take a look at the histogram of relative errors
hist(avg.green.means$avg.green.rel.err)

# 6) Keep in best.samples the samples with absolute relative error below 0.4
best.samples <- subset(avg.green.means, abs(avg.green.rel.err) < 0.4)

# 7) Get the wide var data frame with all the picture channels per row
all.samples <- vvm.all$wide.var.df

# 8) Keep only the pictures with no NA in any column
complete.samples <- all.samples[complete.cases(all.samples), ]

# 9) Keep in best.samples only the pictures that are also in complete.samples
best.samples <- subset(best.samples, pict %in% complete.samples$pict)
# 10) Find in best.samples 154 clusters of similar mean values.
#     kmeans() draws its initial centres at random, so the seed is fixed here
#     to make the clustering (and the pictures later picked from it)
#     reproducible from run to run.
set.seed(154)
chunks <- kmeans(best.samples$mean, centers = 154)
# 11) Add to each row in best.samples a variable indicating to which cluster it belongs
best.samples$cluster <- chunks$cluster
# 12) Split the picture names of best.samples into a list with one character
#     vector per cluster; names keep the row order of best.samples
pict.clusters <- split(as.character(best.samples$pict), best.samples$cluster)
# Select the positional "median" element of a vector.
#   v: the vector to pick from; it is not sorted here, so the caller decides
#      the order.
# Returns v subset to its middle position: the middle element for an odd
# length, the lower of the two middle elements for an even length, and a
# zero-length vector when v is empty.
my.median <- function(v) {
  n <- length(v)
  # (n + 1) %/% 2 equals n %/% 2 for even n and n %/% 2 + 1 for odd n
  v[(n + 1) %/% 2]
}
# 13) Select a picture from each cluster.
#     vapply() requires exactly one character value per cluster, so an
#     unexpected result fails loudly instead of silently changing the type
#     of sel.pics as sapply() could.
sel.pics <- vapply(pict.clusters, my.median, character(1))
# 14) Save the selected picture sample names
write.csv(data.frame('pict' = sel.pics), 'sel-pict.csv', row.names = FALSE)
	# 1) Read and digest the channels with valid values
	#    (min.raw / max.raw are presumably the accepted raw-value limits; max.raw
	#    has one entry per channel -- TODO confirm against the vvm documentation)
	vvm.all <- vvm$new(has.RGGB.pattern = TRUE)
	vvm.all$digest(file.name.from = '_ODL0387s4',
	               file.name.to   = '_ODL1671s6',
	               file.name.ext  = '.pgm',
	               file.path      = 'ISO100/crops',
	               min.raw        = 4,
	               max.raw        = c(16379, 15774, 15774, 16379))

	# 2) Fit a robust, weighted, quadratic model
	vvm.all$fit.model(model.name   = 'weighted',
	                  model.family = 'lmrob',
	                  weights      = 1/mean^2)

	# 3) Get the var and mean values for the average green channel,
	#    sorted by ascending mean
	avg.green.means <- subset(vvm.all$var.df, channel == 'Green Avg')
	avg.green.means <- avg.green.means[order(avg.green.means$mean), ]

	# 4) Get the model predictions of var at the average green mean values
	avg.green.var.preds <- vvm.all$get.model.predictions(
	    model.name = 'weighted',
	    select     = 'Green Avg',
	    x          = avg.green.means$mean)

	# 5) Compute the relative error between real and predicted average green var:
	#    the residual (observed - predicted) divided by the distance from the
	#    prediction to the 'upl' column of the predictions
	avg.green.means$avg.green.rel.err <-
	    (avg.green.means$var - avg.green.var.preds$var) /
	    (avg.green.var.preds$upl - avg.green.var.preds$var)

	# Take a look at the histogram of relative errors
	hist(avg.green.means$avg.green.rel.err)

	# 6) Keep in best.samples the samples with absolute relative error below 0.4
	best.samples <- subset(avg.green.means, abs(avg.green.rel.err) < 0.4)

	# 7) Get the wide var data frame with all the picture channels per row
	all.samples <- vvm.all$wide.var.df

	# 8) Keep only the pictures with no NA in any column
	complete.samples <- all.samples[complete.cases(all.samples), ]

	# 9) Keep in best.samples only the pictures that are also in complete.samples
	best.samples <- subset(best.samples, pict %in% complete.samples$pict)
	# 10) Find in best.samples 154 clusters of similar mean values.
	#     kmeans() draws its initial centres at random, so the seed is fixed here
	#     to make the clustering (and the pictures later picked from it)
	#     reproducible from run to run.
	set.seed(154)
	chunks <- kmeans(best.samples$mean, centers = 154)
	# 11) Add to each row in best.samples a variable indicating to which cluster it belongs
	best.samples$cluster <- chunks$cluster
	# 12) Split the picture names of best.samples into a list with one character
	#     vector per cluster; names keep the row order of best.samples
	pict.clusters <- split(as.character(best.samples$pict), best.samples$cluster)
	# Select the positional "median" element of a vector.
	#   v: the vector to pick from; it is not sorted here, so the caller decides
	#      the order.
	# Returns v subset to its middle position: the middle element for an odd
	# length, the lower of the two middle elements for an even length, and a
	# zero-length vector when v is empty.
	my.median <- function(v) {
	  n <- length(v)
	  # (n + 1) %/% 2 equals n %/% 2 for even n and n %/% 2 + 1 for odd n
	  v[(n + 1) %/% 2]
	}
	# 13) Select a picture from each cluster.
	#     vapply() requires exactly one character value per cluster, so an
	#     unexpected result fails loudly instead of silently changing the type
	#     of sel.pics as sapply() could.
	sel.pics <- vapply(pict.clusters, my.median, character(1))
	# 14) Save the selected picture sample names
	write.csv(data.frame('pict' = sel.pics), 'sel-pict.csv', row.names = FALSE)