## Retrieve and extract colour palettes from beetle specimen images
## Ben Raymond
## Last-Modified: <2015-02-25 17:40:45>
## change these paths to suit your needs
setwd("~/your/path/")
cache_directory="~/your/data/cache/path/"
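## optional convenience step: create the cache directory if it does not already exist
if (!file.exists(cache_directory)) dir.create(cache_directory,recursive=TRUE)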
## ------------------------------------------------
## stage 1: search for names and their images, and download and cache images locally
## required packages
library(rgbif)
library(ALA4R)
library(plyr)
library(R.cache)
library(jpeg)
## We use the `R.cache` package to cache our web calls to the GBIF API: having run
## a given query once, we can re-run it without needing an internet connection.
## Specify the cache directory and create cached versions of various functions:
setCacheRootPath(cache_directory)
cached_name_lookup=addMemoization(name_lookup)
cached_occ_search=addMemoization(occ_search)
cached_name_backbone=addMemoization(name_backbone)
## also configure caching for the ALA4R library, which we use for convenience
ala_config(cache_directory=cache_directory)
## specify the group of taxa we are interested in: Carabidae (ground beetles)
parent_key=cached_name_backbone(name="Carabidae")$usageKey
## total number of matching records
nrecords=cached_name_lookup(rank="SPECIES",limit=0,higherTaxonKey=parent_key,status="ACCEPTED")$meta$count
## fetch all records in chunks
chunk_size=1000
all_species=data.frame()
for (k in seq(0,nrecords,by=chunk_size)) {
    all_species=rbind.fill(all_species,cached_name_lookup(rank="SPECIES",start=k,limit=chunk_size,higherTaxonKey=parent_key,status="ACCEPTED")$data)
}
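## optional sanity check: the paged results should match the count reported by the API
## (counts can drift if the index changes between calls, so warn rather than stop)
if (nrow(all_species)!=nrecords) warning(sprintf("expected %d species records but retrieved %d",nrecords,nrow(all_species)))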
## we could also search for subspecies here
## iterate through species and find their images, max. 10 per species
all_species$image_urls=vector("list",nrow(all_species))
cat(sprintf("Fetching image URLs:"))
for (k in 1:nrow(all_species)) {
    if ((k %% 500)==1) cat(sprintf(" %d",k))
    this=cached_occ_search(taxonKey=all_species$key[k],mediatype='StillImage',basisOfRecord="PRESERVED_SPECIMEN",return="media",limit=10)
    if (length(this)==1 && grepl("no data found",this)) {
        ## no results, so just store an empty data.frame
        all_species$image_urls[[k]]=data.frame()
    } else {
        all_species$image_urls[[k]]=ldply(this,function(z)as.data.frame(z,stringsAsFactors=FALSE))
    }
}
cat(sprintf("done.\n"))
## now we'll retrieve all those images and cache them locally for processing
## first define a helper function to handle the downloading process
## we use the ALA4R:::cached_get function just because it is an easy way of handling the downloading and caching
get_stuff=function(thisim) {
    ## return a data frame of URL, locally-cached filename, and file type
    ## the input can have multiple columns named "identifier", "identifier.1", etc.
    exts=gsub("identifier(.*)","\\1",colnames(thisim)[grepl("identifier",names(thisim))])
    out=data.frame()
    if (nrow(thisim)>0) {
        for (e in 1:length(exts)) {
            temp=thisim[,c(".id",paste0(c("identifier","format"),exts[[e]]))]
            names(temp)=c(".id","identifier","format")
            out=rbind(out,temp)
        }
        out=out[!is.na(out$identifier),]
    }
    if (nrow(out)>0) {
        ## some image URLs sit behind 303 redirects and aren't downloaded properly,
        ## so extract the redirect target from the URL
        out$identifier=gsub(".*target=(http.*)","\\1",out$identifier)
        out$cached_file_name=as.character(sapply(out$identifier,function(z) ALA4R:::cached_get(z,type="binary_filename")))
    }
    out
}
## now go through and retrieve the images
lcfiles=list()
for (k in 1:nrow(all_species)) {
    lcfiles[[k]]=get_stuff(all_species$image_urls[[k]])
}
## add the taxon key and row number (row number in the all_species data.frame) to lcfiles for convenience
for (k in 1:nrow(all_species)) {
    if (nrow(lcfiles[[k]])>0) {
        lcfiles[[k]]$taxon_key=all_species$key[k]
        lcfiles[[k]]$row_number=k
    }
}
## collapse this list into a data.frame
image_list=ldply(lcfiles)
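## guard for the image-reading stage below: readJPEG() only decodes JPEG files,
## and some downloads may have failed. A minimal filter, assuming the "format"
## column holds MIME types such as "image/jpeg":
image_list=image_list[grepl("jpe?g",image_list$format,ignore.case=TRUE) & file.exists(image_list$cached_file_name),]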
## ------------------------------------------------
## stage 2: extract colour palettes from images
cmaps=list() ## keep all colour maps in a list
all_colours=matrix(NA,nrow=0,ncol=5) ## matrix of colours with pixel counts and image numbers
## read all images and build the global colour map
## this takes a while!
cat(sprintf('Reading images and collating colours ... '))
for (di in 1:nrow(image_list)) {
    if ((di %% 100)==1) cat(sprintf("%d ",di))
    im=readJPEG(image_list$cached_file_name[[di]])
    ## quantize the colours in the image
    im=round(im*20)/20
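    ## e.g. a channel value of 0.533 becomes round(0.533*20)/20 = 0.55; each
    ## channel takes one of 21 levels (0, 0.05, ..., 1), so an image can use at
    ## most 21^3 = 9261 distinct colours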
    ## flatten into an n x 3 matrix
    imf=cbind(as.vector(im[,,1]),as.vector(im[,,2]),as.vector(im[,,3])) ## columns R G B
    this_count=arrange(count(imf),desc(freq)) ## a data.frame of colours and the pixel count of each
    this_count$im_number=di
    all_colours=rbind(all_colours,this_count) ## add to the global colour/count matrix
    ## it would probably be better to only add new colours here, rather than growing
    ## all_colours inside the loop and reducing it to unique colours at the end;
    ## see the sketch after this loop
    cmaps[[di]]=this_count
}
cat(sprintf(' done.\n'))
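## a minimal sketch of that incremental alternative (untested here; it assumes
## plyr's count() always names the colour columns x.1, x.2, x.3 as it does above):
##   all_colours=data.frame()
##   for (di in 1:nrow(image_list)) {
##     ## ... compute this_count as above ...
##     all_colours=unique(rbind(all_colours,this_count[,c("x.1","x.2","x.3")]))
##   }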
## reduce this down to just the unique colours
all_colours=unique(all_colours[,1:3])
all_colours$index=1:nrow(all_colours) ## add "index" (colour number)
## the first 3 columns in all_colours are red, green, blue but are named "x.1", "x.2",
## "x.3" (an artifact of plyr's count()). We'll leave them as is for now, because it
## makes the join below easier
## now convert each image to counts against the all_colours colour map
## this is also slow
im_palette=matrix(NA,nrow=length(cmaps),ncol=nrow(all_colours)) ## i.e. an n_images by n_colours matrix
colnames(im_palette)=paste("cl",1:nrow(all_colours),sep="")
cat(sprintf('Mapping images onto the global colour map ... '))
for (k in 1:length(cmaps)) {
    if ((k %% 100)==1) cat(sprintf("%d ",k))
    im=readJPEG(image_list$cached_file_name[[k]])
    im=round(im*20)/20
    ## flatten
    im=cbind(as.vector(im[,,1]),as.vector(im[,,2]),as.vector(im[,,3]))
    ## count(im) gives the unique colours (rows) and their counts;
    ## keep only those that appear in the global colour map
    blah=na.omit(join(count(im),all_colours,by=c("x.1","x.2","x.3")))
    im_palette[k,]=0
    im_palette[k,blah$index]=blah$freq
}
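cat(sprintf(' done.\n'))
## pixel counts depend on image size, so for comparing palettes across images it
## may be more useful to work with colour proportions (an optional extra step):
im_palette_prop=im_palette/rowSums(im_palette)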
rownames(all_colours)=paste("cl",1:nrow(all_colours),sep="")
colnames(all_colours)=c("red","green","blue","index")
## save the results
save(file="colour_data_carabidae.RData",list=c("im_palette","all_colours","all_species","image_list"))
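## example of re-using the saved data: show the ten most common colours in the
## first image (a sketch; assumes at least one image was processed)
## load("colour_data_carabidae.RData")
idx=order(im_palette[1,],decreasing=TRUE)[1:10]
barplot(im_palette[1,idx],col=rgb(all_colours$red[idx],all_colours$green[idx],all_colours$blue[idx]),ylab="pixel count",main="Top colours, image 1")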