Skip to content

Instantly share code, notes, and snippets.

@raymondben
Created February 27, 2015 03:08
Show Gist options
  • Save raymondben/22ffeec3f5835a575b89 to your computer and use it in GitHub Desktop.
Save raymondben/22ffeec3f5835a575b89 to your computer and use it in GitHub Desktop.
Retrieve and extract colour palettes from beetle specimen images
## Retrieve and extract colour palettes from beetle specimen images
## Ben Raymond
## Last-Modified: <2015-02-25 17:40:45>
## change these paths to suit your needs
setwd("~/your/path/")
cache_directory="~/your/data/cache/path/"
## ------------------------------------------------
## stage 1: search for names and their images, and download and cache images locally
## required packages
library(rgbif)
library(ALA4R)
library(plyr)
library(R.cache)
library(jpeg)
## We use the `R.cache` package to cache our web calls to the GBIF API. This just means that, having run a certain query once, we can re-run it without needing an internet connection. Specify the cache directory and create a cached version of various functions:
setCacheRootPath(cache_directory)
cached_name_lookup=addMemoization(name_lookup)
cached_occ_search=addMemoization(occ_search)
cached_name_backbone=addMemoization(name_backbone)
## also configure caching for the ALA4R library, which we use for convenience
ala_config(cache_directory=cache_directory)
## specify the group of taxa we are interested in: Carabidae
parent_key=cached_name_backbone(name="Carabidae")$usageKey
## number of records total
nrecords=cached_name_lookup(rank="SPECIES",limit=0,higherTaxonKey=parent_key,status="ACCEPTED")$meta$count
## fetch all records in chunks
chunk_size=1000
all_species=data.frame()
for (k in seq(0,nrecords,by=chunk_size)) {
all_species=rbind.fill(all_species,cached_name_lookup(rank="SPECIES",start=k,limit=chunk_size,higherTaxonKey=parent_key,status="ACCEPTED")$data)
}
## could also search for subspecies here as well
## iterate through species and find images, max. 10 per species
all_species$image_urls=vector("list",nrow(all_species))
cat(sprintf("Fetching image URLs:"))
for (k in 1:nrow(all_species)) {
if ((k %% 500)==1) cat(sprintf(" %d",k))
this=cached_occ_search(taxonKey=all_species$key[k],mediatype = 'StillImage',basisOfRecord="PRESERVED_SPECIMEN",return = "media",limit=10)
if (length(this)==1 && grepl("no data found",this)) {
## no results, so just store an empty data.frame
all_species$image_urls[[k]]=data.frame()
} else {
all_species$image_urls[[k]]=ldply(this,function(z)as.data.frame(z,stringsAsFactors=FALSE))
}
}
cat(sprintf("done.\n"))
## now we'll retrieve all those images and cache them locally for processing
## first define a helper function to handle the downloading process
## we use the ALA4R:::cached_get function just because it is an easy way of handling the downloading and caching
get_stuff=function(thisim) {
## return data frame of URL, locally-cached filename, file type
## can have multiple columns named "identifer" "identifier.1" etc
exts=gsub("identifier(.*)","\\1",colnames(thisim)[grepl("identifier",names(thisim))])
out=data.frame()
if (nrow(thisim)>0) {
for (e in 1:length(exts)) {
temp=thisim[,c(".id",paste0(c("identifier","format"),exts[[e]]))]
names(temp)=c(".id","identifier","format")
out=rbind(out,temp)
}
out=out[!is.na(out$identifier),]
}
if (nrow(out)>0) {
## seem to have a bunch of images that live behind 303 redirects and which aren't being downloaded properly
out$identifier=gsub(".*target=(http.*)","\\1",out$identifier)
out$cached_file_name=as.character(sapply(out$identifier,function(z) ALA4R:::cached_get(z,type="binary_filename")))
}
out
}
## now go through and retrieve images
lcfiles=list()
for (k in 1:nrow(all_species)) {
lcfiles[[k]]=get_stuff(all_species$image_urls[[k]])
}
## add taxon key identifier and row number (row number in the all_species data.frame) to lcfiles for convenience
for (k in 1:nrow(all_species)) {
if (nrow(lcfiles[[k]])>0) {
lcfiles[[k]]$taxon_key=all_species$key[k]
lcfiles[[k]]$row_number=k
}
}
## collapse this list into a data.frame
image_list=ldply(lcfiles)
## ------------------------------------------------
## stage 2: extract colour palettes from images
cmaps=list() ## keep all colour maps in a list
all_colours=matrix(NA,nrow=0,ncol=5) ## matrix of colours with pixel counts and image numbers
## read all images and build global color map
## this takes a while!
cat(sprintf('Reading images and collating colours ... '))
for (di in 1:nrow(image_list)) {
if ((di %% 100)==1) cat(sprintf("%d ",di))
im=readJPEG(image_list$cached_file_name[[di]])
## quantize the colours in the image
im=round(im*20)/20
## flatten into nx3 matrix
imf=cbind(as.vector(im[,,1]),as.vector(im[,,2]),as.vector(im[,,3])) ## columns R G B
this_count=arrange(count(imf),desc(freq)) ## gives a matrix of colours and pixel count for each
this_count$im_number=di
all_colours=rbind(all_colours,this_count) ## add to global colour/count matrix
## would probably be better to only add new colours here, rather than growing the all_colours matrix so large inside the loop
## and then reducing it to unique colours at the end
cmaps[[di]]=this_count
}
cat(sprintf(' done.\n'))
## reduce this down to just unique colours
all_colours=unique(all_colours[,1:3])
all_colours$index=1:nrow(all_colours) ## add "index" (colour number)
## the first 3 columns in all_colours are red, green, blue but are named "x.1" "x.2" "x.3". We'll leave them as is for now, because it makes the join below easier
## now convert each image to counts against the all_colours colourmap
## this is also slow
im_palette=matrix(NA,nrow=length(cmaps),ncol=nrow(all_colours)) ## i.e. an n_images by n_colours matrix
colnames(im_palette)=paste("cl",1:nrow(all_colours),sep="")
for (k in 1:length(cmaps)) {
if ((k %% 100)==1) cat(sprintf("%d ",k))
im=readJPEG(image_list$cached_file_name[[k]])
im=round(im*20)/20
## flatten
im=cbind(as.vector(im[,,1]),as.vector(im[,,2]),as.vector(im[,,3]))
## unique colours (rows) and their counts from count(im)
## keep only those that appear in global colour map
blah=na.omit(join(count(im),all_colours,by=c("x.1","x.2","x.3")))
im_palette[k,]=0
im_palette[k,blah$index]=blah$freq
}
rownames(all_colours)=paste("cl",1:nrow(all_colours),sep="")
colnames(all_colours)=c("red","green","blue","index")
## save
save(file="colour_data_carabidae.RData",list=c("im_palette","all_colours","all_species","image_list"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment