## Retrieve and extract colour palettes from beetle specimen images
## Ben Raymond
## Last-Modified: <2015-02-25 17:40:45>
## change these paths to suit your needs
setwd("~/your/path/")
cache_directory="~/your/data/cache/path/"
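## optional convenience step: create the cache directory if it does not already exist
if (!file.exists(cache_directory)) dir.create(cache_directory,recursive=TRUE)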
## ------------------------------------------------
## stage 1: search for names and their images, and download and cache images locally
## required packages
library(rgbif)
library(ALA4R)
library(plyr)
library(R.cache)
library(jpeg)
## We use the `R.cache` package to cache our web calls to the GBIF API: having run
## a given query once, we can re-run it without needing an internet connection.
## Specify the cache directory and create cached versions of various functions:
setCacheRootPath(cache_directory)
cached_name_lookup=addMemoization(name_lookup)
cached_occ_search=addMemoization(occ_search)
cached_name_backbone=addMemoization(name_backbone)
## also configure caching for the ALA4R library, which we use for convenience
ala_config(cache_directory=cache_directory)
## specify the group of taxa we are interested in: Carabidae (ground beetles)
parent_key=cached_name_backbone(name="Carabidae")$usageKey
## total number of matching records
nrecords=cached_name_lookup(rank="SPECIES",limit=0,higherTaxonKey=parent_key,status="ACCEPTED")$meta$count
## fetch all records in chunks
chunk_size=1000
all_species=data.frame()
for (k in seq(0,nrecords,by=chunk_size)) {
    all_species=rbind.fill(all_species,cached_name_lookup(rank="SPECIES",start=k,limit=chunk_size,higherTaxonKey=parent_key,status="ACCEPTED")$data)
}
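## optional sanity check: the paged results should match the count reported by the API
## (counts can drift if the index changes between calls, so warn rather than stop)
if (nrow(all_species)!=nrecords) warning(sprintf("expected %d species records but retrieved %d",nrecords,nrow(all_species)))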
## we could also search for subspecies here
## iterate through species and find their images, max. 10 per species
all_species$image_urls=vector("list",nrow(all_species))
cat(sprintf("Fetching image URLs:"))
for (k in 1:nrow(all_species)) {
    if ((k %% 500)==1) cat(sprintf(" %d",k))
    this=cached_occ_search(taxonKey=all_species$key[k],mediatype='StillImage',basisOfRecord="PRESERVED_SPECIMEN",return="media",limit=10)
    if (length(this)==1 && grepl("no data found",this)) {
        ## no results, so just store an empty data.frame
        all_species$image_urls[[k]]=data.frame()
    } else {
        all_species$image_urls[[k]]=ldply(this,function(z)as.data.frame(z,stringsAsFactors=FALSE))
    }
}
cat(sprintf("done.\n"))
## now we'll retrieve all those images and cache them locally for processing
## first define a helper function to handle the downloading process
## we use the ALA4R:::cached_get function just because it is an easy way of handling the downloading and caching
get_stuff=function(thisim) {
    ## return a data frame of URL, locally-cached filename, and file type
    ## the input can have multiple columns named "identifier", "identifier.1", etc.
    exts=gsub("identifier(.*)","\\1",colnames(thisim)[grepl("identifier",names(thisim))])
    out=data.frame()
    if (nrow(thisim)>0) {
        for (e in 1:length(exts)) {
            temp=thisim[,c(".id",paste0(c("identifier","format"),exts[[e]]))]
            names(temp)=c(".id","identifier","format")
            out=rbind(out,temp)
        }
        out=out[!is.na(out$identifier),]
    }
    if (nrow(out)>0) {
        ## some image URLs sit behind 303 redirects and aren't downloaded properly,
        ## so extract the redirect target from the URL
        out$identifier=gsub(".*target=(http.*)","\\1",out$identifier)
        out$cached_file_name=as.character(sapply(out$identifier,function(z) ALA4R:::cached_get(z,type="binary_filename")))
    }
    out
}
## now go through and retrieve the images
lcfiles=list()
for (k in 1:nrow(all_species)) {
    lcfiles[[k]]=get_stuff(all_species$image_urls[[k]])
}
## add the taxon key and row number (row number in the all_species data.frame) to lcfiles for convenience
for (k in 1:nrow(all_species)) {
    if (nrow(lcfiles[[k]])>0) {
        lcfiles[[k]]$taxon_key=all_species$key[k]
        lcfiles[[k]]$row_number=k
    }
}
## collapse this list into a data.frame
image_list=ldply(lcfiles)
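## guard for the image-reading stage below: readJPEG() only decodes JPEG files,
## and some downloads may have failed. A minimal filter, assuming the "format"
## column holds MIME types such as "image/jpeg":
image_list=image_list[grepl("jpe?g",image_list$format,ignore.case=TRUE) & file.exists(image_list$cached_file_name),]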
## ------------------------------------------------
## stage 2: extract colour palettes from images
cmaps=list() ## keep all colour maps in a list
all_colours=matrix(NA,nrow=0,ncol=5) ## matrix of colours with pixel counts and image numbers
## read all images and build the global colour map
## this takes a while!
cat(sprintf('Reading images and collating colours ... '))
for (di in 1:nrow(image_list)) {
    if ((di %% 100)==1) cat(sprintf("%d ",di))
    im=readJPEG(image_list$cached_file_name[[di]])
    ## quantize the colours in the image
    im=round(im*20)/20
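    ## e.g. a channel value of 0.533 becomes round(0.533*20)/20 = 0.55; each
    ## channel takes one of 21 levels (0, 0.05, ..., 1), so an image can use at
    ## most 21^3 = 9261 distinct colours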
    ## flatten into an n x 3 matrix
    imf=cbind(as.vector(im[,,1]),as.vector(im[,,2]),as.vector(im[,,3])) ## columns R G B
    this_count=arrange(count(imf),desc(freq)) ## a data.frame of colours and the pixel count of each
    this_count$im_number=di
    all_colours=rbind(all_colours,this_count) ## add to the global colour/count matrix
    ## it would probably be better to only add new colours here, rather than growing
    ## all_colours inside the loop and reducing it to unique colours at the end;
    ## see the sketch after this loop
    cmaps[[di]]=this_count
}
cat(sprintf(' done.\n'))
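## a minimal sketch of that incremental alternative (untested here; it assumes
## plyr's count() always names the colour columns x.1, x.2, x.3 as it does above):
##   all_colours=data.frame()
##   for (di in 1:nrow(image_list)) {
##     ## ... compute this_count as above ...
##     all_colours=unique(rbind(all_colours,this_count[,c("x.1","x.2","x.3")]))
##   }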
## reduce this down to just the unique colours
all_colours=unique(all_colours[,1:3])
all_colours$index=1:nrow(all_colours) ## add "index" (colour number)
## the first 3 columns in all_colours are red, green, blue but are named "x.1", "x.2",
## "x.3" (an artifact of plyr's count()). We'll leave them as is for now, because it
## makes the join below easier
## now convert each image to counts against the all_colours colour map
## this is also slow
im_palette=matrix(NA,nrow=length(cmaps),ncol=nrow(all_colours)) ## i.e. an n_images by n_colours matrix
colnames(im_palette)=paste("cl",1:nrow(all_colours),sep="")
cat(sprintf('Mapping images onto the global colour map ... '))
for (k in 1:length(cmaps)) {
    if ((k %% 100)==1) cat(sprintf("%d ",k))
    im=readJPEG(image_list$cached_file_name[[k]])
    im=round(im*20)/20
    ## flatten
    im=cbind(as.vector(im[,,1]),as.vector(im[,,2]),as.vector(im[,,3]))
    ## count(im) gives the unique colours (rows) and their counts;
    ## keep only those that appear in the global colour map
    blah=na.omit(join(count(im),all_colours,by=c("x.1","x.2","x.3")))
    im_palette[k,]=0
    im_palette[k,blah$index]=blah$freq
}
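cat(sprintf(' done.\n'))
## pixel counts depend on image size, so for comparing palettes across images it
## may be more useful to work with colour proportions (an optional extra step):
im_palette_prop=im_palette/rowSums(im_palette)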
rownames(all_colours)=paste("cl",1:nrow(all_colours),sep="")
colnames(all_colours)=c("red","green","blue","index")
## save the results
save(file="colour_data_carabidae.RData",list=c("im_palette","all_colours","all_species","image_list"))
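## example of re-using the saved data: show the ten most common colours in the
## first image (a sketch; assumes at least one image was processed)
## load("colour_data_carabidae.RData")
idx=order(im_palette[1,],decreasing=TRUE)[1:10]
barplot(im_palette[1,idx],col=rgb(all_colours$red[idx],all_colours$green[idx],all_colours$blue[idx]),ylab="pixel count",main="Top colours, image 1")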