Skip to content

Instantly share code, notes, and snippets.

@gedankenstuecke
Last active May 25, 2017 07:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gedankenstuecke/f5214a066f9cc890b95a518977822070 to your computer and use it in GitHub Desktop.
Save gedankenstuecke/f5214a066f9cc890b95a518977822070 to your computer and use it in GitHub Desktop.
class2tree workaround
library(taxize)
library(reshape2)
### stuff starts here
#### remove the rank filter from the clustering method
# Flatten one classification table into a single-row data frame.
#
# x: data frame with columns 'name', 'rank' and 'id', one row per taxon; the
#    last row is treated as the tip and every row above it as an ancestor.
# Returns a one-row data frame: column `tip` holds the name from the last row,
# followed by one column per ancestor holding that ancestor's id. Unlike the
# stock helper, rows whose rank is "no rank" are kept.
class2tree_helper_no_filter <- function(x) {
  tip_row <- nrow(x)
  # ids of all ancestors, labelled with their rank
  ancestor_ids <- x[-tip_row, "id"]
  names(ancestor_ids) <- x[-tip_row, "rank"]
  # one column of ids, transposed into a single wide row
  id_column <- data.frame(df = ancestor_ids)
  wide_row <- data.frame(t(id_column), stringsAsFactors = FALSE)
  data.frame(tip = x[tip_row, "name"], wide_row, stringsAsFactors = FALSE)
}
#### get classification & convert into dataframe
# Species to place in the tree, written as underscore-separated binomials.
spnames <- c('Homo_sapiens',
'Pan_troglodytes',
'Macaca_mulatta',
'Mus_musculus',
'Rattus_norvegicus',
'Bos_taurus',
'Canis_lupus',
'Ornithorhynchus_anatinus',
'Xenopus_tropicalis',
'Takifugu_rubripes',
'Gallus_gallus',
'Ciona_intestinalis',
'Branchiostoma_floridae',
'Schistosoma_mansoni',
'Caenorhabditis_elegans',
'Anopheles_gambiae',
'Drosophila_melanogaster',
'Ixodes_scapularis',
'Ustilago_maydis',
'Neurospora_crassa',
'Monodelphis_domestica',
'Danio_rerio',
'Nematostella_vectensis',
'Cryptococcus_neoformans')
# Look up the NCBI lineage of every species (needs network access).
out <- classification(spnames, db='ncbi')
# Names that could not be resolved come back as NA instead of a data frame.
# The helper cannot process those, so report them and leave them out of `df`.
resolved <- vapply(out, is.data.frame, logical(1))
if (!all(resolved)) {
  warning("No classification found for: ",
          paste(names(out)[!resolved], collapse = ", "),
          call. = FALSE)
}
# rbind.fill() is a plyr function; library(reshape2) only imports plyr without
# attaching it, so the call has to go through the plyr namespace.
df <- plyr::rbind.fill(
  lapply(unclass(out)[resolved], class2tree_helper_no_filter)
)
# this is what class2tree produces on its own
tr <- class2tree(out)
plot(tr)
# this is the alternative that tries to take all levels, also unranked, into account
# Distance between two species = number of taxon ids that occur in exactly one
# of the two lineages (size of the symmetric difference of their id sets).
n_tips <- nrow(df)
if (n_tips < 2) {
  stop("need at least two classified species to cluster", call. = FALSE)
}
tip_names <- df[[1]]
# One id set per species. rbind.fill() pads shorter lineages with NA; the
# padding is dropped here so it is never counted as a taxon, which would add a
# spurious +1 whenever only one of the two lineages happens to be padded.
lineages <- lapply(seq_len(n_tips), function(i) {
  ids <- as.vector(t(df[i, -1, drop = FALSE]))
  ids[!is.na(ids)]
})
# All ordered pairs of species, second index varying fastest.
pairs <- expand.grid(second = seq_len(n_tips), first = seq_len(n_tips))
# Long-format table of pairwise distances: one row per ordered pair.
df_dist <- data.frame(
  x = tip_names[pairs$first],
  y = tip_names[pairs$second],
  dist = vapply(seq_len(nrow(pairs)), function(k) {
    a <- lineages[[pairs$first[k]]]
    b <- lineages[[pairs$second[k]]]
    as.numeric(length(union(a, b)) - length(intersect(a, b)))
  }, numeric(1)),
  stringsAsFactors = FALSE
)
# convert long dataframe to distance object & plot hclust result
dist_matrix <- as.dist(acast(df_dist, x ~ y, value.var='dist', fun.aggregate = sum, margins=FALSE))
plot(hclust(dist_matrix))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment