Last active
May 25, 2017 07:44
-
-
Save gedankenstuecke/f5214a066f9cc890b95a518977822070 to your computer and use it in GitHub Desktop.
class2tree workaround
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(taxize) | |
library(reshape2) | |
### stuff starts here
#### remove the rank filter from the clustering method
#' Flatten one classification table into a single-row data frame.
#'
#' Variant of a class2tree helper with the "no rank" filter switched off, so
#' every row above the last one is kept regardless of its rank.
#'
#' @param x A data frame with columns `name`, `rank` and `id`; the last row
#'   is treated as the tip.
#' @return A one-row data frame: column `tip` holds the `name` of the last
#'   row, followed by one column per remaining row holding that row's `id`
#'   (labelled by `rank`).
class2tree_helper_no_filter <- function(x) {
  tip_row <- nrow(x)
  above_tip <- -tip_row

  # ids of everything above the tip, labelled with the matching rank
  ids <- setNames(x[above_tip, "id"], x[above_tip, "rank"])

  # turn the labelled vector into a single wide row
  # (the column label "df" is kept on purpose so the result is unchanged)
  wide <- data.frame(t(data.frame(df = ids)), stringsAsFactors = FALSE)

  data.frame(tip = x[tip_row, "name"], wide, stringsAsFactors = FALSE)
}
#### get classification & convert into dataframe
# Species to place in the tree (underscore-separated binomials).
spnames <- c(
  "Homo_sapiens",
  "Pan_troglodytes",
  "Macaca_mulatta",
  "Mus_musculus",
  "Rattus_norvegicus",
  "Bos_taurus",
  "Canis_lupus",
  "Ornithorhynchus_anatinus",
  "Xenopus_tropicalis",
  "Takifugu_rubripes",
  "Gallus_gallus",
  "Ciona_intestinalis",
  "Branchiostoma_floridae",
  "Schistosoma_mansoni",
  "Caenorhabditis_elegans",
  "Anopheles_gambiae",
  "Drosophila_melanogaster",
  "Ixodes_scapularis",
  "Ustilago_maydis",
  "Neurospora_crassa",
  "Monodelphis_domestica",
  "Danio_rerio",
  "Nematostella_vectensis",
  "Cryptococcus_neoformans"
)

# Look up the full lineage of every species in the NCBI taxonomy.
out <- classification(spnames, db = "ncbi")

# Only data-frame results can be flattened by the helper; anything else
# (presumably a failed lookup - confirm against the taxize docs) is skipped
# here instead of crashing on nrow().
resolved <- Filter(is.data.frame, out)

# One row per species, one column per lineage level; shorter lineages are
# padded by rbind.fill(). rbind.fill() lives in plyr, which is not attached
# by library(taxize) or library(reshape2), so it is called via its namespace.
df <- plyr::rbind.fill(lapply(resolved, class2tree_helper_no_filter))

# this is what class2tree produces on its own
tr <- class2tree(out)
plot(tr)
# this is the alternative that tries to take all levels, also unranked, into
# account
n_tips <- nrow(df)

# One set of lineage ids per tip (column 1 of df is the tip name, the rest
# are lineage ids). NA padding left by rbind.fill() is dropped so that a
# missing level is never counted as a taxon of its own.
lineages <- lapply(seq_len(n_tips), function(i) {
  ids <- as.vector(t(df[i, -1]))
  ids[!is.na(ids)]
})

# Every ordered pair of tips; the second index varies fastest, matching a
# nested row-by-row traversal.
pairs <- expand.grid(second = seq_len(n_tips), first = seq_len(n_tips))

# Distance between two tips = number of ids that occur in exactly one of the
# two lineages (size of the symmetric difference).
pair_dist <- vapply(
  seq_len(nrow(pairs)),
  function(k) {
    a <- lineages[[pairs$first[k]]]
    b <- lineages[[pairs$second[k]]]
    length(union(a, b)) - length(intersect(a, b))
  },
  numeric(1)
)

# Long-format table of pairwise distances.
df_dist <- data.frame(
  x = df[[1]][pairs$first],
  y = df[[1]][pairs$second],
  dist = pair_dist,
  stringsAsFactors = FALSE
)

# convert long dataframe to distance object & plot hclust result
dist_matrix <- as.dist(
  acast(df_dist, x ~ y, value.var = "dist", fun.aggregate = sum, margins = FALSE)
)
plot(hclust(dist_matrix))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment