Last active
March 15, 2017 19:35
-
-
Save taylorreiter/4661ef8acb5adf986cbb426f3e26e439 to your computer and use it in GitHub Desktop.
Very slop code from sourmash RNA-seq testing via the Schurch 2016 yeast dataset.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code to visualize numpy output from sourmash compare | |
# NB output was saved to csv file from python using the command: | |
# sourmash compare -o yeast_syrah_L2 ./*sig # completed in terminal | |
# import numpy | |
# from sourmash_lib import fig | |
# import pylab | |
# D, labels = fig.load_matrix_and_labels('yeast_syrah_L2') | |
# D = (D + D.T) / 2.0 | |
# numpy.savetxt("L2_numpy.csv", D, delimiter=",") | |
# Orrrrrr | |
# for i in glob.glob('compared*'): | |
# if i.endswith("labels.txt") or i.endswith(".csv"): | |
# continue | |
# print i | |
# D, labels = fig.load_matrix_and_labels(i) | |
# D = (D + D.T) / 2.0 | |
# numpy.savetxt("{}.csv".format(i), D, delimiter=",") | |
# label.txt files were saved as output from sourmash compute | |
------------------------------------------------------------- | |
# Functions for importing and formating data | |
# Function to import numpy txt file | |
numpy_in<-function(row) { | |
csv_filename = row[1] | |
labels_filename = row[2] | |
# read in numpy matrix | |
numpy_mat<-read.csv(csv_filename, header= F) | |
# import labels | |
numpy_labs<-read.delim(labels_filename, header = F) | |
# add labels to matrix | |
numpy_obj<-cbind(numpy_labs, numpy_mat) | |
numpy_obj <- data.frame(numpy_obj[,-1], row.names=numpy_obj[,1]) | |
numpy_labs<-as.vector(numpy_labs[,1]) | |
colnames(numpy_obj) <- numpy_labs | |
return(numpy_obj) | |
} | |
# Process files for downstream application | |
# determine k sizes used for sourmash compute and compare (regex that acts on file name) | |
determine_k_size<-function(input_csv_file){ | |
library(stringr) | |
# regex for k size: | |
k_size_regex<-"\\.[0-9]{1,2}\\." | |
k_extraction_one<-str_extract(input_csv_file, k_size_regex) | |
k_size_regex2<-"[0-9]{1,2}" | |
k_size<-str_extract(k_extraction_one, k_size_regex2) | |
} | |
# determine scale sizes used for sourmash compute (regex that acts on file name) | |
determine_scale_size<-function(input_csv_file){ | |
library(stringr) | |
# regex for --scale parameter: | |
scale_size_regex<-"\\.[0-9]{3,6}\\." | |
scale_extraction_one<-str_extract(input_csv_file, scale_size_regex) | |
scale_size_regex2<-"[0-9]{3,6}" | |
scale_size_regex2<-str_extract(scale_extraction_one, scale_size_regex2) | |
} | |
# Functions to enable standard curve plots | |
similarity<-function(sourmash_compare_matrix){ | |
sum(sourmash_compare_matrix[upper.tri(sourmash_compare_matrix, diag=T)])/ | |
(length(sourmash_compare_matrix[upper.tri(sourmash_compare_matrix, diag=T)])) | |
} | |
standard_error<-function(sourmash_compare_matrix) { | |
scm = sourmash_compare_matrix[upper.tri(sourmash_compare_matrix, diag = TRUE)] | |
sd(scm)/sqrt(length(scm)) | |
} | |
# Apply to snf2 br13 data | |
# Read in files from SNF2 BR 13 full run | |
setwd("/Users/taylorreiter/Desktop/DIB/yeast_original_fastq/SNF2_BR13/sourmash_compare_output/") | |
# Read in csv files of comparisons | |
# Read csv file names | |
csv_files<-list.files(".","csv$") | |
length(csv_files) | |
# Read label file names (txt) | |
csv_labels<-list.files(".", "txt$") | |
length(csv_labels) | |
# bind vectors for input into numpy_in function | |
snf2_br13_numpy_in<-cbind(csv_files, csv_labels) | |
#Import files | |
all_snf2_br13<-apply(snf2_br13_numpy_in, 1, numpy_in) | |
similarity_snf2_br13<-sapply(all_snf2_br13, similarity) | |
standard_err_snf2_br13<-sapply(all_snf2_br13, standard_error) | |
k_size_snf2_br13<-sapply(csv_files, determine_k_size) | |
scale_size_snf2_br13<-sapply(csv_files, determine_scale_size) | |
# Bind numeric columns together | |
snf2_br13_sim_stderr<-cbind(similarity_snf2_br13, standard_err_snf2_br13, k_size_snf2_br13, scale_size_snf2_br13) | |
# convert from factor/character to numeric | |
snf2_br13_sim_stderr <- data.frame(apply(snf2_br13_sim_stderr, 2, function(x) as.numeric(as.character(x)))) | |
# bind csv_file_name (character) column | |
snf2_br13_sim_stderr <-cbind(csv_files, snf2_br13_sim_stderr) | |
------------------------------------------------- | |
# apply to all | |
# Read in files from SNF2 BR 13 full run | |
setwd("/Users/taylorreiter/Desktop/DIB/yeast_original_fastq/all_compute_compare/") | |
# Read in csv files of comparisons | |
# Read csv file names | |
csv_files<-list.files(".","csv$") | |
length(csv_files) | |
# Read label file names (txt) | |
csv_labels<-list.files(".", "txt$") | |
length(csv_labels) | |
csv_labels<-csv_labels[1:12] | |
# bind vectors for input into numpy_in function | |
all_numpy_in<-cbind(csv_files, csv_labels) | |
#Import files | |
all_in<-apply(all_numpy_in, 1, numpy_in) | |
similarity_all_in<-sapply(all_in, similarity) | |
standard_err_all_in<-sapply(all_in, standard_error) | |
k_size_all_in<-sapply(csv_files, determine_k_size) | |
scale_size_all_in<-sapply(csv_files, determine_scale_size) | |
# Bind numeric columns together | |
all_in_sim_stderr<-cbind(similarity_all_in, standard_err_all_in, k_size_all_in, scale_size_all_in) | |
# convert from factor/character to numeric | |
all_in_sim_stderr <- data.frame(apply(all_in_sim_stderr, 2, function(x) as.numeric(as.character(x)))) | |
# bind csv_file_name (character) column | |
all_in_sim_stderr <-cbind(csv_files, all_in_sim_stderr) | |
head(all_in_sim_stderr) | |
# Plot all data, faceted on k size | |
library(ggplot2) | |
snf2_br13_sim_stderr_plot<-ggplot(snf2_br13_sim_stderr, aes(x = scaled_values, y = similarity_snf2_br13, ymin=similarity_snf2_br13-standard_err_snf2_br13, ymax=similarity_snf2_br13+standard_err_snf2_br13)) + geom_point() + ggtitle("All") + ylim(0,1) + theme(axis.text.x = element_text(angle=45)) + ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") + scale_x_log10(breaks=c(500, 5000, 20000, 40000, 80000, 160000, 240000, 480000), labels = comma) + geom_errorbar(width=0.2) + facet_wrap(~ k_values) | |
snf2_br13_sim_stderr_plot<-ggplot(snf2_br13_sim_stderr, aes(x = scaled_values, y = similarity_snf2_br13, ymin=similarity_snf2_br13-standard_err_snf2_br13, ymax=similarity_snf2_br13+standard_err_snf2_br13)) + geom_point() + ggtitle("All") + ylim(0,1) + theme(axis.text.x = element_text(angle=45)) + ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") + scale_x_log10(breaks=c(500, 5000, 20000, 40000, 80000, 160000, 240000, 480000), labels = comma) + geom_errorbar(width=0.2) + facet_wrap(~ k_values) | |
# # plot all data, faceted on k size | |
# plot_all<-function(plot_data){ | |
# library(ggplot2) | |
# environment = environment() | |
# plot<-ggplot(plot_data, aes_string(x = colnames(plot_data)[5], y = colnames(plot_data)[2], ymin=(plot_data)[2]-(plot_data)[3], (plot_data)[2]+(plot_data)[3])) + | |
# geom_point() + ggtitle("All") + ylim(0,1) + | |
# theme(axis.text.x = element_text(angle=45)) + | |
# ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") + | |
# scale_x_log10(breaks=colnames(plot_data)[5], labels = comma) + | |
# geom_errorbar(width=0.2) | |
# | |
# #plot + facet_wrap(~ names(plot_data)[4]) | |
# } | |
# plot<-plot_all(snf2_br13_sim_stderr) | |
# function for individual plots | |
individual_plots<-function(data_to_be_plotted, title) { | |
library(ggplot2) | |
require(scales) | |
ggplot(data_to_be_plotted, aes(x = scaled_values, y = similarity_snf2_br13, ymin=similarity_snf2_br13-standard_err_snf2_br13, ymax=similarity_snf2_br13+standard_err_snf2_br13)) + | |
# Add points to the graph, and title the graph | |
geom_point() + ggtitle(sprintf("%s", title)) + | |
# Set the y axis. | |
# If all similarities are above .6, start at .6. If not, start at 0 | |
ylim((min(ifelse(data_to_be_plotted[,2]>=.6, .6, 0))), 1) + | |
# rotate x axis numbers 45 degrees | |
theme(axis.text.x = element_text(angle=45)) + | |
# label y and x axes | |
ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") + | |
# use a log scale, with breaks at scaled values | |
scale_x_log10(breaks=c(scaled_values), labels = comma) + | |
# Add error bars to graph | |
geom_errorbar(width=0.2) | |
} | |
# From matrix, select only rows that correspond to k = 12 | |
k12<-snf2_br13_sim_stderr[1:8,] | |
# make sure/convert factors into numbers in dataframe | |
k12[,2]<-as.numeric(as.character(k12[,2])) | |
k12[,3]<-as.numeric(as.character(k12[,3])) | |
k12[,4]<-as.numeric(as.character(k12[,4])) | |
# use function "individual_plots" to make standard curves | |
library(ggplot2) | |
k12_plot<-individual_plots(k12, "k12") | |
k15<-snf2_br13_sim_stderr[9:16,] | |
k15[,2]<-as.numeric(as.character(k15[,2])) | |
k15[,3]<-as.numeric(as.character(k15[,3])) | |
k15[,4]<-as.numeric(as.character(k15[,4])) | |
library(ggplot2) | |
k15_plot<-individual_plots(k15, "k15") | |
k18<-snf2_br13_sim_stderr[17:24,] | |
k18[,2]<-as.numeric(as.character(k18[,2])) | |
k18[,3]<-as.numeric(as.character(k18[,3])) | |
k18[,4]<-as.numeric(as.character(k18[,4])) | |
library(ggplot2) | |
k18_plot<-individual_plots(k18, "k18") | |
k21<-snf2_br13_sim_stderr[25:32,] | |
k21[,2]<-as.numeric(as.character(k21[,2])) | |
k21[,3]<-as.numeric(as.character(k21[,3])) | |
k21[,4]<-as.numeric(as.character(k21[,4])) | |
library(ggplot2) | |
k21_plot<-individual_plots(k21, "k21") | |
k24<-snf2_br13_sim_stderr[33:40,] | |
k24[,2]<-as.numeric(as.character(k24[,2])) | |
k24[,3]<-as.numeric(as.character(k24[,3])) | |
k24[,4]<-as.numeric(as.character(k24[,4])) | |
library(ggplot2) | |
k24_plot<-individual_plots(k24, "k24") | |
k27<-snf2_br13_sim_stderr[41:48,] | |
k27[,2]<-as.numeric(as.character(k27[,2])) | |
k27[,3]<-as.numeric(as.character(k27[,3])) | |
k27[,4]<-as.numeric(as.character(k27[,4])) | |
library(ggplot2) | |
k27_plot<-individual_plots(k27, "k27") | |
k30<-snf2_br13_sim_stderr[49:56,] | |
k30[,2]<-as.numeric(as.character(k30[,2])) | |
k30[,3]<-as.numeric(as.character(k30[,3])) | |
k30[,4]<-as.numeric(as.character(k30[,4])) | |
library(ggplot2) | |
k30_plot<-individual_plots(k30, "k30") | |
k6<-snf2_br13_sim_stderr[57:64,] | |
k6[,2]<-as.numeric(as.character(k6[,2])) | |
k6[,3]<-as.numeric(as.character(k6[,3])) | |
k6[,4]<-as.numeric(as.character(k6[,4])) | |
library(ggplot2) | |
k6_plot<-individual_plots(k6, "k6") | |
k9<-snf2_br13_sim_stderr[65:72,] | |
k9[,2]<-as.numeric(as.character(k9[,2])) | |
k9[,3]<-as.numeric(as.character(k9[,3])) | |
k9[,4]<-as.numeric(as.character(k9[,4])) | |
library(ggplot2) | |
k9_plot<-individual_plots(k9, "k9") | |
pdf(file="trial_export_plots.pdf") | |
k6_plot; k9_plot; k12_plot; k15_plot; k18_plot; k21_plot; k24_plot; k27_plot; k30_plot | |
dev.off() | |
------------------------------------------------------------------ | |
# Plot | |
# Heirarchical clustering analysis | |
# calculate correlation | |
cor_all<-lapply(all_in, function(x) {cor(x, method="spearman")}) | |
# cor_all_spearman <- cor(all_in[[1]], method="spearman") | |
# Method options: pearson, kendall, spearman. Default pearson. | |
# calculate hierarchical cluster | |
norm_cor_all<-lapply(cor_all, function(x) {(1-x)}) | |
dist_cor_all<-lapply(norm_cor_all, function(x) {dist(x)}) | |
hc_all<-lapply(dist_cor_all, function(x) {hclust(x, method="average")}) | |
# hc_all<-hclust(dist(1-cor_all_spearman), method="average") | |
# Method: complete, average, single, mcquitty, median, centroid, ward.D, ward.D2 | |
# change labels to only ERR number | |
for(i in 1:length(hc_all)) { | |
truncate_label_regex<-"(ERR[0-9]{4,8})" | |
library(stringr) | |
labels_hc<-str_extract(hc_all[[i]]$labels, truncate_label_regex) | |
print(labels_hc) | |
hc_all[[i]]$labels<-labels_hc} | |
# Plot! | |
# pick a pretty color for SNF2 and for WT | |
color_by_type<-function(names) { | |
library(stringr) | |
# detect if WT is present in the name | |
magenta <-"(ERR458[0-9]{1,6})" | |
is_MUT<-str_detect(names, magenta) | |
# detect if SNF2 is present in the name | |
cyan <-"(ERR459[0-9]{1,6})" | |
is_WT<-str_detect(names, cyan) | |
# if WT is detected, black, else blue | |
ifelse(is_WT==F, "magenta", "cyan") | |
} | |
library(ape) | |
par(mfrow=c(1,4)) | |
# Make the plot! | |
plots_all<-sapply(hc_all, function(x) {plot.phylo(as.phylo(x), type="fan", no.margin=TRUE, font = 1, cex = 1, tip.color = color_by_type(x$labels))}) | |
# function for colors of tips (works best of Syrah) | |
color_by_type<-function(names) { | |
library(stringr) | |
# detect if WT is present in the name | |
black <-"(ERR[0-9]{4,8})(\\.)(WT)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})" | |
is_WT<-str_detect(names, black) | |
# detect if SNF2 is present in the name | |
blue <-"(ERR[0-9]{4,8})(\\.)(SNF2)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})" | |
is_MUT<-str_detect(names, blue) | |
# if WT is detected, black, else blue | |
ifelse(is_WT==T, "black", "blue") | |
} | |
# plot | |
library(ape) #package for phylo.plot | |
plot.phylo(as.phylo(hc_L2), type="fan", no.margin=TRUE, font = 1, cex = 1, tip.color = color_by_type(hc_L2$labels)) | |
plots<-plot.phylo(as.phylo(hc_all), type="fan", no.margin=TRUE, font = 1, cex = 1) | |
# Old trials before code was cleaned | |
# # import lane 2 signature (test the function) | |
# L2_sourmash_comp<-numpy_in("/Users/taylorreiter/Desktop/Titus_Brown/yeast_syrah/L2/L2_numpy.csv", "/Users/taylorreiter/Desktop/Titus_Brown/yeast_syrah/L2/yeast_syrah_L2.labels.txt") | |
# # check file integrity | |
# head(L2_sourmash_comp) | |
# # subset to remove the file names that are not comparison files | |
# csv_files<-csv_files[1:90] | |
# # Remove k = 1000--unidentified error, being ignored right now | |
# library(stringr) | |
# regex_eliminate_k1000<-"^((?!.1000.).)*$" # mimicing 'does not contain' in regex | |
# csv_files_no_k1000<-str_extract(csv_files, regex_eliminate_k1000) | |
# csv_files_no_k1000<-csv_files_no_k1000[!is.na(csv_files_no_k1000)] | |
# csv_labels_no_k1000<-str_extract(csv_labels, regex_eliminate_k1000) | |
# csv_labels_no_k1000<-csv_labels_no_k1000[!is.na(csv_labels_no_k1000)] | |
--------------------------------------------------------- | |
# Plot | |
# Heirarchical clustering analysis | |
# calculate correlation | |
# cor_L2 <- cor(L2_sourmash_comp, method="spearman") | |
# # calculate hierarchical cluster | |
# hc_L2<-hclust(dist(1-cor_L2), method="average") | |
# | |
# # function for colors of tips | |
# color_by_type<-function(names) { | |
# library(stringr) | |
# # detect if WT is present in the name | |
# black <-"(ERR[0-9]{4,8})(\\.)(WT)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})" | |
# is_WT<-str_detect(names, black) | |
# # detect if SNF2 is present in the name | |
# blue <-"(ERR[0-9]{4,8})(\\.)(SNF2)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})" | |
# is_MUT<-str_detect(names, blue) | |
# # if WT is detected, black, else blue | |
# ifelse(is_WT==T, "black", "blue") | |
# } | |
# | |
# # plot | |
# library(ape) #package for phylo.plot | |
# plot.phylo(as.phylo(hc_L2), type="fan", no.margin=TRUE, font = 1, cex = 1, tip.color = color_by_type(hc_L2$labels)) | |
# Plot | |
# Heirarchical clustering analysis | |
# calculate correlation | |
cor_all <- cor(all_in[[1]], method="spearman") | |
# # calculate hierarchical cluster | |
hc_all<-hclust(dist(1-cor_all), method="average") | |
# plot | |
library(ape) #package for phylo.plot | |
plot.phylo(as.phylo(hc_all), type="fan", no.margin=TRUE, font = 1, cex = 1) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sourmash pipeline | |
# obtain files | |
fastq-dump -A ERR458584 --gzip | |
# khmer | |
trim-low-abund.py -k 21 -C 2 --variable-coverage --gzip /Users/taylorreiter/Desktop/Titus_Brown/yeast_raw_fastq/SNF2_BR13/*.fastq.gz | |
# sourmash compute (pre-loop days) | |
# k: 18, 27, 36 | |
# scaled: 20000, 40000, 60000, 80000 | |
# Examples: | |
sourmash compute -k 30 -o SNF2_BR13_k30_480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 27 -o SNF2_BR13_k27_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 24 -o SNF2_BR13_k24_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 21 -o SNF2_BR13_k21_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 18 -o SNF2_BR13_k18_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 15 -o SNF2_BR13_k15_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 12 -o SNF2_BR13_k12_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 9 -o SNF2_BR13_k9_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
sourmash compute -k 6 -o SNF2_BR13_k6_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim | |
# Loop: | |
for k in 18 27 36 | |
do | |
for scale in 20000 40000 60000 80000 | |
do | |
sourmash compute -k $k -o WT_BR34_k${k}_s${scale}* --track-abundance --scaled $scale ./khmer_out/*abundtrim | |
done | |
done | |
# Run split.py to reformat json files (Luiz Irber) | |
# sourmash compare | |
for k in 6 9 12 15 18 21 24 27 30 | |
do | |
for scale in 500 1000 5000 10000 20000 40000 80000 160000 240000 480000 | |
do | |
sourmash compare -k $k SNF2_BR13_k${k}_s${scale}* -o compared.${k}.${scale} | |
done | |
done | |
# sourmash compare | |
for k in 18 27 37 | |
do | |
for scale in 20000 40000 60000 80000 | |
do | |
sourmash compare -k $k SNF2_BR48_k${k}_s${scale}* -o compared.${k}.${scale} | |
done | |
done | |
# R plotting |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment