Skip to content

Instantly share code, notes, and snippets.

@taylorreiter
Last active March 15, 2017 19:35
Show Gist options
  • Save taylorreiter/4661ef8acb5adf986cbb426f3e26e439 to your computer and use it in GitHub Desktop.
Save taylorreiter/4661ef8acb5adf986cbb426f3e26e439 to your computer and use it in GitHub Desktop.
Very slop code from sourmash RNA-seq testing via the Schurch 2016 yeast dataset.
# Code to visualize numpy output from sourmash compare
# NB output was saved to csv file from python using the command:
# sourmash compare -o yeast_syrah_L2 ./*sig # completed in terminal
# import numpy
# from sourmash_lib import fig
# import pylab
# D, labels = fig.load_matrix_and_labels('yeast_syrah_L2')
# D = (D + D.T) / 2.0
# numpy.savetxt("L2_numpy.csv", D, delimiter=",")
# Orrrrrr
# for i in glob.glob('compared*'):
# if i.endswith("labels.txt") or i.endswith(".csv"):
# continue
# print i
# D, labels = fig.load_matrix_and_labels(i)
# D = (D + D.T) / 2.0
# numpy.savetxt("{}.csv".format(i), D, delimiter=",")
# label.txt files were saved as output from sourmash compute
-------------------------------------------------------------
# Functions for importing and formating data
# Function to import numpy txt file
numpy_in<-function(row) {
csv_filename = row[1]
labels_filename = row[2]
# read in numpy matrix
numpy_mat<-read.csv(csv_filename, header= F)
# import labels
numpy_labs<-read.delim(labels_filename, header = F)
# add labels to matrix
numpy_obj<-cbind(numpy_labs, numpy_mat)
numpy_obj <- data.frame(numpy_obj[,-1], row.names=numpy_obj[,1])
numpy_labs<-as.vector(numpy_labs[,1])
colnames(numpy_obj) <- numpy_labs
return(numpy_obj)
}
# Process files for downstream application
# determine k sizes used for sourmash compute and compare (regex that acts on file name)
determine_k_size<-function(input_csv_file){
library(stringr)
# regex for k size:
k_size_regex<-"\\.[0-9]{1,2}\\."
k_extraction_one<-str_extract(input_csv_file, k_size_regex)
k_size_regex2<-"[0-9]{1,2}"
k_size<-str_extract(k_extraction_one, k_size_regex2)
}
# determine scale sizes used for sourmash compute (regex that acts on file name)
determine_scale_size<-function(input_csv_file){
library(stringr)
# regex for --scale parameter:
scale_size_regex<-"\\.[0-9]{3,6}\\."
scale_extraction_one<-str_extract(input_csv_file, scale_size_regex)
scale_size_regex2<-"[0-9]{3,6}"
scale_size_regex2<-str_extract(scale_extraction_one, scale_size_regex2)
}
# Functions to enable standard curve plots
similarity<-function(sourmash_compare_matrix){
sum(sourmash_compare_matrix[upper.tri(sourmash_compare_matrix, diag=T)])/
(length(sourmash_compare_matrix[upper.tri(sourmash_compare_matrix, diag=T)]))
}
standard_error<-function(sourmash_compare_matrix) {
scm = sourmash_compare_matrix[upper.tri(sourmash_compare_matrix, diag = TRUE)]
sd(scm)/sqrt(length(scm))
}
# Apply to snf2 br13 data
# Read in files from SNF2 BR 13 full run
setwd("/Users/taylorreiter/Desktop/DIB/yeast_original_fastq/SNF2_BR13/sourmash_compare_output/")
# Read in csv files of comparisons
# Read csv file names
csv_files<-list.files(".","csv$")
length(csv_files)
# Read label file names (txt)
csv_labels<-list.files(".", "txt$")
length(csv_labels)
# bind vectors for input into numpy_in function
snf2_br13_numpy_in<-cbind(csv_files, csv_labels)
#Import files
all_snf2_br13<-apply(snf2_br13_numpy_in, 1, numpy_in)
similarity_snf2_br13<-sapply(all_snf2_br13, similarity)
standard_err_snf2_br13<-sapply(all_snf2_br13, standard_error)
k_size_snf2_br13<-sapply(csv_files, determine_k_size)
scale_size_snf2_br13<-sapply(csv_files, determine_scale_size)
# Bind numeric columns together
snf2_br13_sim_stderr<-cbind(similarity_snf2_br13, standard_err_snf2_br13, k_size_snf2_br13, scale_size_snf2_br13)
# convert from factor/character to numeric
snf2_br13_sim_stderr <- data.frame(apply(snf2_br13_sim_stderr, 2, function(x) as.numeric(as.character(x))))
# bind csv_file_name (character) column
snf2_br13_sim_stderr <-cbind(csv_files, snf2_br13_sim_stderr)
-------------------------------------------------
# apply to all
# Read in files from SNF2 BR 13 full run
setwd("/Users/taylorreiter/Desktop/DIB/yeast_original_fastq/all_compute_compare/")
# Read in csv files of comparisons
# Read csv file names
csv_files<-list.files(".","csv$")
length(csv_files)
# Read label file names (txt)
csv_labels<-list.files(".", "txt$")
length(csv_labels)
csv_labels<-csv_labels[1:12]
# bind vectors for input into numpy_in function
all_numpy_in<-cbind(csv_files, csv_labels)
#Import files
all_in<-apply(all_numpy_in, 1, numpy_in)
similarity_all_in<-sapply(all_in, similarity)
standard_err_all_in<-sapply(all_in, standard_error)
k_size_all_in<-sapply(csv_files, determine_k_size)
scale_size_all_in<-sapply(csv_files, determine_scale_size)
# Bind numeric columns together
all_in_sim_stderr<-cbind(similarity_all_in, standard_err_all_in, k_size_all_in, scale_size_all_in)
# convert from factor/character to numeric
all_in_sim_stderr <- data.frame(apply(all_in_sim_stderr, 2, function(x) as.numeric(as.character(x))))
# bind csv_file_name (character) column
all_in_sim_stderr <-cbind(csv_files, all_in_sim_stderr)
head(all_in_sim_stderr)
# Plot all data, faceted on k size
library(ggplot2)
snf2_br13_sim_stderr_plot<-ggplot(snf2_br13_sim_stderr, aes(x = scaled_values, y = similarity_snf2_br13, ymin=similarity_snf2_br13-standard_err_snf2_br13, ymax=similarity_snf2_br13+standard_err_snf2_br13)) + geom_point() + ggtitle("All") + ylim(0,1) + theme(axis.text.x = element_text(angle=45)) + ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") + scale_x_log10(breaks=c(500, 5000, 20000, 40000, 80000, 160000, 240000, 480000), labels = comma) + geom_errorbar(width=0.2) + facet_wrap(~ k_values)
snf2_br13_sim_stderr_plot<-ggplot(snf2_br13_sim_stderr, aes(x = scaled_values, y = similarity_snf2_br13, ymin=similarity_snf2_br13-standard_err_snf2_br13, ymax=similarity_snf2_br13+standard_err_snf2_br13)) + geom_point() + ggtitle("All") + ylim(0,1) + theme(axis.text.x = element_text(angle=45)) + ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") + scale_x_log10(breaks=c(500, 5000, 20000, 40000, 80000, 160000, 240000, 480000), labels = comma) + geom_errorbar(width=0.2) + facet_wrap(~ k_values)
# # plot all data, faceted on k size
# plot_all<-function(plot_data){
# library(ggplot2)
# environment = environment()
# plot<-ggplot(plot_data, aes_string(x = colnames(plot_data)[5], y = colnames(plot_data)[2], ymin=(plot_data)[2]-(plot_data)[3], (plot_data)[2]+(plot_data)[3])) +
# geom_point() + ggtitle("All") + ylim(0,1) +
# theme(axis.text.x = element_text(angle=45)) +
# ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") +
# scale_x_log10(breaks=colnames(plot_data)[5], labels = comma) +
# geom_errorbar(width=0.2)
#
# #plot + facet_wrap(~ names(plot_data)[4])
# }
# plot<-plot_all(snf2_br13_sim_stderr)
# function for individual plots
individual_plots<-function(data_to_be_plotted, title) {
library(ggplot2)
require(scales)
ggplot(data_to_be_plotted, aes(x = scaled_values, y = similarity_snf2_br13, ymin=similarity_snf2_br13-standard_err_snf2_br13, ymax=similarity_snf2_br13+standard_err_snf2_br13)) +
# Add points to the graph, and title the graph
geom_point() + ggtitle(sprintf("%s", title)) +
# Set the y axis.
# If all similarities are above .6, start at .6. If not, start at 0
ylim((min(ifelse(data_to_be_plotted[,2]>=.6, .6, 0))), 1) +
# rotate x axis numbers 45 degrees
theme(axis.text.x = element_text(angle=45)) +
# label y and x axes
ylab("Average Percent Similarity") + xlab("Sourmash Scaled Parameter") +
# use a log scale, with breaks at scaled values
scale_x_log10(breaks=c(scaled_values), labels = comma) +
# Add error bars to graph
geom_errorbar(width=0.2)
}
# From matrix, select only rows that correspond to k = 12
k12<-snf2_br13_sim_stderr[1:8,]
# make sure/convert factors into numbers in dataframe
k12[,2]<-as.numeric(as.character(k12[,2]))
k12[,3]<-as.numeric(as.character(k12[,3]))
k12[,4]<-as.numeric(as.character(k12[,4]))
# use function "individual_plots" to make standard curves
library(ggplot2)
k12_plot<-individual_plots(k12, "k12")
k15<-snf2_br13_sim_stderr[9:16,]
k15[,2]<-as.numeric(as.character(k15[,2]))
k15[,3]<-as.numeric(as.character(k15[,3]))
k15[,4]<-as.numeric(as.character(k15[,4]))
library(ggplot2)
k15_plot<-individual_plots(k15, "k15")
k18<-snf2_br13_sim_stderr[17:24,]
k18[,2]<-as.numeric(as.character(k18[,2]))
k18[,3]<-as.numeric(as.character(k18[,3]))
k18[,4]<-as.numeric(as.character(k18[,4]))
library(ggplot2)
k18_plot<-individual_plots(k18, "k18")
k21<-snf2_br13_sim_stderr[25:32,]
k21[,2]<-as.numeric(as.character(k21[,2]))
k21[,3]<-as.numeric(as.character(k21[,3]))
k21[,4]<-as.numeric(as.character(k21[,4]))
library(ggplot2)
k21_plot<-individual_plots(k21, "k21")
k24<-snf2_br13_sim_stderr[33:40,]
k24[,2]<-as.numeric(as.character(k24[,2]))
k24[,3]<-as.numeric(as.character(k24[,3]))
k24[,4]<-as.numeric(as.character(k24[,4]))
library(ggplot2)
k24_plot<-individual_plots(k24, "k24")
k27<-snf2_br13_sim_stderr[41:48,]
k27[,2]<-as.numeric(as.character(k27[,2]))
k27[,3]<-as.numeric(as.character(k27[,3]))
k27[,4]<-as.numeric(as.character(k27[,4]))
library(ggplot2)
k27_plot<-individual_plots(k27, "k27")
k30<-snf2_br13_sim_stderr[49:56,]
k30[,2]<-as.numeric(as.character(k30[,2]))
k30[,3]<-as.numeric(as.character(k30[,3]))
k30[,4]<-as.numeric(as.character(k30[,4]))
library(ggplot2)
k30_plot<-individual_plots(k30, "k30")
k6<-snf2_br13_sim_stderr[57:64,]
k6[,2]<-as.numeric(as.character(k6[,2]))
k6[,3]<-as.numeric(as.character(k6[,3]))
k6[,4]<-as.numeric(as.character(k6[,4]))
library(ggplot2)
k6_plot<-individual_plots(k6, "k6")
k9<-snf2_br13_sim_stderr[65:72,]
k9[,2]<-as.numeric(as.character(k9[,2]))
k9[,3]<-as.numeric(as.character(k9[,3]))
k9[,4]<-as.numeric(as.character(k9[,4]))
library(ggplot2)
k9_plot<-individual_plots(k9, "k9")
pdf(file="trial_export_plots.pdf")
k6_plot; k9_plot; k12_plot; k15_plot; k18_plot; k21_plot; k24_plot; k27_plot; k30_plot
dev.off()
------------------------------------------------------------------
# Plot
# Heirarchical clustering analysis
# calculate correlation
cor_all<-lapply(all_in, function(x) {cor(x, method="spearman")})
# cor_all_spearman <- cor(all_in[[1]], method="spearman")
# Method options: pearson, kendall, spearman. Default pearson.
# calculate hierarchical cluster
norm_cor_all<-lapply(cor_all, function(x) {(1-x)})
dist_cor_all<-lapply(norm_cor_all, function(x) {dist(x)})
hc_all<-lapply(dist_cor_all, function(x) {hclust(x, method="average")})
# hc_all<-hclust(dist(1-cor_all_spearman), method="average")
# Method: complete, average, single, mcquitty, median, centroid, ward.D, ward.D2
# change labels to only ERR number
for(i in 1:length(hc_all)) {
truncate_label_regex<-"(ERR[0-9]{4,8})"
library(stringr)
labels_hc<-str_extract(hc_all[[i]]$labels, truncate_label_regex)
print(labels_hc)
hc_all[[i]]$labels<-labels_hc}
# Plot!
# pick a pretty color for SNF2 and for WT
color_by_type<-function(names) {
library(stringr)
# detect if WT is present in the name
magenta <-"(ERR458[0-9]{1,6})"
is_MUT<-str_detect(names, magenta)
# detect if SNF2 is present in the name
cyan <-"(ERR459[0-9]{1,6})"
is_WT<-str_detect(names, cyan)
# if WT is detected, black, else blue
ifelse(is_WT==F, "magenta", "cyan")
}
library(ape)
par(mfrow=c(1,4))
# Make the plot!
plots_all<-sapply(hc_all, function(x) {plot.phylo(as.phylo(x), type="fan", no.margin=TRUE, font = 1, cex = 1, tip.color = color_by_type(x$labels))})
# function for colors of tips (works best of Syrah)
color_by_type<-function(names) {
library(stringr)
# detect if WT is present in the name
black <-"(ERR[0-9]{4,8})(\\.)(WT)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})"
is_WT<-str_detect(names, black)
# detect if SNF2 is present in the name
blue <-"(ERR[0-9]{4,8})(\\.)(SNF2)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})"
is_MUT<-str_detect(names, blue)
# if WT is detected, black, else blue
ifelse(is_WT==T, "black", "blue")
}
# plot
library(ape) #package for phylo.plot
plot.phylo(as.phylo(hc_L2), type="fan", no.margin=TRUE, font = 1, cex = 1, tip.color = color_by_type(hc_L2$labels))
plots<-plot.phylo(as.phylo(hc_all), type="fan", no.margin=TRUE, font = 1, cex = 1)
# Old trials before code was cleaned
# # import lane 2 signature (test the function)
# L2_sourmash_comp<-numpy_in("/Users/taylorreiter/Desktop/Titus_Brown/yeast_syrah/L2/L2_numpy.csv", "/Users/taylorreiter/Desktop/Titus_Brown/yeast_syrah/L2/yeast_syrah_L2.labels.txt")
# # check file integrity
# head(L2_sourmash_comp)
# # subset to remove the file names that are not comparison files
# csv_files<-csv_files[1:90]
# # Remove k = 1000--unidentified error, being ignored right now
# library(stringr)
# regex_eliminate_k1000<-"^((?!.1000.).)*$" # mimicing 'does not contain' in regex
# csv_files_no_k1000<-str_extract(csv_files, regex_eliminate_k1000)
# csv_files_no_k1000<-csv_files_no_k1000[!is.na(csv_files_no_k1000)]
# csv_labels_no_k1000<-str_extract(csv_labels, regex_eliminate_k1000)
# csv_labels_no_k1000<-csv_labels_no_k1000[!is.na(csv_labels_no_k1000)]
---------------------------------------------------------
# Plot
# Heirarchical clustering analysis
# calculate correlation
# cor_L2 <- cor(L2_sourmash_comp, method="spearman")
# # calculate hierarchical cluster
# hc_L2<-hclust(dist(1-cor_L2), method="average")
#
# # function for colors of tips
# color_by_type<-function(names) {
# library(stringr)
# # detect if WT is present in the name
# black <-"(ERR[0-9]{4,8})(\\.)(WT)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})"
# is_WT<-str_detect(names, black)
# # detect if SNF2 is present in the name
# blue <-"(ERR[0-9]{4,8})(\\.)(SNF2)(\\.)(BR[0-9]{1,2})(\\.)(L[0-9]{1})"
# is_MUT<-str_detect(names, blue)
# # if WT is detected, black, else blue
# ifelse(is_WT==T, "black", "blue")
# }
#
# # plot
# library(ape) #package for phylo.plot
# plot.phylo(as.phylo(hc_L2), type="fan", no.margin=TRUE, font = 1, cex = 1, tip.color = color_by_type(hc_L2$labels))
# Plot
# Heirarchical clustering analysis
# calculate correlation
cor_all <- cor(all_in[[1]], method="spearman")
# # calculate hierarchical cluster
hc_all<-hclust(dist(1-cor_all), method="average")
# plot
library(ape) #package for phylo.plot
plot.phylo(as.phylo(hc_all), type="fan", no.margin=TRUE, font = 1, cex = 1)
# Sourmash pipeline
# obtain files
fastq-dump -A ERR458584 --gzip
# khmer
trim-low-abund.py -k 21 -C 2 --variable-coverage --gzip /Users/taylorreiter/Desktop/Titus_Brown/yeast_raw_fastq/SNF2_BR13/*.fastq.gz
# sourmash compute (pre-loop days)
# k: 18, 27, 36
# scaled: 20000, 40000, 60000, 80000
# Examples:
sourmash compute -k 30 -o SNF2_BR13_k30_480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 27 -o SNF2_BR13_k27_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 24 -o SNF2_BR13_k24_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 21 -o SNF2_BR13_k21_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 18 -o SNF2_BR13_k18_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 15 -o SNF2_BR13_k15_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 12 -o SNF2_BR13_k12_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 9 -o SNF2_BR13_k9_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
sourmash compute -k 6 -o SNF2_BR13_k6_s480000 --track-abundance --scaled 480000 ./khmer_out/*abundtrim
# Loop:
for k in 18 27 36
do
for scale in 20000 40000 60000 80000
do
sourmash compute -k $k -o WT_BR34_k${k}_s${scale}* --track-abundance --scaled $scale ./khmer_out/*abundtrim
done
done
# Run split.py to reformat json files (Luiz Irber)
# sourmash compare
for k in 6 9 12 15 18 21 24 27 30
do
for scale in 500 1000 5000 10000 20000 40000 80000 160000 240000 480000
do
sourmash compare -k $k SNF2_BR13_k${k}_s${scale}* -o compared.${k}.${scale}
done
done
# sourmash compare
for k in 18 27 37
do
for scale in 20000 40000 60000 80000
do
sourmash compare -k $k SNF2_BR48_k${k}_s${scale}* -o compared.${k}.${scale}
done
done
# R plotting
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment