Created
October 12, 2017 10:44
-
-
Save leosanbu/52e147ec52d4d86ee9e9204308354f90 to your computer and use it in GitHub Desktop.
Useful R functions for bioinformatics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## R functions ## | |
# Get GFF file # | |
get_GFF <- function(x){ | |
gff.tmp <- system(paste("less", x, "| grep \\\t"), intern=TRUE) | |
gff.tmp <- strsplit(gff.tmp, "\t") | |
gff.tmp <- do.call(rbind, gff.tmp) | |
} | |
# Get subalignments with extractalign (EMBOSS) # | |
get_subaln_emboss <- function(aln, ini, end){ | |
aln.tmp <- aln | |
range <- paste0(ini, "-", end) | |
outfile <- gsub("\\..*", "", aln.tmp) | |
outfile <- paste0("subaln_", range, ".fas") | |
system(paste("extractalign -sequence", aln.tmp, "-regions", range, "-outseq", outfile), wait = T) | |
outfile | |
} | |
# Get subalignments from metadata field # | |
# data is a vector with names == rownames(aln) # | |
get_subaln_metadata <- function(aln, data){ | |
un <- unique(as.character(data)) | |
data <- data[rownames(aln)] | |
for (i in 1:length(un)){ | |
tmp <- un[i] | |
subdata <- names(data)[which(data==tmp)] | |
subaln <- aln[subdata,] | |
write.dna(subaln, paste0(tmp, ".fas"), "fasta") | |
} | |
} | |
# Get number of SNPs by gene # | |
get_nb_SNPs <- function(gff, snps){ | |
vec <- c() | |
for (x in 1:nrow(gff)){ | |
ini <- gff[x,4] | |
end <- gff[x,5] | |
wh <- which(snps %in% ini:end) | |
vec <- c(vec, length(wh)) | |
} | |
vec | |
} | |
# Get stats on number of SNPs per gene and strain # | |
# Filename is embl SNP file # | |
get_stats_SNPs <- function(filename, gff){ | |
snp_sites <- system(paste("less", filename, "| grep \"^FT SNP\""), intern=T) | |
snp_sites <- gsub("^FT SNP ", "", snp_sites) | |
taxa <- system(paste("less", filename, "| grep taxa"), intern=T) | |
taxa <- gsub("^FT /taxa=", "", taxa) | |
taxa <- gsub("\"", "", taxa) | |
taxa.len <- strsplit(taxa, "\\, ") | |
taxa.names <- unique(unlist(taxa.len)) | |
m <- matrix(0, nrow(gff), length(taxa.names)) | |
rownames(m) <- paste0("gene", 1:nrow(gff)) | |
colnames(m) <- taxa.names | |
for (i in 1:length(taxa.names)){ | |
tmp.name <- taxa.names[i] | |
index <- c() | |
for (x in 1:length(taxa.len)){ | |
g <- grep(tmp.name, taxa.len[[x]]) | |
if (length(g)>0){ | |
index <- c(index, x) | |
} | |
} | |
# Calculate nb of SNPs per gene # | |
cur.snps <- as.numeric(snp_sites[index]) | |
cur.nb.snps <- get_nb_SNPs(gff, cur.snps) | |
# Add to matrix # | |
m[,tmp.name] <- cur.nb.snps | |
} | |
# Calculate density # | |
gene.len <- as.numeric(gff[,5])-as.numeric(gff[,4])+1 | |
m.dens <- m/gene.len | |
m.dens | |
} | |
# Get reverse complementary from a sequence # | |
rev_comp <- function(seq){ | |
rc <- tolower(seq) | |
rc <- chartr("atcg", "tagc", rc) | |
rev(rc) | |
} | |
# Get dna sequences from GFF with genome # | |
gff2genes <- function(gff){ | |
library(ape) | |
file <- get_GFF(gff) | |
dna <- read.dna(gff, "fasta") | |
outname <- gsub(".gff", "", gff) | |
for (i in 1:nrow(file)){ | |
contig <- file[i,1] | |
ini <- as.numeric(file[i,4]) | |
end <- as.numeric(file[i,5]) | |
strand <- file[i,7] | |
tag <- gsub(".*;locus_tag=", "", file[i,9]) | |
tag <- gsub(";.*", "", tag) | |
tag <- gsub("\"", "", tag) | |
# Select contig # | |
if (!is.list(dna)){ sel.contig <- dna }else{ | |
sel.contig <- dna[[which(names(dna)==contig)]] | |
class(sel.contig) <- "DNAbin" | |
} | |
if (strand=="+"){ | |
gene <- sel.contig[ini:end] | |
gene <- t(as.matrix(as.character(gene))) | |
rownames(gene) <- tag | |
}else{ | |
gene <- rev_comp(sel.contig[ini:end]) | |
gene <- t(as.matrix(gene)) | |
rownames(gene) <- tag | |
} | |
write.dna(gene, paste0(outname, "_genes.fas"), "fasta", append = T) | |
} | |
} | |
# Get best hit from a blast tabular output file # | |
get_bestHit <- function(tab){ | |
best.list <- list() | |
evs <- unique(tab[,1]) | |
for (i in 1:length(evs)){ | |
ev <- as.vector(evs[i]) | |
bl <- tab[which(tab[,1]==ev),] | |
best <- bl[1,] | |
best.list[[i]] <- best | |
} | |
best.tab <- do.call(rbind, best.list) | |
best.tab | |
} | |
# Get a melted table of >1 list for geom_boxplot # | |
get_melted_lists <- function(listOfvectors){ | |
n <- names(listOfvectors) | |
if (is.null(n)){ n <- paste0("list", 1:length(listOfvectors)) } | |
first <- cbind(rep(n[1], length(listOfvectors[[1]])), listOfvectors[[1]]) | |
for (i in 2:length(listOfvectors)){ | |
tmp <- cbind(rep(n[i], length(listOfvectors[[i]])), listOfvectors[[i]]) | |
first <- rbind(first, tmp) | |
} | |
first <- as.data.frame(first) | |
colnames(first) <- c("name", "variable") | |
first$variable <- as.numeric(as.vector(first$variable)) | |
first | |
} | |
# Concatenate contigs into a pseudogenome # | |
concat_seqs <- function(contigs){ | |
library(ape) | |
fas <- read.dna(contigs, "fasta") | |
concat <- unlist(as.character(fas)) | |
concat <- t(as.matrix(as.vector(concat))) | |
rownames(concat) <- contigs | |
class(concat)<-"DNAbin" | |
write.dna(concat, paste0(contigs, ".concat.fas"), "fasta") | |
} | |
# Make coords table # | |
coords_table <- function(window, step, max, ini=1){ | |
coords1 <- seq(ini, max, by=step) | |
coords2 <- seq(ini+(window-1), max, by=step) | |
coords1[1] <- ini | |
if (length(coords1)>length(coords2)){ dif <- length(coords1)-length(coords2) } | |
for (i in 1:dif){ coords1 <- coords1[-length(coords1)] } | |
coords <- cbind(coords1, coords2) | |
colnames(coords) <- c("start", "end") | |
coords | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment