Skip to content

Instantly share code, notes, and snippets.

@leosanbu
Created October 12, 2017 10:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leosanbu/52e147ec52d4d86ee9e9204308354f90 to your computer and use it in GitHub Desktop.
Save leosanbu/52e147ec52d4d86ee9e9204308354f90 to your computer and use it in GitHub Desktop.
Useful R functions for bioinformatics
## R functions ##
# Get GFF file #
get_GFF <- function(x){
gff.tmp <- system(paste("less", x, "| grep \\\t"), intern=TRUE)
gff.tmp <- strsplit(gff.tmp, "\t")
gff.tmp <- do.call(rbind, gff.tmp)
}
# Get subalignments with extractalign (EMBOSS) #
get_subaln_emboss <- function(aln, ini, end){
aln.tmp <- aln
range <- paste0(ini, "-", end)
outfile <- gsub("\\..*", "", aln.tmp)
outfile <- paste0("subaln_", range, ".fas")
system(paste("extractalign -sequence", aln.tmp, "-regions", range, "-outseq", outfile), wait = T)
outfile
}
# Get subalignments from metadata field #
# data is a vector with names == rownames(aln) #
get_subaln_metadata <- function(aln, data){
un <- unique(as.character(data))
data <- data[rownames(aln)]
for (i in 1:length(un)){
tmp <- un[i]
subdata <- names(data)[which(data==tmp)]
subaln <- aln[subdata,]
write.dna(subaln, paste0(tmp, ".fas"), "fasta")
}
}
# Get number of SNPs by gene #
get_nb_SNPs <- function(gff, snps){
vec <- c()
for (x in 1:nrow(gff)){
ini <- gff[x,4]
end <- gff[x,5]
wh <- which(snps %in% ini:end)
vec <- c(vec, length(wh))
}
vec
}
# Get stats on number of SNPs per gene and strain #
# Filename is embl SNP file #
get_stats_SNPs <- function(filename, gff){
snp_sites <- system(paste("less", filename, "| grep \"^FT SNP\""), intern=T)
snp_sites <- gsub("^FT SNP ", "", snp_sites)
taxa <- system(paste("less", filename, "| grep taxa"), intern=T)
taxa <- gsub("^FT /taxa=", "", taxa)
taxa <- gsub("\"", "", taxa)
taxa.len <- strsplit(taxa, "\\, ")
taxa.names <- unique(unlist(taxa.len))
m <- matrix(0, nrow(gff), length(taxa.names))
rownames(m) <- paste0("gene", 1:nrow(gff))
colnames(m) <- taxa.names
for (i in 1:length(taxa.names)){
tmp.name <- taxa.names[i]
index <- c()
for (x in 1:length(taxa.len)){
g <- grep(tmp.name, taxa.len[[x]])
if (length(g)>0){
index <- c(index, x)
}
}
# Calculate nb of SNPs per gene #
cur.snps <- as.numeric(snp_sites[index])
cur.nb.snps <- get_nb_SNPs(gff, cur.snps)
# Add to matrix #
m[,tmp.name] <- cur.nb.snps
}
# Calculate density #
gene.len <- as.numeric(gff[,5])-as.numeric(gff[,4])+1
m.dens <- m/gene.len
m.dens
}
# Get reverse complementary from a sequence #
rev_comp <- function(seq){
rc <- tolower(seq)
rc <- chartr("atcg", "tagc", rc)
rev(rc)
}
# Get dna sequences from GFF with genome #
gff2genes <- function(gff){
library(ape)
file <- get_GFF(gff)
dna <- read.dna(gff, "fasta")
outname <- gsub(".gff", "", gff)
for (i in 1:nrow(file)){
contig <- file[i,1]
ini <- as.numeric(file[i,4])
end <- as.numeric(file[i,5])
strand <- file[i,7]
tag <- gsub(".*;locus_tag=", "", file[i,9])
tag <- gsub(";.*", "", tag)
tag <- gsub("\"", "", tag)
# Select contig #
if (!is.list(dna)){ sel.contig <- dna }else{
sel.contig <- dna[[which(names(dna)==contig)]]
class(sel.contig) <- "DNAbin"
}
if (strand=="+"){
gene <- sel.contig[ini:end]
gene <- t(as.matrix(as.character(gene)))
rownames(gene) <- tag
}else{
gene <- rev_comp(sel.contig[ini:end])
gene <- t(as.matrix(gene))
rownames(gene) <- tag
}
write.dna(gene, paste0(outname, "_genes.fas"), "fasta", append = T)
}
}
# Get best hit from a blast tabular output file #
get_bestHit <- function(tab){
best.list <- list()
evs <- unique(tab[,1])
for (i in 1:length(evs)){
ev <- as.vector(evs[i])
bl <- tab[which(tab[,1]==ev),]
best <- bl[1,]
best.list[[i]] <- best
}
best.tab <- do.call(rbind, best.list)
best.tab
}
# Get a melted table of >1 list for geom_boxplot #
get_melted_lists <- function(listOfvectors){
n <- names(listOfvectors)
if (is.null(n)){ n <- paste0("list", 1:length(listOfvectors)) }
first <- cbind(rep(n[1], length(listOfvectors[[1]])), listOfvectors[[1]])
for (i in 2:length(listOfvectors)){
tmp <- cbind(rep(n[i], length(listOfvectors[[i]])), listOfvectors[[i]])
first <- rbind(first, tmp)
}
first <- as.data.frame(first)
colnames(first) <- c("name", "variable")
first$variable <- as.numeric(as.vector(first$variable))
first
}
# Concatenate contigs into a pseudogenome #
concat_seqs <- function(contigs){
library(ape)
fas <- read.dna(contigs, "fasta")
concat <- unlist(as.character(fas))
concat <- t(as.matrix(as.vector(concat)))
rownames(concat) <- contigs
class(concat)<-"DNAbin"
write.dna(concat, paste0(contigs, ".concat.fas"), "fasta")
}
# Make coords table #
coords_table <- function(window, step, max, ini=1){
coords1 <- seq(ini, max, by=step)
coords2 <- seq(ini+(window-1), max, by=step)
coords1[1] <- ini
if (length(coords1)>length(coords2)){ dif <- length(coords1)-length(coords2) }
for (i in 1:dif){ coords1 <- coords1[-length(coords1)] }
coords <- cbind(coords1, coords2)
colnames(coords) <- c("start", "end")
coords
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment