leosanbu/lsb_functions.R

## lsb_functions.R

## R functions ##

# Get GFF file #
get_GFF <- function(x){
  gff.tmp <- system(paste("less", x, "| grep \\\t"), intern=TRUE)
  gff.tmp <- strsplit(gff.tmp, "\t")
  gff.tmp <- do.call(rbind, gff.tmp)
}

# Get subalignments with extractalign (EMBOSS) #
get_subaln_emboss <- function(aln, ini, end){
  aln.tmp <- aln
  range <- paste0(ini, "-", end)
  outfile <- gsub("\\..*", "", aln.tmp)
  outfile <- paste0("subaln_", range, ".fas")
  system(paste("extractalign -sequence", aln.tmp, "-regions", range, "-outseq", outfile), wait = T)
  outfile
}

# Get subalignments from metadata field #
# data is a vector with names == rownames(aln) #
get_subaln_metadata <- function(aln, data){
  un <- unique(as.character(data))
  data <- data[rownames(aln)]
  for (i in 1:length(un)){
    tmp <- un[i]
    subdata <- names(data)[which(data==tmp)]
    subaln <- aln[subdata,]
    write.dna(subaln, paste0(tmp, ".fas"), "fasta")
  }
}

# Get number of SNPs by gene #
get_nb_SNPs <- function(gff, snps){
  vec <- c()
  for (x in 1:nrow(gff)){
    ini <- gff[x,4]
    end <- gff[x,5]
    wh <- which(snps %in% ini:end)
    vec <- c(vec, length(wh))
  }
  vec
}

# Get stats on number of SNPs per gene and strain #
# Filename is embl SNP file #
get_stats_SNPs <- function(filename, gff){
  snp_sites <- system(paste("less", filename, "| grep \"^FT   SNP\""), intern=T)
  snp_sites <- gsub("^FT   SNP             ", "", snp_sites)
  taxa <- system(paste("less", filename, "| grep taxa"), intern=T)
  taxa <- gsub("^FT                   /taxa=", "", taxa)
  taxa <- gsub("\"", "", taxa)

  taxa.len <- strsplit(taxa, "\\, ")
  taxa.names <- unique(unlist(taxa.len))

  m <- matrix(0, nrow(gff), length(taxa.names))
  rownames(m) <- paste0("gene", 1:nrow(gff))
  colnames(m) <- taxa.names

  for (i in 1:length(taxa.names)){
    tmp.name <- taxa.names[i]
    index <- c()
    for (x in 1:length(taxa.len)){
      g <- grep(tmp.name, taxa.len[[x]])
      if (length(g)>0){
        index <- c(index, x)
      }
    }

    # Calculate nb of SNPs per gene #
    cur.snps <- as.numeric(snp_sites[index])
    cur.nb.snps <- get_nb_SNPs(gff, cur.snps)

    # Add to matrix #
    m[,tmp.name] <- cur.nb.snps
  }

  # Calculate density #
  gene.len <- as.numeric(gff[,5])-as.numeric(gff[,4])+1

  m.dens <- m/gene.len
  m.dens
}

# Get reverse complementary from a sequence #
rev_comp <- function(seq){
  rc <- tolower(seq)
  rc <- chartr("atcg", "tagc", rc)
  rev(rc)
}

# Get dna sequences from GFF with genome #
gff2genes <- function(gff){
  library(ape)

  file <- get_GFF(gff)
  dna <- read.dna(gff, "fasta")
  outname <- gsub(".gff", "", gff)

  for (i in 1:nrow(file)){
    contig <- file[i,1]
    ini <- as.numeric(file[i,4])
    end <- as.numeric(file[i,5])
    strand <- file[i,7]
    tag <- gsub(".*;locus_tag=", "", file[i,9])
    tag <- gsub(";.*", "", tag)
    tag <- gsub("\"", "", tag)

    # Select contig #
    if (!is.list(dna)){ sel.contig <- dna }else{
      sel.contig <- dna[[which(names(dna)==contig)]]
      class(sel.contig) <- "DNAbin"
    }

    if (strand=="+"){
      gene <- sel.contig[ini:end]
      gene <- t(as.matrix(as.character(gene)))
      rownames(gene) <- tag
    }else{
      gene <- rev_comp(sel.contig[ini:end])
      gene <- t(as.matrix(gene))
      rownames(gene) <- tag
    }

    write.dna(gene, paste0(outname, "_genes.fas"), "fasta", append = T)
  }
}

# Get best hit from a blast tabular output file #
get_bestHit <- function(tab){
  best.list <- list()
  evs <- unique(tab[,1])
  for (i in 1:length(evs)){
    ev <- as.vector(evs[i])
    bl <- tab[which(tab[,1]==ev),]
    best <- bl[1,]
    best.list[[i]] <- best
  }
  best.tab <- do.call(rbind, best.list)
  best.tab
}

# Get a melted table of >1 list for geom_boxplot #
get_melted_lists <- function(listOfvectors){
  n <- names(listOfvectors)
  if (is.null(n)){ n <- paste0("list", 1:length(listOfvectors)) }
  first <- cbind(rep(n[1], length(listOfvectors[[1]])), listOfvectors[[1]])
  for (i in 2:length(listOfvectors)){
    tmp <- cbind(rep(n[i], length(listOfvectors[[i]])), listOfvectors[[i]])
    first <- rbind(first, tmp)
  }
  first <- as.data.frame(first)
  colnames(first) <- c("name", "variable")
  first$variable <- as.numeric(as.vector(first$variable))
  first
}

# Concatenate contigs into a pseudogenome #
concat_seqs <- function(contigs){
  library(ape)
  fas <- read.dna(contigs, "fasta")
  concat <- unlist(as.character(fas))
  concat <- t(as.matrix(as.vector(concat)))
  rownames(concat) <- contigs
  class(concat)<-"DNAbin"
  write.dna(concat, paste0(contigs, ".concat.fas"), "fasta")
}

# Make coords table #
coords_table <- function(window, step, max, ini=1){
  coords1 <- seq(ini, max, by=step)
  coords2 <- seq(ini+(window-1), max, by=step)
  coords1[1] <- ini
  if (length(coords1)>length(coords2)){ dif <- length(coords1)-length(coords2) }
  for (i in 1:dif){ coords1 <- coords1[-length(coords1)] }
  coords <- cbind(coords1, coords2)
  colnames(coords) <- c("start", "end")
  coords
}

	## R functions ##

	# Get GFF file #
	get_GFF <- function(x){
	gff.tmp <- system(paste("less", x, "\| grep \\\t"), intern=TRUE)
	gff.tmp <- strsplit(gff.tmp, "\t")
	gff.tmp <- do.call(rbind, gff.tmp)
	}

	# Get subalignments with extractalign (EMBOSS) #
	get_subaln_emboss <- function(aln, ini, end){
	aln.tmp <- aln
	range <- paste0(ini, "-", end)
	outfile <- gsub("\\..*", "", aln.tmp)
	outfile <- paste0("subaln_", range, ".fas")
	system(paste("extractalign -sequence", aln.tmp, "-regions", range, "-outseq", outfile), wait = T)
	outfile
	}

	# Get subalignments from metadata field #
	# data is a vector with names == rownames(aln) #
	get_subaln_metadata <- function(aln, data){
	un <- unique(as.character(data))
	data <- data[rownames(aln)]
	for (i in 1:length(un)){
	tmp <- un[i]
	subdata <- names(data)[which(data==tmp)]
	subaln <- aln[subdata,]
	write.dna(subaln, paste0(tmp, ".fas"), "fasta")
	}
	}

	# Get number of SNPs by gene #
	get_nb_SNPs <- function(gff, snps){
	vec <- c()
	for (x in 1:nrow(gff)){
	ini <- gff[x,4]
	end <- gff[x,5]
	wh <- which(snps %in% ini:end)
	vec <- c(vec, length(wh))
	}
	vec
	}

	# Get stats on number of SNPs per gene and strain #
	# Filename is embl SNP file #
	get_stats_SNPs <- function(filename, gff){
	snp_sites <- system(paste("less", filename, "\| grep \"^FT SNP\""), intern=T)
	snp_sites <- gsub("^FT SNP ", "", snp_sites)
	taxa <- system(paste("less", filename, "\| grep taxa"), intern=T)
	taxa <- gsub("^FT /taxa=", "", taxa)
	taxa <- gsub("\"", "", taxa)

	taxa.len <- strsplit(taxa, "\\, ")
	taxa.names <- unique(unlist(taxa.len))

	m <- matrix(0, nrow(gff), length(taxa.names))
	rownames(m) <- paste0("gene", 1:nrow(gff))
	colnames(m) <- taxa.names

	for (i in 1:length(taxa.names)){
	tmp.name <- taxa.names[i]
	index <- c()
	for (x in 1:length(taxa.len)){
	g <- grep(tmp.name, taxa.len[[x]])
	if (length(g)>0){
	index <- c(index, x)
	}
	}

	# Calculate nb of SNPs per gene #
	cur.snps <- as.numeric(snp_sites[index])
	cur.nb.snps <- get_nb_SNPs(gff, cur.snps)

	# Add to matrix #
	m[,tmp.name] <- cur.nb.snps
	}

	# Calculate density #
	gene.len <- as.numeric(gff[,5])-as.numeric(gff[,4])+1

	m.dens <- m/gene.len
	m.dens
	}

	# Get reverse complementary from a sequence #
	rev_comp <- function(seq){
	rc <- tolower(seq)
	rc <- chartr("atcg", "tagc", rc)
	rev(rc)
	}

	# Get dna sequences from GFF with genome #
	gff2genes <- function(gff){
	library(ape)

	file <- get_GFF(gff)
	dna <- read.dna(gff, "fasta")
	outname <- gsub(".gff", "", gff)

	for (i in 1:nrow(file)){
	contig <- file[i,1]
	ini <- as.numeric(file[i,4])
	end <- as.numeric(file[i,5])
	strand <- file[i,7]
	tag <- gsub(".*;locus_tag=", "", file[i,9])
	tag <- gsub(";.*", "", tag)
	tag <- gsub("\"", "", tag)

	# Select contig #
	if (!is.list(dna)){ sel.contig <- dna }else{
	sel.contig <- dna[[which(names(dna)==contig)]]
	class(sel.contig) <- "DNAbin"
	}

	if (strand=="+"){
	gene <- sel.contig[ini:end]
	gene <- t(as.matrix(as.character(gene)))
	rownames(gene) <- tag
	}else{
	gene <- rev_comp(sel.contig[ini:end])
	gene <- t(as.matrix(gene))
	rownames(gene) <- tag
	}

	write.dna(gene, paste0(outname, "_genes.fas"), "fasta", append = T)
	}
	}

	# Get best hit from a blast tabular output file #
	get_bestHit <- function(tab){
	best.list <- list()
	evs <- unique(tab[,1])
	for (i in 1:length(evs)){
	ev <- as.vector(evs[i])
	bl <- tab[which(tab[,1]==ev),]
	best <- bl[1,]
	best.list[[i]] <- best
	}
	best.tab <- do.call(rbind, best.list)
	best.tab
	}

	# Get a melted table of >1 list for geom_boxplot #
	get_melted_lists <- function(listOfvectors){
	n <- names(listOfvectors)
	if (is.null(n)){ n <- paste0("list", 1:length(listOfvectors)) }
	first <- cbind(rep(n[1], length(listOfvectors[[1]])), listOfvectors[[1]])
	for (i in 2:length(listOfvectors)){
	tmp <- cbind(rep(n[i], length(listOfvectors[[i]])), listOfvectors[[i]])
	first <- rbind(first, tmp)
	}
	first <- as.data.frame(first)
	colnames(first) <- c("name", "variable")
	first$variable <- as.numeric(as.vector(first$variable))
	first
	}

	# Concatenate contigs into a pseudogenome #
	concat_seqs <- function(contigs){
	library(ape)
	fas <- read.dna(contigs, "fasta")
	concat <- unlist(as.character(fas))
	concat <- t(as.matrix(as.vector(concat)))
	rownames(concat) <- contigs
	class(concat)<-"DNAbin"
	write.dna(concat, paste0(contigs, ".concat.fas"), "fasta")
	}

	# Make coords table #
	coords_table <- function(window, step, max, ini=1){
	coords1 <- seq(ini, max, by=step)
	coords2 <- seq(ini+(window-1), max, by=step)
	coords1[1] <- ini
	if (length(coords1)>length(coords2)){ dif <- length(coords1)-length(coords2) }
	for (i in 1:dif){ coords1 <- coords1[-length(coords1)] }
	coords <- cbind(coords1, coords2)
	colnames(coords) <- c("start", "end")
	coords
	}