Heath Blackmon coleoguy

## rosalind 2
# Transcribe DNA to RNA: every thymine (T) in the first column of the
# input file is swapped for uracil (U); all other symbols pass through.
foo <- chartr(
  old = "T",
  new = "U",
  x = as.character(read.table("rosalind_rna.txt")[[1]])
)
paste(foo)

## rosalind 3
# Reverse complement of the DNA string stored in rosalind_revc.txt.
# The input is split into single characters, each base is replaced by its
# complement, and the result is reversed and glued back into one string.
foo <- unlist(strsplit(as.character(read.table("rosalind_revc.txt")[[1]]), ""))
DNA <- c("A", "C", "G", "T")
rcDNA <- c("T", "G", "C", "A")
# chartr() translates all four bases in a single simultaneous pass and
# leaves any other symbol (e.g. "N" or lower-case letters) untouched.
# The previous logical preallocation turned such symbols into "FALSE".
foo2 <- chartr(paste(DNA, collapse = ""), paste(rcDNA, collapse = ""), foo)
paste(rev(foo2), collapse = "")

## gist:6335059
data <- as.numeric(read.table("rosalind_iprb.txt"))
k<-data[1]
m<-data[2]
n<-data[3]
t<-sum(data)
# Sire is AA Dam is anything
t.prob <- k/(t)
# Sire is Aa Dam is AA, Aa, aa
t.prob <- t.prob + (((m/(t))  * (k/(t-1))) +     #Aa x AA
((m/(t))  * ((m-1)/(t-1)) * .75) +               #Aa x Aa

## gist:6335185
# Hamming distance between the two DNA strings stored in the first two
# rows of rosalind_hamm.txt: the number of positions at which they differ.
dna <- read.table("rosalind_hamm.txt")
dna1 <- unlist(strsplit(as.character(dna[1, 1]), ""))
dna2 <- unlist(strsplit(as.character(dna[2, 1]), ""))
# The distance is only defined for strings of equal length; fail loudly
# instead of silently ignoring the tail of the longer string.
stopifnot(length(dna1) == length(dna2))
# Vectorised comparison: each TRUE (mismatch) counts as 1 in the sum.
ham.dist <- sum(dna1 != dna2)
cat(ham.dist)

## mortalrabbits
lifespan <- 18
months <- 85
pop <- vector(mode="numeric", length=months)  # this will hold the population each month as we build it up
pop[1:2] <- 1                                 # rabbits take a month to mature so we go ahead and fill in month 1 and 2
for(i in 3:months){                           # this loop will just build our population month by month
  if(i-(lifespan+1) < 0){                     # if we are early on none have died yet this if statement takes care of that
    pop[i] <- pop[i-1]+pop[i-2]-0
  }
  if(i-(lifespan+1) == 0){                    # this one takes care of the first rabbit dying
    pop[i] <- pop[i-1]+pop[i-2]-1

## dstat.R
##  ABBA BABA tests
##  Heath Blackmon
##  coleoguy@gmail.com
##  28 July 2013
##  This file has algorithms found in:

##  Durand, Eric Y., et al. "Testing for ancient admixture between
##  closely related populations." Molecular biology and evolution
##  28.8 (2011): 2239-2252.

## retro-gff3parser.py
# the gff3 file that was used as input for this script was downloaded from:
# ftp://ftp.ensemblgenomes.org/pub/metazoa/release-21/gff3/tribolium_castaneum
# on January 10, 2014

import re
## This script is just to parse out the part of the gff file that we
## are interested in (i.e., lines describing exons or protein-coding genes).


## Here we specify the name of the gff3

## retro-EEfinder.py
from Bio import SeqIO  # to deal with the FASTA files

# let's pull in our exon table: each line of exons.csv becomes one list of
# its comma-separated fields (surrounding whitespace stripped first)
# the with-block closes the file handle as soon as every row has been read
with open('exons.csv', 'r') as datafile:
    data = [row.strip().split(',') for row in datafile]

# now lets start the process of creating all of our exon-exon sequences
# by first creating a list that has the name we want to assign and the LG and

## retro-lgexonFinder.R
# NOTE(review): both setwd() calls hard-code paths under ~/Desktop; they are
# kept because the relative file names below depend on them.
setwd("~/Desktop/retrogenes/data/")
# This is a parsed GFF3 file that contains the lines for exons.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
gff <- read.csv("exons.csv", header = FALSE, as.is = TRUE)

# let's get the big ones: keep rows where column 3 minus column 2 exceeds 300
# (presumably exon end and start coordinates -- confirm against the parser)
gff <- gff[gff[, 3] - gff[, 2] > 300, ]
# This is the directory that contains the chromosomes of the genome.
setwd("~/Desktop/retrogenes/data/chromosomes")

library(ape)
# Two empty lists, presumably filled by code further down the script.
# NOTE(review): `seq` reuses the name of base::seq.
seq <- name <- list()

## retro-blast.commands
# To construct a BLAST database from the assembly in the file
# conf.scaf.fa, run this in the terminal.
# It will create a BLAST database called confscaf.
# -in names the input file, -dbtype nucl asks for a nucleotide database,
# and -out sets the name of the database that is written.

makeblastdb -in conf.scaf.fa -dbtype nucl -out confscaf

# Next we want to blast our large exons against this db.
# -outfmt '6 ...' requests tabular output restricted to the listed columns
# (query id, query start, subject id, subject start, query length,
# alignment length, percent identity); -max_target_seqs=2 caps the number of
# database sequences reported per query; -out=info writes to a file "info".
# NOTE(review): the BLAST+ manual shows options as "-flag value"; confirm
# that the "-flag=value" spelling used here is accepted by your version.
blastn -query=/outputs/300+bpExons.fa -db=/blastDB/confscaf -outfmt='6 qseqid qstart sseqid sstart qlen length pident' -max_target_seqs=2 -out=info

# this produces the lookup table that we can then use to analyze the degree of synteny conservation
	# Transcribe DNA to RNA: every thymine (T) in the first column of the
	# input file is swapped for uracil (U); all other symbols pass through.
	foo <- chartr(
	  old = "T",
	  new = "U",
	  x = as.character(read.table("rosalind_rna.txt")[[1]])
	)
	paste(foo)
	# Reverse complement of the DNA string stored in rosalind_revc.txt.
	# The input is split into single characters, each base is replaced by its
	# complement, and the result is reversed and glued back into one string.
	foo <- unlist(strsplit(as.character(read.table("rosalind_revc.txt")[[1]]), ""))
	DNA <- c("A", "C", "G", "T")
	rcDNA <- c("T", "G", "C", "A")
	# chartr() translates all four bases in a single simultaneous pass and
	# leaves any other symbol (e.g. "N" or lower-case letters) untouched.
	# The previous logical preallocation turned such symbols into "FALSE".
	foo2 <- chartr(paste(DNA, collapse = ""), paste(rcDNA, collapse = ""), foo)
	paste(rev(foo2), collapse = "")
	data <- as.numeric(read.table("rosalind_iprb.txt"))
	k<-data[1]
	m<-data[2]
	n<-data[3]
	t<-sum(data)
	# Sire is AA Dam is anything
	t.prob <- k/(t)
	# Sire is Aa Dam is AA, Aa, aa
	t.prob <- t.prob + (((m/(t)) * (k/(t-1))) + #Aa x AA
	((m/(t)) * ((m-1)/(t-1)) * .75) + #Aa x Aa
	# Hamming distance between the two DNA strings stored in the first two
	# rows of rosalind_hamm.txt: the number of positions at which they differ.
	dna <- read.table("rosalind_hamm.txt")
	dna1 <- unlist(strsplit(as.character(dna[1, 1]), ""))
	dna2 <- unlist(strsplit(as.character(dna[2, 1]), ""))
	# The distance is only defined for strings of equal length; fail loudly
	# instead of silently ignoring the tail of the longer string.
	stopifnot(length(dna1) == length(dna2))
	# Vectorised comparison: each TRUE (mismatch) counts as 1 in the sum.
	ham.dist <- sum(dna1 != dna2)
	cat(ham.dist)
	lifespan <- 18
	months <- 85
	pop <- vector(mode="numeric", length=months) # this will hold the population each month as we build it up
	pop[1:2] <- 1 # rabbits take a month to mature so we go ahead and fill in month 1 and 2
	for(i in 3:months){ # this loop will just build our population month by month
	if(i-(lifespan+1) < 0){ # if we are early on none have died yet this if statement takes care of that
	pop[i] <- pop[i-1]+pop[i-2]-0
	}
	if(i-(lifespan+1) == 0){ # this one takes care of the first rabbit dying
	pop[i] <- pop[i-1]+pop[i-2]-1
	## ABBA BABA tests
	## Heath Blackmon
	## coleoguy@gmail.com
	## 28 July 2013
	## This file has algorithms found in:

	## Durand, Eric Y., et al. "Testing for ancient admixture between
	## closely related populations." Molecular biology and evolution
	## 28.8 (2011): 2239-2252.
	# the gff3 file that was used as input for this script was downloaded from:
	# ftp://ftp.ensemblgenomes.org/pub/metazoa/release-21/gff3/tribolium_castaneum
	# on January 10, 2014

	import re
	## This script is just to parse out the part of the gff file that we
	## are interested in (i.e., lines describing exons or protein-coding genes).


	## Here we specify the name of the gff3
	from Bio import SeqIO # to deal with the FASTA files

	# let's pull in our exon table: each line of exons.csv becomes one list of
	# its comma-separated fields (surrounding whitespace stripped first)
	# the with-block closes the file handle as soon as every row has been read;
	# the comprehension also restores the loop body that had lost its indentation
	with open('exons.csv', 'r') as datafile:
	    data = [row.strip().split(',') for row in datafile]

	# now lets start the process of creating all of our exon-exon sequences
	# by first creating a list that has the name we want to assign and the LG and
	# NOTE(review): both setwd() calls hard-code paths under ~/Desktop; they are
	# kept because the relative file names below depend on them.
	setwd("~/Desktop/retrogenes/data/")
	# This is a parsed GFF3 file that contains the lines for exons.
	# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
	# variables.
	gff <- read.csv("exons.csv", header = FALSE, as.is = TRUE)

	# let's get the big ones: keep rows where column 3 minus column 2 exceeds 300
	# (presumably exon end and start coordinates -- confirm against the parser)
	gff <- gff[gff[, 3] - gff[, 2] > 300, ]
	# This is the directory that contains the chromosomes of the genome.
	setwd("~/Desktop/retrogenes/data/chromosomes")

	library(ape)
	# Two empty lists, presumably filled by code further down the script.
	# NOTE(review): `seq` reuses the name of base::seq.
	seq <- name <- list()
	# To construct a BLAST database from the assembly in the file
	# conf.scaf.fa, run this in the terminal.
	# It will create a BLAST database called confscaf.
	# -in names the input file, -dbtype nucl asks for a nucleotide database,
	# and -out sets the name of the database that is written.

	makeblastdb -in conf.scaf.fa -dbtype nucl -out confscaf

	# Next we want to blast our large exons against this db.
	# -outfmt '6 ...' requests tabular output restricted to the listed columns
	# (query id, query start, subject id, subject start, query length,
	# alignment length, percent identity); -max_target_seqs=2 caps the number of
	# database sequences reported per query; -out=info writes to a file "info".
	# NOTE(review): the BLAST+ manual shows options as "-flag value"; confirm
	# that the "-flag=value" spelling used here is accepted by your version.
	blastn -query=/outputs/300+bpExons.fa -db=/blastDB/confscaf -outfmt='6 qseqid qstart sseqid sstart qlen length pident' -max_target_seqs=2 -out=info

	# this produces the lookup table that we can then use to analyze the degree of synteny conservation