Will Townes willtownes

## monotone_spline.R
library(mgcv)
#library(modules) #devtools::install_github(klmr/modules)
#mgcv<-import_package("mgcv")

mspline<-function(x,y,k=10,lower=NA,upper=NA){
  #fits a monotonic spline to data
  #small values of k= more smoothing (flatter curves)
  #large values of k= more flexible (wiggly curves)
  #k is related to effective degrees of freedom and number of knots
  #use unconstrained gam to get rough parameter estimates

## mp_benchmark.py
"""
Testing python multiprocessing speed. Based on
http://blogs.warwick.ac.uk/dwatkins/entry/benchmarking_parallel_python_1_2/
"""

import time
import math
import multiprocessing as mp

def isprime(n):

## texconverter.py
"""
Python Script to extract code blocks from tex files for CS281
input: a file in current working directory with suffix *.tex
output: the same file with suffix *.py, with everything except the python parts removed
"""
#import pdb
from sys import argv
import os

def convert(ifile,ofile):

## refgene2bed.py
"""
Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
All output files have the structure expected by bedtools, namely,
CHROM POSITION1 POSITION2 REFSEQ_ID
Possible output files include:
1. distal promoter (transcription start [-5KB,-1KB]) KB means kilobase pairs, not kilobyte
2. proximal promoter (transcription start [-1KB,1KB])
3. gene body (anywhere between transcription start and transcription end)
4. transcript (anywhere in an exon)- outputs each exon as a separate line
5. first 1/3 transcript- outputs each exon as a separate line

## compbio_hw2_rnaseq_master.sh
#!/bin/bash

LOGFILE="master_job_ids.log"

echo "======Starting new analysis run========" >> $LOGFILE
names=( SRR1592185 SRR1592187 SRR1592189 SRR1592186 SRR1592188 SRR1592190 )
for ((i=0; i<${#names[@]} ; i++)) do
	echo "Running Batch Job for ${names[i]}. SLURM Job ID shown below:" >> $LOGFILE
	#make results directory
	mkdir -p ${names[i]}_results

## compbio_hw2_rnaseq_template.sh
#!/bin/bash
#SBATCH -n 1
#SBATCH -t 400
#SBATCH -p serial_requeue
#SBATCH --mem=35000
#SBATCH --mail-type=END,FAIL
#SBATCH --mail-user=my_username@mail.harvard.edu

#take filename from command line argument
FILENAME=$1

## enet_cp.r
#sigma2, df, and Cp may not be reliable in default elasticnet function enet
#provide replacements
#S2L is the estimate for sigma2 based on maximal model (full_ols_model from lm())
S2L<- sum(residuals(full_ols_model)^2)/full_ols_model$df.residual

Cp_enet<-function(enet_obj,y,X,S2L){
  #returns the Mallows Cp statistics for a given elastic net object
  betamat<-enet_obj$beta.pure
  #create a matrix where each column is prediction at one of the df levels
  yhats<-X%*%t(betamat) + mean(y)

## research_interests.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                willtownes
                / research_interests.md
            
            
              Last active
              August 29, 2015 14:06
            
              
                Research Interests
              
          
    My background

I am interested in biological problems that require both computational and statistical methods. I started out doing tropical ecology. Then I was a software tester for a while, where I learned programming. I took a bunch of statistics classes at night (two of my favorites were Bayesian Statistics and Stochastic Simulation). I am a fan of open source collaboration and plan on using software engineering techniques like automated testing, version control and continuous integration in my future research. Here are some things I want to learn more about:
Statistics


Bayesian hierarchical modeling and probabilistic graphical models
Bayesian nonparametrics, including all the stochastic processes theory beneath it (Dirichlet processes, etc)
Nonparametric and semiparametric statistics in general
Variational Inference- I find this an intriguing alternative to MCMC
Spatial Statistics (I love looking at maps)


How does the choice of projection affect things? Is there a way to do 3-D spatial


## paintball.r
# Simulation of a Paintball Game.
# The two players start off in random positions. They move one square each turn. Then they shoot at each other.
# Probability of who shoots first is 50/50.
# Probability of a hit is proportional to the distance between the players.
# Tracks history of last 3 moves.
# Prints out graphs to show the movement of the players and the shots.
# Game over when someone is hit.

shoot<-function(a.pos,b.pos){
  # returns TRUE if the shot was a hit, otherwise FALSE

## GtownMath611hw11prob1bBasicFilter.R
index2slag<-function(index){
  # Takes an index between 1 and 32 and converts it into a binary vector. This is used to iterate all of the possible values of (s_t,s_t1,s_t2,s_t3,s_t4)
  num<-paste(rev(intToBits(index-1)),"")
  return(as.logical(as.numeric(num[28:32])))
}

ref.s<-t(sapply(1:32,index2slag)) #table of all S_t1,... combinations
ref.s #structure of this reference table is crucial in step5!

slag2index<-function(slag){
	library(mgcv)
	#library(modules) #devtools::install_github(klmr/modules)
	#mgcv<-import_package("mgcv")

	mspline<-function(x,y,k=10,lower=NA,upper=NA){
	#fits a monotonic spline to data
	#small values of k= more smoothing (flatter curves)
	#large values of k= more flexible (wiggly curves)
	#k is related to effective degrees of freedom and number of knots
	#use unconstrained gam to get rough parameter estimates
	"""
	Testing python multiprocessing speed. Based on
	http://blogs.warwick.ac.uk/dwatkins/entry/benchmarking_parallel_python_1_2/
	"""

	import time
	import math
	import multiprocessing as mp

	def isprime(n):
	"""
	Python Script to extract code blocks from tex files for CS281
	input: a file in current working directory with suffix *.tex
	output: the same file with suffix *.py, with everything except the python parts removed
	"""
	#import pdb
	from sys import argv
	import os

	def convert(ifile,ofile):
	"""
	Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features.
	All output files have the structure expected by bedtools, namely,
	CHROM POSITION1 POSITION2 REFSEQ_ID
	Possible output files include:
	1. distal promoter (transcription start [-5KB,-1KB]) KB means kilobase pairs, not kilobyte
	2. proximal promoter (transcription start [-1KB,1KB])
	3. gene body (anywhere between transcription start and transcription end)
	4. transcript (anywhere in an exon)- outputs each exon as a separate line
	5. first 1/3 transcript- outputs each exon as a separate line
	#!/bin/bash

	LOGFILE="master_job_ids.log"

	echo "======Starting new analysis run========" >> $LOGFILE
	names=( SRR1592185 SRR1592187 SRR1592189 SRR1592186 SRR1592188 SRR1592190 )
	for ((i=0; i<${#names[@]} ; i++)) do
	echo "Running Batch Job for ${names[i]}. SLURM Job ID shown below:" >> $LOGFILE
	#make results directory
	mkdir -p ${names[i]}_results
	#!/bin/bash
	#SBATCH -n 1
	#SBATCH -t 400
	#SBATCH -p serial_requeue
	#SBATCH --mem=35000
	#SBATCH --mail-type=END,FAIL
	#SBATCH --mail-user=my_username@mail.harvard.edu

	#take filename from command line argument
	FILENAME=$1
	#sigma2, df, and Cp may not be reliable in default elasticnet function enet
	#provide replacements
	#S2L is the estimate for sigma2 based on maximal model (full_ols_model from lm())
	S2L<- sum(residuals(full_ols_model)^2)/full_ols_model$df.residual

	Cp_enet<-function(enet_obj,y,X,S2L){
	#returns the Mallows Cp statistics for a given elastic net object
	betamat<-enet_obj$beta.pure
	#create a matrix where each column is prediction at one of the df levels
	yhats<-X%*%t(betamat) + mean(y)
	# Simulation of a Paintball Game.
	# The two players start off in random positions. They move one square each turn. Then they shoot at each other.
	# Probability of who shoots first is 50/50.
	# Probability of a hit is proportional to the distance between the players.
	# Tracks history of last 3 moves.
	# Prints out graphs to show the movement of the players and the shots.
	# Game over when someone is hit.

	shoot<-function(a.pos,b.pos){
	# returns TRUE if the shot was a hit, otherwise FALSE
	index2slag<-function(index){
	# Takes an index between 1 and 32 and converts it into a binary vector. This is used to iterate all of the possible values of (s_t,s_t1,s_t2,s_t3,s_t4)
	num<-paste(rev(intToBits(index-1)),"")
	return(as.logical(as.numeric(num[28:32])))
	}

	ref.s<-t(sapply(1:32,index2slag)) #table of all S_t1,... combinations
	ref.s #structure of this reference table is crucial in step5!

	slag2index<-function(slag){