#!/bin/bash
# extract the sequence for each coordinate in a bed file
# note: bed starts are 0-based, while samtools faidx regions are 1-based inclusive
infile=$1
while read -r chr start end
do
    samtools faidx ref.fasta "$chr:$start-$end" >> test.fa
done < "$infile"
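A quick usage sketch, assuming the script above is saved as get_regions.sh (the script and bed file names are placeholders) and that ref.fasta is in the working directory:

$ samtools faidx ref.fasta                  # build the .fai index once
$ printf "chr1\t100\t200\n" > regions.bed   # one region per line: chr, start, end
$ bash get_regions.sh regions.bed           # appends each sequence to test.fa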
### This part is from the edX online Harvard course
## HarvardX: PH525.3x Advanced Statistics for the Life Sciences, week 1
library(devtools)
install_github("genomicsclass/GSE5859Subset")
library(GSE5859Subset)
data(GSE5859Subset)
dim(geneExpression)
# create a test file
$ time seq 1 10000000 > ten_million.txt
seq 1 10000000 > ten_million.txt  3.51s user 0.13s system 99% cpu 3.663 total
# it is a "big" file, 109M in size
$ ls -lh ten_million.txt
-rw-r--r--  1 Tammy  staff   109M Mar 22 20:49 ten_million.txt
$ man gshuf
# randomly select 1000 lines from it
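The selection command itself is missing from the snippet; a minimal sketch using gshuf (GNU shuf as installed by Homebrew coreutils on macOS; plain shuf on Linux — the output file name is a placeholder):

$ gshuf -n 1000 ten_million.txt > random_1000.txt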
## Overview
# The central limit theorem (CLT) states that, under certain conditions, the arithmetic mean
# of a sufficiently large number of draws of independent random variables, each with a
# well-defined expected value and variance, will be approximately normally distributed,
# regardless of the underlying distribution.
# I am going to draw 40 numbers from an exponential distribution 1000 times, calculate the
# mean of each draw (giving 1000 means), and use this simulation to check whether the
# distribution of the means is normal.
## start simulation
# number of simulations, sample size and lambda
nosim <- 1000
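The snippet stops after defining nosim; a minimal sketch of the rest, using the sample size of 40 stated above and an exponential rate lambda of 0.2 (the rate value is my assumption, not in the original):

n <- 40                                     # sample size per simulation
lambda <- 0.2                               # rate of the exponential distribution (assumed)
means <- replicate(nosim, mean(rexp(n, rate = lambda)))
hist(means)                                 # roughly bell-shaped, as the CLT predicts
qqnorm(means); qqline(means)                # quantile-quantile check against a normal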
options(stringsAsFactors = FALSE)
library(gdata)
library(parallel)
files = list.files(path = 'ctx/', pattern = '\\.bd$')
meta = read.csv("WGS.coverage.csv")
mclapply(files, function(f) {
    # files are tab-delimited with '#' comment lines; drop columns 12-14
    dat = read.delim(sprintf('ctx/%s', f), comment.char = '#', header = FALSE, as.is = TRUE)[, -(12:14)]
    message(sprintf("File: %s, Dim: (%s)", f, paste(dim(dat), collapse = ",")))
    dat
})
### I am going to demonstrate how to use the IPython notebook bash_kernel to do reproducible research.
I can run command lines in the notebook and take notes along the way.
Let's go to the directory first.
# The x axis is -3 kb to 3 kb around the TSS.
# It turns out that the heatmap.2 function uses hierarchical clustering (hclust) by default to cluster the matrix.
# Alternatively, we can use k-means clustering and see what the pattern looks like.
km <- kmeans(m1, 2)                 # choose the number of clusters; I specify 2 here
m1.kmeans <- cbind(m1, km$cluster)  # append the cluster assignment to the matrix
dim(m1.kmeans)

library(pheatmap)
m.row.sum <- cbind(m1, rowSums(m1))
o1 <- rev(order(m.row.sum[,602]))   # column 602 holds the row sums; order rows by total signal
m.row.sum <- m.row.sum[o1,]
bk <- unique(c(seq(-0.1, 3, length = 100), seq(3, 10.35, length = 100)))
hmcols <- colorRampPalette(c("white", "red"))(length(bk) - 1)
pheatmap(m.row.sum[, 1:601], cluster_rows = FALSE, cluster_cols = FALSE, color = hmcols, breaks = bk, legend = FALSE, show_rownames = FALSE, show_colnames = FALSE)
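The k-means assignment computed above is never used in the plot; a minimal sketch of one way to look at it, reordering rows by cluster before plotting (this reordering step is my assumption, not part of the original snippet):

o2 <- order(km$cluster)             # group rows from the same cluster together
pheatmap(m1[o2, ], cluster_rows = FALSE, cluster_cols = FALSE, color = hmcols, breaks = bk, legend = FALSE, show_rownames = FALSE, show_colnames = FALSE)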

compare:

cat raw_expression.txt | head
cat raw_expression.txt | awk -F"\t" '{if(NR==1) $1="NAME"FS$1}1' OFS="\t" | head 

$1 is the first field of each line.
$1="NAME"FS$1 replaces the first field with "NAME", the field separator (FS, a tab here), and the original first field, i.e. it prepends a NAME column header.
1 is always TRUE, so awk prints every line.

Then, add a second column with a dummy "na":
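The command for this step is missing from the snippet; a minimal sketch in the same style as the one above (my reconstruction, not the original command):

cat raw_expression.txt | awk -F"\t" '{$1=$1 FS "na"}1' OFS="\t" | head

This inserts "na" as a second field on every line, including the header.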

#!/bin/bash
# this script decomposes and normalizes a vcf file
# and loads it into a gemini database
# see https://gemini.readthedocs.org/en/latest/index.html
# put the following three lines at the top of every bash script to catch errors
set -e
set -u
set -o pipefail