Ming Tang crazyhottommy

## groupby.py
# this script reformats the tab delimited file like:
#FBgn00001 GO:0016301 [Name:****(annotation)]
#FBgn00002 GO:0016301 [Name:****(annotation)]
#FBgn00003 GO:0016301 [Name:****(annotation)]
#FBgn00004 GO:0003700 [Name:****(annotation)]
#FBgn00004 GO:0009651 [Name:****(annotation)]
#FBgn00004 GO:0006355 [Name:****(annotation)]
#FBgn00005 GO:0009556 [Name:****(annotation)]
#FBgn00005 GO:0005515 [Name:****(annotation)]
#FBgn00005 GO:0080019 [Name:****(annotation)]

## TSS_profile.py
#! /usr/bin/env python


# group the genes according to expression level
# analyze RNA-seq data by counting tags for each gene using HTSeq.scripts.count or bedtools multicov
# it generates a file (K562_htseq_count.out.clean) with two columns: column 1 holds the gene names, column 2 holds
# the counts that mapped to all the exons of the same gene.
# compare the counts from different methods and visualize them in the IGV browser.
# the top 30%, medium 30% and low 30% gene names were obtained with the Linux command line
# sort -k2 -nrs K562_htseq_count.out.clean | wc -l

## sam_to_bedgraph.py
import HTSeq

# Reader over the SAM alignments (HTSeq.BAM_Reader is the counterpart for
# BAM input).
alignment_file = HTSeq.SAM_Reader("SRR817000.sam")

# Set up a GenomicArray -- the HTSeq class for NGS data, whose contents are
# accessed through GenomicInterval objects. Further reading:
# http://www-huber.embl.de/users/anders/HTSeq/doc/genomic.html#genomic
coverage = HTSeq.GenomicArray(
    "auto",
    stranded=True,
    typecode='i',
)

## reverse_complement.py
# get the reverse-complement DNA sequence

def ReverseComplement1(seq):
    '''Return the reverse complement of a DNA sequence.

    Upper- and lower-case A/C/G/T are complemented with their case preserved,
    and the ambiguity code N/n maps to itself.

    seq: DNA sequence as a string (anything that supports reversed() and
        yields single-character bases also works).

    Returns a new string holding the complemented bases in reverse order;
    an empty input gives an empty string.

    Raises KeyError if seq contains a character other than A, C, G, T or N
    (in either case).
    '''
    seq_dict = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N',
        'a': 't', 't': 'a', 'g': 'c', 'c': 'g', 'n': 'n',
    }
    # Walk the sequence back-to-front and complement each base on the way.
    return "".join(seq_dict[base] for base in reversed(seq))


# make it more robust, lower case DNA

## alpha_beta_DEG.r
  library(limma)
  library(edgeR)

  # Load the tab-delimited count table; check.names = FALSE keeps the column
  # headers exactly as they are written in the file.
  x <- read.delim("counts.csv", sep = "\t", check.names = FALSE)

  # Pull out the six sample columns (a1-a3 and b1-b3).
  counts <- x[, c("a1", "a2", "a3", "b1", "b2", "b3")]

  # Keep the rows whose largest count reaches the cutoff.
  # NOTE(review): with a cutoff of 0 this presumably keeps every row when the
  # counts are non-negative -- confirm the intended threshold.
  keep <- apply(counts, MARGIN = 1, FUN = max) >= 0
  x <- x[keep, ]
  counts <- counts[keep, ]

  # 6 x 2 indicator matrix: rows 1-3 are flagged in the "alpha" column and
  # rows 4-6 in the "beta" column.
  design <- cbind(
    alpha = rep(c(1, 0), each = 3),
    beta = rep(c(0, 1), each = 3)
  )


## DESeq.r
# NOTE: the working directory is hard-coded; the input file below is
# resolved relative to it.
setwd("/home/tommy/scripts")
library(DESeq)

# Read the count table; the first line of the file is the header.
countsTable <- read.delim(
  file = "All_counts_nozero_1pseudocount_with_header.txt",
  header = TRUE
)

# Use the Gene column as row names, then drop the first column
# (presumably that same Gene column -- confirm against the input file).
rownames(countsTable) <- countsTable$Gene
countsTable <- countsTable[, -1]

head(countsTable)

# One condition label per remaining column, alternating alpha / beta.
conds <- factor(rep(c("alpha", "beta"), times = 3))

cds <- newCountDataSet(countsTable, conds)

## shTet1.r
# read GEO data sets from NCBI by GEOquery

# Hard-coded working directory; destdir = "." below refers to it.
setwd("/home/tommy/Tet1")
library(Biobase)
library(GEOquery)

# GSEMatrix must be FALSE for the result to be usable later with functions
# such as Meta(gse).
gse <- getGEO(GEO = "GSE26830", destdir = ".", GSEMatrix = FALSE)

## Tet1_CpG_promoters.py
def TSS_Profile(ifile1,ifile2):
    '''Open the inputs for a TSS profile: a sorted BAM file and a promoter BED file.

    ifile1: path to the sorted BAM file prepared by samtools; opened with
        HTSeq.BAM_Reader.
    ifile2: path to the promoters (upstream 5kb of TSS) BED file with five
        columns: chr, start, end, name and strand; opened with open().

    NOTE(review): the original docstring said "read in three files", but the
    signature takes only two paths.
    '''

    # Dependencies are imported locally, inside the function.
    import HTSeq
    import numpy
    import itertools

    sortedbamfile=HTSeq.BAM_Reader(ifile1)
    # NOTE(review): this file handle is not closed in the code visible here --
    # confirm that it is closed further down.
    promoters = open(ifile2)

## co_up_down_genes_heatmap.r
library(gplots)
getwd()
# NOTE: hard-coded working directory; the input and output files below are
# resolved relative to it.
setwd("/home/tommy/")

# header = TRUE rather than T: T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
d <- read.table("co_up_or_down_uniq.txt", header = TRUE)

# heatmap.2 works only with a matrix, so convert columns 2-3 of the data
# frame to a matrix.
m <- as.matrix(d[, 2:3])
rownames(m) <- d$genes  # gene names become the row labels

# Open a PNG device for the heatmap (use pdf(...) instead to save a PDF).
# NOTE(review): the matching dev.off() is not in the code visible here.
png(filename = "co_regulated.png", width = 400, height = 800)

## Tet1_CpG.py
# This code was modified from the TSS plot code. It can plot any other ChIP-seq signal
# at other genomic positions in addition to the TSS. In this case, it is the HRE. HIF1a ChIP-seq data
# is available; peaks were called by MACS, which generated a bed file. The middle point
# of each peak is used as the center of the plot (you can also use the summit of the peak from the Excel file
# generated by MACS). HREs at promoters are not included.
# 04/10/13

def TSS_Profile(ifile1,ifile2):
    '''Inputs, as described by the original author:

    ifile1: the sorted BAM file prepared by samtools.
    ifile2: the promoters (upstream 5kb of TSS) BED file with five columns:
        chr, start, end, name and strand.

    NOTE(review): the original docstring said "read in three files", but the
    signature takes only two paths.
    '''
	# this script reformats the tab delimited file like:
	#FBgn00001 GO:0016301 [Name:****(annotation)]
	#FBgn00002 GO:0016301 [Name:****(annotation)]
	#FBgn00003 GO:0016301 [Name:****(annotation)]
	#FBgn00004 GO:0003700 [Name:****(annotation)]
	#FBgn00004 GO:0009651 [Name:****(annotation)]
	#FBgn00004 GO:0006355 [Name:****(annotation)]
	#FBgn00005 GO:0009556 [Name:****(annotation)]
	#FBgn00005 GO:0005515 [Name:****(annotation)]
	#FBgn00005 GO:0080019 [Name:****(annotation)]
	#! /usr/bin/env python


	# group the genes according to expression level
	# analyze RNA-seq data by counting tags for each gene using HTSeq.scripts.count or bedtools multicov
	# it generates a file (K562_htseq_count.out.clean) with two columns: column 1 holds the gene names, column 2 holds
	# the counts that mapped to all the exons of the same gene.
	# compare the counts from different methods and visualize them in the IGV browser.
	# the top 30%, medium 30% and low 30% gene names were obtained with the Linux command line
	# sort -k2 -nrs K562_htseq_count.out.clean | wc -l
	import HTSeq

	alignment_file = HTSeq.SAM_Reader("SRR817000.sam")
	# HTSeq also has a BAM_Reader function to handle the bam file

	# initialize a Genomic Array (a class defined in the HTSeq package to deal with NGS data,
	# it allows transparent access of the data through the GenomicInterval object)
	# more reading http://www-huber.embl.de/users/anders/HTSeq/doc/genomic.html#genomic

	coverage = HTSeq.GenomicArray("auto", stranded = True, typecode = 'i')
	# get the reverse-complement DNA sequence

	def ReverseComplement1(seq):
	seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
	return "".join([seq_dict[base] for base in reversed(seq)])



	# make it more robust, lower case DNA
	library(limma)
	library(edgeR)

	x<-read.delim('counts.csv',skip=0, sep="\t", check.names=FALSE)
	counts <- x[,c('a1','a2','a3','b1','b2','b3')]
	keep <- apply(counts, 1, max) >= 0
	x <- x[keep,]
	counts <- counts[keep,]
	design <- matrix(data=c(1,1,1,0,0,0,0,0,0,1,1,1), nrow=6, ncol=2, dimnames = list(c(), c('alpha','beta')))
	setwd("/home/tommy/scripts")
	library("DESeq")
	countsTable<- read.delim("All_counts_nozero_1pseudocount_with_header.txt", header=TRUE)
	rownames(countsTable)<- countsTable$Gene
	countsTable<- countsTable[,-1]

	head(countsTable)
	conds<- factor(c("alpha","beta","alpha","beta","alpha","beta"))

	cds<- newCountDataSet(countsTable, conds)
	# read GEO data sets from NCBI by GEOquery

	setwd("/home/tommy/Tet1")# set the working directory
	library(Biobase)
	library(GEOquery)

	# only set the GSEMatrix to FALSE can it be parsed for later use of function like
	# Meta(gse)
	gse<- getGEO('GSE26830', GSEMatrix=FALSE, destdir=".")
	def TSS_Profile(ifile1,ifile2):
	'''read in three files, ifile1 is the sortedbamfile prepared by samtool
	ifile2 is the promoters (upstream 5kb of TSS) bed file with five columns: chr, start ,end, name and strand'''

	import HTSeq
	import numpy
	import itertools

	sortedbamfile=HTSeq.BAM_Reader(ifile1)
	promoters = open(ifile2)
	library(gplots)
	getwd()
	setwd("/home/tommy/")
	d<- read.table("co_up_or_down_uniq.txt", header=T)

	# heatmap.2 works only with matrix, convert the dataframe to matrix
	m<-as.matrix(d[,2:3])
	rownames(m)<- d$genes # add the gene names as the row lable

	png(filename = "co_regulated.png", width=400, height = 800) #save the heatmap to a png or a pdf by pdf(filename=...)
	# This code was modified from the TSS plot code. It can plot any other ChIP-seq signal
	# at other genomic positions in addition to the TSS. In this case, it is the HRE. HIF1a ChIP-seq data
	# is available; peaks were called by MACS, which generated a bed file. The middle point
	# of each peak is used as the center of the plot (you can also use the summit of the peak from the Excel file
	# generated by MACS). HREs at promoters are not included.
	# 04/10/13

	def TSS_Profile(ifile1,ifile2):
	'''read in three files, ifile1 is the sortedbamfile prepared by samtool
	ifile2 is the promoters (upstream 5kb of TSS) bed file with five columns: chr, start ,end, name and strand'''