Ming Tang crazyhottommy

## compare_mutect_results.sh
for i in {1..22} X Y
do
same=$(comm -12 <(cut -f1,2,4,5 keep_original_bam/TCGA-06-0125_$i.call.keep.txt | sort) <(cut -f1,2,4,5 keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt| sort) | wc -l | tr -d " ")
original=`wc -l keep_original_bam/TCGA-06-0125_$i.call.keep.txt | awk '{print $1}'`
realigned=`wc -l keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt |  awk '{print $1}'`
printf "chromosome $i, original_bam:$original, realigned_bam:$realigned, common:$same \n"
done

## maintain_header_sort.sh
## imagine we have a file with one line header, and we want to keep the header after sorting
## use subshells http://bash.cyberciti.biz/guide/What_is_a_Subshell%3F
(sed -n '1p' your_file; cat your_file | sed '1d' | sort) > sort_header.txt

## if you have two header lines and want to keep both of them:
(sed -n '1,2p' your_file; cat your_file | sed '1,2d' | sort) > sort_header.txt

## if you have many lines starting with "#" as header, like vcf files
(grep "^#" my_vcf; grep -v "^#" my_vcf | sort -k1,1V -k2,2n) > sorted.vcf

## GEOquery_gene_expression.r
library(Biobase)
library(GEOquery)


# load series and platform data from GEO

gset <- getGEO("GSE34412", GSEMatrix =TRUE)
gset<- gset[[1]]

# make proper column names to match toptable

## change_fasta_header.py
with open ("C:/Users/Tang Ming/Desktop/anotation.txt", "r") as annotation:
    anotation_dict = {}
    for line in annotation:
        line = line.split()
        if line: #test whether it is an empty line
            anotation_dict[line[0]]=line[1:]
        else:
            continue

# really should not parse the fasta file by myself. there are

## ngs2014_R_basics.r
#2014 MSU NGS R basics tutorial
#http://angus.readthedocs.org/en/2014/R_Introductory_tutorial_2014.html
#https://github.com/jrherr/quick_basic_R_tutorial/blob/master/R_tutorial.md

#pick one language, and learn it well!

#pick up a dataset, play with it!

#object-oriented programming
#functional programming

## make_dummy_file.sh
!# /usr/bin/bash
while read name
do
    echo "Name read from file - $name"
    touch $name
done < $1

## rename.sh
for fspec1 in *.gz
do
#echo $fspec1
fspec2=$(echo ${fspec1} | sed "s/\([1-4]egg\)_r\([1-2]\)_0\([1-2]\)_sub.fastq.gz/\1_R\3_00\2.fastq.gz/")
echo $fspec2
mv ${fspec1} ${fspec2}
done

## convert_ids.r

##### use bioconductor annotation packages #######

source("http://Bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
biocLite(c("GenomicFeatures", "AnnotationDbi"))

library("org.Hs.eg.db")
library("AnnotationDbi")
library("GenomicFeatures")

## dplyr_gff.r
library(dplyr)
setwd("/home/tommy/annotations/human/ensemble/")

# set the colClasses for faster reading in the data
gtf_cols <- c(seqname="factor", source="factor", feature="factor",
              start="integer", end="integer", score="character",
              strand="factor", frame="factor", attribute="character")

hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE,
                     col.names=names(gtf_cols), comment.char="#")

## Entrez_Direct.sh
# search pubmed contains "glioblastoma enhancer"
$esearch -db pubmed -query "glioblastoma enhancer"
<ENTREZ_DIRECT>
  <Db>pubmed</Db>
  <WebEnv>NCID_1_539964707_130.14.18.34_9001_1422280320_2091337226_0MetA0_S_MegaStore_F_1</WebEnv>
  <QueryKey>1</QueryKey>
  <Count>97</Count>
  <Step>1</Step>
</ENTREZ_DIRECT>
	for i in {1..22} X Y
	do
	same=$(comm -12 <(cut -f1,2,4,5 keep_original_bam/TCGA-06-0125_$i.call.keep.txt \| sort) <(cut -f1,2,4,5 keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt\| sort) \| wc -l \| tr -d " ")
	original=`wc -l keep_original_bam/TCGA-06-0125_$i.call.keep.txt \| awk '{print $1}'`
	realigned=`wc -l keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt \| awk '{print $1}'`
	printf "chromosome $i, original_bam:$original, realigned_bam:$realigned, common:$same \n"
	done
	## imagine we have a file with one line header, and we want to keep the header after sorting
	## use subshells http://bash.cyberciti.biz/guide/What_is_a_Subshell%3F
	(sed -n '1p' your_file; cat your_file \| sed '1d' \| sort) > sort_header.txt

	## if you have two header lines and want to keep both of them:
	(sed -n '1,2p' your_file; cat your_file \| sed '1,2d' \| sort) > sort_header.txt

	## if you have many lines starting with "#" as header, like vcf files
	(grep "^#" my_vcf; grep -v "^#" my_vcf \| sort -k1,1V -k2,2n) > sorted.vcf
	library(Biobase)
	library(GEOquery)


	# load series and platform data from GEO

	gset <- getGEO("GSE34412", GSEMatrix =TRUE)
	gset<- gset[[1]]

	# make proper column names to match toptable
	with open ("C:/Users/Tang Ming/Desktop/anotation.txt", "r") as annotation:
	anotation_dict = {}
	for line in annotation:
	line = line.split()
	if line: #test whether it is an empty line
	anotation_dict[line[0]]=line[1:]
	else:
	continue

	# really should not parse the fasta file by myself. there are
	#2014 MSU NGS R basics tutorial
	#http://angus.readthedocs.org/en/2014/R_Introductory_tutorial_2014.html
	#https://github.com/jrherr/quick_basic_R_tutorial/blob/master/R_tutorial.md

	#pick one language, and learn it well!

	#pick up a dataset, play with it!

	#object-oriented programming
	#functional programming
	!# /usr/bin/bash
	while read name
	do
	echo "Name read from file - $name"
	touch $name
	done < $1
	for fspec1 in *.gz
	do
	#echo $fspec1
	fspec2=$(echo ${fspec1} \| sed "s/\([1-4]egg\)_r\([1-2]\)_0\([1-2]\)_sub.fastq.gz/\1_R\3_00\2.fastq.gz/")
	echo $fspec2
	mv ${fspec1} ${fspec2}
	done

	##### use bioconductor annotation packages #######

	source("http://Bioconductor.org/biocLite.R")
	biocLite("org.Hs.eg.db")
	biocLite(c("GenomicFeatures", "AnnotationDbi"))

	library("org.Hs.eg.db")
	library("AnnotationDbi")
	library("GenomicFeatures")
	library(dplyr)
	setwd("/home/tommy/annotations/human/ensemble/")

	# set the colClasses for faster reading in the data
	gtf_cols <- c(seqname="factor", source="factor", feature="factor",
	start="integer", end="integer", score="character",
	strand="factor", frame="factor", attribute="character")

	hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE,
	col.names=names(gtf_cols), comment.char="#")
	# search pubmed contains "glioblastoma enhancer"
	$esearch -db pubmed -query "glioblastoma enhancer"
	<ENTREZ_DIRECT>
	<Db>pubmed</Db>
	<WebEnv>NCID_1_539964707_130.14.18.34_9001_1422280320_2091337226_0MetA0_S_MegaStore_F_1</WebEnv>
	<QueryKey>1</QueryKey>
	<Count>97</Count>
	<Step>1</Step>
	</ENTREZ_DIRECT>