Skip to content

Instantly share code, notes, and snippets.

View crazyhottommy's full-sized avatar
🎯
Focusing

Ming Tang crazyhottommy

🎯
Focusing
View GitHub Profile
@crazyhottommy
crazyhottommy / compare_mutect_results.sh
Created August 25, 2015 19:33
compare mutect results overlapping
for i in {1..22} X Y
do
same=$(comm -12 <(cut -f1,2,4,5 keep_original_bam/TCGA-06-0125_$i.call.keep.txt | sort) <(cut -f1,2,4,5 keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt| sort) | wc -l | tr -d " ")
original=`wc -l keep_original_bam/TCGA-06-0125_$i.call.keep.txt | awk '{print $1}'`
realigned=`wc -l keep_realigned_bam/TCGA-06-0125_$i.call.keep.txt | awk '{print $1}'`
printf "chromosome $i, original_bam:$original, realigned_bam:$realigned, common:$same \n"
done
## imagine we have a file with one line header, and we want to keep the header after sorting
## use subshells http://bash.cyberciti.biz/guide/What_is_a_Subshell%3F
(sed -n '1p' your_file; cat your_file | sed '1d' | sort) > sort_header.txt
## if you have two header lines and want to keep both of them:
(sed -n '1,2p' your_file; cat your_file | sed '1,2d' | sort) > sort_header.txt
## if you have many lines starting with "#" as header, like vcf files
(grep "^#" my_vcf; grep -v "^#" my_vcf | sort -k1,1V -k2,2n) > sorted.vcf
library(Biobase)
library(GEOquery)
# load series and platform data from GEO
gset <- getGEO("GSE34412", GSEMatrix =TRUE)
gset<- gset[[1]]
# make proper column names to match toptable
with open ("C:/Users/Tang Ming/Desktop/anotation.txt", "r") as annotation:
anotation_dict = {}
for line in annotation:
line = line.split()
if line: #test whether it is an empty line
anotation_dict[line[0]]=line[1:]
else:
continue
# really should not parse the fasta file by myself. there are
#2014 MSU NGS R basics tutorial
#http://angus.readthedocs.org/en/2014/R_Introductory_tutorial_2014.html
#https://github.com/jrherr/quick_basic_R_tutorial/blob/master/R_tutorial.md
#pick one language, and learn it well!
#pick up a dataset, play with it!
#object-oriented programming
#functional programming
@crazyhottommy
crazyhottommy / make_dummy_file.sh
Created August 14, 2014 14:23
make_dummy_files
!# /usr/bin/bash
while read name
do
echo "Name read from file - $name"
touch $name
done < $1
@crazyhottommy
crazyhottommy / rename.sh
Created August 14, 2014 14:25
rename_files
for fspec1 in *.gz
do
#echo $fspec1
fspec2=$(echo ${fspec1} | sed "s/\([1-4]egg\)_r\([1-2]\)_0\([1-2]\)_sub.fastq.gz/\1_R\3_00\2.fastq.gz/")
echo $fspec2
mv ${fspec1} ${fspec2}
done
##### use bioconductor annotation packages #######
source("http://Bioconductor.org/biocLite.R")
biocLite("org.Hs.eg.db")
biocLite(c("GenomicFeatures", "AnnotationDbi"))
library("org.Hs.eg.db")
library("AnnotationDbi")
library("GenomicFeatures")
library(dplyr)
setwd("/home/tommy/annotations/human/ensemble/")
# set the colClasses for faster reading in the data
gtf_cols <- c(seqname="factor", source="factor", feature="factor",
start="integer", end="integer", score="character",
strand="factor", frame="factor", attribute="character")
hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE,
col.names=names(gtf_cols), comment.char="#")
# search pubmed contains "glioblastoma enhancer"
$esearch -db pubmed -query "glioblastoma enhancer"
<ENTREZ_DIRECT>
<Db>pubmed</Db>
<WebEnv>NCID_1_539964707_130.14.18.34_9001_1422280320_2091337226_0MetA0_S_MegaStore_F_1</WebEnv>
<QueryKey>1</QueryKey>
<Count>97</Count>
<Step>1</Step>
</ENTREZ_DIRECT>