Create a gist now

Instantly share code, notes, and snippets.

Get the first intron and the first coding exon of each refGene transcript from the UCSC database.
# Build first.introns.bed: the first intron (in transcription order) of every
# multi-exon transcript in the refGene table of the database named by $ORG.
# Output columns (tab separated): chrom, intron start, intron end,
# name2::name, strand.
# ORG defaults to hg19 and can be overridden from the environment.
# No host/user is passed to mysql, so its own defaults and option files apply.
ORG=${ORG:-hg19}
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
strand, exonStarts, exonEnds from refGene;" \
| awk 'BEGIN {FS=OFS="\t"}
(NR>1){
    # NR==1 is the column-header row that mysql prints; skip it.
    # exonStarts ($9) and exonEnds ($10) are comma-separated lists that end
    # with a trailing comma; strip it so split() returns the real exon count.
    starts = $9; ends = $10;
    sub(/,$/, "", starts);
    sub(/,$/, "", ends);
    n = split(starts, cstarts, ",");
    split(ends, cends, ",");
    # a transcript with a single exon has no intron: emit nothing for it
    # instead of a row with an empty coordinate.
    if(n < 2){ next; }
    strand = $8;
    name = $6"::"$7
    if(strand == "+"){
        # awk arrays from split() are 1-based: the first intron lies between
        # the end of exon 1 and the start of exon 2.
        print $1,cends[1],cstarts[2],name,strand
    }
    else if(strand == "-"){
        # exons are listed in genomic order, so on the minus strand the first
        # intron lies between the last two exons.
        print $1,cends[n-1],cstarts[n],name,strand
    }
}' > first.introns.bed
# Build first.coding.exon.bed: for every coding refGene transcript, the first
# exon (in transcription order) that overlaps the coding region.
# Output columns (tab separated): chrom, exon start, exon end, name2::name,
# strand. The whole exon is printed, including any UTR portion.
# Uses the database name held in $ORG, which is set earlier in this script.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
strand, exonStarts, exonEnds from refGene;" \
| awk 'BEGIN {FS=OFS="\t"}
(NR>1){
    # NR==1 is the column-header row that mysql prints; skip it.
    if($4==$5){ next; } # noncoding: cdsStart equals cdsEnd
    # exonStarts ($9) and exonEnds ($10) are comma-separated lists that end
    # with a trailing comma; strip it so split() returns the real exon count.
    starts = $9; ends = $10;
    sub(/,$/, "", starts);
    sub(/,$/, "", ends);
    n = split(starts, cstarts, ",");
    split(ends, cends, ",");
    name = $6"::"$7
    strand = $8;
    # "+ 0" forces numeric (not string) comparison of the coordinates.
    cds_start = $4 + 0;
    cds_end = $5 + 0;
    if(strand == "+"){
        for(i=1; i <= n; i++){
            # first exon whose end lies beyond cdsStart. Coordinates are
            # half-open (end is exclusive), so an exon ending exactly at
            # cdsStart contains no coding base and must not match.
            if(cends[i] + 0 > cds_start){
                # account for UTR? this just prints entire exon...
                # could use cdsStart instead of cstarts[i]
                print $1,cstarts[i],cends[i],name,strand
                break;
            }
        }
    }
    else if(strand == "-"){
        # minus strand: walk the exons from the genomic right-hand side.
        for(i=n; i > 0; i--){
            # an exon starting exactly at cdsEnd (exclusive) contains no
            # coding base, hence the strict comparison.
            if(cstarts[i] + 0 < cds_end){
                # could use cdsEnd instead of cends[i]
                print $1,cstarts[i],cends[i],name,strand
                break;
            }
        }
    }
}' > first.coding.exon.bed
@Farhat
Farhat commented Aug 28, 2011

You can make it a tiny bit more efficient by using else instead of a second if in the first query processing.

@brentp
Owner
brentp commented Aug 29, 2011

@Farhat I changed it, but, yeah, I think it will be unnoticeable.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment