Created
August 26, 2011 14:59
-
-
Save brentp/1173596 to your computer and use it in GitHub Desktop.
Get the first intron and the first coding exon of each refGene transcript from the UCSC database.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assembly database to query. Defaults to hg19; override from the environment,
# e.g. ORG=mm10.
ORG="${ORG:-hg19}"

# Write the first intron (in transcription order) of every refGene transcript
# to first.introns.bed with columns: chrom, start, end, name2::name, strand.
# Transcripts with fewer than two exons have no intron and are skipped, so no
# row with an empty start or end is ever emitted.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
    strand, exonStarts, exonEnds from refGene;" \
| awk 'BEGIN {FS=OFS="\t"}
(NR>1){   # skip the first row (the column-name header mysql prints by default)
    # split() clears the target array and returns the element count, so no
    # explicit delete and no length(array) call (a non-POSIX extension) is needed.
    n = split($9, cstarts, ",");
    split($10, cends, ",");
    # The exon lists end with a trailing comma, which produces one empty
    # final element; drop it so n is the real exon count.
    if (n > 0 && cstarts[n] == "") n--;
    if (n < 2) next;   # single-exon transcript: there is no intron
    strand = $8;
    name = $6 "::" $7;
    if (strand == "+") {
        # arrays filled by split() are 1-based: the intron lies between
        # the end of exon 1 and the start of exon 2
        print $1, cends[1], cstarts[2], name, strand
    }
    else if (strand == "-") {
        # on the minus strand the first intron is the last one in
        # genomic order: between exon n-1 and exon n
        print $1, cends[n-1], cstarts[n], name, strand
    }
}' > first.introns.bed
# Write the first coding exon (in transcription order) of every coding refGene
# transcript to first.coding.exon.bed with columns:
# chrom, exonStart, exonEnd, name2::name, strand.
# The whole exon is printed, including any UTR portion.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
    strand, exonStarts, exonEnds from refGene;" \
| awk 'BEGIN {FS=OFS="\t"}
(NR>1){   # skip the first row (the column-name header mysql prints by default)
    if ($4 == $5) { next; }   # cdsStart == cdsEnd: noncoding transcript
    # split() clears the target array and returns the element count.
    n = split($9, cstarts, ",");
    split($10, cends, ",");
    # Drop the empty element produced by the trailing comma so that n is
    # the real exon count whether or not the list ends with a comma.
    if (n > 0 && cstarts[n] == "") n--;
    name = $6 "::" $7;
    strand = $8;
    cds_start = $4 + 0;   # "+ 0" forces numeric rather than string comparison
    cds_end = $5 + 0;
    if (strand == "+") {
        for (i = 1; i <= n; i++) {
            # Coordinates are half-open, so an exon contains coding bases
            # only if its end lies strictly beyond cdsStart.
            if (cends[i] + 0 > cds_start) {
                # account for UTR? this just prints the entire exon...
                # could use cdsStart instead of cstarts[i]
                print $1, cstarts[i], cends[i], name, strand
                break;
            }
        }
    }
    else if (strand == "-") {
        # Walk the exons from the genomic right end, which is the
        # transcription start on the minus strand.
        for (i = n; i > 0; i--) {
            # cdsEnd is exclusive, so the exon must start strictly before it.
            if (cstarts[i] + 0 < cds_end) {
                # could use cdsEnd instead of cends[i]
                print $1, cstarts[i], cends[i], name, strand
                break;
            }
        }
    }
}' > first.coding.exon.bed
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can make it a tiny bit more efficient by using `else` instead of a second `if` when processing the output of the first query.