public
Created

Get the first intron and the first coding exon of each transcript from the UCSC database.

  • Download Gist
first.ucsc.bed.sh
Shell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
# Assembly database to query. Defaults to hg19; override from the environment,
# e.g. `ORG=mm10 sh first.ucsc.bed.sh`.
ORG=${ORG:-hg19}

# extract_first_introns: turn refGene rows into one BED line per first intron.
#
# Stdin:  tab-separated rows preceded by ONE header line (as printed by the
#         mysql client), with columns
#           chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, strand,
#           exonStarts, exonEnds
#         where exonStarts/exonEnds are comma-separated lists.
# Stdout: chrom, intronStart, intronEnd, "name2::name", strand
#
# "First" is relative to the direction of transcription: on "+" it is the
# intron between the two lowest-coordinate exons, on "-" the one between the
# two highest-coordinate exons. Transcripts with fewer than two exons have no
# intron and are skipped.
extract_first_introns() {
  awk 'BEGIN { FS = OFS = "\t" }
NR > 1 {
  # split() returns the number of pieces and clears the target array first.
  nstarts = split($9, starts, ",")
  nends = split($10, ends, ",")
  # The lists end with a trailing comma, which yields one empty last piece.
  if (nstarts > 0 && starts[nstarts] == "") nstarts--
  if (nends > 0 && ends[nends] == "") nends--
  nexons = (nstarts < nends) ? nstarts : nends
  # A single-exon transcript has no intron; printing one would emit an
  # empty coordinate column.
  if (nexons < 2) next

  name = $6 "::" $7
  strand = $8
  if (strand == "+") {
    # awk arrays filled by split() are 1-based.
    print $1, ends[1], starts[2], name, strand
  }
  else if (strand == "-") {
    print $1, ends[nexons - 1], starts[nexons], name, strand
  }
}'
}

mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
strand, exonStarts, exonEnds from refGene;" \
  | extract_first_introns > first.introns.bed
 
 
# extract_first_coding_exons: turn refGene rows into one BED line per
# transcript, giving the first exon (in the direction of transcription) that
# overlaps the coding region.
#
# Stdin:  tab-separated rows preceded by ONE header line (as printed by the
#         mysql client), with columns
#           chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, strand,
#           exonStarts, exonEnds
#         where exonStarts/exonEnds are comma-separated lists.
# Stdout: chrom, exonStart, exonEnd, "name2::name", strand
#
# The whole exon is printed, including any UTR portion; substitute cdsStart /
# cdsEnd for the exon bound if only the coding part is wanted. Rows with
# cdsStart == cdsEnd (noncoding) are skipped.
extract_first_coding_exons() {
  awk 'BEGIN { FS = OFS = "\t" }
NR > 1 {
  # Force numeric values so comparisons are never done as strings.
  cds_start = $4 + 0
  cds_end = $5 + 0
  if (cds_start == cds_end) next   # noncoding

  # split() returns the number of pieces and clears the target array first.
  nstarts = split($9, starts, ",")
  nends = split($10, ends, ",")
  # The lists end with a trailing comma, which yields one empty last piece.
  if (nstarts > 0 && starts[nstarts] == "") nstarts--
  if (nends > 0 && ends[nends] == "") nends--
  nexons = (nstarts < nends) ? nstarts : nends

  name = $6 "::" $7
  strand = $8
  if (strand == "+") {
    for (i = 1; i <= nexons; i++) {
      # UCSC coordinates are 0-based, half-open: an exon contains the
      # first coding base only if its end lies strictly past cdsStart.
      if (ends[i] + 0 > cds_start) {
        print $1, starts[i], ends[i], name, strand
        break
      }
    }
  }
  else if (strand == "-") {
    # On the minus strand transcription starts at the highest coordinate,
    # so walk the exons from last to first.
    for (i = nexons; i >= 1; i--) {
      # The exon contains the last coding base (cdsEnd - 1) only if it
      # starts strictly before cdsEnd.
      if (starts[i] + 0 < cds_end) {
        print $1, starts[i], ends[i], name, strand
        break
      }
    }
  }
}'
}

mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
strand, exonStarts, exonEnds from refGene;" \
  | extract_first_coding_exons > first.coding.exon.bed

You can make it a tiny bit more efficient by using `else if` instead of a second `if` when processing the output of the first query.

@Farhat I changed it, but, yeah, I think it will be unnoticeable.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.