Get the first intron and the first coding exon of each refGene transcript from the UCSC database.
# UCSC assembly / database name to query.
ORG=hg19

# Write the first intron (in transcription order) of every multi-exon refGene
# transcript to first.introns.bed as tab-separated rows:
#   chrom, intronStart, intronEnd, "name2::name", strand
# NOTE(review): no host/user is passed to mysql, so connection settings are
# presumably supplied elsewhere (e.g. ~/.my.cnf) -- confirm.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
    strand, exonStarts, exonEnds from refGene;" \
| awk 'BEGIN {FS=OFS="\t"}
(NR>1){   # NR==1 is the column-header row emitted by mysql in batch mode
    # exonStarts/exonEnds are comma-separated lists with a trailing comma.
    # Strip it so that split() returns exactly the number of exons.
    # split() clears the target array itself, so no explicit delete is needed
    # and the non-portable length(array) call is avoided.
    starts = $9
    ends = $10
    sub(/,$/, "", starts)
    sub(/,$/, "", ends)
    n = split(starts, cstarts, ",")
    split(ends, cends, ",")

    # A transcript with fewer than two exons has no intron; printing it would
    # produce a row with an empty coordinate.
    if (n < 2) { next }

    strand = $8
    name = $6"::"$7
    if (strand == "+") {
        # awk arrays filled by split() are 1-indexed: the first intron lies
        # between the end of exon 1 and the start of exon 2.
        print $1, cends[1], cstarts[2], name, strand
    }
    else if (strand == "-") {
        # On the minus strand transcription starts at the last listed exon,
        # so the first intron lies between exons n-1 and n.
        print $1, cends[n-1], cstarts[n], name, strand
    }
}' > first.introns.bed
# Write the first coding exon (in transcription order) of every coding refGene
# transcript to first.coding.exon.bed as tab-separated rows:
#   chrom, exonStart, exonEnd, "name2::name", strand
# The whole exon is reported, including any UTR portion it contains.
# NOTE(review): no host/user is passed to mysql, so connection settings are
# presumably supplied elsewhere (e.g. ~/.my.cnf) -- confirm.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
    strand, exonStarts, exonEnds from refGene;" \
| awk 'BEGIN {FS=OFS="\t"}
(NR>1){   # NR==1 is the column-header row emitted by mysql in batch mode
    if ($4 == $5) { next }   # cdsStart == cdsEnd: noncoding transcript

    # exonStarts/exonEnds are comma-separated lists with a trailing comma.
    # Strip it so that split() returns exactly the number of exons.
    # split() clears the target array itself, so no explicit delete is needed
    # and the non-portable length(array) call is avoided.
    starts = $9
    ends = $10
    sub(/,$/, "", starts)
    sub(/,$/, "", ends)
    n = split(starts, cstarts, ",")
    split(ends, cends, ",")

    name = $6"::"$7
    strand = $8
    cds_start = $4 + 0   # "+ 0" forces numeric comparison below
    cds_end = $5 + 0

    if (strand == "+") {
        for (i = 1; i <= n; i++) {
            # Coordinates are half-open (start inclusive, end exclusive), so
            # an exon holds the first coding base only if its end lies
            # strictly past cdsStart; an exon ending exactly at cdsStart
            # contains no coding sequence.
            if ((cends[i] + 0) > cds_start) {
                # could use cdsStart instead of cstarts[i] to drop the UTR
                print $1, cstarts[i], cends[i], name, strand
                break
            }
        }
    }
    else if (strand == "-") {
        # Minus strand: walk the exons from the last one backwards.
        for (i = n; i > 0; i--) {
            # cdsEnd is exclusive, so the exon must start strictly before it
            # to contain the last coding base.
            if ((cstarts[i] + 0) < cds_end) {
                # could use cdsEnd instead of cends[i] to drop the UTR
                print $1, cstarts[i], cends[i], name, strand
                break
            }
        }
    }
}' > first.coding.exon.bed
This comment has been minimized.
This comment has been minimized.
@Farhat I changed it, but, yeah, I think it will be unnoticeable. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
You can make it a tiny bit more efficient by using else instead of a second if in the first query processing.