# brentp/first.ucsc.bed.sh

## first.ucsc.bed.sh
ORG=hg19

#######################################
# Filter: refGene rows -> first intron of each transcript.
# Input (stdin):  tab-separated rows preceded by one header line, columns
#                 chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name,
#                 strand, exonStarts, exonEnds (comma-separated lists,
#                 with or without a trailing comma).
# Output (stdout): chrom, intronStart, intronEnd, "name2::name", strand.
# Transcripts with fewer than two exons have no intron and are skipped
# (previously they produced a row with an empty coordinate).
#######################################
first_introns() {
  awk 'BEGIN { FS = OFS = "\t" }
    NR > 1 {
      # split() empties the target array and returns the element count.
      n = split($9, starts, ",")
      split($10, ends, ",")
      # A trailing comma yields one empty final element; ignore it.
      if (n > 0 && starts[n] == "") n--
      # Fewer than two exons means there is no intron to report.
      if (n < 2) next

      strand = $8
      name = $6 "::" $7
      if (strand == "+") {
        # awk arrays are 1-based: end of exon 1 .. start of exon 2.
        print $1, ends[1], starts[2], name, strand
      } else if (strand == "-") {
        # Last listed exon pair; assumes exons are listed in ascending
        # genomic order, so this is the first intron on the minus strand.
        print $1, ends[n - 1], starts[n], name, strand
      }
    }'
}

# Query refGene in database $ORG and write the first intron of every
# transcript to first.introns.bed.
# NOTE(review): mysql host/user are not given here; presumably they come
# from the client configuration -- confirm.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
           strand, exonStarts, exonEnds from refGene;" \
  | first_introns > first.introns.bed


#######################################
# Filter: refGene rows -> first coding exon of each transcript.
# Input (stdin):  tab-separated rows preceded by one header line, columns
#                 chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name,
#                 strand, exonStarts, exonEnds (comma-separated lists,
#                 with or without a trailing comma).
# Output (stdout): chrom, exonStart, exonEnd, "name2::name", strand.
# Rows with cdsStart == cdsEnd (noncoding) are skipped. The whole exon is
# printed, including any UTR portion; cdsStart / cdsEnd could be used
# instead of the exon boundary to trim it.
#######################################
first_coding_exons() {
  awk 'BEGIN { FS = OFS = "\t" }
    NR > 1 {
      if ($4 == $5) next   # noncoding

      # split() empties the target array and returns the element count.
      n = split($9, starts, ",")
      split($10, ends, ",")
      # A trailing comma yields one empty final element; ignore it.
      if (n > 0 && starts[n] == "") n--

      name = $6 "::" $7
      strand = $8
      if (strand == "+") {
        # First exon (ascending) whose END reaches cdsStart; +0 forces a
        # numeric comparison.
        for (i = 1; i <= n; i++) {
          if (ends[i] + 0 >= $4 + 0) {
            print $1, starts[i], ends[i], name, strand
            break
          }
        }
      } else if (strand == "-") {
        # First exon (descending) whose START is at or before cdsEnd.
        for (i = n; i >= 1; i--) {
          if (starts[i] + 0 <= $5 + 0) {
            print $1, starts[i], ends[i], name, strand
            break
          }
        }
      }
    }'
}

# Query refGene in database $ORG and write the first coding exon of every
# coding transcript to first.coding.exon.bed.
mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
           strand, exonStarts, exonEnds from refGene;" \
  | first_coding_exons > first.coding.exon.bed
	ORG=hg19

	#######################################
	# Filter: refGene rows -> first intron of each transcript.
	# Defined in this section so the section runs on its own.
	# Input (stdin):  tab-separated rows preceded by one header line, columns
	#                 chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name,
	#                 strand, exonStarts, exonEnds (comma-separated lists,
	#                 with or without a trailing comma).
	# Output (stdout): chrom, intronStart, intronEnd, "name2::name", strand.
	# Transcripts with fewer than two exons have no intron and are skipped.
	#######################################
	first_introns() {
		awk 'BEGIN { FS = OFS = "\t" }
		NR > 1 {
			# split() empties the target array and returns the element count.
			n = split($9, starts, ",")
			split($10, ends, ",")
			# A trailing comma yields one empty final element; ignore it.
			if (n > 0 && starts[n] == "") n--
			# Fewer than two exons means there is no intron to report.
			if (n < 2) next

			strand = $8
			name = $6 "::" $7
			if (strand == "+") {
				# awk arrays are 1-based: end of exon 1 .. start of exon 2.
				print $1, ends[1], starts[2], name, strand
			} else if (strand == "-") {
				# Last listed exon pair; assumes exons are listed in
				# ascending genomic order.
				print $1, ends[n - 1], starts[n], name, strand
			}
		}'
	}

	# The pipe must be a real, unescaped "|": an escaped one hands the awk
	# program to mysql as literal arguments and no filtering happens.
	mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
	strand, exonStarts, exonEnds from refGene;" \
		| first_introns > first.introns.bed


	#######################################
	# Filter: refGene rows -> first coding exon of each transcript.
	# Defined in this section so the section runs on its own.
	# Input (stdin):  tab-separated rows preceded by one header line, columns
	#                 chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name,
	#                 strand, exonStarts, exonEnds (comma-separated lists,
	#                 with or without a trailing comma).
	# Output (stdout): chrom, exonStart, exonEnd, "name2::name", strand.
	# Rows with cdsStart == cdsEnd (noncoding) are skipped. The whole exon
	# is printed, including any UTR portion.
	#######################################
	first_coding_exons() {
		awk 'BEGIN { FS = OFS = "\t" }
		NR > 1 {
			if ($4 == $5) next   # noncoding

			# split() empties the target array and returns the element count.
			n = split($9, starts, ",")
			split($10, ends, ",")
			# A trailing comma yields one empty final element; ignore it.
			if (n > 0 && starts[n] == "") n--

			name = $6 "::" $7
			strand = $8
			if (strand == "+") {
				# First exon (ascending) whose END reaches cdsStart;
				# +0 forces a numeric comparison.
				for (i = 1; i <= n; i++) {
					if (ends[i] + 0 >= $4 + 0) {
						print $1, starts[i], ends[i], name, strand
						break
					}
				}
			} else if (strand == "-") {
				# First exon (descending) whose START is at or before cdsEnd.
				for (i = n; i >= 1; i--) {
					if (starts[i] + 0 <= $5 + 0) {
						print $1, starts[i], ends[i], name, strand
						break
					}
				}
			}
		}'
	}

	# The pipe must be a real, unescaped "|": an escaped one hands the awk
	# program to mysql as literal arguments and no filtering happens.
	mysql -A -D "$ORG" -e "SELECT chrom, txStart, txEnd, cdsStart, cdsEnd, name2, name, \
	strand, exonStarts, exonEnds from refGene;" \
		| first_coding_exons > first.coding.exon.bed