Skip to content

Instantly share code, notes, and snippets.

@dinovski
Created April 27, 2020 20:19
Show Gist options
  • Save dinovski/1a58805a55c5c51ce85b50b3de7f1be3 to your computer and use it in GitHub Desktop.
Save dinovski/1a58805a55c5c51ce85b50b3de7f1be3 to your computer and use it in GitHub Desktop.
Add padding up and/or downstream of TSS coordinates
#!/bin/bash
BEDTOOLS=/local/build/BEDTools_2.21.0/bin
FEATURECOUNTS=/local/build/subread/subread-1.5.1-Linux-x86_64/bin/featureCounts
R=/local/build/R/R-3.4.0/bin/R
IDIR=/in/path
DIR=/out/path
GTF=gencode.vM13.annotation.gtf
CHR_LEN=mm10.chrom.sizes
grep -v "chrUn" $CHR_LEN | grep -v "random" > ${DIR}/mm10.chrom.sizes
## extract TSS coordinates and convert to BED and SAF: GeneID,Chr,Start,End,Strand
## add slop downstream TSS: for transcripts <= slop; use len transcript
## NOTE: use transcript ID (some genes have alt transcripts where len is > & < slop
mkdir -p ${DIR}
LEN=3000
GENOME=mm10
###############
## TSS based ##
###############
## Extract coord for transcripts < $LEN kb; use actual coords
grep -w transcript ${GTF} | awk -v len=$LEN '($5-$4 <= len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.short
## Extract TSS coordinates for fwd strand genes > $LEN kb
grep -w transcript ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${ODIR}/tmp.tss.long+
## Extract TSS coordinates for reverse strand genes > kb
grep -w transcript ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${ODIR}/tmp.tss.long-
## Add padding to TSS for transcripts > $LEN
cat ${DIR}/tmp.tss.long+ ${DIR}/tmp.tss.long- | sort -k1,1 -k 2n,2 |\
${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} -s -l 0 -r $LEN > ${DIR}/tmp.tss.long
cat ${DIR}/tmp.tss.long ${DIR}/tmp.tss.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\
grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_tss_plus_${LEN}.bed
rm -rf ${DIR}/tmp.tss*
################
## gene-based ##
################
## keep gene_id and gene_name
awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/gencode.vM13.gene.bed
GENE_BED=${DIR}/gencode.vM13.gene.bed
## gene bed > saf
awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' ${GENE_BED} > ${DIR}/gencode.vM13.gene.saf
## strand-specific gene start coords for plotting TSS
awk 'BEGIN{FS=OFS="\t"}($6=="+"){print $1,$2-1,$2,$4,$5,$6}' ${GENE_BED} > ${DIR}/tmp.bed
awk 'BEGIN{FS=OFS="\t"}($6=="-"){print $1,$3,$3+1,$4,$5,$6}' ${GENE_BED} >> ${DIR}/tmp.bed
sort -k1,1 -k2n,2 ${DIR}/tmp.bed > ${DIR}/mm10_gene_tss.bed
rm -rf ${DIR}/tmp.bed
## gene_id only
## TSS
cat ${DIR}/mm10_gene_tss.bed | tr '_' '\t' | cut -f 5 --complement > ${DIR}/mm10_gene_tss.gene_id.bed
##-------------
## gene body coordinates
awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6,$5,$4}' > ${DIR}/gencode.vM13.gene_id.bed
##-------------
## TSS +/- kb
GENE_TSS_BED=${DIR}/mm10_gene_tss.bed
${BEDTOOLS}/slopBed -i ${GENE_TSS_BED} -g ${CHR_LEN} -s -l $LEN -r $LEN | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' > ${DIR}/mm10_genes_$(LEN).saf
##-------------
## TSS +kb
## Extract coord for transcripts < $LEN; use actual coords
awk '($3=="gene")' ${GTF} | awk '($5-$4 <= 3000)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.short
## Extract TSS coordinates for fwd strand genes > $LEN
awk '($3=="gene")' ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long+
## Extract TSS coordinates for reverse strand genes > 3kb
awk '($3=="gene")' ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\
sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long-
## add slop to TSS for genes > slop
SLOP_OPTS='-s -l 0 -r 3000'
cat ${DIR}/tmp.gene.long+ ${DIR}/tmp.gene.long- | sort -k1,1 -k 2n,2 | ${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} ${SLOP_OPTS} > ${DIR}/tmp.gene.long
cat ${DIR}/tmp.gene.long ${DIR}/tmp.gene.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\
grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_genes_plus_${LEN}.bed
rm -rf ${DIR}/tmp.gene*
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment