Last active
June 29, 2018 12:18
-
-
Save decodebiology/367befcf5216be4b1fd9 to your computer and use it in GitHub Desktop.
Convert EnsEMBL GTF to Annotation table (Geneid, GeneSymbol, GeneWiseChrLocation, GeneClass, Strand)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert an EnsEMBL GTF file into two tab-separated annotation tables.
#
# Usage:   bash <this-script> <annotation.gtf>      (requires bash)
#
# Output (written next to the input file, sorted by id):
#   <annotation.gtf>_enst_annotation.txt   one row per distinct value of the
#       2nd attribute: id, seqname, min start, max end, strand,
#       1st attribute value, 4th attribute value, 5th attribute value
#   <annotation.gtf>_ensg_annotation.txt   one row per distinct value of the
#       1st attribute: id, seqname, min start, max end, strand,
#       4th attribute value, 5th attribute value
#
# NOTE(review): attributes are selected by POSITION, not by name. This
# presumably targets the older EnsEMBL layout where the attribute column
# starts with gene_id, transcript_id and carries the gene name and gene
# biotype as 4th and 5th attributes - confirm against the GTF release used.
#
# Intermediate files live in a private mktemp directory (created inside the
# input directory so it shares its filesystem) and are removed on every exit
# path, so concurrent runs cannot collide and no user file is overwritten.

set -euo pipefail

TAB=$'\t'

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Flatten the quoted attribute column of a GTF file and keep nine fields.
# Arguments: $1 - path to the GTF file
# Outputs:   tab-separated records on stdout with the columns
#              1 seqname (GTF field 1)    2 start (field 4)   3 end (field 5)
#              4 strand (field 7)         5 1st attribute value (field 10)
#              6 2nd attribute value (12) 7 4th attribute value (16)
#              8 5th attribute value (18) 9 7th attribute value (22)
#            Field numbers refer to the line after every ` "` and `"; `
#            has been turned into a tab.
#######################################
extract_fields() {
  awk '
    BEGIN {
      FS = OFS = "\t"
      ncols = split("1,4,5,7,10,12,16,18,22", cols, ",")
    }
    # Header/comment lines and blank lines carry no feature record.
    /^#/ || /^[ \t\r]*$/ { next }
    {
      # Modifying $0 makes awk re-split the record on tabs.
      gsub(/ "/, "\t")
      gsub(/"; /, "\t")
      rec = ""
      # Short lines simply yield fewer columns (same as cut -f).
      for (i = 1; i <= ncols && (cols[i] + 0) <= NF; i++)
        rec = rec (i > 1 ? "\t" : "") $(cols[i] + 0)
      # Drop the terminator left on the last attribute of the line.
      sub(/";/, "", rec)
      print rec
    }
  ' "$1"
}

#######################################
# Collapse records that share an id into one row with the overall extent.
# The input must already be grouped by the key column; the annotation
# fields are taken from the first record of each group.
# Arguments: $1 - key column in the extract_fields layout (5 or 6)
#            $2 - 1 to also print column 5 after the strand, 0 to omit it
#            $3 - path to the grouped input file
# Outputs:   one tab-separated row per id on stdout
#######################################
summarise_by_key() {
  awk -v key="$1" -v with_gene="$2" '
    function emit() {
      if (with_gene)
        print cur, seq, lo, hi, strand, gene, attr4, attr5
      else
        print cur, seq, lo, hi, strand, attr4, attr5
    }
    BEGIN { FS = OFS = "\t"; have = 0 }
    {
      # Appending "" forces a string comparison between ids.
      id = $key ""
      if (id == "") next
      s = $2 + 0
      e = $3 + 0
      if (!have || id != cur) {
        if (have) emit()
        have = 1
        cur = id
        seq = $1; strand = $4; gene = $5; attr4 = $7; attr5 = $8
        lo = s; hi = e
        next
      }
      # Plain numeric min/max: coordinates are 1-based, so a start of 1
      # is a legitimate minimum and must not be skipped.
      if (s < lo) lo = s
      if (e > hi) hi = e
    }
    END { if (have) emit() }
  ' "$3"
}

if (( $# < 1 )); then
  printf 'Usage: %s <ensembl.gtf>\n' "${0##*/}" >&2
  exit 2
fi

gtf=$1
[[ -r "$gtf" ]] || die "Cannot read GTF file: $gtf"

date

dir=$(dirname -- "$gtf")
base=$(basename -- "$gtf")

tmpdir=$(mktemp -d "${dir}/gtf_annotation.XXXXXX") \
  || die "Cannot create a temporary directory in: $dir"
cleanup() { rm -rf -- "$tmpdir"; }
trap cleanup EXIT

echo "1/7 Preparing files"
# Parse the GTF once; both tables are derived from this intermediate file.
extract_fields "$gtf" > "${tmpdir}/fields.tsv"

echo "2/7 sorting and joining files - Transcripts"
# -s keeps file order inside each group, so the first record of a group is
# the first occurrence of that id in the GTF. LC_ALL=C makes the row order
# independent of the caller locale.
LC_ALL=C sort -s -t "$TAB" -k6,6 "${tmpdir}/fields.tsv" \
  > "${tmpdir}/enst_sorted.tsv"

echo "3/7 sorting and joining files - Genes"
LC_ALL=C sort -s -t "$TAB" -k5,5 "${tmpdir}/fields.tsv" \
  > "${tmpdir}/ensg_sorted.tsv"

echo "4/7 Distance measure - Transcripts"
summarise_by_key 6 1 "${tmpdir}/enst_sorted.tsv" > "${tmpdir}/enst_annotation.txt"
# Publish only a complete table; a failed run leaves no partial output.
mv -- "${tmpdir}/enst_annotation.txt" "${dir}/${base}_enst_annotation.txt"

echo "5/7 Distance measure - Genes"
summarise_by_key 5 0 "${tmpdir}/ensg_sorted.tsv" > "${tmpdir}/ensg_annotation.txt"
mv -- "${tmpdir}/ensg_annotation.txt" "${dir}/${base}_ensg_annotation.txt"

echo "6/7 Cleaning temporary files"
cleanup

echo "7/7 Done"
date
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
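This script converts an EnsEMBL GTF file (e.g. https://gist.githubusercontent.com/santhilalsubhash/1e7cca357e52a181dc25/raw/cfb803e07900a2baefbb6534f1299fd30cb57a29/sample.GTF) to annotation table format. It generates two files next to the input file: `<input>_enst_annotation.txt` (transcript-level table) and `<input>_ensg_annotation.txt` (gene-level table).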
Note: You can download GTF files from http://www.ensembl.org/info/data/ftp/index.html