Last active
June 17, 2020 11:59
-
-
Save markziemann/7d9a33d64ca934ae060a176311240cf5 to your computer and use it in GitHub Desktop.
The goal of this script is to determine the fraction of genes in each biotype class that have annotated functions, as determined by membership in either GO or REACTOME.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# The goal of this script is to determine the fraction of genes in each
# biotype class that have annotated functions, as determined by membership
# in either GO or REACTOME.

# Fetch the Ensembl release-100 human GTF annotation into the current
# directory. -c resumes a partial download and, when the file is already
# complete, avoids saving a second copy as "...gtf.gz.1" (which the rest of
# the script would never read).
if ! wget -c ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz; then
  printf 'error: could not download Homo_sapiens.GRCh38.100.gtf.gz\n' >&2
  exit 1
fi
#######################################
# Read GTF text on stdin and write one "gene_id<TAB>gene_biotype" row per
# gene feature to stdout, sorted and de-duplicated.
#
# Only records whose feature column (column 3) is exactly "gene" are used.
# The two attributes are looked up by name in column 9 instead of by their
# position among the quoted values: a positional cut silently produces rows
# without a biotype whenever an optional attribute such as gene_name is
# absent or the attribute order differs.
# Rows lacking either attribute are skipped.
#
# Arguments: none (stdin: uncompressed GTF)
# Outputs:   TSV rows on stdout
#######################################
extract_gene_biotypes() {
  awk -F'\t' '
    $3 == "gene" {
      id = ""
      biotype = ""
      # 9 and 14 are the lengths of the prefixes: gene_id " / gene_biotype "
      if (match($9, /gene_id "[^"]+"/)) {
        id = substr($9, RSTART + 9, RLENGTH - 10)
      }
      if (match($9, /gene_biotype "[^"]+"/)) {
        biotype = substr($9, RSTART + 14, RLENGTH - 15)
      }
      if (id != "" && biotype != "") {
        print id "\t" biotype
      }
    }
  ' | sort -u
}

# Build the gene -> biotype table used by the rest of the script.
gzip -dc Homo_sapiens.GRCh38.100.gtf.gz \
  | extract_gene_biotypes > biotypes.tsv
#######################################
# Count how many rows of a gene/biotype table fall into each biotype class.
# Arguments: $1 - TSV file whose second column is the biotype
# Outputs:   "uniq -c" style lines (count, then biotype) on stdout,
#            largest count first
#######################################
count_biotypes() {
  cut -f2 -- "$1" \
    | sort \
    | uniq -c \
    | sort -k1,1nr
}

# count numbers in each class
count_biotypes biotypes.tsv > biotypes_quant.tsv
#######################################
# Keep the first two tab-separated columns of a table and drop duplicate rows.
# Arguments: $1 - path to a tab-separated file
# Outputs:   sorted unique two-column rows on stdout
#######################################
unique_gene_term_pairs() {
  cut -f-2 -- "$1" | sort -u
}

# mart_export.txt is obtained by hand from ensembl biomart
# http://www.ensembl.org/biomart/martview
# select gene and GO accession
# NOTE(review): nothing else visible in this script reads mart_export_fmt.txt;
# confirm whether the later steps were meant to use it.
if [[ -r mart_export.txt ]]; then
  unique_gene_term_pairs mart_export.txt > mart_export_fmt.txt
else
  # Without this check a missing export silently yields an empty output file.
  printf 'mart_export.txt not found: export it from Ensembl BioMart first\n' >&2
fi
#######################################
# For every biotype, report how many genes it contains and how many of those
# genes have at least one annotation row carrying a given term marker.
#
# The gene/biotype table and the annotation export are each read once and
# genes are matched exactly on the first column, instead of re-scanning the
# annotation file with grep once per biotype.
#
# Arguments:
#   $1 - annotation export (tab-separated, gene ID in column 1)
#   $2 - fixed string identifying an annotated row, e.g. "GO:" or "R-HSA-"
#   $3 - gene/biotype table (gene ID, biotype); defaults to biotypes.tsv
# Outputs:
#   "biotype num_genes num_annotated_genes" lines on stdout, largest
#   num_genes first, ties ordered by biotype name
# Returns:
#   1 (with a message on stderr) if an input file cannot be read
# Notes:
#   Rows of the gene/biotype table with an empty gene ID or biotype are
#   skipped. If a gene ID is listed more than once, its first biotype wins.
#######################################
count_annotated_by_biotype() {
  local annotations=$1
  local marker=$2
  local biotypes=${3:-biotypes.tsv}
  local input

  for input in "$biotypes" "$annotations"; do
    if [[ ! -r "$input" ]]; then
      printf 'count_annotated_by_biotype: cannot read %s\n' "$input" >&2
      return 1
    fi
  done

  awk -F'\t' -v marker="$marker" '
    # First file: gene -> biotype table.
    FILENAME == ARGV[1] {
      if ($1 != "" && $2 != "" && !($1 in biotype_of)) {
        biotype_of[$1] = $2
        total[$2]++
      }
      next
    }
    # Second file: count each known gene once, on its first marked row.
    ($1 in biotype_of) && !($1 in seen) && index($0, marker) {
      seen[$1] = 1
      annotated[biotype_of[$1]]++
    }
    END {
      for (b in total) {
        print b, total[b], annotated[b] + 0
      }
    }
  ' "$biotypes" "$annotations" \
    | LC_ALL=C sort -k2,2nr -k1,1
}

# GO: mart_export.txt is the BioMart export of gene and GO accession
count_annotated_by_biotype mart_export.txt "GO:"

#########################################################
# now try reactome in mart_export2.txt from biomart
count_annotated_by_biotype mart_export2.txt "R-HSA-"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment