# markziemann/genefunc.sh

## genefunc.sh
#!/bin/bash

# the goal of this script is to determine the fraction of genes in each
# biotype class that have annotated functions as determined by membership
# in either GO or REACTOME.

GTF=Homo_sapiens.GRCh38.100.gtf.gz

# Fetch the Ensembl release-100 human annotation unless a copy is already
# present. The transfer is written to a temporary name first so that an
# interrupted download is never mistaken for a complete file on a later run.
if [[ ! -s "$GTF" ]]; then
  if ! wget -O "${GTF}.part" \
    "ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/${GTF}"; then
    printf 'error: could not download %s\n' "$GTF" >&2
    exit 1
  fi
  mv -- "${GTF}.part" "$GTF" || exit 1
fi

# Write "gene ID <TAB> biotype" for every gene record to biotypes.tsv.
# Records are selected on the feature column (field 3) and the two values are
# looked up by attribute key, so the result does not depend on how many other
# attributes (gene_name, gene_source, ...) a record happens to carry.
zcat "$GTF" \
  | awk -F'\t' -v OFS='\t' '
      $3 == "gene" {
        id = biotype = ""
        # 9 and 14 are the lengths of the key, the blank and the opening quote.
        if (match($9, /gene_id "[^"]*"/))
          id = substr($9, RSTART + 9, RLENGTH - 10)
        if (match($9, /gene_biotype "[^"]*"/))
          biotype = substr($9, RSTART + 14, RLENGTH - 15)
        if (id != "" && biotype != "")
          print id, biotype
      }
    ' \
  | sort -u > biotypes.tsv

# Tally the genes in each biotype class (column 2 of biotypes.tsv) and store
# the counts in biotypes_quant.tsv, most populous class first.
cut -f 2 biotypes.tsv \
  | sort \
  | uniq -c \
  | sort -k 1nr \
  > biotypes_quant.tsv

# mart_export.txt is a manual download from Ensembl BioMart
# (http://www.ensembl.org/biomart/martview) with the gene and GO accession
# attributes selected. Keep the first two columns and drop repeated rows.
cut -f 1,2 mart_export.txt \
  | sort -u \
  > mart_export_fmt.txt


#######################################
# Report, for every biotype, how many genes it contains and how many of those
# genes have at least one annotation from a given source.
# Arguments:
#   $1 - biotype table: gene ID <TAB> biotype, one gene per line
#   $2 - BioMart export with the gene ID in column 1
#   $3 - fixed string that marks an accession of interest (e.g. "GO:")
# Outputs:
#   "biotype total annotated" lines on stdout, ordered by total, largest first
# Returns:
#   1 (with a message on stderr) if either input file cannot be read
#######################################
summarise_annotation() {
  local biotype_table=$1 mart_table=$2 accession_tag=$3
  local input

  for input in "$biotype_table" "$mart_table"; do
    if [[ ! -r "$input" ]]; then
      printf 'error: cannot read %s\n' "$input" >&2
      return 1
    fi
  done

  # Both tables are read in a single pass. The src=... operands are awk
  # variable assignments applied just before the file that follows them is
  # opened, so each line is attributed to the right table even when one of
  # the files is empty.
  awk -F'\t' -v tag="$accession_tag" '
    # BioMart rows: remember every gene that has a tagged accession.
    src == "mart" {
      if (index($0, tag) > 0) annotated[$1] = 1
      next
    }
    # Biotype rows: count each gene once; rows without a biotype are ignored.
    NF >= 2 && !(($1, $2) in seen) {
      seen[$1, $2] = 1
      total[$2]++
      if ($1 in annotated) with_tag[$2]++
    }
    END {
      for (biotype in total)
        print biotype, total[biotype], with_tag[biotype] + 0
    }
  ' src=mart "$mart_table" src=genes "$biotype_table" \
    | sort -k2gr
}

# Gene Ontology coverage, using the BioMart export mart_export.txt.
summarise_annotation biotypes.tsv mart_export.txt "GO:"

#########################################################

# Reactome coverage, using the BioMart export mart_export2.txt.
summarise_annotation biotypes.tsv mart_export2.txt "R-HSA-"
	#!/bin/bash

	# the goal of this script is to determine the fraction of genes in each
	# biotype class that have annotated functions as determined by membership
	# in either GO or REACTOME.

	GTF=Homo_sapiens.GRCh38.100.gtf.gz

	# Fetch the Ensembl release-100 human annotation unless a copy is already
	# present. The transfer is written to a temporary name first so that an
	# interrupted download is never mistaken for a complete file on a later run.
	if [[ ! -s "$GTF" ]]; then
	  if ! wget -O "${GTF}.part" \
	    "ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/${GTF}"; then
	    printf 'error: could not download %s\n' "$GTF" >&2
	    exit 1
	  fi
	  mv -- "${GTF}.part" "$GTF" || exit 1
	fi

	# Write "gene ID <TAB> biotype" for every gene record to biotypes.tsv.
	# The stages are joined with real pipes; an escaped pipe would be handed to
	# zcat as a literal argument. Records are selected on the feature column
	# (field 3) and the values are looked up by attribute key, so the result
	# does not depend on how many other attributes a record happens to carry.
	zcat "$GTF" \
	  | awk -F'\t' -v OFS='\t' '
	      $3 == "gene" {
	        id = biotype = ""
	        # 9 and 14 are the lengths of the key, the blank and the opening quote.
	        if (match($9, /gene_id "[^"]*"/))
	          id = substr($9, RSTART + 9, RLENGTH - 10)
	        if (match($9, /gene_biotype "[^"]*"/))
	          biotype = substr($9, RSTART + 14, RLENGTH - 15)
	        if (id != "" && biotype != "")
	          print id, biotype
	      }
	    ' \
	  | sort -u > biotypes.tsv

	# Tally the genes in each biotype class (column 2 of biotypes.tsv) and store
	# the counts in biotypes_quant.tsv, most populous class first. The stages
	# must be joined by unescaped pipes to form a pipeline.
	cut -f2 biotypes.tsv \
	  | sort \
	  | uniq -c \
	  | sort -k1nr > biotypes_quant.tsv

	# mart_export.txt is a manual download from Ensembl BioMart
	# (http://www.ensembl.org/biomart/martview) with the gene and GO accession
	# attributes selected. Keep the first two columns and drop repeated rows;
	# the two commands must be joined by an unescaped pipe.
	cut -f-2 mart_export.txt \
	  | sort -u > mart_export_fmt.txt


	#######################################
	# Report, for every biotype, how many genes it contains and how many of those
	# genes have at least one annotation from a given source.
	# Arguments:
	#   $1 - biotype table: gene ID <TAB> biotype, one gene per line
	#   $2 - BioMart export with the gene ID in column 1
	#   $3 - fixed string that marks an accession of interest (e.g. "GO:")
	# Outputs:
	#   "biotype total annotated" lines on stdout, ordered by total, largest first
	# Returns:
	#   1 (with a message on stderr) if either input file cannot be read
	#######################################
	summarise_annotation() {
	  local biotype_table=$1 mart_table=$2 accession_tag=$3
	  local input

	  for input in "$biotype_table" "$mart_table"; do
	    if [[ ! -r "$input" ]]; then
	      printf 'error: cannot read %s\n' "$input" >&2
	      return 1
	    fi
	  done

	  # Both tables are read in a single pass. The src=... operands are awk
	  # variable assignments applied just before the file that follows them is
	  # opened, so each line is attributed to the right table even when one of
	  # the files is empty.
	  awk -F'\t' -v tag="$accession_tag" '
	    # BioMart rows: remember every gene that has a tagged accession.
	    src == "mart" {
	      if (index($0, tag) > 0) annotated[$1] = 1
	      next
	    }
	    # Biotype rows: count each gene once; rows without a biotype are ignored.
	    NF >= 2 && !(($1, $2) in seen) {
	      seen[$1, $2] = 1
	      total[$2]++
	      if ($1 in annotated) with_tag[$2]++
	    }
	    END {
	      for (biotype in total)
	        print biotype, total[biotype], with_tag[biotype] + 0
	    }
	  ' src=mart "$mart_table" src=genes "$biotype_table" \
	    | sort -k2gr
	}

	# Gene Ontology coverage, using the BioMart export mart_export.txt.
	summarise_annotation biotypes.tsv mart_export.txt "GO:"

	#########################################################

	# Reactome coverage, using the BioMart export mart_export2.txt.
	summarise_annotation biotypes.tsv mart_export2.txt "R-HSA-"