crazyhottommy/dplyr_gff.r

## dplyr_gff.r
library(dplyr)
setwd("/home/tommy/annotations/human/ensemble/")

# set the colClasses for faster reading in the data
gtf_cols <- c(seqname="factor", source="factor", feature="factor",
              start="integer", end="integer", score="character",
              strand="factor", frame="factor", attribute="character")

hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE,
                     col.names=names(gtf_cols), comment.char="#")

# load all the data into memory, this gz file is around 28Mb, if gunzip it will be around 900Mb
# very slow, it took me 2-3 mins to finish loading.....try data.table.  usually I work with gtf file
# on command line. of course, specialized tools such as python library HTSeq and gffutils, bioawk etc are
# also existed there for more complicated analysis.

head(hs_gtf)

dim(hs_gtf) # how many rows and columns  2 million rows

hs_df<- tbl_df(hs_gtf) #convert to a local dataframe, when you print, it will not
                       # print out all the rows. similar to head()

head(hs_df$attribute, n=3)

extractGeneID<- function(x) gsub("gene_id ([^;]+);.*", "\\1", x)

# on linux command (it is all about writing regular expression!)
# generally,awk does not support back reference http://ubuntuforums.org/showthread.php?t=2198616
# but gensub function supports in gawk (Gnu awk)
# ex. $ echo 'noon' | awk '{print gensub(/(.)(.)/,"\\2\\1","g")}'   output:onno

# zcat Homo_sapiens.GRCh37.74.gtf.gz| head |awk -F'\t'  '{ a=gensub(/gene_id "([^;]+)";.*/, "\\1", "g",$9); print $0"\t"a}'
# the gene_id has quotes

# use sed instead:
# zcat Homo_sapiens.GRCh37.74.gtf.gz | awk -F '\t' '{print $9}' | sed  's/gene_id "\([^;]\+\)";.*/\1/'
# append this column to the original file:
# paste <(zcat Homo_sapiens.GRCh37.74.gtf.gz) <(zcat Homo_sapiens.GRCh37.74.gtf.gz | awk -F '\t' '{print $9}' | sed  's/gene_id "\([^;]\+\)";.*/\1/')
# this is a long linux command, I constructed bit by bit, make sure every step works.


# use mutate from dplyr to add a column based on original column
# add a column with GeneID

hs_df<- mutate(hs_df, gene_id = extractGeneID(attribute)) # takes several seconds

# select only the columns we are interested
select(hs_df, seqname, feature, start, end, gene_id)
select(hs_df, seqname, source, feature, start, end, gene_id)

table(hs_df$source)  # on linux command zcat Homo_sapiens.GRCh37.74.gtf.gz | datamash -s -g 2 count 2

# use filer to keep the rows with source == "protein_coding"
pc_df<- filter(hs_df, source == "protein_coding") #fast
pc_df  # zcat Homo_sapiens.GRCh37.74.gtf.gz | awk '$2=="protein_coding"'| wc -l   returns 1667560

# multiple filter rules
filter(hs_df, feature == "exon", strand == "+")  #ensemble 74 is different from ensemble 75 in which has a feature "gene"
                                                 # and comment lines
# use %>% (pronounced as "then") to get the exon number of each gene
# SQL can do the same job, see my previous post here http://crazyhottommy.blogspot.com/2013/11/mysql-rocks.html


# extract exon number
extractExonNumber <- function(x) as.integer(gsub(".*exon_number (\\d+);.*", "\\1", x))

pce_df <- filter(hs_df, source == "protein_coding", feature == "exon") %>%
  mutate(exon_number=extractExonNumber(attribute))

pce_df %>% filter(feature=="exon") %>%
                     group_by(gene_id) %>%
                     summarise(num_exons = max(exon_number)) %>%
                     arrange(desc(num_exons))

# Source: local data frame [22,642 x 2]

# gene_id num_exons
# 1  ENSG00000155657       363
# 2  ENSG00000183091       182
# 3  ENSG00000131018       147
# 4  ENSG00000114270       118
# 5  ENSG00000054654       116
# 6  ENSG00000154358       116
# 7  ENSG00000203832       111
# 8  ENSG00000127481       107
# 9  ENSG00000143341       107
# 10 ENSG00000198626       107
	library(dplyr)
	setwd("/home/tommy/annotations/human/ensemble/")

	# set the colClasses for faster reading in the data
	gtf_cols <- c(seqname="factor", source="factor", feature="factor",
	start="integer", end="integer", score="character",
	strand="factor", frame="factor", attribute="character")

	hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE,
	col.names=names(gtf_cols), comment.char="#")

	# load all the data into memory, this gz file is around 28Mb, if gunzip it will be around 900Mb
	# very slow, it took me 2-3 mins to finish loading.....try data.table. usually I work with gtf file
	# on command line. of course, specialized tools such as python library HTSeq and gffutils, bioawk etc are
	# also existed there for more complicated analysis.

	head(hs_gtf)

	dim(hs_gtf) # how many rows and columns 2 million rows

	hs_df<- tbl_df(hs_gtf) #convert to a local dataframe, when you print, it will not
	# print out all the rows. similar to head()

	head(hs_df$attribute, n=3)

	extractGeneID<- function(x) gsub("gene_id ([^;]+);.*", "\\1", x)

	# on linux command (it is all about writing regular expression!)
	# generally,awk does not support back reference http://ubuntuforums.org/showthread.php?t=2198616
	# but gensub function supports in gawk (Gnu awk)
	# ex. $ echo 'noon' \| awk '{print gensub(/(.)(.)/,"\\2\\1","g")}' output:onno

	# zcat Homo_sapiens.GRCh37.74.gtf.gz\| head \|awk -F'\t' '{ a=gensub(/gene_id "([^;]+)";.*/, "\\1", "g",$9); print $0"\t"a}'
	# the gene_id has quotes

	# use sed instead:
	# zcat Homo_sapiens.GRCh37.74.gtf.gz \| awk -F '\t' '{print $9}' \| sed 's/gene_id "\([^;]\+\)";.*/\1/'
	# append this column to the original file:
	# paste <(zcat Homo_sapiens.GRCh37.74.gtf.gz) <(zcat Homo_sapiens.GRCh37.74.gtf.gz \| awk -F '\t' '{print $9}' \| sed 's/gene_id "\([^;]\+\)";.*/\1/')
	# this is a long linux command, I constructed bit by bit, make sure every step works.


	# use mutate from dplyr to add a column based on original column
	# add a column with GeneID

	hs_df<- mutate(hs_df, gene_id = extractGeneID(attribute)) # takes several seconds

	# select only the columns we are interested
	select(hs_df, seqname, feature, start, end, gene_id)
	select(hs_df, seqname, source, feature, start, end, gene_id)

	table(hs_df$source) # on linux command zcat Homo_sapiens.GRCh37.74.gtf.gz \| datamash -s -g 2 count 2

	# use filer to keep the rows with source == "protein_coding"
	pc_df<- filter(hs_df, source == "protein_coding") #fast
	pc_df # zcat Homo_sapiens.GRCh37.74.gtf.gz \| awk '$2=="protein_coding"'\| wc -l returns 1667560

	# multiple filter rules
	filter(hs_df, feature == "exon", strand == "+") #ensemble 74 is different from ensemble 75 in which has a feature "gene"
	# and comment lines
	# use %>% (pronounced as "then") to get the exon number of each gene
	# SQL can do the same job, see my previous post here http://crazyhottommy.blogspot.com/2013/11/mysql-rocks.html


	# extract exon number
	extractExonNumber <- function(x) as.integer(gsub(".exon_number (\\d+);.", "\\1", x))

	pce_df <- filter(hs_df, source == "protein_coding", feature == "exon") %>%
	mutate(exon_number=extractExonNumber(attribute))

	pce_df %>% filter(feature=="exon") %>%
	group_by(gene_id) %>%
	summarise(num_exons = max(exon_number)) %>%
	arrange(desc(num_exons))

	# Source: local data frame [22,642 x 2]

	# gene_id num_exons
	# 1 ENSG00000155657 363
	# 2 ENSG00000183091 182
	# 3 ENSG00000131018 147
	# 4 ENSG00000114270 118
	# 5 ENSG00000054654 116
	# 6 ENSG00000154358 116
	# 7 ENSG00000203832 111
	# 8 ENSG00000127481 107
	# 9 ENSG00000143341 107
	# 10 ENSG00000198626 107