Created
September 26, 2014 19:22
-
-
Save crazyhottommy/6b92d37f2ed2267cc75e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
setwd("/home/tommy/annotations/human/ensemble/") | |
# set the colClasses for faster reading in the data | |
gtf_cols <- c(seqname="factor", source="factor", feature="factor", | |
start="integer", end="integer", score="character", | |
strand="factor", frame="factor", attribute="character") | |
hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE, | |
col.names=names(gtf_cols), comment.char="#") | |
# load all the data into memory, this gz file is around 28Mb, if gunzip it will be around 900Mb | |
# very slow, it took me 2-3 mins to finish loading.....try data.table. usually I work with gtf file | |
# on command line. of course, specialized tools such as python library HTSeq and gffutils, bioawk etc are | |
# also existed there for more complicated analysis. | |
head(hs_gtf) | |
dim(hs_gtf) # how many rows and columns 2 million rows | |
hs_df<- tbl_df(hs_gtf) #convert to a local dataframe, when you print, it will not | |
# print out all the rows. similar to head() | |
head(hs_df$attribute, n=3) | |
extractGeneID<- function(x) gsub("gene_id ([^;]+);.*", "\\1", x) | |
# on linux command (it is all about writing regular expression!) | |
# generally,awk does not support back reference http://ubuntuforums.org/showthread.php?t=2198616 | |
# but gensub function supports in gawk (Gnu awk) | |
# ex. $ echo 'noon' | awk '{print gensub(/(.)(.)/,"\\2\\1","g")}' output:onno | |
# zcat Homo_sapiens.GRCh37.74.gtf.gz| head |awk -F'\t' '{ a=gensub(/gene_id "([^;]+)";.*/, "\\1", "g",$9); print $0"\t"a}' | |
# the gene_id has quotes | |
# use sed instead: | |
# zcat Homo_sapiens.GRCh37.74.gtf.gz | awk -F '\t' '{print $9}' | sed 's/gene_id "\([^;]\+\)";.*/\1/' | |
# append this column to the original file: | |
# paste <(zcat Homo_sapiens.GRCh37.74.gtf.gz) <(zcat Homo_sapiens.GRCh37.74.gtf.gz | awk -F '\t' '{print $9}' | sed 's/gene_id "\([^;]\+\)";.*/\1/') | |
# this is a long linux command, I constructed bit by bit, make sure every step works. | |
# use mutate from dplyr to add a column based on original column | |
# add a column with GeneID | |
hs_df<- mutate(hs_df, gene_id = extractGeneID(attribute)) # takes several seconds | |
# select only the columns we are interested | |
select(hs_df, seqname, feature, start, end, gene_id) | |
select(hs_df, seqname, source, feature, start, end, gene_id) | |
table(hs_df$source) # on linux command zcat Homo_sapiens.GRCh37.74.gtf.gz | datamash -s -g 2 count 2 | |
# use filer to keep the rows with source == "protein_coding" | |
pc_df<- filter(hs_df, source == "protein_coding") #fast | |
pc_df # zcat Homo_sapiens.GRCh37.74.gtf.gz | awk '$2=="protein_coding"'| wc -l returns 1667560 | |
# multiple filter rules | |
filter(hs_df, feature == "exon", strand == "+") #ensemble 74 is different from ensemble 75 in which has a feature "gene" | |
# and comment lines | |
# use %>% (pronounced as "then") to get the exon number of each gene | |
# SQL can do the same job, see my previous post here http://crazyhottommy.blogspot.com/2013/11/mysql-rocks.html | |
# extract exon number | |
extractExonNumber <- function(x) as.integer(gsub(".*exon_number (\\d+);.*", "\\1", x)) | |
pce_df <- filter(hs_df, source == "protein_coding", feature == "exon") %>% | |
mutate(exon_number=extractExonNumber(attribute)) | |
pce_df %>% filter(feature=="exon") %>% | |
group_by(gene_id) %>% | |
summarise(num_exons = max(exon_number)) %>% | |
arrange(desc(num_exons)) | |
# Source: local data frame [22,642 x 2] | |
# gene_id num_exons | |
# 1 ENSG00000155657 363 | |
# 2 ENSG00000183091 182 | |
# 3 ENSG00000131018 147 | |
# 4 ENSG00000114270 118 | |
# 5 ENSG00000054654 116 | |
# 6 ENSG00000154358 116 | |
# 7 ENSG00000203832 111 | |
# 8 ENSG00000127481 107 | |
# 9 ENSG00000143341 107 | |
# 10 ENSG00000198626 107 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment