Skip to content

Instantly share code, notes, and snippets.

@venkan
Created May 18, 2018 08:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save venkan/de8b3eb6443b0f839228b900d37cfb24 to your computer and use it in GitHub Desktop.
Save venkan/de8b3eb6443b0f839228b900d37cfb24 to your computer and use it in GitHub Desktop.
rm(list=ls())
library(GenomicFeatures)
library(rtracklayer)
extractGenesOfGTF <- function(inputFile, outputGTF_File, outputCsvFile ) {
# reading the file.
gtf <- import(inputFile)
# its dimension is 2617197 by 25
gtf <- as.data.frame(gtf)
# selecting only the genes. The dimension is 58288 25
gtf = gtf[gtf$type == "gene",]
# removing columns with no values. The dimension is 58288 13
gtf = gtf[, apply(gtf, 2, function(x) { all(is.na(x)) == FALSE })]
# Examples of gene ids are ENSG00000223972.5 and ENSG00000227232.5. We split by "." and
# save the numbers in gene_version
res = strsplit(gtf$gene_id, ".", fixed=TRUE)
gtf$gene_id = unlist(lapply(res, function(x) {x[1] }))
gtf$gene_version = unlist(lapply(res, function(x) {x[2] }))
# # Gene names
# res = strsplit(gtf$gene_name, ".", fixed=TRUE)
# gtf$gene_name2 = unlist(lapply(res, function(x) {x[1] }))
# gtf$gene_version2 = unlist(lapply(res, function(x) {x[2] }))
# there are several duplicates. We remove those genes with tag "PAR" (pseudoautosomal regions) to
# resolve the issue. The dimension is 58243
gtf = gtf[-which(gtf$tag == "PAR"), ]
rownames(gtf) <- gtf$gene_id
# save the results in the output file
write.table(gtf, outputCsvFile, sep="\t")
export(GRanges(gtf), outputGTF_File, format = 'gtf')
}
inputFile = "/Users/Documents/gencode.v27.long_noncoding_RNAs.gtf"
extractGenesOfGTF("/Users/Documents/gencode.v27.long_noncoding_RNAs.gtf", "/Users/Documents/Onlygenes_gencode.v27.long_noncoding_RNAs.gtf",
"/Users/Documents/Onlygenes_gencode.v27.long_noncoding_RNAs.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment