Skip to content

Instantly share code, notes, and snippets.

@bkutlu
Last active September 29, 2017 19:14
Show Gist options
  • Save bkutlu/3f95ee27942391119e0f651f2cfa63c8 to your computer and use it in GitHub Desktop.
Save bkutlu/3f95ee27942391119e0f651f2cfa63c8 to your computer and use it in GitHub Desktop.
Finding the longest peptide lengths genome wide
### script to find the length of the largest peptide for all the genes
### example demo for Mus Musculus
## load the necessary packages
library(dplyr)
library(biomaRt)
library(org.Mm.eg.db)
library(Biostrings)
## connect to Biomart Ensembl database and select mmusculus dataset
ensembl <- useMart("ensembl")
ensembl <- useDataset("mmusculus_gene_ensembl",mart=ensembl)
## function that generates a data frame of ids for all the genes
loadEnsMusBioc <- function(){
require('org.Mm.eg.db')
ensTab <- toTable(org.Mm.egENSEMBL)
symTab <- toTable(org.Mm.egSYMBOL)
ensMusEgSym <<- merge(ensTab,symTab)
print('Just base::loaded ensMusEgSym!')
}#loadEnsMusBioc
loadEnsMusBioc()
## get the peptide sequences from biomart
protein <- getSequence(id=ensMusEgSym$ensembl_id,
type="ensembl_gene_id",
seqType="peptide",
mart=ensembl)
## remove rows with unavailable sequences
protein %>% filter(peptide != "Sequence unavailable") -> proteinL
## add a column corresponding to the length of the peptide
proteinLW <- data.frame(proteinL, slength = width(aaS))
## for multiple sequences select the longest peptide
proteinLW %>%
group_by(ensembl_gene_id) %>%
filter(rank(slength, ties.method = "first") == 1) -> proteinLWF
proteinLW %>%
group_by(ensembl_gene_id) %>% summarise(n())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment