Skip to content

Instantly share code, notes, and snippets.

@FloWuenne
Last active July 20, 2018 15:35
Show Gist options
  • Save FloWuenne/ff003a56a50d34acedaaa6512b6cb960 to your computer and use it in GitHub Desktop.
Save FloWuenne/ff003a56a50d34acedaaa6512b6cb960 to your computer and use it in GitHub Desktop.
Analyze and fix .gtf file for Drop-seq pipeline to remove warnings and skipping for genes with issues
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Load libraries
```{r}
library(refGenome)
library(dplyr)
library(tidyr)
```
# Read in original gtf
```{r}
# create ensemblGenome object for storing Ensembl genomic annotation data
ens <- ensemblGenome()
read.gtf(ens, "./mm10_with_Adamts19_tm4a_tm4b_allele.gtf")
```
```{r}
# create table of genes
my_gene <- getGenePositions(ens)
```
```{r}
## Get all genes that are found more than once
skipped_genes <- my_gene %>%
count(gene_name) %>%
subset(n > 1)
```
# Test if all skipped genes are missing in modified REF and are present in standard REF
```{r}
good_DGE <- read.table("DGE_to_test_skipped_genes/Florian_170814_R1_1200STAMPS_E14_5.aligned.tagged.cleaned.selected_STAMPS.DGE.txt",
sep="\t",
header=T,
row.names =1,
quote="")
```
```{r}
bad_DGE <- read.table("DGE_to_test_skipped_genes/E14.5_r3.DGE.txt",
sep="\t",
header=T,
row.names =1,
quote="")
```
```{r}
good_genes <- rownames(good_DGE)
bad_genes <- rownames(bad_DGE)
```
```{r}
## Check how many genes are found in alignment against reference GSE_mm10 and against reference + lacZ
good_genes_intersect <- intersect(skipped_genes$gene_name,good_genes)
bad_genes_intersect <- intersect(skipped_genes$gene_name,bad_genes)
## Check how many of these are found in both
overlap_bad_good <- intersect(good_genes_intersect,bad_genes_intersect)
```
```{r}
## Subset gtf for gene of interest
subset(my_gene,gene_name=="1700040F15Rik")
```
# Get most recent ensemble IDs using biomart
```{r}
library("biomaRt")
ensembl=useMart("ensembl")
ensembl = useDataset("mmusculus_gene_ensembl",mart=ensembl)
```
```{r}
attributes = listAttributes(ensembl)
ensemble_gene_IDS <- getBM(attributes=c('ensembl_gene_id', 'ensembl_gene_id_version','ensembl_transcript_id','chromosome_name'),
mart = ensembl)
```
# Check which gene IDS overlap between genes with multiple IDs from gtf and current ensemble IDs
```{r}
## Get all gene IDs for genes that have more than 1
skipped_genes_IDs <- subset(my_gene,gene_name %in%skipped_genes$gene_name)
overlap_skipped_ensemble <- intersect(ensemble_gene_IDS$ensembl_gene_id,skipped_genes_IDs$gene_id)
```
```{r}
## Get the genes that only had 1 gene ID
skipped_genes_IDs_overlap <- subset(skipped_genes_IDs,gene_id %in% overlap_skipped_ensemble)
## Get the gene ID that were not in ensemble for filtering later on
to_filter_for_multiple_gene_IDs <- subset(skipped_genes_IDs,!gene_id %in% overlap_skipped_ensemble)
more_than_two_geneID <- to_filter_for_multiple_gene_IDs %>%
count(gene_name) %>%
arrange(desc(n))
## Count number of genes that still have more than 1 gene_ID
multiple_gene_IDs_table <- skipped_genes_IDs_overlap %>%
count(gene_name) %>%
subset(n > 1) %>%
arrange(desc(n))
## Genes that were filtered for multiple gene IDs
multiple_gene_IDs <- multiple_gene_IDs_table$gene_name
## filter my_gene subset for the remaining genes
skipped_genes_IDs_overlap_geneIDs <- subset(skipped_genes_IDs_overlap,gene_name %in% multiple_gene_IDs_table$gene_name)
## Find chromosomal disagreement genes
chromosomal_disagreement <- skipped_genes_IDs_overlap_geneIDs %>%
group_by(gene_name) %>%
count(seqid) %>%
count(gene_name) %>%
arrange(desc(nn)) %>%
subset(nn > 1)
chromosomal_disagreement_genes <- chromosomal_disagreement$gene_name
## Get info for genes with chromosomal disagreement
chromosomal_disagreement_genes_table <- subset(skipped_genes_IDs_overlap_geneIDs,gene_name %in% chromosomal_disagreement$gene_name)
## Genes that are not filtered by duplicate Gene ID NOR chromosomal disagreement
count_skipped_genes_IDs_left <- skipped_genes_IDs_overlap_geneIDs %>%
group_by(gene_name) %>%
count(seqid) %>%
count(gene_name) %>%
arrange(desc(nn)) %>%
subset(nn == 1)
```
```{r}
## Get genes that have strand disagreement
skipped_genes_IDs_overlap_geneIDs <- subset(skipped_genes_IDs_overlap_geneIDs,!gene_name %in% chromosomal_disagreement$gene_name)
skipped_genes_IDs_overlap_geneIDs_chrom_strand <- subset(skipped_genes_IDs_overlap_geneIDs,gene_name %in% count_skipped_genes_IDs_left$gene_name)
## Find genes that have more than one strand
skipped_genes_IDs_overlap_geneIDs_chrom_strand_sum <- skipped_genes_IDs_overlap_geneIDs_chrom_strand %>%
group_by(gene_name) %>%
count(strand) %>%
subset(n == 1)
skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes <- unique(skipped_genes_IDs_overlap_geneIDs_chrom_strand_sum$gene_name)
skipped_genes_IDs_overlap_geneIDs_chrom_strand_complicated <- subset(skipped_genes_IDs_overlap_geneIDs_chrom_strand,
!gene_name %in% skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes)
```
## Finally, filter out all these gene IDs that fall into one of the four categories that cause problems
```{r}
## Multiple gene IDs
to_filter_for_multiple_gene_IDs
## Chromosomal disagreement
chromosomal_disagreement_genes
## Strand disagreement
skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes
## Complicated reasons
filtered_complicated_cases <- unique(skipped_genes_IDs_overlap_geneIDs_chrom_strand_complicated$gene_name)
## Sanity checks to make sure all genes included
length(intersect(skipped_genes_IDs$gene_name,to_filter_for_multiple_gene_IDs$gene_name))
length(intersect(skipped_genes_IDs$gene_name,chromosomal_disagreement_genes))
length(intersect(skipped_genes_IDs$gene_name,skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes))
## Make a table with all gene IDs and their properties
multiple_geneIDs_genenames <- unique(to_filter_for_multiple_gene_IDs$gene_name)
chromosomal_disagreement_genes <- unique(chromosomal_disagreement_genes)
skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes <- unique(skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes)
filtered_complicated_cases <- unique(filtered_complicated_cases)
multiple_gene_ids_table <- data.frame("gene_name"=multiple_geneIDs_genenames,
"issue"=replicate(length(multiple_geneIDs_genenames),"Multiple_gene_IDs"))
chromosomal_disagreements_table <- data.frame("gene_name"=chromosomal_disagreement_genes,
"issue"=replicate(length(chromosomal_disagreement_genes),"Chromosomal_disagreement"))
strand_disagreements_table <- data.frame("gene_name"=skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes,
"issue"=replicate(length(skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes),"Strand_disagreement"))
complicated_cases_table <- data.frame("gene_name"=filtered_complicated_cases,
"issue"=replicate(length(filtered_complicated_cases),"Complicated_case"))
full_issues_table <- rbind(multiple_gene_ids_table,
chromosomal_disagreements_table,
strand_disagreements_table,
complicated_cases_table)
write.table(full_issues_table,
file="./gtf_issues_genes.tsv",
sep="\t",
row.names=FALSE,
col.names=TRUE,
quote=FALSE)
```
# Finally clean up the gtf from all the genes that cause warnings
```{r}
original_gtf <- ens@ev$gtf
## First we remove the duplicate gene IDs that are not the current ones
filtered_gtf <- subset(original_gtf,!gene_id %in% to_filter_for_multiple_gene_IDs$gene_id)
## Then filter out the genes that have chromosome and strand disagreements
filtered_gtf <- subset(filtered_gtf,!gene_name %in% chromosomal_disagreement_genes)
filtered_gtf <- subset(filtered_gtf,!gene_name %in% skipped_genes_IDs_overlap_geneIDs_chrom_strand_genes)
## Finally, for now, filter out the complicated gene cases as well
filtered_gtf <- subset(filtered_gtf,!gene_name %in% filtered_complicated_cases)
## Save a table that contains information
```
```{r}
library(tidyr)
library(dplyr)
## Export the gtf in the original formatting
filtered_gtf_formatted <- filtered_gtf
## Use apply to add the last column to the gtf
## This will effectively loose the protein_id and exon_id entries but we don't need these in the Drop-seq pipe
filtered_gtf_formatted$last_column <- paste("gene_id \"",filtered_gtf_formatted[,13],"\"; ",
"transcript_id \"",filtered_gtf_formatted[,15],"\"; ",
"exon_number \"",filtered_gtf_formatted[,17],"\"; ",
"gene_name \"",filtered_gtf_formatted[,16],"\"; ",
"gene_biotype \"",filtered_gtf_formatted[,11],"\"; ",
"transcript_name \"",filtered_gtf_formatted[,12],"\";",
sep="")
filtered_gtf_formatted <- filtered_gtf_formatted %>%
dplyr::select(-c(id,protein_id,gene_biotype,transcript_name,gene_id,exon_id,transcript_id,gene_name,gene_name,exon_number))
```
```{r}
write.table(filtered_gtf_formatted,
file="./mm10_with_Adamts19_tm4a_tm4b_allele.fixed.gtf",
sep="\t",
row.names=FALSE,
col.names=FALSE,
quote=FALSE)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment