Skip to content

Instantly share code, notes, and snippets.

View ag1805x's full-sized avatar
😎

Arindam Ghosh ag1805x

😎
View GitHub Profile
# Extract gene lengths from GTF file
# Download GTF file from https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files
echo "Gene_ID, Gene_name, Gene_type, Chr, Start, End, Gene_length" > gdc_gencode_v36_annotation.csv
awk '{if($3 == "gene") print $0}' gencode.v36.annotation.gtf | awk -F '[\t ;]' '{print $10","$16","$13","$1","$4","$5","$5-$4}' >> gdc_gencode_v36_annotation.csv
@ag1805x
ag1805x / miRBase_GFF2IDlist
Last active August 15, 2020 16:26
Extract mature and precursor miRNA list from miRBase gff3 file. Can be used for inter-conversion
# Extract mature and precursor miRNA list from miRBase gff3 file
# Can be used for interconversion
grep -v '#' hsa.gff3 | grep -w "miRNA" | cut -f9 > temp
paste <(cut -d ';' -f1 temp | cut -d '=' -f1) <(cut -d ';' -f2 temp | cut -d '=' -f1) <(cut -d ';' -f3 temp | cut -d '=' -f1) <(cut -d ';' -f4 temp | cut -d '=' -f1) | head -n1 >> miRbaseIDs.tsv
paste <(cut -d ';' -f1 temp | cut -d '=' -f2) <(cut -d ';' -f2 temp | cut -d '=' -f2) <(cut -d ';' -f3 temp | cut -d '=' -f2) <(cut -d ';' -f4 temp | cut -d '=' -f2) >> miRbaseIDs.tsv
@ag1805x
ag1805x / exploreGTF
Last active July 13, 2020 07:00
One liners to explore GTF file
# Extract list of unique Gene IDs in GTF file
awk '{if($3 == "gene") print $0}' ../Homo_sapiens.GRCh38.84.gtf | cut -f9 | cut -d ';' -f1 | cut -d ' ' -f2 | sort | uniq | wc -l
# Extract gene list of particular biotype
grep 'gene_biotype "protein_coding"' ../Homo_sapiens.GRCh38.84.gtf | awk '{if($3 == "gene") print $0}' | cut -f9 | cut -d ';' -f1 | cut -d ' ' -f2 | sort | uniq | wc -l
# Create subset of Ensembl GTF file based on gene biotype
@ag1805x
ag1805x / Pather2Revigo
Created December 24, 2019 06:41
Using Panther GO into Revigo
# The PANTHER GO output doesnot directly support visualisation by REVIGO.
# Using this single line code in linux terminal we can make it compatible for REVIGO
# Gene Ontology enrichment analysis done using PANTHER (interface from: http://geneontology.org/)
# The results are exported as Table
# The initial few lines are cleaned to keep only the result table on which we execute the following:
paste <(cut -f1 PantherOut.tsv | cut -d "(" -f2 | cut -d ")" -f1) <(cut -f7 List.tsv) >> RevigoIn.tsv
@ag1805x
ag1805x / tx2gene_fasta
Created September 18, 2019 07:44
Code to generate tx2gene from reference transcriptome for use with tximport() in R
# Code to generate tx2gene from reference transcriptome for use with tximport() in R
# Reference transcriptome: wget ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
zcat Homo_sapiens.GRCh38.cdna.all.fa.gz | grep '>' | cut -d ' ' -f1,4,7 > temp
paste <(cut -d '>' -f2 temp | cut -d ' ' -f1) <(cut -d ' ' -f2 temp | cut -d ':' -f2) <(cut -d ' ' -f3 temp | cut -d ':' -f2) >> tx2gene.txt
rm temp
@ag1805x
ag1805x / Sample_hClust.R
Created September 16, 2019 06:41
Using hclust() to cluster RNA-seq samples and visualize dendrogram
library(dendextend)
countsPC <- read.table("CountsPC_Gene_MultimapOverlap.tsv", header=T, row.names=c(1))
factors <- as.data.frame(read.table("factors.txt", sep="\t", header = TRUE, row.names=c(1)))
dist <- dist(t(countsPC), method="euclidean")
cluster <- hclust(dist)
# main, sub, xlab, ylab
@ag1805x
ag1805x / GFF2GCcontent
Created August 22, 2019 10:35
Code to extract GC% of genes from GFF file using bedtools
###########################################
# Code to extract GC% of genes from GFF file
# using bedtools
# - Arindam Ghosh (22 August 2019)
###########################################
bedtools nuc -fi Homo_sapiens.GRCh38.dna.primary_assembly.fa -bed Homo_sapiens.GRCh38.84.gff3 | grep ID=gene:ENSG > temp.txt
echo -e "GeneID\tpct_GC" > GRCh38_GeneGC.txt
#!/usr/bin/env bash
# make_rRNA.sh
# Kamil Slowikowski
# December 12, 2014
#
# Modified: Arindam Ghosh (July 24, 2019 )
#
#
# Referenc Genome: GRCh38.p5 Ensembl release 84
#
@ag1805x
ag1805x / Heatmap_BacterialRiboswitchCount.R
Created June 10, 2019 11:11
#Code to cdeate heatmap of riboswitch counts in different bacteria.
#Code to cdeate heatmap of riboswitch counts in different bacteria.
library(pheatmap)
data<- as.matrix(read.csv("heat_map_table.csv", header=T, row.names=c(1)))
col_key = c("#800000", "#e6194B", "#3cb44b", "#ffe119", "#4363d8", "#f58231", "#911eb4", "#42d4f4", "#f032e6", "#bfef45", "#fabebe", "#469990", "#fffac8")
#Since max count was 12 we used 12 distinct colours
pheatmap(data, treeheight_row = 0, treeheight_col = 0, cluster_rows=FALSE, cluster_cols=FALSE, color=rev(col_key), cellwidth=16, cellheight=15, height=15, width=10,filename="Heatmap.png")
@ag1805x
ag1805x / SRRtoGSM
Last active March 24, 2023 11:43
Extract GSM id for a SRR id.
#Using NCBI's efetch to retrive GSM id (sample id) from a given SRR id(run id)
efetch -db sra -id SRR2453355 -format xml | xmllint --xpath "EXPERIMENT_PACKAGE_SET/EXPERIMENT_PACKAGE/EXPERIMENT/@alias" - | cut -d \" -f2