Skip to content

Instantly share code, notes, and snippets.

@jspaezp
Created September 13, 2021 22:24
Show Gist options
  • Save jspaezp/eb332f04ec3d2e9ac56b63bfb181a52d to your computer and use it in GitHub Desktop.
Save jspaezp/eb332f04ec3d2e9ac56b63bfb181a52d to your computer and use it in GitHub Desktop.
star cluster search
# Note that you need to install these packages beforehand
library(AnnotationDbi)
library(org.Hs.eg.db)
library(dplyr)
library(ggplot2)
files <- dir("./STAR", recursive = TRUE, pattern = "ReadsPerGene", full.names=TRUE)
read_file <- function(x) {
tab <- read.table(x, skip=4)[, 1:2]
sample_name <- gsub(".*STAR/(.*)/ReadsPer.*", "\\1", x)
colnames(tab) <- c("Gene", paste0("Count_", sample_name))
return(tab)
}
tab_list <- lapply(files, read_file)
lapply(tab_list, function(x) {hist(x[[2]])})
lef_join_genes <- function(x, y) {
return(left_join(x, y, by = c("Gene" = "Gene")))
}
count_matrix <- Reduce(x=tab_list,f=lef_join_genes)
mapped_ids <- mapIds(org.Hs.eg.db, keys=count_matrix$Gene,column = "SYMBOL", keytype = "ENSEMBL")
mapped_ids <- ifelse(is.na(mapped_ids), yes = count_matrix$Gene, no = mapped_ids)
count_matrix$GeneName <- mapped_ids
count_matrix <- as_tibble(count_matrix)
count_matrix
dir.create("count_matrix", showWarnings=FALSE, recursive=TRUE)
write.csv2(count_matrix, file = "count_matrix/matrix.csv", row.names=FALSE)
#!/bin/bash
module load R/4.0.1
set -x
set -e
R --file="./count_matrix.R"
#!/bin/bash
module load STAR/2.7.8a
set -x
set -e
STAR --version
# http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/
wget --no-parent -r http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/
mv ./labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/ .
STAR --runThreadN 4 \
--runMode genomeGenerate \
--genomeDir STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99 \
--genomeFastaFiles STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
--sjdbGTFfile STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.99.gtf \
--sjdbOverhang 149
#!/bin/bash
#SBATCH --time 12:00:00
#SBATCH -A pccr
#SBATCH -c 6 # number of cores requested -- this needs to be greater than or equal to the number of cores you plan to use to run your job
#SBATCH --mem 48G
#SBATCH --job-name STAR_index # Job name
#SBATCH -o STAR-%j.out # File to which standard out will be written
#SBATCH -e STAR-%j.err # File to which standard err will be written
module load STAR/2.7.8a
set -x
set -e
STAR --version
# http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/
# wget --no-parent -r http://labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/
# mv ./labshare.cshl.edu/shares/gingeraslab/www-data/dobin/STAR/STARgenomes/ .
# STAR --runThreadN 4 \
# --runMode genomeGenerate \
# --genomeDir STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99 \
# --genomeFastaFiles STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
# --sjdbGTFfile STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.99.gtf \
# --sjdbOverhang 149
# STARgenomes
# └── Human
# └── GRCh38_Ensembl99_sparseD3_sjdbOverhang99
# ├── chrLength.txt
# ├── chrNameLength.txt
# ├── chrName.txt
# ├── chrStart.txt
# ├── exonGeTrInfo.tab
# ├── exonInfo.tab
# ├── geneInfo.tab
# ├── Genome
# ├── genomeParameters.txt
# ├── Homo_sapiens.GRCh38.99.gtf <<<<---- IMPORTANT
# ├── Homo_sapiens.GRCh38.dna.primary_assembly.fa <<<<---- IMPORTANT
# ├── index.html
# ├── index.html?C=D;O=A
# ├── index.html?C=D;O=D
# ├── index.html?C=M;O=A
# ├── index.html?C=M;O=D
# ├── index.html?C=N;O=A
# ├── index.html?C=N;O=D
# ├── index.html?C=S;O=A
# ├── index.html?C=S;O=D
# ├── log
# ├── Log.out
# ├── SA
# ├── SAindex
# ├── sjdbInfo.txt
# ├── sjdbList.fromGTF.out.tab
# ├── sjdbList.out.tab
# └── transcriptInfo.tab
#
# 2 directories, 28 files
mkdir -p STAR
EXTRA_PARAMS="--outSAMunmapped Within
--chimSegmentMin 12 \
--chimJunctionOverhangMin 8 \
--chimOutJunctionFormat 1 \
--alignSJDBoverhangMin 10 \
--alignMatesGapMax 100000 \
--alignIntronMax 100000 \
--alignSJstitchMismatchNmax 5 -1 5 5 \
--chimMultimapScoreRange 3 \
--chimScoreJunctionNonGTAG -4 \
--chimMultimapNmax 20 \
--chimNonchimScoreDropMin 10 \
--peOverlapNbasesMin 12 \
--peOverlapMMp 0.1 \
--alignInsertionFlush Right \
--alignSplicedMateMapLminOverLmate 0 \
--alignSplicedMateMapLmin 30"
for i in trimmed/*/*_1.fq.gz ; do
j=$(echo ${i} | sed -e "s/_1.fq.gz/_2.fq.gz/g")
nam=$(basename ${i} | sed -e "s/_1.fq.gz//g")
printf "${i}\t${j}\t${nam}\n" >> STAR/manifest.tsv
mkdir -p STAR/${nam}
mkdir -p STARFusion/${nam}
STAR --quantMode GeneCounts --runThreadN 6 \
--genomeDir STARgenomes/Human/GRCh38_Ensembl99_sparseD3_sjdbOverhang99 \
--readFilesIn ${i},${j} \
--outFileNamePrefix STAR/${nam}/ \
--twopassMode Basic \
--readFilesCommand "gunzip -c" ${EXTRA_PARAMS}
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment