Skip to content

Instantly share code, notes, and snippets.

@DannyArends
Last active April 28, 2024 12:58
Show Gist options
  • Star 16 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
Scripts and other things for RNA-Seq
# Add yourself to the sudo group
su -
usermod -aG sudo danny
exit
# Install the virtual box guest additions
cd /media/cdrom0
sudo sh ./VBoxLinuxAdditions.run
# Install R and deps
sudo apt install r-base
sudo nano /etc/apt/sources.list
sudo apt install libssl-dev
sudo apt install libxml2-dev
sudo apt install libcurl4-openssl-dev
# Install R packages
sudo R
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("GenomicFeatures")
BiocManager::install("preprocessCore")
q("no")
# Install Trimmomatic
sudo apt install git
sudo apt install ant
mkdir software
cd software
git clone https://github.com/usadellab/Trimmomatic.git
cd Trimmomatic
ant
# Install STAR
git clone https://github.com/alexdobin/STAR.git
cd STAR/source
# [UPDATE 2024] - Checkout an older version of STAR
git checkout ee50484
make
# Install picard
git clone https://github.com/broadinstitute/picard.git
cd picard
# [UPDATED 2024] - Checkout an older version of PICARD
git checkout 5db8017
./gradlew shadowJar
# Install HTSlib / samtools / bcftools
sudo apt install autoconf
git clone https://github.com/samtools/htslib.git
git clone https://github.com/samtools/samtools.git
git clone https://github.com/samtools/bcftools.git
# Install HTSlib
cd htslib
git submodule update --init --recursive
autoreconf -i
./configure
make
# Install samtools
cd samtools
autoheader
autoconf -Wno-syntax
./configure
make
# Install bcftools
cd bcftools
autoheader
autoconf -Wno-syntax
./configure
make
# GATK install
wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip
unzip gatk-4.2.6.1.zip
# SRA
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz
mkdir sratoolkit
tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit
./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive
# Make symbolic links
mkdir bin
cd bin
ln -s /home/danny/software/STAR/source/STAR STAR
ln -s /home/danny/software/htslib/bgzip bgzip
ln -s /home/danny/software/samtools/samtools samtools
ln -s /home/danny/software/bcftools/bcftools bcftools
ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump
#Update the bashrc file
nano ~/.bashrc
# Add at the end:
export PATH="$HOME/bin:$PATH"
# Additional link:
ln -s /home/danny/software/htslib/tabix tabix
# Additional R package:
sudo R
install.packages("ggplot2")
install.packages("gplots")
install.packages("gsalib")
q("no")
wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip
unzip IGV_Linux_2.14.1_WithJava.zip
#
# Download Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/"
base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome."
chrs <- c(as.character(as.roman(seq(1:16))), "Mito")
# Download
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Download command
cmd <- paste0("wget ", uri, fname)
#cat(cmd, "\n")
system(cmd)
}
# Create an empty the file
cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Extract and merge into a fast file
cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
#cat(cmd, "\n")
system(cmd)
}
# Compress the fasta file using bgzip (keep original)
cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
#cat(cmd, "\n")
system(cmd)
# Delete the chromosomes
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Extract and merge into a fast file
cmd <- paste0("rm ", fname)
#cat(cmd, "\n")
system(cmd)
}
# Get the reference transcriptome and GTF for STAR
wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz
# Index the genome using samtools
samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
# Generate genome/transcriptome index using STAR
STAR --runThreadN 2 --runMode genomeGenerate \
--genomeDir ~/genome/STAR \
--genomeSAindexNbases 10 \
--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \
--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa
# Get the reference SNPs and index using tabix
tabix saccharomyces_cerevisiae.vcf.gz
# Index the genome using picard
java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \
-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
#
# Align SRA reads to the Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
if(!is.na(outputfile) && file.exists(outputfile)){
cat("Output for step exists, skipping this step\n");
invisible("")
}
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
if(res[1] >= 1){
cat("Error external process did not finish\n\n");
if(quitOnError) q("no")
}
}
input.dir <- "/home/danny/data/raw"
input.base <- "SRR13978643" #Get from the command line
output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
genome.path <- "/home/danny/genome/STAR"
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"
# Create an output folder
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }
# STEP 0 - SRA Download and Compress
setwd(input.dir)
execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq"))
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))
# STEP 1 - READ Trimming
trim.files <- c(
paste0(input.dir, "/", input.base,"_1.fastq.gz"),
paste0(input.dir, "/", input.base,"_2.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
)
trim.path <- "/home/danny/software/Trimmomatic"
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)
execute(trim.cmd, trim.files[3])
# STEP 1.1 - UNZIP for STAR
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])
# STEP 2 - Alignment using STAR
star.outbase <- paste0(output.dir, "/", input.base)
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")
star.exec <- "STAR --runMode alignReads"
star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate")
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
star.out <- paste0("--outFileNamePrefix ", star.outbase)
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)
execute(star.cmd, star.bam)
# STEP 2.1 - Create a samtools index
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
# STEP 2.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", star.bam))
execute(paste0("samtools coverage ", star.bam))
#STEP 3 - Remove duplicate reads using picard tools
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
metrics.out <- paste0(star.outbase, "_metrics.txt")
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
p.in <- paste0("-I ", star.bam)
p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
p.opts <- paste0("--REMOVE_DUPLICATES true")
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)
execute(p.cmd, p.bam)
# STEP 3.1 - Create a samtools index
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
# STEP 3.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", p.bam))
execute(paste0("samtools coverage ", p.bam))
# STEP 4 - Add read group (1) and sample run, library, and name
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
execute(p.cmd)
# STEP 4.1 - Create a samtools index
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))
# STEP 5 - GATK prep
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)
# STEP 5.1 - GATK BaseRecalibrator
gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
execute(gatk.cmd, gatk.cov1)
# STEP 5.2 - GATK ApplyBQSR
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
execute(gatk.cmd, recal.bam)
# STEP 5.3 - GATK BaseRecalibrator
gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
execute(gatk.cmd, gatk.cov2)
# STEP 5.4 - GATK AnalyzeCovariates
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
execute(gatk.cmd)
# STEP 6 - Index the recalibrated bam files
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))
# STEP 6.1 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", recal.bam))
execute(paste0("samtools coverage ", recal.bam))
q("no")
@DannyArends
Copy link
Author

Dear all,

I have edited the 0_installSoftware.sh script to checkout the correct version of PICARD and STAR, this should fix the:

Exception in thread "main" java.lang.NullPointerException: Cannot invoke "htsjdk.samtools.SAMReadGroupRecord.getReadGroupId()" because the return value of "htsjdk.samtools.SAMRecord.getReadGroup()" is null

error encountered by some of you

@aqibafarman
Copy link

aqibafarman commented Apr 22, 2024

Hi @DannyArends ,
I'm done with PICARD duplictate error by upgrading picard & java. And I'm already thankful to you for helping. But when I try to run this command ./igv.sh my IGV viewer is not opening. I wonder if you can help me to sort it. When I run command it shows this:
echo Using bundled JDK.
WARNING: package com.sun.java.swing.plaf.windows not in java.desktop
WARNING: package sun.awt.windows not in java.desktop
openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment Temurin-11.0.13+8 (build 11.0.13+8)
OpenJDK 64-Bit Server VM Temurin-11.0.13+8 (build 11.0.13+8, mixed mode)
INFO [Apr 22,2024 08:59] [Globals] Development mode is enabled
SEVERE [Apr 22,2024 08:59] [DefaultExceptionHandler] Unhandled exception
SEVERE [Apr 22,2024 08:59] [DefaultExceptionHandler] java.awt.AWTError: Can't connect to X11 window server using ':0' as the value of the DISPLAY variable.
at java.desktop/sun.awt.X11GraphicsEnvironment.initDisplay(Native Method)
at java.desktop/sun.awt.X11GraphicsEnvironment$1.run(Unknown Source)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at java.desktop/sun.awt.X11GraphicsEnvironment.(Unknown Source)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Unknown Source)
at java.desktop/java.awt.GraphicsEnvironment$LocalGE.createGE(Unknown Source)
at java.desktop/java.awt.GraphicsEnvironment$LocalGE.(Unknown Source)
at java.desktop/java.awt.GraphicsEnvironment.getLocalGraphicsEnvironment(Unknown Source)
at java.desktop/sun.awt.X11.XToolkit.(Unknown Source)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Unknown Source)
at java.desktop/java.awt.Toolkit$2.run(Unknown Source)
at java.desktop/java.awt.Toolkit$2.run(Unknown Source)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at java.desktop/java.awt.Toolkit.getDefaultToolkit(Unknown Source)
at java.desktop/java.awt.Toolkit.getEventQueue(Unknown Source)
at java.desktop/java.awt.EventQueue.invokeLater(Unknown Source)
at java.desktop/javax.swing.SwingUtilities.invokeLater(Unknown Source)
at org.igv/org.broad.igv.ui.Main.main(Main.java:124)

@DannyArends
Copy link
Author

@aqibafarman this is an issue with the X11 display under Linux not being setup properly and happens when you use a linux which runs in headless mode. E.g. while logging into a server or cluster remotely or when are you using WSL2 instead of a virtual box. The easiest is to transfer the files to windows and use the IGV under window.

@Chris091089
Copy link

Chris091089 commented Apr 27, 2024 via email

@aqibafarman
Copy link

@DannyArends Thanks for letting me know. I am using WSL. But, I will work on your suggestion.

@aqibafarman this is an issue with the X11 display under Linux not being setup properly and happens when you use a linux which runs in headless mode. E.g. while logging into a server or cluster remotely or when are you using WSL2 instead of a virtual box. The easiest is to transfer the files to windows and use the IGV under window.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment