-
-
Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
# Add yourself to the sudo group | |
su - | |
usermod -aG sudo danny | |
exit | |
# Install the virtual box guest additions | |
cd /media/cdrom0 | |
sudo sh ./VBoxLinuxAdditions.run | |
# Install R and deps | |
sudo apt install r-base | |
sudo nano /etc/apt/sources.list | |
sudo apt install libssl-dev | |
sudo apt install libxml2-dev | |
sudo apt install libcurl4-openssl-dev | |
# Install R packages | |
sudo R | |
if (!require("BiocManager", quietly = TRUE)) | |
install.packages("BiocManager") | |
BiocManager::install("GenomicFeatures") | |
BiocManager::install("preprocessCore") | |
q("no") | |
# Install Trimmomatic | |
sudo apt install git | |
sudo apt install ant | |
mkdir software | |
cd software | |
git clone https://github.com/usadellab/Trimmomatic.git | |
cd Trimmomatic | |
ant | |
# Install STAR | |
git clone https://github.com/alexdobin/STAR.git | |
cd STAR/source | |
# [UPDATE 2024] - Checkout an older version of STAR | |
git checkout ee50484 | |
make | |
# Install picard | |
git clone https://github.com/broadinstitute/picard.git | |
cd picard | |
# [UPDATED 2024] - Checkout an older version of PICARD | |
git checkout 5db8017 | |
./gradlew shadowJar | |
# Install HTSlib / samtools / bcftools | |
sudo apt install autoconf | |
git clone https://github.com/samtools/htslib.git | |
git clone https://github.com/samtools/samtools.git | |
git clone https://github.com/samtools/bcftools.git | |
# Install HTSlib | |
cd htslib | |
git submodule update --init --recursive | |
autoreconf -i | |
./configure | |
make | |
# Install samtools | |
cd samtools | |
autoheader | |
autoconf -Wno-syntax | |
./configure | |
make | |
# Install bcftools | |
cd bcftools | |
autoheader | |
autoconf -Wno-syntax | |
./configure | |
make | |
# GATK install | |
wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip | |
unzip gatk-4.2.6.1.zip | |
# SRA | |
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz | |
mkdir sratoolkit | |
tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit | |
./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive | |
# Make symbolic links | |
mkdir bin | |
cd bin | |
ln -s /home/danny/software/STAR/source/STAR STAR | |
ln -s /home/danny/software/htslib/bgzip bgzip | |
ln -s /home/danny/software/samtools/samtools samtools | |
ln -s /home/danny/software/bcftools/bcftools bcftools | |
ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump | |
#Update the bashrc file | |
nano ~/.bashrc | |
# Add at the end: | |
export PATH="$HOME/bin:$PATH" | |
# Additional link: | |
ln -s /home/danny/software/htslib/tabix tabix | |
# Additional R package: | |
sudo R | |
install.packages("ggplot2") | |
install.packages("gplots") | |
install.packages("gsalib") | |
q("no") | |
wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip | |
unzip IGV_Linux_2.14.1_WithJava.zip |
# | |
# Download Saccharomyces Cerevisiae genome | |
# copyright (c) 2022 - Danny Arends | |
# | |
uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/" | |
base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome." | |
chrs <- c(as.character(as.roman(seq(1:16))), "Mito") | |
# Download | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Download command | |
cmd <- paste0("wget ", uri, fname) | |
#cat(cmd, "\n") | |
system(cmd) | |
} | |
# Create an empty the file | |
cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Extract and merge into a fast file | |
cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
#cat(cmd, "\n") | |
system(cmd) | |
} | |
# Compress the fasta file using bgzip (keep original) | |
cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
#cat(cmd, "\n") | |
system(cmd) | |
# Delete the chromosomes | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Extract and merge into a fast file | |
cmd <- paste0("rm ", fname) | |
#cat(cmd, "\n") | |
system(cmd) | |
} |
# Get the reference transcriptome and GTF for STAR | |
wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz | |
# Index the genome using samtools | |
samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz | |
# Generate genome/transcriptome index using STAR | |
STAR --runThreadN 2 --runMode genomeGenerate \ | |
--genomeDir ~/genome/STAR \ | |
--genomeSAindexNbases 10 \ | |
--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \ | |
--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa | |
# Get the reference SNPs and index using tabix | |
tabix saccharomyces_cerevisiae.vcf.gz | |
# Index the genome using picard | |
java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \ | |
-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz |
# | |
# Align SRA reads to the Saccharomyces Cerevisiae genome | |
# copyright (c) 2022 - Danny Arends | |
# | |
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){ | |
if(!is.na(outputfile) && file.exists(outputfile)){ | |
cat("Output for step exists, skipping this step\n"); | |
invisible("") | |
} | |
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n") | |
if(res[1] >= 1){ | |
cat("Error external process did not finish\n\n"); | |
if(quitOnError) q("no") | |
} | |
} | |
input.dir <- "/home/danny/data/raw" | |
input.base <- "SRR13978643" #Get from the command line | |
output.dir <- paste0("/home/danny/data/output/", input.base,".aln") | |
genome.path <- "/home/danny/genome/STAR" | |
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz" | |
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz" | |
# Create an output folder | |
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) } | |
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) } | |
# STEP 0 - SRA Download and Compress | |
setwd(input.dir) | |
execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq")) | |
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz")) | |
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz")) | |
# STEP 1 - READ Trimming | |
trim.files <- c( | |
paste0(input.dir, "/", input.base,"_1.fastq.gz"), | |
paste0(input.dir, "/", input.base,"_2.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_2.U.fastq.gz") | |
) | |
trim.path <- "/home/danny/software/Trimmomatic" | |
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar") | |
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10") | |
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36") | |
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts) | |
execute(trim.cmd, trim.files[3]) | |
# STEP 1.1 - UNZIP for STAR | |
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3])) | |
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5])) | |
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)]) | |
# STEP 2 - Alignment using STAR | |
star.outbase <- paste0(output.dir, "/", input.base) | |
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam") | |
star.exec <- "STAR --runMode alignReads" | |
star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate") | |
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" ")) | |
star.out <- paste0("--outFileNamePrefix ", star.outbase) | |
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out) | |
execute(star.cmd, star.bam) | |
# STEP 2.1 - Create a samtools index | |
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai")) | |
# STEP 2.2 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", star.bam)) | |
execute(paste0("samtools coverage ", star.bam)) | |
#STEP 3 - Remove duplicate reads using picard tools | |
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam") | |
metrics.out <- paste0(star.outbase, "_metrics.txt") | |
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar" | |
p.in <- paste0("-I ", star.bam) | |
p.out <- paste0("-O ", p.bam, " -M ", metrics.out) | |
p.opts <- paste0("--REMOVE_DUPLICATES true") | |
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out) | |
execute(p.cmd, p.bam) | |
# STEP 3.1 - Create a samtools index | |
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai")) | |
# STEP 3.2 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", p.bam)) | |
execute(paste0("samtools coverage ", p.bam)) | |
# STEP 4 - Add read group (1) and sample run, library, and name | |
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam") | |
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base) | |
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts) | |
execute(p.cmd) | |
# STEP 4.1 - Create a samtools index | |
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai")) | |
# STEP 5 - GATK prep | |
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar" | |
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps) | |
# STEP 5.1 - GATK BaseRecalibrator | |
gatk.cov1 <- paste0(star.outbase, "_cov1.txt") | |
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1) | |
execute(gatk.cmd, gatk.cov1) | |
# STEP 5.2 - GATK ApplyBQSR | |
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam") | |
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam) | |
execute(gatk.cmd, recal.bam) | |
# STEP 5.3 - GATK BaseRecalibrator | |
gatk.cov2 <- paste0(star.outbase, "_cov2.txt") | |
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2) | |
execute(gatk.cmd, gatk.cov2) | |
# STEP 5.4 - GATK AnalyzeCovariates | |
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf") | |
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot) | |
execute(gatk.cmd) | |
# STEP 6 - Index the recalibrated bam files | |
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai")) | |
# STEP 6.1 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", recal.bam)) | |
execute(paste0("samtools coverage ", recal.bam)) | |
q("no") |
Thank you very much, the problem is essentially a matter of some SAMTOOLS updates, be careful, the version that did not work for me is:
3.1.1-16-g5b0b4c014-SNAPSHOT, the version that works is: 2.18.27-SNAPSHOT installed in docker taken from the following link: https://hub.docker.com/r/agrf/picard
Thanks for the support.
@Chris091089 I am encountering the same error which you faced and resolve earlier. I tried to download picard version 2.18.27-SNAPSHOT from your shared link. But it ended up in the following error:
Using default tag: latest
Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?
did you encounter the same issue. If yes, please help. I would be more than thankful for this favour.
Dear all,
I have edited the 0_installSoftware.sh script to checkout the correct version of PICARD and STAR, this should fix the:
Exception in thread "main" java.lang.NullPointerException: Cannot invoke "htsjdk.samtools.SAMReadGroupRecord.getReadGroupId()" because the return value of "htsjdk.samtools.SAMRecord.getReadGroup()" is null
error encountered by some of you
Hi @DannyArends ,
I'm done with PICARD duplictate error by upgrading picard & java. And I'm already thankful to you for helping. But when I try to run this command ./igv.sh my IGV viewer is not opening. I wonder if you can help me to sort it. When I run command it shows this:
echo Using bundled JDK.
WARNING: package com.sun.java.swing.plaf.windows not in java.desktop
WARNING: package sun.awt.windows not in java.desktop
openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment Temurin-11.0.13+8 (build 11.0.13+8)
OpenJDK 64-Bit Server VM Temurin-11.0.13+8 (build 11.0.13+8, mixed mode)
INFO [Apr 22,2024 08:59] [Globals] Development mode is enabled
SEVERE [Apr 22,2024 08:59] [DefaultExceptionHandler] Unhandled exception
SEVERE [Apr 22,2024 08:59] [DefaultExceptionHandler] java.awt.AWTError: Can't connect to X11 window server using ':0' as the value of the DISPLAY variable.
at java.desktop/sun.awt.X11GraphicsEnvironment.initDisplay(Native Method)
at java.desktop/sun.awt.X11GraphicsEnvironment$1.run(Unknown Source)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at java.desktop/sun.awt.X11GraphicsEnvironment.(Unknown Source)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Unknown Source)
at java.desktop/java.awt.GraphicsEnvironment$LocalGE.createGE(Unknown Source)
at java.desktop/java.awt.GraphicsEnvironment$LocalGE.(Unknown Source)
at java.desktop/java.awt.GraphicsEnvironment.getLocalGraphicsEnvironment(Unknown Source)
at java.desktop/sun.awt.X11.XToolkit.(Unknown Source)
at java.base/java.lang.Class.forName0(Native Method)
at java.base/java.lang.Class.forName(Unknown Source)
at java.desktop/java.awt.Toolkit$2.run(Unknown Source)
at java.desktop/java.awt.Toolkit$2.run(Unknown Source)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at java.desktop/java.awt.Toolkit.getDefaultToolkit(Unknown Source)
at java.desktop/java.awt.Toolkit.getEventQueue(Unknown Source)
at java.desktop/java.awt.EventQueue.invokeLater(Unknown Source)
at java.desktop/javax.swing.SwingUtilities.invokeLater(Unknown Source)
at org.igv/org.broad.igv.ui.Main.main(Main.java:124)
@aqibafarman this is an issue with the X11 display under Linux not being setup properly and happens when you use a linux which runs in headless mode. E.g. while logging into a server or cluster remotely or when are you using WSL2 instead of a virtual box. The easiest is to transfer the files to windows and use the IGV under window.
@DannyArends Thanks for letting me know. I am using WSL. But, I will work on your suggestion.
@aqibafarman this is an issue with the X11 display under Linux not being setup properly and happens when you use a linux which runs in headless mode. E.g. while logging into a server or cluster remotely or when are you using WSL2 instead of a virtual box. The easiest is to transfer the files to windows and use the IGV under window.
Hi, you managed to solve the error, I have the same problem, how did you solve it?