fo40225/star.sh

## star.sh
# Build STAR 2.7.9a from source and run the Makefile's install target.
# The steps are chained with && so that a failed clone or cd never lets
# `make` run in the wrong directory.
#
# source/Makefile declares `CXXFLAGS_SIMD ?= -mavx2`; because it is a `?=`
# assignment, passing CXXFLAGS_SIMD on the make command line replaces it, so
# the Makefile no longer has to be edited by hand to build for the local CPU.
git clone https://github.com/alexdobin/STAR.git -b 2.7.9a &&
  cd STAR/source &&
  make STAR -j "$(nproc)" CXXFLAGS_SIMD=-march=native &&
  sudo make install

# Download the GENCODE release 19 (GRCh37.p13) genome FASTA and annotation GTF
# into ~/Gencode_human/release_19 and decompress them there.
# `mkdir -p` keeps the step re-runnable, and `wget -P` places the downloads in
# release_19 instead of the current directory.
# The working directory is left at ~/Gencode_human.
cd ~
mkdir -p Gencode_human/release_19
cd Gencode_human
wget -P release_19 http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz
wget -P release_19 http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
gunzip release_19/GRCh37.p13.genome.fa.gz
gunzip release_19/gencode.v19.annotation.gtf.gz
# Optional (disabled): fix Gencode to GRCh37 naming.
# The first command drops the first word of every FASTA header line; the
# second strips a leading "chr" from every GTF line.
#sed -r -i 's/^>\S+ />/g' release_19/GRCh37.p13.genome.fa
#sed -r -i 's/^chr//g' release_19/gencode.v19.annotation.gtf

# Download the GENCODE release 22 (GRCh38 primary assembly) genome FASTA and
# annotation GTF into ~/Gencode_human/release_22 and decompress them there.
# The NIH / NCI Genomic Data Commons (GDC) uses GENCODE v22:
# https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files
#
# The target directory is addressed by absolute path so the files end up where
# the index-generation step looks for them, whatever the current directory is.
release_22_dir=~/Gencode_human/release_22
mkdir -p "$release_22_dir"
wget -P "$release_22_dir" http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/GRCh38.primary_assembly.genome.fa.gz
wget -P "$release_22_dir" http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/gencode.v22.primary_assembly.annotation.gtf.gz
gunzip "$release_22_dir/GRCh38.primary_assembly.genome.fa.gz"
gunzip "$release_22_dir/gencode.v22.primary_assembly.annotation.gtf.gz"
# from STAR manual
# 2.2.1 Which chromosomes/scaffolds/patches to include?
# It is strongly recommended to include major chromosomes (e.g., for human chr1-22, chrX, chrY, chrM) as well as un-placed and un-localized scaffolds. Typically, un-placed/un-localized scaffolds add just a few MegaBases to the genome length, however, a substantial number of reads may map to ribosomal RNA (rRNA) repeats on these scaffolds. These reads would be reported as unmapped if the scaffolds are not included in the genome, or, even worse, may be aligned to wrong loci on the chromosomes. Generally, patches and alternative haplotypes should not be included in the genome.
# Examples of acceptable genome sequence files:
# • ENSEMBL: files marked with .dna.primary_assembly, such as: ftp://ftp.ensembl.org/pub/release-77/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
# • GENCODE: files marked with PRI (primary). Strongly recommended for mouse and human: http://www.gencodegenes.org/.
# 2.2.2 Which annotations to use?
# The use of the most comprehensive annotations for a given species is strongly recommended. Very importantly, chromosome names in the annotations GTF file have to match chromosome names in the FASTA genome sequence files. For example, one can use ENSEMBL FASTA files with ENSEMBL GTF files, and UCSC FASTA files with UCSC GTF files. However, since UCSC uses the chr1, chr2, ... naming convention, and ENSEMBL uses 1, 2, ... naming, the ENSEMBL and UCSC FASTA and GTF files cannot be mixed together, unless chromosomes are renamed to match between the FASTA and GTF files.

cd ~

# Generate the STAR genome index from the GENCODE release 22 FASTA and GTF.
# genomeDir is exported because the alignment step reads it as well; the same
# directory holds the input FASTA/GTF and is passed as the index directory.
# All expansions are quoted so a home directory containing whitespace does not
# split the arguments.
export genomeDir=~/Gencode_human/release_22
STAR \
  --runThreadN "$(nproc)" \
  --runMode genomeGenerate \
  --genomeDir "$genomeDir" \
  --genomeFastaFiles "$genomeDir/GRCh38.primary_assembly.genome.fa" \
  --sjdbGTFfile "$genomeDir/gencode.v22.primary_assembly.annotation.gtf"

# RNA alignment of one paired-end sample against the index in $genomeDir.
# The sample ID is kept in a single variable (default RNA-001, overridable by
# setting `sample` beforehand) and drives the FASTQ names
# (<sample>_R1.fq.gz / <sample>_R2.fq.gz), the output prefix and the read-group
# ID/SM fields.
sample=${sample:-RNA-001}
# Raise this shell's open-file-descriptor limit before running STAR.
ulimit -n 65536
# `${genomeDir:?…}` aborts with a message instead of passing an empty
# --genomeDir when the index-generation step was not run in this shell.
STAR \
  --runThreadN "$(nproc)" \
  --genomeDir "${genomeDir:?set genomeDir to the STAR index directory}" \
  --readFilesIn "${sample}_R1.fq.gz" "${sample}_R2.fq.gz" \
  --readFilesCommand gunzip -c \
  --outFileNamePrefix "${sample}." \
  --outSAMtype BAM Unsorted \
  --outSAMattributes NH HI AS nM NM MD jM jI MC ch XS \
  --outSAMattrRGline "ID:${sample}" "SM:${sample}" "PL:Illumina" \
  --outBAMcompression 2 \
  --twopassMode Basic
	# Build STAR 2.7.9a from source and run the Makefile's install target.
	# The steps are chained with && so that a failed clone or cd never lets
	# `make` run in the wrong directory.
	#
	# source/Makefile declares `CXXFLAGS_SIMD ?= -mavx2`; because it is a `?=`
	# assignment, passing CXXFLAGS_SIMD on the make command line replaces it, so
	# the Makefile no longer has to be edited by hand to build for the local CPU.
	git clone https://github.com/alexdobin/STAR.git -b 2.7.9a &&
	  cd STAR/source &&
	  make STAR -j "$(nproc)" CXXFLAGS_SIMD=-march=native &&
	  sudo make install

	# Download the GENCODE release 19 (GRCh37.p13) genome FASTA and annotation GTF
	# into ~/Gencode_human/release_19 and decompress them there.
	# `mkdir -p` keeps the step re-runnable, and `wget -P` places the downloads in
	# release_19 instead of the current directory.
	# The working directory is left at ~/Gencode_human.
	cd ~
	mkdir -p Gencode_human/release_19
	cd Gencode_human
	wget -P release_19 http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz
	wget -P release_19 http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
	gunzip release_19/GRCh37.p13.genome.fa.gz
	gunzip release_19/gencode.v19.annotation.gtf.gz
	# Optional (disabled): fix Gencode to GRCh37 naming.
	# The first command drops the first word of every FASTA header line; the
	# second strips a leading "chr" from every GTF line.
	#sed -r -i 's/^>\S+ />/g' release_19/GRCh37.p13.genome.fa
	#sed -r -i 's/^chr//g' release_19/gencode.v19.annotation.gtf

	# Download the GENCODE release 22 (GRCh38 primary assembly) genome FASTA and
	# annotation GTF into ~/Gencode_human/release_22 and decompress them there.
	# The NIH / NCI Genomic Data Commons (GDC) uses GENCODE v22:
	# https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files
	#
	# The target directory is addressed by absolute path so the files end up where
	# the index-generation step looks for them, whatever the current directory is.
	release_22_dir=~/Gencode_human/release_22
	mkdir -p "$release_22_dir"
	wget -P "$release_22_dir" http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/GRCh38.primary_assembly.genome.fa.gz
	wget -P "$release_22_dir" http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/gencode.v22.primary_assembly.annotation.gtf.gz
	gunzip "$release_22_dir/GRCh38.primary_assembly.genome.fa.gz"
	gunzip "$release_22_dir/gencode.v22.primary_assembly.annotation.gtf.gz"
	# from STAR manual
	# 2.2.1 Which chromosomes/scaffolds/patches to include?
	# It is strongly recommended to include major chromosomes (e.g., for human chr1-22, chrX, chrY, chrM) as well as un-placed and un-localized scaffolds. Typically, un-placed/un-localized scaffolds add just a few MegaBases to the genome length, however, a substantial number of reads may map to ribosomal RNA (rRNA) repeats on these scaffolds. These reads would be reported as unmapped if the scaffolds are not included in the genome, or, even worse, may be aligned to wrong loci on the chromosomes. Generally, patches and alternative haplotypes should not be included in the genome.
	# Examples of acceptable genome sequence files:
	# • ENSEMBL: files marked with .dna.primary_assembly, such as: ftp://ftp.ensembl.org/pub/release-77/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
	# • GENCODE: files marked with PRI (primary). Strongly recommended for mouse and human: http://www.gencodegenes.org/.
	# 2.2.2 Which annotations to use?
	# The use of the most comprehensive annotations for a given species is strongly recommended. Very importantly, chromosome names in the annotations GTF file have to match chromosome names in the FASTA genome sequence files. For example, one can use ENSEMBL FASTA files with ENSEMBL GTF files, and UCSC FASTA files with UCSC GTF files. However, since UCSC uses the chr1, chr2, ... naming convention, and ENSEMBL uses 1, 2, ... naming, the ENSEMBL and UCSC FASTA and GTF files cannot be mixed together, unless chromosomes are renamed to match between the FASTA and GTF files.

	cd ~

	# Generate the STAR genome index from the GENCODE release 22 FASTA and GTF.
	# genomeDir is exported because the alignment step reads it as well; the same
	# directory holds the input FASTA/GTF and is passed as the index directory.
	# All expansions are quoted so a home directory containing whitespace does not
	# split the arguments.
	export genomeDir=~/Gencode_human/release_22
	STAR \
	  --runThreadN "$(nproc)" \
	  --runMode genomeGenerate \
	  --genomeDir "$genomeDir" \
	  --genomeFastaFiles "$genomeDir/GRCh38.primary_assembly.genome.fa" \
	  --sjdbGTFfile "$genomeDir/gencode.v22.primary_assembly.annotation.gtf"

	# RNA alignment of one paired-end sample against the index in $genomeDir.
	# The sample ID is kept in a single variable (default RNA-001, overridable by
	# setting `sample` beforehand) and drives the FASTQ names
	# (<sample>_R1.fq.gz / <sample>_R2.fq.gz), the output prefix and the read-group
	# ID/SM fields.
	sample=${sample:-RNA-001}
	# Raise this shell's open-file-descriptor limit before running STAR.
	ulimit -n 65536
	# `${genomeDir:?…}` aborts with a message instead of passing an empty
	# --genomeDir when the index-generation step was not run in this shell.
	STAR \
	  --runThreadN "$(nproc)" \
	  --genomeDir "${genomeDir:?set genomeDir to the STAR index directory}" \
	  --readFilesIn "${sample}_R1.fq.gz" "${sample}_R2.fq.gz" \
	  --readFilesCommand gunzip -c \
	  --outFileNamePrefix "${sample}." \
	  --outSAMtype BAM Unsorted \
	  --outSAMattributes NH HI AS nM NM MD jM jI MC ch XS \
	  --outSAMattrRGline "ID:${sample}" "SM:${sample}" "PL:Illumina" \
	  --outBAMcompression 2 \
	  --twopassMode Basic