Install micromamba or mamba or conda
# Install micromamba
"${SHELL}" <(curl -L micro.mamba.pm/install.sh)
You will then see something like this in a Bash shell (the parts marked "(type....)" are added as instructions):
using XAM | |
using ArgParse | |
using DataFrames | |
using Dates | |
using Statistics | |
using Base.MathConstants | |
using StatsBase | |
using StatsPlots | |
using KernelDensity |
# note: the Dorado BAM file produced by basecalling POD5 files needs to be indexed with "samtools index input_file" | |
# note2: something seems really strange about the quality scores; I have checked several times, but they seem correct | |
# I am not sure why they are reporting, for example, reads with Q38 average quality scores. | |
# EDIT: this is fixed, see https://www.biostars.org/p/295932/#295936 for better formula for average Phred Score | |
# tested with julia-1.10.3 and XAM.jl-0.4.0 | |
using XAM | |
using ArgParse | |
using DataFrames | |
using Dates | |
using Plots |
# note: outputs to STDOUT a SAM file without a header | |
# note2: removes auxiliary tags and information | |
# note3: ignores RNEXT and PNEXT from BAM file (puts * and 0 respectively) | |
# tested with julialang v1.10.2 and XAM v0.4.0 | |
# $ julia changeBAMquality.jl input.bam ? > test.sam.noheader | |
# | |
# to add a header from the original BAM file that had its qualities changed | |
# and add on a PG line | |
# $ ASCII_CHARACTER="?" | |
# $ INPUT=input.bam |
using XAM | |
using ArgParse | |
using DataFrames | |
using Dates | |
using Plots | |
using StatsBase | |
using Plots.PlotMeasures | |
function parse_commandline() |
# tested on Julialang v.1.10.2, DataStructures v0.18.16, and FASTX v2.1.4 | |
# Usage | |
# $ julia stitch-fasta.jl chr20.herro.fasta.Q30.recal.shred.fasta > chr20.herro.fasta.Q30.recal.fasta | |
# | |
import Pkg; Pkg.add("FASTX") | |
import Pkg; Pkg.add("DataStructures") | |
using DataStructures | |
using FASTX | |
function process_fasta_file(filename::String) |
# tested on Julialang v.1.10.2 , DataStructures v0.18.16, and FASTX v2.1.4 | |
# Usage | |
# $ julia stitch-fastq.jl chr20.herro.fasta.Q30.recal.shred.fastq > chr20.herro.fasta.Q30.recal.fastq | |
# | |
import Pkg; Pkg.add("FASTX") | |
import Pkg; Pkg.add("DataStructures") | |
using DataStructures | |
using FASTX | |
function process_fastq_file(filename::String) |
# previous versions allowed generating directly from BAM, but there seemed to be some trouble | |
# between versions of XAM, so this version is working so far for its intended purpose on https://github.com/brendanofallon/jovian | |
# also, this version is rather fast | |
# | |
# | |
# note: ignores quality scores at the moment - fixed in revision#4 | |
# note2: outputs to STDOUT a SAM file without a header | |
# tested with julialang v1.10.2 and XAM v0.4.0 | |
# $ julia shredBAM.jl input.bam 300 > test.sam.noheader | |
# |
# output from best commit #fcdfa97 (https://github.com/google/best), .summary_identity_stats.csv files using reads | |
# aligned to concatenated chr20_MATERNAL and chr20_PATERNAL from hg002v1.0.1.fasta.gz (https://github.com/marbl/HG002) (https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/hg002v1.0.1.fasta.gz) | |
# using mm2-fast commit # 10bde16 using settings: --eqx --secondary=no -Y -c -ax map-ont -k 19 -w 13 -t 48 | |
# or using these settings for Illumina NextSeq2000 reads: -t 48 --eqx --secondary=no -acx sr | |
# | |
# brutal_rewrite (br) commit # ad87f92 (https://github.com/natir/br) using settings: -k 19 -m graph | |
# kmer read filter (kmrf) commit # 36cad24 (https://github.com/natir/kmrf) using setting: -k 17 | |
# peregrine-2021 (pg_asm) commit # 6698eb1 (https://github.com/cschin/peregrine-2021): using default settings | |
# | |
# herro (herro) commit # c41dc30 (https://github.com/lbcb-sci/herro) using defaults and model at time of commit |
This was with https://zymo-files.s3.amazonaws.com/BioPool/ZymoBIOMICS.STD.refseq.v2.zip | |
RAW_SUP_Duplex pg_asm_1x_corrected_SUP_duplex pg_asm_2x_corrected_SUP_duplex pg_asm_3x_corrected_SUP_duplex | |
Bacillus_subtilis Bacillus_subtilis Bacillus_subtilis Bacillus_subtilis | |
# target bases: 4041255 # target bases: 4041255 # target bases: 4041255 # target bases: 4041255 | |
# target bases overlapping regions: 4041255 (100.00%) # target bases overlapping regions: 4041255 (100.00%) # target bases overlapping regions: 4041255 (100.00%) # target bases overlapping regions: 4041255 (100.00%) | |
1159311 reference bases covered by exactly one contig 3791080 reference bases covered by exactly one contig 3642732 reference bases covered by exa |