Skip to content

Instantly share code, notes, and snippets.

@albertopessia
Last active January 3, 2018 14:29
Show Gist options
  • Save albertopessia/fd9df11fb2bdb158ad91936c4638d6fd to your computer and use it in GitHub Desktop.
Save albertopessia/fd9df11fb2bdb158ad91936c4638d6fd to your computer and use it in GitHub Desktop.
Run Kpax3 from the command line
# Run Kpax3 ( https://github.com/albertopessia/Kpax3.jl )
# directly from the command line for a smoother experience.
#
# For an optimization with default parameters type
# julia run_kpax3.jl /path/to/dataset.fasta
#
# For a complete description type
# julia run_kpax3.jl --help
#
# NOTE
# Requires ArgParse and Kpax3 packages.
# Either install them from within julia or run
# julia -e 'Pkg.add("ArgParse"); Pkg.add("Kpax3");'
using ArgParse
using Kpax3
function parse_commandline()
settings = ArgParseSettings(prog="julia run_kpax3.jl",
description="Bayesian bi-clustering of categorical data",
epilog="Pessia, A. and Corander, J. (2018). Kpax3: Bayesian bi-clustering of large sequence datasets. Bioinformatics.",
version="0.3",
add_version=true)
@add_arg_table settings begin
"--missing"
help = "String of characters to be considered missing. Example: \"?*#-bjxz\""
arg_type = String
default = "?*#-bjxz"
"--param-alpha"
help = "Ewens-Pitman distribution parameter. Number in (0, 1)."
arg_type = Float64
default = 0.5
"--param-theta"
help = "Ewens-Pitman distribution parameter. Number greater than -alpha."
arg_type = Float64
default = -0.25
"--param-gamma"
help = "Comma separated string with prior probabilities for column statuses: noise, weak signal, strong signal. Example: \"0.6, 0.35, 0.05\""
arg_type = String
default = "0.6, 0.35, 0.05"
"--param-r"
help = "Beta distribution hyper-parameter. Read the paper to learn its usage. Recommended to leave it to its default value."
arg_type = Float64
default = log(0.001) / log(0.95)
"--mcmc-length"
help = "Length of the Markov Chain after the warmup period."
arg_type = Int
default = 100000
"--mcmc-warmup"
help = "Length of the Markov Chain warmup period."
arg_type = Int
default = 10000
"--mcmc-thin"
help = "Markov Chain thin step."
arg_type = Int
default = 1
"--mcmc-op"
help = "Comma separated string with Markov Chain operator probabilities: split-merge, gibbs, biased random walk. Example: \"0.5, 0.0, 0.5\""
arg_type = String
default = "0.5, 0.0, 0.5"
"--ga-pop-size"
help = "Genetic algorithm population size."
arg_type = Int
default = 20
"--ga-max-iter"
help = "Maximum number of genetic algorithm iterations."
arg_type = Int
default = 20000
"--ga-max-gap"
help = "Stop the genetic algorithm after these many steps without better solutions."
arg_type = Int
default = 5000
"--ga-xrate"
help = "Genetic algorithm crossover rate."
arg_type = Float64
default = 0.9
"--ga-mrate"
help = "Genetic algorithm mutation rate."
arg_type = Float64
default = 0.005
"--verbose"
help = "Print informative messages during execution."
action = :store_true
"--verbose-step"
help = "Print diagnostics messages every fixed number of steps."
arg_type = Int
default = 500
"--csv"
help = "Enable support for a comma separated value (csv) dataset."
action = :store_true
"--type", "-t"
help = "Type of data analysis. MCMC (for a full Bayesian analysis) or GA (for Maximum A Posteriori estimation with a genetic algorithm)"
arg_type = String
default = "GA"
"--init-partition"
help = "Path to an initial partition. If not specified (default) automatically initialize one."
arg_type = String
default = ""
"--output", "-o"
help = "Path to the output file (file extension not needed)."
arg_type = String
default = "kpax3_output"
"file"
help = "Path to the dataset"
required = true
end
return parse_args(settings)
end
function kpax3_find_map(data::AminoAcidData,
initial_partition::Vector{Int},
settings::KSettings)
solution = kpax3ga(data, initial_partition, settings)
writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose)
nothing
end
function kpax3_run_mcmc(data::AminoAcidData,
initial_partition::Vector{Int},
settings::KSettings)
kpax3mcmc(data, initial_partition, settings)
solution = kpax3estimate(data, settings)
writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose)
nothing
end
function amino_acid_data_analysis(settings::KSettings,
partition::String,
optimization::Bool)
data = AminoAcidData(settings)
initial_partition = if partition == ""
initializepartition(settings)
else
normalizepartition(partition, data.id)
end
if optimization
kpax3_find_map(data, initial_partition, settings)
else
kpax3_run_mcmc(data, initial_partition, settings)
end
nothing
end
function categorical_data_analysis(settings::KSettings,
partition::String,
optimization::Bool)
data = CategoricalData(settings)
k = ceil(Int, 0.5 * sqrt(length(data.id)))
initial_partition = Clustering.kmeans(convert(Array{Float64}, data.data), k).assignments
if optimization
kpax3_find_map(data, initial_partition, settings)
else
kpax3_run_mcmc(data, initial_partition, settings)
end
nothing
end
function main()
args = parse_commandline()
settings = KSettings(args["file"],
args["output"],
miss=map(x -> UInt8(x), collect(args["missing"])),
misscsv=[""; map(x -> String(x), split(args["missing"], ""))],
alpha=args["param-alpha"],
theta=args["param-theta"],
gamma=map(x -> parse(Float64, x), split(args["param-gamma"], ",")),
r=args["param-r"],
verbose=args["verbose"],
verbosestep=args["verbose-step"],
T=args["mcmc-length"],
burnin=args["mcmc-warmup"],
tstep=args["mcmc-thin"],
op=map(x -> parse(Float64, x), split(args["mcmc-op"], ",")),
popsize=args["ga-pop-size"],
maxiter=args["ga-max-iter"],
maxgap=args["ga-max-gap"],
xrate=args["ga-xrate"],
mrate=args["ga-mrate"])
if (args["type"] != "GA") && (args["type"] != "MCMC")
throw(ArgumentError("Only 'GA' and 'MCMC' analysis type is allowed."))
end
optimization = args["type"] == "GA"
if !args["csv"]
amino_acid_data_analysis(settings, args["init-partition"], optimization)
else
categorical_data_analysis(settings, args["init-partition"], optimization)
end
nothing
end
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment