Last active
January 3, 2018 14:29
-
-
Save albertopessia/fd9df11fb2bdb158ad91936c4638d6fd to your computer and use it in GitHub Desktop.
Run Kpax3 from the command line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run Kpax3 ( https://github.com/albertopessia/Kpax3.jl ) | |
# directly from the command line for a smoother experience. | |
# | |
# For an optimization with default parameters type | |
# julia run_kpax3.jl /path/to/dataset.fasta | |
# | |
# For a complete description type | |
# julia run_kpax3.jl --help | |
# | |
# NOTE | |
# Requires ArgParse and Kpax3 packages. | |
# Either install them from within julia or run | |
# julia -e 'Pkg.add("ArgParse"); Pkg.add("Kpax3");' | |
using ArgParse | |
using Kpax3 | |
function parse_commandline() | |
settings = ArgParseSettings(prog="julia run_kpax3.jl", | |
description="Bayesian bi-clustering of categorical data", | |
epilog="Pessia, A. and Corander, J. (2018). Kpax3: Bayesian bi-clustering of large sequence datasets. Bioinformatics.", | |
version="0.3", | |
add_version=true) | |
@add_arg_table settings begin | |
"--missing" | |
help = "String of characters to be considered missing. Example: \"?*#-bjxz\"" | |
arg_type = String | |
default = "?*#-bjxz" | |
"--param-alpha" | |
help = "Ewens-Pitman distribution parameter. Number in (0, 1)." | |
arg_type = Float64 | |
default = 0.5 | |
"--param-theta" | |
help = "Ewens-Pitman distribution parameter. Number greater than -alpha." | |
arg_type = Float64 | |
default = -0.25 | |
"--param-gamma" | |
help = "Comma separated string with prior probabilities for column statuses: noise, weak signal, strong signal. Example: \"0.6, 0.35, 0.05\"" | |
arg_type = String | |
default = "0.6, 0.35, 0.05" | |
"--param-r" | |
help = "Beta distribution hyper-parameter. Read the paper to learn its usage. Recommended to leave it to its default value." | |
arg_type = Float64 | |
default = log(0.001) / log(0.95) | |
"--mcmc-length" | |
help = "Length of the Markov Chain after the warmup period." | |
arg_type = Int | |
default = 100000 | |
"--mcmc-warmup" | |
help = "Length of the Markov Chain warmup period." | |
arg_type = Int | |
default = 10000 | |
"--mcmc-thin" | |
help = "Markov Chain thin step." | |
arg_type = Int | |
default = 1 | |
"--mcmc-op" | |
help = "Comma separated string with Markov Chain operator probabilities: split-merge, gibbs, biased random walk. Example: \"0.5, 0.0, 0.5\"" | |
arg_type = String | |
default = "0.5, 0.0, 0.5" | |
"--ga-pop-size" | |
help = "Genetic algorithm population size." | |
arg_type = Int | |
default = 20 | |
"--ga-max-iter" | |
help = "Maximum number of genetic algorithm iterations." | |
arg_type = Int | |
default = 20000 | |
"--ga-max-gap" | |
help = "Stop the genetic algorithm after these many steps without better solutions." | |
arg_type = Int | |
default = 5000 | |
"--ga-xrate" | |
help = "Genetic algorithm crossover rate." | |
arg_type = Float64 | |
default = 0.9 | |
"--ga-mrate" | |
help = "Genetic algorithm mutation rate." | |
arg_type = Float64 | |
default = 0.005 | |
"--verbose" | |
help = "Print informative messages during execution." | |
action = :store_true | |
"--verbose-step" | |
help = "Print diagnostics messages every fixed number of steps." | |
arg_type = Int | |
default = 500 | |
"--csv" | |
help = "Enable support for a comma separated value (csv) dataset." | |
action = :store_true | |
"--type", "-t" | |
help = "Type of data analysis. MCMC (for a full Bayesian analysis) or GA (for Maximum A Posteriori estimation with a genetic algorithm)" | |
arg_type = String | |
default = "GA" | |
"--init-partition" | |
help = "Path to an initial partition. If not specified (default) automatically initialize one." | |
arg_type = String | |
default = "" | |
"--output", "-o" | |
help = "Path to the output file (file extension not needed)." | |
arg_type = String | |
default = "kpax3_output" | |
"file" | |
help = "Path to the dataset" | |
required = true | |
end | |
return parse_args(settings) | |
end | |
function kpax3_find_map(data::AminoAcidData, | |
initial_partition::Vector{Int}, | |
settings::KSettings) | |
solution = kpax3ga(data, initial_partition, settings) | |
writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose) | |
nothing | |
end | |
function kpax3_run_mcmc(data::AminoAcidData, | |
initial_partition::Vector{Int}, | |
settings::KSettings) | |
kpax3mcmc(data, initial_partition, settings) | |
solution = kpax3estimate(data, settings) | |
writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose) | |
nothing | |
end | |
function amino_acid_data_analysis(settings::KSettings, | |
partition::String, | |
optimization::Bool) | |
data = AminoAcidData(settings) | |
initial_partition = if partition == "" | |
initializepartition(settings) | |
else | |
normalizepartition(partition, data.id) | |
end | |
if optimization | |
kpax3_find_map(data, initial_partition, settings) | |
else | |
kpax3_run_mcmc(data, initial_partition, settings) | |
end | |
nothing | |
end | |
function categorical_data_analysis(settings::KSettings, | |
partition::String, | |
optimization::Bool) | |
data = CategoricalData(settings) | |
k = ceil(Int, 0.5 * sqrt(length(data.id))) | |
initial_partition = Clustering.kmeans(convert(Array{Float64}, data.data), k).assignments | |
if optimization | |
kpax3_find_map(data, initial_partition, settings) | |
else | |
kpax3_run_mcmc(data, initial_partition, settings) | |
end | |
nothing | |
end | |
function main() | |
args = parse_commandline() | |
settings = KSettings(args["file"], | |
args["output"], | |
miss=map(x -> UInt8(x), collect(args["missing"])), | |
misscsv=[""; map(x -> String(x), split(args["missing"], ""))], | |
alpha=args["param-alpha"], | |
theta=args["param-theta"], | |
gamma=map(x -> parse(Float64, x), split(args["param-gamma"], ",")), | |
r=args["param-r"], | |
verbose=args["verbose"], | |
verbosestep=args["verbose-step"], | |
T=args["mcmc-length"], | |
burnin=args["mcmc-warmup"], | |
tstep=args["mcmc-thin"], | |
op=map(x -> parse(Float64, x), split(args["mcmc-op"], ",")), | |
popsize=args["ga-pop-size"], | |
maxiter=args["ga-max-iter"], | |
maxgap=args["ga-max-gap"], | |
xrate=args["ga-xrate"], | |
mrate=args["ga-mrate"]) | |
if (args["type"] != "GA") && (args["type"] != "MCMC") | |
throw(ArgumentError("Only 'GA' and 'MCMC' analysis type is allowed.")) | |
end | |
optimization = args["type"] == "GA" | |
if !args["csv"] | |
amino_acid_data_analysis(settings, args["init-partition"], optimization) | |
else | |
categorical_data_analysis(settings, args["init-partition"], optimization) | |
end | |
nothing | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment