albertopessia/run_kpax3.jl

## run_kpax3.jl
# Run Kpax3 ( https://github.com/albertopessia/Kpax3.jl )
# directly from the command line for a smoother experience.
#
# For an optimization with default parameters type
#     julia run_kpax3.jl /path/to/dataset.fasta
#
# For a complete description type
#     julia run_kpax3.jl --help
#
# NOTE
# Requires ArgParse and Kpax3 packages.
# Either install them from within julia or run
#     julia -e 'Pkg.add("ArgParse"); Pkg.add("Kpax3");'
using ArgParse
using Kpax3

function parse_commandline()
  settings = ArgParseSettings(prog="julia run_kpax3.jl",
                              description="Bayesian bi-clustering of categorical data",
                              epilog="Pessia, A. and Corander, J. (2018). Kpax3: Bayesian bi-clustering of large sequence datasets. Bioinformatics.",
                              version="0.3",
                              add_version=true)

  @add_arg_table settings begin
    "--missing"
      help = "String of characters to be considered missing. Example: \"?*#-bjxz\""
      arg_type = String
      default = "?*#-bjxz"
    "--param-alpha"
      help = "Ewens-Pitman distribution parameter. Number in (0, 1)."
      arg_type = Float64
      default = 0.5
    "--param-theta"
      help = "Ewens-Pitman distribution parameter. Number greater than -alpha."
      arg_type = Float64
      default = -0.25
    "--param-gamma"
      help = "Comma separated string with prior probabilities for column statuses: noise, weak signal, strong signal. Example: \"0.6, 0.35, 0.05\""
      arg_type = String
      default = "0.6, 0.35, 0.05"
    "--param-r"
      help = "Beta distribution hyper-parameter. Read the paper to learn its usage. Recommended to leave it to its default value."
      arg_type = Float64
      default = log(0.001) / log(0.95)
    "--mcmc-length"
      help = "Length of the Markov Chain after the warmup period."
      arg_type = Int
      default = 100000
    "--mcmc-warmup"
      help = "Length of the Markov Chain warmup period."
      arg_type = Int
      default = 10000
    "--mcmc-thin"
      help = "Markov Chain thin step."
      arg_type = Int
      default = 1
    "--mcmc-op"
      help = "Comma separated string with Markov Chain operator probabilities: split-merge, gibbs, biased random walk. Example: \"0.5, 0.0, 0.5\""
      arg_type = String
      default = "0.5, 0.0, 0.5"
    "--ga-pop-size"
      help = "Genetic algorithm population size."
      arg_type = Int
      default = 20
    "--ga-max-iter"
      help = "Maximum number of genetic algorithm iterations."
      arg_type = Int
      default = 20000
    "--ga-max-gap"
      help = "Stop the genetic algorithm after these many steps without better solutions."
      arg_type = Int
      default = 5000
    "--ga-xrate"
      help = "Genetic algorithm crossover rate."
      arg_type = Float64
      default = 0.9
    "--ga-mrate"
      help = "Genetic algorithm mutation rate."
      arg_type = Float64
      default = 0.005
    "--verbose"
      help = "Print informative messages during execution."
      action = :store_true
    "--verbose-step"
      help = "Print diagnostics messages every fixed number of steps."
      arg_type = Int
      default = 500
    "--csv"
      help = "Enable support for a comma separated value (csv) dataset."
      action = :store_true
    "--type", "-t"
      help = "Type of data analysis. MCMC (for a full Bayesian analysis) or GA (for Maximum A Posteriori estimation with a genetic algorithm)"
      arg_type = String
      default = "GA"
    "--init-partition"
      help = "Path to an initial partition. If not specified (default) automatically initialize one."
      arg_type = String
      default = ""
    "--output", "-o"
      help = "Path to the output file (file extension not needed)."
      arg_type = String
      default = "kpax3_output"
    "file"
      help = "Path to the dataset"
      required = true
  end

  return parse_args(settings)
end

function kpax3_find_map(data::AminoAcidData,
                        initial_partition::Vector{Int},
                        settings::KSettings)
  solution = kpax3ga(data, initial_partition, settings)
  writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose)
  nothing
end

function kpax3_run_mcmc(data::AminoAcidData,
                        initial_partition::Vector{Int},
                        settings::KSettings)
  kpax3mcmc(data, initial_partition, settings)
  solution = kpax3estimate(data, settings)
  writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose)
  nothing
end

function amino_acid_data_analysis(settings::KSettings,
                                  partition::String,
                                  optimization::Bool)
  data = AminoAcidData(settings)

  initial_partition = if partition == ""
                        initializepartition(settings)
                      else
                        normalizepartition(partition, data.id)
                      end

  if optimization
    kpax3_find_map(data, initial_partition, settings)
  else
    kpax3_run_mcmc(data, initial_partition, settings)
  end

  nothing
end

function categorical_data_analysis(settings::KSettings,
                                   partition::String,
                                   optimization::Bool)
  data = CategoricalData(settings)

  k = ceil(Int, 0.5 * sqrt(length(data.id)))
  initial_partition = Clustering.kmeans(convert(Array{Float64}, data.data), k).assignments

  if optimization
    kpax3_find_map(data, initial_partition, settings)
  else
    kpax3_run_mcmc(data, initial_partition, settings)
  end

  nothing
end

function main()
  args = parse_commandline()

  settings = KSettings(args["file"],
                       args["output"],
                       miss=map(x -> UInt8(x), collect(args["missing"])),
                       misscsv=[""; map(x -> String(x), split(args["missing"], ""))],
                       alpha=args["param-alpha"],
                       theta=args["param-theta"],
                       gamma=map(x -> parse(Float64, x), split(args["param-gamma"], ",")),
                       r=args["param-r"],
                       verbose=args["verbose"],
                       verbosestep=args["verbose-step"],
                       T=args["mcmc-length"],
                       burnin=args["mcmc-warmup"],
                       tstep=args["mcmc-thin"],
                       op=map(x -> parse(Float64, x), split(args["mcmc-op"], ",")),
                       popsize=args["ga-pop-size"],
                       maxiter=args["ga-max-iter"],
                       maxgap=args["ga-max-gap"],
                       xrate=args["ga-xrate"],
                       mrate=args["ga-mrate"])

  if (args["type"] != "GA") && (args["type"] != "MCMC")
    throw(ArgumentError("Only 'GA' and 'MCMC' analysis type is allowed."))
  end

  optimization = args["type"] == "GA"

  if !args["csv"]
    amino_acid_data_analysis(settings, args["init-partition"], optimization)
  else
    categorical_data_analysis(settings, args["init-partition"], optimization)
  end

  nothing
end

main()
	# Run Kpax3 ( https://github.com/albertopessia/Kpax3.jl )
	# directly from the command line for a smoother experience.
	#
	# For an optimization with default parameters type
	# julia run_kpax3.jl /path/to/dataset.fasta
	#
	# For a complete description type
	# julia run_kpax3.jl --help
	#
	# NOTE
	# Requires ArgParse and Kpax3 packages.
	# Either install them from within julia or run
	# julia -e 'Pkg.add("ArgParse"); Pkg.add("Kpax3");'
	using ArgParse
	using Kpax3

	function parse_commandline()
	settings = ArgParseSettings(prog="julia run_kpax3.jl",
	description="Bayesian bi-clustering of categorical data",
	epilog="Pessia, A. and Corander, J. (2018). Kpax3: Bayesian bi-clustering of large sequence datasets. Bioinformatics.",
	version="0.3",
	add_version=true)

	@add_arg_table settings begin
	"--missing"
	help = "String of characters to be considered missing. Example: \"?*#-bjxz\""
	arg_type = String
	default = "?*#-bjxz"
	"--param-alpha"
	help = "Ewens-Pitman distribution parameter. Number in (0, 1)."
	arg_type = Float64
	default = 0.5
	"--param-theta"
	help = "Ewens-Pitman distribution parameter. Number greater than -alpha."
	arg_type = Float64
	default = -0.25
	"--param-gamma"
	help = "Comma separated string with prior probabilities for column statuses: noise, weak signal, strong signal. Example: \"0.6, 0.35, 0.05\""
	arg_type = String
	default = "0.6, 0.35, 0.05"
	"--param-r"
	help = "Beta distribution hyper-parameter. Read the paper to learn its usage. Recommended to leave it to its default value."
	arg_type = Float64
	default = log(0.001) / log(0.95)
	"--mcmc-length"
	help = "Length of the Markov Chain after the warmup period."
	arg_type = Int
	default = 100000
	"--mcmc-warmup"
	help = "Length of the Markov Chain warmup period."
	arg_type = Int
	default = 10000
	"--mcmc-thin"
	help = "Markov Chain thin step."
	arg_type = Int
	default = 1
	"--mcmc-op"
	help = "Comma separated string with Markov Chain operator probabilities: split-merge, gibbs, biased random walk. Example: \"0.5, 0.0, 0.5\""
	arg_type = String
	default = "0.5, 0.0, 0.5"
	"--ga-pop-size"
	help = "Genetic algorithm population size."
	arg_type = Int
	default = 20
	"--ga-max-iter"
	help = "Maximum number of genetic algorithm iterations."
	arg_type = Int
	default = 20000
	"--ga-max-gap"
	help = "Stop the genetic algorithm after these many steps without better solutions."
	arg_type = Int
	default = 5000
	"--ga-xrate"
	help = "Genetic algorithm crossover rate."
	arg_type = Float64
	default = 0.9
	"--ga-mrate"
	help = "Genetic algorithm mutation rate."
	arg_type = Float64
	default = 0.005
	"--verbose"
	help = "Print informative messages during execution."
	action = :store_true
	"--verbose-step"
	help = "Print diagnostics messages every fixed number of steps."
	arg_type = Int
	default = 500
	"--csv"
	help = "Enable support for a comma separated value (csv) dataset."
	action = :store_true
	"--type", "-t"
	help = "Type of data analysis. MCMC (for a full Bayesian analysis) or GA (for Maximum A Posteriori estimation with a genetic algorithm)"
	arg_type = String
	default = "GA"
	"--init-partition"
	help = "Path to an initial partition. If not specified (default) automatically initialize one."
	arg_type = String
	default = ""
	"--output", "-o"
	help = "Path to the output file (file extension not needed)."
	arg_type = String
	default = "kpax3_output"
	"file"
	help = "Path to the dataset"
	required = true
	end

	return parse_args(settings)
	end

	function kpax3_find_map(data::AminoAcidData,
	initial_partition::Vector{Int},
	settings::KSettings)
	solution = kpax3ga(data, initial_partition, settings)
	writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose)
	nothing
	end

	function kpax3_run_mcmc(data::AminoAcidData,
	initial_partition::Vector{Int},
	settings::KSettings)
	kpax3mcmc(data, initial_partition, settings)
	solution = kpax3estimate(data, settings)
	writeresults(data, solution, settings.ofile, what=4, verbose=settings.verbose)
	nothing
	end

	function amino_acid_data_analysis(settings::KSettings,
	partition::String,
	optimization::Bool)
	data = AminoAcidData(settings)

	initial_partition = if partition == ""
	initializepartition(settings)
	else
	normalizepartition(partition, data.id)
	end

	if optimization
	kpax3_find_map(data, initial_partition, settings)
	else
	kpax3_run_mcmc(data, initial_partition, settings)
	end

	nothing
	end

	function categorical_data_analysis(settings::KSettings,
	partition::String,
	optimization::Bool)
	data = CategoricalData(settings)

	k = ceil(Int, 0.5 * sqrt(length(data.id)))
	initial_partition = Clustering.kmeans(convert(Array{Float64}, data.data), k).assignments

	if optimization
	kpax3_find_map(data, initial_partition, settings)
	else
	kpax3_run_mcmc(data, initial_partition, settings)
	end

	nothing
	end

	function main()
	args = parse_commandline()

	settings = KSettings(args["file"],
	args["output"],
	miss=map(x -> UInt8(x), collect(args["missing"])),
	misscsv=[""; map(x -> String(x), split(args["missing"], ""))],
	alpha=args["param-alpha"],
	theta=args["param-theta"],
	gamma=map(x -> parse(Float64, x), split(args["param-gamma"], ",")),
	r=args["param-r"],
	verbose=args["verbose"],
	verbosestep=args["verbose-step"],
	T=args["mcmc-length"],
	burnin=args["mcmc-warmup"],
	tstep=args["mcmc-thin"],
	op=map(x -> parse(Float64, x), split(args["mcmc-op"], ",")),
	popsize=args["ga-pop-size"],
	maxiter=args["ga-max-iter"],
	maxgap=args["ga-max-gap"],
	xrate=args["ga-xrate"],
	mrate=args["ga-mrate"])

	if (args["type"] != "GA") && (args["type"] != "MCMC")
	throw(ArgumentError("Only 'GA' and 'MCMC' analysis type is allowed."))
	end

	optimization = args["type"] == "GA"

	if !args["csv"]
	amino_acid_data_analysis(settings, args["init-partition"], optimization)
	else
	categorical_data_analysis(settings, args["init-partition"], optimization)
	end

	nothing
	end

	main()