keithshep/liftover-csv.rb

## liftover-csv.rb
#!/usr/bin/ruby -w

################################################################################
# Created By: Keith Sheppard (keith.sheppard@jax.org)
# Modification History:
#   July 7, 2008: Initial revision created to run the liftover utility on
#   the CGD (cgd.jax.org) imputed SNP data in comma-separated format
#
# This is a ruby script for running liftover to map SNPs in a source
# comma-separated file to a destination file using a different NCBI genome
# coordinate system. Run the script with no parameters to see usage
################################################################################

################################################################################
# a little function for printing usage and quitting the script
################################################################################
def printusageandexit(exit_status=1)
  STDERR.print(
    "Usage: liftover-csv.rb <input.csv|input-dir> [chromosome-column-name] " +
    "[base-pair-position-column-name] chain-file <output.csv|output-dir>\n")
  STDERR.print "  default chromosome-column-name=ChrID\n"
  STDERR.print "  default base-pair-position-column-name=build 36 bp Position\n"
  STDERR.print "  NOTE: this script expects the \"liftover\" executable to be\n"
  STDERR.print "        on the path\n"
  STDERR.print "  NOTE: the output will only be sorted correctly if the \n"
  STDERR.print "        input file is for a single chromosome. this is because\n"
  STDERR.print "        the output is sorted on base-pair position only\n"

  Process.exit(exit_status)
end

################################################################################
# get the index of the column with the given name assuming that the header
# passed in is comma separated. If we can't find it, return -1
################################################################################
def findcolumnindex(header_line, column_name)
  i = 0
  header_line.split(',').each do |each_column_name|
    if each_column_name == column_name
      return i
    else
      i = i + 1
    end
  end

  # we couldn't find it...
  return -1
end

################################################################################
# sort and save the genotype data to the given file. this function returns
# an opened read-only reference to the file that it creates
################################################################################
def sortandsave(data_hash, file_name)
  out_file = File.open(file_name, "w")
  data_hash.sort.each do |key, value|
    # join the data with a ',' so that we follow a CSV format, and write it
    # to the file
    out_file.puts(value.join(","))
  end

  # OK, we're done.. close the file
  out_file.close

  # Now reopen the file as read-only and return it
  return File.open(file_name, "r")
end

################################################################################
# Lift the given SNPs using the liftover command line program. The liftover
# application should be in the users PATH and should be executable by typing
# "liftover"
# in_file_name:
#     the path of the input file. this file should be a CSV file with a header
#     row. the header row should contain a "ChrID" column that contains
#     chromosome ID's in BED format (so that liftover can use them) and all of
#     the chromosomes should be the same.
# chromosome_column_name_arg:
#     the name of the column that contains the chromosome name
# base_pair_column_name_name:
#     the name of the column that contains the SNPs
# chain_file_name:
#     the path to the chain file to use for the liftover command
# out_file_name:
#     the path to the output file that we should create for the lifted data
################################################################################
def lift_snps(
    in_file_name,
    chromosome_column_name_arg,
    base_pair_column_name_name,
    chain_file_name,
    out_file_name)

  # These variables can be modified (or maybe parameterized) to match what you
  # need them to be.

  # The name of the liftover executable
  liftover_executable = "liftover"

  # If you're running out of memory you should reduce the
  # max_rows_in_memory variable.
  max_rows_in_memory = 100000

  # do a couple of sanity checks before we start
  if File.exists?(out_file_name)
    STDERR.print(
        "Refusing to run because output file " + out_file_name + " exists\n")
    Process.exit(1)
  end

  # intermediate/temporary files used in the invokation of liftover
  temp_dir_name = "temp-dir"

  if File.exists?(temp_dir_name)
    STDERR.print(
        "Refusing to run because " + temp_dir_name + " already exists." +
        " If you don't need it, delete it and re-run the command.\n")
    Process.exit(1)
  end

  Dir.mkdir(temp_dir_name)
  temp_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me.bed"
  temp_lifted_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me-also.bed"
  temp_unmapped_file_name = temp_dir_name + File::SEPARATOR + "delete-me.unmapped"

  # open up the input genotype file
  in_file = File.open(in_file_name, "r")

  # blow through and comments or empty lines to get to the CSV header
  header_line = ""
  in_file.each_line do |each_header_line|
    each_header_line = each_header_line.strip
    if (!each_header_line.empty?) && each_header_line[0..0] != '#'
      header_line = each_header_line
      break
    end
  end

  # find the column where the base pair value is kept
  base_pair_column_index = findcolumnindex(header_line, base_pair_column_name_name)
  if(base_pair_column_index == -1)
    STDERR.print(
        "Failed to locate column named " + base_pair_column_name_name +
        " in file " + in_file_name + "\n\n")
    printusageandexit
  end

  # find the column where the chromosome identifier is kept
  chr_id_column_index = findcolumnindex(header_line, chromosome_column_name_arg)
  if(chr_id_column_index == -1)
    STDERR.print(
        "Failed to locate column named " + chromosome_column_name_arg +
        " in file " + in_file_name + "\n\n")
    printusageandexit
  end

  # create a BED formatted file so that liftover can convert the data. Also
  # create a mapping from old_position=>old_row
  print(
      "building temporary BED file for SNP data from " +
      in_file_name + " ...\n")
  temp_bed_file = File.open(temp_bed_file_name, "w")
  in_file.each_line do |line|
    line = line.strip
    line_fields = line.split(',');
    base_pair_position = line_fields[base_pair_column_index]
    base_pair_plus_one_position = (base_pair_position.to_i + 1).to_s
    chromosome_name = line_fields[chr_id_column_index]

    # liftover will pass the line through the 4th position unmodified, so we'll
    # be able to rebuild the rows
    temp_bed_file.puts(
      chromosome_name + "\t" +
        base_pair_position + "\t" +
        base_pair_plus_one_position + "\t" +
        line)
  end

  # close the BED files
  in_file.close
  temp_bed_file.close

  # make sure that the chain file that we were given exists
  unless File.exists?(chain_file_name)
    STDERR.print "Can't find chain file: " + chain_file_name + " ...\n\n"
    printusageandexit
  end

  # OK... try to invoke the liftover command
  print "running liftover on the BED file ...\n"
  print(
    liftover_executable + " " +
      temp_bed_file_name + " " +
      chain_file_name + " " +
      temp_lifted_bed_file_name + " " +
      temp_unmapped_file_name + "\n")
  unless system(
      liftover_executable,
      temp_bed_file_name,
      chain_file_name,
      temp_lifted_bed_file_name,
      temp_unmapped_file_name)
    STDERR.print "liftover command failed: " + $?.to_s + "\n\n"
    Process.exit(1)
  end

  print "restoring CSV format to liftover output ...\n"
  # turn the lifted bed file into a position map where new_position=>old_row
  temp_lifted_bed_file = File.open(temp_lifted_bed_file_name, "r")
  new_positions = {}
  sorted_files = []
  temp_lifted_bed_file.each_line do |line|

    bed_tokens = line.strip.split(' ')

    # the remapped position will be the 1st token of the bed file
    new_position = bed_tokens[1]

    # the old csv line should have been passed through unchanged to the
    # 3rd token. Tokenize this line and replace the old position
    # with the new position
    old_csv_line = bed_tokens[3]
    csv_tokens = old_csv_line.split(',')
    csv_tokens[base_pair_column_index] = new_position

    # map the tokens by base pair position
    new_positions[new_position.to_i] = csv_tokens

    # don't let the map grow past "max rows" value. dump the current values
    # to an intermediate file
    if new_positions.length == max_rows_in_memory
      temp_sorted_file_name =
        temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." +
        sorted_files.length.to_s
      print(
        "  sorting and saving " + new_positions.length.to_s + " rows to " +
          temp_sorted_file_name + " ...\n")
      sorted_files[sorted_files.length] = sortandsave(
        new_positions,
        temp_sorted_file_name)
      new_positions.replace({})
    end
  end

  # dump any remaining rows to the final intermediate file
  unless new_positions.empty?
    temp_sorted_file_name =
      temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." +
      sorted_files.length.to_s
    print(
      "  sorting and saving " + new_positions.length.to_s + " rows to " +
        temp_sorted_file_name + " ...\n")
    sorted_files[sorted_files.length] = sortandsave(
      new_positions,
      temp_sorted_file_name)
    new_positions.replace({})
  end

  # merge all of the partial results into our final output file
  print "  merging results to " + out_file_name + " ...\n"
  sorted_file_map = {}
  out_file = File.open(out_file_name, "w")
  out_file.puts(header_line)
  sorted_files.each do |curr_file|
    line = curr_file.gets
    if line != nil
      sorted_file_map[curr_file] = line.strip.split(',')
    end
  end

  # keep track of the prev snp position for a sanity check
  prev_snp_position = -1
  until sorted_file_map.empty?
    min_file = sorted_file_map.keys[0]
    min_position = sorted_file_map.values[0][base_pair_column_index].to_i
    sorted_file_map.each do |file_key, value|
      curr_position = value[base_pair_column_index].to_i
      if curr_position < min_position
        min_position = curr_position
        min_file = file_key
      end
    end

    # do sanity check to make sure that SNP positions are being output
    # in order
    if min_position < prev_snp_position
      STDERR.print "Error detected: bad order for SNP output"
      Process.exit(1)
    else
      prev_snp_position = min_position
    end

    out_file.puts(sorted_file_map[min_file].join(','))
    new_value = min_file.gets
    if new_value == nil
      sorted_file_map.delete(min_file)
      min_file.close
    else
      sorted_file_map[min_file] = new_value.strip.split(',')
    end
  end

  temp_lifted_bed_file.close
  out_file.close

  # delete all of the temporary files
  Dir.entries(temp_dir_name).each do |temp_file|
    begin
      File.delete(
        temp_dir_name + File::SEPARATOR +
        temp_file)
    rescue
      # don't care... if our Dir delete fails, then we're in trouble
    end
  end
  Dir.delete(temp_dir_name)
end

################################################################################
# Main entry point of script.
# Run the script without any parameters to get usage.
################################################################################

# assign meaningful names to the arguments
if ARGV.length == 3
  in_file_arg                 = ARGV[0]
  chromosome_column_name_arg  = "ChrID"
  base_pair_column_name_arg   = "build 36 bp Position"
  chain_file_arg              = ARGV[1]
  out_file_arg                = ARGV[2]
elsif ARGV.length == 5
  in_file_arg                 = ARGV[0]
  chromosome_column_name_arg  = ARGV[1]
  base_pair_column_name_arg   = ARGV[2]
  chain_file_arg              = ARGV[3]
  out_file_arg                = ARGV[4]
else
  # the args don't match what we expect... quit
  printusageandexit
end

if File.directory?(in_file_arg)
  unless File.directory?(out_file_arg)
    Dir.mkdir(out_file_arg)
  end

  print "converting directory " + in_file_arg + " ...\n"

  Dir.entries(in_file_arg).each do |curr_file|
    if File.extname(curr_file) == ".csv"
      curr_in_file_name = in_file_arg + File::SEPARATOR + curr_file
      curr_out_file_name = out_file_arg + File::SEPARATOR + File.basename(curr_file)
      print "========================================\n"
      print(
        "converting " + curr_in_file_name + " to " +
        curr_out_file_name + " ...\n")
      lift_snps(
        curr_in_file_name,
        chromosome_column_name_arg,
        base_pair_column_name_arg,
        chain_file_arg,
        curr_out_file_name)
    end
  end
else
  lift_snps(
    in_file_arg,
    chromosome_column_name_arg,
    base_pair_column_name_arg,
    chain_file_arg,
    out_file_arg)
end
	#!/usr/bin/ruby -w

	################################################################################
	# Created By: Keith Sheppard (keith.sheppard@jax.org)
	# Modification History:
	# July 7, 2008: Initial revision created to run the liftover utility on
	# the CGD (cgd.jax.org) imputed SNP data in comma-separated format
	#
	# This is a ruby script for running liftover to map SNPs in a source
	# comma-separated file to a destination file using a different NCBI genome
	# coordinate system. Run the script with no parameters to see usage
	################################################################################

	################################################################################
	# a little function for printing usage and quitting the script
	################################################################################
	def printusageandexit(exit_status=1)
	STDERR.print(
	"Usage: liftover-csv.rb <input.csv\|input-dir> [chromosome-column-name] " +
	"[base-pair-position-column-name] chain-file <output.csv\|output-dir>\n")
	STDERR.print " default chromosome-column-name=ChrID\n"
	STDERR.print " default base-pair-position-column-name=build 36 bp Position\n"
	STDERR.print " NOTE: this script expects the \"liftover\" executable to be\n"
	STDERR.print " on the path\n"
	STDERR.print " NOTE: the output will only be sorted correctly if the \n"
	STDERR.print " input file is for a single chromosome. this is because\n"
	STDERR.print " the output is sorted on base-pair position only\n"

	Process.exit(exit_status)
	end

	################################################################################
	# get the index of the column with the given name assuming that the header
	# passed in is comma separated. If we can't find it, return -1
	################################################################################
	def findcolumnindex(header_line, column_name)
	i = 0
	header_line.split(',').each do \|each_column_name\|
	if each_column_name == column_name
	return i
	else
	i = i + 1
	end
	end

	# we couldn't find it...
	return -1
	end

	################################################################################
	# sort and save the genotype data to the given file. this function returns
	# an opened read-only reference to the file that it creates
	################################################################################
	def sortandsave(data_hash, file_name)
	out_file = File.open(file_name, "w")
	data_hash.sort.each do \|key, value\|
	# join the data with a ',' so that we follow a CSV format, and write it
	# to the file
	out_file.puts(value.join(","))
	end

	# OK, we're done.. close the file
	out_file.close

	# Now reopen the file as read-only and return it
	return File.open(file_name, "r")
	end

	################################################################################
	# Lift the given SNPs using the liftover command line program. The liftover
	# application should be in the users PATH and should be executable by typing
	# "liftover"
	# in_file_name:
	# the path of the input file. this file should be a CSV file with a header
	# row. the header row should contain a "ChrID" column that contains
	# chromosome ID's in BED format (so that liftover can use them) and all of
	# the chromosomes should be the same.
	# chromosome_column_name_arg:
	# the name of the column that contains the chromosome name
	# base_pair_column_name_name:
	# the name of the column that contains the SNPs
	# chain_file_name:
	# the path to the chain file to use for the liftover command
	# out_file_name:
	# the path to the output file that we should create for the lifted data
	################################################################################
	def lift_snps(
	in_file_name,
	chromosome_column_name_arg,
	base_pair_column_name_name,
	chain_file_name,
	out_file_name)

	# These variables can be modified (or maybe parameterized) to match what you
	# need them to be.

	# The name of the liftover executable
	liftover_executable = "liftover"

	# If you're running out of memory you should reduce the
	# max_rows_in_memory variable.
	max_rows_in_memory = 100000

	# do a couple of sanity checks before we start
	if File.exists?(out_file_name)
	STDERR.print(
	"Refusing to run because output file " + out_file_name + " exists\n")
	Process.exit(1)
	end

	# intermediate/temporary files used in the invokation of liftover
	temp_dir_name = "temp-dir"

	if File.exists?(temp_dir_name)
	STDERR.print(
	"Refusing to run because " + temp_dir_name + " already exists." +
	" If you don't need it, delete it and re-run the command.\n")
	Process.exit(1)
	end

	Dir.mkdir(temp_dir_name)
	temp_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me.bed"
	temp_lifted_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me-also.bed"
	temp_unmapped_file_name = temp_dir_name + File::SEPARATOR + "delete-me.unmapped"

	# open up the input genotype file
	in_file = File.open(in_file_name, "r")

	# blow through and comments or empty lines to get to the CSV header
	header_line = ""
	in_file.each_line do \|each_header_line\|
	each_header_line = each_header_line.strip
	if (!each_header_line.empty?) && each_header_line[0..0] != '#'
	header_line = each_header_line
	break
	end
	end

	# find the column where the base pair value is kept
	base_pair_column_index = findcolumnindex(header_line, base_pair_column_name_name)
	if(base_pair_column_index == -1)
	STDERR.print(
	"Failed to locate column named " + base_pair_column_name_name +
	" in file " + in_file_name + "\n\n")
	printusageandexit
	end

	# find the column where the chromosome identifier is kept
	chr_id_column_index = findcolumnindex(header_line, chromosome_column_name_arg)
	if(chr_id_column_index == -1)
	STDERR.print(
	"Failed to locate column named " + chromosome_column_name_arg +
	" in file " + in_file_name + "\n\n")
	printusageandexit
	end

	# create a BED formatted file so that liftover can convert the data. Also
	# create a mapping from old_position=>old_row
	print(
	"building temporary BED file for SNP data from " +
	in_file_name + " ...\n")
	temp_bed_file = File.open(temp_bed_file_name, "w")
	in_file.each_line do \|line\|
	line = line.strip
	line_fields = line.split(',');
	base_pair_position = line_fields[base_pair_column_index]
	base_pair_plus_one_position = (base_pair_position.to_i + 1).to_s
	chromosome_name = line_fields[chr_id_column_index]

	# liftover will pass the line through the 4th position unmodified, so we'll
	# be able to rebuild the rows
	temp_bed_file.puts(
	chromosome_name + "\t" +
	base_pair_position + "\t" +
	base_pair_plus_one_position + "\t" +
	line)
	end

	# close the BED files
	in_file.close
	temp_bed_file.close

	# make sure that the chain file that we were given exists
	unless File.exists?(chain_file_name)
	STDERR.print "Can't find chain file: " + chain_file_name + " ...\n\n"
	printusageandexit
	end

	# OK... try to invoke the liftover command
	print "running liftover on the BED file ...\n"
	print(
	liftover_executable + " " +
	temp_bed_file_name + " " +
	chain_file_name + " " +
	temp_lifted_bed_file_name + " " +
	temp_unmapped_file_name + "\n")
	unless system(
	liftover_executable,
	temp_bed_file_name,
	chain_file_name,
	temp_lifted_bed_file_name,
	temp_unmapped_file_name)
	STDERR.print "liftover command failed: " + $?.to_s + "\n\n"
	Process.exit(1)
	end

	print "restoring CSV format to liftover output ...\n"
	# turn the lifted bed file into a position map where new_position=>old_row
	temp_lifted_bed_file = File.open(temp_lifted_bed_file_name, "r")
	new_positions = {}
	sorted_files = []
	temp_lifted_bed_file.each_line do \|line\|

	bed_tokens = line.strip.split(' ')

	# the remapped position will be the 1st token of the bed file
	new_position = bed_tokens[1]

	# the old csv line should have been passed through unchanged to the
	# 3rd token. Tokenize this line and replace the old position
	# with the new position
	old_csv_line = bed_tokens[3]
	csv_tokens = old_csv_line.split(',')
	csv_tokens[base_pair_column_index] = new_position

	# map the tokens by base pair position
	new_positions[new_position.to_i] = csv_tokens

	# don't let the map grow past "max rows" value. dump the current values
	# to an intermediate file
	if new_positions.length == max_rows_in_memory
	temp_sorted_file_name =
	temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." +
	sorted_files.length.to_s
	print(
	" sorting and saving " + new_positions.length.to_s + " rows to " +
	temp_sorted_file_name + " ...\n")
	sorted_files[sorted_files.length] = sortandsave(
	new_positions,
	temp_sorted_file_name)
	new_positions.replace({})
	end
	end

	# dump any remaining rows to the final intermediate file
	unless new_positions.empty?
	temp_sorted_file_name =
	temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." +
	sorted_files.length.to_s
	print(
	" sorting and saving " + new_positions.length.to_s + " rows to " +
	temp_sorted_file_name + " ...\n")
	sorted_files[sorted_files.length] = sortandsave(
	new_positions,
	temp_sorted_file_name)
	new_positions.replace({})
	end

	# merge all of the partial results into our final output file
	print " merging results to " + out_file_name + " ...\n"
	sorted_file_map = {}
	out_file = File.open(out_file_name, "w")
	out_file.puts(header_line)
	sorted_files.each do \|curr_file\|
	line = curr_file.gets
	if line != nil
	sorted_file_map[curr_file] = line.strip.split(',')
	end
	end

	# keep track of the prev snp position for a sanity check
	prev_snp_position = -1
	until sorted_file_map.empty?
	min_file = sorted_file_map.keys[0]
	min_position = sorted_file_map.values[0][base_pair_column_index].to_i
	sorted_file_map.each do \|file_key, value\|
	curr_position = value[base_pair_column_index].to_i
	if curr_position < min_position
	min_position = curr_position
	min_file = file_key
	end
	end

	# do sanity check to make sure that SNP positions are being output
	# in order
	if min_position < prev_snp_position
	STDERR.print "Error detected: bad order for SNP output"
	Process.exit(1)
	else
	prev_snp_position = min_position
	end

	out_file.puts(sorted_file_map[min_file].join(','))
	new_value = min_file.gets
	if new_value == nil
	sorted_file_map.delete(min_file)
	min_file.close
	else
	sorted_file_map[min_file] = new_value.strip.split(',')
	end
	end

	temp_lifted_bed_file.close
	out_file.close

	# delete all of the temporary files
	Dir.entries(temp_dir_name).each do \|temp_file\|
	begin
	File.delete(
	temp_dir_name + File::SEPARATOR +
	temp_file)
	rescue
	# don't care... if our Dir delete fails, then we're in trouble
	end
	end
	Dir.delete(temp_dir_name)
	end

	################################################################################
	# Main entry point of script.
	# Run the script without any parameters to get usage.
	################################################################################

	# assign meaningful names to the arguments
	if ARGV.length == 3
	in_file_arg = ARGV[0]
	chromosome_column_name_arg = "ChrID"
	base_pair_column_name_arg = "build 36 bp Position"
	chain_file_arg = ARGV[1]
	out_file_arg = ARGV[2]
	elsif ARGV.length == 5
	in_file_arg = ARGV[0]
	chromosome_column_name_arg = ARGV[1]
	base_pair_column_name_arg = ARGV[2]
	chain_file_arg = ARGV[3]
	out_file_arg = ARGV[4]
	else
	# the args don't match what we expect... quit
	printusageandexit
	end

	if File.directory?(in_file_arg)
	unless File.directory?(out_file_arg)
	Dir.mkdir(out_file_arg)
	end

	print "converting directory " + in_file_arg + " ...\n"

	Dir.entries(in_file_arg).each do \|curr_file\|
	if File.extname(curr_file) == ".csv"
	curr_in_file_name = in_file_arg + File::SEPARATOR + curr_file
	curr_out_file_name = out_file_arg + File::SEPARATOR + File.basename(curr_file)
	print "========================================\n"
	print(
	"converting " + curr_in_file_name + " to " +
	curr_out_file_name + " ...\n")
	lift_snps(
	curr_in_file_name,
	chromosome_column_name_arg,
	base_pair_column_name_arg,
	chain_file_arg,
	curr_out_file_name)
	end
	end
	else
	lift_snps(
	in_file_arg,
	chromosome_column_name_arg,
	base_pair_column_name_arg,
	chain_file_arg,
	out_file_arg)
	end