Skip to content

Instantly share code, notes, and snippets.

@keithshep
Created June 14, 2010 13:32
Show Gist options
  • Save keithshep/437676 to your computer and use it in GitHub Desktop.
Save keithshep/437676 to your computer and use it in GitHub Desktop.
liftover and sort a CSV file
#!/usr/bin/ruby -w
################################################################################
# Created By: Keith Sheppard (keith.sheppard@jax.org)
# Modification History:
# July 7, 2008: Initial revision created to run the liftover utility on
# the CGD (cgd.jax.org) imputed SNP data in comma-separated format
#
# This is a ruby script for running liftover to map SNPs in a source
# comma-separated file to a destination file using a different NCBI genome
# coordinate system. Run the script with no parameters to see usage
################################################################################
################################################################################
# a little function for printing usage and quitting the script
################################################################################
def printusageandexit(exit_status=1)
STDERR.print(
"Usage: liftover-csv.rb <input.csv|input-dir> [chromosome-column-name] " +
"[base-pair-position-column-name] chain-file <output.csv|output-dir>\n")
STDERR.print " default chromosome-column-name=ChrID\n"
STDERR.print " default base-pair-position-column-name=build 36 bp Position\n"
STDERR.print " NOTE: this script expects the \"liftover\" executable to be\n"
STDERR.print " on the path\n"
STDERR.print " NOTE: the output will only be sorted correctly if the \n"
STDERR.print " input file is for a single chromosome. this is because\n"
STDERR.print " the output is sorted on base-pair position only\n"
Process.exit(exit_status)
end
################################################################################
# get the index of the column with the given name assuming that the header
# passed in is comma separated. If we can't find it, return -1
################################################################################
def findcolumnindex(header_line, column_name)
i = 0
header_line.split(',').each do |each_column_name|
if each_column_name == column_name
return i
else
i = i + 1
end
end
# we couldn't find it...
return -1
end
################################################################################
# sort and save the genotype data to the given file. this function returns
# an opened read-only reference to the file that it creates
################################################################################
def sortandsave(data_hash, file_name)
out_file = File.open(file_name, "w")
data_hash.sort.each do |key, value|
# join the data with a ',' so that we follow a CSV format, and write it
# to the file
out_file.puts(value.join(","))
end
# OK, we're done.. close the file
out_file.close
# Now reopen the file as read-only and return it
return File.open(file_name, "r")
end
################################################################################
# Lift the given SNPs using the liftover command line program. The liftover
# application should be in the users PATH and should be executable by typing
# "liftover"
# in_file_name:
# the path of the input file. this file should be a CSV file with a header
# row. the header row should contain a "ChrID" column that contains
# chromosome ID's in BED format (so that liftover can use them) and all of
# the chromosomes should be the same.
# chromosome_column_name_arg:
# the name of the column that contains the chromosome name
# base_pair_column_name_name:
# the name of the column that contains the SNPs
# chain_file_name:
# the path to the chain file to use for the liftover command
# out_file_name:
# the path to the output file that we should create for the lifted data
################################################################################
def lift_snps(
in_file_name,
chromosome_column_name_arg,
base_pair_column_name_name,
chain_file_name,
out_file_name)
# These variables can be modified (or maybe parameterized) to match what you
# need them to be.
# The name of the liftover executable
liftover_executable = "liftover"
# If you're running out of memory you should reduce the
# max_rows_in_memory variable.
max_rows_in_memory = 100000
# do a couple of sanity checks before we start
if File.exists?(out_file_name)
STDERR.print(
"Refusing to run because output file " + out_file_name + " exists\n")
Process.exit(1)
end
# intermediate/temporary files used in the invokation of liftover
temp_dir_name = "temp-dir"
if File.exists?(temp_dir_name)
STDERR.print(
"Refusing to run because " + temp_dir_name + " already exists." +
" If you don't need it, delete it and re-run the command.\n")
Process.exit(1)
end
Dir.mkdir(temp_dir_name)
temp_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me.bed"
temp_lifted_bed_file_name = temp_dir_name + File::SEPARATOR + "delete-me-also.bed"
temp_unmapped_file_name = temp_dir_name + File::SEPARATOR + "delete-me.unmapped"
# open up the input genotype file
in_file = File.open(in_file_name, "r")
# blow through and comments or empty lines to get to the CSV header
header_line = ""
in_file.each_line do |each_header_line|
each_header_line = each_header_line.strip
if (!each_header_line.empty?) && each_header_line[0..0] != '#'
header_line = each_header_line
break
end
end
# find the column where the base pair value is kept
base_pair_column_index = findcolumnindex(header_line, base_pair_column_name_name)
if(base_pair_column_index == -1)
STDERR.print(
"Failed to locate column named " + base_pair_column_name_name +
" in file " + in_file_name + "\n\n")
printusageandexit
end
# find the column where the chromosome identifier is kept
chr_id_column_index = findcolumnindex(header_line, chromosome_column_name_arg)
if(chr_id_column_index == -1)
STDERR.print(
"Failed to locate column named " + chromosome_column_name_arg +
" in file " + in_file_name + "\n\n")
printusageandexit
end
# create a BED formatted file so that liftover can convert the data. Also
# create a mapping from old_position=>old_row
print(
"building temporary BED file for SNP data from " +
in_file_name + " ...\n")
temp_bed_file = File.open(temp_bed_file_name, "w")
in_file.each_line do |line|
line = line.strip
line_fields = line.split(',');
base_pair_position = line_fields[base_pair_column_index]
base_pair_plus_one_position = (base_pair_position.to_i + 1).to_s
chromosome_name = line_fields[chr_id_column_index]
# liftover will pass the line through the 4th position unmodified, so we'll
# be able to rebuild the rows
temp_bed_file.puts(
chromosome_name + "\t" +
base_pair_position + "\t" +
base_pair_plus_one_position + "\t" +
line)
end
# close the BED files
in_file.close
temp_bed_file.close
# make sure that the chain file that we were given exists
unless File.exists?(chain_file_name)
STDERR.print "Can't find chain file: " + chain_file_name + " ...\n\n"
printusageandexit
end
# OK... try to invoke the liftover command
print "running liftover on the BED file ...\n"
print(
liftover_executable + " " +
temp_bed_file_name + " " +
chain_file_name + " " +
temp_lifted_bed_file_name + " " +
temp_unmapped_file_name + "\n")
unless system(
liftover_executable,
temp_bed_file_name,
chain_file_name,
temp_lifted_bed_file_name,
temp_unmapped_file_name)
STDERR.print "liftover command failed: " + $?.to_s + "\n\n"
Process.exit(1)
end
print "restoring CSV format to liftover output ...\n"
# turn the lifted bed file into a position map where new_position=>old_row
temp_lifted_bed_file = File.open(temp_lifted_bed_file_name, "r")
new_positions = {}
sorted_files = []
temp_lifted_bed_file.each_line do |line|
bed_tokens = line.strip.split(' ')
# the remapped position will be the 1st token of the bed file
new_position = bed_tokens[1]
# the old csv line should have been passed through unchanged to the
# 3rd token. Tokenize this line and replace the old position
# with the new position
old_csv_line = bed_tokens[3]
csv_tokens = old_csv_line.split(',')
csv_tokens[base_pair_column_index] = new_position
# map the tokens by base pair position
new_positions[new_position.to_i] = csv_tokens
# don't let the map grow past "max rows" value. dump the current values
# to an intermediate file
if new_positions.length == max_rows_in_memory
temp_sorted_file_name =
temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." +
sorted_files.length.to_s
print(
" sorting and saving " + new_positions.length.to_s + " rows to " +
temp_sorted_file_name + " ...\n")
sorted_files[sorted_files.length] = sortandsave(
new_positions,
temp_sorted_file_name)
new_positions.replace({})
end
end
# dump any remaining rows to the final intermediate file
unless new_positions.empty?
temp_sorted_file_name =
temp_dir_name + File::SEPARATOR + File.basename(out_file_name) + "." +
sorted_files.length.to_s
print(
" sorting and saving " + new_positions.length.to_s + " rows to " +
temp_sorted_file_name + " ...\n")
sorted_files[sorted_files.length] = sortandsave(
new_positions,
temp_sorted_file_name)
new_positions.replace({})
end
# merge all of the partial results into our final output file
print " merging results to " + out_file_name + " ...\n"
sorted_file_map = {}
out_file = File.open(out_file_name, "w")
out_file.puts(header_line)
sorted_files.each do |curr_file|
line = curr_file.gets
if line != nil
sorted_file_map[curr_file] = line.strip.split(',')
end
end
# keep track of the prev snp position for a sanity check
prev_snp_position = -1
until sorted_file_map.empty?
min_file = sorted_file_map.keys[0]
min_position = sorted_file_map.values[0][base_pair_column_index].to_i
sorted_file_map.each do |file_key, value|
curr_position = value[base_pair_column_index].to_i
if curr_position < min_position
min_position = curr_position
min_file = file_key
end
end
# do sanity check to make sure that SNP positions are being output
# in order
if min_position < prev_snp_position
STDERR.print "Error detected: bad order for SNP output"
Process.exit(1)
else
prev_snp_position = min_position
end
out_file.puts(sorted_file_map[min_file].join(','))
new_value = min_file.gets
if new_value == nil
sorted_file_map.delete(min_file)
min_file.close
else
sorted_file_map[min_file] = new_value.strip.split(',')
end
end
temp_lifted_bed_file.close
out_file.close
# delete all of the temporary files
Dir.entries(temp_dir_name).each do |temp_file|
begin
File.delete(
temp_dir_name + File::SEPARATOR +
temp_file)
rescue
# don't care... if our Dir delete fails, then we're in trouble
end
end
Dir.delete(temp_dir_name)
end
################################################################################
# Main entry point of script.
# Run the script without any parameters to get usage.
################################################################################
# assign meaningful names to the arguments
if ARGV.length == 3
in_file_arg = ARGV[0]
chromosome_column_name_arg = "ChrID"
base_pair_column_name_arg = "build 36 bp Position"
chain_file_arg = ARGV[1]
out_file_arg = ARGV[2]
elsif ARGV.length == 5
in_file_arg = ARGV[0]
chromosome_column_name_arg = ARGV[1]
base_pair_column_name_arg = ARGV[2]
chain_file_arg = ARGV[3]
out_file_arg = ARGV[4]
else
# the args don't match what we expect... quit
printusageandexit
end
if File.directory?(in_file_arg)
unless File.directory?(out_file_arg)
Dir.mkdir(out_file_arg)
end
print "converting directory " + in_file_arg + " ...\n"
Dir.entries(in_file_arg).each do |curr_file|
if File.extname(curr_file) == ".csv"
curr_in_file_name = in_file_arg + File::SEPARATOR + curr_file
curr_out_file_name = out_file_arg + File::SEPARATOR + File.basename(curr_file)
print "========================================\n"
print(
"converting " + curr_in_file_name + " to " +
curr_out_file_name + " ...\n")
lift_snps(
curr_in_file_name,
chromosome_column_name_arg,
base_pair_column_name_arg,
chain_file_arg,
curr_out_file_name)
end
end
else
lift_snps(
in_file_arg,
chromosome_column_name_arg,
base_pair_column_name_arg,
chain_file_arg,
out_file_arg)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment