epaule/cut.txt

## cut.txt
Step 9 - Rename and re-run maps
It is likely that between the pairs of chromosomes there will be discrepancies in size. This will result in the files being in different orders when they get size sorted. This needs to be corrected before submission. The can be done manually or using the following method:

Run:
bsub -G grit-grp -n 16 -e e_hap2hap_mapping -o o_hap2hap_mapping -M 16000 -R 'select[mem>16000] rusage[mem=16000] span[hosts=1]' sh /software/grit/projects/vgp_curation_scripts/hap2_hap1_ID_mapping.sh <h1_fasta> <h2_fasta>


Then:
/software/grit/projects/vgp_curation_scripts/update_mapping.rb -c <renamed_hap2_chroms.csv> -f <hap2.fa> -t <hap2_hap1.tsv> > hap2.renamed.fa


Following this the curator can run a nucmer alignment to double check the naming is correct. This is especially recommended for birds as the renaming script may not get the correct naming for the small repetitive micro chromosomes.

Once satisfied, change the names of the renamed files to the names required for submission.

i.e.:
mv hap2.renamed.fa tolID_2.curated_primary.no_mt.unscrubbed.fa
mv <renamed_hap2_chroms.csv> tolID_2.chromosome.list.csv


## hap2_hap1_ID_mapping.sh
#!/bin/bash

# Tom Mathers.

# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.

# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.

# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2.

# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).

# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.

# Run on farm with at least 16 cores and 16 Gb ram.


if [ $# -ne 2 ]; then
    echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
    exit 1
fi


hap1=$1
hap2=$2

/software/grit/bin//mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16 -s 50000;

cut -d " " -f1 mashmap.out | grep -v SCAFFOLD | grep -v unloc | uniq > tmp; while read id; do awk -v val=${id} '$1==val' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv

rm tmp

## update_mapping.rb
#!/usr/bin/env ruby

require 'optparse'

fasta = ''
table = ''
debug = false
new_mapping = false
chromosome_list = false

def get_mapping(file)
  mapping = Hash.new
  File.new(file).each_line{|line|
    line.chomp!
    c=line.split
    next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across
    next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping
    next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping
    mapping[c[0]]=c[1]
  }
  mapping
end

def create_chromosme_list_line(id)
    /(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
    name   = "#{$1}"
    suffix = "#{$2}"
    stripped_name = name.sub("SUPER_",'')
    line = []
    if suffix.length >=1
      line = [name+suffix,stripped_name,'no']
    else
      line = [name,stripped_name,'yes']
    end
    line.join(',')
end

OptionParser.new do |parser|
  parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug | --new_mapping NEW_MAPPING_TSV | --help | --chromosome_list NEW_CHROMOSOME_LIST]"
  parser.on("-f", "--fasta FASTA_FILE") { |f| fasta = f }
  parser.on("-t", "--mashmap_table TABLE_TSV") { |f| table = f }
  parser.on("-d", "--debug") { debug = true }
  parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { |f| new_mapping = f }
  parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {|f| chromosome_list = f}
  parser.on("-h", "--help") do
    puts parser
    exit
  end
end.parse!

hap2_hap1_mapping = get_mapping(table)

count=1
new_table = File.new(new_mapping,'w') if new_mapping
new_chrom_file = File.new(chromosome_list,'w') if chromosome_list

File.new(fasta).each_line{|line|
    if />(SUPER_\d+)(_unloc_\d+)*/.match(line)
      name   = "#{$1}"
      suffix = "#{$2}"
      if hap2_hap1_mapping.has_key?(name)
        line = line.sub(name, hap2_hap1_mapping[name])
        $stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug
        new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
        new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
      else
        while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}")
          count+=1
        end
        id = "SUPER_#{count}"
        line = line.sub(name,id)
        hap2_hap1_mapping[id]=name
        $stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug
        new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
        new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
      end
    elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line)
      name   = "#{$1}"
      suffix = "#{$2}"
      new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list
    end
    print line
}
	Step 9 - Rename and re-run maps
	It is likely that between the pairs of chromosomes there will be discrepancies in size. This will result in the files being in different orders when they get size sorted. This needs to be corrected before submission. The can be done manually or using the following method:

	Run:
	bsub -G grit-grp -n 16 -e e_hap2hap_mapping -o o_hap2hap_mapping -M 16000 -R 'select[mem>16000] rusage[mem=16000] span[hosts=1]' sh /software/grit/projects/vgp_curation_scripts/hap2_hap1_ID_mapping.sh <h1_fasta> <h2_fasta>


	Then:
	/software/grit/projects/vgp_curation_scripts/update_mapping.rb -c <renamed_hap2_chroms.csv> -f <hap2.fa> -t <hap2_hap1.tsv> > hap2.renamed.fa


	Following this the curator can run a nucmer alignment to double check the naming is correct. This is especially recommended for birds as the renaming script may not get the correct naming for the small repetitive micro chromosomes.

	Once satisfied, change the names of the renamed files to the names required for submission.

	i.e.:
	mv hap2.renamed.fa tolID_2.curated_primary.no_mt.unscrubbed.fa
	mv <renamed_hap2_chroms.csv> tolID_2.chromosome.list.csv
	#!/bin/bash

	# Tom Mathers.

	# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies.

	# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping.

	# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2.

	# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs).

	# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument.

	# Run on farm with at least 16 cores and 16 Gb ram.


	if [ $# -ne 2 ]; then
	echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta
	exit 1
	fi


	hap1=$1
	hap2=$2

	/software/grit/bin//mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16 -s 50000;

	cut -d " " -f1 mashmap.out \| grep -v SCAFFOLD \| grep -v unloc \| uniq > tmp; while read id; do awk -v val=${id} '$1==val' mashmap.out \| awk '{print $0 "\t" $9 - $8}' \| sort -nrk11,11 \| head -n1 ; done < tmp \| awk '{print $1 "\t" $6}' > hap2_hap1.tsv

	rm tmp
	#!/usr/bin/env ruby

	require 'optparse'

	fasta = ''
	table = ''
	debug = false
	new_mapping = false
	chromosome_list = false

	def get_mapping(file)
	mapping = Hash.new
	File.new(file).each_line{\|line\|
	line.chomp!
	c=line.split
	next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across
	next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping
	next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping
	mapping[c[0]]=c[1]
	}
	mapping
	end

	def create_chromosme_list_line(id)
	/(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
	name = "#{$1}"
	suffix = "#{$2}"
	stripped_name = name.sub("SUPER_",'')
	line = []
	if suffix.length >=1
	line = [name+suffix,stripped_name,'no']
	else
	line = [name,stripped_name,'yes']
	end
	line.join(',')
	end

	OptionParser.new do \|parser\|
	parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug \| --new_mapping NEW_MAPPING_TSV \| --help \| --chromosome_list NEW_CHROMOSOME_LIST]"
	parser.on("-f", "--fasta FASTA_FILE") { \|f\| fasta = f }
	parser.on("-t", "--mashmap_table TABLE_TSV") { \|f\| table = f }
	parser.on("-d", "--debug") { debug = true }
	parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { \|f\| new_mapping = f }
	parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {\|f\| chromosome_list = f}
	parser.on("-h", "--help") do
	puts parser
	exit
	end
	end.parse!

	hap2_hap1_mapping = get_mapping(table)

	count=1
	new_table = File.new(new_mapping,'w') if new_mapping
	new_chrom_file = File.new(chromosome_list,'w') if chromosome_list

	File.new(fasta).each_line{\|line\|
	if />(SUPER_\d+)(_unloc_\d+)*/.match(line)
	name = "#{$1}"
	suffix = "#{$2}"
	if hap2_hap1_mapping.has_key?(name)
	line = line.sub(name, hap2_hap1_mapping[name])
	$stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug
	new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
	new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
	else
	while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}")
	count+=1
	end
	id = "SUPER_#{count}"
	line = line.sub(name,id)
	hap2_hap1_mapping[id]=name
	$stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug
	new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping
	new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list
	end
	elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line)
	name = "#{$1}"
	suffix = "#{$2}"
	new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list
	end
	print line
	}