Last active
September 7, 2023 09:18
-
-
Save epaule/c01d1e30bc94dd5c22fb04ef546c1922 to your computer and use it in GitHub Desktop.
chromosome renaming
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Step 9 - Rename and re-run maps | |
It is likely that between the pairs of chromosomes there will be discrepancies in size. This will result in the files being in different orders when they get size sorted. This needs to be corrected before submission. The can be done manually or using the following method: | |
Run: | |
bsub -G grit-grp -n 16 -e e_hap2hap_mapping -o o_hap2hap_mapping -M 16000 -R 'select[mem>16000] rusage[mem=16000] span[hosts=1]' sh /software/grit/projects/vgp_curation_scripts/hap2_hap1_ID_mapping.sh <h1_fasta> <h2_fasta> | |
Then: | |
/software/grit/projects/vgp_curation_scripts/update_mapping.rb -c <renamed_hap2_chroms.csv> -f <hap2.fa> -t <hap2_hap1.tsv> > hap2.renamed.fa | |
Following this the curator can run a nucmer alignment to double check the naming is correct. This is especially recommended for birds as the renaming script may not get the correct naming for the small repetitive micro chromosomes. | |
Once satisfied, change the names of the renamed files to the names required for submission. | |
i.e.: | |
mv hap2.renamed.fa tolID_2.curated_primary.no_mt.unscrubbed.fa | |
mv <renamed_hap2_chroms.csv> tolID_2.chromosome.list.csv | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Tom Mathers. | |
# Script to map hap2 chromosome (SUPER) IDs to hap1 IDs for genomeark assemblies. | |
# Script aligns hap2 to hap1 with mashmap and selects the longest alignment per chrom. Only scaffolds with SUPER IDs are included in the output mapping. | |
# Output file is a tsv with HAP2 id in column 1 and the corresponding HAP1 ID in column 2. | |
# Input files are hap1 and hap2 fasta files generated by rapid_join.pl (ie. with SUPER IDs). | |
# Always give <hap1_fasta> as first argument and <hap2_fasta> as second argument. | |
# Run on farm with at least 16 cores and 16 Gb ram. | |
if [ $# -ne 2 ]; then | |
echo $0: usage: sh hap2_hap1_ID_mapping.sh hap1_fasta hap2_fasta | |
exit 1 | |
fi | |
hap1=$1 | |
hap2=$2 | |
/software/grit/bin//mashmap -r ${hap1} -q ${hap2} -f one-to-one -t 16 -s 50000; | |
cut -d " " -f1 mashmap.out | grep -v SCAFFOLD | grep -v unloc | uniq > tmp; while read id; do awk -v val=${id} '$1==val' mashmap.out | awk '{print $0 "\t" $9 - $8}' | sort -nrk11,11 | head -n1 ; done < tmp | awk '{print $1 "\t" $6}' > hap2_hap1.tsv | |
rm tmp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'optparse' | |
fasta = '' | |
table = '' | |
debug = false | |
new_mapping = false | |
chromosome_list = false | |
def get_mapping(file) | |
mapping = Hash.new | |
File.new(file).each_line{|line| | |
line.chomp! | |
c=line.split | |
next unless c[1] =~ /SUPER_\d+/ # do not map sex chromosomes across | |
next if mapping.values.include?(c[1]) # create an artificial 1:1 mapping | |
next if mapping.keys.include?(c[0]) # create an artificial 1:1 mapping | |
mapping[c[0]]=c[1] | |
} | |
mapping | |
end | |
def create_chromosme_list_line(id) | |
/(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id) | |
name = "#{$1}" | |
suffix = "#{$2}" | |
stripped_name = name.sub("SUPER_",'') | |
line = [] | |
if suffix.length >=1 | |
line = [name+suffix,stripped_name,'no'] | |
else | |
line = [name,stripped_name,'yes'] | |
end | |
line.join(',') | |
end | |
OptionParser.new do |parser| | |
parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug | --new_mapping NEW_MAPPING_TSV | --help | --chromosome_list NEW_CHROMOSOME_LIST]" | |
parser.on("-f", "--fasta FASTA_FILE") { |f| fasta = f } | |
parser.on("-t", "--mashmap_table TABLE_TSV") { |f| table = f } | |
parser.on("-d", "--debug") { debug = true } | |
parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { |f| new_mapping = f } | |
parser.on("-c", "--chromosome_name NEW_CHROMOSOME_LIST") {|f| chromosome_list = f} | |
parser.on("-h", "--help") do | |
puts parser | |
exit | |
end | |
end.parse! | |
hap2_hap1_mapping = get_mapping(table) | |
count=1 | |
new_table = File.new(new_mapping,'w') if new_mapping | |
new_chrom_file = File.new(chromosome_list,'w') if chromosome_list | |
File.new(fasta).each_line{|line| | |
if />(SUPER_\d+)(_unloc_\d+)*/.match(line) | |
name = "#{$1}" | |
suffix = "#{$2}" | |
if hap2_hap1_mapping.has_key?(name) | |
line = line.sub(name, hap2_hap1_mapping[name]) | |
$stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug | |
new_table.puts([name+suffix, hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping | |
new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list | |
else | |
while hap2_hap1_mapping.reverse_each.to_h.has_key?("SUPER_#{count}") | |
count+=1 | |
end | |
id = "SUPER_#{count}" | |
line = line.sub(name,id) | |
hap2_hap1_mapping[id]=name | |
$stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug | |
new_table.puts( [name+suffix,hap2_hap1_mapping[name]+suffix].join("\t")) if new_mapping | |
new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name]+suffix)) if chromosome_list | |
end | |
elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line) | |
name = "#{$1}" | |
suffix = "#{$2}" | |
new_chrom_file.puts(create_chromosme_list_line(name+suffix)) if chromosome_list | |
end | |
print line | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment