Last active July 17, 2023 14:59
curation scripts
#!/bin/bash
# Script by Tom Mathers.
# Script requires 32 cores and ~50 Gb RAM (depending on genome size).
# seqkit and BUSCO need to be in PATH.
# Scaffold IDs need to be in final ToL format.
# Query and ref short IDs need to be two-letter codes, e.g. "Ac".
# Query and ref fasta files need to be in the working dir.
### V1.1: added code to remove unlocs from the links file.
if [ $# -ne 4 ]; then
    echo "$0: usage: sh BUSCO_synteny.sh ref_fasta query_fasta ref_short_id query_short_id"
    exit 1
fi
ref=$1
query=$2
ref_short_id=$3
query_short_id=$4
export MODULEPATH=/software/modules/:$MODULEPATH
module load ISG/singularity/
export SINGULARITY_TMPDIR=/lustre/scratch123/tol/teams/grit/mh6/singularity
export SINGULARITY_CACHEDIR=/lustre/scratch123/tol/teams/grit/mh6/singularity/cache
SIF=/lustre/scratch123/tol/teams/grit/mh6/singularity/busco.sif
BUSCO="singularity exec --bind `pwd`:$HOME,/lustre/scratch123/tol/resources/busco/v5:/busco_downloads $SIF busco --auto-lineage --offline -c 32 -m genome -i"
# Run BUSCO on ref assembly.
$BUSCO $ref -o busco5_${ref_short_id}
# Run BUSCO on query assembly.
$BUSCO $query -o busco5_${query_short_id}
# Extract BUSCOs for chroms only.
grep Complete busco5_${ref_short_id}/run_/full_table.tsv | awk '{print $3 "\t" $4 "\t" $5 "\t" $1}' | grep SUPER | sed "s/SUPER_/${ref_short_id}/g" > ${ref_short_id}.tmp
grep Complete busco5_${query_short_id}/run_/full_table.tsv | awk '{print $3 "\t" $4 "\t" $5 "\t" $1}' | grep SUPER | sed "s/SUPER_/${query_short_id}/g" > ${query_short_id}.tmp
# Get shared BUSCO IDs.
cut -f4 ${ref_short_id}.tmp > tmp; grep -wFf tmp ${query_short_id}.tmp > ${query_short_id}.shared.tmp
cut -f4 ${query_short_id}.tmp > tmp; grep -wFf tmp ${ref_short_id}.tmp > ${ref_short_id}.shared.tmp
cut -f4 ${query_short_id}.shared.tmp > ${ref_short_id}_${query_short_id}_shared_buscos.txt
# Extract shared BUSCO coords from each species and make the links file. Add 1 Mb to each BUSCO end to improve visualisation.
echo "chr1,start1,end1,chr2,start2,end2,color" > links_header
while read id; do grep -w ${id} ${query_short_id}.tmp; done < ${ref_short_id}_${query_short_id}_shared_buscos.txt > tmp1
while read id; do grep -w ${id} ${ref_short_id}.tmp; done < ${ref_short_id}_${query_short_id}_shared_buscos.txt | paste tmp1 - | cut -f1-3,5-7 | awk '{print $1 "," $2 "," $3+1000000 "," $4 "," $5 "," $6+1000000}' > tmp2; cut -d "," -f4 tmp2 | paste tmp2 - | sed 's/\t/,/g' | cat links_header - | grep -v unloc > ${ref_short_id}_${query_short_id}.links
# Generate chromosome file for shiny circos.
seqkit fx2tab --length --name ${ref} | grep -v unloc | grep SUPER | awk '{print $1 ",1," $2}' | sed "s/SUPER_/${ref_short_id}/g" > tmp
seqkit fx2tab --length --name ${query} | grep -v unloc | grep SUPER | awk '{print $1 ",1," $2}' | sed "s/SUPER_/${query_short_id}/g" | cat - tmp > ${ref_short_id}_${query_short_id}_chrom.csv
# Generate random colour codes for ref chroms for shiny circos.
grep ${ref_short_id} ${ref_short_id}_${query_short_id}_chrom.csv | cut -d "," -f1 > tmp
while read id; do echo ${id}:"#$(openssl rand -hex 3)"; done < tmp | tr "\n" ";" | awk '{print $0}' > ref_colour_codes.txt
# Clean up.
rm ${ref_short_id}.tmp ${query_short_id}.tmp ${query_short_id}.shared.tmp ${ref_short_id}.shared.tmp links_header tmp1 tmp2 tmp
# cleanup BUSCO dir
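For reference, the links file has one row per shared BUSCO. A minimal Ruby sketch of how a single row is assembled (coordinates and the `Ac`/`Bd` short IDs are made-up examples):

```ruby
# Sketch of one shiny-circos links row, mirroring the awk/paste step above.
# The 1,000,000 bp padding on the end coordinates matches the script; the
# colour column simply repeats the second (ref) chromosome name.
PAD = 1_000_000

def links_row(query_hit, ref_hit)
  chr1, start1, end1 = query_hit   # BUSCO coords on the query assembly
  chr2, start2, end2 = ref_hit     # same BUSCO's coords on the ref assembly
  [chr1, start1, end1 + PAD, chr2, start2, end2 + PAD, chr2].join(",")
end

puts "chr1,start1,end1,chr2,start2,end2,color"
puts links_row(["Ac1", 100, 2000], ["Bd1", 500, 2400])
# => Ac1,100,1002000,Bd1,500,1002400,Bd1
```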
#!/bin/bash
INPUT=$1
TARGETDIR=$2
copy_files(){
    cp rapid_prtxt_XL.tpf haps_rapid_prtxt_XL.tpf ./"$INPUT".curated_primary.no_mt.unscrubbed.fa ./"$INPUT".inter.csv ./"$INPUT".additional_haplotigs.unscrubbed.fa ./"$INPUT".chromosome.list.csv ./"$INPUT".curated.agp "$TARGETDIR"
    if [ $? -eq 0 ]; then
        ls -lt "$TARGETDIR"
    else
        echo "something went wrong, copy files manually"
    fi
}
copy_files
#!/nfs/users/nfs_m/mh6/.rvm/rubies/ruby-3.2.1/bin/ruby
# hard-coded Ruby path until a central v3.x install is available
require 'optparse'
require "net/http"
require "json"
require "yaml"
require "fileutils"

class GritJiraIssue
  @@url = "jira.sanger.ac.uk"

  def initialize(name)
    @id = name
    @token = self.get_token
  end

  def json
    @json ||= self.get_json
  end

  def yaml
    @yaml ||= self.get_yaml
  end

  def hic_read_dir
    self.yaml["hic_read_dir"]
  end

  def projects
    self.yaml['projects']
  end

  def geval_db
    self.json["fields"]["customfield_11643"]
  end

  def decon_file
    self.json["fields"]["customfield_11677"]
  end

  def release_version
    self.json["fields"]["customfield_11609"].to_i
  end

  def tol_id
    self.yaml["specimen"]
  end

  # in the form of tol_id _ version
  def sample_version
    if self.geval_db
      return self.geval_db.split("_")[2..-1].join("_")
    else
      return "#{self.tol_id}_#{self.release_version}"
    end
  end

  # curation working directory
  def working_dir
    "/lustre/scratch123/tol/teams/grit/#{ENV["USER"]}/#{self.tol_id}_#{self.release_version}"
  end

  def pretext_dir
    prefix = self.tol_id[0]
    dir = Dir["/nfs/team135/curated_pretext_maps/#{prefix}*"].select{|f| File.directory?(f)}
    if prefix == "i"
      second = self.tol_id[1]
      dir = Dir["/nfs/team135/curated_pretext_maps/#{prefix}_*/#{second}_*/"].select{|f| File.directory?(f)}
    end
    dir[0]
  end

  # directory where the curated files go, based on the decon file directory
  def curated_dir
    t = self.decon_file.split("/")[0..-4].join("/") + "/curated/" + self.tol_id
    if self.projects.map(&:downcase).include? "genomeark"
      t += ".genomeark.#{self.release_version}"
    else
      t += ".#{self.release_version}"
    end
    t
  end

  ## methods that actually do things

  # reads the token from .netrc
  def get_token
    File.new(File.expand_path("~/.netrc")).each_line { |line|
      columns = line.chomp.split
      return columns[-1].to_s if @@url == columns[1]
    }
    raise "cannot get token for #{@@url}"
  end

  def get_json
    r = Net::HTTP.get_response(URI("https://#{@@url}/rest/api/2/issue/#{@id}"), {Accept: "application/json", Authorization: "Bearer #{@token}"})
    raise "cannot get the ticket" unless r.is_a?(Net::HTTPSuccess)
    JSON.parse(r.body)
  end

  def get_yaml
    yaml_url = self.json["fields"]["attachment"].map{ |e| e["content"] }.select { |elem| /.*\.y(a)*ml/.match(elem) }[0]
    r = Net::HTTP.get_response(URI("#{yaml_url}"), {Authorization: "Bearer #{@token}"})
    raise "cannot get the yaml" unless r.is_a?(Net::HTTPSuccess)
    YAML.load(r.body)
  end
end
# end of the class

# copy pretext
def setup_local(y)
  wd = "#{Dir.pwd}/#{y.tol_id}"
  FileUtils.mkdir_p(wd)
  cmd = <<-HERE
    touch #{wd}/notes;
    scp #{ENV["USER"]}@tol:/nfs/treeoflife-01/teams/grit/data/pretext_maps//#{y.tol_id}*.pretext #{wd}/
  HERE
  puts `#{cmd}`
  raise "something went wrong" unless $?.success?
end

# initial setup
def setup_tol(y)
  wd = y.working_dir
  FileUtils.mkdir_p(wd)
  fasta_gz = y.decon_file.sub("contamination", "decontaminated.fa.gz")
  raise Exception.new("scaffolds.tpf in working dir #{wd} already exists") if File.exist?(wd + "/scaffolds.tpf")
  cmd = <<-HERE
    cd #{wd} ;
    zcat #{fasta_gz} > original.fa ;
    perl /software/grit/projects/vgp_curation_scripts/rapid_split.pl -fa original.fa ;
    mv -f original.fa.tpf original.tpf ;
    cp original.tpf scaffolds.tpf;
  HERE
  puts `#{cmd}`
  raise "something went wrong" unless $?.success?
end

# make files from the pretext agp and build a new pretext
def build_release(y)
  id = y.sample_version
  wd = y.working_dir
  Dir.chdir(wd) do
    id = y.tol_id unless File.exist?("#{id}.pretext.agp_1")
    cmd = <<-HERE
      touch #{id}.additional_haplotigs.unscrubbed.fa ;
      rapid_pretext2tpf_XL.py scaffolds.tpf #{id}.pretext.agp_1 ;
      rapid_join.pl -csv chrs.csv -fa original.fa -tpf rapid_prtxt_XL.tpf -out #{id} ;
      rapid_join.pl -fa original.fa -tpf haps_rapid_prtxt_XL.tpf -out #{id} -hap ;
    HERE
    o = `#{cmd}`
    puts o
    raise "something went wrong" unless $?.success?
    File.write(wd + "/#{y.sample_version}.curation_stats", o)
    # Make new pretext map.
    cmd = <<-HERE
      /software/grit/projects/vgp_curation_scripts/Pretext_HiC_pipeline.sh -i #{id}.curated_primary.no_mt.unscrubbed.fa -s #{id} -k #{y.hic_read_dir} -d `pwd`
    HERE
    puts `#{cmd}`
    raise "something went wrong" unless $?.success?
    # gfastats
    cmd = <<-HERE
      ~mh6/AGPcorrect.py original.fa #{id}.pretext.agp_1 > corrected.agp ;
      /software/grit/projects/gfastats/gfastats original.fa -a corrected.agp -o curated.fasta
    HERE
    puts `#{cmd}`
    raise "something went wrong" unless $?.success?
  end
end
# copy files into the curated directory for QC
def copy_qc(y)
  target_dir = y.curated_dir
  wd = y.working_dir
  FileUtils.mkdir_p(target_dir)
  Dir.chdir(wd) do
    # sample_id + release_version, or from the geval database
    input_id = y.sample_version
    input_id = y.tol_id unless File.exist?("#{input_id}.curated_primary.no_mt.unscrubbed.fa")
    # required files
    files = [
      "rapid_prtxt_XL.tpf",
      "haps_rapid_prtxt_XL.tpf",
      "#{input_id}.curated_primary.no_mt.unscrubbed.fa",
      "#{input_id}.inter.csv",
      "#{input_id}.chromosome.list.csv"
    ]
    # optional files
    ["#{input_id}.additional_haplotigs.unscrubbed.fa",
     "#{input_id}.curation_stats"
    ].each{|f| files.append(f) if File.file?(f) }
    files.each { |f|
      puts FileUtils.cp("#{wd}/#{f}", "#{target_dir}/#{f}", verbose: true)
    }
    # copy pretext
    pretext = Dir["#{wd}/*/*.pretext"].sort_by{|f| File.mtime(f)}[-1]
    FileUtils.cp(pretext, "#{y.pretext_dir}/#{input_id}.curated.pretext", verbose: true) if pretext
  end
end

## main CLI bit
issue = "none"
setup = false
qc = false
tol = false
release = false
OptionParser.new do |parser|
  parser.banner = "Usage: curation_tool --issue JIRA_ID [--copy_pretext | --copy_qc | --setup_working_dir | --build_release]"
  parser.on("-i", "--issue JIRA_ID") { |i| issue = i }
  parser.on("-p", "--copy_pretext") { setup = true }
  parser.on("-w", "--setup_working_dir") { tol = true }
  parser.on("-r", "--build_release") { release = true }
  parser.on("-q", "--copy_qc") { qc = true }
  parser.on("-h", "--help") do
    puts parser
    exit
  end
end.parse!

y = GritJiraIssue.new(issue)
if setup
  puts "copy pretext => #{Dir.pwd}/#{y.tol_id}"
  setup_local(y)
elsif tol
  puts "staging files for RC => #{y.working_dir}"
  setup_tol(y)
elsif release
  puts "building release files and pretext => #{y.working_dir}"
  build_release(y)
elsif qc
  puts "stage for QC => #{y.curated_dir}"
  copy_qc(y)
end
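The token lookup reads a standard `.netrc` entry for the Jira host, with the token in the last (password) column. A standalone sketch of the same column logic, with a hypothetical user and token:

```ruby
# A ~/.netrc line has the form "machine <host> login <user> password <token>";
# the lookup matches the host in column 1 (0-indexed) and returns the last column.
def token_for(netrc_line, host)
  cols = netrc_line.split
  cols[1] == host ? cols[-1] : nil
end

line = "machine jira.sanger.ac.uk login jdoe password abc123TOKEN"
puts token_for(line, "jira.sanger.ac.uk")
# => abc123TOKEN
```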
#!/bin/bash
# Written by Dom Absolon 09/08/2022
# This script is a wrapper for downloading close comparators and running nucmer.
source /software/grit/projects/contamination_screen/conf/contamination_screen.conf
LSB_DEFAULTGROUP=grit-grp
FASTA='original.fa'
suffix=''
SPECIES=''
mem=8000
REDUCED=''
usage() {
    cat << EOF
Usage: $0 [REQUIRED -s "species name"] [OPTIONAL -c, -r, -m memory_multiplier]
REQUIRED:
    -s [text]     complete Latin species name, e.g. "Pollenia labialis"
OPTIONAL:
    -m [numeric]  memory multiplier
    -c            run on the curated fasta
    -r            run on the first 3 comparators only
EOF
    exit 1
}
while getopts "s:m:cr" opt
do
    case $opt in
        c ) FASTA=*.curated*.fa ; suffix="_cur" ;;
        r ) REDUCED=1 ;;
        s ) SPECIES=$OPTARG ;;
        m ) let "mem=$mem*$OPTARG" ;;
        h | *) usage ;;
    esac
done
if [[ -z "$SPECIES" ]]; then
    echo "a species name is required"
    usage
fi
echo "Species of interest is: $SPECIES"
echo "These are the closest matches to your assembly of interest:"
python /software/grit/projects/contamination_screen/scripts/closest_chromosome_level_assemblies.py "$SPECIES" | tee closest_assems.csv
# make_id_file
grep -o GCA............ closest_assems.csv > closest_comparators.ids
rm closest_assems.csv
# reduce the comparators if wanted
if [[ ! -z "$REDUCED" ]]
then
    head -n 3 closest_comparators.ids > closest_comparators_reduced.ids
    mv -f closest_comparators.ids all_comparators.ids
    mv -f closest_comparators_reduced.ids closest_comparators.ids
fi
# fetch
for GCA in `cat closest_comparators.ids` ; do
    wget -O ${GCA}.fa.gz "https://www.ebi.ac.uk/ena/browser/api/fasta/${GCA}?download=true&gzip=true"
    gzip -t ${GCA}.fa.gz || echo "could not download $GCA"
done
# rename / reheader
for i in GC*.gz; do
    filesize=$(stat -c%s "$i")
    if (( filesize > 100000 )); then
        zcat $i | /software/grit/projects/vgp_curation_scripts/reheader > ${i%.*}_reheader.fna
    fi
    rm $i
done
for i in *.fna; do
    GCA=$(echo $i | cut -f1,2 -d _)
    echo "bsub -K -o o -M $mem -R'select[mem>'$mem'] rusage[mem='$mem'] span[hosts=1]' -n 48 \"/software/grit/bin/nucmer -t 48 --prefix ${GCA}${suffix} $i $FASTA\""
    bsub -K -o o -M $mem -R'select[mem>'$mem'] rusage[mem='$mem'] span[hosts=1]' -n 48 "/software/grit/bin/nucmer -t 48 --prefix ${GCA}${suffix} $i $FASTA"
done
wait
# submit the dotprep
for i in *.delta; do
    filesize=$(stat -c%s "$i")  # size of this delta file
    if (( filesize > 50000 )); then
        bsub -K -o o -M $mem -R'select[mem>'$mem'] rusage[mem='$mem'] span[hosts=1]' "/software/grit/bin/DotPrep.py --delta $i --out ${i%.delta}"
    fi
done
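The `grep -o GCA............` step pulls 15-character GCA accessions (`GCA` plus any 12 characters) out of the comparator listing. An equivalent Ruby one-liner, with a made-up accession:

```ruby
# Same pattern as the grep: "GCA" followed by any 12 characters,
# which matches a full accession like GCA_905220365.1 (hypothetical id).
text = "Pollenia similis,GCA_905220365.1,chromosome level"
puts text.scan(/GCA.{12}/)
# prints GCA_905220365.1
```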
#!/usr/bin/env crystal
# usage: ./reheader input_file > output_file
def get_id(line : String)
  x = line.split
  accession = x[0]
  x.each_with_index { |i, n|
    if i.downcase.includes?("chromosome") && !line.includes?("muller")
      number = x[n + 1].gsub(",", "") # Some GCAs have commas in the chr number field
      if line.includes?("unlocali")
        return "chr_" + number + "_unloc_" + accession
      else
        return "chr_" + number
      end
    end
    if i.includes?("LG")
      number = x[n + 1].gsub(",", "")
      if line.includes?("unlocali")
        return "LG_" + number + "_unloc_" + accession
      else
        return "LG_" + number
      end
    end
    if i.downcase.includes?("linkage")
      if line.includes?("unlocali")
        number = x[n + 1].gsub(",", "")
        return "LG_" + number + "_unloc_" + accession
      elsif x[n + 1].includes?("group")
        return x[n + 2].gsub(",", "") # Some GCAs have "linkage group"
      end
    end
  }
  ""
end

ARGF.each_line { |line|
  if line[0] == '>'
    line = line.lstrip('>')
    id = get_id(line)
    id = line.split[0] if id == ""
    puts ">" + id
  else
    puts line
  end
}
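To illustrate the renaming rule, here is a simplified Ruby sketch of the "chromosome" branch only (not the Crystal tool itself); the ENA-style headers are made-up examples:

```ruby
# Simplified version of get_id's "chromosome" case: take the word after
# "chromosome", strip a trailing comma, and build chr_<n>; otherwise fall
# back to the accession (first word of the header).
def short_id(header)
  words = header.split
  words.each_with_index do |w, n|
    next unless w.downcase.include?("chromosome")
    number = words[n + 1].delete(",")  # some records write "chromosome 3,"
    return header.include?("unlocali") ? "chr_#{number}_unloc_#{words[0]}" : "chr_#{number}"
  end
  words[0]
end

puts short_id("CM012345.1 Pollenia similis chromosome 3, whole genome shotgun sequence")
# => chr_3
puts short_id("CM099999.1 Pollenia similis unplaced scaffold")
# => CM099999.1
```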
#!/bin/bash
# Script to filter contaminant scaffolds identified during curation from tpf, agp and original fasta files.
# Run this after copying the agp across (post pretext curation) but before running rapid_pretext2tpf_XL.py and rapid_join.pl.
# NOTE - gaps from filtered scaffolds will need to be dealt with manually after running.
if [ $# -ne 2 ]; then
    echo "$0: usage: sh remove_contam.sh tol_id scaffolds_list"
    exit 1
fi
# vars
tol_id=$1
filt_list=$2
seqkit grep -v -f ${filt_list} original.fa > tmp.fa; mv tmp.fa original.fa
grep -vwFf ${filt_list} ${tol_id}.pretext.agp_1 > tmp; mv tmp ${tol_id}.pretext.agp_1
grep -vwFf ${filt_list} scaffolds.tpf > tmp; mv tmp scaffolds.tpf
grep -vwFf ${filt_list} original.tpf > tmp; mv tmp original.tpf
#!/bin/bash
# Script to set up local dir and files for curation. Run from the curations dir and give the ToL id as an argument.
if [ $# -ne 1 ]; then
    echo "$0: usage: sh setup_local.sh tol_id"
    exit 1
fi
# vars
tol_id=$1
# make dir, notes file and copy map.
mkdir $PWD/${tol_id}
pushd $PWD/${tol_id}
touch notes
scp tm18@tol-head2:/nfs/team135/pretext_maps/${tol_id}.pretext .
loc=$PWD
popd
printf 'Local dir:\n%s\n' "$loc"
#!/usr/bin/env ruby
require 'optparse'

fasta = ''
table = ''
debug = false
new_mapping = false
chromosome_list = ''

def get_mapping(file)
  mapping = Hash.new
  File.new(file).each_line{|line|
    c = line.chomp.split
    mapping[c[0]] = c[1]
  }
  mapping
end

def create_chromosme_list_line(id)
  /(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
  name = "#{$1}"
  suffix = "#{$2}"
  stripped_name = name.sub("SUPER_", '')
  if suffix.length >= 1
    line = [name + suffix, stripped_name, 'no']
  else
    line = [name, stripped_name, 'yes']
  end
  line.join(',')
end

OptionParser.new do |parser|
  parser.banner = "Usage: update_mapping --fasta FASTA_FILE --mashmap_table TABLE_TSV [ --debug | --new_mapping NEW_MAPPING_TSV | --help | --chromosome_list NEW_CHROMOSOME_LIST]"
  parser.on("-f", "--fasta FASTA_FILE") { |f| fasta = f }
  parser.on("-t", "--mashmap_table TABLE_TSV") { |f| table = f }
  parser.on("-d", "--debug") { debug = true }
  parser.on("-n", "--new_mapping NEW_MAPPING_TSV") { |f| new_mapping = f }
  parser.on("-c", "--chromosome_list NEW_CHROMOSOME_LIST") { |f| chromosome_list = f }
  parser.on("-h", "--help") do
    puts parser
    exit
  end
end.parse!

hap2_hap1_mapping = get_mapping(table)
count = 1
new_table = File.new(new_mapping, 'w') if new_mapping
new_chrom_file = File.new(chromosome_list, 'w') if chromosome_list

File.new(fasta).each_line{|line|
  if />(SUPER_\d+)(_unloc_\d+)*/.match(line)
    name = "#{$1}"
    suffix = "#{$2}"
    if hap2_hap1_mapping.has_key?(name)
      line = line.sub(name, hap2_hap1_mapping[name])
      $stderr.puts("renaming #{name}#{suffix} => #{hap2_hap1_mapping[name]}#{suffix}") if debug
      new_table.puts([name + suffix, hap2_hap1_mapping[name] + suffix].join("\t")) if new_mapping
      new_chrom_file.puts(create_chromosme_list_line(hap2_hap1_mapping[name] + suffix)) if chromosome_list
    else
      while hap2_hap1_mapping.has_key?("SUPER_#{count}")
        count += 1
      end
      id = "SUPER_#{count}"
      line = line.sub(name, id)
      hap2_hap1_mapping[id] = name
      $stderr.puts("renaming #{name}#{suffix} => #{id}#{suffix}") if debug
      new_table.puts([name + suffix, id + suffix].join("\t")) if new_mapping
      new_chrom_file.puts(create_chromosme_list_line(id + suffix)) if chromosome_list
    end
  elsif />(SUPER_\w+)(_unloc_\d+)*/.match(line)
    name = "#{$1}"
    suffix = "#{$2}"
    new_chrom_file.puts(create_chromosme_list_line(name + suffix)) if chromosome_list
  end
  print line
}
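The chromosome-list naming rule used above can be summarised: a plain `SUPER_<n>` scaffold is listed as a painted chromosome (`yes`), while an `_unloc` piece is not (`no`). A standalone sketch of the same rule:

```ruby
# Same naming rule as the script: SUPER_3 -> "SUPER_3,3,yes",
# SUPER_3_unloc_1 -> "SUPER_3_unloc_1,3,no".
def chrom_list_line(id)
  m = /(SUPER_[A-Z0-9]+)(_unloc_\d+)*/.match(id)
  name, suffix = m[1], m[2].to_s
  stripped = name.sub("SUPER_", "")
  suffix.empty? ? "#{name},#{stripped},yes" : "#{name}#{suffix},#{stripped},no"
end

puts chrom_list_line("SUPER_3")          # => SUPER_3,3,yes
puts chrom_list_line("SUPER_3_unloc_1")  # => SUPER_3_unloc_1,3,no
```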
Workflow plan
===============
on the example of GRIT-775 / odAplAero1.1
with a checkout of https://gist.github.com/epaule/0ceaaa25ea30405732988fb9a37acd16
or at /lustre/scratch123/tol/teams/grit/mh6/0ceaaa25ea30405732988fb9a37acd16
1.) initial setup
at Sanger (tol / farm5) and with curation_v2:
/lustre/scratch123/tol/teams/grit/mh6/0ceaaa25ea30405732988fb9a37acd16/curation_tool.rb -w -i GRIT-775
sets up a working directory in your lustre space (/lustre/scratch123/tol/teams/grit/<USER>/<tol_id>_<release>); in this example: /lustre/scratch123/tol/teams/grit/<USER>/odAplAero1_1
2.) nucmer
run the nucmers:
cd /lustre/scratch123/tol/teams/grit/<USER>/odAplAero1_1
/lustre/scratch123/tol/teams/grit/mh6/0ceaaa25ea30405732988fb9a37acd16/get_comparators.bash -s "Aplysina aerophoba"
3.) copy pretext
on your laptop (wherever you checked out the gist - or copied the script):
ruby curation_tool.rb -i GRIT-775 -p
scps the pretext from Sanger into a working directory, which is your current one + the <tol_id>; in this example: /tmp/0ceaaa25ea30405732988fb9a37acd16/odAplAero1
4.) curation
edit your scaffolds.tpf in the working directory /lustre/scratch123/tol/teams/grit/<USER>/odAplAero1_1/
if you edit it somewhere else, copy it back after curation
4.1) more nucmers
run nucmer on the curated fasta:
cd /lustre/scratch123/tol/teams/grit/<USER>/odAplAero1_1
/lustre/scratch123/tol/teams/grit/mh6/0ceaaa25ea30405732988fb9a37acd16/get_comparators.bash -s "Aplysina aerophoba" -c
5.) stage curated files back
after curation, dump the agp and scp it back to Sanger into the working directory:
scp /tmp/0ceaaa25ea30405732988fb9a37acd16/odAplAero1/odAplAero1_1.pretext.agp_1 tol:/lustre/scratch123/tol/teams/grit/<USER>/odAplAero1_1/
if you edited the tpf on your laptop rather than in the working directory, it also needs copying back
6.) build release files
build the release files and pretext (again at Sanger (tol / farm5) and with curation_v2):
/lustre/scratch123/tol/teams/grit/mh6/0ceaaa25ea30405732988fb9a37acd16/curation_tool.rb -r -i GRIT-775
6.1) something went wrong and needs fixing in pretext or TPF
- there is a file called odAplAero1_1.curation_stats with the output of step 6.)
- update the agp/tpf, etc.
and run 6.) again
7.) send it to QC
/lustre/scratch123/tol/teams/grit/mh6/0ceaaa25ea30405732988fb9a37acd16/curation_tool.rb -q -i GRIT-775
copies the files to the curated/ directory and the latest pretext to /nfs/team135/curated_pretext_maps/
ToDo:
========
* move the scripts to our vgp_curation repo
* write proper documentation
* create an environment that can run the decon and the curation scripts
* add something for renaming
* add custom memory for nucmer
Author: epaule (commented Feb 20, 2023)
- need to add some getopts parsing to the nucmer script