Michael Paulini epaule

## get_rnaseq_number_for_species.rb
#!/nfs/users/nfs_m/mh6/.linuxbrew/homebrew/bin/ruby
# usage:  ruby get_rnaseq_number_for_species.rb "Xestospongia muta"

require "net/http"
require "uri"
require "cgi"

species = ARGV[0]

species = CGI.escapeURIComponent(species)

## gist:6a6b78f07a9ddc64e0a58fbd8c12094e
#!/bin/env perl
open IN,shift;
while(<IN>){
	push @h,"$1" if />(\S+)/
}
close IN;
open IN,shift;
while(<IN>){
	@F=split;
	next unless $F[2] eq 'CDS';

## get_lineage_from_species.cr
#!/usr/bin/env crystal
require "http/client"
require "json"

species = ARGV[0]

species = species.gsub(/\s/, "%20")

r = HTTP::Client.get("https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/#{species}", headers: HTTP::Headers{"Accept" => "application/json"})
raise "cannot get the taxonomy" unless r.success?

## mark_scaffolds.rb
#!/usr/bin/env ruby
# frozen_string_literal: true

# add > signs to the nearest gaps in a TPF based on AGP breaks
# usage: ./mark_scaffolds.rb -a AGP -t TPF -o TPF2

require 'optparse'

# get the gaps from the AGP as Hash of Arrays [seq_id] => [1,2,3]
def parse_agp(file)

## cut.txt
Step 9 - Rename and re-run maps
It is likely that between the pairs of chromosomes there will be discrepancies in size. This will result in the files being in different orders when they get size sorted. This needs to be corrected before submission. This can be done manually or using the following method:

Run:
bsub -G grit-grp -n 16 -e e_hap2hap_mapping -o o_hap2hap_mapping -M 16000 -R 'select[mem>16000] rusage[mem=16000] span[hosts=1]' sh /software/grit/projects/vgp_curation_scripts/hap2_hap1_ID_mapping.sh <h1_fasta> <h2_fasta>


Then:
/software/grit/projects/vgp_curation_scripts/update_mapping.rb -c <renamed_hap2_chroms.csv> -f <hap2.fa> -t <hap2_hap1.tsv> > hap2.renamed.fa

## fix_capitalisation.bash
#!/bin/bash

# simple files: rewrite the first "scaffold_" on each line to "SCAFFOLD_", in place.
# perl's -i_original keeps a backup copy of every edited file as <name>_original.
for f in *.bed *.contamination *recommendation *report.txt *taxonomy.rpt */*/*combined_summary.csv *.fa *.gb
do
    # a glob that matches nothing is left as the literal pattern; skip it
    # instead of handing perl a file that does not exist
    [ -e "$f" ] || continue
    # quote the name so paths with spaces or glob characters stay one argument
    perl -i_original -pne 's/scaffold_/SCAFFOLD_/' "$f"
done

# compressed fastas
for f in *.gz

## BUSCO_synteny_v1.1.sh
#!/bin/bash

# Script by Tom Mathers.

# Script requires 32 cores and ~50Gb ram (depending on genome size).
# Seqkit and BUSCO need to be in path.
# Scaffold IDs need to be in final tol format.
# Query and ref short IDs need to be two letter codes eg. "Ac".
# Query and ref fasta files need to be in working dir.

## elixir snippet
NimbleCSV.define(MyParser, separator: ";", escape: "\"")

# Streams `file`, parses every streamed element with MyParser.parse_string/1
# inside a Flow, and returns the number of resulting entries.
def load_stream(file) do
  parsed_lines =
    file
    |> File.stream!()
    |> Flow.from_enumerable()
    |> Flow.map(&MyParser.parse_string/1)

  Enum.count(parsed_lines)
end

## cobiont_stats.cr
#!/bin/env crystal
# cobiont_stats.cr contamination_file1 contamination_file_2

# returns a Array of String
def read_contamination_file(file : String)
	ids = Array(String).new
	File.each_line(file){|line|
		ids << $1 if /REMOVE\s+(\S+)/.match(line)
	}
	return ids.uniq

## filter_merged.cr
#!/bin/env crystal

require "option_parser"

phylum="Arthropoda|insect"
dir="20230226_qqAmaFero1.20230225.haplotigs.fa_asg_cobiont_check_run/collected_tables/"
OptionParser.parse do |parser|
	parser.banner = "Usage: filter_merged --phylum xyz --infile <in.merged>"
	parser.on("-p PHYLUM","--phylum=PHYLUM","Specifies the phylum(s) of the host separated by | [default=#{phylum}]"){|p|phylum=p}
	parser.on("-d directory","--directory=DIR","merged ASG directory[default=#{dir}]"){|d|dir=d}
	#!/nfs/users/nfs_m/mh6/.linuxbrew/homebrew/bin/ruby
	# usage: ruby get_rnaseq_number_for_species.rb "Xestospongia muta"

	require "net/http"
	require "uri"
	require "cgi"

	species = ARGV[0]

	species = CGI.escapeURIComponent(species)
	#!/bin/env perl
	open IN,shift;
	while(<IN>){
	push @h,"$1" if />(\S+)/
	}
	close IN;
	open IN,shift;
	while(<IN>){
	@F=split;
	next unless $F[2] eq 'CDS';
	#!/usr/bin/env crystal
	require "http/client"
	require "json"

	species = ARGV[0]

	species = species.gsub(/\s/, "%20")

	r = HTTP::Client.get("https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/#{species}", headers: HTTP::Headers{"Accept" => "application/json"})
	raise "cannot get the taxonomy" unless r.success?
	#!/usr/bin/env ruby
	# frozen_string_literal: true

	# add > signs to the nearest gaps in a TPF based on AGP breaks
	# usage: ./mark_scaffolds.rb -a AGP -t TPF -o TPF2

	require 'optparse'

	# get the gaps from the AGP as Hash of Arrays [seq_id] => [1,2,3]
	def parse_agp(file)
	Step 9 - Rename and re-run maps
	It is likely that between the pairs of chromosomes there will be discrepancies in size. This will result in the files being in different orders when they get size sorted. This needs to be corrected before submission. This can be done manually or using the following method:

	Run:
	bsub -G grit-grp -n 16 -e e_hap2hap_mapping -o o_hap2hap_mapping -M 16000 -R 'select[mem>16000] rusage[mem=16000] span[hosts=1]' sh /software/grit/projects/vgp_curation_scripts/hap2_hap1_ID_mapping.sh <h1_fasta> <h2_fasta>


	Then:
	/software/grit/projects/vgp_curation_scripts/update_mapping.rb -c <renamed_hap2_chroms.csv> -f <hap2.fa> -t <hap2_hap1.tsv> > hap2.renamed.fa
	#!/bin/bash

	# simple files
	for f in *.bed *.contamination *recommendation *report.txt *taxonomy.rpt */*/*combined_summary.csv *.fa *.gb
	do
	perl -i_original -pne 's/scaffold_/SCAFFOLD_/' $f
	done

	# compressed fastas
	for f in *.gz
	#!/bin/bash

	# Script by Tom Mathers.

	# Script requires 32 cores and ~50Gb ram (depending on genome size).
	# Seqkit and BUSCO need to be in path.
	# Scaffold IDs need to be in final tol format.
	# Query and ref short IDs need to be two letter codes eg. "Ac".
	# Query and ref fasta files need to be in working dir.
	NimbleCSV.define(MyParser, separator: ";", escape: "\"")

	def load_stream file do
	file
	|> File.stream!()
	|> Flow.from_enumerable()
	|> Flow.map(&MyParser.parse_string/1)
	|> Enum.count()
	end
	#!/bin/env crystal
	# cobiont_stats.cr contamination_file1 contamination_file_2

	# returns a Array of String
	def read_contamination_file(file : String)
	ids = Array(String).new
	File.each_line(file){|line|
	ids << $1 if /REMOVE\s+(\S+)/.match(line)
	}
	return ids.uniq
	#!/bin/env crystal

	require "option_parser"

	phylum="Arthropoda|insect"
	dir="20230226_qqAmaFero1.20230225.haplotigs.fa_asg_cobiont_check_run/collected_tables/"
	OptionParser.parse do |parser|
	parser.banner = "Usage: filter_merged --phylum xyz --infile <in.merged>"
	parser.on("-p PHYLUM","--phylum=PHYLUM","Specifies the phylum(s) of the host separated by | [default=#{phylum}]"){|p|phylum=p}
	parser.on("-d directory","--directory=DIR","merged ASG directory[default=#{dir}]"){|d|dir=d}