/gist:db7f9dcb00f4dd817ea1 Secret

## gistfile1.txt

use v6;

# This is taken from Rosettacode, it translate Int to romans strings
# It is buggy, but I don't know why
my %symbols =
    1 => "I", 5 => "V", 10 => "X", 50 => "L", 100 => "C",
    500 => "D", 1_000 => "M";

my @subtractors =
    1_000, 100,  500, 100,  100, 10,  50, 10,  10, 1,  5, 1,  1, 0;

multi sub roman (0) { '' }
multi sub roman (Int $n) {
    for @subtractors -> $cut, $minus {
        $n >= $cut
            and return %symbols{$cut} ~ roman($n - $cut);
        $n >= $cut - $minus
            and return %symbols{$minus} ~ roman($n + $minus);
     }
}

sub find-from-roman (Str:D $roman, $in-list) returns Int {
# 1st WORKAROUND: Rosettacode is buggy, we need this or we get erratic behaviour
    chomp($roman);
    my $n = 0;
    my Int $retval;
    for |$in-list -> $cmp {
	if (($cmp ~~ m/ <-[IVX]> <$roman> [ <-[IVX]> || $$ ] /) || ($cmp ~~ m/ <-[IVX]> <$roman> [ <-[IVX]> || $$ ] /)) {
	    if (!defined($retval)) {
		$retval = $n;
	    } else {
	        die "Cannot decode titles for ambiguity";
	    }
	}
	$n++;
    }
    return $retval;
}

#| Annotate the synthetic chromosome according to reference annotation:
#| annotations input and outputs should be passed through stdin and stdout
sub MAIN ($header, $binsize, $comparisondir) {

    # We want comparisons to be chosen intelligently using information from $header
    my $head-array = do {
	my $head-file = $header.IO.open(:r)
	    or die "Could not open $header";

        eager do for $head-file.lines -> $line {
	    my $info = $line.split("\t")[0];
	    $info.split('~').List;
	}
    }

    # Get the list of all chromosomes, why eager?
    my $all-chromosomes = eager gather {
	my Str $last-chr = "";
	# Why flat ?
        for |$head-array -> $arr ($chr, $bin) {
            if ($chr ne $last-chr) {
                take $chr;
                $last-chr = $chr;
            }
        }
    }
    $*ERR.say($all-chromosomes.perl);

    # Get the lists of synthetic chromosomes
    # Why eager?
    my $syn-chromosomes = eager gather {
	my Str $last-chr = "";
	# Why flat?
        for |$head-array -> $arr ($chr, $bin) {
	    if (($chr ~~ m:i/Syn<[IVXM]>+/) && ($chr ne $last-chr)) {
		take $chr;
		$last-chr = $chr;
	    }
	}
    }
    $*ERR.say($syn-chromosomes.perl);

    # WORKAROUND: To avoid erratic behaviour
    my $syn-chromosomes-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $syn-chromosomes) !! Int } );
    my $all-chromosomes-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $all-chromosomes) !! Int } );
    $*ERR.say($syn-chromosomes-index-built-from-roman.perl);
    $*ERR.say($all-chromosomes-index-built-from-roman.perl);

    # Why eager?
    my $all-compfiles = eager $comparisondir.IO.dir(test => /:i '.' txt $/)>>.basename;
    # Why flat?
    my $compfiles = eager gather for |$syn-chromosomes -> $name {
	# Find the most similar name in compfile, to do so
	# extract the roman numeral
	my $num = ($name ~~ m/(<[IVX]>+)/)[0].Str or die "Could not extract roman number";
        my $element = find-from-roman($num, $all-compfiles);
	if (defined($element)) {
	    take $all-compfiles[$element];
	} else {
	    warn "$name chromosome not found";
	}
    }
    $*ERR.say($compfiles.perl);

    my Int $bins = $binsize.Int;

    # Higly parallel chromosome loader (still super-slow)
    # Why eager and flat?
    my $comp-prom = eager do for |$compfiles -> $comparison { start {
	my $compa-file = ("$comparisondir/" ~ $comparison).IO.open(:r)
	    or die "Could not open $comparison";

	$compa-file.get xx 2;
	my $syn-str = $compa-file.get.chomp;
	my $syn-index = $syn-str.index("\t");
	my $syn-title = $syn-str.substr(0, $syn-index);
	# This is slow as hell
	my $syn-prom = start { eager $syn-str.substr($syn-index+1).split("\t")>>.Int };

	$compa-file.get xx 3;
	my $ref-str = $compa-file.get.chomp;
	my $ref-index = $ref-str.index("\t");
	my $ref-title = $ref-str.substr(0, $ref-index);
	# This is slow as hell
	my $ref-prom = start { eager $ref-str.substr($ref-index+1).split("\t")>>.Int };

	my ($ref, $syn) = await $ref-prom, $syn-prom;

	my $resarray = Array.new();
	$resarray[0] = 0;
        loop (my $i = 0; $i < $ref.elems; $i++) {
	    $resarray[$ref[$i]] = $syn[$i];
        }
	($syn-title, $ref-title, $resarray);
    } }

    my $annot-array = do for $*IN.lines -> $line {
	# Why .List?
        $line.split("\t").List;
    }

    my ($syn-titles, $ref-titles, $comp-arrays);

    # This is a global mess for fixing [Z]
    if ($comp-prom.elems > 1) {
	my @comp-results = await |$comp-prom;
	($syn-titles, $ref-titles, $comp-arrays) = [Z] @comp-results;
    } else {
        my @comp-results = await |$comp-prom;
	my $comp-list;
	($syn-titles, $ref-titles, $comp-list) = @comp-results;
	$syn-titles = ($syn-titles).List; $ref-titles = ($ref-titles).List; $comp-arrays = ($comp-list,).List;
    }
    $*ERR.say($syn-titles.perl);

    # To avoid erratic behaviour
    my $syn-titles-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $syn-titles) !! Int } );
    $*ERR.say($syn-titles-index-built-from-roman.perl);

    # Why eager and flat?
    my $news = eager gather for |$annot-array -> $arr ($chr, $type, $st, $en, $mean) {
	my $titnum;
	if (my $num = ($chr ~~ m/chr(\d+)/)[0]) {
	    $titnum = $syn-titles-index-built-from-roman[$num.Str.Int]; # find-from-roman(roman($num.Str.Int), $syn-titles);
	    $num = $num.Str.Int;
	}

	if (defined($titnum)) {
            my Int $new-st = $comp-arrays[$titnum][$st];
            my Int $new-en = $comp-arrays[$titnum][$en];
	    my Int $c-length = $new-en - $new-st;
	    my Int $o-length = $en - $st;
	    warn "Lenght is different: ($new-en - $new-st) = $c-length vs ($en - $st) = $o-length" if ($c-length != $o-length);
	    my Int $new-mean = Int(($new-en + $new-st) / 2);
	    my Int $headnum = $syn-chromosomes-index-built-from-roman[$num]; #find-from-roman(roman($num), $syn-chromosomes);
	    die "Lost annotation $chr $mean" unless defined($headnum);
	    take ($syn-chromosomes[$headnum], $new-mean).List;
	    $*ERR.say(($syn-chromosomes[$headnum], $new-mean).List);
        } else {
	    if ($chr eq "chrM") {
		take ("ChrM", $mean).List;
		$*ERR.say(("ChrM", $mean).List);
	    } else {
	        my $headnum = $all-chromosomes-index-built-from-roman[$num]; # find-from-roman(roman($num), $all-chromosomes);
		if (defined($headnum)) {
		   take ($all-chromosomes[$headnum], $mean).List;
		   $*ERR.say(($all-chromosomes[$headnum], $mean).List);
		} else {
		   warn "Lost annotation $chr $num {roman($num)}";
		}
	    }
        }
    }

    die "Bins is not a multiple of 2, not yet implemented" unless ($bins/2 == Int($bins/2));
    my $diff = Int($bins/2);
    # Why flat?
    for |$head-array -> $arr ($chr, $bin) {
	say "$chr~$bin", "\t", $news.grep( { (.[0] eq $chr) && ($bin - $diff <= .[1] < $bin + $diff) } ).elems;
    }
}

	use v6;

	# This is taken from Rosettacode, it translate Int to romans strings
	# It is buggy, but I don't know why
	my %symbols =
	1 => "I", 5 => "V", 10 => "X", 50 => "L", 100 => "C",
	500 => "D", 1_000 => "M";

	my @subtractors =
	1_000, 100, 500, 100, 100, 10, 50, 10, 10, 1, 5, 1, 1, 0;

	multi sub roman (0) { '' }
	multi sub roman (Int $n) {
	for @subtractors -> $cut, $minus {
	$n >= $cut
	and return %symbols{$cut} ~ roman($n - $cut);
	$n >= $cut - $minus
	and return %symbols{$minus} ~ roman($n + $minus);
	}
	}

	sub find-from-roman (Str:D $roman, $in-list) returns Int {
	# 1st WORKAROUND: Rosettacode is buggy, we need this or we get erratic behaviour
	chomp($roman);
	my $n = 0;
	my Int $retval;
	for \|$in-list -> $cmp {
	if (($cmp ~~ m/ <-[IVX]> <$roman> [ <-[IVX]> \|\| $$ ] /) \|\| ($cmp ~~ m/ <-[IVX]> <$roman> [ <-[IVX]> \|\| $$ ] /)) {
	if (!defined($retval)) {
	$retval = $n;
	} else {
	die "Cannot decode titles for ambiguity";
	}
	}
	$n++;
	}
	return $retval;
	}

	#\| Annotate the synthetic chromosome according to reference annotation:
	#\| annotations input and outputs should be passed through stdin and stdout
	sub MAIN ($header, $binsize, $comparisondir) {

	# We want comparisons to be chosen intelligently using information from $header
	my $head-array = do {
	my $head-file = $header.IO.open(:r)
	or die "Could not open $header";

	eager do for $head-file.lines -> $line {
	my $info = $line.split("\t")[0];
	$info.split('~').List;
	}
	}

	# Get the list of all chromosomes, why eager?
	my $all-chromosomes = eager gather {
	my Str $last-chr = "";
	# Why flat ?
	for \|$head-array -> $arr ($chr, $bin) {
	if ($chr ne $last-chr) {
	take $chr;
	$last-chr = $chr;
	}
	}
	}
	$*ERR.say($all-chromosomes.perl);

	# Get the lists of synthetic chromosomes
	# Why eager?
	my $syn-chromosomes = eager gather {
	my Str $last-chr = "";
	# Why flat?
	for \|$head-array -> $arr ($chr, $bin) {
	if (($chr ~~ m:i/Syn<[IVXM]>+/) && ($chr ne $last-chr)) {
	take $chr;
	$last-chr = $chr;
	}
	}
	}
	$*ERR.say($syn-chromosomes.perl);

	# WORKAROUND: To avoid erratic behaviour
	my $syn-chromosomes-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $syn-chromosomes) !! Int } );
	my $all-chromosomes-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $all-chromosomes) !! Int } );
	$*ERR.say($syn-chromosomes-index-built-from-roman.perl);
	$*ERR.say($all-chromosomes-index-built-from-roman.perl);

	# Why eager?
	my $all-compfiles = eager $comparisondir.IO.dir(test => /:i '.' txt $/)>>.basename;
	# Why flat?
	my $compfiles = eager gather for \|$syn-chromosomes -> $name {
	# Find the most similar name in compfile, to do so
	# extract the roman numeral
	my $num = ($name ~~ m/(<[IVX]>+)/)[0].Str or die "Could not extract roman number";
	my $element = find-from-roman($num, $all-compfiles);
	if (defined($element)) {
	take $all-compfiles[$element];
	} else {
	warn "$name chromosome not found";
	}
	}
	$*ERR.say($compfiles.perl);

	my Int $bins = $binsize.Int;

	# Higly parallel chromosome loader (still super-slow)
	# Why eager and flat?
	my $comp-prom = eager do for \|$compfiles -> $comparison { start {
	my $compa-file = ("$comparisondir/" ~ $comparison).IO.open(:r)
	or die "Could not open $comparison";

	$compa-file.get xx 2;
	my $syn-str = $compa-file.get.chomp;
	my $syn-index = $syn-str.index("\t");
	my $syn-title = $syn-str.substr(0, $syn-index);
	# This is slow as hell
	my $syn-prom = start { eager $syn-str.substr($syn-index+1).split("\t")>>.Int };

	$compa-file.get xx 3;
	my $ref-str = $compa-file.get.chomp;
	my $ref-index = $ref-str.index("\t");
	my $ref-title = $ref-str.substr(0, $ref-index);
	# This is slow as hell
	my $ref-prom = start { eager $ref-str.substr($ref-index+1).split("\t")>>.Int };

	my ($ref, $syn) = await $ref-prom, $syn-prom;

	my $resarray = Array.new();
	$resarray[0] = 0;
	loop (my $i = 0; $i < $ref.elems; $i++) {
	$resarray[$ref[$i]] = $syn[$i];
	}
	($syn-title, $ref-title, $resarray);
	} }

	my $annot-array = do for $*IN.lines -> $line {
	# Why .List?
	$line.split("\t").List;
	}

	my ($syn-titles, $ref-titles, $comp-arrays);

	# This is a global mess for fixing [Z]
	if ($comp-prom.elems > 1) {
	my @comp-results = await \|$comp-prom;
	($syn-titles, $ref-titles, $comp-arrays) = [Z] @comp-results;
	} else {
	my @comp-results = await \|$comp-prom;
	my $comp-list;
	($syn-titles, $ref-titles, $comp-list) = @comp-results;
	$syn-titles = ($syn-titles).List; $ref-titles = ($ref-titles).List; $comp-arrays = ($comp-list,).List;
	}
	$*ERR.say($syn-titles.perl);

	# To avoid erratic behaviour
	my $syn-titles-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $syn-titles) !! Int } );
	$*ERR.say($syn-titles-index-built-from-roman.perl);

	# Why eager and flat?
	my $news = eager gather for \|$annot-array -> $arr ($chr, $type, $st, $en, $mean) {
	my $titnum;
	if (my $num = ($chr ~~ m/chr(\d+)/)[0]) {
	$titnum = $syn-titles-index-built-from-roman[$num.Str.Int]; # find-from-roman(roman($num.Str.Int), $syn-titles);
	$num = $num.Str.Int;
	}

	if (defined($titnum)) {
	my Int $new-st = $comp-arrays[$titnum][$st];
	my Int $new-en = $comp-arrays[$titnum][$en];
	my Int $c-length = $new-en - $new-st;
	my Int $o-length = $en - $st;
	warn "Lenght is different: ($new-en - $new-st) = $c-length vs ($en - $st) = $o-length" if ($c-length != $o-length);
	my Int $new-mean = Int(($new-en + $new-st) / 2);
	my Int $headnum = $syn-chromosomes-index-built-from-roman[$num]; #find-from-roman(roman($num), $syn-chromosomes);
	die "Lost annotation $chr $mean" unless defined($headnum);
	take ($syn-chromosomes[$headnum], $new-mean).List;
	$*ERR.say(($syn-chromosomes[$headnum], $new-mean).List);
	} else {
	if ($chr eq "chrM") {
	take ("ChrM", $mean).List;
	$*ERR.say(("ChrM", $mean).List);
	} else {
	my $headnum = $all-chromosomes-index-built-from-roman[$num]; # find-from-roman(roman($num), $all-chromosomes);
	if (defined($headnum)) {
	take ($all-chromosomes[$headnum], $mean).List;
	$*ERR.say(($all-chromosomes[$headnum], $mean).List);
	} else {
	warn "Lost annotation $chr $num {roman($num)}";
	}
	}
	}
	}

	die "Bins is not a multiple of 2, not yet implemented" unless ($bins/2 == Int($bins/2));
	my $diff = Int($bins/2);
	# Why flat?
	for \|$head-array -> $arr ($chr, $bin) {
	say "$chr~$bin", "\t", $news.grep( { (.[0] eq $chr) && ($bin - $diff <= .[1] < $bin + $diff) } ).elems;
	}
	}