Skip to content

Instantly share code, notes, and snippets.

Created October 29, 2015 14:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/db7f9dcb00f4dd817ea1 to your computer and use it in GitHub Desktop.
Save anonymous/db7f9dcb00f4dd817ea1 to your computer and use it in GitHub Desktop.
My first Perl6 program, AKA how-to-work-around-things-u-dont-understand
use v6;
# This is taken from Rosettacode, it translate Int to romans strings
# It is buggy, but I don't know why
my %symbols =
1 => "I", 5 => "V", 10 => "X", 50 => "L", 100 => "C",
500 => "D", 1_000 => "M";
my @subtractors =
1_000, 100, 500, 100, 100, 10, 50, 10, 10, 1, 5, 1, 1, 0;
multi sub roman (0) { '' }
multi sub roman (Int $n) {
for @subtractors -> $cut, $minus {
$n >= $cut
and return %symbols{$cut} ~ roman($n - $cut);
$n >= $cut - $minus
and return %symbols{$minus} ~ roman($n + $minus);
}
}
sub find-from-roman (Str:D $roman, $in-list) returns Int {
# 1st WORKAROUND: Rosettacode is buggy, we need this or we get erratic behaviour
chomp($roman);
my $n = 0;
my Int $retval;
for |$in-list -> $cmp {
if (($cmp ~~ m/ <-[IVX]> <$roman> [ <-[IVX]> || $$ ] /) || ($cmp ~~ m/ <-[IVX]> <$roman> [ <-[IVX]> || $$ ] /)) {
if (!defined($retval)) {
$retval = $n;
} else {
die "Cannot decode titles for ambiguity";
}
}
$n++;
}
return $retval;
}
#| Annotate the synthetic chromosome according to reference annotation:
#| annotations input and outputs should be passed through stdin and stdout
sub MAIN ($header, $binsize, $comparisondir) {
# We want comparisons to be chosen intelligently using information from $header
my $head-array = do {
my $head-file = $header.IO.open(:r)
or die "Could not open $header";
eager do for $head-file.lines -> $line {
my $info = $line.split("\t")[0];
$info.split('~').List;
}
}
# Get the list of all chromosomes, why eager?
my $all-chromosomes = eager gather {
my Str $last-chr = "";
# Why flat ?
for |$head-array -> $arr ($chr, $bin) {
if ($chr ne $last-chr) {
take $chr;
$last-chr = $chr;
}
}
}
$*ERR.say($all-chromosomes.perl);
# Get the lists of synthetic chromosomes
# Why eager?
my $syn-chromosomes = eager gather {
my Str $last-chr = "";
# Why flat?
for |$head-array -> $arr ($chr, $bin) {
if (($chr ~~ m:i/Syn<[IVXM]>+/) && ($chr ne $last-chr)) {
take $chr;
$last-chr = $chr;
}
}
}
$*ERR.say($syn-chromosomes.perl);
# WORKAROUND: To avoid erratic behaviour
my $syn-chromosomes-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $syn-chromosomes) !! Int } );
my $all-chromosomes-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $all-chromosomes) !! Int } );
$*ERR.say($syn-chromosomes-index-built-from-roman.perl);
$*ERR.say($all-chromosomes-index-built-from-roman.perl);
# Why eager?
my $all-compfiles = eager $comparisondir.IO.dir(test => /:i '.' txt $/)>>.basename;
# Why flat?
my $compfiles = eager gather for |$syn-chromosomes -> $name {
# Find the most similar name in compfile, to do so
# extract the roman numeral
my $num = ($name ~~ m/(<[IVX]>+)/)[0].Str or die "Could not extract roman number";
my $element = find-from-roman($num, $all-compfiles);
if (defined($element)) {
take $all-compfiles[$element];
} else {
warn "$name chromosome not found";
}
}
$*ERR.say($compfiles.perl);
my Int $bins = $binsize.Int;
# Higly parallel chromosome loader (still super-slow)
# Why eager and flat?
my $comp-prom = eager do for |$compfiles -> $comparison { start {
my $compa-file = ("$comparisondir/" ~ $comparison).IO.open(:r)
or die "Could not open $comparison";
$compa-file.get xx 2;
my $syn-str = $compa-file.get.chomp;
my $syn-index = $syn-str.index("\t");
my $syn-title = $syn-str.substr(0, $syn-index);
# This is slow as hell
my $syn-prom = start { eager $syn-str.substr($syn-index+1).split("\t")>>.Int };
$compa-file.get xx 3;
my $ref-str = $compa-file.get.chomp;
my $ref-index = $ref-str.index("\t");
my $ref-title = $ref-str.substr(0, $ref-index);
# This is slow as hell
my $ref-prom = start { eager $ref-str.substr($ref-index+1).split("\t")>>.Int };
my ($ref, $syn) = await $ref-prom, $syn-prom;
my $resarray = Array.new();
$resarray[0] = 0;
loop (my $i = 0; $i < $ref.elems; $i++) {
$resarray[$ref[$i]] = $syn[$i];
}
($syn-title, $ref-title, $resarray);
} }
my $annot-array = do for $*IN.lines -> $line {
# Why .List?
$line.split("\t").List;
}
my ($syn-titles, $ref-titles, $comp-arrays);
# This is a global mess for fixing [Z]
if ($comp-prom.elems > 1) {
my @comp-results = await |$comp-prom;
($syn-titles, $ref-titles, $comp-arrays) = [Z] @comp-results;
} else {
my @comp-results = await |$comp-prom;
my $comp-list;
($syn-titles, $ref-titles, $comp-list) = @comp-results;
$syn-titles = ($syn-titles).List; $ref-titles = ($ref-titles).List; $comp-arrays = ($comp-list,).List;
}
$*ERR.say($syn-titles.perl);
# To avoid erratic behaviour
my $syn-titles-index-built-from-roman = eager (0..16).map( { $_ ?? find-from-roman(roman($_), $syn-titles) !! Int } );
$*ERR.say($syn-titles-index-built-from-roman.perl);
# Why eager and flat?
my $news = eager gather for |$annot-array -> $arr ($chr, $type, $st, $en, $mean) {
my $titnum;
if (my $num = ($chr ~~ m/chr(\d+)/)[0]) {
$titnum = $syn-titles-index-built-from-roman[$num.Str.Int]; # find-from-roman(roman($num.Str.Int), $syn-titles);
$num = $num.Str.Int;
}
if (defined($titnum)) {
my Int $new-st = $comp-arrays[$titnum][$st];
my Int $new-en = $comp-arrays[$titnum][$en];
my Int $c-length = $new-en - $new-st;
my Int $o-length = $en - $st;
warn "Lenght is different: ($new-en - $new-st) = $c-length vs ($en - $st) = $o-length" if ($c-length != $o-length);
my Int $new-mean = Int(($new-en + $new-st) / 2);
my Int $headnum = $syn-chromosomes-index-built-from-roman[$num]; #find-from-roman(roman($num), $syn-chromosomes);
die "Lost annotation $chr $mean" unless defined($headnum);
take ($syn-chromosomes[$headnum], $new-mean).List;
$*ERR.say(($syn-chromosomes[$headnum], $new-mean).List);
} else {
if ($chr eq "chrM") {
take ("ChrM", $mean).List;
$*ERR.say(("ChrM", $mean).List);
} else {
my $headnum = $all-chromosomes-index-built-from-roman[$num]; # find-from-roman(roman($num), $all-chromosomes);
if (defined($headnum)) {
take ($all-chromosomes[$headnum], $mean).List;
$*ERR.say(($all-chromosomes[$headnum], $mean).List);
} else {
warn "Lost annotation $chr $num {roman($num)}";
}
}
}
}
die "Bins is not a multiple of 2, not yet implemented" unless ($bins/2 == Int($bins/2));
my $diff = Int($bins/2);
# Why flat?
for |$head-array -> $arr ($chr, $bin) {
say "$chr~$bin", "\t", $news.grep( { (.[0] eq $chr) && ($bin - $diff <= .[1] < $bin + $diff) } ).elems;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment