dbolser-ebi/rebuild_genotype_code.plx

## rebuild_genotype_code.plx
#!/usr/bin/env perl

use 5.14.0;
use warnings;

use autodie qw(:all);

use DBI;

my $dsn =
    'dbi:mysql:host=mysql-eg-prod-2.ebi.ac.uk;port=4239;'.
    'dbname=triticum_aestivum_variation_31_84_2';

my $dbh = DBI->
  connect( $dsn, 'ensro', '', );

## See https://metacpan.org/pod/DBD::mysql#mysql_use_result
## and also http://www.perlmonks.org/?node_id=273952
$dbh->{mysql_use_result} = 1;

my $limit = 1_000_000;

my $sql = <<EOS;
  # 422,103,416 ROWS
  #   9,310,856 ROWS
  SELECT variation_id, allele_1, allele_2
  FROM   tmp_individual_genotype_single_bp
  #UNION ALL
  #SELECT variation_id, allele_1, allele_2
  #FROM   sample_genotype_multiple_bp
  ## DEBUGGING
  LIMIT  $limit
EOS


## Go time

my $sth = $dbh->prepare($sql);
$sth->execute();

my %allele_code;
my %genotype_code;
my %population_genotype;


## Seems slower...
#while (my $row_aref = $sth->fetchrow_arrayref) {

## Seems faster...
## See http://www.perlmonks.org/?node_id=273952
my $max_rows = 5_000;

while (my $aref = $sth->fetchall_arrayref(undef, $max_rows)) {
    ## $aref now contains (at most) $max_rows rows

    for my $row_aref (@$aref){
        my ($variation_id, $allele_1, $allele_2) = @$row_aref;

        $allele_code{$allele_1}++;
        $allele_code{$allele_2}++;

        my $genotype = $allele_1. ':'. $allele_2;

        $genotype_code{$genotype}++;

        $population_genotype{$variation_id}{'total'}++;
        $population_genotype{$variation_id}{$genotype}++;

    }
}

warn
    "got ", scalar keys %genotype_code, " genotypes ",
    "from ", scalar keys %allele_code, " alleles.\n";


## Build the allele and genotype code tables

open AC, '>', 'allele_code.tsv';

my $allele_code_id = 0;
for (sort keys %allele_code){
    $allele_code_id++;
    $allele_code{$_} = $allele_code_id;
    say AC join("\t", $allele_code_id, $_);
}

open GC, '>', 'genotype_code.tsv';

my $genotype_code_id = 0;
for (sort keys %genotype_code){
    $genotype_code_id++;
    $genotype_code{$_} = $genotype_code_id;
    my ($allele_1, $allele_2) = split /:/;
    say GC join("\t", $genotype_code_id, $allele_code{$allele_1}, 1);
    say GC join("\t", $genotype_code_id, $allele_code{$allele_2}, 2);
}


## Dump the population_genotypes

for my $variation_id (sort {$a<=>$b} keys %population_genotype){

    ## Extract the total (it isn't a real genotype)
    my $total = delete $population_genotype{$variation_id}{'total'};

    for my $genotype (sort keys %{$population_genotype{$variation_id}}){
        say
            join("\t",
                 $variation_id,
                 $genotype_code{$genotype},
                 $population_genotype{$variation_id}{$genotype} / $total,
                 $population_genotype{$variation_id}{$genotype},
            );
    }
}


warn "OK\n";
	#!/usr/bin/env perl

	use 5.14.0;
	use warnings;

	use autodie qw(:all);

	use DBI;

	my $dsn =
	'dbi:mysql:host=mysql-eg-prod-2.ebi.ac.uk;port=4239;'.
	'dbname=triticum_aestivum_variation_31_84_2';

	my $dbh = DBI->
	connect( $dsn, 'ensro', '', );

	## See https://metacpan.org/pod/DBD::mysql#mysql_use_result
	## and also http://www.perlmonks.org/?node_id=273952
	$dbh->{mysql_use_result} = 1;

	my $limit = 1_000_000;

	my $sql = <<EOS;
	# 422,103,416 ROWS
	# 9,310,856 ROWS
	SELECT variation_id, allele_1, allele_2
	FROM tmp_individual_genotype_single_bp
	#UNION ALL
	#SELECT variation_id, allele_1, allele_2
	#FROM sample_genotype_multiple_bp
	## DEBUGGING
	LIMIT $limit
	EOS



	## Go time

	my $sth = $dbh->prepare($sql);
	$sth->execute();

	my %allele_code;
	my %genotype_code;
	my %population_genotype;



	## Seems slower...
	#while (my $row_aref = $sth->fetchrow_arrayref) {

	## Seems faster...
	## See http://www.perlmonks.org/?node_id=273952
	my $max_rows = 5_000;

	while (my $aref = $sth->fetchall_arrayref(undef, $max_rows)) {
	## $aref now contains (at most) $max_rows rows

	for my $row_aref (@$aref){
	my ($variation_id, $allele_1, $allele_2) = @$row_aref;

	$allele_code{$allele_1}++;
	$allele_code{$allele_2}++;

	my $genotype = $allele_1. ':'. $allele_2;

	$genotype_code{$genotype}++;

	$population_genotype{$variation_id}{'total'}++;
	$population_genotype{$variation_id}{$genotype}++;

	}
	}

	warn
	"got ", scalar keys %genotype_code, " genotypes ",
	"from ", scalar keys %allele_code, " alleles.\n";



	## Build the allele and genotype code tables

	open AC, '>', 'allele_code.tsv';

	my $allele_code_id = 0;
	for (sort keys %allele_code){
	$allele_code_id++;
	$allele_code{$_} = $allele_code_id;
	say AC join("\t", $allele_code_id, $_);
	}

	open GC, '>', 'genotype_code.tsv';

	my $genotype_code_id = 0;
	for (sort keys %genotype_code){
	$genotype_code_id++;
	$genotype_code{$_} = $genotype_code_id;
	my ($allele_1, $allele_2) = split /:/;
	say GC join("\t", $genotype_code_id, $allele_code{$allele_1}, 1);
	say GC join("\t", $genotype_code_id, $allele_code{$allele_2}, 2);
	}



	## Dump the population_genotypes

	for my $variation_id (sort {$a<=>$b} keys %population_genotype){

	## Extract the total (it isn't a real genotype)
	my $total = delete $population_genotype{$variation_id}{'total'};

	for my $genotype (sort keys %{$population_genotype{$variation_id}}){
	say
	join("\t",
	$variation_id,
	$genotype_code{$genotype},
	$population_genotype{$variation_id}{$genotype} / $total,
	$population_genotype{$variation_id}{$genotype},
	);
	}
	}



	warn "OK\n";