Skip to content

Instantly share code, notes, and snippets.

@dbolser-ebi
Created May 24, 2016 16:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dbolser-ebi/e7dd7e03118fadd67e528ea042e98ac3 to your computer and use it in GitHub Desktop.
Save dbolser-ebi/e7dd7e03118fadd67e528ea042e98ac3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use 5.14.0;
use warnings;
use autodie qw(:all);
use DBI;
my $dsn =
'dbi:mysql:host=mysql-eg-prod-2.ebi.ac.uk;port=4239;'.
'dbname=triticum_aestivum_variation_31_84_2';
my $dbh = DBI->
connect( $dsn, 'ensro', '', );
## See https://metacpan.org/pod/DBD::mysql#mysql_use_result
## and also http://www.perlmonks.org/?node_id=273952
$dbh->{mysql_use_result} = 1;
my $limit = 1_000_000;
my $sql = <<EOS;
# 422,103,416 ROWS
# 9,310,856 ROWS
SELECT variation_id, allele_1, allele_2
FROM tmp_individual_genotype_single_bp
#UNION ALL
#SELECT variation_id, allele_1, allele_2
#FROM sample_genotype_multiple_bp
## DEBUGGING
LIMIT $limit
EOS
## Go time
my $sth = $dbh->prepare($sql);
$sth->execute();
my %allele_code;
my %genotype_code;
my %population_genotype;
## Seems slower...
#while (my $row_aref = $sth->fetchrow_arrayref) {
## Seems faster...
## See http://www.perlmonks.org/?node_id=273952
my $max_rows = 5_000;
while (my $aref = $sth->fetchall_arrayref(undef, $max_rows)) {
## $aref now contains (at most) $max_rows rows
for my $row_aref (@$aref){
my ($variation_id, $allele_1, $allele_2) = @$row_aref;
$allele_code{$allele_1}++;
$allele_code{$allele_2}++;
my $genotype = $allele_1. ':'. $allele_2;
$genotype_code{$genotype}++;
$population_genotype{$variation_id}{'total'}++;
$population_genotype{$variation_id}{$genotype}++;
}
}
warn
"got ", scalar keys %genotype_code, " genotypes ",
"from ", scalar keys %allele_code, " alleles.\n";
## Build the allele and genotype code tables
open AC, '>', 'allele_code.tsv';
my $allele_code_id = 0;
for (sort keys %allele_code){
$allele_code_id++;
$allele_code{$_} = $allele_code_id;
say AC join("\t", $allele_code_id, $_);
}
open GC, '>', 'genotype_code.tsv';
my $genotype_code_id = 0;
for (sort keys %genotype_code){
$genotype_code_id++;
$genotype_code{$_} = $genotype_code_id;
my ($allele_1, $allele_2) = split /:/;
say GC join("\t", $genotype_code_id, $allele_code{$allele_1}, 1);
say GC join("\t", $genotype_code_id, $allele_code{$allele_2}, 2);
}
## Dump the population_genotypes
for my $variation_id (sort {$a<=>$b} keys %population_genotype){
## Extract the total (it isn't a real genotype)
my $total = delete $population_genotype{$variation_id}{'total'};
for my $genotype (sort keys %{$population_genotype{$variation_id}}){
say
join("\t",
$variation_id,
$genotype_code{$genotype},
$population_genotype{$variation_id}{$genotype} / $total,
$population_genotype{$variation_id}{$genotype},
);
}
}
warn "OK\n";
@dbolser-ebi
Copy link
Author

15:13 < mst> nah, I have a reason
15:13 < mst> if you use $sth->bind_columns($varation_1, $allele_1, $allele_2)
15:13 < mst> $sth->fetch should populate them directly
15:14 < mst> thereby skipping the arrayref fuckery
15:14 < dbolser> ah, nice
15:14 < mst> which might turn out to be faster

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment