Created
May 24, 2016 16:59
-
-
Save dbolser-ebi/e7dd7e03118fadd67e528ea042e98ac3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use 5.14.0; | |
use warnings; | |
use autodie qw(:all); | |
use DBI; | |
my $dsn = | |
'dbi:mysql:host=mysql-eg-prod-2.ebi.ac.uk;port=4239;'. | |
'dbname=triticum_aestivum_variation_31_84_2'; | |
my $dbh = DBI-> | |
connect( $dsn, 'ensro', '', ); | |
## See https://metacpan.org/pod/DBD::mysql#mysql_use_result | |
## and also http://www.perlmonks.org/?node_id=273952 | |
$dbh->{mysql_use_result} = 1; | |
my $limit = 1_000_000; | |
my $sql = <<EOS; | |
# 422,103,416 ROWS | |
# 9,310,856 ROWS | |
SELECT variation_id, allele_1, allele_2 | |
FROM tmp_individual_genotype_single_bp | |
#UNION ALL | |
#SELECT variation_id, allele_1, allele_2 | |
#FROM sample_genotype_multiple_bp | |
## DEBUGGING | |
LIMIT $limit | |
EOS | |
## Go time | |
my $sth = $dbh->prepare($sql); | |
$sth->execute(); | |
my %allele_code; | |
my %genotype_code; | |
my %population_genotype; | |
## Seems slower... | |
#while (my $row_aref = $sth->fetchrow_arrayref) { | |
## Seems faster... | |
## See http://www.perlmonks.org/?node_id=273952 | |
my $max_rows = 5_000; | |
while (my $aref = $sth->fetchall_arrayref(undef, $max_rows)) { | |
## $aref now contains (at most) $max_rows rows | |
for my $row_aref (@$aref){ | |
my ($variation_id, $allele_1, $allele_2) = @$row_aref; | |
$allele_code{$allele_1}++; | |
$allele_code{$allele_2}++; | |
my $genotype = $allele_1. ':'. $allele_2; | |
$genotype_code{$genotype}++; | |
$population_genotype{$variation_id}{'total'}++; | |
$population_genotype{$variation_id}{$genotype}++; | |
} | |
} | |
warn | |
"got ", scalar keys %genotype_code, " genotypes ", | |
"from ", scalar keys %allele_code, " alleles.\n"; | |
## Build the allele and genotype code tables | |
open AC, '>', 'allele_code.tsv'; | |
my $allele_code_id = 0; | |
for (sort keys %allele_code){ | |
$allele_code_id++; | |
$allele_code{$_} = $allele_code_id; | |
say AC join("\t", $allele_code_id, $_); | |
} | |
open GC, '>', 'genotype_code.tsv'; | |
my $genotype_code_id = 0; | |
for (sort keys %genotype_code){ | |
$genotype_code_id++; | |
$genotype_code{$_} = $genotype_code_id; | |
my ($allele_1, $allele_2) = split /:/; | |
say GC join("\t", $genotype_code_id, $allele_code{$allele_1}, 1); | |
say GC join("\t", $genotype_code_id, $allele_code{$allele_2}, 2); | |
} | |
## Dump the population_genotypes | |
for my $variation_id (sort {$a<=>$b} keys %population_genotype){ | |
## Extract the total (it isn't a real genotype) | |
my $total = delete $population_genotype{$variation_id}{'total'}; | |
for my $genotype (sort keys %{$population_genotype{$variation_id}}){ | |
say | |
join("\t", | |
$variation_id, | |
$genotype_code{$genotype}, | |
$population_genotype{$variation_id}{$genotype} / $total, | |
$population_genotype{$variation_id}{$genotype}, | |
); | |
} | |
} | |
warn "OK\n"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
15:13 < mst> nah, I have a reason
15:13 < mst> if you use $sth->bind_columns($varation_1, $allele_1, $allele_2)
15:13 < mst> $sth->fetch should populate them directly
15:14 < mst> thereby skipping the arrayref fuckery
15:14 < dbolser> ah, nice
15:14 < mst> which might turn out to be faster