Skip to content

Instantly share code, notes, and snippets.

@dbolser-ebi
Created February 11, 2015 11:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dbolser-ebi/e61f78968eb816610c0c to your computer and use it in GitHub Desktop.
Save dbolser-ebi/e61f78968eb816610c0c to your computer and use it in GitHub Desktop.
-- Try adding an index...
#ALTER TABLE tmp_individual_genotype_single_bp
# ADD INDEX allele_1_idx (allele_1),
# ADD INDEX allele_2_idx (allele_2);
# Query OK, 277545309 rows affected (43 min 29.78 sec)
-- The above index seems to have no impact on query execution time
-- below, as the individual_id_idx is picked instead of either of
-- these allele indexes!
-- Make a table with the allele_code IDs (and population_ids) instead
-- of allele strings (and individual_ids). Bizarrely adding the JOIN
-- to individual_population actually speeds up the query by a factor
-- of approximately 2.
OPTIMIZE TABLE tmp_individual_genotype_single_bp;
OPTIMIZE TABLE allele_code;
DROP TABLE IF EXISTS temp_individual_genotype;
CREATE TEMPORARY TABLE temp_individual_genotype(
INDEX (variation_id, population_id),
INDEX (variation_id, population_id, allele_code_id_1),
INDEX (variation_id, population_id, allele_code_id_2),
INDEX (variation_id, population_id, allele_code_id_g)
)
#EXPLAIN
SELECT
variation_id, population_id,
one.allele_code_id AS allele_code_id_1,
two.allele_code_id AS allele_code_id_2,
CONCAT(
one.allele_code_id, ':',
two.allele_code_id) AS allele_code_id_g
FROM
tmp_individual_genotype_single_bp
INNER JOIN
individual_population p
USING
(individual_id)
INNER JOIN
allele_code one ON allele_1 = one.allele
INNER JOIN
allele_code two ON allele_2 = two.allele
#
#LIMIT
# 50000000
;
# Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec)
# Records: 555,090,618 Duplicates: 0 Warnings: 0
OPTIMIZE TABLE temp_individual_genotype;
# 1 row in set (10 min 50.60 sec)
-- TODO: Add the multiple BP data to the same table here!
-- Now make tables for each allele, genotype, and the total...
CREATE TABLE temp_individual_genotype_tt
(PRIMARY KEY (variation_id, population_id))
#EXPLAIN
SELECT variation_id, population_id,
COUNT(*) AS N
FROM temp_individual_genotype
GROUP BY variation_id, population_id;
CREATE TABLE temp_individual_genotype_a1
(PRIMARY KEY (variation_id, population_id, allele_code_id_1))
#EXPLAIN
SELECT variation_id, population_id, allele_code_id_1,
COUNT(*) AS N
FROM temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_1;
CREATE TABLE temp_individual_genotype_a2
(PRIMARY KEY (variation_id, population_id, allele_code_id_2))
#EXPLAIN
SELECT variation_id, population_id, allele_code_id_2,
COUNT(*) AS N
FROM temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_2;
CREATE TABLE temp_individual_genotype_gg
(PRIMARY KEY (variation_id, population_id, allele_code_id_g))
#EXPLAIN
SELECT variation_id, population_id, allele_code_id_g,
COUNT(*) AS N
FROM temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_g;
OPTIMIZE TABLE temp_individual_genotype_tt;
OPTIMIZE TABLE temp_individual_genotype_a1;
OPTIMIZE TABLE temp_individual_genotype_a2;
OPTIMIZE TABLE temp_individual_genotype_gg;
-- Now do an annoying shuffle to combine a1 and a2... Adding N into
-- the index here avoids a costly data look-up later. TODO: Should
-- merge above steps with this one.
DROP TABLE IF EXISTS temp_individual_genotype_ax;
CREATE TABLE temp_individual_genotype_ax
(INDEX pk_ish (variation_id, population_id, allele_code_id, N))
SELECT
variation_id, population_id, allele_code_id_1 AS allele_code_id, N
FROM
temp_individual_genotype_a1
# LIMIT
# 6000000
;
INSERT INTO temp_individual_genotype_ax
SELECT
variation_id, population_id, allele_code_id_2 AS allele_code_id, N
FROM
temp_individual_genotype_a2
# LIMIT
# 6000000
;
OPTIMIZE TABLE temp_individual_genotype_ax;
-- and finally...
CREATE TABLE temp_individual_genotype_aa
(PRIMARY KEY (variation_id, population_id, allele_code_id))
#EXPLAIN
SELECT
variation_id, population_id, allele_code_id,
COUNT(*) AS X, SUM(N) AS N
FROM
temp_individual_genotype_ax
GROUP BY
variation_id, population_id, allele_code_id
;
OPTIMIZE TABLE temp_individual_genotype_aa;
-- MAKE COUNTS LIKE THIS
-- Allele
RENAME TABLE allele TO allele_bk;
CREATE TABLE allele LIKE allele_bk;
INSERT INTO allele
(variation_id, allele_code_id, population_id, count, frequency)
SELECT
variation_id, allele_code_id, population_id,
a.N AS count,
a.N / t.N AS frequency
FROM
temp_individual_genotype_tt t
INNER JOIN
temp_individual_genotype_aa a
USING
(variation_id, population_id)
#LIMIT
# 03
;
OPTIMIZE TABLE allele;
-- Genotype
-- Select population genotype in same format as allele_code_id_g above
CREATE TABLE temp_genotype_code
(PRIMARY KEY (allele_code_id_g)) AS
SELECT
genotype_code_id,
CONCAT(
one.allele_code_id, ':',
two.allele_code_id) AS allele_code_id_g
FROM
genotype_code one
INNER JOIN
genotype_code two
USING
(genotype_code_id)
WHERE
one.haplotype_id = 1 AND
two.haplotype_id = 2
;
OPTIMIZE TABLE temp_genotype_code;
RENAME TABLE population_genotype TO population_genotype_bk;
CREATE TABLE population_genotype LIKE population_genotype_bk;
INSERT INTO population_genotype
(variation_id, genotype_code_id, population_id, frequency)
SELECT
variation_id, genotype_code_id, population_id,
g.N / t.N AS frequency
FROM
temp_individual_genotype_tt t
INNER JOIN
temp_individual_genotype_gg g
USING
(variation_id, population_id)
INNER JOIN
temp_genotype_code
USING
(allele_code_id_g)
#LIMIT
# 03
;
OPTIMIZE TABLE population_genotype;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment