dbolser-ebi/gist:e61f78968eb816610c0c

## gistfile1.sql

-- Try adding an index...
#ALTER TABLE tmp_individual_genotype_single_bp
#  ADD INDEX allele_1_idx (allele_1),
#  ADD INDEX allele_2_idx (allele_2);

# Query OK, 277545309 rows affected (43 min 29.78 sec)

-- The above index seems to have no impact on query execution time
-- below, as the individual_id_idx is picked instead of either of
-- these allele indexes!


-- Make a table with the allele_code IDs (and population_ids) instead
-- of allele strings (and individual_ids).  Bizarrely adding the JOIN
-- to individual_population actually speeds up the query by a factor
-- of approximately 2.

OPTIMIZE TABLE tmp_individual_genotype_single_bp;
OPTIMIZE TABLE allele_code;

DROP             TABLE IF EXISTS temp_individual_genotype;
CREATE TEMPORARY TABLE           temp_individual_genotype(
  INDEX (variation_id, population_id),
  INDEX (variation_id, population_id, allele_code_id_1),
  INDEX (variation_id, population_id, allele_code_id_2),
  INDEX (variation_id, population_id, allele_code_id_g)
)
#EXPLAIN
SELECT
  variation_id, population_id,
  one.allele_code_id    AS allele_code_id_1,
  two.allele_code_id    AS allele_code_id_2,
  CONCAT(
    one.allele_code_id, ':',
    two.allele_code_id) AS allele_code_id_g
FROM
  tmp_individual_genotype_single_bp
INNER JOIN
  individual_population    p
USING
  (individual_id)
INNER JOIN
  allele_code one ON allele_1 = one.allele
INNER JOIN
  allele_code two ON allele_2 = two.allele
#
#LIMIT
#  50000000
;

# Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec)
# Records:  555,090,618  Duplicates: 0  Warnings: 0

OPTIMIZE TABLE temp_individual_genotype;
# 1 row in set (10 min 50.60 sec)


-- TODO: Add the multiple BP data to the same table here!


-- Now make tables for each allele, genotype, and the total...
CREATE TABLE temp_individual_genotype_tt
  (PRIMARY KEY (variation_id, population_id))
#EXPLAIN
SELECT   variation_id, population_id,
         COUNT(*) AS N
FROM     temp_individual_genotype
GROUP BY variation_id, population_id;

CREATE TABLE temp_individual_genotype_a1
  (PRIMARY KEY (variation_id, population_id, allele_code_id_1))
#EXPLAIN
SELECT   variation_id, population_id, allele_code_id_1,
         COUNT(*) AS N
FROM     temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_1;

CREATE TABLE temp_individual_genotype_a2
  (PRIMARY KEY (variation_id, population_id, allele_code_id_2))
#EXPLAIN
SELECT   variation_id, population_id, allele_code_id_2,
         COUNT(*) AS N
FROM     temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_2;

CREATE TABLE temp_individual_genotype_gg
  (PRIMARY KEY (variation_id, population_id, allele_code_id_g))
#EXPLAIN
SELECT   variation_id, population_id, allele_code_id_g,
         COUNT(*) AS N
FROM     temp_individual_genotype
GROUP BY variation_id, population_id, allele_code_id_g;

OPTIMIZE TABLE temp_individual_genotype_tt;
OPTIMIZE TABLE temp_individual_genotype_a1;
OPTIMIZE TABLE temp_individual_genotype_a2;
OPTIMIZE TABLE temp_individual_genotype_gg;


-- Now do an annoying shuffle to combine a1 and a2... Adding N into
-- the index here avoids a costly data look-up later. TODO: Should
-- merge above steps with this one.
DROP   TABLE IF EXISTS temp_individual_genotype_ax;
CREATE TABLE           temp_individual_genotype_ax
(INDEX pk_ish (variation_id, population_id, allele_code_id, N))
SELECT
  variation_id, population_id, allele_code_id_1 AS allele_code_id, N
FROM
  temp_individual_genotype_a1
#  LIMIT
#    6000000
;

INSERT INTO            temp_individual_genotype_ax
SELECT
  variation_id, population_id, allele_code_id_2 AS allele_code_id, N
FROM
  temp_individual_genotype_a2
#  LIMIT
#    6000000
;

OPTIMIZE TABLE temp_individual_genotype_ax;

-- and finally...
CREATE TABLE temp_individual_genotype_aa
  (PRIMARY KEY (variation_id, population_id, allele_code_id))
#EXPLAIN
SELECT
  variation_id, population_id, allele_code_id,
  COUNT(*) AS X, SUM(N) AS N
FROM
  temp_individual_genotype_ax
GROUP BY
  variation_id, population_id, allele_code_id
;

OPTIMIZE TABLE temp_individual_genotype_aa;


-- MAKE COUNTS LIKE THIS

-- Allele

RENAME TABLE allele TO   allele_bk;
CREATE TABLE allele LIKE allele_bk;

INSERT INTO allele
  (variation_id, allele_code_id, population_id, count, frequency)
SELECT
  variation_id, allele_code_id, population_id,
  a.N       AS count,
  a.N / t.N AS frequency
FROM
  temp_individual_genotype_tt t
INNER JOIN
  temp_individual_genotype_aa a
USING
  (variation_id, population_id)
#LIMIT
#  03
;

OPTIMIZE TABLE allele;


-- Genotype

-- Select population genotype in same format as allele_code_id_g above

CREATE TABLE temp_genotype_code
  (PRIMARY KEY (allele_code_id_g)) AS
SELECT
  genotype_code_id,
  CONCAT(
    one.allele_code_id, ':',
    two.allele_code_id) AS allele_code_id_g
FROM
  genotype_code one
INNER JOIN
  genotype_code two
USING
  (genotype_code_id)
WHERE
  one.haplotype_id = 1 AND
  two.haplotype_id = 2
;

OPTIMIZE TABLE temp_genotype_code;

RENAME TABLE population_genotype TO   population_genotype_bk;
CREATE TABLE population_genotype LIKE population_genotype_bk;

INSERT INTO population_genotype
  (variation_id, genotype_code_id, population_id, frequency)
SELECT
  variation_id, genotype_code_id, population_id,
  g.N / t.N AS frequency
FROM
  temp_individual_genotype_tt t
INNER JOIN
  temp_individual_genotype_gg g
USING
  (variation_id, population_id)
INNER JOIN
  temp_genotype_code
USING
  (allele_code_id_g)
#LIMIT
#  03
;

OPTIMIZE TABLE population_genotype;

	-- Try adding an index...
	#ALTER TABLE tmp_individual_genotype_single_bp
	# ADD INDEX allele_1_idx (allele_1),
	# ADD INDEX allele_2_idx (allele_2);

	# Query OK, 277545309 rows affected (43 min 29.78 sec)

	-- The above index seems to have no impact on query execution time
	-- below, as the individual_id_idx is picked instead of either of
	-- these allele indexes!



	-- Make a table with the allele_code IDs (and population_ids) instead
	-- of allele strings (and individual_ids). Bizarrely adding the JOIN
	-- to individual_population actually speeds up the query by a factor
	-- of approximately 2.

	OPTIMIZE TABLE tmp_individual_genotype_single_bp;
	OPTIMIZE TABLE allele_code;

	DROP TABLE IF EXISTS temp_individual_genotype;
	CREATE TEMPORARY TABLE temp_individual_genotype(
	INDEX (variation_id, population_id),
	INDEX (variation_id, population_id, allele_code_id_1),
	INDEX (variation_id, population_id, allele_code_id_2),
	INDEX (variation_id, population_id, allele_code_id_g)
	)
	#EXPLAIN
	SELECT
	variation_id, population_id,
	one.allele_code_id AS allele_code_id_1,
	two.allele_code_id AS allele_code_id_2,
	CONCAT(
	one.allele_code_id, ':',
	two.allele_code_id) AS allele_code_id_g
	FROM
	tmp_individual_genotype_single_bp
	INNER JOIN
	individual_population p
	USING
	(individual_id)
	INNER JOIN
	allele_code one ON allele_1 = one.allele
	INNER JOIN
	allele_code two ON allele_2 = two.allele
	#
	#LIMIT
	# 50000000
	;

	# Query OK, 555,090,618 rows affected (3 hours 9 min 50.87 sec)
	# Records: 555,090,618 Duplicates: 0 Warnings: 0

	OPTIMIZE TABLE temp_individual_genotype;
	# 1 row in set (10 min 50.60 sec)


	-- TODO: Add the multiple BP data to the same table here!





	-- Now make tables for each allele, genotype, and the total...
	CREATE TABLE temp_individual_genotype_tt
	(PRIMARY KEY (variation_id, population_id))
	#EXPLAIN
	SELECT variation_id, population_id,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id;

	CREATE TABLE temp_individual_genotype_a1
	(PRIMARY KEY (variation_id, population_id, allele_code_id_1))
	#EXPLAIN
	SELECT variation_id, population_id, allele_code_id_1,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id, allele_code_id_1;

	CREATE TABLE temp_individual_genotype_a2
	(PRIMARY KEY (variation_id, population_id, allele_code_id_2))
	#EXPLAIN
	SELECT variation_id, population_id, allele_code_id_2,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id, allele_code_id_2;

	CREATE TABLE temp_individual_genotype_gg
	(PRIMARY KEY (variation_id, population_id, allele_code_id_g))
	#EXPLAIN
	SELECT variation_id, population_id, allele_code_id_g,
	COUNT(*) AS N
	FROM temp_individual_genotype
	GROUP BY variation_id, population_id, allele_code_id_g;

	OPTIMIZE TABLE temp_individual_genotype_tt;
	OPTIMIZE TABLE temp_individual_genotype_a1;
	OPTIMIZE TABLE temp_individual_genotype_a2;
	OPTIMIZE TABLE temp_individual_genotype_gg;



	-- Now do an annoying shuffle to combine a1 and a2... Adding N into
	-- the index here avoids a costly data look-up later. TODO: Should
	-- merge above steps with this one.
	DROP TABLE IF EXISTS temp_individual_genotype_ax;
	CREATE TABLE temp_individual_genotype_ax
	(INDEX pk_ish (variation_id, population_id, allele_code_id, N))
	SELECT
	variation_id, population_id, allele_code_id_1 AS allele_code_id, N
	FROM
	temp_individual_genotype_a1
	# LIMIT
	# 6000000
	;

	INSERT INTO temp_individual_genotype_ax
	SELECT
	variation_id, population_id, allele_code_id_2 AS allele_code_id, N
	FROM
	temp_individual_genotype_a2
	# LIMIT
	# 6000000
	;

	OPTIMIZE TABLE temp_individual_genotype_ax;

	-- and finally...
	CREATE TABLE temp_individual_genotype_aa
	(PRIMARY KEY (variation_id, population_id, allele_code_id))
	#EXPLAIN
	SELECT
	variation_id, population_id, allele_code_id,
	COUNT(*) AS X, SUM(N) AS N
	FROM
	temp_individual_genotype_ax
	GROUP BY
	variation_id, population_id, allele_code_id
	;

	OPTIMIZE TABLE temp_individual_genotype_aa;





	-- MAKE COUNTS LIKE THIS

	-- Allele

	RENAME TABLE allele TO allele_bk;
	CREATE TABLE allele LIKE allele_bk;

	INSERT INTO allele
	(variation_id, allele_code_id, population_id, count, frequency)
	SELECT
	variation_id, allele_code_id, population_id,
	a.N AS count,
	a.N / t.N AS frequency
	FROM
	temp_individual_genotype_tt t
	INNER JOIN
	temp_individual_genotype_aa a
	USING
	(variation_id, population_id)
	#LIMIT
	# 03
	;

	OPTIMIZE TABLE allele;



	-- Genotype

	-- Select population genotype in same format as allele_code_id_g above

	CREATE TABLE temp_genotype_code
	(PRIMARY KEY (allele_code_id_g)) AS
	SELECT
	genotype_code_id,
	CONCAT(
	one.allele_code_id, ':',
	two.allele_code_id) AS allele_code_id_g
	FROM
	genotype_code one
	INNER JOIN
	genotype_code two
	USING
	(genotype_code_id)
	WHERE
	one.haplotype_id = 1 AND
	two.haplotype_id = 2
	;

	OPTIMIZE TABLE temp_genotype_code;

	RENAME TABLE population_genotype TO population_genotype_bk;
	CREATE TABLE population_genotype LIKE population_genotype_bk;

	INSERT INTO population_genotype
	(variation_id, genotype_code_id, population_id, frequency)
	SELECT
	variation_id, genotype_code_id, population_id,
	g.N / t.N AS frequency
	FROM
	temp_individual_genotype_tt t
	INNER JOIN
	temp_individual_genotype_gg g
	USING
	(variation_id, population_id)
	INNER JOIN
	temp_genotype_code
	USING
	(allele_code_id_g)
	#LIMIT
	# 03
	;

	OPTIMIZE TABLE population_genotype;