matthew-brett/test_random_subquery.sql

## test_random_subquery.sql
/* Below I load a table prepared by "make_thingies.py".

It has a column "things" that have 100 instances of "one", 10 instances of
"two" and one instance of "three".
*/
use things;
DROP TABLE IF EXISTS `thingies`;
CREATE TABLE thingies (
    id INT,
    things VARCHAR(255) NOT NULL,
    other_things VARCHAR(255) NOT NULL,
    PRIMARY KEY (id)
);

LOAD DATA INFILE '/Volumes/rebrought/data/thingies.csv'
INTO TABLE thingies
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;

/* In the query below, I attempt to:

1. Select unique "things"
2. Choose each one with 0.5 probability
3. Select rows from the whole table given selected items from 2.
*/
SELECT id, things, other_things FROM thingies
WHERE things in
(SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids
    WHERE rand() < 0.5);
/* The query above seems to give One and Two much more often than Three.

This despite the fact that the sub-query

(SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids
    WHERE rand() < 0.5);

when run separately, does appear to select "one", "two" and "three" with
probability 0.5.

On the other hand, if I first do a query to select unique things into a table,
and then use that table, I get One, Two, Three with equal chances.
*/

DROP TABLE IF EXISTS `my_ids`;
CREATE TABLE my_ids SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids
    WHERE rand() < 0.5;
select * from my_ids;

SELECT id, things, other_things FROM thingies
WHERE things in (SELECT things from my_ids);
	/* Below I load a table prepared by "make_thingies.py".

	It has a column "things" that have 100 instances of "one", 10 instances of
	"two" and one instance of "three".
	*/
	use things;
	DROP TABLE IF EXISTS `thingies`;
	CREATE TABLE thingies (
	id INT,
	things VARCHAR(255) NOT NULL,
	other_things VARCHAR(255) NOT NULL,
	PRIMARY KEY (id)
	);

	LOAD DATA INFILE '/Volumes/rebrought/data/thingies.csv'
	INTO TABLE thingies
	FIELDS TERMINATED BY ','
	LINES TERMINATED BY '\n'
	IGNORE 1 ROWS;

	/* In the query below, I attempt to:

	1. Select unique "things"
	2. Choose each one with 0.5 probability
	3. Select rows from the whole table given selected items from 2.
	*/
	SELECT id, things, other_things FROM thingies
	WHERE things in
	(SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids
	WHERE rand() < 0.5);
	/* The query above seems to give One and Two much more often than Three.

	This despite the fact that the sub-query

	(SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids
	WHERE rand() < 0.5);

	when run separately, does appear to select "one", "two" and "three" with
	probability 0.5.

	On the other hand, if I first do a query to select unique things into a table,
	and then use that table, I get One, Two, Three with equal chances.
	*/

	DROP TABLE IF EXISTS `my_ids`;
	CREATE TABLE my_ids SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids
	WHERE rand() < 0.5;
	select * from my_ids;

	SELECT id, things, other_things FROM thingies
	WHERE things in (SELECT things from my_ids);