Created
February 14, 2020 11:43
-
-
Save matthew-brett/82668659bfb035c7b96a450a58fff42a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Below I load a table prepared by "make_thingies.py". | |
It has a column "things" that have 100 instances of "one", 10 instances of | |
"two" and one instance of "three". | |
*/ | |
use things; | |
DROP TABLE IF EXISTS `thingies`; | |
CREATE TABLE thingies ( | |
id INT, | |
things VARCHAR(255) NOT NULL, | |
other_things VARCHAR(255) NOT NULL, | |
PRIMARY KEY (id) | |
); | |
LOAD DATA INFILE '/Volumes/rebrought/data/thingies.csv' | |
INTO TABLE thingies | |
FIELDS TERMINATED BY ',' | |
LINES TERMINATED BY '\n' | |
IGNORE 1 ROWS; | |
/* In the query below, I attempt to: | |
1. Select unique "things" | |
2. Choose each one with 0.5 probability | |
3. Select rows from the whole table given selected items from 2. | |
*/ | |
SELECT id, things, other_things FROM thingies | |
WHERE things in | |
(SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids | |
WHERE rand() < 0.5); | |
/* The query above seems to give One and Two much more often than Three. | |
This despite the fact that the sub-query | |
(SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids | |
WHERE rand() < 0.5); | |
when run separately, does appear to select "one", "two" and "three" with | |
probability 0.5. | |
On the other hand, if I first do a query to select unique things into a table, | |
and then use that table, I get One, Two, Three with equal chances. | |
*/ | |
DROP TABLE IF EXISTS `my_ids`; | |
CREATE TABLE my_ids SELECT things FROM (SELECT DISTINCT things FROM thingies) AS stids | |
WHERE rand() < 0.5; | |
select * from my_ids; | |
SELECT id, things, other_things FROM thingies | |
WHERE things in (SELECT things from my_ids); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment