Last active
September 28, 2021 19:04
-
-
Save wchargin/e26cea8567660f00e49bf07d7eefafe4 to your computer and use it in GitHub Desktop.
token rarity analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH project_id(project_id) AS (VALUES(163)), | |
token_bounds(min_id, max_id) AS ( | |
SELECT | |
(1000000 * project_id) AS min_id, | |
(1000000 * (project_id + 1)) AS max_id | |
FROM project_id | |
), | |
project_tokens(token_id) AS ( | |
SELECT token_id FROM tokens CROSS JOIN token_bounds | |
WHERE min_id <= token_id AND token_id < max_id | |
), | |
total_tokens(ct) AS ( | |
SELECT COUNT(1) AS ct FROM project_tokens | |
), | |
trait_counts(name, ct) AS ( | |
SELECT feature_name AS name, COUNT(1) AS ct | |
FROM project_tokens JOIN token_features USING (token_id) | |
GROUP BY feature_name | |
), | |
trait_rarities(name, ct, ln_p) AS ( | |
SELECT name, ct, LN((ct::real) / (SELECT ct::real FROM total_tokens)) | |
FROM trait_counts | |
ORDER BY ct ASC | |
), | |
token_rarities(token_id, sum_ln_p, why) AS ( | |
SELECT token_id, sum_ln_p, why | |
FROM | |
project_tokens, | |
LATERAL ( | |
SELECT | |
SUM(ln_p) AS sum_ln_p, | |
STRING_AGG(name || ' (' || ct || ')', ', ' ORDER BY ct ASC) AS why | |
FROM trait_rarities JOIN ( | |
SELECT feature_name AS name | |
FROM token_features | |
WHERE token_id = project_tokens.token_id | |
) AS these_features | |
USING (name) | |
) AS lat | |
) | |
SELECT token_id, EXP(sum_ln_p) AS joint_p, why FROM token_rarities | |
ORDER BY sum_ln_p ASC | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment