-- data table
CREATE TABLE transactions
(
id UInt32,
vector Array(Float32),
customer UInt32,
)
ENGINE = MergeTree -- this can be a Null engine
ORDER BY id
-- centroids table
CREATE TABLE centroids
(
customer UInt32,
vector AggregateFunction(avgForEach, Array(Float32))
)
ENGINE = AggregatingMergeTree
ORDER BY customer
-- Materialized View
CREATE MATERIALIZED VIEW centroids_mv TO centroids
AS SELECT customer, avgForEachState(vector) AS vector
FROM transactions GROUP BY customer
-- insert 100m transactions, each with 100 dimensions for their vector, across 10000 customers i.e. avg 10k each
INSERT INTO transactions SELECT
number AS id,
arrayMap(v -> randCanonical(v), range(100)) AS vector,
floor(randUniform(1, 10000)) AS customer
FROM numbers(100000000)
0 rows in set. Elapsed: 389.189 sec. Processed 200.00 million rows, 42.00 GB (513.89 thousand rows/s., 107.92 MB/s.)
Created
April 9, 2024 12:53
-
-
Save gingerwizard/f92337c1d0b04372adff8c2821cab46a to your computer and use it in GitHub Desktop.
Loading 100m transactions for kmeans
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment