Skip to content

Instantly share code, notes, and snippets.

@gingerwizard
Created April 9, 2024 12:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gingerwizard/f92337c1d0b04372adff8c2821cab46a to your computer and use it in GitHub Desktop.
Save gingerwizard/f92337c1d0b04372adff8c2821cab46a to your computer and use it in GitHub Desktop.
Loading 100m transactions for kmeans
-- data table
CREATE TABLE transactions
(
  id UInt32,
  vector Array(Float32),
  customer UInt32,
)
ENGINE = MergeTree -- this can be a Null engine
ORDER BY id

-- centroids table
CREATE TABLE centroids
(
  customer UInt32,
  vector AggregateFunction(avgForEach, Array(Float32))
)
ENGINE = AggregatingMergeTree
ORDER BY customer

-- Materialized View
CREATE MATERIALIZED VIEW centroids_mv TO centroids
AS SELECT customer, avgForEachState(vector) AS vector
FROM transactions GROUP BY customer

-- insert 100m transactions, each with 100 dimensions for their vector, across 10000 customers i.e. avg 10k each
INSERT INTO transactions SELECT
    number AS id,
    arrayMap(v -> randCanonical(v), range(100)) AS vector,
    floor(randUniform(1, 10000)) AS customer
FROM numbers(100000000)

0 rows in set. Elapsed: 389.189 sec. Processed 200.00 million rows, 42.00 GB (513.89 thousand rows/s., 107.92 MB/s.)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment