Skip to content

Instantly share code, notes, and snippets.

@gingerwizard
Created February 21, 2024 15:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gingerwizard/fc18cf5a3b9945b626cecb88a2deaa49 to your computer and use it in GitHub Desktop.
Save gingerwizard/fc18cf5a3b9945b626cecb88a2deaa49 to your computer and use it in GitHub Desktop.
-- original table
CREATE TABLE hackernews_copy
(
    `id` String,
    `doc_id` String,
    `comment` String,
    `vector` Array(Float32),
    `node_info` Tuple(start Nullable(UInt64), end Nullable(UInt64)),
    `metadata` String,
    `type` Enum8('story' = 1, 'comment' = 2, 'poll' = 3, 'pollopt' = 4, 'job' = 5),
    `by` LowCardinality(String),
    `time` DateTime,
    `title` String,
    `post_score` Int32,
    `dead` UInt8,
    `deleted` UInt8,
    `length` UInt32,
    `parent` UInt32,
    `kids` Array(UInt32)
)
ENGINE = MergeTree
ORDER BY (toDate(time), length, post_score)


-- hold text and by in memory for fast lookup - around 8GiB
CREATE DICTIONARY hacker_hierarchy
(
    `id` UInt64,
    `text` String,
    `by` String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(QUERY 'SELECT id, text, by FROM default.hackernews_copy'))
LIFETIME(MIN 0 MAX 0)
LAYOUT(HASHED_ARRAY())

-- create table
CREATE TABLE hackernews
(
    `id` String,
    `doc_id` String,
    `comment` String,
    `text` String,
    `vector` Array(Float32),
    `node_info` Tuple(start Nullable(UInt64), end Nullable(UInt64)),
    `metadata` String,
    `type` Enum8('story' = 1, 'comment' = 2, 'poll' = 3, 'pollopt' = 4, 'job' = 5),
    `by` LowCardinality(String),
    `time` DateTime,
    `title` String,
    `post_score` Int32,
    `dead` UInt8,
    `deleted` UInt8,
    `length` UInt32,
    `parent` UInt32,
    `kids` Array(UInt32)
)
ENGINE = MergeTree
ORDER BY (toDate(time), length, post_score)

-- insert from copy, grabbing parent and children text from dictionary
INSERT INTO hackernews SELECT
id,
id AS doc_id,
comment,
if(parent != 0, dictGet(hacker_hierarchy, 'by', parent) || ':' || dictGet(hacker_hierarchy, 'text', parent) || '\n', '') || by ||  ': ' || comment || '\n' || arrayStringConcat(arrayMap(k -> dictGet(hacker_hierarchy, 'by', k) || ': ' || dictGet(hacker_hierarchy, 'text', k), kids), '\n') as text,
vector,
(0, 0) AS node_info,
toJSONString(CAST((deleted, type, by, time, dead, parent, poll, kids, url, score, title, parts, descendants), 'Tuple(deleted UInt8, type Enum8(\'story\' = 1, \'comment\' = 2, \'poll\' = 3, \'pollopt\' = 4, \'job\' = 5), by String, time DateTime, dead UInt8, parent UInt32, poll UInt32, kids Array(UInt32), url String, post_score Int32, title String, parts Array(UInt32), descendants Int32)')) AS metadata, type, by, time, title, score as post_score, dead, deleted, length(tokens(text)) as length, parent, kids
FROM hackernews_copy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment