Observe buffer flushes
WITH
    'default' AS db_name,
    'upclick_metrics' AS table_name,
    10 AS last_x_minutes,
    4 AS last_m_flushes_per_node_per_shape_per_settings,
-- collect matches: each row is one buffered insert belonging to a specific buffer flush, identified by flush_query_id
T_ASYNC_INSERT_LOG AS (
    SELECT
        hostName() AS host_name,
        query,
        format,
        query_id,
        bytes,
        rows,
        flush_time,
        flush_query_id
    FROM
        clusterAllReplicas(default, system.asynchronous_insert_log)
    WHERE
        status = 'Ok'
        AND database = db_name
        AND table = table_name
        AND flush_time > now() - toIntervalMinute(last_x_minutes)
),
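-- clusterAllReplicas runs the inner query on every node of the cluster; the system tables
-- read here and below are node-local, and hostName() records which node each row came from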
T_QUERY_LOG AS (
    SELECT
        query_id,
        normalized_query_hash,
        Settings,
        cityHash64(Settings) AS settings_hash,
        user,
        client_name,
        http_user_agent
    FROM
        clusterAllReplicas(default, system.query_log)
    WHERE
        has(tables, db_name || '.' || table_name)
        AND query_kind = 'Insert'
        AND type = 'QueryFinish'
        AND event_time > now() - toIntervalMinute(last_x_minutes)
),
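-- note: query_log.tables is an array of all 'db.table' names accessed by a query,
-- hence the has() membership check above instead of a plain equality comparison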
T_BOTH AS (
    SELECT *
    FROM T_ASYNC_INSERT_LOG tail
    JOIN T_QUERY_LOG tql ON tail.query_id = tql.query_id
),
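-- each row of T_BOTH is one buffered insert statement, combining its flush metadata
-- from asynchronous_insert_log with its settings, user, and client info from query_log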
-- group matches by flush_query_id to get one row per buffer flush
T1 AS (
    SELECT
        any(host_name) AS host_name,
        any(query) AS sample_query,
        arraySort(arrayDistinct(groupArray(format))) AS formats,
        groupArray(query_id) AS query_ids,
        sum(bytes) AS data_uncompressed_bytes,
        sum(rows) AS rows,
        any(flush_time) AS flush_time,
        count() AS queries,
        --
        any(normalized_query_hash) AS normalized_query_hash,
        any(Settings) AS Settings,
        any(settings_hash) AS settings_hash,
        arraySort(arrayDistinct(groupArray(user))) AS users,
        arraySort(arrayDistinct(groupArray(client_name))) AS client_names,
        arraySort(arrayDistinct(groupArray(http_user_agent))) AS http_user_agents
    FROM
        T_BOTH
    GROUP BY
        flush_query_id
    ORDER BY
        host_name,
        normalized_query_hash, -- buffers exist (per table) per query shape, per settings, per host; ordering by these makes the flush sequence of each buffer visible
        settings_hash,
        flush_time DESC
),
-- T1 still returns too many rows to get a quick overview of all involved hosts, query shapes, and settings;
-- it would be nicer to see only the last_m_flushes_per_node_per_shape_per_settings most recent flushes per combination.
-- That is easy to formulate in ClickHouse: group by host, query shape, and settings,
-- then use the parametric groupArray(N) form to keep only the first N values per group (see the sketch below)
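-- a minimal standalone sketch of the parametric groupArray(N) form (illustrative values;
-- max_threads = 1 so the input order 0..9 is preserved):
--   SELECT groupArray(3)(number) FROM numbers(10) SETTINGS max_threads = 1 -- returns [0, 1, 2]
-- since T1 is sorted by flush_time DESC within each group, keeping the first N
-- array elements keeps the N most recent flushes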
T2 AS (
    SELECT
        host_name,
        normalized_query_hash,
        settings_hash,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(flush_time) AS flush_time_,
        arrayPushBack(arrayMap(d -> formatReadableTimeDelta(d), arrayPopFront(arrayDifference(arrayMap(t -> -1 * toUnixTimestamp(t), flush_time_)))), '-') AS diff_,
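        -- the diff_ expression above, step by step (flush_time_ is sorted newest first):
        --   arrayMap(t -> -1 * toUnixTimestamp(t), ...)    negate so the deltas come out positive
        --   arrayDifference(...)                           element-wise deltas; the first is always 0
        --   arrayPopFront(...)                             drop that leading 0
        --   arrayMap(d -> formatReadableTimeDelta(d), ...) e.g. 92 -> '1 minute and 32 seconds'
        --   arrayPushBack(..., '-')                        the oldest flush kept has no predecessor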
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(queries) AS queries_,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(data_uncompressed_bytes) AS data_uncompressed_bytes_,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(rows) AS rows_,
        any(sample_query) AS sample_query,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(formats) AS formats_,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(query_ids) AS query_ids_,
        any(Settings) AS Settings,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(users) AS users_,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(client_names) AS client_names_,
        groupArray(last_m_flushes_per_node_per_shape_per_settings)(http_user_agents) AS http_user_agents_
    FROM
        T1
    GROUP BY
        host_name,
        normalized_query_hash,
        settings_hash
),
-- now, for easier readability, bring everything back into tabular format with the inverse of groupArray: ARRAY JOIN
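-- a minimal standalone sketch of that inversion (array values are made up):
--   SELECT v FROM (SELECT [10, 20, 30] AS arr) ARRAY JOIN arr AS v -- 3 rows: 10, 20, 30
-- multiple parallel arrays of equal length, as below, are unrolled in lockstep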
T3 AS (
    SELECT
        DENSE_RANK() OVER (ORDER BY host_name) AS host_id,
        -- normalized_query_hash,
        -- settings_hash,
        DENSE_RANK() OVER (ORDER BY normalized_query_hash) AS query_shape_id,
        DENSE_RANK() OVER (ORDER BY settings_hash) AS settings_id,
        formatDateTime(flush_time_, '%T') AS flush_time,
        diff AS duration_since_prev_flush,
        queries,
        data_uncompressed_bytes,
        rows,
        sample_query,
        Settings,
        query_ids,
        formats,
        users,
        client_names,
        http_user_agents
    FROM
        T2
    ARRAY JOIN
        flush_time_ AS flush_time_,
        diff_ AS diff,
        queries_ AS queries,
        data_uncompressed_bytes_ AS data_uncompressed_bytes,
        rows_ AS rows,
        query_ids_ AS query_ids,
        formats_ AS formats,
        users_ AS users,
        client_names_ AS client_names,
        http_user_agents_ AS http_user_agents
    ORDER BY
        host_name,
        query_shape_id,
        settings_id,
        flush_time DESC
)
SELECT
    host_id AS n,
    query_shape_id AS q,
    settings_id AS s,
    flush_time AS flush,
    replaceOne(duration_since_prev_flush, 'seconds', 's') AS prev,
    rows,
    formatReadableSize(data_uncompressed_bytes) AS data,
    sample_query,
    Settings AS sample_settings,
    query_ids,
    formats,
    users,
    client_names,
    http_user_agents
FROM
    T3
SETTINGS skip_unavailable_shards = 1;
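-- system.asynchronous_insert_log only contains rows when inserts actually go through the
-- asynchronous insert path. A hedged sketch of such an insert (the column list of
-- upclick_metrics is not part of this gist, so the VALUES are elided):
--   INSERT INTO default.upclick_metrics SETTINGS async_insert = 1, wait_for_async_insert = 1
--   VALUES (...)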