-
-
Save tom-clickhouse/0135b7f125c96bb42a798105653fc8aa to your computer and use it in GitHub Desktop.
Observe buffer flushes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Observe asynchronous-insert buffer flushes for one table across all replicas.
-- Joins system.asynchronous_insert_log (one row per buffered insert, grouped by
-- flush_query_id = one buffer flush) with system.query_log (settings, user,
-- client of each insert), then shows the last N flushes per
-- (host, query shape, settings) combination in a compact tabular form.
WITH
    'default' AS db_name,
    'upclick_metrics' AS table_name,
    10 AS last_x_minutes,  -- look-back window for both system tables
    4 AS last_m_flushes_per_node_per_shape_per_settings,  -- flushes kept per (host, shape, settings)
    -- each row belongs to a specific buffer flush identified by flush_query_id
    T_ASYNC_INSERT_LOG AS (
        SELECT
            hostName() AS host_name,
            query,
            format,
            query_id,
            bytes,
            rows,
            flush_time,
            flush_query_id
        FROM clusterAllReplicas(default, system.asynchronous_insert_log)
        WHERE status = 'Ok'
          AND database = db_name
          AND table = table_name
          AND flush_time > now() - toIntervalMinute(last_x_minutes)
    ),
    -- per-insert metadata: normalized query shape, effective settings, client identity
    T_QUERY_LOG AS (
        SELECT
            query_id,
            normalized_query_hash,
            Settings,
            cityHash64(Settings) AS settings_hash,
            user,
            client_name,
            http_user_agent
        FROM clusterAllReplicas(default, system.query_log)
        WHERE has(tables, db_name || '.' || table_name)
          AND query_kind = 'Insert'
          AND type = 'QueryFinish'
          AND event_time > now() - toIntervalMinute(last_x_minutes)
    ),
    -- pair every buffered insert with its query_log metadata
    T_BOTH AS (
        SELECT *
        FROM T_ASYNC_INSERT_LOG AS tail
        INNER JOIN T_QUERY_LOG AS tql ON tail.query_id = tql.query_id
    ),
    -- group matches by flush_query_id to get one row per buffer flush
    T1 AS (
        SELECT
            any(host_name) AS host_name,
            any(query) AS sample_query,
            arraySort(arrayDistinct(groupArray(format))) AS formats,
            groupArray(query_id) AS query_ids,
            sum(bytes) AS data_uncompressed_bytes,
            sum(rows) AS rows,
            any(flush_time) AS flush_time,
            count() AS queries,
            -- shape/settings/client attributes; constant within a flush group,
            -- so any()/distinct aggregation is sufficient
            any(normalized_query_hash) AS normalized_query_hash,
            any(Settings) AS Settings,
            any(settings_hash) AS settings_hash,
            arraySort(arrayDistinct(groupArray(user))) AS users,
            arraySort(arrayDistinct(groupArray(client_name))) AS client_names,
            arraySort(arrayDistinct(groupArray(http_user_agent))) AS http_user_agents
        FROM T_BOTH
        GROUP BY flush_query_id
        ORDER BY
            host_name,
            normalized_query_hash,  -- flushes are (per table) per shape per settings per host; ordering by it makes that visible
            settings_hash,
            flush_time DESC
    ),
    -- T1 is too many rows for a quick overview of all involved hosts, query shapes,
    -- and settings; keep only the last m flushes per (host, shape, settings) by
    -- grouping and using the parameterized groupArray(m)(...) aggregate.
    -- NOTE(review): groupArray preserves the ORDER BY of the T1 subquery here;
    -- this is the usual single-stream behavior — confirm it holds for your
    -- cluster's execution plan.
    T2 AS (
        SELECT
            host_name,
            normalized_query_hash,
            settings_hash,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(flush_time) AS flush_time_,
            -- time gap to the previous (newer) flush; '-' for the oldest entry.
            -- negate before arrayDifference so the DESC-ordered times yield positive deltas
            arrayPushBack(arrayMap(d -> formatReadableTimeDelta(d), arrayPopFront(arrayDifference(arrayMap(t -> -1 * toUnixTimestamp(t), flush_time_)))), '-') AS diff_,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(queries) AS queries_,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(data_uncompressed_bytes) AS data_uncompressed_bytes_,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(rows) AS rows_,
            any(sample_query) AS sample_query,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(formats) AS formats_,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(query_ids) AS query_ids_,
            any(Settings) AS Settings,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(users) AS users_,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(client_names) AS client_names_,
            groupArray(last_m_flushes_per_node_per_shape_per_settings)(http_user_agents) AS http_user_agents_
        FROM T1
        GROUP BY
            host_name,
            normalized_query_hash,
            settings_hash
    ),
    -- for readability bring everything back into tabular form with the inverse of
    -- groupArray: ARRAY JOIN; replace raw hashes with small dense-rank ids
    T3 AS (
        SELECT
            DENSE_RANK() OVER (ORDER BY host_name) AS host_id,
            DENSE_RANK() OVER (ORDER BY normalized_query_hash) AS query_shape_id,
            DENSE_RANK() OVER (ORDER BY settings_hash) AS settings_id,
            formatDateTime(flush_time_, '%T') AS flush_time,
            diff AS duration_since_prev_flush,
            queries,
            data_uncompressed_bytes,
            rows,
            sample_query,
            Settings,
            query_ids,
            formats,
            users,
            client_names,
            http_user_agents
        FROM T2
        ARRAY JOIN
            flush_time_ AS flush_time_,
            diff_ AS diff,
            queries_ AS queries,
            data_uncompressed_bytes_ AS data_uncompressed_bytes,
            rows_ AS rows,
            query_ids_ AS query_ids,
            formats_ AS formats,
            users_ AS users,
            client_names_ AS client_names,
            http_user_agents_ AS http_user_agents
        ORDER BY
            host_name,
            query_shape_id,
            settings_id,
            -- order by the DateTime, not the '%T'-formatted string, so flushes
            -- that straddle midnight still sort correctly
            flush_time_ DESC
    )
SELECT
    host_id AS n,
    query_shape_id AS q,
    settings_id AS s,
    flush_time AS flush,
    replaceOne(duration_since_prev_flush, 'seconds', 's') AS prev,
    rows,
    formatReadableSize(data_uncompressed_bytes) AS data,
    sample_query,
    Settings AS sample_settings,
    query_ids,
    formats,
    users,
    client_names,
    http_user_agents
FROM T3
SETTINGS skip_unavailable_shards = 1;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment