Skip to content

Instantly share code, notes, and snippets.

@3manuek
Created May 15, 2020 21:58
Show Gist options
  • Save 3manuek/14825693041ea30db67d1c71cbe34ec7 to your computer and use it in GitHub Desktop.
Save 3manuek/14825693041ea30db67d1c71cbe34ec7 to your computer and use it in GitHub Desktop.
Redshift (RS) / ClickHouse (CH)
---
Redshift 3 nodes (1 coordinator) / CH 2 nodes, same specs.
RS:
8m 36s / 1m40s
select trunc(created_at),* from request_stats_2015;
CH:
Not compared as the query does a full scan, even tho CH responded immediatly:
22184046 rows in set. Elapsed: 14.583 sec. Processed 22.51 million rows, 10.58 GB (1.54 million rows/s., 725.57 MB/s.)
---
RS:
46 - 51 secs
select trunc(created_at),* from (select id FROM request_stats_2015 order by random() limit 10 ) list join request_stats_2015 rs on list.id = rs.id;
39 s
select trunc(created_at),* from (select id FROM request_stats_2015 order by random() limit 10 ) list join request_stats_2015 rs on list.id = rs.id;
CH:
Supports sampling, in which is not necessary to get a _order by random()_. Same results as the previous.
---
RS:
23 s
SELECT (SELECT max("t2"."created_at") FROM "request_stats_2015" AS t2 WHERE ((("t2"."resource_type" = 'tokens') AND ("t2"."store_id" = 'HIDDEN')) AND NOT ("t2"."remote_address" ILIKE '10.0.%'))) AS last_login_at, (SELECT count('*') FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ((("t2"."resource_type" = 'patients') AND ("t2"."action" = 'read')) AND ("t2"."data" ILIKE '%ref%')))) AS opened_reports_count, (SELECT count('*') FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDENv') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_print_letter'))) AS print_letter_count, (SELECT count('*') FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_schedule_call'))) AS schedule_call_count, ((SELECT count(distinct("t2"."resource_id")) FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_print_preview_plans'))) + (SELECT count(distinct("t2"."resource_id")) FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_print_preview_comparison')))) AS printed_comparisons_count FROM (SELECT 1) AS void
CH:
9.431 sec.
https://github.com/yandex/ClickHouse/issues/26 : subquery can't reference columns from outer query. correlated queries are impossible for now.
Query modified to ->
SELECT
(SELECT max(created_at)
FROM request_stats_2015
WHERE (((resource_type = 'tokens')
AND (store_id = 'HIDDEN'))
AND NOT (remote_address LIKE '10.0.%'))) AS last_login_at,
(SELECT count(id)
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (((resource_type = 'patients')
AND (action = 'read'))
AND (data LIKE '%ref%')))) AS opened_reports_count,
(SELECT count(*)
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_print_letter'))) AS print_letter_count,
(SELECT count(*)
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_schedule_call'))) AS schedule_call_count,
(
(SELECT count(distinct(resource_id))
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_print_preview_plans'))) +
(SELECT count(distinct(resource_id))
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_print_preview_comparison')))) AS printed_comparisons_count
FROM
(SELECT 1) AS void
┌───────last_login_at─┬─NULL─┬─print_letter_count─┬─NULL─┬─printed_comparisons_count─┐
│ 2017-05-16 15:43:57 │ \N │ 1 │ \N │ 90 │
└─────────────────────┴──────┴────────────────────┴──────┴───────────────────────────┘
1 rows in set. Elapsed: 9.431 sec.
---
RS:
3.8s
SELECT count('*'), max("t1"."created_at"), min("t1"."created_at") FROM "request_stats_2015" AS t1 WHERE (("t1"."resource_type" != 'stats') AND ("t1"."store_id" = 'HIDDEN'))
CH:
2.120 sec
SELECT count(id), max(created_at),
min(created_at)
FROM request_stats_2015
WHERE ((resource_type != 'stats')
AND (store_id = 'HIDDEN'))
1 rows in set. Elapsed: 2.120 sec. Processed 91.13 million rows, 9.30 GB (42.98 million rows/s., 4.39 GB/s.)
1 rows in set. Elapsed: 2.054 sec. Processed 91.13 million rows, 9.30 GB (44.38 million rows/s., 4.53 GB/s.)
Redshift (RS) / ClickHouse (CH)
---
Redshift 3 nodes (1 coordinator) / CH 2 nodes, same specs.
RS:
8m 36s / 1m40s
select trunc(created_at),* from request_stats_2015;
CH:
Not compared as the query does a full scan, even tho CH responded immediatly:
22184046 rows in set. Elapsed: 14.583 sec. Processed 22.51 million rows, 10.58 GB (1.54 million rows/s., 725.57 MB/s.)
---
RS:
46 - 51 secs
select trunc(created_at),* from (select id FROM request_stats_2015 order by random() limit 10 ) list join request_stats_2015 rs on list.id = rs.id;
39 s
select trunc(created_at),* from (select id FROM request_stats_2015 order by random() limit 10 ) list join request_stats_2015 rs on list.id = rs.id;
CH:
Supports sampling, in which is not necessary to get a _order by random()_. Same results as the previous.
---
RS:
23 s
SELECT (SELECT max("t2"."created_at") FROM "request_stats_2015" AS t2 WHERE ((("t2"."resource_type" = 'tokens') AND ("t2"."store_id" = 'HIDDEN')) AND NOT ("t2"."remote_address" ILIKE '10.0.%'))) AS last_login_at, (SELECT count('*') FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ((("t2"."resource_type" = 'patients') AND ("t2"."action" = 'read')) AND ("t2"."data" ILIKE '%ref%')))) AS opened_reports_count, (SELECT count('*') FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDENv') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_print_letter'))) AS print_letter_count, (SELECT count('*') FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_schedule_call'))) AS schedule_call_count, ((SELECT count(distinct("t2"."resource_id")) FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_print_preview_plans'))) + (SELECT count(distinct("t2"."resource_id")) FROM "request_stats_2015" AS t2 WHERE (((("t2"."store_id" = 'HIDDEN') AND NOT ("t2"."remote_address" ILIKE '10.0.%')) AND ("t2"."created_at" BETWEEN '2016-09-01T00:00:00'::timestamp AND '2017-05-23T00:00:00'::timestamp)) AND ("t2"."resource_type" = 'click_print_preview_comparison')))) AS printed_comparisons_count FROM (SELECT 1) AS void
CH:
9.431 sec.
https://github.com/yandex/ClickHouse/issues/26 : subquery can't reference columns from outer query. correlated queries are impossible for now.
Query modified to ->
SELECT
(SELECT max(created_at)
FROM request_stats_2015
WHERE (((resource_type = 'tokens')
AND (store_id = 'HIDDEN'))
AND NOT (remote_address LIKE '10.0.%'))) AS last_login_at,
(SELECT count(id)
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (((resource_type = 'patients')
AND (action = 'read'))
AND (data LIKE '%ref%')))) AS opened_reports_count,
(SELECT count(*)
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_print_letter'))) AS print_letter_count,
(SELECT count(*)
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_schedule_call'))) AS schedule_call_count,
(
(SELECT count(distinct(resource_id))
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_print_preview_plans'))) +
(SELECT count(distinct(resource_id))
FROM request_stats_2015
WHERE ((((store_id = 'HIDDEN')
AND NOT (remote_address LIKE '10.0.%'))
AND (created_at BETWEEN '2016-09-01 00:00:00' AND '2017-05-23 00:00:00'))
AND (resource_type = 'click_print_preview_comparison')))) AS printed_comparisons_count
FROM
(SELECT 1) AS void
┌───────last_login_at─┬─NULL─┬─print_letter_count─┬─NULL─┬─printed_comparisons_count─┐
│ 2017-05-16 15:43:57 │ \N │ 1 │ \N │ 90 │
└─────────────────────┴──────┴────────────────────┴──────┴───────────────────────────┘
1 rows in set. Elapsed: 9.431 sec.
---
RS:
3.8s
SELECT count('*'), max("t1"."created_at"), min("t1"."created_at") FROM "request_stats_2015" AS t1 WHERE (("t1"."resource_type" != 'stats') AND ("t1"."store_id" = 'HIDDEN'))
CH:
2.120 sec
SELECT count(id), max(created_at),
min(created_at)
FROM request_stats_2015
WHERE ((resource_type != 'stats')
AND (store_id = 'HIDDEN'))
1 rows in set. Elapsed: 2.120 sec. Processed 91.13 million rows, 9.30 GB (42.98 million rows/s., 4.39 GB/s.)
1 rows in set. Elapsed: 2.054 sec. Processed 91.13 million rows, 9.30 GB (44.38 million rows/s., 4.53 GB/s.)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment