-- Some SQL gotchas, and performance analysis
-- The objective of this session is to look at some SQL gotchas and at query performance.
reset role;
select * from individual where observations->>'24dabc3a-6562-4521-bd42-5fff11ea5c46' = '294';
set role ihmp;
select * from individual where observations->>'24dabc3a-6562-4521-bd42-5fff11ea5c46' = '294';-- column does not exist
-- Pitfall. Use the schema name everywhere (For avni-server, you don't have to do this for public schemas because of application.properties)
select * from public.individual where observations->>'24dabc3a-6562-4521-bd42-5fff11ea5c46' = '294'; -- works once the schema is qualified
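-- To see why the unqualified table name resolves differently under a role, inspect the search path
-- (a quick diagnostic; the exact values depend on how the roles and schemas are set up):
show search_path;
select current_schemas(true);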
reset role;
-- jsonb lookups are usually performed with the @> containment operator (and other jsonb-specific operators), which perform very well
select * from public.individual where observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb;
-- See how much faster the results are. Now let's try this with the org role
set role ihmp;
select * from public.individual where observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb;
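-- Side note: the extra work under a role comes from Avni's per-organisation row filtering. Assuming it is
-- implemented with standard row-level security policies (an assumption, though the plan below suggests it),
-- you can list the policies on the table like this:
select schemaname, tablename, policyname, qual
from pg_policies
where tablename = 'individual';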
-- What is happening? To understand this, let us dig a bit deeper into how Postgres views these queries
set role ihmp;
explain (analyse, buffers) select * from public.individual where observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb;
-- To make this easier to read, one way is to remove all the per-organisation sub-plans. After removing them, it looks like this:
-- QUERY PLAN
-- Gather  (cost=1096.61..66107.80 rows=413 width=1151) (actual time=61.869..114.758 rows=8 loops=1)
--   Workers Planned: 2
--   Params Evaluated: $5
--   Workers Launched: 2
--   Buffers: shared hit=124 read=58918
--   InitPlan 5 (returns $5)
--   ->  Parallel Seq Scan on individual  (cost=2.07..64971.96 rows=172 width=1151) (actual time=46.457..98.968 rows=3 loops=3)
--         Filter: (((organisation_id = $5) OR (hashed SubPlan 7)) AND (observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb))
--         Rows Removed by Filter: 271809
--         Buffers: shared hit=112 read=58918
-- Planning Time: 1.031 ms
-- Execution Time: 114.883 ms
-- Important things to notice:
-- Nearly 59,000 buffers read. One buffer is 8 kB by default, so that is roughly 460 MB of IO to retrieve 8 rows! (See the check below.)
-- Parallel sequential scan on the individual table. We are essentially reading the entire table and filtering out the rows we want.
-- It is important to remember that high IO has a few specific effects:
-- 1. The query looks fast if you run it a second time. This is because of Postgres's built-in caching, which can hide the problem.
-- 2. It is a performance problem in a different way too. AWS enforces strict IO quotas on database storage, and queries like this can exhaust them pretty fast.
-- 3. IO is not as parallelizable as other work, so many high-IO queries will slow the DB down considerably. It is also hard to scale this out (you can't just add new CPUs, etc.).
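-- To double-check the buffer math on your own server (block_size is the buffer size in bytes;
-- the buffer count 58918 is taken from the plan above):
show block_size;                              -- typically 8192
select pg_size_pretty(58918 * 8192::bigint);  -- roughly 460 MB of IO for 8 rows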
-- Is there a better way? To understand this, let's look at what the plan looks like when we run the query without a role.
reset role;
explain (analyse, buffers) select * from public.individual where observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb;
-- QUERY PLAN
-- Bitmap Heap Scan on individual  (cost=30.32..2995.84 rows=815 width=1151) (actual time=0.072..0.144 rows=11 loops=1)
--   Recheck Cond: (observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb)
--   Heap Blocks: exact=11
--   Buffers: shared hit=16
--   ->  Bitmap Index Scan on idx_individual_obs  (cost=0.00..30.12 rows=815 width=0) (actual time=0.050..0.051 rows=11 loops=1)
--         Index Cond: (observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb)
--         Buffers: shared hit=5
-- Planning Time: 0.505 ms
-- Execution Time: 0.223 ms
-- Look at the buffers hit: only 16 this time. Also notice that we are now using a bitmap index scan on idx_individual_obs.
-- Indices are smaller structures that keep keys arranged in order, and they help retrieve data faster.
-- As a rough illustration: assume a query takes 10 ms. When the data (number of rows) increases 10-fold, it will take
-- about 100 ms with a sequential scan but about 20 ms with a bitmap index scan. At 100-fold growth it is 1000 ms vs 30 ms.
-- This is why we need indices for most of our operations. A sketch of how such an index can be defined follows.
-- There are some cases where using an index is not effective. We will discuss these at a later point.
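-- The plan above uses idx_individual_obs. Its actual definition lives in the Avni schema, but an index that
-- supports @> containment queries on a jsonb column would typically be a GIN index, roughly like this
-- (a sketch, not the production definition):
-- create index idx_individual_obs on public.individual using gin (observations);
-- You can check the real definition with:
select indexdef from pg_indexes where indexname = 'idx_individual_obs';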
-- Now let's see how things work with the ETL tables
reset role;
select * from public.individual where observations @> '{"24dabc3a-6562-4521-bd42-5fff11ea5c46": "294"}'::jsonb;
set role ihmp;
select * from ihmp.household where "Household number" = '294';
explain (analyse, buffers ) select * from ihmp.household where "Household number" = '294';
-- The number of buffers used is relatively high, but the size of the data is low, so it's OK. Let's now go back to the original question: when is an index useful?
-- Amount of data - indices help when there is a lot of data. When we break data down by organisation, the need for indices reduces because there is not much data per organisation.
-- Type of data - indices work when there are many unique values. For example, we don't need an index on the gender column. (See the pg_stats check below.)
-- Type of query - when your query picks up more than about 15% of the rows in a table, an index serves no purpose (a sequential scan is usually cheaper).
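-- A quick way to gauge "type of data": n_distinct from pg_stats shows how many unique values a column has
-- (run analyze on the table first so the statistics are fresh). Columns with very few distinct values,
-- like gender, are poor index candidates.
select attname, n_distinct
from pg_stats
where schemaname = 'ihmp' and tablename = 'household';
-- And if ihmp.household ever grew large, a plain btree index on the filtered column would be the usual fix
-- (a sketch only; not needed at the current data size):
-- create index on ihmp.household ("Household number");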
-- Let's look at a reporting query, and see how we can optimize it.
reset role;
select count(*) from public.individual; -- 815436
-- Let's try optimizing a production query that takes a long time
-- Under 5 list with Grade
set role ashwini;
reset role;
grant all on completed_program_encounter_view to ashwini;
WITH latest_anthro_encounters AS (
SELECT
penrol.individual_id AS individual_id,
penrol.id AS enrolment_id,
max(completed_program_encounter_view.encounter_date_time) AS max_date
FROM program_enrolment_view penrol
JOIN completed_program_encounter_view ON penrol.id = completed_program_encounter_view.program_enrolment_id
WHERE completed_program_encounter_view.observations??'c5f418f8-af4a-4a10-a70d-54f088c6a8c6' AND penrol.program_id = 130
GROUP BY 1,2
ORDER BY 1
)
SELECT
individual_gender_catchment_view.addresslevel_name as "Village",
count(*) FILTER (WHERE (completed_program_encounter_view.observations->>'c5f418f8-af4a-4a10-a70d-54f088c6a8c6')::numeric > -2.0) as "Grade 1",
count(*) FILTER (WHERE (completed_program_encounter_view.observations->>'c5f418f8-af4a-4a10-a70d-54f088c6a8c6')::numeric BETWEEN -2.9999 AND -2.0) as "Grade 2",
count(*) FILTER (WHERE (completed_program_encounter_view.observations->>'c5f418f8-af4a-4a10-a70d-54f088c6a8c6')::numeric <= -3.0) as "Grade 3"
FROM
latest_anthro_encounters le
JOIN individual_gender_catchment_view ON le.individual_id = individual_gender_catchment_view.id AND individual_gender_catchment_view.catchment_name != 'Ashwini Master Catchment'
JOIN completed_program_encounter_view ON le.enrolment_id = completed_program_encounter_view.program_enrolment_id AND le.max_date=completed_program_encounter_view.encounter_date_time
GROUP BY 1;
-- Let's look at the plan for this one... and see how we may be able to fix it...
explain (analyse, buffers ) WITH latest_anthro_encounters AS (
SELECT
penrol.individual_id AS individual_id,
penrol.id AS enrolment_id,
max(completed_program_encounter_view.encounter_date_time) AS max_date
FROM program_enrolment_view penrol
JOIN completed_program_encounter_view ON penrol.id = completed_program_encounter_view.program_enrolment_id
WHERE completed_program_encounter_view.observations??'c5f418f8-af4a-4a10-a70d-54f088c6a8c6' AND penrol.program_id = 130
GROUP BY 1,2
ORDER BY 1
)
SELECT
individual_gender_catchment_view.addresslevel_name as "Village",
count(*) FILTER (WHERE (completed_program_encounter_view.observations->>'c5f418f8-af4a-4a10-a70d-54f088c6a8c6')::numeric > -2.0) as "Grade 1",
count(*) FILTER (WHERE (completed_program_encounter_view.observations->>'c5f418f8-af4a-4a10-a70d-54f088c6a8c6')::numeric BETWEEN -2.9999 AND -2.0) as "Grade 2",
count(*) FILTER (WHERE (completed_program_encounter_view.observations->>'c5f418f8-af4a-4a10-a70d-54f088c6a8c6')::numeric <= -3.0) as "Grade 3"
FROM
latest_anthro_encounters le
JOIN individual_gender_catchment_view ON le.individual_id = individual_gender_catchment_view.id AND individual_gender_catchment_view.catchment_name != 'Ashwini Master Catchment'
JOIN completed_program_encounter_view ON le.enrolment_id = completed_program_encounter_view.program_enrolment_id AND le.max_date=completed_program_encounter_view.encounter_date_time
GROUP BY 1;
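-- One possible direction for the fix (a sketch using the same view and column names as above, not a measured
-- optimization): the CTE groups to find max_date, and the outer query then joins completed_program_encounter_view
-- a second time to fetch that row. DISTINCT ON can pick the latest qualifying encounter per enrolment in a single
-- pass, avoiding the second join. NB: '??' is kept as written above; in plain psql the jsonb key-exists operator
-- is a single '?'. Compare plans with explain (analyse, buffers) before and after.
WITH latest_anthro_encounters AS (
    SELECT DISTINCT ON (penrol.id)
        penrol.individual_id AS individual_id,
        penrol.id AS enrolment_id,
        cpe.observations ->> 'c5f418f8-af4a-4a10-a70d-54f088c6a8c6' AS zscore
    FROM program_enrolment_view penrol
    JOIN completed_program_encounter_view cpe ON penrol.id = cpe.program_enrolment_id
    WHERE cpe.observations ?? 'c5f418f8-af4a-4a10-a70d-54f088c6a8c6' AND penrol.program_id = 130
    ORDER BY penrol.id, cpe.encounter_date_time DESC
)
SELECT
    igc.addresslevel_name AS "Village",
    count(*) FILTER (WHERE le.zscore::numeric > -2.0) AS "Grade 1",
    count(*) FILTER (WHERE le.zscore::numeric BETWEEN -2.9999 AND -2.0) AS "Grade 2",
    count(*) FILTER (WHERE le.zscore::numeric <= -3.0) AS "Grade 3"
FROM latest_anthro_encounters le
JOIN individual_gender_catchment_view igc
    ON le.individual_id = igc.id AND igc.catchment_name != 'Ashwini Master Catchment'
GROUP BY 1;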