Skip to content

Instantly share code, notes, and snippets.

@chrisknoll
Last active August 29, 2015 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrisknoll/4a250613dc2a3ccdb8d5 to your computer and use it in GitHub Desktop.
Save chrisknoll/4a250613dc2a3ccdb8d5 to your computer and use it in GitHub Desktop.
Drug Era Builder SQL
/*
Drug era build logic:
Christopher Knoll
Janssen R&D
PURPOSE:
This SQL example demonstrates how drug exposure events in the CDM database can be rolled up into eras by arranging
each drug exposure start_date a row_number() and lining it up with the overall row_number for all start_date and end_dates.
When the row_number() of an overall date = 2 * the row_number of a start, we have a 'closed era'.
Example:
Consider the 2 overlapping date ranges:
1/1/2010 1/15/2010
1/12/2010 1/30/2010
By arranging the start dates in order with a row_number, and unioning this result with all end dates, you are left with the following:
EVENT_DATE START_ORD EVENT_TYPE
1/1/2010 1 -1
1/12/2010 2 -1
1/15/2010 null 1
1/30/2010 null 1
(Note: we add an EVENT_TYPE column so that EVENT_DATES that occur on the same day will have the ordinal of the start appear first before the end)
By applying a row_number() to these rows:
EVENT_DATE START_ORD OVERALL_ORD EVENT_TYPE
1/1/2010 1 1 -1
1/12/2010 2 2 -1
1/15/2010 null 3 1
1/30/2010 null 4 1
By applying a MAX(START_ORD) OVER (ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) to this set, we can 'fill down' the start_ord columns that have nulls:
EVENT_DATE START_ORD OVERALL_ORD EVENT_TYPE
1/1/2010 1 1 -1
1/12/2010 2 2 -1
1/15/2010 2 3 1
1/30/2010 2 4 1
From here we can see that where 2*START_ORD = OVERALL_ORD we have the date where all starts that have opened have been closed.
To get the final ERA, You find all the EVENT_DATEs where 2*START_ORD = OVERALL_ORD, (these are era end dates called ERA_END)
and the drug_exposure era_end is the first ERA_END that occurs after the drug_exposure start date. This is accomplished in a GROUP BY
clause (see SQL below).
Since the only end date identified was 1/3/2010, this logic returns this rowset:
START_DATE ERA_END
1/1/2010 1/30/2010
1/12/2010 1/30/2010
Finally (not shown in this demo), if you wanted to reduce all events to their era_start and era_end, you would take the
MIN(START_DATE) as ERA_START, GROUP BY (ERA_END) to get the following result:
ERA_START ERA_END
1/1/2010 1/30/2010
The SQL below shows the steps up to finding each drug exposure's ERA_END_DATE but does not do the final group-by to calculate the start_era.
*/
-- DROP TABLE #DRUG_EXPOSURE;
CREATE TABLE #DRUG_EXPOSURE
(
[DRUG_EXPOSURE_ID] [bigint] NOT NULL,
[PERSON_ID] [bigint] NOT NULL,
[DRUG_CONCEPT_ID] [int] NOT NULL,
[DRUG_EXPOSURE_START_DATE] [date] NOT NULL,
[DRUG_TYPE_CONCEPT_ID] [int] NOT NULL,
[REFILLS] [int] NULL,
[QUANTITY] [int] NULL,
[DAYS_SUPPLY] [int] NULL,
[DRUG_EXPOSURE_END_DATE] [date] NULL
) ON [PRIMARY]
GO
-- Person 1000000
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000000,'1/1/2010','1/15/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000000,'1/1/2010','1/15/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000000,'1/15/2010','1/20/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000000,'2/14/2010','2/20/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000000,'1/13/2010',null,10,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000001,'1/25/2010','2/3/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000002,'4/1/2010','5/15/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000003,'4/10/2010','4/15/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000004,'5/13/2010',null,15,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000000,2000005,'9/1/2010',null,10,3000000,1,1);
-- Person 1000001:
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000000,'1/1/2010','1/05/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000001,'1/21/2010',null,10,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000002,'4/1/2010','4/15/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000003,'8/1/2010',null,30,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000004,'10/1/2010','10/5/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000005,'10/3/2010','10/15/2010',null,3000000,1,1);
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY])
VALUES (0,1000001,2000006,'11/30/2010',null,5,3000000,1,1);
with T as
(
select *, ROW_NUMBER() OVER (order by DRUG_EXPOSURE_ID) as rn
from #DRUG_EXPOSURE
)
UPDATE T set DRUG_EXPOSURE_ID = rn;
-- here is the raw exposure data
select * from #DRUG_EXPOSURE;
-- here we show how we use the row_number for each START_DATE and row_number for all dates.
-- Note that when 2 * E.START_ORDINAL - E.OVERALL_ORD = 0, then we have matched all starts with ends to this point, and this would conclude an 'era'
-- In addition: we are applying a 30 day 'gap window' that if a start date begins within 30 days of an end date, it shoudl be
-- considered part of the same era. To do this,we just add 30 days to all end dates to push them out, and then when we find
-- all the dates that are era ends, we subtract the 30 days back.
with cteDrugTarget (DRUG_EXPOSURE_ID, PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, DRUG_EXPOSURE_END_DATE, DRUG_TYPE_CONCEPT_ID, REFILLS, QUANTITY, DAYS_SUPPLY, RowNum) as
(
-- Normalize DRUG_EXPOSURE_END_DATE to either the existing drug exposure end date, or add days supply, or add 1 day to the start date
select d.DRUG_EXPOSURE_ID, d. PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE,
COALESCE(DRUG_EXPOSURE_END_DATE, DATEADD(day,DAYS_SUPPLY,DRUG_EXPOSURE_START_DATE), DATEADD(day,1,DRUG_EXPOSURE_START_DATE)) as DRUG_EXPOSURE_END_DATE,
d.DRUG_TYPE_CONCEPT_ID, d.REFILLS, d.QUANTITY, d.DAYS_SUPPLY,
ROW_NUMBER() OVER (PARTITION BY d.PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as RowNum
FROM #DRUG_EXPOSURE d
)
select E.*, DATEADD(day,-30,EVENT_DATE) as END_DATE -- unpad the end date
FROM
(
select PERSON_ID, EVENT_DATE, EVENT_TYPE,
MAX(START_ORDINAL) OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) as START_ORDINAL, -- this pulls the current START down from the prior rows so that the NULLs from the END DATES will contain a value we can compare with
ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE) AS OVERALL_ORD -- this re-numbers the inner UNION so all rows are numbered ordered by the event date
from
(
-- select the start dates, assigning a row number to each
Select PERSON_ID, DRUG_EXPOSURE_START_DATE AS EVENT_DATE, -1 as EVENT_TYPE, ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as START_ORDINAL
from cteDrugTarget
UNION ALL
-- add the end dates with NULL as the row number, padding the end dates by 30 to allow a grace period for overlapping ranges.
select PERSON_ID, DATEADD(day,30,DRUG_EXPOSURE_END_DATE), 1 as EVENT_TYPE, NULL
FROM cteDrugTarget
) RAWDATA
) E
WHERE 2 * E.START_ORDINAL - E.OVERALL_ORD = 0
-- pretend that the data in #DRUG_EXPOSURE is the set or rows that match a certain descent concept ID.
with cteDrugTarget (DRUG_EXPOSURE_ID, PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, DRUG_EXPOSURE_END_DATE, DRUG_TYPE_CONCEPT_ID, REFILLS, QUANTITY, DAYS_SUPPLY, RowNum) as
(
-- Normalize DRUG_EXPOSURE_END_DATE to either the existing drug exposure end date, or add days supply, or add 1 day to the start date
select d.DRUG_EXPOSURE_ID, d. PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE,
COALESCE(DRUG_EXPOSURE_END_DATE, DATEADD(day,DAYS_SUPPLY,DRUG_EXPOSURE_START_DATE), DATEADD(day,1,DRUG_EXPOSURE_START_DATE)) as DRUG_EXPOSURE_END_DATE,
d.DRUG_TYPE_CONCEPT_ID, d.REFILLS, d.QUANTITY, d.DAYS_SUPPLY,
ROW_NUMBER() OVER (PARTITION BY d.PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as RowNum
FROM #DRUG_EXPOSURE d
),
cteEndDates (PERSON_ID, END_DATE) as -- the magic
(
select PERSON_ID, DATEADD(day,-30,EVENT_DATE) as END_DATE -- unpad the end date
FROM
(
select PERSON_ID, EVENT_DATE, EVENT_TYPE,
MAX(START_ORDINAL) OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) as START_ORDINAL, -- this pulls the current START down from the prior rows so that the NULLs from the END DATES will contain a value we can compare with
ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE) AS OVERALL_ORD -- this re-numbers the inner UNION so all rows are numbered ordered by the event date
from
(
-- select the start dates, assigning a row number to each
Select PERSON_ID, DRUG_EXPOSURE_START_DATE AS EVENT_DATE, -1 as EVENT_TYPE, ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as START_ORDINAL
from cteDrugTarget
UNION ALL
-- add the end dates with NULL as the row number, padding the end dates by 30 to allow a grace period for overlapping ranges.
select PERSON_ID, DATEADD(day,30,DRUG_EXPOSURE_END_DATE), 1 as EVENT_TYPE, NULL
FROM cteDrugTarget
) RAWDATA
) E
WHERE 2 * E.START_ORDINAL - E.OVERALL_ORD = 0
)
select * from cteEndDates;
-- now bring it all together by joining the calculated end dates back to the #drugExposure table, and the row's exposure's era end date will be the MIN end date that is >= the exposure date.
with cteDrugTarget (DRUG_EXPOSURE_ID, PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, DRUG_EXPOSURE_END_DATE, DRUG_TYPE_CONCEPT_ID, REFILLS, QUANTITY, DAYS_SUPPLY, RowNum) as
(
-- Normalize DRUG_EXPOSURE_END_DATE to either the existing drug exposure end date, or add days supply, or add 1 day to the start date
select d.DRUG_EXPOSURE_ID, d. PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE,
COALESCE(DRUG_EXPOSURE_END_DATE, DATEADD(day,DAYS_SUPPLY,DRUG_EXPOSURE_START_DATE), DATEADD(day,1,DRUG_EXPOSURE_START_DATE)) as DRUG_EXPOSURE_END_DATE,
d.DRUG_TYPE_CONCEPT_ID, d.REFILLS, d.QUANTITY, d.DAYS_SUPPLY,
ROW_NUMBER() OVER (PARTITION BY d.PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as RowNum
FROM #DRUG_EXPOSURE d
),
cteEndDates (PERSON_ID, END_DATE) as -- the magic
(
select PERSON_ID, DATEADD(day,-30,EVENT_DATE) as END_DATE -- unpad the end date
FROM
(
select PERSON_ID, EVENT_DATE, EVENT_TYPE,
MAX(START_ORDINAL) OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) as START_ORDINAL, -- this pulls the current START down from the prior rows so that the NULLs from the END DATES will contain a value we can compare with
ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE) AS OVERALL_ORD -- this re-numbers the inner UNION so all rows are numbered ordered by the event date
from
(
-- select the start dates, assigning a row number to each
Select PERSON_ID, DRUG_EXPOSURE_START_DATE AS EVENT_DATE, 1 as EVENT_TYPE, ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as START_ORDINAL
from cteDrugTarget
UNION ALL
-- pad the end dates by 30 to allow a grace period for overlapping ranges.
select PERSON_ID, DATEADD(day,30,DRUG_EXPOSURE_END_DATE), -1 as EVENT_TYPE, NULL
FROM cteDrugTarget
) RAWDATA
) E
WHERE (2 * E.START_ORDINAL) - E.OVERALL_ORD = 0
)
select d.DRUG_EXPOSURE_ID,
d.PERSON_ID,
d.DRUG_CONCEPT_ID,
d.DRUG_EXPOSURE_START_DATE,
d.DRUG_EXPOSURE_END_DATE,
MIN(e.END_DATE) as ERA_END_DATE,
d.DRUG_TYPE_CONCEPT_ID,
d.REFILLS,
d.QUANTITY,
d.DAYS_SUPPLY,
d.RowNum
FROM cteDrugTarget d
JOIN cteEndDates e on d.PERSON_ID = e.PERSON_ID and e.END_DATE >= d.DRUG_EXPOSURE_START_DATE
GROUP BY d.DRUG_EXPOSURE_ID,
d.PERSON_ID,
d.DRUG_CONCEPT_ID,
d.DRUG_EXPOSURE_START_DATE,
d.DRUG_EXPOSURE_END_DATE,
d.DRUG_TYPE_CONCEPT_ID,
d.REFILLS,
d.QUANTITY,
d.DAYS_SUPPLY,
d.RowNum
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment