Last active
August 29, 2015 14:19
-
-
Save chrisknoll/4a250613dc2a3ccdb8d5 to your computer and use it in GitHub Desktop.
Drug Era Builder SQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Drug era build logic: | |
Christopher Knoll | |
Janssen R&D | |
PURPOSE: | |
This SQL example demonstrates how drug exposure events in the CDM database can be rolled up into eras by arranging | |
each drug exposure start_date a row_number() and lining it up with the overall row_number for all start_date and end_dates. | |
When the row_number() of an overall date = 2 * the row_number of a start, we have a 'closed era'. | |
Example: | |
Consider the 2 overlapping date ranges: | |
1/1/2010 1/15/2010 | |
1/12/2010 1/30/2010 | |
By arranging the start dates in order with a row_number, and unioning this result with all end dates, you are left with the following: | |
EVENT_DATE START_ORD EVENT_TYPE | |
1/1/2010 1 -1 | |
1/12/2010 2 -1 | |
1/15/2010 null 1 | |
1/30/2010 null 1 | |
(Note: we add an EVENT_TYPE column so that EVENT_DATES that occur on the same day will have the ordinal of the start appear first before the end) | |
By applying a row_number() to these rows: | |
EVENT_DATE START_ORD OVERALL_ORD EVENT_TYPE | |
1/1/2010 1 1 -1 | |
1/12/2010 2 2 -1 | |
1/15/2010 null 3 1 | |
1/30/2010 null 4 1 | |
By applying a MAX(START_ORD) OVER (ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) to this set, we can 'fill down' the start_ord columns that have nulls: | |
EVENT_DATE START_ORD OVERALL_ORD EVENT_TYPE | |
1/1/2010 1 1 -1 | |
1/12/2010 2 2 -1 | |
1/15/2010 2 3 1 | |
1/30/2010 2 4 1 | |
From here we can see that where 2*START_ORD = OVERALL_ORD we have the date where all starts that have opened have been closed. | |
To get the final ERA, You find all the EVENT_DATEs where 2*START_ORD = OVERALL_ORD, (these are era end dates called ERA_END) | |
and the drug_exposure era_end is the first ERA_END that occurs after the drug_exposure start date. This is accomplished in a GROUP BY | |
clause (see SQL below). | |
Since the only end date identified was 1/3/2010, this logic returns this rowset: | |
START_DATE ERA_END | |
1/1/2010 1/30/2010 | |
1/12/2010 1/30/2010 | |
Finally (not shown in this demo), if you wanted to reduce all events to their era_start and era_end, you would take the | |
MIN(START_DATE) as ERA_START, GROUP BY (ERA_END) to get the following result: | |
ERA_START ERA_END | |
1/1/2010 1/30/2010 | |
The SQL below shows the steps up to finding each drug exposure's ERA_END_DATE but does not do the final group-by to calculate the start_era. | |
*/ | |
-- DROP TABLE #DRUG_EXPOSURE; | |
CREATE TABLE #DRUG_EXPOSURE | |
( | |
[DRUG_EXPOSURE_ID] [bigint] NOT NULL, | |
[PERSON_ID] [bigint] NOT NULL, | |
[DRUG_CONCEPT_ID] [int] NOT NULL, | |
[DRUG_EXPOSURE_START_DATE] [date] NOT NULL, | |
[DRUG_TYPE_CONCEPT_ID] [int] NOT NULL, | |
[REFILLS] [int] NULL, | |
[QUANTITY] [int] NULL, | |
[DAYS_SUPPLY] [int] NULL, | |
[DRUG_EXPOSURE_END_DATE] [date] NULL | |
) ON [PRIMARY] | |
GO | |
-- Person 1000000 | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000000,'1/1/2010','1/15/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000000,'1/1/2010','1/15/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000000,'1/15/2010','1/20/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000000,'2/14/2010','2/20/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000000,'1/13/2010',null,10,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000001,'1/25/2010','2/3/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000002,'4/1/2010','5/15/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000003,'4/10/2010','4/15/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000004,'5/13/2010',null,15,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000000,2000005,'9/1/2010',null,10,3000000,1,1); | |
-- Person 1000001: | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000000,'1/1/2010','1/05/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000001,'1/21/2010',null,10,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000002,'4/1/2010','4/15/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000003,'8/1/2010',null,30,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000004,'10/1/2010','10/5/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000005,'10/3/2010','10/15/2010',null,3000000,1,1); | |
INSERT INTO #DRUG_EXPOSURE ([DRUG_EXPOSURE_ID],[PERSON_ID],[DRUG_CONCEPT_ID],[DRUG_EXPOSURE_START_DATE],[DRUG_EXPOSURE_END_DATE],[DAYS_SUPPLY],[DRUG_TYPE_CONCEPT_ID],[REFILLS],[QUANTITY]) | |
VALUES (0,1000001,2000006,'11/30/2010',null,5,3000000,1,1); | |
with T as | |
( | |
select *, ROW_NUMBER() OVER (order by DRUG_EXPOSURE_ID) as rn | |
from #DRUG_EXPOSURE | |
) | |
UPDATE T set DRUG_EXPOSURE_ID = rn; | |
-- here is the raw exposure data | |
select * from #DRUG_EXPOSURE; | |
-- here we show how we use the row_number for each START_DATE and row_number for all dates. | |
-- Note that when 2 * E.START_ORDINAL - E.OVERALL_ORD = 0, then we have matched all starts with ends to this point, and this would conclude an 'era' | |
-- In addition: we are applying a 30 day 'gap window' that if a start date begins within 30 days of an end date, it shoudl be | |
-- considered part of the same era. To do this,we just add 30 days to all end dates to push them out, and then when we find | |
-- all the dates that are era ends, we subtract the 30 days back. | |
with cteDrugTarget (DRUG_EXPOSURE_ID, PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, DRUG_EXPOSURE_END_DATE, DRUG_TYPE_CONCEPT_ID, REFILLS, QUANTITY, DAYS_SUPPLY, RowNum) as | |
( | |
-- Normalize DRUG_EXPOSURE_END_DATE to either the existing drug exposure end date, or add days supply, or add 1 day to the start date | |
select d.DRUG_EXPOSURE_ID, d. PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, | |
COALESCE(DRUG_EXPOSURE_END_DATE, DATEADD(day,DAYS_SUPPLY,DRUG_EXPOSURE_START_DATE), DATEADD(day,1,DRUG_EXPOSURE_START_DATE)) as DRUG_EXPOSURE_END_DATE, | |
d.DRUG_TYPE_CONCEPT_ID, d.REFILLS, d.QUANTITY, d.DAYS_SUPPLY, | |
ROW_NUMBER() OVER (PARTITION BY d.PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as RowNum | |
FROM #DRUG_EXPOSURE d | |
) | |
select E.*, DATEADD(day,-30,EVENT_DATE) as END_DATE -- unpad the end date | |
FROM | |
( | |
select PERSON_ID, EVENT_DATE, EVENT_TYPE, | |
MAX(START_ORDINAL) OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) as START_ORDINAL, -- this pulls the current START down from the prior rows so that the NULLs from the END DATES will contain a value we can compare with | |
ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE) AS OVERALL_ORD -- this re-numbers the inner UNION so all rows are numbered ordered by the event date | |
from | |
( | |
-- select the start dates, assigning a row number to each | |
Select PERSON_ID, DRUG_EXPOSURE_START_DATE AS EVENT_DATE, -1 as EVENT_TYPE, ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as START_ORDINAL | |
from cteDrugTarget | |
UNION ALL | |
-- add the end dates with NULL as the row number, padding the end dates by 30 to allow a grace period for overlapping ranges. | |
select PERSON_ID, DATEADD(day,30,DRUG_EXPOSURE_END_DATE), 1 as EVENT_TYPE, NULL | |
FROM cteDrugTarget | |
) RAWDATA | |
) E | |
WHERE 2 * E.START_ORDINAL - E.OVERALL_ORD = 0 | |
-- pretend that the data in #DRUG_EXPOSURE is the set or rows that match a certain descent concept ID. | |
with cteDrugTarget (DRUG_EXPOSURE_ID, PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, DRUG_EXPOSURE_END_DATE, DRUG_TYPE_CONCEPT_ID, REFILLS, QUANTITY, DAYS_SUPPLY, RowNum) as | |
( | |
-- Normalize DRUG_EXPOSURE_END_DATE to either the existing drug exposure end date, or add days supply, or add 1 day to the start date | |
select d.DRUG_EXPOSURE_ID, d. PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, | |
COALESCE(DRUG_EXPOSURE_END_DATE, DATEADD(day,DAYS_SUPPLY,DRUG_EXPOSURE_START_DATE), DATEADD(day,1,DRUG_EXPOSURE_START_DATE)) as DRUG_EXPOSURE_END_DATE, | |
d.DRUG_TYPE_CONCEPT_ID, d.REFILLS, d.QUANTITY, d.DAYS_SUPPLY, | |
ROW_NUMBER() OVER (PARTITION BY d.PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as RowNum | |
FROM #DRUG_EXPOSURE d | |
), | |
cteEndDates (PERSON_ID, END_DATE) as -- the magic | |
( | |
select PERSON_ID, DATEADD(day,-30,EVENT_DATE) as END_DATE -- unpad the end date | |
FROM | |
( | |
select PERSON_ID, EVENT_DATE, EVENT_TYPE, | |
MAX(START_ORDINAL) OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) as START_ORDINAL, -- this pulls the current START down from the prior rows so that the NULLs from the END DATES will contain a value we can compare with | |
ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE) AS OVERALL_ORD -- this re-numbers the inner UNION so all rows are numbered ordered by the event date | |
from | |
( | |
-- select the start dates, assigning a row number to each | |
Select PERSON_ID, DRUG_EXPOSURE_START_DATE AS EVENT_DATE, -1 as EVENT_TYPE, ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as START_ORDINAL | |
from cteDrugTarget | |
UNION ALL | |
-- add the end dates with NULL as the row number, padding the end dates by 30 to allow a grace period for overlapping ranges. | |
select PERSON_ID, DATEADD(day,30,DRUG_EXPOSURE_END_DATE), 1 as EVENT_TYPE, NULL | |
FROM cteDrugTarget | |
) RAWDATA | |
) E | |
WHERE 2 * E.START_ORDINAL - E.OVERALL_ORD = 0 | |
) | |
select * from cteEndDates; | |
-- now bring it all together by joining the calculated end dates back to the #drugExposure table, and the row's exposure's era end date will be the MIN end date that is >= the exposure date. | |
with cteDrugTarget (DRUG_EXPOSURE_ID, PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, DRUG_EXPOSURE_END_DATE, DRUG_TYPE_CONCEPT_ID, REFILLS, QUANTITY, DAYS_SUPPLY, RowNum) as | |
( | |
-- Normalize DRUG_EXPOSURE_END_DATE to either the existing drug exposure end date, or add days supply, or add 1 day to the start date | |
select d.DRUG_EXPOSURE_ID, d. PERSON_ID, DRUG_CONCEPT_ID, DRUG_EXPOSURE_START_DATE, | |
COALESCE(DRUG_EXPOSURE_END_DATE, DATEADD(day,DAYS_SUPPLY,DRUG_EXPOSURE_START_DATE), DATEADD(day,1,DRUG_EXPOSURE_START_DATE)) as DRUG_EXPOSURE_END_DATE, | |
d.DRUG_TYPE_CONCEPT_ID, d.REFILLS, d.QUANTITY, d.DAYS_SUPPLY, | |
ROW_NUMBER() OVER (PARTITION BY d.PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as RowNum | |
FROM #DRUG_EXPOSURE d | |
), | |
cteEndDates (PERSON_ID, END_DATE) as -- the magic | |
( | |
select PERSON_ID, DATEADD(day,-30,EVENT_DATE) as END_DATE -- unpad the end date | |
FROM | |
( | |
select PERSON_ID, EVENT_DATE, EVENT_TYPE, | |
MAX(START_ORDINAL) OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE ROWS UNBOUNDED PRECEDING) as START_ORDINAL, -- this pulls the current START down from the prior rows so that the NULLs from the END DATES will contain a value we can compare with | |
ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY EVENT_DATE, EVENT_TYPE) AS OVERALL_ORD -- this re-numbers the inner UNION so all rows are numbered ordered by the event date | |
from | |
( | |
-- select the start dates, assigning a row number to each | |
Select PERSON_ID, DRUG_EXPOSURE_START_DATE AS EVENT_DATE, 1 as EVENT_TYPE, ROW_NUMBER() OVER (PARTITION BY PERSON_ID ORDER BY DRUG_EXPOSURE_START_DATE) as START_ORDINAL | |
from cteDrugTarget | |
UNION ALL | |
-- pad the end dates by 30 to allow a grace period for overlapping ranges. | |
select PERSON_ID, DATEADD(day,30,DRUG_EXPOSURE_END_DATE), -1 as EVENT_TYPE, NULL | |
FROM cteDrugTarget | |
) RAWDATA | |
) E | |
WHERE (2 * E.START_ORDINAL) - E.OVERALL_ORD = 0 | |
) | |
select d.DRUG_EXPOSURE_ID, | |
d.PERSON_ID, | |
d.DRUG_CONCEPT_ID, | |
d.DRUG_EXPOSURE_START_DATE, | |
d.DRUG_EXPOSURE_END_DATE, | |
MIN(e.END_DATE) as ERA_END_DATE, | |
d.DRUG_TYPE_CONCEPT_ID, | |
d.REFILLS, | |
d.QUANTITY, | |
d.DAYS_SUPPLY, | |
d.RowNum | |
FROM cteDrugTarget d | |
JOIN cteEndDates e on d.PERSON_ID = e.PERSON_ID and e.END_DATE >= d.DRUG_EXPOSURE_START_DATE | |
GROUP BY d.DRUG_EXPOSURE_ID, | |
d.PERSON_ID, | |
d.DRUG_CONCEPT_ID, | |
d.DRUG_EXPOSURE_START_DATE, | |
d.DRUG_EXPOSURE_END_DATE, | |
d.DRUG_TYPE_CONCEPT_ID, | |
d.REFILLS, | |
d.QUANTITY, | |
d.DAYS_SUPPLY, | |
d.RowNum | |
; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment