Skip to content

Instantly share code, notes, and snippets.

@shaabhishek
Created November 12, 2019 21:11
Show Gist options
  • Save shaabhishek/ec13106a699b8d2110ee6de51aac9a14 to your computer and use it in GitHub Desktop.
Save shaabhishek/ec13106a699b8d2110ee6de51aac9a14 to your computer and use it in GitHub Desktop.
dense_rank use case for assigning the visit_counter to each hospital visit for each patient
## imports and other stuff
# Boilerplate for setting things up
query_args = {'dbname': dbname, 'host': host, 'port': port, 'user': dbusername}
conn = psycopg2.connect(**query_args)
cur = conn.cursor()
cur.execute('SET search_path to ' + schema_name)
# Setup the query- note it is ALMOST the same as the sql file's query
query = \
"""
with visitcnts as
(
select d.subject_id, count(distinct d.hadm_id) as visits
from diagnoses_icd d
group by d.subject_id
), diag as
(
select visitcnts.subject_id, d2.hadm_id, d2.icd9_code
from visitcnts
inner join diagnoses_icd d2
on visitcnts.subject_id = d2.subject_id
where visits > 1
)
select diag.*, a.admittime
from diag
inner join admissions a
on diag.hadm_id = a.hadm_id
"""
# read table into dataframe
data_df = pd.read_sql_query(query, conn)
# extract out only the unique subject-hospital_admission-admittime combinations
hadm_times = data_df.drop('icd9_code', axis=1)
hadm_times = hadm_times.drop_duplicates()
## group by subject - create a list of indices for each hospital visit - assign each (subject,hadm) pair an index
# works because group keys are sorted
visit_idxs = hadm_times.groupby(['subject_id']).apply(lambda x: pd.Series(np.arange(len(x)))).values
hadm_times = hadm_times.sort_values(['subject_id', 'admittime'])
hadm_times['visit_idx'] = visit_idxs
# don't really need admittimes anymore
hadm_times.drop('admittime', axis=1, inplace=True)
data_df.drop('admittime', axis=1, inplace=True)
# join the visit-ranking with the full table
data_df = hadm_times.merge(data_df, on=['subject_id', 'hadm_id'], how='inner')
with visitcnts as
(
select d.subject_id, count(distinct d.hadm_id) as visits
from diagnoses_icd d
group by d.subject_id
), diag as
(
select visitcnts.subject_id, d2.hadm_id, d2.icd9_code
from visitcnts
inner join diagnoses_icd d2
on visitcnts.subject_id = d2.subject_id
where visits > 1
)
select diag.*, a.admittime,
dense_rank() over (
partition by diag.subject_id
order by a.admittime
) visit_idx
from diag
inner join admissions a
on diag.hadm_id = a.hadm_id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment