Skip to content

Instantly share code, notes, and snippets.

@distributedlock
Last active November 12, 2018 04:21
Show Gist options
  • Save distributedlock/d7d9f27e9ee59ff317dcb6e85c6051bc to your computer and use it in GitHub Desktop.
Save distributedlock/d7d9f27e9ee59ff317dcb6e85c6051bc to your computer and use it in GitHub Desktop.
PERC CSV Merges
import pandas as pd
from src.config import ROOT_DIR
df_admission = pd.read_csv(
f'{ROOT_DIR}/tmp/perc/percadmissionformfep_data_2018-09-14_1539.csv'
)
df_followup = pd.read_csv(
f'{ROOT_DIR}/tmp/perc/percfollowupfepprogr_data_2018-09-14_1538.csv'
)
df_interview = pd.read_csv(
f'{ROOT_DIR}/tmp/perc/percinterviewproband_data_2018-09-14_1534.csv'
)
df_survey_1533 = pd.read_csv(
f'{ROOT_DIR}/tmp/perc/percselfreportsurvey_data_2018-09-14_1533.csv'
)
df_survey_1535 = pd.read_csv(
f'{ROOT_DIR}/tmp/perc/percselfreportsurvey_data_2018-09-14_1535.csv'
)
df_followup_merge = df_admission.merge(
df_followup, how='right', on='bblid', suffixes=('__adm', '__folup')
)
print("17940 df_followup_merge: ", (df_followup_merge.bblid == 17940).sum())
df_interview_merge = df_admission.merge(
df_interview, how='right', on='bblid', suffixes=('__adm', '__intview')
)
print("17940 df_interview_merge: ", (df_interview_merge.bblid == 17940).sum())
df_survey_1533_merge = df_admission.merge(
df_survey_1533, how='right', on='bblid', suffixes=('__adm', '__svey1533')
)
print("17940 df_survey_1533_merge: ", (df_survey_1533_merge.bblid == 17940).sum())
df_survey_1535_merge = df_admission.merge(
df_survey_1535, how='right', on='bblid', suffixes=('__adm', '__svey1535')
)
print("17940 df_survey_1535_merge: ", (df_survey_1535_merge.bblid == 17940).sum())
df_merged = (
df_followup_merge.merge(df_interview_merge, how='outer')
.merge(df_survey_1533_merge, how='outer')
.merge(df_survey_1535_merge, how='outer')
)
df_merged = df_merged.sort_values(by=['bblid', 'admin_timestamp'], na_position='last')
print("Total bblid of `17940` df_merged: ", (df_merged.bblid == 17940).sum())
print("Dimensions: ", df_merged.shape)
print(
"Unique bblids adm raw: ", len(df_admission.dropna(subset=["bblid"]).bblid.unique())
)
print("Unique bblids merged: ", len(df_merged.dropna(subset=["bblid"]).bblid.unique()))
df_merged.to_csv(
f'{ROOT_DIR}/tmp/perc/merged_final_includes_invalid_bblid.csv',
sep=',',
encoding='utf-8',
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment