-
-
Save bememea/55eff98b78682de158e98d9d22ed66c5 to your computer and use it in GitHub Desktop.
Script to update IDs that triggered AttributeError: Can only use .str accessor with string values!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import synapseclient
import pandas as pd

# Backstory: When Sample Master was first created, the Rush
# project IDs were used as the values for 'individualID'.
# But, projids weren't fully anonymous and so couldn't be used
# as individualID annotations. Rush generated anonymized IDs
# with format R####### that are acceptable to use for
# (publicly-visible) annotations. We are now using the
# Rush-generated values as individual IDs and keeping
# the project IDs in the projid variable. Projids
# are still visible to folks who have signed the ROSMAP DUC.

syn = synapseclient.login()

# Get individualID (Rush-generated anonymized IDs) alongside the projid
# each one corresponds to.
rosmap_clin_entity = syn.get('syn3191087')
rosmap_clin_df = pd.read_csv(rosmap_clin_entity.path)
rosmap_id_df = rosmap_clin_df[['individualID', 'projid']]

# Get all ROSMAP records from Sample Master
rosmap_sm_table = syn.tableQuery("select * from syn18912660 where study = 'ROSMAP'")
rosmap_sm_df = rosmap_sm_table.asDataFrame()
# Remember the table's column order before any helper columns are added.
rosmap_sm_cols = rosmap_sm_df.columns.values.tolist()
# A column-based merge discards the frame's index, so stash the index in a
# column and restore it afterwards.
rosmap_sm_df['rowId'] = rosmap_sm_df.index

# Move the projids to the projid variable, then drop individualID
# so we can easily add the updated Rush-generated values in the next step
rosmap_sm_df['projid'] = rosmap_sm_df['individualID']
rosmap_sm_df.drop(columns=['individualID'], inplace=True)

# Merge IDs into Sample Master to add the
# Rush-generated anonymized IDs as the 'individualID' values.
# how='right' keeps every Sample Master record, including those whose
# projid has no match (their individualID comes back as NaN).
rosmap_sm_id_df = rosmap_id_df.merge(rosmap_sm_df,
                                     how='right',
                                     on='projid').set_index('rowId')

# Prepare table to upload to Synapse by putting columns in expected order
rosmap_updated_ids_df = rosmap_sm_id_df[rosmap_sm_cols].copy()

# Recast projid as a string since that's what the Synapse table is expecting
rosmap_updated_ids_df['projid'] = rosmap_updated_ids_df['projid'].astype(str)

# Create df of records where the projid did not have a corresponding
# anonymized individualID so that we can contact Rush and request
# data for these participants (or find out why they're missing).
# This must happen BEFORE the missing values are blanked out below;
# otherwise the NaN test matches nothing and the report is always empty.
missing_id_mask = rosmap_updated_ids_df['individualID'].isna()
no_id_df = rosmap_updated_ids_df[missing_id_mask]
affected_projids = no_id_df.projid.unique()
affected_projids.sort()
print(', '.join(affected_projids))

# Now that the report has been built, store missing IDs as empty strings.
rosmap_updated_ids_df.loc[missing_id_mask, 'individualID'] = ''

# ... and push back to Synapse
rosmap_updated_ids_df = syn.store(synapseclient.Table(rosmap_sm_table.tableId, rosmap_updated_ids_df))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment