Skip to content

Instantly share code, notes, and snippets.

@garland3
Last active February 14, 2023 16:50
Show Gist options
  • Save garland3/ca13afbe9068e4bfa22ff559cabdbd59 to your computer and use it in GitHub Desktop.
Save garland3/ca13afbe9068e4bfa22ff559cabdbd59 to your computer and use it in GitHub Desktop.
check_duplicate_columns
def check_duplicate_columns(df):
    """Report duplicated column names in ``df`` and compare their values.

    For every column name that occurs more than once, each occurrence is
    compared with the previous occurrence of the same name and a message is
    printed saying whether the two columns hold the same values.

    Args:
        df: A pandas DataFrame whose column labels may contain repeats.

    Returns:
        A list with the integer position of the first occurrence of each
        duplicated column name, in order of first appearance. The list is
        empty when no column name is repeated.
    """
    # keep=False flags *every* occurrence of a repeated name, not just the
    # later ones, so the mask alone cannot be iterated name-by-name.
    is_duplicated = df.columns.duplicated(keep=False)
    if not is_duplicated.any():
        print("No duplicate columns found.")
        return []

    print(f"Duplicates found. {df.columns[is_duplicated].to_list()} ")

    # Group positions by name so each duplicated name is handled exactly once
    # (iterating the flagged labels directly would revisit a name once per
    # occurrence, repeating both the output and the returned index).
    positions_by_name = {}
    for position, (name, duplicated) in enumerate(zip(df.columns, is_duplicated)):
        if duplicated:
            positions_by_name.setdefault(name, []).append(position)

    col_idxs_to_drop = []
    for name, positions in positions_by_name.items():
        col_idxs_to_drop.append(positions[0])  # add the 1st one
        # Compare each occurrence with the one immediately before it.
        for previous_idx, current_idx in zip(positions, positions[1:]):
            prev_col = df.iloc[:, previous_idx]
            curr_col = df.iloc[:, current_idx]
            print(f"Col name = {name}")
            # NaN != NaN under ==, so rows missing in both columns are
            # explicitly counted as matching.
            both_missing = prev_col.isna() & curr_col.isna()
            if ((prev_col == curr_col) | both_missing).all():
                print(
                    f"Column {current_idx} is a duplicate of {previous_idx} "
                    "and has the same value for each row."
                )
            else:
                print(
                    f"Column {current_idx} is a duplicate of {previous_idx} "
                    "but has different values for each row."
                )
    return col_idxs_to_drop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment