Skip to content

Instantly share code, notes, and snippets.

@diitaz93
Created July 15, 2022 11:11
Show Gist options
  • Save diitaz93/22b8a110247b3fbaaabcbe670aeecbb1 to your computer and use it in GitHub Desktop.
Save diitaz93/22b8a110247b3fbaaabcbe670aeecbb1 to your computer and use it in GitHub Desktop.
Useful python functions for data analysis
""" GLOBAL functions: Functions used over several projects.
* group_with_freq
Author: Juan Sebastian Diaz Boada
juansdb93@gmail.com
"""
import numpy as np
import pandas as pd
from collections import defaultdict
#---------------------------------------------------------------------------------------------------#
def group_with_freq(df,col,group_unique=False,new_name=None):
    """ Groups identical values and calculates their frequency, returning an updated dataframe.

    Calculates the frequency of each value in the column named 'col' from
    dataframe 'df', adding it into a column named 'freq_'+col. It also assigns a group number
    to each distinct value in the column 'group_'+col. Group numbers start at 0 and follow the
    order of the sorted output (most frequent value first). If the parameter 'group_unique'
    is True, all values that appear only once share a single group labelled -1. The suffix
    of the new columns is by default the name of 'col', but can be changed by passing a string
    as the 'new_name' parameter.

    Missing values (NaN/None/NA) in 'col' are not counted: they receive <NA> in both the
    frequency and the group column.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with the column of values to group. It is not modified.
    col : string
        Name of the column holding the values to analyze.
    group_unique : bool, optional
        Whether to group values that only appear once in a group labelled '-1' or maintain each
        unique element as a separate group. Default is False.
    new_name : string, optional
        Suffix of the name of the new columns, instead of using the name of the old column.
        Default is None.

    Returns
    -------
    pd.DataFrame
        Copy of 'df' with additional columns for group number and frequency, sorted by
        frequency and then by value, both in descending order (rows with missing values last).
    """
    suffix = col if new_name is None else new_name
    freq_col_name = 'freq_' + suffix
    group_col_name = 'group_' + suffix

    out = df.copy()
    # value_counts() ignores missing values, so they map to NaN -> <NA> in the nullable int column.
    counts = out[col].value_counts()
    out[freq_col_name] = out[col].map(counts).astype(pd.Int64Dtype())
    out = out.sort_values(by=[freq_col_name, col], ascending=False)

    values = out[col]
    if group_unique:
        # Only values seen more than once get their own group; <NA> frequencies count as False.
        repeated = out[freq_col_name].gt(1).to_numpy(dtype=bool, na_value=False)
        seqs = values[repeated].unique()
    else:
        seqs = values.unique()
    # unique() keeps order of appearance, so numbering follows the sorted order.
    seq2idx = {value: idx for idx, value in enumerate(seqs)}

    # Use a real missing-value mask: testing isinstance(value, float) would wrongly flag
    # every entry of a float column as missing.
    missing = values.isna().to_numpy()
    # Values absent from seq2idx can only be singletons in group_unique mode -> label -1.
    group = np.array([pd.NA if is_missing else seq2idx.get(value, -1)
                      for value, is_missing in zip(values, missing)])
    out[group_col_name] = group  # Add group number to the dataframe
    return out
#---------------------------------------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment