Created
July 15, 2022 11:11
-
-
Save diitaz93/22b8a110247b3fbaaabcbe670aeecbb1 to your computer and use it in GitHub Desktop.
Useful python functions for data analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" GLOBAL functions: helper functions shared across several projects.
* group_with_freq
Author: Juan Sebastian Diaz Boada
juansdb93@gmail.com
"""
import numpy as np | |
import pandas as pd | |
from collections import defaultdict | |
#---------------------------------------------------------------------------------------------------# | |
def group_with_freq(df, col, group_unique=False, new_name=None):
    """Group identical values of a column and annotate rows with their frequency.

    Works on a copy of 'df'. Two columns are added:

    * 'freq_<suffix>'  : number of rows holding the same value in 'col'
      (nullable integer; <NA> where the value in 'col' is missing).
    * 'group_<suffix>' : group number of the value. Numbers start at 0 and are
      assigned in the order of the sorted result, so group 0 is the most
      frequent value. Rows whose value in 'col' is missing get <NA>.

    The result is sorted by frequency and then by value, both descending.
    '<suffix>' is 'col' unless 'new_name' is given.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with the column of values to group. It is not modified.
    col : string
        Name of the column holding the values to analyze.
    group_unique : bool, optional
        Whether to put all values that appear only once into a single group
        labelled -1, instead of giving each of them its own group number.
        Default is False.
    new_name : string, optional
        Suffix for the names of the new columns, used instead of 'col'.
        Default is None.

    Returns
    -------
    pd.DataFrame
        Sorted copy of 'df' with the additional frequency and group columns.
    """
    DF = df.copy()
    suffix = col if new_name is None else new_name
    freq_col_name = 'freq_' + suffix
    group_col_name = 'group_' + suffix

    # value_counts() ignores missing values, so they map to NaN and become <NA>
    # once the column is cast to the nullable integer dtype.
    counts = DF[col].value_counts()
    DF[freq_col_name] = DF[col].map(counts).astype(pd.Int64Dtype())
    DF = DF.sort_values(by=[freq_col_name, col], ascending=False)

    values = DF[col]
    if group_unique:
        # Values that are not numbered below fall back to group -1.
        seq2idx = defaultdict(lambda: -1)
        # After sorting, the repeated values occupy the first rows. Rows with a
        # missing frequency compare as <NA> and are not counted by sum().
        n_rep = int((DF[freq_col_name] > 1).sum())
        seqs = values.iloc[:n_rep].unique()
    else:
        seq2idx = {}
        seqs = values.unique()
    for number, value in enumerate(seqs):
        seq2idx[value] = number

    # Detect missing entries with isna() on the column itself. Testing
    # isinstance(value, float) would also discard every real float value, and
    # row-wise iteration does not preserve the dtype of the column.
    is_missing = values.isna().to_numpy()
    group = np.array([
        pd.NA if missing else seq2idx[value]
        for value, missing in zip(values, is_missing)
    ])
    DF[group_col_name] = group  # positional assignment: follows the sorted order
    return DF
#---------------------------------------------------------------------------------------------------# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment