Skip to content

Instantly share code, notes, and snippets.

@shubhamagarwal92
Last active September 14, 2021 14:01
Show Gist options
  • Save shubhamagarwal92/13e2c41d09156c3810740d7697a883d1 to your computer and use it in GitHub Desktop.
Save shubhamagarwal92/13e2c41d09156c3810740d7697a883d1 to your computer and use it in GitHub Desktop.
Pandas helper functions for analysis
import pandas as pd
def read_json_to_df(file_path, lines=False):
    """Load a JSON file of records into a DataFrame.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_json``.
        lines: When True, parse the input as JSON Lines (one record per
            line), which is the format ``write_df_to_json`` in this file
            emits. Defaults to False, i.e. a single JSON array of records.

    Returns:
        The DataFrame produced by ``pd.read_json`` with ``orient='records'``.
    """
    return pd.read_json(path_or_buf=file_path, orient='records', lines=lines)
def read_json_list_to_df(json_list):
    """Build a DataFrame from ``json_list`` via ``DataFrame.from_records``."""
    return pd.DataFrame.from_records(data=json_list)
def count_unique(df, col_name):
    """Return ``Series.nunique()`` (default settings) for column ``col_name``."""
    column = df[col_name]
    return column.nunique()
def get_unique_column_values(df, col_name):
    """Return the result of ``Series.unique()`` for column ``col_name``."""
    column = df[col_name]
    distinct = column.unique()
    return distinct
def get_column_stats(df, column_name, to_dict=False):
    """Summarise how often each value occurs in ``df[column_name]``.

    NOTE: this file defines ``get_column_stats`` a second time further down;
    the later definition replaces this one when the module is imported.

    Args:
        df: DataFrame holding the column.
        column_name: Column to summarise.
        to_dict: When True, return ``value_counts().to_dict()`` instead of
            the two-column table.

    Returns:
        A dict of counts when ``to_dict`` is true; otherwise a DataFrame with
        a 'counts' column and a '%' column (the normalised counts times 100),
        both computed with ``dropna=False``.
    """
    column = df[column_name]
    if to_dict:
        return column.value_counts().to_dict()

    counts = column.value_counts(dropna=False)
    percentages = column.value_counts(dropna=False, normalize=True) * 100
    return pd.concat([counts, percentages], axis=1, keys=['counts', '%'])
def get_pandas_percentile(df, col_name='words', percentiles=None):
    """Return descriptive statistics, including deciles, for one column.

    The computed statistics are returned to the caller rather than being
    discarded.

    Args:
        df: DataFrame holding the column.
        col_name: Column to describe. Defaults to 'words'.
        percentiles: Percentiles (fractions in [0, 1]) passed to
            ``Series.describe``. Defaults to every decile from 0.0 to 1.

    Returns:
        The Series produced by ``df[col_name].describe(percentiles=...)``.
    """
    if percentiles is None:
        percentiles = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    return df[col_name].describe(percentiles=percentiles)
def flatten_json_column(df, col_name='utterance'):
    """Expand a column of (possibly nested) dicts into flat columns.

    Args:
        df: DataFrame whose ``col_name`` column holds dict-like records.
        col_name: Column to flatten. Defaults to 'utterance'.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which ``col_name`` is
        replaced by the columns produced by ``pd.json_normalize``. The input
        frame is left untouched.
    """
    # pd.json_normalize is used because a bare ``json_normalize`` name is not
    # imported anywhere in this module.
    flattened = pd.json_normalize(df[col_name].tolist())
    # Reset on a copy so the positional join lines up without mutating the
    # caller's DataFrame.
    base = df.reset_index(drop=True)
    return base.join(flattened).drop(col_name, axis=1)
def get_column_stats(df, column_name, to_dict=False):
    """Return the value counts of ``df[column_name]``.

    NOTE: this redefinition supersedes the earlier ``get_column_stats`` in
    this file, so this is the version other functions actually call.

    Args:
        df: DataFrame holding the column.
        column_name: Column to count.
        to_dict: When True, convert the counts to a plain dict.

    Returns:
        The ``value_counts()`` Series, or its ``to_dict()`` form.
    """
    counts = df[column_name].value_counts()
    return counts.to_dict() if to_dict else counts
def findFiles(path):
    """Return the list of filesystem paths matching the glob pattern ``path``.

    Args:
        path: A glob pattern such as ``'data/*.json'``.

    Returns:
        A list of matching path strings (empty when nothing matches).
    """
    # Imported locally because this module has no top-level ``import glob``.
    import glob

    return glob.glob(path)
def get_column_names(df):
    """Return the column labels of ``df`` as ``df.columns.values``."""
    labels = df.columns
    return labels.values
def get_value_row_column(df, index, column_name):
    """Return the single value stored at row label ``index``, column ``column_name``.

    Uses the label-based scalar accessor ``DataFrame.at`` because
    ``DataFrame.get_value`` no longer exists in pandas 1.0 and later.

    Args:
        df: DataFrame to read from.
        index: Row label.
        column_name: Column label.

    Returns:
        The scalar at that position.

    Raises:
        KeyError: If the row or column label is not present.
    """
    return df.at[index, column_name]
def flatten_dic_column(df, col_name):
    """Replace ``col_name`` with the columns obtained by expanding its values.

    Each value of the column is turned into a ``pd.Series`` and the resulting
    columns are placed to the right of the remaining original columns.

    Args:
        df: Source DataFrame (not modified).
        col_name: Column whose values are expanded.

    Returns:
        A new DataFrame without ``col_name`` and with the expanded columns.
    """
    remaining = df.drop([col_name], axis=1)
    expanded = df[col_name].apply(pd.Series)
    return pd.concat([remaining, expanded], axis=1)
def append_df(df, df_to_append, ignore_index=True):
    """Return a new DataFrame with ``df_to_append`` added below ``df``.

    Implemented with ``pd.concat`` because ``DataFrame.append`` was removed
    in pandas 2.0. Neither input is modified.

    Args:
        df: Base DataFrame.
        df_to_append: Rows to add. Normally a DataFrame; a dict or Series is
            converted to a single-row frame, and a list/tuple is treated as
            several DataFrames or, failing that, passed to ``pd.DataFrame``.
        ignore_index: When True (default), the result gets a fresh
            0..n-1 index; otherwise the original row labels are kept.

    Returns:
        The concatenated DataFrame.
    """
    if isinstance(df_to_append, dict):
        extra = [pd.DataFrame([df_to_append])]
    elif isinstance(df_to_append, pd.Series):
        extra = [df_to_append.to_frame().T]
    elif isinstance(df_to_append, (list, tuple)):
        if all(isinstance(item, pd.DataFrame) for item in df_to_append):
            extra = list(df_to_append)
        else:
            extra = [pd.DataFrame(df_to_append)]
    else:
        extra = [df_to_append]
    return pd.concat([df, *extra], ignore_index=ignore_index)
def write_df_to_csv(df, outputFilePath):
    """Write ``df`` to ``outputFilePath`` as tab-separated text.

    The header row is written, the index is omitted, and '"' is the quote
    character.
    """
    df.to_csv(
        path_or_buf=outputFilePath,
        sep='\t',
        quotechar='"',
        header=True,
        index=False,
    )
def write_df_to_json(df, outputFilePath):
    """Write ``df`` to ``outputFilePath`` as JSON Lines (one record per line)."""
    json_options = {'orient': 'records', 'lines': True}
    df.to_json(path_or_buf=outputFilePath, **json_options)
def save_df_pickle(df, output_file):
    """Serialise ``df`` to ``output_file`` with ``DataFrame.to_pickle``."""
    df.to_pickle(path=output_file)
def get_unique_column_values(df, col_name):
    """Return ``Series.unique()`` for the ``col_name`` column of ``df``."""
    selected = df[col_name]
    return selected.unique()
def count_unique(df, col_name):
    """Return the ``nunique()`` count for the ``col_name`` column of ``df``."""
    return df[col_name].nunique()
def print_analysis(_list, key="relevance", _type="relevance"):
    """Print a three-part summary of the values in ``_list``.

    ``_list`` is loaded into a DataFrame whose columns are renamed to
    ``[key]``, so it must produce exactly one column. The report then prints,
    each under a separator line: the unique-value count, the value-count
    stats from ``get_column_stats``, and ``describe()`` for that column.

    Args:
        _list: Values to analyse.
        key: Name given to the single column.
        _type: Label substituted into the printed headings.
    """
    separator = "\n-----------------------------------"

    frame = pd.DataFrame(_list)
    frame.columns = [key]

    # Each section is (heading template, deferred computation) so the values
    # are computed in the same order in which they are printed.
    sections = (
        ("Total unique {} responses", lambda: count_unique(frame, key)),
        ("Stats for {} responses", lambda: get_column_stats(frame, key)),
        ("Number of {} responses", lambda: frame[key].describe()),
    )
    for heading, compute in sections:
        print(separator)
        print(heading.format(_type))
        print(compute())
# Binarize
# https://datascience.stackexchange.com/questions/11797/split-a-list-of-values-into-columns-of-a-dataframe
# Fill na
# https://datascience.stackexchange.com/questions/15924/how-can-i-fill-nan-values-in-a-pandas-data-frame?rq=1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment