@nilesh-tawari
Last active March 1, 2023 07:43
Pandas_cheatsheet.py
import os
import numpy as np
import pandas as pd
# fix SettingWithCopyWarning ("A value is trying to be set on a copy of a slice from a DataFrame") -- use either of the following
pd.options.mode.chained_assignment = None  # default='warn'
df.is_copy = False  # per-DataFrame flag; deprecated in newer pandas
# read big csv
df = pd.read_csv(FILE_PATH, sep='\t', comment='#', chunksize=1000,
                 low_memory=False, iterator=True, compression='gzip')
df = pd.concat(list(df), ignore_index=True)
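# if the file is too big to concatenate in memory, a chunk-by-chunk sketch (process() is a hypothetical per-chunk step):
for chunk in pd.read_csv(FILE_PATH, sep='\t', comment='#', chunksize=1000, compression='gzip'):
    process(chunk)  # hypothetical: filter/aggregate each chunk and write the result out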
# select rows if values in list
df = df.loc[df['COLUMN_NAME'].isin(LIST_PATTERN)]
df = df[df['COL1'].isin(LIST_PATTERN) & df['COL2'].isin(LIST_PATTERN)]
# select rows if value == string
df = df.loc[df['COL_NAME'] == 'STRING']
# select rows if not null
df = df[df['COL_NAME'].notnull()]
# select rows containing string
df = df[df['COL_NAME'].str.contains('STRING|STRING', na=False)]
# Merge concat
df_merged = pd.concat([df_1, df_2], ignore_index=True)
# merge dataframes
df_merged = pd.merge(df_1, df_2, left_on='COL1', right_on='COL2', how='outer', suffixes=['', '2'])
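# optional sanity check on the merge above (sketch): indicator=True adds a '_merge' column
# telling whether each row came from the left frame, the right frame, or both
df_merged = pd.merge(df_1, df_2, left_on='COL1', right_on='COL2',
                     how='outer', suffixes=['', '2'], indicator=True)
print(df_merged['_merge'].value_counts())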
# working with text in column
df['COL_NAME'] = df.COL_NAME.str.split('.', expand=True)[0]
# fillna
df.fillna(value='-', inplace = True)
# create columns conditionally with apply
df['A'] = df.apply(
    lambda row: 'STRING' if row['B'] != '-' else '',
    axis=1)
df['c'] = df.apply(
    lambda row: row['a'] * row['b'] if np.isnan(row['c']) else row['c'],
    axis=1)
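# vectorized alternative to the apply above (sketch, same column names assumed):
# fill missing 'c' with the product of 'a' and 'b' without a row-wise lambda
df['c'] = df['c'].fillna(df['a'] * df['b'])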
# drop columns
df.drop(['COL_NAMES'], axis=1, inplace=True)
# sort within group (order rows by SORT_COL inside each GROUP_COL group)
df = df.groupby(['GROUP_COL']).apply(lambda x: x.sort_values(['SORT_COL'], ascending=False)).reset_index(drop=True)
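# related sketch (same GROUP_COL/SORT_COL placeholders): keep only the top 3 rows per group
df = (df.sort_values(['GROUP_COL', 'SORT_COL'], ascending=[True, False])
        .groupby('GROUP_COL')
        .head(3))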
# rename cols
df.rename(columns={'FROM_COL':'TO_COL'}, inplace=True)
# write csv file
df.to_csv('FILE_PATH', sep='\t', index=False)
# insert column at position
df.insert(idx, col_name, value)
# create col based on condition
def f(row):
    val = ''
    if row['COL_1'] == '-' and row['COL2'] == 0:
        val = 'SOME_VAL'
    return val
df['COL_3'] = df.apply(f, axis=1)
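# vectorized alternative to the row-wise function above (sketch, same columns assumed)
df['COL_3'] = np.where((df['COL_1'] == '-') & (df['COL2'] == 0), 'SOME_VAL', '')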
# read excel
df = pd.read_excel(EXCEL, sheet_name='SHEET1', skiprows=2, header=1)
# concat a list of dfs column-wise and drop duplicated columns
df = pd.concat(dfs, axis=1)
df = df.loc[:, ~df.columns.duplicated()]
# replace
df['COL'].replace('FROM', 'TO', inplace=True)
# drop dups
df.drop_duplicates(subset=['COL'], inplace=True)
# sort
df.sort_values(by=['COL'], inplace=True)
# from dict to df
df = pd.DataFrame(list(d.items()), columns=['COL1', 'COL2'])
# groupby and join in list
df = df.groupby('COL', as_index=False).aggregate(lambda x: ', '.join(list(x)))
df = df.groupby(['a','b']).apply(lambda x: [list(x['c']), list(x['d'])]).apply(pd.Series)
df.columns =['a','b','c','d']
# add suffix
df = df.add_suffix('_some_suffix')
# split and stack list in a column
s = df['COL'].apply(pd.Series).stack().reset_index()
s.index = s.level_0
del s['level_0']
del s['level_1']
df = df.join(s)
# plot histograms
df.hist(column='COLS', bins=50)
# astype
df['COL_3'] = df['COL_3'].astype(int, errors='ignore')
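# alternative sketch: coerce unparseable values to NaN instead of silently ignoring the cast
df['COL_3'] = pd.to_numeric(df['COL_3'], errors='coerce')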
# apply
df.apply(lambda x : str(x['COL1']) + x['COL2'], 1)
# split col in 2
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
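# vectorized alternative (sketch): str.split with expand=True returns the pieces as columns
df[['new_col1', 'new_col2']] = df['original_col'].str.split(': ', n=1, expand=True)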
# split list in a col to 2 cols
df[['COL1','COL2']] = pd.DataFrame(df.COL.values.tolist(), index= df.index)
# pivot
df = df.pivot(index='COL1', columns='COL2', values='COL3')
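# if (COL1, COL2) pairs repeat, pivot raises; pivot_table aggregates duplicates instead (sketch)
df = df.pivot_table(index='COL1', columns='COL2', values='COL3', aggfunc='mean')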
# OrderedDict to DataFrame
from collections import OrderedDict
ordered_dict = OrderedDict(zip(LIST1, LIST2))
df = pd.DataFrame.from_dict(ordered_dict, orient='index').transpose()
df1 = pd.concat({k: pd.Series(v) for k, v in ordered_dict.items()})
# file exists
os.path.exists(FILE_PATH)
# writing excel
def set_format(df, worksheet1):
    '''
    Set column widths in the Excel sheet based on the longest entry in each column,
    falling back to the header width for very long columns.
    '''
    for i, col in enumerate(df.columns):
        column_len = df[col].astype(str).str.len().max()
        column_len = max(column_len, len(col)) + 2
        if column_len > 20:
            column_len = len(col) + 2
        worksheet1.set_column(i, i, column_len)

if not os.path.exists(FILE_NAME):
    writer = pd.ExcelWriter(FILE_NAME, engine='xlsxwriter')
    df.to_excel(writer, sheet_name=SHEET_NAME, index=False, startrow=2)  # , float_format="%.2g"
    workbook = writer.book
    worksheet1 = writer.sheets[SHEET_NAME]
    worksheet1.set_zoom(110)
    set_format(df, worksheet1)
    writer.save()  # writer.close() on pandas >= 2.0
    print('Completed writing report !!!')
else:
    print('Report exists !!!')
#Conditional format excel: http://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
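# minimal conditional-format sketch (assumes the workbook/worksheet objects created above;
# the cell range and threshold are illustrative only)
red_format = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
worksheet1.conditional_format('A3:A100', {'type': 'cell', 'criteria': '>',
                                          'value': 100, 'format': red_format})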
# access API
import requests
site = 'http://oncokb.org/api/v1/evidences/lookup?source=oncotree'
response = requests.get(site)
df = pd.json_normalize(response.json())  # pandas >= 1.0; older versions: from pandas.io.json import json_normalize
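# sketch for nested JSON (the 'items', 'id' and 'source' fields are hypothetical):
# flatten a list of records while keeping selected top-level fields as extra columns
data = response.json()
df = pd.json_normalize(data, record_path=['items'], meta=['id', 'source'], errors='ignore')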
Useful snippets for data analysis in pandas
