@nilesh-tawari
Last active March 1, 2023 07:43
Pandas_cheatsheet.py
import os
import numpy as np
import pandas as pd
# fix SettingWithCopyWarning ("A value is trying to be set on a copy of a slice from a DataFrame") -- use either of the following
pd.options.mode.chained_assignment = None  # default='warn'
df.is_copy = False  # per-DataFrame flag; deprecated in newer pandas
# read big csv
df = pd.read_csv(FILE_PATH, sep='\t', comment='#', chunksize=1000,
                 low_memory=False, iterator=True, compression='gzip')
df = pd.concat(list(df), ignore_index=True)
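# if the file is too big to concatenate in memory, a chunk-by-chunk sketch (process() is a hypothetical per-chunk step):
for chunk in pd.read_csv(FILE_PATH, sep='\t', comment='#', chunksize=1000, compression='gzip'):
    process(chunk)  # hypothetical: filter/aggregate each chunk and write the result out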
# select rows if values in list
df = df.loc[df['COLUMN_NAME'].isin(LIST_PATTERN)]
df = df[df['COL1'].isin(LIST_PATTERN) & df['COL2'].isin(LIST_PATTERN)]
# select rows if value == string
df = df.loc[df['COL_NAME'] == 'STRING']
# select rows if not null
df = df[df['COL_NAME'].notnull()]
# select rows containing string
df = df[df['COL_NAME'].str.contains('STRING|STRING', na=False)]
# Merge concat
df_merged = pd.concat([df_1, df_2], ignore_index=True)
# merge dataframes
df_merged = pd.merge(df_1, df_2, left_on='COL1', right_on='COL2', how='outer', suffixes=['', '2'])
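# optional sanity check on the merge above (sketch): indicator=True adds a '_merge' column
# telling whether each row came from the left frame, the right frame, or both
df_merged = pd.merge(df_1, df_2, left_on='COL1', right_on='COL2',
                     how='outer', suffixes=['', '2'], indicator=True)
print(df_merged['_merge'].value_counts())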
# working with text in column
df['COL_NAME'] = df.COL_NAME.str.split('.', expand=True)[0]
# fillna
df.fillna(value='-', inplace = True)
# create columns conditionally with apply
df['A'] = df.apply(
    lambda row: 'STRING' if row['B'] != '-' else '',
    axis=1)
df['c'] = df.apply(
    lambda row: row['a'] * row['b'] if np.isnan(row['c']) else row['c'],
    axis=1)
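# vectorized alternative to the apply above (sketch, same column names assumed):
# fill missing 'c' with the product of 'a' and 'b' without a row-wise lambda
df['c'] = df['c'].fillna(df['a'] * df['b'])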
# drop columns
df.drop(['COL_NAMES'], axis=1, inplace=True)
# sort within group (order rows by SORT_COL inside each GROUP_COL group)
df = df.groupby(['GROUP_COL']).apply(lambda x: x.sort_values(['SORT_COL'], ascending=False)).reset_index(drop=True)
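# related sketch (same GROUP_COL/SORT_COL placeholders): keep only the top 3 rows per group
df = (df.sort_values(['GROUP_COL', 'SORT_COL'], ascending=[True, False])
        .groupby('GROUP_COL')
        .head(3))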
# rename cols
df.rename(columns={'FROM_COL':'TO_COL'}, inplace=True)
# write csv file
df.to_csv('FILE_PATH', sep='\t', index=False)
# insert column at position
df.insert(idx, col_name, value)
# create col based on condition
def f(row):
    val = ''
    if row['COL_1'] == '-' and row['COL2'] == 0:
        val = 'SOME_VAL'
    return val
df['COL_3'] = df.apply(f, axis=1)
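# vectorized alternative to the row-wise function above (sketch, same columns assumed)
df['COL_3'] = np.where((df['COL_1'] == '-') & (df['COL2'] == 0), 'SOME_VAL', '')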
# read excel
df = pd.read_excel(EXCEL, sheet_name='SHEET1', skiprows=2, header=1)
# concat a list of dfs column-wise and drop duplicated columns
df = pd.concat(dfs, axis=1)
df = df.loc[:, ~df.columns.duplicated()]
# replace
df['COL'].replace('FROM', 'TO', inplace=True)
# drop dups
df.drop_duplicates(subset=['COL'], inplace=True)
# sort
df.sort_values(by=['COL'], inplace=True)
# from dict to df
df = pd.DataFrame(list(d.items()), columns=['COL1', 'COL2'])
# groupby and join in list
df = df.groupby('COL', as_index=False).aggregate(lambda x: ', '.join(list(x)))
df = df.groupby(['a','b']).apply(lambda x: [list(x['c']), list(x['d'])]).apply(pd.Series)
df.columns =['a','b','c','d']
# add suffix
df = df.add_suffix('_some_suffix')
# split and stack list in a column
s = df['COL'].apply(pd.Series).stack().reset_index()
s.index = s.level_0
del s['level_0']
del s['level_1']
df = df.join(s)
# plot histograms
df.hist(column='COLS', bins=50)
# astype
df['COL_3'] = df['COL_3'].astype(int, errors='ignore')
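# alternative sketch: coerce unparseable values to NaN instead of silently ignoring the cast
df['COL_3'] = pd.to_numeric(df['COL_3'], errors='coerce')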
# apply
df.apply(lambda x : str(x['COL1']) + x['COL2'], 1)
# split col in 2
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
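# vectorized alternative (sketch): str.split with expand=True returns the pieces as columns
df[['new_col1', 'new_col2']] = df['original_col'].str.split(': ', n=1, expand=True)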
# split list in a col to 2 cols
df[['COL1','COL2']] = pd.DataFrame(df.COL.values.tolist(), index= df.index)
# pivot
df = df.pivot(index='COL1', columns='COL2', values='COL3')
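# if (COL1, COL2) pairs repeat, pivot raises; pivot_table aggregates duplicates instead (sketch)
df = df.pivot_table(index='COL1', columns='COL2', values='COL3', aggfunc='mean')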
# OrderedDict to DataFrame
from collections import OrderedDict
ordered_dict = OrderedDict(zip(LIST1, LIST2))
df = pd.DataFrame.from_dict(ordered_dict, orient='index').transpose()
df1 = pd.concat({k: pd.Series(v) for k, v in ordered_dict.items()})
# file exists
os.path.exists(FILE_PATH)
# writing excel
def set_format(df, worksheet1):
    '''
    Set column widths in the Excel sheet based on the longest entry in each column,
    falling back to the header width for very long columns.
    '''
    for i, col in enumerate(df.columns):
        column_len = df[col].astype(str).str.len().max()
        column_len = max(column_len, len(col)) + 2
        if column_len > 20:
            column_len = len(col) + 2
        worksheet1.set_column(i, i, column_len)

if not os.path.exists(FILE_NAME):
    writer = pd.ExcelWriter(FILE_NAME, engine='xlsxwriter')
    df.to_excel(writer, sheet_name=SHEET_NAME, index=False, startrow=2)  # , float_format="%.2g"
    workbook = writer.book
    worksheet1 = writer.sheets[SHEET_NAME]
    worksheet1.set_zoom(110)
    set_format(df, worksheet1)
    writer.save()  # writer.close() on pandas >= 2.0
    print('Completed writing report !!!')
else:
    print('Report exists !!!')
#Conditional format excel: http://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
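# minimal conditional-format sketch (assumes the workbook/worksheet objects created above;
# the cell range and threshold are illustrative only)
red_format = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
worksheet1.conditional_format('A3:A100', {'type': 'cell', 'criteria': '>',
                                          'value': 100, 'format': red_format})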
# access API
import requests
site = 'http://oncokb.org/api/v1/evidences/lookup?source=oncotree'
response = requests.get(site)
df = pd.json_normalize(response.json())  # pandas >= 1.0; older versions: from pandas.io.json import json_normalize
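# sketch for nested JSON (the 'items', 'id' and 'source' fields are hypothetical):
# flatten a list of records while keeping selected top-level fields as extra columns
data = response.json()
df = pd.json_normalize(data, record_path=['items'], meta=['id', 'source'], errors='ignore')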
Useful snippets for data analysis in pandas
