Skip to content

Instantly share code, notes, and snippets.

@prerakmody
Last active June 26, 2019 13:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prerakmody/7b4d852d0f9d82225298e611b351615a to your computer and use it in GitHub Desktop.
Save prerakmody/7b4d852d0f9d82225298e611b351615a to your computer and use it in GitHub Desktop.
Pandas Hacks
import pdb; #pdb.set_trace()

import numpy as np
import pandas as pd
from IPython.display import display
# 1. Kick Off
# Load the CSV and take a first look at its rows, columns and distinct values.
df = pd.read_csv('myfile.csv')
print (df.head())
print (df.head(n=10))
print (df.columns)
col_Y = ''  # placeholder; presumably the name of the target column - TODO confirm
for col in df.columns:
    # Show the distinct values found in each column.
    print (' - Col : ', col, ' || Unique Vals : ', df[col].unique())
display(df)
## 2. Handling nan values
# Report missing values BEFORE dropping them; dropping first would leave
# nothing for the checks below to find.
nan_row_mask = df.isnull().any(axis=1)
# Positional row numbers (not index labels), so they can be fed to .iloc.
nanidxs = np.flatnonzero(nan_row_mask.values)
print ('Total NaN vals : ', len(df) - len(df.dropna()), ' || NaN Idxs : ', nanidxs)
print ('Data with NaN vals : ', df.iloc[nanidxs])
for col in df.columns:
    # Count of rows where this column is missing.
    tmp_len = int(df[col].isnull().sum())
    if tmp_len:
        print ('Col : ', col, ' || NaN Rows :', tmp_len)
# True when at least one cell anywhere in the frame is missing.
bool_NaN = df.isnull().values.any()
# Finally remove every row that contains a missing value.
df.dropna(inplace=True)
## 3. Dropping rows/cols
# Drop rows by value first: the filter reads 'col1', so it has to run
# before that column is removed from the frame.
df.drop(df[df['col1'] == 'col1_value'].index, axis=0, inplace=True)
# Then drop the column itself.
df.drop('col1', axis=1, inplace=True)
# 4. Studying columns
def getCounter(df, col):
    """Print how often each distinct value occurs in one column.

    Args:
        df: DataFrame to inspect.
        col: Label of the column whose values are counted.

    The output is a list of ``(value, count)`` pairs in the order produced
    by ``Series.value_counts()``. Nothing is returned.
    """
    tmp = df[col].value_counts()
    # tolist() converts the counts to plain Python ints before printing.
    print (' --> ', list(zip(tmp.keys(), tmp.tolist())))
# Pass the column label as a string; a bare col1 name is not defined anywhere.
getCounter(df, 'col1')
## 5. Convert to matrix
# .values gives the frame as a NumPy array (DataFrame.as_matrix() no longer
# exists in pandas >= 1.0).
data = df.values
X = data[:,:-1] # every column except the last - for ML purposes
Y = data[:,-1] # last column only - for ML purposes
## 6. GroupBy commands
# Non-null 'col2' entries per 'col1' group.
stats1 = df.groupby(['col1'])['col2'].count()
# Per-group sum and count, largest count first. The groupby is required:
# aggregating a plain Series with a list yields a Series, which cannot be
# sorted by a column name.
# NOTE(review): aggregating 'col2' is an assumption - substitute the column you need.
df.groupby('col1')['col2'].agg(['sum','count']).sort_values('count', ascending=False)
# Different aggregation per column, grouped on two keys.
df.groupby(['col1', 'col2']).agg({'col3':'count', 'col4':'sum'})
# The ten groups with the highest counts.
df.groupby('col1')['col1'].agg(['count']).nlargest(10, 'count')
# Two-key group sizes, sorted from largest to smallest.
df.groupby(['col1', 'col2']).aggregate({'col3':'count'}).sort_values('col3', ascending=False)
## 7. Datetime
# Parse 'col1' into datetimes, then expose the hour component as 'hrs'.
timestamps = pd.to_datetime(df['col1'])
df['col1'] = timestamps
df['hrs'] = timestamps.dt.hour
## 8. Categorical/One hot encoding
# Integer codes for each category.
df['col1'] = df['col1'].astype("category").cat.codes
# Categorical dtype for the column.
df['col1'] = pd.Categorical(df['col1'])
# One indicator column per distinct value, named 'col1_<value>'.
df1 = pd.get_dummies(df['col1'], prefix = 'col1')
print (' - Col :', 'col1', ' || Extra cols added : ', len(df1.columns))
df = pd.concat([df, df1], axis=1)
# Flag rows whose 'col1' value is NOT listed in some_list.
# NOTE(review): some_list is not defined in this snippet - supply it before running.
col1_unique = np.array(df['col1'].unique().tolist())
# The mask must be built over col1_unique (the array being filtered), not over some_list.
col1_not = col1_unique[~np.isin(col1_unique, some_list)]
df['col1_none'] = np.where(np.isin(df['col1'], col1_not), 1, 0)
## 9. Finding/Indexing
# Binary flag column: 1 where 'col1' equals the value of interest, else 0.
is_target = df['col1'] == 'col1_value'
df['Y'] = np.where(is_target, 1, 0)
# Work on a copy of 'col1' so the source column is left as it was.
df['col2'] = df['col1'].copy()
# Index labels of the rows whose 'col2' equals 'val2'.
# NOTE(review): some_var is not defined in this snippet - supply it before running.
idxs = df.index[(df['col2'] == 'val2').values].tolist()
scaled = df.loc[idxs, 'col1']*some_var
df.loc[idxs, 'col2'] = scaled
## 10. Misc
import seaborn as sns

# Pairwise column correlations, drawn as a heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap = sns.cm.vlag)
## 11. Rearranging Columns
# Move 'Y' to the last position while keeping the other columns in order.
cols = list(df.columns)
cols.remove('Y')
df = df.loc[:, cols + ['Y']]
## 12. Writing to a .csv
# Seven rows holding a single column of ones, written without index or header.
tmp = np.array([[1]] * 7)
df = pd.DataFrame(tmp)
df.to_csv('file.csv', header=False, index=False, sep=',')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment