-
-
Save chirag64/b4b408bcd8d9d070e237acd6ce2acd02 to your computer and use it in GitHub Desktop.
Cheat sheet for the python pandas library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
#### creating dataframes, adding and dropping columns | |
df = pd.DataFrame(np.arange(1,10).reshape(3,3),['A','B','C'],['w','x','y']) | |
df.columns = ['W','X','Y'] # change column names | |
df['Z']=df['X']+df['Y'] # new column with values X+Y | |
df['XX']=df.apply(lambda row: row['X']*2, axis=1) # new column with values twice of column X | |
df['YY']=1 # new column of ones | |
df.insert(2, column='D', value=100) # new column of '100's called 'D' at position 2 (3rd column) | |
df.drop('B',axis=0, inplace=True) # drop row | |
df.drop('Z',axis=1) # drop column | |
Z = df.pop('Z') # drop column and store series to a variable | |
#### selecting from dataframes | |
# select columns | |
df.X # column X (does not work when column name has spaces) | |
df['X'] # column X | |
df[['X','Y']] # columns X and Y | |
# select rows using loc and iloc | |
# can also use ix, but it's slightly tricky to use: https://stackoverflow.com/questions/31593201/pandas-iloc-vs-ix-vs-loc-explanation | |
# ix is useful for mixing usage of loc and iloc (use both labels and positions at the same time) | |
# returns a series if single row selected | |
df.loc['A'] # row A | |
df.loc['A':] # row A onwards | |
df.loc[:'X'] # until row X (inclusive!) | |
df.loc['A','X'] # row A, column X | |
df.loc[['A','doesnotexist']] # return row with NaN for doesnotexist, check if exists via "doesnotexist" in df.index | |
df.loc[['A','C'],['X','Y']] # rows A and C, columns X and Y | |
df.iloc[0] # row at position 0 | |
df.iloc[:,0:3] # column from position 0 to before 3 | |
df.iloc[[0,1],[0,1]] # rows 0 and 1, and columns 0 and 1 | |
#### broadcasting operations | |
df['X'].add(5) # == df['X'] + 5 | |
df['X'].sub(5) # == df['X'] - 5 == df['X'].subtract(5) | |
df['X'].mul(5) # == df['X'] * 5 == df['X'].multiply(5) | |
df['X'].div(5) # == df['X'] / 5 == df['X'].divide(5) | |
#### basic attributes and methods | |
# general data exploration | |
df.info() # type, index, columns, dtypes, memory usage - printed automatically | |
df.head() | |
df.tail() | |
df.sample(n=2) # return 2 random rows | |
df.sample(frac=.25) # return 25% random rows | |
df.nlargest(n=2, columns='X') # returns 2 rows for largest X | |
df.nsmallest(n=2, columns='X') # returns 2 rows for smallest X | |
df.index # RangeIndex | |
df.columns # nparray of column names | |
df.axes # index and columns | |
df.values # nparray of nparrays | |
df.dtypes # series | |
df.shape # tuple | |
df.get_dtype_counts() # count number of columns for each datatype | |
df.sum() # sums summable columns into a series (where column names become the index) | |
df.sum(axis = "columns") | |
# methods for columns | |
df.sort_values(by=['X','Y'], ascending=[False,True]) # sort df by column X (descending) then Y (ascending) | |
df['X'].isnull() # series of booleans | |
df['X'].rank() # rank values in column X in ascending order | |
df['X'].unique() # nparray: unique values from column X | |
df['X'].nunique() # number of unique values from column X, does not include null by default! | |
df['X'].value_counts() # returns count of each value from column X | |
df.drop_duplicates(subset = ['X'], keep ="first") # keep only first instance of X values | |
# index-related | |
df.set_index('X') # sets column X as the index | |
df.set_index('Y') # sets column Y as the index, removes column X! | |
df.reset_index() # resets index (removes current index, resets dataframe, inserts new index starting from 0) | |
df.sort_index() # important for optimization | |
# parse columns | |
df.astype(int) # convert to int | |
df['X'].astype("category") # convert to category (important for optimization) | |
pd.to_datetime(df['X']) # convert to date (important for optimization) | |
# rename columns | |
df.rename(columns = {'X': 'A', 'Y': 'B'}, inplace=True) | |
df.columns = ['X', 'Y', 'Z', 'XX', 'YY', 'ZZ'] | |
#### selecting with conditional operators | |
# create mask, which is a series of booleans | |
mask1 = df['X'] < 5 | |
mask2 = df['Y'].isin([4,5,6]) | |
mask3 = df['Y'].isnull() | |
mask4 = df['Y'].notnull() | |
mask5 = df['Y'].between(4,6) | |
mask6 = df['Y'].duplicated(keep = False) # return True if duplicated | |
mask7 = ~df['Y'].duplicated(keep = False) # return True if unique | |
# use mask to select from dataframe | |
df[mask1] # return rows where X<5 | |
df[mask1][['X','Y']] # return only columns X and Y | |
df[mask1 & mask2] | |
df[mask1 | mask2] | |
df[(mask1 | mask2) & mask3] | |
df.where(mask1) # return NaN for entire row when condition not met | |
# alernatively, use query (possibly better performance) | |
# ensure column names dont have spaces: use df.columns = [col.replace(" ","_") for col in df.columns] | |
df.query('X < 5') # return rows where X<5 | |
df.query('X in [4,5,6]') | |
df.query('X not in [4,5,6]') | |
#### missing values - dropna and fillna | |
df.dropna() # removes any rows with NaN values (how = "any" by default) | |
df.dropna(how="all") # removes rows with all NaN values | |
df.dropna(subset=["X"]) # remove rows where value is NaN in column X | |
df.dropna(axis=1,thresh=10) # drop all columns containing at least 10 NaN values | |
df.fillna(0) # replace all NaN values with 0 | |
df['Y'].fillna(0, inplace=True) # replace all NaN values in column 'Y' with 0 | |
df['X'].fillna(value=df['XX'].mean()) # replace NaN on column X with mean of column XX | |
#### groupby with aggregate function | |
df.groupby('X').describe() | |
#### concat | |
df1 = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'], index=['v','w','x','y','z']) | |
df2 = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'], index=['v','w','x','y','z']) | |
df3 = pd.DataFrame(np.random.randn(5, 5), columns=['a', 'b', 'c', 'd', 'e'], index=['v','w','x','y','z']) | |
pd.concat([df1,df2,df3]) # concat vertically (match columns) | |
pd.concat([df1,df2,df3],axis=1) # concat horizontally (match rows) | |
#### merge (== join) | |
left = pd.DataFrame([[1,'A'],[1,'B'],[2,'B']], columns=['C1','C2'], index=['I1','I2','I3']) | |
right = pd.DataFrame([['B','C','D']], columns=['C2','C3','C4'], index=['I1']) | |
pd.merge(left,right,how='inner',on='C2') # same as inner join in sql (concats horizontally) | |
#### pivot tables | |
pivot = {'A': ['foo','foo','foo','bar','bar','bar'], | |
'B': ['one','one','two','two','one','one'], | |
'C': ['x','y','x','y','x','y'], | |
'D': [1,3,2,5,4,1]} | |
df = pd.DataFrame(pivot) | |
df.pivot_table(values='D',index=['A','B'],columns=['C']) | |
#### multi-level index dataframes (ie dataframes within dataframes) | |
outside = ['G1','G1','G1','G2','G2','G2'] | |
inside = [1,2,3,1,2,3] | |
hier_index = pd.MultiIndex.from_tuples(list(zip(outside,inside))) | |
df = pd.DataFrame(np.arange(1,13).reshape(6,2),hier_index,['A','B']) | |
df.index.names = ['Groups','Num'] | |
df.loc['G1'].loc[1]['A'] # access elements using multiple loc | |
df.xs(1,level='Num') # return rows where 'num'=1 | |
#### input output | |
# df = pd.read_csv('data_2d.csv', header=None) # headers included by default | |
# df.to_csv('out',index=False) | |
# df = pd.read_excel('Excel.xlsx',sheetname='Sheet1') | |
# df.to_excel('out.xlsx',sheet_name='NewSheet') | |
# data = pd.read_html('somelink') # returns a list | |
# don't use pandas to read sql (better alternatives available) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment