Useful Pandas Snippets
###### FORCE FLOAT DISPLAY FORMAT IN PANDAS ######
pd.options.display.float_format = '{:,.2f}'.format
# NOTES:
# 1. DO NOT update a row by mutating it inside df.iterrows() -> https://stackoverflow.com/questions/25478528/updating-value-in-iterrow-for-pandas
#    Use df.loc[idx, '<col>'] = <value> instead
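# Minimal sketch of the safe pattern above (toy data; 'score' is an
# invented column name). Mutating `row` inside iterrows() does not write
# back to the DataFrame, but df.loc[idx, ...] does:
import pandas as pd
df_demo = pd.DataFrame({'score': [1, 2, 3]})
for idx, row in df_demo.iterrows():
    if row['score'] > 1:
        df_demo.loc[idx, 'score'] = row['score'] * 10  # persists in df_demo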
###### REGEX on DataFrame ######
# Search a column of strings for a pattern
# Which rows of df['raw'] contain 'xxxx-xx-xx'?
df['raw'].str.contains('....-..-..', regex=True)
# Extract the column of single digits
# In the column 'raw', extract single digit in the strings
df['female'] = df['raw'].str.extract(r'(\d)', expand=True)
# Extract the column of dates
# In the column 'raw', extract xxxx-xx-xx in the strings
df['date'] = df['raw'].str.extract('(....-..-..)', expand=True)
# Extract the column of thousands
# In the column 'raw', extract numbers like ####.# in the strings
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=True)
# Extract the column of words
# In the column 'raw', extract the word in the strings
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=True)
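# Self-contained sketch of the extract patterns above, against an
# invented 'raw' column (the values are made up for illustration):
import pandas as pd
df_raw = pd.DataFrame({'raw': ['1, 2014-12-23, 1242.7, VA',
                               '0, 2015-01-04, 1101.2, MD']})
df_raw['date'] = df_raw['raw'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
df_raw['state'] = df_raw['raw'].str.extract(r'([A-Z]\w{0,})', expand=False)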
### DATA CONVERSION ###
# Get list from pandas dataframe column
df['abc'].tolist()
#Convert Series datatype to numeric, coercing any non-numeric values to NaN
#(convert_objects() was removed from pandas; use pd.to_numeric instead)
df['col'] = pd.to_numeric(df['col'], errors='coerce')
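# Quick demo of pd.to_numeric with errors='coerce' (toy Series):
# strings that cannot be parsed become NaN instead of raising.
import pandas as pd
s = pd.Series(['1', '2.5', 'oops'])
pd.to_numeric(s, errors='coerce')  # -> 1.0, 2.5, NaN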
### BASIC DATA LOOPING, GET LENGTH ###
#Loop through rows in a DataFrame
#(if you must)
for index, row in df.iterrows():
    print(index, row['some column'])
#Get length of data in a DataFrame column
df.column_name.str.len()
#Get quick count of rows in a DataFrame
len(df.index)
### DATA FILTERING, SEARCH ###
#List unique values in a DataFrame column
df.column_name.unique()
#Grab DataFrame rows where column has certain values
value_list = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]
#Grab DataFrame rows where column doesn't have certain values
value_list = ['value1', 'value2', 'value3']
df = df[~df.column.isin(value_list)]
#Select from DataFrame using criteria from multiple columns
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
#get top n for each group of columns in a sorted dataframe
#(make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
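# Toy illustration of top-n-per-group (column names invented): sort
# descending first, then head(n) keeps each group's largest values.
import pandas as pd
df_g = pd.DataFrame({'grp': ['a', 'a', 'a', 'b', 'b'],
                     'val': [3, 9, 1, 7, 2]})
top2 = df_g.sort_values('val', ascending=False).groupby('grp').head(2)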
# Group by time (Hour/Minute) & another potential column
times = pd.to_datetime(df.datetime)
df_grp = df.groupby([times.dt.hour, '<other column name>'])
df_grp = df.groupby([times.dt.hour, '<other column name>']).<column_to_sum>.sum() #if need to SUM
# FLATTEN column headers after GROUPBY
# https://stackoverflow.com/questions/33004573/after-groupby-how-to-flatten-column-headers
df = df.reset_index()
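# Example of the flatten step: summing within groups leaves the grouping
# key in the index; reset_index() turns it back into a column (toy data).
import pandas as pd
df_t = pd.DataFrame({'hour': [1, 1, 2], 'sales': [10, 5, 7]})
flat = df_t.groupby('hour').sales.sum().reset_index()  # columns: hour, sales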
#Grab DataFrame rows where specific column is null/notnull
newdf = df[df['column'].isnull()]
#select from DataFrame using multiple keys of a hierarchical index
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
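# Sketch of .xs() against a small two-level index (level names assumed):
import pandas as pd
idx = pd.MultiIndex.from_tuples([('NY', 2020), ('NY', 2021), ('CA', 2020)],
                                names=['state', 'year'])
df_h = pd.DataFrame({'pop': [8.4, 8.3, 39.5]}, index=idx)
df_h.xs(('NY', 2020), level=('state', 'year'))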
#### MODIFY, SORT, SLICE, RENAME DATA FRAME ####
#Delete column from DataFrame
del df['column']
#Rename several DataFrame columns
df = df.rename(columns={
    'col1 old name': 'col1 new name',
    'col2 old name': 'col2 new name',
    'col3 old name': 'col3 new name',
})
#lower-case all DataFrame column names
df.columns = df.columns.str.lower()
#even more fancy DataFrame column re-naming via a function:
#e.g. strip a dotted prefix from each column name
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
#Next few examples show how to work with text data in Pandas.
#Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
#Slice values in a DataFrame column (aka Series)
df.column.str[0:2]
#Lower-case everything in a DataFrame column
df.column_name = df.column_name.str.lower()
#Sort dataframe by multiple columns
df = df.sort_values(by=['col1', 'col2', 'col3'], ascending=[True, False, True])
# (df.sort() was removed in pandas 0.20; use sort_values() as above)
#Change all NaNs to None (useful before
#loading to a db)
df = df.where((pd.notnull(df)), None)
#Pivot data (with flexibility about what
#becomes a column and what stays a row).
#Syntax works on Pandas >= .14
pd.pivot_table(
    df, values='cell_value',
    index=['col1', 'col2', 'col3'],  # these stay as rows (the index)
    columns=['col4'])  # values in this column become their own columns
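# Toy run of the pivot_table call above (names invented; the default
# aggfunc is mean):
import pandas as pd
df_p = pd.DataFrame({'col1': [1, 1, 2], 'col4': ['x', 'y', 'x'],
                     'cell_value': [10, 20, 30]})
pd.pivot_table(df_p, values='cell_value', index=['col1'], columns=['col4'])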
#change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric characters throughout a DataFrame
# (keeps digits, '.' and '-'):
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace(r'[^0-9.-]+', '', regex=True)
#Set DataFrame column values based on other column values
#(use .loc to avoid chained-assignment pitfalls)
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value),
       'column_to_change'] = new_value
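# Minimal example of the .loc conditional assignment (invented names):
import pandas as pd
df_c = pd.DataFrame({'column1': [1, 2], 'column2': [3, 4],
                     'column_to_change': ['a', 'b']})
df_c.loc[(df_c['column1'] == 2) & (df_c['column2'] == 4),
         'column_to_change'] = 'new'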
#Clean up missing values in multiple DataFrame columns
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})
#Concatenate two DataFrame columns into a new, single column
#(useful when dealing with composite keys, for example)
df['newcol'] = df['col1'].map(str) + df['col2'].map(str)
#Doing calculations with DataFrame columns that have missing values
#In example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
# Split delimited values in a DataFrame column into two new columns
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
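# Worked example of the zip/split trick (delimiter ': ' as above,
# data invented; split(..., 1) keeps any later delimiters intact):
import pandas as pd
df_s = pd.DataFrame({'original_col': ['key: 1', 'other: 2: extra']})
df_s['new_col1'], df_s['new_col2'] = zip(
    *df_s['original_col'].apply(lambda x: x.split(': ', 1)))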
# Collapse hierarchical column indexes
df.columns = df.columns.get_level_values(0)
#Convert Django queryset to DataFrame
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
#Create a DataFrame from a Python dictionary
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
# append or insert a row into a pandas DataFrame
# using loc[] (supply one value per column)
df.loc[len(df.index)] = [new_row_values]
# append a column using loc too
# 'C' is the new column
df.loc[:, 'C'] = [new_column_values]
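# Worked example of both loc-based appends (toy data):
import pandas as pd
df_a = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df_a.loc[len(df_a.index)] = [5, 6]  # new row at position 2
df_a.loc[:, 'C'] = [7, 8, 9]        # new column (one value per row)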
# DataFrame - Drop Columns
claims = claims.drop(['Measure Start Date', 'Measure End Date'], axis=1)
# DataFrame - Replace content in columns
origin1 = '%'
claims[['Percent Spending Hospital', 'Percent Spending State']] = claims[['Percent Spending Hospital', 'Percent Spending State']].replace(origin1,'',regex=True).astype('float')/100
# DataFrame - filter datetime index
# By month and year
close_px[(close_px.index.year == 2010) & (close_px.index.month == 10)]
# By date range
close_px[('2011-06-01'<= close_px.index) & ('2011-06-16' >= close_px.index)]
# By weekday
close_px[('2011-06-01'<= close_px.index) &
('2011-06-16' >= close_px.index) &
(close_px.index.weekday == 4)] #friday
# DataFrame, resample index date time
close_px.resample('BM').last() # Business Month End (modern pandas needs an aggregation: .last(), .mean(), ...)
# DataFrame - shift data UP or DOWN, or shift the datetime index earlier/later
close_px.shift(1, freq='BM') # index shifted 1 business month later; values unchanged, no NaNs
close_px.shift(1) # values pushed down 1 row; the first row becomes NaN
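# Sketch of the two shift flavours on a small business-day index:
import pandas as pd
s = pd.Series([1.0, 2.0, 3.0],
              index=pd.date_range('2011-06-01', periods=3, freq='B'))
s.shift(1)            # values move down one row; first value becomes NaN
s.shift(1, freq='B')  # index moves one business day later; values intact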
# DataFrame drop duplicates on certain columns
df.drop_duplicates(['states', 'years'])
# DataFrame pivot - 3 ways (the last two pivot_table calls are equivalent):
df.pivot(index='states', columns='years', values='data1')
df.pivot('states','years','data1')
pd.pivot_table(df, index='states', columns='years', values='data1')
pd.pivot_table(df, index=['states'], columns=['years'], values='data1')
# with function aggregation
pd.pivot_table(df, index=['states'], columns=['years'], values='data1', aggfunc=np.mean)
# DataFrame - cut a column into equal-sized bins by quantile (qcut)
pd.qcut(df['data1'], 2)
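# qcut illustration: two quantile bins split at the median, so each bin
# holds (roughly) the same number of rows (toy Series):
import pandas as pd
pd.qcut(pd.Series([1, 2, 3, 4, 5, 6]), 2)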
# To select the ith row, use iloc:
df_test.iloc[0]
# To select the ith value in the Btime column you could use:
df_test['Btime'].iloc[0]
# Add column - Ref: https://coderwall.com/p/dt4yja/adding-columns-to-a-pandas-dataframe
# DataFrames act much like Python dictionaries in that you can easily add new columns.
df = pd.read_csv('my_data.csv') # fake data (DataFrame.from_csv was removed; use pd.read_csv)
df['diff_A_B'] = df['A'] - df['B']
# Convert existing column from float to percentage
# Display column in percentage
df['pct'] = pd.Series(["{0:.2f}%".format(val * 100) for val in df['pct']], index = df.index)
# Get the rows where the value of a certain column is an empty list
# Ref: https://stackoverflow.com/questions/42143116/how-to-get-rows-containg-empty-list-in-a-data-frame-columns-in-python
test.B.apply(lambda c: c == [])
# 0     True
# 1    False
# Name: B, dtype: bool
test[test.B.apply(lambda c: c == [])]
#    A   B
# 0  a  []
# Find the difference between 2 DataFrames df1, df2
# based on column 'id': rows of df1 whose id is not in df2
df1[~df1.id.isin(df2.id)]
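# Toy example of the anti-join above ('id' column assumed):
import pandas as pd
df1 = pd.DataFrame({'id': [1, 2, 3]})
df2 = pd.DataFrame({'id': [2, 3]})
df1[~df1.id.isin(df2.id)]  # -> only the row with id == 1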
# Update/Create column based on IF condition of other column
df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
# Check if value in one dataframe column does exist in the other dataframe column
Df1.name.isin(Df2.IDs).astype(int)