Useful Pandas Snippets
A grab-bag of small pandas recipes. Each section below is introduced by a ###### banner comment.
###### FORCE DATA FORMAT IN PANDAS ######
# Display all floats with a thousands separator and 2 decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# NOTES:
# 1. DO NOT update rows inside df.iterrows() -> https://stackoverflow.com/questions/25478528/updating-value-in-iterrow-for-pandas
#    Use df.loc[idx, '<col>'] = <value> instead.
###### REGEX on DataFrame ######
# Search a column of strings for a pattern.
# Which rows of df['raw'] contain 'xxxx-xx-xx'?
# (raw strings keep regex backslashes out of Python's escape processing)
df['raw'].str.contains(r'....-..-..', regex=True)
# Extract the column of single digits.
# In the column 'raw', extract a single digit from each string.
# expand=False returns a Series (one capture group), suitable for
# assignment to a single column.
df['female'] = df['raw'].str.extract(r'(\d)', expand=False)
# Extract the column of dates.
# In the column 'raw', extract xxxx-xx-xx from each string.
df['date'] = df['raw'].str.extract(r'(....-..-..)', expand=False)
# Extract the column of thousands.
# In the column 'raw', extract ####.# from each string.
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=False)
# Extract the column of words.
# In the column 'raw', extract the first capitalized word.
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=False)
### DATA CONVERSION ###
# Get a Python list from a DataFrame column.
df['abc'].tolist()
# Convert a Series to numeric, coercing non-numeric values to NaN.
# (Series.convert_objects was deprecated in 0.17 and later removed;
# pd.to_numeric with errors='coerce' is the replacement.)
df['col'] = pd.to_numeric(df['col'], errors='coerce')
### BASIC DATA LOOPING, GET LENGTH ###
# Loop through rows in a DataFrame (only if you must -- vectorized
# operations are almost always faster).
for index, row in df.iterrows():
    print(index, row['some column'])  # Python 3 print
# Get the length of string data in a DataFrame column.
df.column_name.str.len()
# Get a quick count of rows in a DataFrame.
len(df.index)
### DATA FILTERING, SEARCH ###
# List unique values in a DataFrame column.
# (Series.unique() replaces the old pd.unique(col.ravel()) idiom.)
df.column_name.unique()
# Grab DataFrame rows where a column has certain values.
# NOTE: original gist defined `valuelist` but filtered on `value_list`;
# the names must match.
value_list = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]
# Grab DataFrame rows where a column does NOT have certain values.
value_list = ['value1', 'value2', 'value3']
df = df[~df.column.isin(value_list)]
# Select from DataFrame using criteria from multiple columns.
newdf = df[(df['column_one'] > 2004) & (df['column_two'] == 9)]
# Get the top n rows for each group of columns in a sorted dataframe.
# (Make sure the dataframe is sorted first.)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# Group by time (hour/minute) & another column.
times = pd.to_datetime(df.datetime)
df_grp = df.groupby([times.dt.hour, '<other column name>'])
# ...and SUM one column per group (bracket indexing keeps this valid Python).
df_grp = df.groupby([times.dt.hour, '<other column name>'])['<column to sum>'].sum()
# FLATTEN column headers after GROUPBY.
# https://stackoverflow.com/questions/33004573/after-groupby-how-to-flatten-column-headers
df.reset_index()
# Grab DataFrame rows where a specific column is null / not null.
newdf = df[df['column'].isnull()]
# Select from DataFrame using multiple keys of a hierarchical index.
df.xs(('index level 1 value', 'index level 2 value'), level=('level 1', 'level 2'))
#### MODIFY, SORT, SLICE, RENAME DATA FRAME ####
# Delete a column from a DataFrame.
del df['column']
# Rename several DataFrame columns.
df = df.rename(columns={
    'col1 old name': 'col1 new name',
    'col2 old name': 'col2 new name',
    'col3 old name': 'col3 new name',
})
# Lower-case all DataFrame column names.
df.columns = [c.lower() for c in df.columns]
# Even fancier column re-naming: keep only the part after the last '.'
# (lower-casing is just one example of a per-name transform).
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
# Working with text data in pandas -- full list of .str functions:
# http://pandas.pydata.org/pandas-docs/stable/text.html
# Slice values in a DataFrame column (aka Series).
df.column.str[0:2]
# Lower-case everything in a DataFrame column.
df.column_name = df.column_name.str.lower()
# Sort a dataframe by multiple columns.
# (DataFrame.sort() was deprecated in 0.17 and later removed --
# the gist's own note "Seem stop working" -- use sort_values.)
df = df.sort_values(by=['col1', 'col2', 'col3'], ascending=[True, False, True])
# Change all NaNs to None (useful before loading to a db).
df = df.where(pd.notnull(df), None)
# Pivot data (with flexibility about what becomes a column
# and what stays a row). Syntax works on pandas >= 0.14.
pd.pivot_table(
    df, values='cell_value',
    index=['col1', 'col2', 'col3'],  # these become the row index
    columns=['col4'])                # values in this column become their own column
# Change the data type of a DataFrame column.
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric values throughout a DataFrame
# (raw string so the regex is passed through verbatim).
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace(r'[^0-9]+.-', '', regex=True)
# Set DataFrame column values based on other column values.
# (.loc avoids chained assignment, which can silently write to a copy.)
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), 'column_to_change'] = new_value
# Clean up missing values in multiple DataFrame columns.
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99',
})
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example).
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
# Calculations with DataFrame columns that have missing values.
# Below, swap in 0 for df['col1'] cells that contain null.
df['new_col'] = np.where(pd.isnull(df['col1']), 0, df['col1']) + df['col2']
# Split delimited values in a DataFrame column into two new columns.
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
# Collapse hierarchical column indexes.
df.columns = df.columns.get_level_values(0)
# Convert a Django queryset to a DataFrame.
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
# Create a DataFrame from a Python dictionary.
df = pd.DataFrame(list(a_dictionary.items()), columns=['column1', 'column2'])
# Append/insert a row into a DataFrame using loc[].
# (original had a typo: "lens" -> len)
df.loc[len(df.index)] = new_row_values  # list with one value per column
# Append a column using loc too; 'C' is the new column.
df.loc[:, 'C'] = new_column_values  # list with one value per row
# DataFrame - drop columns.
claims = claims.drop(['Measure Start Date', 'Measure End Date'], axis=1)
# DataFrame - replace content in columns: strip '%' signs and
# convert the percentage strings to fractions.
origin1 = '%'
claims[['Percent Spending Hospital', 'Percent Spending State']] = (
    claims[['Percent Spending Hospital', 'Percent Spending State']]
    .replace(origin1, '', regex=True).astype('float') / 100
)
# DataFrame - filter a datetime index.
# By month and year:
close_px[(close_px.index.year == 2010) & (close_px.index.month == 10)]
# By date range:
close_px[('2011-06-01' <= close_px.index) & (close_px.index <= '2011-06-16')]
# By weekday (0=Monday ... 4=Friday):
close_px[('2011-06-01' <= close_px.index) &
         (close_px.index <= '2011-06-16') &
         (close_px.index.weekday == 4)]
# DataFrame - resample a datetime index ('BM' = business month end).
# NOTE: modern pandas returns a Resampler object; chain an aggregation.
close_px.resample('BM').last()
# DataFrame - shift data UP or DOWN along the index.
close_px.shift(1, 'BM')  # shift the index by 1 business month; no NaNs introduced
close_px.shift(1)        # push all values down 1 row; NaN appears in the first row
# DataFrame - drop duplicates on certain columns.
df.drop_duplicates(['states', 'years'])
# DataFrame pivot - several equivalent spellings:
df.pivot(index='states', columns='years', values='data1')
# (positional pivot args were removed in pandas 2.0; prefer keywords)
df.pivot('states', 'years', 'data1')
pd.pivot_table(df, index='states', columns='years', values='data1')
pd.pivot_table(df, index=['states'], columns=['years'], values='data1')
# ...with an aggregation function:
pd.pivot_table(df, index=['states'], columns=['years'], values='data1', aggfunc=np.mean)
# Cut a column into 2 equal-COUNT (quantile) buckets.
# (pd.cut gives equal-width bins; pd.qcut gives equal-count bins.)
pd.qcut(df['data1'], 2)
# To select the ith row, use iloc:
df_test.iloc[0]
# To select the ith value in the 'Btime' column:
df_test['Btime'].iloc[0]
# Add a column derived from others.
# Ref: https://coderwall.com/p/dt4yja/adding-columns-to-a-pandas-dataframe
# (DataFrame.from_csv was removed; pd.read_csv with index_col=0 is equivalent.)
df = pd.read_csv('my_data.csv', index_col=0)  # fake data
df['diff_A_B'] = df['A'] - df['B']
# Convert an existing float column to a percentage string for display.
df['pct'] = pd.Series(['{0:.2f}%'.format(val * 100) for val in df['pct']], index=df.index)
# Get the rows where the value of a certain column IS an empty list.
# Ref: https://stackoverflow.com/questions/42143116/how-to-get-rows-containg-empty-list-in-a-data-frame-columns-in-python
# test.B.apply(lambda c: c == []) yields a boolean mask, e.g.:
#   0     True
#   1    False
#   Name: B, dtype: bool
# Indexing with that mask keeps only the empty-list rows:
test[test.B.apply(lambda c: c == [])]
# Find the difference between two dataframes df1, df2
# based on column 'id' (rows of df1 whose id is not in df2):
df1[~df1.id.isin(df2.id)]
# Update/create a column based on an IF condition on another column.
df['elderly'] = np.where(df['age'] >= 50, 'yes', 'no')
# Flag (1/0) whether a value in one dataframe column exists
# in the other dataframe's column.
df1.name.isin(df2.IDs).astype(int)
Source: GitHub gist stevenvo/c66eeb3c970f8b86b0bf5ff486a78e39