Last active
June 3, 2023 11:21
-
-
Save danlmarmot/a0f4a380ceee79760a0597acf2decae7 to your computer and use it in GitHub Desktop.
Pandas Snippets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Using Jupyter 4.2 and Python 3.5.1 | |
# Firstly, Jupyter notebook stuff | |
# | |
# Make the notebook the width of the brower window | |
from IPython.core.display import HTML | |
HTML("<style>.container { width:100% !important; }</style>") | |
#----- | |
# Now pandas stuffs | |
# Get rid of unwanted characters in column names--for example, replace a space with underscore, and get rid of forward slash | |
cols = df.columns | |
cols = cols.map(lambda x: x.lower().replace(' ', '_').replace('/','') \ | |
if isinstance(x, str) else x.lower()) | |
df.columns = cols | |
# Convert series into a dataframe | |
# GOTCHA: don't forget the brackets around the series, they're easily overlooked | |
import pandas as pd | |
series = pd.Series(["A","B"], index=['Column1','Column2']) | |
df = pd.DataFrame([series]) # makes a dataframe with columns named "Column1", "Column2" | |
df1 = pd.DataFrame([series]) # probably not what you want, columns are now "A" and "B" | |
# Create a dataframe from a single row at location 1234-- again, carefully note the brackets | |
df3 = pd.DataFrame([some_data_frame.iloc[1234]]) | |
# Compare two rows of a dataframe | |
def report_diff(x): | |
return x[0] if x[0] == x[1] else '{} | {}'.format(*x) | |
def get_row_differences(df, index1, index2, report_diff_fn): | |
df1 = pd.DataFrame([df.ix[index1]]).reset_index(drop=True) | |
df2 = pd.DataFrame([df.ix[index2]]).reset_index(drop=True) | |
diff_panel = pd.Panel(dict(df1=df1,df2=df2)) | |
return diff_panel.apply(report_diff_fn, axis=0) | |
print(get_row_differences(<dataframe_name>,<index1>,<index2>,report_diff)) | |
# Do that same thing, but display it pretty in a Jupyter notebook | |
pd.options.display.max_colwidth = 500 | |
def report_diff(x): | |
if x[0]==x[1]: | |
return x[0] | |
elif pd.isnull(x[0]) and pd.isnull(x[1]): | |
return u'<table style="background-color:#00ff00;font-weight:bold;">'+\ | |
'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % ('nan', 'nan') | |
elif pd.isnull(x[0]) and ~pd.isnull(x[1]): | |
return u'<table style="background-color:#ffff00;font-weight:bold;">'+\ | |
'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % ('nan', x[1]) | |
elif ~pd.isnull(x[0]) and pd.isnull(x[1]): | |
return u'<table style="background-color:#0000ff;font-weight:bold;">'+\ | |
'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % (x[0],'nan') | |
else: | |
return u'<table style="background-color:#ff0000;font-weight:bold;">'+\ | |
'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % (x[0], x[1]) | |
HTML(get_row_differences(<dataframe_name>,<index1>,<index2>,report_diff).to_html(escape=False)) | |
# Count number of null/NaN values in a column | |
df['column name'].isnull().sum() | |
df.column_name.isnull().sum() | |
# Read fixed-width text from a string | |
records = '''\ | |
id Fruit Color Price Purchase Date | |
1 Apple Red $0.99 2016-05-21 12:45:00 | |
2 Banana Yellow $0.69 2016-05-18 19:55:34 | |
3 Mango Orange $1.29 2016-05-24 08:12:33 | |
4 Mangosteen Purple $1,000.01 2016-06-11 00:05:00 | |
''' | |
df = pd.read_fwf(io.StringIO(records), widths=[4,11,11,9,23], parse_dates=[4]) | |
# Strip off commas and $ from columns | |
df['Price'] = df['Price'].astype(np.str).replace(to_replace='[$,]', value='', regex=True) | |
# ----------- | |
# below items are taken from https://gist.github.com/bsweger/e5817488d161f37dcbd2 | |
#List unique values in a DataFrame column | |
pd.unique(df.column_name.ravel()) | |
#Convert Series datatype to numeric, getting rid of any non-numeric values | |
df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True) | |
#Grab DataFrame rows where column has certain values | |
valuelist = ['value1', 'value2', 'value3'] | |
df = df[df.column.isin(value_list)] | |
#Grab DataFrame rows where column doesn't have certain values | |
valuelist = ['value1', 'value2', 'value3'] | |
df = df[~df.column.isin(value_list)] | |
#Delete column from DataFrame | |
del df['column'] | |
#Select from DataFrame using criteria from multiple columns | |
newdf = df[(df['column_one']>2004) & (df['column_two']==9)] | |
#Rename several DataFrame columns | |
df = df.rename(columns = { | |
'col1 old name':'col1 new name', | |
'col2 old name':'col2 new name', | |
'col3 old name':'col3 new name', | |
}) | |
#lower-case all DataFrame column names | |
df.columns = map(str.lower, df.columns) | |
#even more fancy DataFrame column re-naming | |
#lower-case all DataFrame column names (for example) | |
df.rename(columns=lambda x: x.split('.')[-1], inplace=True) | |
#Loop through rows in a DataFrame | |
#(if you must) | |
for index, row in df.iterrows(): | |
print index, row['some column'] | |
#Next few examples show how to work with text data in Pandas. | |
#Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html | |
#Slice values in a DataFrame column (aka Series) | |
df.column.str[0:2] | |
#Lower-case everything in a DataFrame column | |
df.column_name = df.column_name.str.lower() | |
#Get length of data in a DataFrame column | |
df.column_name.str.len() | |
#Sort dataframe by multiple columns | |
df = df.sort(['col1','col2','col3'],ascending=[1,1,0]) | |
#get top n for each group of columns in a sorted dataframe | |
#(make sure dataframe is sorted first) | |
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5) | |
#Grab DataFrame rows where specific column is null/notnull | |
newdf = df[df['column'].isnull()] | |
#select from DataFrame using multiple keys of a hierarchical index | |
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2')) | |
#Change all NaNs to None (useful before | |
#loading to a db) | |
df = df.where((pd.notnull(df)), None) | |
#Get quick count of rows in a DataFrame | |
len(df.index) | |
#Pivot data (with flexibility about what what | |
# Copied from https://gist.github.com/bsweger/e5817488d161f37dcbd2 | |
#becomes a column and what stays a row). | |
#Syntax works on Pandas >= .14 | |
pd.pivot_table( | |
df,values='cell_value', | |
index=['col1', 'col2', 'col3'], #these stay as columns | |
columns=['col4']) #data values in this column become their own column | |
#change data type of DataFrame column | |
df.column_name = df.column_name.astype(np.int64) | |
# Get rid of non-numeric values throughout a DataFrame: | |
for col in refunds.columns.values: | |
refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True) | |
#Set DataFrame column values based on other column values | |
df['column_to_change'][(df['column1'] == some_value) & (df['column2'] == some_other_value)] = new_value | |
#Clean up missing values in multiple DataFrame columns | |
df = df.fillna({ | |
'col1': 'missing', | |
'col2': '99.999', | |
'col3': '999', | |
'col4': 'missing', | |
'col5': 'missing', | |
'col6': '99' | |
}) | |
#Concatenate two DataFrame columns into a new, single column | |
#(useful when dealing with composite keys, for example) | |
df['newcol'] = df['col1'].map(str) + df['col2'].map(str) | |
#Doing calculations with DataFrame columns that have missing values | |
#In example below, swap in 0 for df['col1'] cells that contain null | |
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2'] | |
# Split delimited values in a DataFrame column into two new columns | |
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1))) | |
# Collapse hierarchical column indexes | |
df.columns = df.columns.get_level_values(0) | |
#Convert Django queryset to DataFrame | |
qs = DjangoModelName.objects.all() | |
q = qs.values() | |
df = pd.DataFrame.from_records(q) | |
#Create a DataFrame from a Python dictionary | |
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2']) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment