danlmarmot/pandas_snippets.py

## pandas_snippets.py
# Using Jupyter 4.2 and Python 3.5.1

# Firstly, Jupyter notebook stuff
#
# Make the notebook the width of the brower window
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

#-----
# Now pandas stuffs

# Get rid of unwanted characters in column names--for example, replace a space with underscore, and get rid of forward slash
cols = df.columns
cols = cols.map(lambda x: x.lower().replace(' ', '_').replace('/','') \
                if isinstance(x, str) else x.lower())
df.columns = cols

# Convert series into a dataframe
# GOTCHA: don't forget the brackets around the series, they're easily overlooked
import pandas as pd
series = pd.Series(["A","B"], index=['Column1','Column2'])
df = pd.DataFrame([series])     # makes a dataframe with columns named "Column1", "Column2"
df1 = pd.DataFrame([series])    # probably not what you want, columns are now "A" and "B"

# Create a dataframe from a single row at location 1234-- again, carefully note the brackets
df3 = pd.DataFrame([some_data_frame.iloc[1234]])

# Compare two rows of a dataframe
def report_diff(x):
    return x[0] if x[0] == x[1] else '{} | {}'.format(*x)

def get_row_differences(df, index1, index2, report_diff_fn):
    df1 = pd.DataFrame([df.ix[index1]]).reset_index(drop=True)
    df2 = pd.DataFrame([df.ix[index2]]).reset_index(drop=True)
    diff_panel = pd.Panel(dict(df1=df1,df2=df2))

    return diff_panel.apply(report_diff_fn, axis=0)

print(get_row_differences(<dataframe_name>,<index1>,<index2>,report_diff))

# Do that same thing, but display it pretty in a Jupyter notebook
pd.options.display.max_colwidth = 500
def report_diff(x):
    if x[0]==x[1]:
        return x[0]
    elif pd.isnull(x[0]) and pd.isnull(x[1]):
        return u'<table style="background-color:#00ff00;font-weight:bold;">'+\
            '<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % ('nan', 'nan')
    elif pd.isnull(x[0]) and ~pd.isnull(x[1]):
        return u'<table style="background-color:#ffff00;font-weight:bold;">'+\
            '<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % ('nan', x[1])
    elif ~pd.isnull(x[0]) and pd.isnull(x[1]):
        return u'<table style="background-color:#0000ff;font-weight:bold;">'+\
            '<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % (x[0],'nan')
    else:
        return u'<table style="background-color:#ff0000;font-weight:bold;">'+\
            '<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % (x[0], x[1])

HTML(get_row_differences(<dataframe_name>,<index1>,<index2>,report_diff).to_html(escape=False))

# Count number of null/NaN values in a column
df['column name'].isnull().sum()
df.column_name.isnull().sum()

# Read fixed-width text from a string
records = '''\
id  Fruit      Color      Price     Purchase Date
1   Apple      Red        $0.99     2016-05-21 12:45:00
2   Banana     Yellow     $0.69     2016-05-18 19:55:34
3   Mango      Orange     $1.29     2016-05-24 08:12:33
4   Mangosteen Purple     $1,000.01 2016-06-11 00:05:00
'''
df = pd.read_fwf(io.StringIO(records), widths=[4,11,11,9,23], parse_dates=[4])

# Strip off commas and $ from columns
df['Price'] = df['Price'].astype(np.str).replace(to_replace='[$,]', value='', regex=True)

# -----------
# below items are taken from https://gist.github.com/bsweger/e5817488d161f37dcbd2

#List unique values in a DataFrame column
pd.unique(df.column_name.ravel())

#Convert Series datatype to numeric, getting rid of any non-numeric values
df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

#Grab DataFrame rows where column has certain values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]

#Grab DataFrame rows where column doesn't have certain values
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(value_list)]

#Delete column from DataFrame
del df['column']

#Select from DataFrame using criteria from multiple columns
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

#Rename several DataFrame columns
df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
})

#lower-case all DataFrame column names
df.columns = map(str.lower, df.columns)

#even more fancy DataFrame column re-naming
#lower-case all DataFrame column names (for example)
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

#Loop through rows in a DataFrame
#(if you must)
for index, row in df.iterrows():
    print index, row['some column']

#Next few examples show how to work with text data in Pandas.
#Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html

#Slice values in a DataFrame column (aka Series)
df.column.str[0:2]

#Lower-case everything in a DataFrame column
df.column_name = df.column_name.str.lower()

#Get length of data in a DataFrame column
df.column_name.str.len()

#Sort dataframe by multiple columns
df = df.sort(['col1','col2','col3'],ascending=[1,1,0])

#get top n for each group of columns in a sorted dataframe
#(make sure dataframe is sorted first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)

#Grab DataFrame rows where specific column is null/notnull
newdf = df[df['column'].isnull()]

#select from DataFrame using multiple keys of a hierarchical index
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))

#Change all NaNs to None (useful before
#loading to a db)
df = df.where((pd.notnull(df)), None)

#Get quick count of rows in a DataFrame
len(df.index)

#Pivot data (with flexibility about what what
# Copied from https://gist.github.com/bsweger/e5817488d161f37dcbd2

#becomes a column and what stays a row).
#Syntax works on Pandas >= .14
pd.pivot_table(
  df,values='cell_value',
  index=['col1', 'col2', 'col3'], #these stay as columns
  columns=['col4']) #data values in this column become their own column

#change data type of DataFrame column
df.column_name = df.column_name.astype(np.int64)

# Get rid of non-numeric values throughout a DataFrame:
for col in refunds.columns.values:
  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

#Set DataFrame column values based on other column values
df['column_to_change'][(df['column1'] == some_value) & (df['column2'] == some_other_value)] = new_value

#Clean up missing values in multiple DataFrame columns
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})

#Concatenate two DataFrame columns into a new, single column
#(useful when dealing with composite keys, for example)
df['newcol'] = df['col1'].map(str) + df['col2'].map(str)

#Doing calculations with DataFrame columns that have missing values
#In example below, swap in 0 for df['col1'] cells that contain null
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

# Split delimited values in a DataFrame column into two new columns
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

# Collapse hierarchical column indexes
df.columns = df.columns.get_level_values(0)

#Convert Django queryset to DataFrame
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)

#Create a DataFrame from a Python dictionary
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
	# Using Jupyter 4.2 and Python 3.5.1

	# Firstly, Jupyter notebook stuff
	#
	# Make the notebook the width of the brower window
	from IPython.core.display import HTML
	HTML("<style>.container { width:100% !important; }</style>")

	#-----
	# Now pandas stuffs

	# Get rid of unwanted characters in column names--for example, replace a space with underscore, and get rid of forward slash
	cols = df.columns
	cols = cols.map(lambda x: x.lower().replace(' ', '_').replace('/','') \
	if isinstance(x, str) else x.lower())
	df.columns = cols

	# Convert series into a dataframe
	# GOTCHA: don't forget the brackets around the series, they're easily overlooked
	import pandas as pd
	series = pd.Series(["A","B"], index=['Column1','Column2'])
	df = pd.DataFrame([series]) # makes a dataframe with columns named "Column1", "Column2"
	df1 = pd.DataFrame([series]) # probably not what you want, columns are now "A" and "B"

	# Create a dataframe from a single row at location 1234-- again, carefully note the brackets
	df3 = pd.DataFrame([some_data_frame.iloc[1234]])

	# Compare two rows of a dataframe
	def report_diff(x):
	return x[0] if x[0] == x[1] else '{} \| {}'.format(*x)

	def get_row_differences(df, index1, index2, report_diff_fn):
	df1 = pd.DataFrame([df.ix[index1]]).reset_index(drop=True)
	df2 = pd.DataFrame([df.ix[index2]]).reset_index(drop=True)
	diff_panel = pd.Panel(dict(df1=df1,df2=df2))

	return diff_panel.apply(report_diff_fn, axis=0)

	print(get_row_differences(<dataframe_name>,<index1>,<index2>,report_diff))

	# Do that same thing, but display it pretty in a Jupyter notebook
	pd.options.display.max_colwidth = 500
	def report_diff(x):
	if x[0]==x[1]:
	return x[0]
	elif pd.isnull(x[0]) and pd.isnull(x[1]):
	return u'<table style="background-color:#00ff00;font-weight:bold;">'+\
	'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % ('nan', 'nan')
	elif pd.isnull(x[0]) and ~pd.isnull(x[1]):
	return u'<table style="background-color:#ffff00;font-weight:bold;">'+\
	'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % ('nan', x[1])
	elif ~pd.isnull(x[0]) and pd.isnull(x[1]):
	return u'<table style="background-color:#0000ff;font-weight:bold;">'+\
	'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % (x[0],'nan')
	else:
	return u'<table style="background-color:#ff0000;font-weight:bold;">'+\
	'<tr><td>%s</td></tr><tr><td>%s</td></tr></table>' % (x[0], x[1])

	HTML(get_row_differences(<dataframe_name>,<index1>,<index2>,report_diff).to_html(escape=False))

	# Count number of null/NaN values in a column
	df['column name'].isnull().sum()
	df.column_name.isnull().sum()

	# Read fixed-width text from a string
	records = '''\
	id Fruit Color Price Purchase Date
	1 Apple Red $0.99 2016-05-21 12:45:00
	2 Banana Yellow $0.69 2016-05-18 19:55:34
	3 Mango Orange $1.29 2016-05-24 08:12:33
	4 Mangosteen Purple $1,000.01 2016-06-11 00:05:00
	'''
	df = pd.read_fwf(io.StringIO(records), widths=[4,11,11,9,23], parse_dates=[4])

	# Strip off commas and $ from columns
	df['Price'] = df['Price'].astype(np.str).replace(to_replace='[$,]', value='', regex=True)

	# -----------
	# below items are taken from https://gist.github.com/bsweger/e5817488d161f37dcbd2

	#List unique values in a DataFrame column
	pd.unique(df.column_name.ravel())

	#Convert Series datatype to numeric, getting rid of any non-numeric values
	df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

	#Grab DataFrame rows where column has certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[df.column.isin(value_list)]

	#Grab DataFrame rows where column doesn't have certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[~df.column.isin(value_list)]

	#Delete column from DataFrame
	del df['column']

	#Select from DataFrame using criteria from multiple columns
	newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

	#Rename several DataFrame columns
	df = df.rename(columns = {
	'col1 old name':'col1 new name',
	'col2 old name':'col2 new name',
	'col3 old name':'col3 new name',
	})

	#lower-case all DataFrame column names
	df.columns = map(str.lower, df.columns)

	#even more fancy DataFrame column re-naming
	#lower-case all DataFrame column names (for example)
	df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

	#Loop through rows in a DataFrame
	#(if you must)
	for index, row in df.iterrows():
	print index, row['some column']

	#Next few examples show how to work with text data in Pandas.
	#Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html

	#Slice values in a DataFrame column (aka Series)
	df.column.str[0:2]

	#Lower-case everything in a DataFrame column
	df.column_name = df.column_name.str.lower()

	#Get length of data in a DataFrame column
	df.column_name.str.len()

	#Sort dataframe by multiple columns
	df = df.sort(['col1','col2','col3'],ascending=[1,1,0])

	#get top n for each group of columns in a sorted dataframe
	#(make sure dataframe is sorted first)
	top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)

	#Grab DataFrame rows where specific column is null/notnull
	newdf = df[df['column'].isnull()]

	#select from DataFrame using multiple keys of a hierarchical index
	df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))

	#Change all NaNs to None (useful before
	#loading to a db)
	df = df.where((pd.notnull(df)), None)

	#Get quick count of rows in a DataFrame
	len(df.index)

	#Pivot data (with flexibility about what what
	# Copied from https://gist.github.com/bsweger/e5817488d161f37dcbd2

	#becomes a column and what stays a row).
	#Syntax works on Pandas >= .14
	pd.pivot_table(
	df,values='cell_value',
	index=['col1', 'col2', 'col3'], #these stay as columns
	columns=['col4']) #data values in this column become their own column

	#change data type of DataFrame column
	df.column_name = df.column_name.astype(np.int64)

	# Get rid of non-numeric values throughout a DataFrame:
	for col in refunds.columns.values:
	refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

	#Set DataFrame column values based on other column values
	df['column_to_change'][(df['column1'] == some_value) & (df['column2'] == some_other_value)] = new_value

	#Clean up missing values in multiple DataFrame columns
	df = df.fillna({
	'col1': 'missing',
	'col2': '99.999',
	'col3': '999',
	'col4': 'missing',
	'col5': 'missing',
	'col6': '99'
	})

	#Concatenate two DataFrame columns into a new, single column
	#(useful when dealing with composite keys, for example)
	df['newcol'] = df['col1'].map(str) + df['col2'].map(str)

	#Doing calculations with DataFrame columns that have missing values
	#In example below, swap in 0 for df['col1'] cells that contain null
	df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

	# Split delimited values in a DataFrame column into two new columns
	df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

	# Collapse hierarchical column indexes
	df.columns = df.columns.get_level_values(0)

	#Convert Django queryset to DataFrame
	qs = DjangoModelName.objects.all()
	q = qs.values()
	df = pd.DataFrame.from_records(q)

	#Create a DataFrame from a Python dictionary
	df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])