Useful Pandas Snippets
A grab-bag of small pandas recipes. Each section below is introduced by a ###### banner comment.
###### FORCE DATA FORMAT IN PANDAS ######
# Display all floats with a thousands separator and 2 decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# NOTES:
# 1. DO NOT update rows inside df.iterrows() -> https://stackoverflow.com/questions/25478528/updating-value-in-iterrow-for-pandas
#    Use df.loc[idx, '<col>'] = <value> instead.
###### REGEX on DataFrame ######
# Search a column of strings for a pattern.
# Which rows of df['raw'] contain 'xxxx-xx-xx'?
# (raw strings keep regex backslashes out of Python's escape processing)
df['raw'].str.contains(r'....-..-..', regex=True)
# Extract the column of single digits.
# In the column 'raw', extract a single digit from each string.
# expand=False returns a Series (one capture group), suitable for
# assignment to a single column.
df['female'] = df['raw'].str.extract(r'(\d)', expand=False)
# Extract the column of dates.
# In the column 'raw', extract xxxx-xx-xx from each string.
df['date'] = df['raw'].str.extract(r'(....-..-..)', expand=False)
# Extract the column of thousands.
# In the column 'raw', extract ####.# from each string.
df['score'] = df['raw'].str.extract(r'(\d\d\d\d\.\d)', expand=False)
# Extract the column of words.
# In the column 'raw', extract the first capitalized word.
df['state'] = df['raw'].str.extract(r'([A-Z]\w{0,})', expand=False)
### DATA CONVERSION ###
# Get a Python list from a DataFrame column.
df['abc'].tolist()
# Convert a Series to numeric, coercing non-numeric values to NaN.
# (Series.convert_objects was deprecated in 0.17 and later removed;
# pd.to_numeric with errors='coerce' is the replacement.)
df['col'] = pd.to_numeric(df['col'], errors='coerce')
### BASIC DATA LOOPING, GET LENGTH ###
# Loop through rows in a DataFrame (only if you must -- vectorized
# operations are almost always faster).
for index, row in df.iterrows():
    print(index, row['some column'])  # Python 3 print
# Get the length of string data in a DataFrame column.
df.column_name.str.len()
# Get a quick count of rows in a DataFrame.
len(df.index)
### DATA FILTERING, SEARCH ###
# List unique values in a DataFrame column.
# (Series.unique() replaces the old pd.unique(col.ravel()) idiom.)
df.column_name.unique()
# Grab DataFrame rows where a column has certain values.
# NOTE: original gist defined `valuelist` but filtered on `value_list`;
# the names must match.
value_list = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]
# Grab DataFrame rows where a column does NOT have certain values.
value_list = ['value1', 'value2', 'value3']
df = df[~df.column.isin(value_list)]
# Select from DataFrame using criteria from multiple columns.
newdf = df[(df['column_one'] > 2004) & (df['column_two'] == 9)]
# Get the top n rows for each group of columns in a sorted dataframe.
# (Make sure the dataframe is sorted first.)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# Group by time (hour/minute) & another column.
times = pd.to_datetime(df.datetime)
df_grp = df.groupby([times.dt.hour, '<other column name>'])
# ...and SUM one column per group (bracket indexing keeps this valid Python).
df_grp = df.groupby([times.dt.hour, '<other column name>'])['<column to sum>'].sum()
# FLATTEN column headers after GROUPBY.
# https://stackoverflow.com/questions/33004573/after-groupby-how-to-flatten-column-headers
df.reset_index()
# Grab DataFrame rows where a specific column is null / not null.
newdf = df[df['column'].isnull()]
# Select from DataFrame using multiple keys of a hierarchical index.
df.xs(('index level 1 value', 'index level 2 value'), level=('level 1', 'level 2'))
#### MODIFY, SORT, SLICE, RENAME DATA FRAME ####
# Delete a column from a DataFrame.
del df['column']
# Rename several DataFrame columns.
df = df.rename(columns={
    'col1 old name': 'col1 new name',
    'col2 old name': 'col2 new name',
    'col3 old name': 'col3 new name',
})
# Lower-case all DataFrame column names.
df.columns = [c.lower() for c in df.columns]
# Even fancier column re-naming: keep only the part after the last '.'
# (lower-casing is just one example of a per-name transform).
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
# Working with text data in pandas -- full list of .str functions:
# http://pandas.pydata.org/pandas-docs/stable/text.html
# Slice values in a DataFrame column (aka Series).
df.column.str[0:2]
# Lower-case everything in a DataFrame column.
df.column_name = df.column_name.str.lower()
# Sort a dataframe by multiple columns.
# (DataFrame.sort() was deprecated in 0.17 and later removed --
# the gist's own note "Seem stop working" -- use sort_values.)
df = df.sort_values(by=['col1', 'col2', 'col3'], ascending=[True, False, True])
# Change all NaNs to None (useful before loading to a db).
df = df.where(pd.notnull(df), None)
# Pivot data (with flexibility about what becomes a column
# and what stays a row). Syntax works on pandas >= 0.14.
pd.pivot_table(
    df, values='cell_value',
    index=['col1', 'col2', 'col3'],  # these become the row index
    columns=['col4'])                # values in this column become their own column
# Change the data type of a DataFrame column.
df.column_name = df.column_name.astype(np.int64)
# Get rid of non-numeric values throughout a DataFrame
# (raw string so the regex is passed through verbatim).
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace(r'[^0-9]+.-', '', regex=True)
# Set DataFrame column values based on other column values.
# (.loc avoids chained assignment, which can silently write to a copy.)
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), 'column_to_change'] = new_value
# Clean up missing values in multiple DataFrame columns.
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99',
})
# Concatenate two DataFrame columns into a new, single column
# (useful when dealing with composite keys, for example).
df['newcol'] = df['col1'].astype(str) + df['col2'].astype(str)
# Calculations with DataFrame columns that have missing values.
# Below, swap in 0 for df['col1'] cells that contain null.
df['new_col'] = np.where(pd.isnull(df['col1']), 0, df['col1']) + df['col2']
# Split delimited values in a DataFrame column into two new columns.
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
# Collapse hierarchical column indexes.
df.columns = df.columns.get_level_values(0)
# Convert a Django queryset to a DataFrame.
qs = DjangoModelName.objects.all()
q = qs.values()
df = pd.DataFrame.from_records(q)
# Create a DataFrame from a Python dictionary.
df = pd.DataFrame(list(a_dictionary.items()), columns=['column1', 'column2'])
# Append/insert a row into a DataFrame using loc[].
# (original had a typo: "lens" -> len)
df.loc[len(df.index)] = new_row_values  # list with one value per column
# Append a column using loc too; 'C' is the new column.
df.loc[:, 'C'] = new_column_values  # list with one value per row
# DataFrame - drop columns.
claims = claims.drop(['Measure Start Date', 'Measure End Date'], axis=1)
# DataFrame - replace content in columns: strip '%' signs and
# convert the percentage strings to fractions.
origin1 = '%'
claims[['Percent Spending Hospital', 'Percent Spending State']] = (
    claims[['Percent Spending Hospital', 'Percent Spending State']]
    .replace(origin1, '', regex=True).astype('float') / 100
)
# DataFrame - filter a datetime index.
# By month and year:
close_px[(close_px.index.year == 2010) & (close_px.index.month == 10)]
# By date range:
close_px[('2011-06-01' <= close_px.index) & (close_px.index <= '2011-06-16')]
# By weekday (0=Monday ... 4=Friday):
close_px[('2011-06-01' <= close_px.index) &
         (close_px.index <= '2011-06-16') &
         (close_px.index.weekday == 4)]
# DataFrame - resample a datetime index ('BM' = business month end).
# NOTE: modern pandas returns a Resampler object; chain an aggregation.
close_px.resample('BM').last()
# DataFrame - shift data UP or DOWN along the index.
close_px.shift(1, 'BM')  # shift the index by 1 business month; no NaNs introduced
close_px.shift(1)        # push all values down 1 row; NaN appears in the first row
# DataFrame - drop duplicates on certain columns.
df.drop_duplicates(['states', 'years'])
# DataFrame pivot - several equivalent spellings:
df.pivot(index='states', columns='years', values='data1')
# (positional pivot args were removed in pandas 2.0; prefer keywords)
df.pivot('states', 'years', 'data1')
pd.pivot_table(df, index='states', columns='years', values='data1')
pd.pivot_table(df, index=['states'], columns=['years'], values='data1')
# ...with an aggregation function:
pd.pivot_table(df, index=['states'], columns=['years'], values='data1', aggfunc=np.mean)
# Cut a column into 2 equal-COUNT (quantile) buckets.
# (pd.cut gives equal-width bins; pd.qcut gives equal-count bins.)
pd.qcut(df['data1'], 2)
# To select the ith row, use iloc:
df_test.iloc[0]
# To select the ith value in the 'Btime' column:
df_test['Btime'].iloc[0]
# Add a column derived from others.
# Ref: https://coderwall.com/p/dt4yja/adding-columns-to-a-pandas-dataframe
# (DataFrame.from_csv was removed; pd.read_csv with index_col=0 is equivalent.)
df = pd.read_csv('my_data.csv', index_col=0)  # fake data
df['diff_A_B'] = df['A'] - df['B']
# Convert an existing float column to a percentage string for display.
df['pct'] = pd.Series(['{0:.2f}%'.format(val * 100) for val in df['pct']], index=df.index)
# Get the rows where the value of a certain column IS an empty list.
# Ref: https://stackoverflow.com/questions/42143116/how-to-get-rows-containg-empty-list-in-a-data-frame-columns-in-python
# test.B.apply(lambda c: c == []) yields a boolean mask, e.g.:
#   0     True
#   1    False
#   Name: B, dtype: bool
# Indexing with that mask keeps only the empty-list rows:
test[test.B.apply(lambda c: c == [])]
# Find the difference between two dataframes df1, df2
# based on column 'id' (rows of df1 whose id is not in df2):
df1[~df1.id.isin(df2.id)]
# Update/create a column based on an IF condition on another column.
df['elderly'] = np.where(df['age'] >= 50, 'yes', 'no')
# Flag (1/0) whether a value in one dataframe column exists
# in the other dataframe's column.
df1.name.isin(df2.IDs).astype(int)
Source: GitHub gist stevenvo/c66eeb3c970f8b86b0bf5ff486a78e39