Skip to content

Instantly share code, notes, and snippets.

@MartinBodocky
Last active August 29, 2015 14:00
Show Gist options
  • Save MartinBodocky/11388081 to your computer and use it in GitHub Desktop.
Save MartinBodocky/11388081 to your computer and use it in GitHub Desktop.
Learning about Python library Pandas
__author__ = 'martinbodocky'
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
obj = Series([4, -7, 5, 3])
obj.values
obj.index
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2[obj2 > 0]
obj2 ** 2
'b' in obj2
'e' in obj2
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
pd.isnull(obj4)
pd.notnull(obj4)
obj4.isnull()
obj5 = obj3 + obj4
obj5.name = 'population'
obj5.index.name = 'state'
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
#data frames
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
DataFrame(data, columns=['year', 'state', 'pop'])
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four', 'five'])
frame2.ix['three']
frame2.year
frame2['state']
frame2['debt'] = 16.5
frame2['debt'] = np.arange(5.)
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']
# create data frame from dict of dict format
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
frame3.T
DataFrame(pop, index=[2001, 2002, 2003])
pdata = {'Ohio': frame3['Ohio'][:-1],
'Nevada': frame3['Nevada'][:2]}
DataFrame(pdata)
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3.values
#Index Objects
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index[1:]
#index objects are immutable
index = pd.Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index=index)
obj2.index is index
#reindexing
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj3 = obj.reindex(['a', 'b', 'r', 'k', 'e'], fill_value=0)
# reindexing methods
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a', 'c', 'd'],
columns=['Ohio', 'Texas', 'California'])
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
#reindexing with filling
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
columns=states)
frame.ix[['a', 'b', 'c', 'd'], states]
#dropping entries from an axis
obj = Series(np.arange(5.), index = ['a','b','c','d','e'])
new_obj = obj.drop('c')
data = DataFrame(np.arange(16).reshape((4,4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data.drop(['Colorado', 'Ohio'])
data.drop('two', axis=1)
#index, selection and filtering
obj = Series(np.arange(4.), index=['a','b','c','d'])
obj['b']
obj[1]
obj[['a','c']]
obj[obj>2]
data = DataFrame(np.arange(16).reshape((4,4)),
index = ['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data['two']
data[['three', 'one']]
data[data['three'] > 5]
data < 5
data.ix['Colorado', ['two', 'three']]
data[data<5]=0
data.ix[['Colorado', 'Utah'], [3,0,1]]
data.ix[:'Utah']
data.ix['Utah':]
#Arithemtic and data alignment
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
#we add together only those which are the same
s1+s2
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2
#Arithemtic methods with fill values
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1 + df2
#when value missing, we put 0 and add dataframes together
df1.add(df2, fill_value=0)
#operatitions between data frame and series
arr = np.arange(12.).reshape((3,4))
arr - arr[0]
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas','Oregon'])
series = frame.ix[0]
# remove values for Utah
frame - series
#create a new series and add to data frame
series2 = Series(range(3), index=list('bef'))
frame + series2
series3 = frame['d']
frame.sub(series3, axis=0)
#function application and mapping
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame)
f = lambda x : x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)
def f(x):
return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)
#Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
frame.sort_index()
frame.sort_index(axis=1)
obj = Series([4, 7, -3, 2])
obj.order()
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.order()
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_index(by='b')
#ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()
#axis indexes with duplicate values
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique
obj['a']
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
#Summarizing and computing descriptive statistics
# all statistics methods consider to exclude missing values
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df.sum()
df.sum(axis=1)
df.mean()
df.mean(axis=1)
df.mean(axis=1, skipna=False)
df.describe()
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()
#correlation and covariance - yahoo web service is not working
# import pandas.io.data as web
#
# all_data = {}
# for ticket in ['AAPL', 'IBM', 'MSFT','GOOG']:
# all_data[ticket] = web.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
#
# price = DataFrame({tic : data['Adj Close'] for tic, data in all_data.iteritems()})
# volume = DataFrame({tic : data['Volume'] for tic, data in all_data.iteritems()})
#unique values, value count and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
obj.value_counts()
pd.value_counts(obj.values, sort=False)
mask = obj.isin(['b', 'c'])
obj[mask]
#Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.isnull()
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]
data = DataFrame([[1., 6.5, 3.],
[1., NA, NA],
[NA, NA, NA],
[NA, 6.5, 3.]])
#drop each row which conrains NA
data.dropna()
#drop just row with all NA
data.dropna(how='all')
#dropping columns
data[4] = NA
data.dropna(axis=1, how='all')
#Filling in Missing Data
df = DataFrame(np.random.randn(7, 3))
df.ix[:4, 1]= NA; df.ix[:2,2]
df.fillna(0)
df.fillna({1:0.5, 3:-1})
#always returns a reference to the filled object
_ = df.fillna(0, inplace=True)
df = DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())
#Hierarchical Indexing
data = Series(np.random.randn(10),
index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data.index
data['b']
data['b':'c']
data.ix[['b','d']]
data.unstack()
data.unstack().stack()
frame = DataFrame(np.arange(12).reshape((4,3)),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'],
['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame['Ohio']
#Reordering and sorting levels
frame.swaplevel('key1', 'key2')
#summary by level
frame.sum(level = 'key2')
frame.sum(level = 'color', axis=1)
#Using a dataframe's columns
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
'd': [0, 1, 2, 0, 1, 2, 3]})
frame2 = frame.set_index(['c','d'])
frame.set_index(['c','d'], drop=False)
frame2.reset_index()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment