Pandas tutorial
# 10_generation
# 11_filter
# 12_setting_value
# 13_io
# 15_groupby
# 16_merge
# 17_pivot_reshape
# 18_misc
# 19_type
# 21_category
# 25_time_series
# creation
df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df = pd.read_csv('file.csv')
df.to_csv('name.csv', encoding='utf-8')
# df info
df.columns.values # DF info
nrow = df.shape[0] # DF info
# filter
df.loc[:, ['col_1','col_2']] # select columns by label
df.iloc[[2,3]] # select rows by position; double brackets return a DataFrame instead of a Series
df.iloc[:, [1,2]] # select columns by position
# update
df.loc[df.AAA >= 5,'BBB'] = -1 # update AAA: condition, BBB: columns to be changed
# iteration
for idx, row in df.iterrows():
    print([row.value1, row["value2"]])
df['value3'] = df.apply(lambda x: x.value1 + x.value2, axis=1)
grp = iris.groupby('Species').apply(np.sum)
# manage dataframe
df.reset_index()
df.sort_values(by=[('Group1', 'C')], ascending=False) # multi index sort
df1.dropna(how='any')
df['col'] = df['col'].fillna(-1).astype(int)
# creation of series
s = pd.Series([1,3,5,np.nan,6,8])
# dataframe from series
df = pd.DataFrame(s, columns=['supercolname'])
# creation with index, numpy data, and column name
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
# create from data without index
names = ['Bob','Jessica','Mary','John','Mel']
births = [968, 155, 77, 578, 973]
BabyDataSet = list(zip(names,births))
df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
# creation from dictionary
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
# output
# A B C D E F
# 0 1 2013-01-02 1 3 test foo
# 1 1 2013-01-02 1 3 train foo
# 2 1 2013-01-02 1 3 test foo
# 3 1 2013-01-02 1 3 train foo
# loc, iloc, ix
# loc select by names of columns or rows
df.loc[['row_2','row_3']] # select rows by label
df.loc[:, ['col_1','col_2']] # select columns by label
# iloc select by numerical index
df.iloc[[2,3]]
df.iloc[:, [1,2]]
# ix selected rows by position and columns by name; it is removed in modern pandas (see the loc/iloc sketch below)
df.ix[[3,7], "row_b"]
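# ix was removed in pandas 1.0. A minimal sketch of the equivalent mixed selection
# (positional rows, labelled column) on a hypothetical DataFrame:
import numpy as np
import pandas as pd
df_demo = pd.DataFrame(np.arange(40).reshape(10, 4), columns=['col_a', 'col_b', 'col_c', 'col_d'])
df_demo.iloc[[3, 7], df_demo.columns.get_loc('col_b')]  # rows by position, column by label
df_demo.loc[df_demo.index[[3, 7]], 'col_b']             # same result via labels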
# simple conditions
df3 = df2[df2['Time0']==2012]
df3 = df3[df3['Value1'] > 20000]
df3 = df3[df3['Country0'] == df3['Country1']]
df = df[pd.notnull(df['latitude'])] # not null
df[df['category'].str.contains('national')] # string condition
# multiple conditions
df[df['id'].isin(['109', '673'])]
df[(df['category'] == 'national') & (df['is_removed'] == '1')]
# negative conditions
df = df[~df['id'].isin(['1', '2', '3'])]
# by index
df['20130102':'20130104']
df[0:3]
# selected columns
df[['A','B']]
# combine index and column selection
df.loc['20130102':'20130104',['A','B']]
df.loc['20130102','A'] # return scalar
# select row close to some value
df.iloc[(df.CCC - aValue).abs().argsort()] # argsort gives positions, so use iloc (ix is removed)
# Column names
df.columns.values
# column rename
df.rename(columns={"uname":"username"}, inplace=True)
# select columns
df.loc[:, ['col_1','col_2']]
# drop columns
df.drop(columns=['B', 'C'])
# single value
df.at[dates[0],'A'] = 0
# column
df.loc[:,'D'] = np.array([5] * len(df))
df['logic'] = np.where(df['AAA'] > 5, 'high', 'low')
# column rename
df.rename(columns={"uname":"username"}, inplace=True)
# update information
df.loc[df.AAA >= 5,'BBB'] = -1 # AAA: condition, BBB: columns to be changed
df.iloc[[3,7], df.columns.get_loc('BBB')] = -1 # positional rows + labelled column (ix is removed)
# group by update 1: update only the row that satisfies a per-group condition
moras = moras.reset_index()
tmp = moras.groupby(['wid'])["mid"].idxmax()
moras.loc[tmp, "pitch_change"] = np.nan # idxmax returns index labels, so use loc
# group by update 2: assign a unique id within each group
df['domainId'] = df.groupby('orgid')['orgid'].rank(method='first')
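# A minimal sketch of what rank(method='first') produces, on hypothetical data:
demo = pd.DataFrame({'orgid': [10, 10, 20, 10, 20]})
demo['domainId'] = demo.groupby('orgid')['orgid'].rank(method='first')
# demo['domainId'] -> [1.0, 2.0, 1.0, 3.0, 2.0]  (rows numbered within each orgid group)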
#----------
# deleting
#----------
# column
del df[name] # column
# row
df.drop(df.index[[1,3]], inplace=True)
df = df[(df.a == 2)]
df = df[~np.isnan(df['corr'])] # delete NaN
#------------
# Import
#------------
#CSV
df = pd.read_csv('file.csv')
df = pd.read_csv('file.csv', header=0,index_col=0, quotechar='"',sep=':', na_values = ['na', '-', '.', ''])
# series
df = pd.concat([s1, s2], axis=1)
# Python dictionary by column
df = pd.DataFrame({'col0': [1.0, 2.0, 3.0, 4.0], 'col1': [100, 200, 300, 400]})
# python dictionary by row
df = pd.DataFrame.from_dict({'row0': {'col0': 0, 'col1': 'A'}, 'row1': {'col0': 1, 'col1': 'B'}}, orient='index')
df = pd.DataFrame.from_dict({'row0': [1, 1+1j, 'A'], 'row1': [2, 2+2j, 'B']}, orient='index')
#HDF5
pd.read_hdf('foo.h5','df')
#-------
# save
#------
df.to_csv('name.csv', encoding='utf-8')
df.to_hdf('foo.h5','df')
# http://sinhrks.hatenablog.com/entry/2014/10/13/005327
grp = iris.groupby('Species')
grp['len'].sum() #single column
grp.sum() # all column
grp[['width', 'len']].sum() # multiple columns
# custom functions; the function receives each group as a Series
grp[ 'Length'].apply(np.sum)
grp[ 'Length'].apply(lambda x: ",".join(x))
# multiple custom functions
grp.agg({'Length': [np.sum, np.mean], 'Width': [np.sum, np.mean]})
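# The dict form above yields MultiIndex columns. A sketch of named aggregation
# (pandas >= 0.25) with flat output column names (the names here are assumptions):
grp.agg(len_sum=('Length', 'sum'), len_mean=('Length', 'mean'), width_sum=('Width', 'sum'))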
# add new column based on groupby.
iris["avgren"] = grp["Length"].transform("sum")
# Complex cumulation
# Using the quarterly-frequency y-o-y growth rate, create level data.
df = pd.DataFrame(columns=['country_id', 'quarter', 'year', 'yoygrowth'])
df = df.sort_values(["country_id", "quarter", "year"])
def cumulative_func(df):
    results = []
    for group in df.groupby(["country_id", "quarter"]).indices.values():
        level = 1
        result = []
        for val in df.iloc[group].yoygrowth.values:
            level *= 1 + 0.01 * val
            result.append(level)
        results.append(pd.Series(result, index=group))
    # return pd.concat(results)
    return pd.concat(results).reindex(df.index)
df = df.reset_index(drop=True)
df["level"] = cumulative_func(df)
#http://pandas.pydata.org/pandas-docs/stable/merging.html
#--------------
#Concat
#---------------
# concat: stack DataFrames row-wise (axis=0, the default); use axis=1 to concat column-wise
frames = [df1, df2, df3]
result = pd.concat(frames)
# append rows; if the index is not meaningful, better to ignore it
result = df1.append(df2, ignore_index=True) # DataFrame.append is removed in pandas 2.0; prefer pd.concat
result = df1.append([df2, df3], ignore_index=True)
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
# Add row
result = df1.append(one_row, ignore_index=True)
# Join
result = pd.merge(left, right, how='left', on=['key1', 'key2'])
# how = 'left' / 'right' / 'outer' / 'inner'
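# A minimal sketch on hypothetical frames to show how 'how' behaves:
left = pd.DataFrame({'key1': ['K0', 'K1'], 'key2': ['K0', 'K0'], 'A': [1, 2]})
right = pd.DataFrame({'key1': ['K0', 'K1'], 'key2': ['K0', 'K1'], 'B': [3, 4]})
pd.merge(left, right, how='left', on=['key1', 'key2'])
# (K0, K0) matches -> B=3; (K1, K0) has no match in right -> B=NaN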
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html
long_df = pd.DataFrame({'vname': ['one', 'one', 'one', 'two', 'two','two','one', 'one', 'one', 'two', 'two','two'],
'country': ['A', 'B', 'C', 'A', 'B', 'C','A', 'B', 'C', 'A', 'B', 'C'],
'year': ['x1', 'x1', 'x1', 'x1', 'x1', 'x1','x2', 'x2', 'x2', 'x2', 'x2', 'x2'],
'val': [1, 2, 3, 4, 5, 6,7, 8, 9, 10, 11, 12]})
# reshape from long to wide
wide_df = long_df.pivot(index=['country','year'], columns='vname', values="val").reset_index()
# reshape from wide to long (neither "vname" nor "val" exists in wide_df; the result contains the columns [country, year, vname, val])
new_long_df = wide_df.melt(id_vars=['country','year'], var_name='vname', value_name="val")
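# If an (index, columns) pair is duplicated, pivot raises an error; pivot_table aggregates
# duplicates instead (the aggfunc below is an assumption, pick what fits the data):
wide_df2 = long_df.pivot_table(index=['country', 'year'], columns='vname', values='val',
                               aggfunc='mean').reset_index()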
# Iteration
for idx, row in df.iterrows():
    print([row.value1, row["value2"]])
df['value3'] = df.apply(lambda x: x.value1 + x.value2, axis=1)
# reset index
df.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
#http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reset_index.html
df.sort_values(by=[('Group1', 'C')], ascending=False) # multi index sort
#na
df1.dropna(how='any')
df1.fillna(value=5)
pd.isnull(df1)
df1[df1.x.notna()]
# histogram (frequency counts)
s.value_counts()
#change column type
df['name'] = df['name'].astype('str')
df['col'] = df['col'].fillna(-1).astype(int)
df['col'] = pd.to_numeric(df['col'])
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")
df["grade"].cat.categories = ["very good", "good", "very bad"]
# http://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries
# easy sample
# resample
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()
# time zone
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
ts_utc.tz_convert('US/Eastern')
# convert between timestamps and periods (monthly data)
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ps = ts.to_period()
ps.to_timestamp()
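# To actually aggregate the monthly series above to quarterly frequency, a resample sketch:
ts.resample('Q').mean()  # or .sum(), depending on the variable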
## full examples
##
# generate timestamp
from datetime import datetime, time, timedelta
pd.Timestamp(datetime(2012, 5, 1))
pd.Timestamp('2012-05-01')
pd.Timestamp(2012, 5, 1)
# generate Period (time span)
pd.Period('2011-01')
pd.Period('2012-05', freq='D')
# generate series using timestamp/period
dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03')]
ts = pd.Series(np.random.randn(3), dates)
# convert date-like object to timestamp
pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None]))
pd.to_datetime(['2005/11/23', '2010.12.31'])
pd.to_datetime(['04-01-2012 10:00'], dayfirst=True)
pd.to_datetime('2010/11/12')
pd.Timestamp('2010/11/12')
# convert pandas columns to a timestamp index (to_datetime assembles year/month/day/hour columns)
df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5], 'hour': [2, 3]})
df.index = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
# to_timestamp error handling
pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
pd.to_datetime(['2009/07/31', 'asd'], errors='ignore')
pd.to_datetime(['2009/07/31', 'asd'], errors='coerce') # convert errors to NAT
# get range of timestamps
index = pd.date_range('2000-1-1', periods=1000, freq='M')
index = pd.bdate_range('2012-1-1', periods=250) # business day
rng = pd.date_range(datetime(2011, 1, 1), datetime(2012, 1, 1))
pd.Timestamp.min
pd.Timestamp.max
rng = pd.date_range(start, end, freq='BM')
# access range
ts['1/31/2011']
ts[datetime(2011, 12, 25):]
ts['10/31/2011':'12/31/2011']
ts['2011']
ts['2011-6']
dft['2013-1':'2013-2-28 00:00:00']
dft['2013-1-15':'2013-1-15 12:30:00']
# date shift (d is a datetime or Timestamp)
d + pd.tseries.offsets.DateOffset(months=4, days=5)
from pandas.tseries.offsets import *
d + DateOffset(months=4, days=5)
d - 5 * BDay()
d + BMonthEnd()
offset = BMonthEnd()
offset.rollforward(d)
offset.rollback(d)
d + Week()
d + Week(weekday=4)
d + YearEnd()
d + YearEnd(month=6)
rng = pd.date_range('2012-01-01', '2012-01-03')
s = pd.Series(rng)
rng + DateOffset(months=2)
s + DateOffset(months=2)
s - Day(2)
td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31'))
# Holiday
from pandas.tseries.holiday import USFederalHolidayCalendar
bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar())
dt = datetime(2014, 1, 17)
dt + bday_us
# business hour
bh = BusinessHour()
pd.Timestamp('2014-08-01 10:00') + bh
pd.Timestamp('2014-08-01 10:00') + BusinessHour(2)
# custom business hour
bh = BusinessHour(start='11:00', end=time(20, 0))
pd.Timestamp('2014-08-01 09:00') + bh
# anchored offsets
pd.Timestamp('2014-01-02') + MonthBegin(n=4)
pd.Timestamp('2014-01-31') + MonthEnd(n=1)
pd.Timestamp('2014-01-02') + MonthEnd(n=0)
# shift/lag
ts = ts[:5]
ts.shift(1)
ts.shift(5, freq=offsets.BDay())
ts.shift(5, freq='BM')
# frequency conversion
dr = pd.date_range('1/1/2010', periods=3, freq=3 * offsets.BDay())
ts = pd.Series(np.random.randn(3), index=dr)
ts.asfreq(BDay())
ts.asfreq(BDay(), method='pad')
# resampling (e.g. 1min -> 5min)
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
df.resample('M', on='date').mean() # resample a DataFrame on a date column
ts.resample('5Min').sum()
ts.resample('5Min').mean()
ts.resample('5Min').ohlc() # open high low close
ts.resample('5Min', closed='right').mean()
ts.resample('5Min', closed='left').mean()
# offset alias
# B business day frequency
# D calendar day frequency
# W weekly frequency
# M month end frequency
# BM business month end frequency
# MS month start frequency
# BMS business month start frequency
# Q quarter end frequency
# A year end frequency
# H hourly frequency
# T, min minutely frequency
# S secondly frequency
# upsampling (e.g. 5min to min)
ts[:2].resample('250L').ffill()
# sparse resampling
# omitted
################
## Time span
#################
# period
p = pd.Period('2012', freq='A-DEC')
p + 1 #Period('2013', 'A-DEC')
p = pd.Period('2014-07-01 09:00', freq='H')
p + Hour(2)
p + timedelta(minutes=120)
# period range
prng = pd.period_range('1/1/2011', '1/1/2012', freq='M')
pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
pd.period_range(start='2014-01', freq='3M', periods=4) # PeriodIndex(start=..., periods=...) is removed in modern pandas; use period_range
ps = pd.Series(np.random.randn(len(prng)), prng)
idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H')
# PeriodIndex partial string indexing
ps['2011-01']
ps[datetime(2011, 12, 25):]
ps['10/31/2011':'12/31/2011']
ps['2011']
dfp['2013-01-01 10H']
dfp['2013-01-01 10H':'2013-01-01 11H']
p = pd.Period('2011', freq='A-DEC')
# convert period to other freq
p.asfreq('M', how='start')
# time zone
rng = pd.date_range('3/6/2012 00:00', periods=10, freq='D') # naive
rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', tz='Europe/London')
rng_utc = pd.date_range('3/6/2012 00:00', periods=10, freq='D', tz='UTC')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
ts_utc.tz_convert('US/Eastern')
rng_eastern = rng_utc.tz_convert('US/Eastern')
didx.tz_localize(None) # remove timezone (didx: any tz-aware DatetimeIndex)
# convert timezone from US/Eastern to Asia/Tokyo
df['timestamp'] = pd.to_datetime(df.DATE) #convert to datetime
df.index = df.timestamp # convert to datetimeindex
ind =df.index
ind = ind.tz_localize('US/Eastern') # convert to tz-aware US time
ind = pd.to_datetime(ind.values) # convert to UTC without timezone
ind = ind + pd.tseries.offsets.DateOffset(hours=9) # convert to Tokyo time
df.index = ind
df.timestamp = ind
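# A simpler sketch of the same US/Eastern -> Tokyo conversion, keeping the index tz-aware:
idx = pd.DatetimeIndex(pd.to_datetime(df.DATE)).tz_localize('US/Eastern')
df.index = idx.tz_convert('Asia/Tokyo')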
# interpolation
ts.interpolate()
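# A minimal sketch: interpolate() fills NaN by linear interpolation between valid points
s = pd.Series([1.0, np.nan, np.nan, 4.0])
s.interpolate()  # -> [1.0, 2.0, 3.0, 4.0]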