# ranfort77/python_for_data_analysis_chapter12.py

## python_for_data_analysis_chapter12.py
#
# references
#  [1] Python for Data Analysis - Wes McKinney

import numpy as np
import pandas as pd


#----------------------------------------------------------------------
# Categorical Data (Ref.[1] p.363)
#----------------------------------------------------------------------

#---- dimension tables
# `dim` is the lookup table of distinct labels and `values` holds integer
# positions into it; take() expands the positions back into labels.
dim = pd.Series(['apple', 'orange'])
values = pd.Series([0, 1, 0, 0, 0, 1, 0, 0])
dim.take(values)


#---- categorical type
fruits = 2 * ['apple', 'orange', 'apple', 'apple']
N = len(fruits)
df = pd.DataFrame(
    {
        'fruit': fruits,
        'basket_id': np.arange(N),
        'count': np.random.randint(3, 15, size=N),
        'weight': np.random.uniform(0, 4, size=N),
    },
    columns=['basket_id', 'fruit', 'count', 'weight'],
)

# Convert the column so that its underlying values become a Categorical.
df['fruit'] = df['fruit'].astype('category')
type(df['fruit'].values)  # categorical type

# A Categorical carries its distinct labels (categories) and the integer
# code of every element (codes).
df['fruit'].values.categories
df['fruit'].values.codes

# A Categorical can also be built directly from a sequence.
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])

# from_codes constructor: build from existing codes plus their categories.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes, categories)

# ordered=True marks the categories as ordered, in the order listed.
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)

# Turn a Categorical that had no ordering into an ordered one.
my_cats_2.as_ordered()


#---- computations with categoricals (Ref.[1] p.367)

# pd.qcut returns a Categorical object.
np.random.seed(12345)
draws = np.random.randn(1000)
bins = pd.qcut(draws, 4)
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

# Aggregation example: summary statistics of the draws per quartile.
bins = pd.Series(bins, name='quartile')
results = pd.Series(draws).groupby(bins).agg(['count', 'min', 'max'])
results = results.reset_index()
# The 'quartile' column is still a categorical Series.
results['quartile']

# better performance with categoricals (Ref.[1] p.369)
N = 1000000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))
categories = labels.astype('category')
# Compare the memory usage of the two representations.
labels.memory_usage()
categories.memory_usage()


#---- Categorical Methods (Ref.[1] p.370)
# See the table "Categorical methods for Series in pandas" (Ref.[1] p.372).

# Categorical accessor: cat
s = pd.Series(list('abcd') * 2)
cat_s = s.astype('category')
cat_s.values.codes
cat_s.values.categories
cat_s.cat.codes
cat_s.cat.categories

# cat.set_categories: use it to extend the set of categories.
actual_categories = list('abcde')
cat_s2 = cat_s.cat.set_categories(actual_categories)
# The extra category then shows up in statistics such as value_counts.
cat_s.value_counts()
cat_s2.value_counts()

# cat.remove_unused_categories: drop categories that are not used.
cat_s3 = cat_s.loc[cat_s.isin(['a', 'b'])]
cat_s3.cat.remove_unused_categories()

# dummy variable (one-hot encoding)
cat_s = pd.Series(list('abcd') * 2, dtype='category')
pd.get_dummies(cat_s)


#----------------------------------------------------------------------
# Advanced GroupBy Use (Ref.[1] p.373)
#----------------------------------------------------------------------

df = pd.DataFrame({
    'key': ['a', 'b', 'c'] * 4,
    'value': np.arange(12.),
})
g = df.groupby('key')['value']
g.mean()

#---- groupby.transform method
# Properties of groupby.transform:
#  1. If the function passed in is an aggregation, it yields one scalar
#     per group and that scalar is broadcast to the size of the group.
#  2. Otherwise the function must return an object of the same size as
#     the group it received.
#  3. The function must not mutate its input.

# group broadcast example
g.transform(lambda grp: grp.mean())
g.transform('mean')  # built-in agg function

# The groupby.transform and groupby.apply calls below are meant to give
# the same result.
def normalize(x):
    """Return *x* centred on its mean and divided by its standard deviation."""
    centred = x - x.mean()
    return centred / x.std()
g.transform(normalize)
# group_keys=False keeps the original row index on the apply() result.
# With the default group_keys=True, pandas >= 2.0 prepends the group key
# as an extra index level, so the output would no longer match transform().
df.groupby('key', group_keys=False)['value'].apply(normalize)

# The advantage of groupby.transform is the "unwrapped" group operation:
# calling a built-in aggregation such as 'mean' or 'sum' by name is faster
# than applying a Python callable as above.  Per Ref.[1] this is done
# through groupby.transform; the normalisation is then plain arithmetic
# between the broadcast results.
(df['value'] - g.transform('mean')) / g.transform('std')


#---- grouped time resampling
# The resample method is a groupby operation for time series.
N = 15
times = pd.date_range('2017-05-20 00:00', periods=N, freq='1min')
df = pd.DataFrame({'time': times, 'value': np.arange(N)})
df.set_index('time').resample('5min').count()

# When the frame carries a group key next to the timestamps, as in the
# dataframe below, the time grouping has to be given as a Grouper object.
df2 = pd.DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(N * 3.),
})
time_key = pd.Grouper(freq='5min')
resampled = (
    df2.set_index('time')
    .groupby(['key', time_key])
    .sum()
)


#----------------------------------------------------------------------
# Techniques for Method Chaining (Ref.[1] p.378)
#----------------------------------------------------------------------

# Suppose we have the following step-by-step computation.
rst = np.random.RandomState(123)
df = pd.DataFrame({'key': list('ababaabbab'),
                   'col1': rst.randn(10),
                   'col2': rst.randn(10)})
# Take the copy *after* filtering: adding a column to the bare filtered
# slice is what raises SettingWithCopyWarning.
df2 = df[df['col2'] < 0].copy()
df2['col1_demeaned'] = df2['col1'] - df2['col1'].mean()
result = df2.groupby('key').col1_demeaned.std()

#---- assign method
# Used in a method chain, assign removes the need for a temporary variable.
# The value is passed as a callable so that it is evaluated on the frame
# assign is called on (the filtered rows), which makes the new column the
# same as in the step-by-step version above.
result = (df[df['col2'] < 0]
          .assign(col1_demeaned=lambda x: x['col1'] - x['col1'].mean())
          .groupby('key').col1_demeaned.std())

# Callable indexing on the dataframe lets the filter join the chain too,
# so no step has to refer to `df` by name after the first one.
result = (df[lambda x: x['col2'] < 0]
          .assign(col1_demeaned=lambda x: x['col1'] - x['col1'].mean())
          .groupby('key').col1_demeaned.std())


#---- pipe method
# Suppose arbitrary functions have to be applied to a dataframe one after
# another, as below.
def add_n(obj, n):
    """Return ``obj + n``."""
    return obj + n


def sub_n(obj, n):
    """Return ``obj - n``."""
    return obj - n


df = pd.DataFrame({'c1': np.arange(5),
                   'c2': np.arange(5, 10)})
df2 = add_n(df, 3)
df3 = sub_n(df2, 2)

# With pipe the same sequence reads as a left-to-right chain.
df.pipe(add_n, n=3).pipe(sub_n, n=2)
	# End of script.  The tab-indented second copy of the statements above
	# that used to follow was dropped: it repeated them verbatim and its
	# stray indentation made the module fail to parse (IndentationError).