Skip to content

Instantly share code, notes, and snippets.

@Maecenas
Last active May 3, 2018 02:25
Show Gist options
  • Save Maecenas/4df452bdf626a3ae4c43e616a6fb87d3 to your computer and use it in GitHub Desktop.
Save Maecenas/4df452bdf626a3ae4c43e616a6fb87d3 to your computer and use it in GitHub Desktop.
Data Processing Using Python
# 时间索引
>>> import pandas as pd
>>> index = pd.date_range('20160503', periods=5)
>>> index
DatetimeIndex(['2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06',
'2016-05-07'],
dtype='datetime64[ns]', freq='D')
# 数据显示
>>> import numpy as np
>>> dates = pd.DataFrame(np.random.randn(5,3),index=index,columns=list('ABC'))
>>> dates
A B C
2016-05-03 0.320522 -0.854196 -0.477794
2016-05-04 -0.736698 0.220931 0.878340
2016-05-05 1.002439 -0.422472 -1.027093
2016-05-06 1.792256 0.322563 0.285416
2016-05-07 -2.291344 0.633183 1.120919
>>> dates.index # 获取索引
DatetimeIndex(['2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06',
'2016-05-07'],
dtype='datetime64[ns]', freq='D')
>>> dates.columns # 列名
Index([u'A', u'B', u'C'], dtype='object')
>>> dates.values # 返回数组,从这里可以看出来DataFrame与Series都是以ndarray为基础的
array([[ 0.32052247, -0.85419619, -0.47779368],
[-0.73669815, 0.22093113, 0.87834037],
[ 1.00243859, -0.42247204, -1.02709299],
[ 1.79225566, 0.32256317, 0.28541609],
[-2.2913443 , 0.63318336, 1.12091912]])
>>> dates.describe # 描述
<bound method DataFrame.describe of A B C
2016-05-03 0.320522 -0.854196 -0.477794
2016-05-04 -0.736698 0.220931 0.878340
2016-05-05 1.002439 -0.422472 -1.027093
2016-05-06 1.792256 0.322563 0.285416
2016-05-07 -2.291344 0.633183 1.120919>
# 数据筛选
>>> dates.head(1) # 切片,前一行
A B C
2016-05-03 0.320522 -0.854196 -0.477794
>>> dates.tail(1) # 最后一行
A B C
2016-05-07 -2.291344 0.633183 1.120919
>>> dates['2016-05-03':'2016-05-05'] # 通过索引切片
A B C
2016-05-03 0.320522 -0.854196 -0.477794
2016-05-04 -0.736698 0.220931 0.878340
2016-05-05 1.002439 -0.422472 -1.027093
>>> dates.loc['2016-05-03':'2016-05-05', ['A', 'B']] # 区域选择,定义索引,字段名
A B
2016-05-03 0.320522 -0.854196
2016-05-04 -0.736698 0.220931
2016-05-05 1.002439 -0.422472
>>> dates = pd.DataFrame(np.random.randn(5,3))
>>> dates
0 1 2
0 -0.814773 0.828948 1.279713
1 1.010236 -1.063719 -1.010884
2 -0.206223 -1.756636 -0.777366
3 -0.253049 0.955689 -1.735237
4 -1.269495 -0.321743 0.929069
>>> dates.at[0,0] # 定位到单元格
-0.81477290050615703
>>> dates.iloc[1:4,[1,]] # 通过数字切片loc
B
2016-05-04 -1.002059
2016-05-05 -0.298642
2016-05-06 -1.042664
>>> dates.iat[1,1] # 数字索引定位到单元格
-1.0020589880335706
>>> dates[dates.index > '2016-05-05'] # 条件筛选
A B C
2016-05-06 -0.334060 -1.042664 0.999093
2016-05-07 -2.051357 0.713025 -0.792198
>>> dates[(dates.index > '2016-05-05') & (dates.A < 0)] # 组合筛选
A B C
2016-05-06 -0.334060 -1.042664 0.999093
2016-05-07 -2.051357 0.713025 -0.792198
>>> dates[dates.A > 0].B # 返回Series
2016-05-06 -0.139405
2016-05-07 -1.437701
Freq: D, Name: B, dtype: float64
>>> dates[dates.A>0].B.mean() # 计算平均值
-0.78855298332911494
>>> len(dates) # 行数
5
>>> import numpy as np
>>> np.sign(np.diff(dates.A))
array([ 1., -1., 1., -1.])
>>> status = np.sign(np.diff(dates.A))
>>> status[np.where(status == 1.)].size # 算变更类型未1的数量
2
>>> dates.sort(columns='A') # 排序
__main__:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
A B C
2016-05-03 -0.695635 0.077077 0.694779
2016-05-05 -0.641996 1.420200 0.086330
2016-05-04 -0.594039 -0.842575 -0.046909
2016-05-07 0.667998 -1.437701 -0.196523
2016-05-06 1.176916 -0.139405 -0.592849
>>> dates.groupby('A').count() # 分组,分组的字段作为索引
B C
A
-0.695635 1 1
-0.641996 1 1
-0.594039 1 1
0.667998 1 1
1.176916 1 1
>>> dates.groupby('A').sum() # 分组计算和
B C
A
-0.695635 0.077077 0.694779
-0.641996 1.420200 0.086330
-0.594039 -0.842575 -0.046909
0.667998 -1.437701 -0.196523
1.176916 -0.139405 -0.592849
sum
# 数据合并
>>> bdates = pd.DataFrame(np.random.randn(2,3),columns=list('ABC'))
>>> bdates
A B C
0 -0.043208 0.332842 -0.390034
1 -0.318745 0.013284 0.147836
>>> dates
A B C
2016-05-03 -0.695635 0.077077 0.694779
2016-05-04 -0.594039 -0.842575 -0.046909
2016-05-05 -0.641996 1.420200 0.086330
2016-05-06 1.176916 -0.139405 -0.592849
2016-05-07 0.667998 -1.437701 -0.196523
>>> dates.append(bdates) # 追加
A B C
2016-05-03 00:00:00 -0.695635 0.077077 0.694779
2016-05-04 00:00:00 -0.594039 -0.842575 -0.046909
2016-05-05 00:00:00 -0.641996 1.420200 0.086330
2016-05-06 00:00:00 1.176916 -0.139405 -0.592849
2016-05-07 00:00:00 0.667998 -1.437701 -0.196523
0 -0.043208 0.332842 -0.390034
1 -0.318745 0.013284 0.147836
>>> pieces = [dates[:2], dates[len(dates)-1:]
>>> pd.concat(pieces) # 合并片段
A B C
2016-05-03 -0.695635 0.077077 0.694779
2016-05-04 -0.594039 -0.842575 -0.046909
2016-05-07 0.667998 -1.437701 -0.196523
>>> pd.concat(pieces, ignore_index=True) # 忽略不想同的字段,用NaN填充
A B C
0 -0.695635 0.077077 0.694779
1 -0.594039 -0.842575 -0.046909
2 0.667998 -1.437701 -0.196523
>>> pd.merge(dates, bdates, on='A') # join合并,类比sql语句
Empty DataFrame
Columns: [A, B_x, C_x, B_y, C_y]
Index: []
# ndarray 多维数组
>>> import numpy as np
>>> np.ones((3,4)) # 自动生成3*4的二维数组,全为1
array([[ 1., 1., 1., 1.],
[ 1., 1., 1., 1.],
[ 1., 1., 1., 1.]])
>>> np.array([[1,2], [3,4]])
array([[1, 2],
[3, 4]])
>>> np.zeros((1,2)) # 生成全为0
array([[ 0., 0.]])
>>> np.arange(1,10,1) # 类似于range生成列表
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
# ndarray 运算
>>> aarr = np.array([[2,2], [3,3]])
>>> barr = np.array([[3,4], [5,6]])
>>> aarr * barr # 直接对同位置的进行运算,+ - * / % +=等
array([[ 6, 8],
[15, 18]])
>>> barr > 5 # bool运算
array([[False, False],
[False, True]], dtype=bool)
>>> barr.shape # 获取数组行列数结构
(2, 2)
>>> barr.reshape(1,4) # 重排数据结构
array([[3, 4, 5, 6]])
>>> barr.sum() # 算和
18
>>> barr.sum(axis=0) # 算列的和
array([ 8, 10])
>>> barr
array([[3, 4],
[5, 6]])
>>> barr.sum(axis=1) # 算行的和 axis 0 纵轴 1 横轴
array([ 7, 11])
>>> c = np.array([[3,4], [5,6]])
>>> np.where(c > 5, aarr, barr) # 筛选组合,判断c的同位置元素如果大于5则选aarr同位置的,否则选barr的
array([[3, 4],
[5, 3]])
>>> def func(i, j):
... return (i+1) * (j+1)
...
>>> np.fromfunction(func, (9,9)) # 通过1个函数来创建数组,第二个参数为数组的行与列数,传给func的参数行列的索引
array([[ 1., 2., 3., 4., 5., 6., 7., 8., 9.],
[ 2., 4., 6., 8., 10., 12., 14., 16., 18.],
[ 3., 6., 9., 12., 15., 18., 21., 24., 27.],
[ 4., 8., 12., 16., 20., 24., 28., 32., 36.],
[ 5., 10., 15., 20., 25., 30., 35., 40., 45.],
[ 6., 12., 18., 24., 30., 36., 42., 48., 54.],
[ 7., 14., 21., 28., 35., 42., 49., 56., 63.],
[ 8., 16., 24., 32., 40., 48., 56., 64., 72.],
[ 9., 18., 27., 36., 45., 54., 63., 72., 81.]])
>>> np.add(aarr, barr) # ufunc的数组+
array([[5, 6],
[8, 9]])
>>> np.add.accumulate([1,2,3]) # reduce的方式对数组求和,并返回求和后的新的数组
array([1, 3, 6])
# Series 列 有序字段
>>> import pandas as pd
>>> pd.Series([1,2.0,'a']) # 自动增加索引,类似与字典的key-value结构
0 1
1 2
2 a
dtype: object
>>> aser = pd.Series([1,2,3], index=['a', 'b', 'c']) # 自定义索引名称
>>> aser
a 1
b 2
c 3
dtype: int64
>>> aser.index
Index([u'a', u'b', u'c'], dtype='object') # 获取Series的索引
>>> aser.values
array([1, 2, 3]) # 获取Series的值,返回数组对象
>>> aser['b'] # 通过索引取值
2
>>> aser * 2 # 运算,可类比ndarray的运算
a 2
b 4
c 6
dtype: int64
>>> import numpy as np
>>> np.exp(aser) # 计算自然对数的n次方
a 2.718282
b 7.389056
c 20.085537
dtype: float64
>>> data = {'a':1, 'b':2}
>>> index = ['a', 'b', 'c'] # 数据对齐,如果没有对应索引的数据,自动填充NaN
>>> bser = pd.Series(data, index=index)
>>> bser
a 1.0
b 2.0
c NaN
dtype: float64
>>> pd.isnull(bser) # 判断是否有NaN元素,返回一个新的结果Series
a False
b False
c True
dtype: bool
>>> cser = pd.Series({'a':2,'b':5,'d':10})
>>> bser + cser # 运算时如果有不相称的索引也会对齐
a 3.0
b 7.0
c NaN
d NaN
dtype: float64
>>> aser.name # 名称属性,分别对应Series的名称与索引名称
>>> aser.index.name
>>> aser.name = 'names'
>>> aser.index.name = 'col'
>>> aser
col
a 1
b 2
c 3
Name: names, dtype: int64
# DataFrame 数据帧 表
>>> data = {'col1': [1,2,3], 'col2': [3,4,5]}
>>> adf = pd.DataFrame(data)
>>> adf
col1 col2
0 1 3
1 2 4
2 3 5
>>> adf['col1'] # 通过列名获取单列数据,返回Series对象
0 1
1 2
2 3
Name: col1, dtype: int64
>>> adf.col1 # . 也可以用
0 1
1 2
2 3
Name: col1, dtype: int64
>>> adf.ix[0] # 通过索引获取一行数据,返回Series
col1 1
col2 3
Name: 0, dtype: int64
>>> adf.col1 = 'admin' # 重写一列的值
>>> adf
col1 col2
0 admin 3
1 admin 4
2 admin 5
>>> adf['col1'] = [10,9,8]
>>> adf
col1 col2
0 10 3
1 9 4
2 8 5
>>> del adf['col1'] # 删除一列
>>> adf
col2
0 3
1 4
2 5
>>> adf.index.name = 'index' # 索引名
>>> adf
col2
index
0 3
1 4
2 5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment