# DadgadCafe/np.py

## np.py
import numpy as np

# linspace: create evenly spaced points along a line segment

# Build a 2x3 integer matrix from a nested list.
# The rows must be separated by a comma, and the dtype has to be qualified
# with the numpy namespace (a bare `int32` is not a defined name).
arr = np.array([[1, 2, 3],
                [4, 5, 6]], dtype=np.int32)  # list to matrix

arr.ndim  # number of dimensions: 2
arr.shape  # (2, 3): 2 rows * 3 columns
arr.size  # number of elements: 6

# Array constructors: each takes the shape as a tuple plus an optional dtype.
np.zeros((3, 4), dtype=int)  # 3*4 matrix filled with 0

# Sized integer dtypes live in the numpy namespace (np.int16, not int16).
np.ones((3, 4), dtype=np.int16)  # 3*4 matrix filled with 1

# empty() only allocates memory; the contents are uninitialized and
# arbitrary, so do not rely on them being (close to) zero.
np.empty((3, 4), dtype=int)  # 3*4, uninitialized values

a = np.arange(10, 20, 2)  # array([10, 12, 14, 16, 18]); the stop value is excluded
a[0]  # 10

# A method chain may only continue on the next line inside parentheses.
a = (np.arange(10, 50, 2)    # 10 up to (not including) 50, step 2 -> 20 values
       .reshape((4, 5)))     # reshape to 4*5
a[0][0]  # 10
a[0, 0]  # same
a[0, 1:3]  # array([12, 14])

(np.linspace(1, 10, 20)  # 20 evenly spaced points from 1 to 10, both ends included
   .reshape((5, 4)))     # reshape to 5*4

# matrix operation
# Two 1-D operands of equal length; every operator below works element-wise.
a1 = np.arange(0, 5)    # array([0, 1, 2, 3, 4])
a2 = np.arange(10, 15)  # array([10, 11, 12, 13, 14])

a2 - a1  # difference:  array([10, 10, 10, 10, 10])
a2 + a1  # sum:         array([10, 12, 14, 16, 18])
a2 * a1  # product:     array([ 0, 11, 24, 39, 56])
a1 / a2  # true division, float result:
         # array([0., 0.09090909, 0.16666667, 0.23076923, 0.28571429])
a1 ** 2  # power:       array([ 0,  1,  4,  9, 16])
a1 < 0   # comparison gives a boolean array: all False here
np.sin(a1)  # universal functions are applied element by element

# Matrix product (rows times columns), as opposed to the element-wise `*`.
a = np.array([[1, 1],
              [0, 1]])
b = np.arange(4).reshape(2, 2)  # [[0, 1], [2, 3]]
np.dot(a, b)  # array([[2, 4], [2, 3]])
a.dot(b)      # method form, same result
np.dot([1, 2, 3], [4, 5, 6])  # inner product of 1-D inputs: 1*4 + 2*5 + 3*6 = 32

# 2*4 matrix of uniform random floats drawn from [0, 1).
a = np.random.random((2, 4))
a.sum()  # total over all elements
a.max()  # largest element
a.min()  # smallest element
a.sum(axis=0)  # collapse the rows    -> one sum per column (4 values)
a.sum(axis=1)  # collapse the columns -> one sum per row (2 values)


# Summary statistics and index helpers on a 3*4 matrix holding 2..13.
A = np.arange(2,14).reshape((3,4))
# array([[ 2,  3,  4,  5],
#        [ 6,  7,  8,  9],
#        [10, 11, 12, 13]])
np.argmax(A) # flat index of the largest value (13): 11
np.argmin(A) # flat index of the smallest value (2): 0
np.mean(A) # same as A.mean(): 7.5
np.median(A) # 7.5; note ndarray has no .median() method, only the np.median function
np.median(A, axis=0) # median of each column: array([6., 7., 8., 9.])
np.average(A) # 7.5 (unweighted here, so equal to the mean)
np.cumsum(A) # running total over the flattened array: [2, 2+3, 2+3+4, ...]
             # => array([ 2,  5,  9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
np.diff(A) # differences along the last axis: [[3-2, 4-3, 5-4], ...]
           # => array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
np.nonzero(A) # indices of the nonzero elements as (row indices, column indices),
              # i.e. positions (0, 0), (0, 1), ...
              # (array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
              #  array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

# Descending values 14..3 arranged as a 3*4 matrix.
# The chained call is wrapped in parentheses so it can span two lines.
A = (np.arange(14, 2, -1)
       .reshape((3, 4)))
# array([[14, 13, 12, 11],
#        [10,  9,  8,  7],
#        [ 6,  5,  4,  3]])
np.sort(A)  # sorts along the last axis, i.e. each row independently
# array([[11, 12, 13, 14],
#        [ 7,  8,  9, 10],
#        [ 3,  4,  5,  6]])
np.transpose(A)
A.T  # same
# array([[14, 10,  6],
#        [13,  9,  5],
#        [12,  8,  4],
#        [11,  7,  3]])
(A.T).dot(A)  # (4*3) dot (3*4) -> 4*4
# array([[332, 302, 272, 242],
#        [302, 275, 248, 221],
#        [272, 248, 224, 200],
#        [242, 221, 200, 179]])
np.clip(A, 5, 9)  # values > 9 become 9; values < 5 become 5
# array([[9, 9, 9, 9],
#        [9, 9, 8, 7],
#        [6, 5, 5, 5]])

# Iterating over a 2-D array: by row, by column, and element by element.
a = np.arange(4).reshape(2, 2)
a[0, 0]  # 0

# Plain iteration walks the first axis, so each step yields one row.
for row in a:
    print(row)

# Iterate over the transpose to visit the columns instead.
for column in np.transpose(a):
    print(column)

a.flatten()  # 1-D copy: array([0, 1, 2, 3])

# Visit every element in row-major order.
for item in a.ravel():
    print(item)


# Joining arrays.
a = np.array([1] * 3)
b = np.array([2] * 3)
np.vstack([a, b])  # stack as rows:  array([[1, 1, 1], [2, 2, 2]])
np.hstack([a, b])  # join end to end: array([1, 1, 1, 2, 2, 2])
a[:, None]  # None is np.newaxis: adds a column axis -> array([[1], [1], [1]])
# General form. a and b are 1-D, so axis 0 is their only axis and the
# result has 9 elements; with 2-D inputs axis=0 joins vertically and
# axis=1 joins horizontally.
np.concatenate([a, b, b], axis=0)


# Splitting a 3*4 matrix holding 0..11.
a = np.arange(12).reshape((3, 4))
np.hsplit(a, 2)
np.split(a, 2, axis=1)  # horizontal: cut the columns into 2 equal parts
# [array([[0, 1], [4, 5], [8, 9]]),
#  array([[2, 3], [6, 7], [10, 11]])]
np.vsplit(a, 3)
np.split(a, 3, axis=0)  # vertical: cut the rows into 3 equal parts
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[8, 9, 10, 11]])]

# split() requires equal parts; array_split() allows an uneven split.
# It must be applied to `a` (this section's matrix) to produce the
# output shown below.
np.array_split(a, 3, axis=1)  # 4 columns into 3 parts -> widths 2, 1, 1
# [array([[0, 1], [4, 5], [8, 9]]),
#  array([[2], [6], [10]]),
#  array([[3], [7], [11]])]

# Assignment binds a second name to the same array; copy() makes a new one.
a = np.arange(0, 4)
b = a
b is a  # True: both names refer to one object
c = np.copy(a)
a is c  # False: c is an independent array with equal contents


#pandas
import pandas as pd
# A Series is a labelled 1-D array; the NaN entry makes the dtype float64.
s = pd.Series([1, 3, np.nan, 5])
# 0    1.0
# 1    3.0
# 2    NaN
# 3    5.0
# dtype: float64

# Six consecutive calendar days starting at 2017-01-01.
dates = pd.date_range('20170101', periods=6)
# DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03',
#                '2017-01-04', '2017-01-05', '2017-01-06'],
#               dtype='datetime64[ns]', freq='D')

# 6*4 table of standard-normal samples, rows labelled by date.
# The sample output below is one possible draw.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list('abcd'),
)
#                    a         b         c         d
# 2017-01-01 -0.669733  0.091818  0.581845 -0.290370
# 2017-01-02  0.203958 -0.840011 -1.234419  1.567374
# 2017-01-03  0.761231 -0.712473  0.954426  2.002349
# 2017-01-04  0.477278  0.860596  0.867349  0.438903
# 2017-01-05 -1.431947  0.684325 -0.762821  0.815071
# 2017-01-06 -0.095380 -0.515609  0.184032 -0.482174

# Without explicit labels, rows and columns are numbered from 0.
pd.DataFrame(np.arange(12).reshape(3, 4))
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Build a DataFrame from a dict: each key becomes a column, and scalar
# values are broadcast to the length of the array-like columns (4 rows).
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.arange(4),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'foo'})
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  0   test  foo
# 1  1.0 2013-01-02  1.0  1  train  foo
# 2  1.0 2013-01-02  1.0  2   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
df2.dtypes  # dtype of each column (DataFrame has no `.type` attribute)
df2.index  # row labels
df2.columns  # column labels
len(df2)  # number of rows (DataFrame has no `.rows` attribute)
df2.values  # underlying data as a numpy array
df2.describe()  # summary statistics of the numeric columns
df2.T  # transpose
df2.sort_index(axis=1, ascending=False)  # sort by column name, descending
df2.sort_values(by='B')  # sort rows by the values in column B

# data selection
# Six daily rows, four integer columns holding 0..23 in row-major order.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),
                  index=dates,
                  columns=['A','B','C','D'])

'''
             A   B   C   D
2013-01-01   0   1   2   3
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''

# Select one column: by key or by attribute access.
df['A']
df.A
'''
2013-01-01     0
2013-01-02     4
2013-01-03     8
2013-01-04    12
2013-01-05    16
2013-01-06    20
Freq: D, Name: A, dtype: int64
'''

# A positional slice with [] selects rows (end excluded).
df[0:3]
'''
            A  B   C   D
2013-01-01  0  1   2   3
2013-01-02  4  5   6   7
2013-01-03  8  9  10  11
'''

# Row-label slice (both endpoints included); .loc makes the label-based
# intent explicit.
df.loc['20130102':'20130104']
'''
             A   B   C   D
2013-01-02   4   5   6   7
2013-01-03   8   9  10  11
2013-01-04  12  13  14  15
'''

# .loc selects by label: a single row ...
df.loc['20130102']
'''
A    4
B    5
C    6
D    7
Name: 2013-01-02 00:00:00, dtype: int64
'''

# ... all rows of some columns ...
df.loc[:,['A','B']]
'''
             A   B
2013-01-01   0   1
2013-01-02   4   5
2013-01-03   8   9
2013-01-04  12  13
2013-01-05  16  17
2013-01-06  20  21
'''

# ... or one row restricted to some columns.
df.loc['20130102',['A','B']]
'''
A    4
B    5
Name: 2013-01-02 00:00:00, dtype: int64
'''

# .iloc selects by integer position.
df.iloc[3,1] # 13
df.iloc[3:5, 1:3]
'''
             B   C
2013-01-04  13  14
2013-01-05  17  18
'''
df.iloc[[1,3,5],1:3]
'''
             B   C
2013-01-02   5   6
2013-01-04  13  14
2013-01-06  21  22
'''

# Mixed selection (rows by position, columns by label). The old `.ix`
# indexer was removed in pandas 1.0; translate the positions to labels
# and use .loc instead.
df.loc[df.index[:3], ['A','C']]
'''
            A   C
2013-01-01  0   2
2013-01-02  4   6
2013-01-03  8  10
'''

# Boolean indexing: keep the rows where the condition holds.
df[df.A>8]
'''
             A   B   C   D
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''


# Assigning values: start again from the 6*4 table holding 0..23.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])

# Set single cells, by position and by label.
df.iloc[2, 2] = 111
df.loc['20130101', 'B'] = 222
'''
             A    B    C   D
2013-01-01   0  222    2   3
2013-01-02   4    5    6   7
2013-01-03   8    9  111  11
2013-01-04  12   13   14  15
2013-01-05  16   17   18  19
2013-01-06  20   21   22  23
'''
# Conditional assignment: select rows and column in ONE .loc call.
# The chained form `df.B[df.A > 4] = 0` may write to a temporary copy
# and leave df unchanged.
df.loc[df.A > 4, 'B'] = 0
'''
             A    B    C   D
2013-01-01   0  222    2   3
2013-01-02   4    5    6   7
2013-01-03   8    0  111  11
2013-01-04  12    0   14  15
2013-01-05  16    0   18  19
2013-01-06  20    0   22  23
'''
# Add a column filled with a scalar.
df['F'] = np.nan
'''
             A    B    C   D   F
2013-01-01   0  222    2   3 NaN
2013-01-02   4    5    6   7 NaN
2013-01-03   8    0  111  11 NaN
2013-01-04  12    0   14  15 NaN
2013-01-05  16    0   18  19 NaN
2013-01-06  20    0   22  23 NaN
'''
# Add a column from a Series; values are aligned on the index labels.
df['E'] = pd.Series([1,2,3,4,5,6],
                    index=pd.date_range('20130101',
                                        periods=6))
'''
             A    B    C   D   F  E
2013-01-01   0  222    2   3 NaN  1
2013-01-02   4    5    6   7 NaN  2
2013-01-03   8    0  111  11 NaN  3
2013-01-04  12    0   14  15 NaN  4
2013-01-05  16    0   18  19 NaN  5
2013-01-06  20    0   22  23 NaN  6
'''

# Handling missing data: start from the 6*4 table holding 0..23.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])
# NaN is a float, so convert the receiving columns to float first;
# assigning NaN into an int64 column relies on silent upcasting, which
# newer pandas versions deprecate.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
'''
             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
df.dropna(
    axis=0,     # 0: operate on rows; 1: operate on columns
    how='any'   # 'any': drop if any NaN is present; 'all': drop only if every value is NaN
)
'''
             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
df.fillna(value=0)  # replace every NaN with 0
'''
             A     B     C   D
2013-01-01   0   0.0   2.0   3
2013-01-02   4   5.0   0.0   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
df.isnull()  # boolean mask, True where a value is missing
'''
                A      B      C      D
2013-01-01  False   True  False  False
2013-01-02  False  False   True  False
2013-01-03  False  False  False  False
2013-01-04  False  False  False  False
2013-01-05  False  False  False  False
2013-01-06  False  False  False  False
'''
# Single True/False answer: does the frame contain any NaN at all?
# Reducing the underlying numpy array avoids a per-column result.
df.isnull().values.any()


# read
# Load a CSV file (path relative to the working directory) into a DataFrame.
data = pd.read_csv('students.csv')
# to pickle
# Serialize the DataFrame to a pickle file. Note the output file stem is
# 'student' while the input is 'students'.
data.to_pickle('student.pickle')

# concat
# Stack frames along an axis; axis=0 (the default) appends rows.
# Three 3*4 frames filled with 0.0, 1.0 and 2.0 respectively.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
# The original row labels are kept, so 0..2 repeats three times.
pd.concat([df1, df2, df3], axis=0)
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
'''
# ignore_index=True discards the old labels and renumbers the rows.
pd.concat([df1, df2, df3], axis=0, ignore_index=True)
'''
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
'''

# Frames whose columns only partly overlap (shared: b, c, d).
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'), index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('bcde'), index=[2, 3, 4])
# join='outer' keeps the union of the columns and fills the gaps with NaN.
pd.concat([df1, df2], axis=0, join='outer')
'''
    a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
'''

# join='inner' keeps only the columns present in every frame.
pd.concat([df1, df2], axis=0, join='inner')
'''
    b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
'''

# Inner join combined with renumbered rows.
pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
'''
    b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
5  1.0  1.0  1.0
'''

# horizontal by index
# Place df2's columns next to df1's, keeping exactly df1's row labels.
df1 = pd.DataFrame(np.ones((3, 4)),
                   columns=['A', 'B', 'C', 'D'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)),
                   columns=['A', 'B', 'C', 'D'],
                   index=[2, 3, 4])
# The `join_axes` argument of concat was removed in pandas 1.0; the
# documented replacement is to reindex the other frame to the wanted
# labels before concatenating. Label 1 is missing from df2, so its half
# of that row is NaN; label 4 is dropped.
pd.concat([df1, df2.reindex(df1.index)], axis=1)

#      A    B    C    D    A    B    C    D
# 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
# 2  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0


# Appending rows. DataFrame.append was removed in pandas 2.0;
# pd.concat is the replacement.
df1 = pd.DataFrame(np.ones((3, 4)) * 0,
                   columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['A', 'B', 'C', 'D'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['A', 'B', 'C', 'D'])

# Append the rows of df2 below df1 and renumber the result.
pd.concat([df1, df2], ignore_index=True)
'''
     A    B    C    D
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
'''

# The Series labels must match the frame's column names (upper case
# here), otherwise the values land in new, mostly-NaN columns.
s1 = pd.Series([1,2,3,4],
               index=['A','B','C','D'])

# Append a single row: turn the Series into a one-row frame first.
pd.concat([df1, s1.to_frame().T], ignore_index=True)
'''
#      A    B    C    D
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
'''


# Database-style join on a shared key column.
# The tables below are printouts; the column order may vary between
# pandas versions.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K1', 'K2', 'K3', 'K4'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
'''
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
'''
'''
    C   D key
0  C0  D0  K1
1  C1  D1  K2
2  C2  D2  K3
3  C3  D3  K4
'''
# Default how='inner': only keys found in both frames (K1, K2, K3) survive.
left.merge(right, on='key')
'''
    A   B key   C   D
0  A1  B1  K1  C0  D0
1  A2  B2  K2  C1  D1
2  A3  B3  K3  C2  D2
'''

# Join on a composite key: a row matches only when BOTH key1 and key2 agree.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# `how` chooses which key pairs are kept:
left.merge(right, on=['key1', 'key2'], how='inner')  # pairs present in both frames
left.merge(right, on=['key1', 'key2'], how='outer')  # pairs present in either frame
left.merge(right, on=['key1', 'key2'], how='left')   # every pair of `left`
left.merge(right, on=['key1', 'key2'], how='right')  # every pair of `right`


# indicator adds a column telling where each result row came from:
# 'left_only', 'right_only' or 'both'.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
# indicator=True uses the default column name `_merge`.
df1.merge(df2, on='col1', how='outer', indicator=True)
'''
#   col1 col_left  col_right      _merge
# 0   0.0        a        NaN   left_only
# 1   1.0        b        2.0        both
# 2   2.0      NaN        2.0  right_only
# 3   2.0      NaN        2.0  right_only
'''
# Passing a string names the indicator column instead.
df1.merge(df2, on='col1', how='outer', indicator='indicator_column')
'''
  col1 col_left  col_right indicator_column
0   0.0        a        NaN        left_only
1   1.0        b        2.0             both
2   2.0      NaN        2.0       right_only
3   2.0      NaN        2.0       right_only
'''

# merge by index
# Join on the row labels instead of on a column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
# Outer: union of the labels, NaN where one side has no such row.
left.merge(right, how='outer', left_index=True, right_index=True)
#      A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# Inner: only the labels present on both sides.
left.merge(right, how='inner', left_index=True, right_index=True)
#     A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2

# Both frames have an `age` column in addition to the join key `k`.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
# Use suffixes to resolve the overlapping column name: the left `age`
# becomes age_boy and the right one age_girl.
boys.merge(girls, how='inner', on='k', suffixes=['_boy', '_girl'])
'''
   age_boy   k  age_girl
0        1  K0         4
1        1  K0         5
'''

# draw
import matplotlib.pyplot as plt
# Plot a random walk: 1000 standard-normal steps, accumulated.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# cumsum() returns a new Series rather than modifying `data` in place,
# so the result has to be rebound for the plot to show the running total.
data = data.cumsum()
data.plot()
plt.show()
	import numpy as np

	# linspace：创建线段

	arr = np.array([[1, 2, 3]
	[4, 5, 6]], dtype=int32) # list to matrix

	arr.ndim # 2 dimensions
	arr.shape # 2 * 3
	arr.size # number of elements

	np.zeros((3, 4), dtype=int) # 3*4

	np.ones((3, 4), dtype=int16) # 3*4

	np.empty((3, 4), dtype=int) # 3*4, close to 0

	a = np.arange(10, 20, 2)
	a[0] #10

	a = np.arange(10, 50, 2) # 10 - 20, step 2
	.reshape((4, 5)) # reshape to 3*4
	a[0][0] #10
	a[0, 0] #same
	a[0, 1:3] # [12, 14]

	np.linspace(1, 10, 20) # 1 - 10, 20 pieces
	.reshape((5, 4))

	# matrix operation
	a1 = np.arange(5) # array([0, 1, 2, 3, 4])
	a2 = np.arange(10, 15) # array([10, 11, 12, 13, 14])

	a2 - a1 # array([10, 10, 10, 10, 10])
	a2 + a1 # array([10, 12, 14, 16, 18])
	a2 * a1 # array([ 0, 11, 24, 39, 56])
	a1 / a2 # array([ 0. , 0.09090909, 0.16666667, 0.23076923, 0.28571429])
	a1 ** 2 # array([ 0, 1, 4, 9, 16])
	a1 < 0 # array([False, False, False, False, False], dtype=bool)
	np.sin(a1)

	a = np.array([[1,1],[0,1]])
	b = np.arange(4).reshape((2,2)) # [[0, 1], [2, 3]]
	np.dot(a, b) # array([[2, 4], [2, 3]])
	a.dot(b) # same
	np.dot([1,2,3],[4,5,6]) # 14 + 25 + 3*6 = 32

	a = np.random.random((2,4))
	np.sum(a)
	np.max(a)
	np.min(a)
	np.sum(a, axis=0) # sum by column
	np.sum(a, axis=1) # sum by row


	A = np.arange(2,14).reshape((3,4))
	# array([[ 2, 3, 4, 5]
	# [ 6, 7, 8, 9]
	# [10,11,12,13]])
	np.argmax(A) # index of 2: 0
	np.argmin(A) # indeox of 13: 11
	np.mean(A) # A.mean() # 7.5
	np.median(A) # A.median()
	np.median(A, axis=0)
	np.average(A) # 7.5
	np.cumsum(A) # [2, 2+3, 2+3+4, ...] => array([ 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
	np.diff(A) # [[3-2, 4-3, 5-4], ...] => array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
	np.nonzero(A) # position of nonzero: (0, 0), (0, 1) ...
	#(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
	# array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))

	A = np.arange(14, 2, -1)
	.reshape((3,4))
	# array([[14, 13, 12, 11],
	# [10, 9, 8, 7],
	# [ 6, 5, 4, 3]])
	np.sort(A)
	# array([[11,12,13,14]
	# [ 7, 8, 9,10]
	# [ 3, 4, 5, 6]])
	np.transpose(A)
	A.T # same
	# array([[14,10, 6]
	# [13, 9, 5]
	# [12, 8, 4]
	# [11, 7, 3]])
	(A.T).dot(A)
	# array([[332, 302, 272, 242],
	# [302, 275, 248, 221],
	# [272, 248, 224, 200],
	# [242, 221, 200, 179]])
	np.clip(A, 5, 9) # >9 => 9; <5 => 5
	# array([[ 9, 9, 9, 9]
	# [ 9, 9, 8, 7]
	# [ 6, 5, 5, 5]])

	a = np.arange(4).reshape((2, 2))
	a[0, 0] # 0
	for row in a:
	print(row)

	for column in a.T:
	print(column)

	a.flatten() # array([0, 1, 2, 3])
	for item in a.flat: # iterate items
	print(item)


	a = np.array([1, 1, 1])
	b = np.array([2, 2, 2])
	np.vstack((a, b)) # array([[1, 1, 1], [2, 2, 2]])
	np.hstack((a, b)) # array([1, 1, 1, 2, 2, 2])
	a[:, np.newaxis] # array([[1], [1], [1]])
	# using concatenate
	np.concatenate((a, b, b), axis=0) # 0:vertical 1:horizontal


	a = np.arange(12).reshape((3, 4))
	np.hsplit(a, 2)
	np.split(a, 2, axis=1) # horizontal
	# [ array([[0, 1], [4, 5], [8, 9]]),
	# array([[2, 3], [6, 7], [10, 11]])]
	np.vsplit(a, 3)
	np.split(a, 3, axis=0) # vertically
	# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[8, 9, 10, 11]])]
	np.array_split(A, 3, axis=1) # uneven split
	# [ array([[0, 1],[4, 5],[8, 9]]),
	# array([[2], [6], [10]]),
	# array([[3], [7], [11]])]

	a = np.arange(4)
	b = a
	a is b # True
	c = a.copy()
	c is a # False



	#pandas
	import pandas as pd
	s = pd.Series([1, 3, np.nan, 5])
	# 0 1.0
	# 1 3.0
	# 2 NaN
	# 3 5.0
	# dtype: float64

	dates = pd.date_range('20170101', periods=6)
	# DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06'],
	# dtype='datetime64[ns]', freq='D')

	df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
	# a b c d
	# 2017-01-01 -0.669733 0.091818 0.581845 -0.290370
	# 2017-01-02 0.203958 -0.840011 -1.234419 1.567374
	# 2017-01-03 0.761231 -0.712473 0.954426 2.002349
	# 2017-01-04 0.477278 0.860596 0.867349 0.438903
	# 2017-01-05 -1.431947 0.684325 -0.762821 0.815071
	# 2017-01-06 -0.095380 -0.515609 0.184032 -0.482174

	pd.DataFrame(np.arange(12).reshape((3, 4)))
	# 0 1 2 3
	# 0 0 1 2 3
	# 1 4 5 6 7
	# 2 8 9 10 11

	df2 = pd.DataFrame({'A': 1.,
	'B': pd.Timestamp('20130102'),
	'C': pd.Series(1, index=list(range(4)), dtype='float32'),
	'D': np.arange(4),
	'E': pd.Categorical(['test', 'train', 'test', 'train']),
	'F': 'foo'})
	# A B C D E F
	# 0 1.0 2013-01-02 1.0 0 test foo
	# 1 1.0 2013-01-02 1.0 1 train foo
	# 2 1.0 2013-01-02 1.0 2 test foo
	# 3 1.0 2013-01-02 1.0 3 train foo
	df2.type
	df2.index
	df2.columns
	df2.rows
	df2.values
	df2.describe()
	df2.T
	df2.sort_index(axis=1, ascending=False) # sort by column name
	df2.sort_values(by='B') # sort by B column value

	# data selection
	dates = pd.date_range('20130101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6,4)),
	index=dates,
	columns=['A','B','C','D'])

	'''
	A B C D
	2013-01-01 0 1 2 3
	2013-01-02 4 5 6 7
	2013-01-03 8 9 10 11
	2013-01-04 12 13 14 15
	2013-01-05 16 17 18 19
	2013-01-06 20 21 22 23
	'''

	df['A']
	df.A
	'''
	2013-01-01 0
	2013-01-02 4
	2013-01-03 8
	2013-01-04 12
	2013-01-05 16
	2013-01-06 20
	Freq: D, Name: A, dtype: int64
	'''

	df[0:3]
	'''
	A B C D
	2013-01-01 0 1 2 3
	2013-01-02 4 5 6 7
	2013-01-03 8 9 10 11
	'''

	df['20130102':'20130104']
	'''
	A B C D
	2013-01-02 4 5 6 7
	2013-01-03 8 9 10 11
	2013-01-04 12 13 14 15
	'''

	df.loc['20130102']
	'''
	A 4
	B 5
	C 6
	D 7
	Name: 2013-01-02 00:00:00, dtype: int64
	'''

	df.loc[:,['A','B']]
	'''
	A B
	2013-01-01 0 1
	2013-01-02 4 5
	2013-01-03 8 9
	2013-01-04 12 13
	2013-01-05 16 17
	2013-01-06 20 21
	'''

	df.loc['20130102',['A','B']]
	'''
	A 4
	B 5
	Name: 2013-01-02 00:00:00, dtype: int64
	'''

	df.iloc[3,1] # 13
	df.iloc[3:5, 1:3]
	'''
	B C
	2013-01-04 13 14
	2013-01-05 17 18
	'''
	df.iloc[[1,3,5],1:3]
	'''
	B C
	2013-01-02 5 6
	2013-01-04 13 14
	2013-01-06 21 22
	'''

	df.ix[:3,['A','C']]
	'''
	A C
	2013-01-01 0 2
	2013-01-02 4 6
	2013-01-03 8 10
	'''

	# Boolean indexing:
	df[df.A>8]
	'''
	A B C D
	2013-01-04 12 13 14 15
	2013-01-05 16 17 18 19
	2013-01-06 20 21 22 23
	'''


	dates = pd.date_range('20130101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6, 4)),
	index=dates,
	columns=['A', 'B', 'C', 'D'])

	df.iloc[2, 2] = 111
	df.loc['20130101', 'B'] = 222
	'''
	A B C D
	2013-01-01 0 222 2 3
	2013-01-02 4 5 6 7
	2013-01-03 8 9 111 11
	2013-01-04 12 13 14 15
	2013-01-05 16 17 18 19
	2013-01-06 20 21 22 23
	'''
	df.B[df.A>4] = 0
	'''
	A B C D
	2013-01-01 0 2222 2 3
	2013-01-02 4 5 6 7
	2013-01-03 8 0 1111 11
	2013-01-04 12 0 14 15
	2013-01-05 16 0 18 19
	2013-01-06 20 0 22 23
	'''
	df['F'] = np.nan
	'''
	A B C D F
	2013-01-01 0 222 2 3 NaN
	2013-01-02 4 5 6 7 NaN
	2013-01-03 8 0 111 11 NaN
	2013-01-04 12 0 14 15 NaN
	2013-01-05 16 0 18 19 NaN
	2013-01-06 20 0 22 23 NaN
	'''
	df['E'] = pd.Series([1,2,3,4,5,6],
	index=pd.date_range('20130101',
	periods=6))
	'''
	A B C D F E
	2013-01-01 0 2222 2 3 NaN 1
	2013-01-02 4 5 6 7 NaN 2
	2013-01-03 8 0 1111 11 NaN 3
	2013-01-04 12 0 14 15 NaN 4
	2013-01-05 16 0 18 19 NaN 5
	2013-01-06 20 0 22 23 NaN 6
	'''

	dates = pd.date_range('20130101', periods=6)
	df = pd.DataFrame(np.arange(24).reshape((6, 4)),
	index=dates,
	columns=['A', 'B', 'C', 'D'])
	df.iloc[0, 1] = np.nan
	df.iloc[1, 2] = np.nan
	'''
	A B C D
	2013-01-01 0 NaN 2.0 3
	2013-01-02 4 5.0 NaN 7
	2013-01-03 8 9.0 10.0 11
	2013-01-04 12 13.0 14.0 15
	2013-01-05 16 17.0 18.0 19
	2013-01-06 20 21.0 22.0 23
	'''
	df.dropna(
	axis=0, # 0: 对行进行操作; 1: 对列进行操作
	how='any' # 'any': 只要存在 NaN 就 drop 掉; 'all': 必须全部是 NaN 才 drop
	)
	'''
	A B C D
	2013-01-03 8 9.0 10.0 11
	2013-01-04 12 13.0 14.0 15
	2013-01-05 16 17.0 18.0 19
	2013-01-06 20 21.0 22.0 23
	'''
	df.fillna(value=0)
	'''
	A B C D
	2013-01-01 0 0.0 2.0 3
	2013-01-02 4 5.0 0.0 7
	2013-01-03 8 9.0 10.0 11
	2013-01-04 12 13.0 14.0 15
	2013-01-05 16 17.0 18.0 19
	2013-01-06 20 21.0 22.0 23
	'''
	df.isnull()
	'''
	A B C D
	2013-01-01 False True False False
	2013-01-02 False False True False
	2013-01-03 False False False False
	2013-01-04 False False False False
	2013-01-05 False False False False
	2013-01-06 False False False False
	'''
	np.any(df.isnull()) # if exists nan


	# read
	data = pd.read_csv('students.csv')
	# to pickle
	data.to_pickle('student.pickle')

	# concat
	# axis, default 0
	df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
	df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
	df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
	pd.concat([df1, df2, df3], axis=0)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	0 1.0 1.0 1.0 1.0
	1 1.0 1.0 1.0 1.0
	2 1.0 1.0 1.0 1.0
	0 2.0 2.0 2.0 2.0
	1 2.0 2.0 2.0 2.0
	2 2.0 2.0 2.0 2.0
	'''
	pd.concat([df1, df2, df3], axis=0, ignore_index=True)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	3 1.0 1.0 1.0 1.0
	4 1.0 1.0 1.0 1.0
	5 1.0 1.0 1.0 1.0
	6 2.0 2.0 2.0 2.0
	7 2.0 2.0 2.0 2.0
	8 2.0 2.0 2.0 2.0
	'''

	df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
	df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
	pd.concat([df1, df2], axis=0, join='outer')
	'''
	a b c d e
	1 0.0 0.0 0.0 0.0 NaN
	2 0.0 0.0 0.0 0.0 NaN
	3 0.0 0.0 0.0 0.0 NaN
	2 NaN 1.0 1.0 1.0 1.0
	3 NaN 1.0 1.0 1.0 1.0
	4 NaN 1.0 1.0 1.0 1.0
	'''

	pd.concat([df1, df2], axis=0, join='inner')
	'''
	b c d
	1 0.0 0.0 0.0
	2 0.0 0.0 0.0
	3 0.0 0.0 0.0
	2 1.0 1.0 1.0
	3 1.0 1.0 1.0
	4 1.0 1.0 1.0
	'''

	pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
	'''
	b c d
	0 0.0 0.0 0.0
	1 0.0 0.0 0.0
	2 0.0 0.0 0.0
	3 1.0 1.0 1.0
	4 1.0 1.0 1.0
	5 1.0 1.0 1.0
	'''

	# horizontal by index
	df1 = pd.DataFrame(np.ones((3, 4)),
	columns=['A', 'B', 'C', 'D'],
	index=[1, 2, 3])
	df2 = pd.DataFrame(np.ones((3, 4)),
	columns=['A', 'B', 'C', 'D']
	index=[2, 3, 4])
	pd.concat([df1, df2], axis=1, join_axes=[df1.index])

	# a b c d b c d e
	# 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
	# 2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
	# 3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0


	df1 = pd.DataFrame(np.ones((3, 4)) * 0,
	columns=['A', 'B', 'C', 'D'])
	df2 = pd.DataFrame(np.ones((3, 4)) * 1,
	columns=['A', 'B', 'C', 'D'])
	df3 = pd.DataFrame(np.ones((3, 4)) * 1,
	columns=['A', 'B', 'C', 'D'])

	df1.append(df2, ignore_index=True)
	'''
	a b c d
	0 0.0 0.0 0.0 0.0
	1 0.0 0.0 0.0 0.0
	2 0.0 0.0 0.0 0.0
	3 1.0 1.0 1.0 1.0
	4 1.0 1.0 1.0 1.0
	5 1.0 1.0 1.0 1.0
	'''

	s1 = pd.Series([1,2,3,4],
	index=['a','b','c','d'])

	df1.append(s1, ignore_index=True)
	'''
	# a b c d
	# 0 0.0 0.0 0.0 0.0
	# 1 0.0 0.0 0.0 0.0
	# 2 0.0 0.0 0.0 0.0
	# 3 1.0 2.0 3.0 4.0
	'''


	left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
	'A': ['A0', 'A1', 'A2', 'A3'],
	'B': ['B0', 'B1', 'B2', 'B3']})
	right = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
	'C': ['C0', 'C1', 'C2', 'C3'],
	'D': ['D0', 'D1', 'D2', 'D3']})
	'''
	A B key
	0 A0 B0 K0
	1 A1 B1 K1
	2 A2 B2 K2
	3 A3 B3 K3
	'''
	'''
	C D key
	0 C0 D0 K1
	1 C1 D1 K2
	2 C2 D2 K3
	3 C3 D3 K4
	'''
	pd.merge(left, right, on='key')
	'''
	A B key C D
	0 A1 B1 K1 C0 D0
	1 A2 B2 K2 C1 D1
	2 A3 B3 K3 C2 D2
	'''

	left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
	'key2': ['K0', 'K1', 'K0', 'K1'],
	'A': ['A0', 'A1', 'A2', 'A3'],
	'B': ['B0', 'B1', 'B2', 'B3']})
	right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
	'key2': ['K0', 'K0', 'K0', 'K0'],
	'C': ['C0', 'C1', 'C2', 'C3'],
	'D': ['D0', 'D1', 'D2', 'D3']})

	pd.merge(left, right, on=['key1', 'key2'], how='inner')
	pd.merge(left, right, on=['key1', 'key2'], how='outer')
	pd.merge(left, right, on=['key1', 'key2'], how='left')
	pd.merge(left, right, on=['key1', 'key2'], how='right')


	df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
	df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
	pd.merge(df1, df2, on='col1', how='outer', indicator=True)
	'''
	# col1 col_left col_right _merge
	# 0 0.0 a NaN left_only
	# 1 1.0 b 2.0 both
	# 2 2.0 NaN 2.0 right_only
	# 3 2.0 NaN 2.0 right_only
	'''
	pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
	'''
	col1 col_left col_right indicator_column
	0 0.0 a NaN left_only
	1 1.0 b 2.0 both
	2 2.0 NaN 2.0 right_only
	3 2.0 NaN 2.0 right_only
	'''

	# merge by index
	left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
	'B': ['B0', 'B1', 'B2']},
	index=['K0', 'K1', 'K2'])
	right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
	'D': ['D0', 'D2', 'D3']},
	index=['K0', 'K2', 'K3'])
	pd.merge(left, right, left_index=True, right_index=True, how='outer')
	# A B C D
	# K0 A0 B0 C0 D0
	# K1 A1 B1 NaN NaN
	# K2 A2 B2 C2 D2
	# K3 NaN NaN C3 D3

	pd.merge(left, right, left_index=True, right_index=True, how='inner')
	# A B C D
	# K0 A0 B0 C0 D0
	# K2 A2 B2 C2 D2

	boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
	girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
	#使用suffixes解决overlapping的问题
	pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
	'''
	age_boy k age_girl
	0 1 K0 4
	1 1 K0 5
	'''

	# draw
	import matplotlib.pyplot as plt
	data = pd.Series(np.random.randn(1000),index=np.arange(1000))
	data.cumsum()
	data.plot()
	plt.show()