Skip to content

Instantly share code, notes, and snippets.

@DadgadCafe
Created June 30, 2017 03:44
Show Gist options
  • Save DadgadCafe/27165ed2b3e18787dac8dd0c4daf960b to your computer and use it in GitHub Desktop.
Save DadgadCafe/27165ed2b3e18787dac8dd0c4daf960b to your computer and use it in GitHub Desktop.
# Notes on NumPy and pandas.
import numpy as np
# Array creation
arr = np.array([[1, 2, 3],
                [4, 5, 6]], dtype=np.int32)  # nested list -> 2-D array (matrix)
arr.ndim   # 2 (number of dimensions)
arr.shape  # (2, 3)
arr.size   # 6 (total number of elements)
np.zeros((3, 4), dtype=int)      # 3x4 array of zeros
np.ones((3, 4), dtype=np.int16)  # 3x4 array of ones
np.empty((3, 4), dtype=int)      # 3x4, uninitialized: contents are arbitrary
a = np.arange(10, 20, 2)  # array([10, 12, 14, 16, 18])
a[0]  # 10
# The method chain must stay on one line: a line that starts with
# ".reshape(...)" is a syntax error.
a = np.arange(10, 50, 2).reshape((4, 5))  # 20 values (10..48, step 2) as 4x5
a[0][0]    # 10
a[0, 0]    # same
a[0, 1:3]  # array([12, 14])
# linspace: evenly spaced points over an interval (both ends included)
np.linspace(1, 10, 20).reshape((5, 4))  # 20 points from 1 to 10, as 5x4
# matrix operation: element-wise arithmetic, dot products and reductions
a1 = np.arange(5) # array([0, 1, 2, 3, 4])
a2 = np.arange(10, 15) # array([10, 11, 12, 13, 14])
a2 - a1 # element-wise: array([10, 10, 10, 10, 10])
a2 + a1 # element-wise: array([10, 12, 14, 16, 18])
a2 * a1 # element-wise product (not a dot product): array([ 0, 11, 24, 39, 56])
a1 / a2 # array([ 0. , 0.09090909, 0.16666667, 0.23076923, 0.28571429])
a1 ** 2 # array([ 0, 1, 4, 9, 16])
a1 < 0 # array([False, False, False, False, False], dtype=bool)
np.sin(a1) # sine applied to every element
a = np.array([[1,1],[0,1]])
b = np.arange(4).reshape((2,2)) # [[0, 1], [2, 3]]
np.dot(a, b) # matrix product: array([[2, 4], [2, 3]])
a.dot(b) # same
np.dot([1,2,3],[4,5,6]) # 1*4 + 2*5 + 3*6 = 32
a = np.random.random((2,4)) # 2x4 array of random floats
np.sum(a) # sum of all elements
np.max(a) # largest element
np.min(a) # smallest element
np.sum(a, axis=0) # collapse the rows: one sum per column (4 values)
np.sum(a, axis=1) # collapse the columns: one sum per row (2 values)
A = np.arange(2,14).reshape((3,4))
# array([[ 2, 3, 4, 5]
# [ 6, 7, 8, 9]
# [10,11,12,13]])
np.argmax(A) # 11: flat index of the largest value (13)
np.argmin(A) # 0: flat index of the smallest value (2)
np.mean(A) # A.mean() # 7.5
np.median(A) # 7.5 -- ndarray has no .median() method, so only np.median(A) works
np.median(A, axis=0) # median of each column: array([ 6., 7., 8., 9.])
np.average(A) # 7.5
np.cumsum(A) # [2, 2+3, 2+3+4, ...] => array([ 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90])
np.diff(A) # [[3-2, 4-3, 5-4], ...] => array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
np.nonzero(A) # positions of nonzero elements as (row indices, column indices): (0, 0), (0, 1) ...
#(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]),
# array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]))
# The method chain must stay on one line: a line that starts with
# ".reshape(...)" is a syntax error.
A = np.arange(14, 2, -1).reshape((3, 4))
# array([[14, 13, 12, 11],
#        [10,  9,  8,  7],
#        [ 6,  5,  4,  3]])
np.sort(A)  # sorts each row in ascending order and returns a new array
# array([[11, 12, 13, 14],
#        [ 7,  8,  9, 10],
#        [ 3,  4,  5,  6]])
np.transpose(A)
A.T  # same
# array([[14, 10,  6],
#        [13,  9,  5],
#        [12,  8,  4],
#        [11,  7,  3]])
(A.T).dot(A)  # (4x3) . (3x4) -> 4x4
# array([[332, 302, 272, 242],
#        [302, 275, 248, 221],
#        [272, 248, 224, 200],
#        [242, 221, 200, 179]])
np.clip(A, 5, 9)  # values > 9 become 9; values < 5 become 5
# array([[9, 9, 9, 9],
#        [9, 9, 8, 7],
#        [6, 5, 5, 5]])
a = np.arange(4).reshape((2, 2))
a[0, 0]  # 0
# Iterating a 2-D array yields its rows
for row in a:
    print(row)
# Iterate the transpose to walk over columns
for column in a.T:
    print(column)
a.flatten()  # array([0, 1, 2, 3])
for item in a.flat:  # .flat iterates over every element
    print(item)
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
np.vstack((a, b)) # stack as rows: array([[1, 1, 1], [2, 2, 2]])
np.hstack((a, b)) # join end to end: array([1, 1, 1, 2, 2, 2])
a[:, np.newaxis] # add a trailing axis -> column vector: array([[1], [1], [1]])
# using concatenate
np.concatenate((a, b, b), axis=0) # array([1, 1, 1, 2, 2, 2, 2, 2, 2]); for 2-D inputs axis 0 is vertical and axis 1 horizontal (these 1-D inputs only have axis 0)
a = np.arange(12).reshape((3, 4))
np.hsplit(a, 2)
np.split(a, 2, axis=1)  # same as hsplit: cut along the columns into 2 equal parts
# [array([[0, 1], [4, 5], [8, 9]]),
#  array([[2, 3], [6, 7], [10, 11]])]
np.vsplit(a, 3)
np.split(a, 3, axis=0)  # same as vsplit: cut along the rows into 3 equal parts
# [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[8, 9, 10, 11]])]
# array_split tolerates a section count that does not divide the axis evenly.
# It must be applied to `a` (not the unrelated matrix `A`) to get this output.
np.array_split(a, 3, axis=1)
# [array([[0, 1], [4, 5], [8, 9]]),
#  array([[2], [6], [10]]),
#  array([[3], [7], [11]])]
a = np.arange(4)
b = a # plain assignment binds a second name to the same array object
a is b # True
c = a.copy() # copy() creates a new array object
c is a # False
#pandas
import pandas as pd
# Series built from a list; the output below shows the values stored as float64
s = pd.Series([1, 3, np.nan, 5])
# 0 1.0
# 1 3.0
# 2 NaN
# 3 5.0
# dtype: float64
# Six consecutive daily timestamps starting at 2017-01-01
dates = pd.date_range('20170101', periods=6)
# DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06'],
# dtype='datetime64[ns]', freq='D')
# DataFrame with explicit row index (dates) and column names; the values are
# random, so the numbers below are just one sample run
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# a b c d
# 2017-01-01 -0.669733 0.091818 0.581845 -0.290370
# 2017-01-02 0.203958 -0.840011 -1.234419 1.567374
# 2017-01-03 0.761231 -0.712473 0.954426 2.002349
# 2017-01-04 0.477278 0.860596 0.867349 0.438903
# 2017-01-05 -1.431947 0.684325 -0.762821 0.815071
# 2017-01-06 -0.095380 -0.515609 0.184032 -0.482174
# Without index/columns arguments both axes are labelled 0..n-1
pd.DataFrame(np.arange(12).reshape((3, 4)))
# 0 1 2 3
# 0 0 1 2 3
# 1 4 5 6 7
# 2 8 9 10 11
# DataFrame built from a dict: scalar values are repeated for all four rows
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.arange(4),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'foo'})
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  0   test  foo
# 1  1.0 2013-01-02  1.0  1  train  foo
# 2  1.0 2013-01-02  1.0  2   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo
df2.dtypes       # dtype of every column (a DataFrame has no `.type` attribute)
df2.index        # row labels
df2.columns      # column labels
len(df2.index)   # number of rows (a DataFrame has no `.rows` attribute)
df2.values       # underlying data as a NumPy array
df2.describe()   # summary statistics of the numeric columns
df2.T            # transpose
df2.sort_index(axis=1, ascending=False)  # sort by column name, descending
df2.sort_values(by='B')  # sort rows by the values in column B
# data selection
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),
index=dates,
columns=['A','B','C','D'])
'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
'''
# Select one column by name: item access and attribute access are equivalent here
df['A']
df.A
'''
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int64
'''
# Slice rows by position (end excluded)
df[0:3]
'''
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
'''
# Slice rows by label; the output shows that both endpoints are included
df['20130102':'20130104']
'''
A B C D
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
'''
# .loc selects by label: a single row label returns that row as a Series
df.loc['20130102']
'''
A 4
B 5
C 6
D 7
Name: 2013-01-02 00:00:00, dtype: int64
'''
# All rows, columns A and B
df.loc[:,['A','B']]
'''
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21
'''
# One row, columns A and B
df.loc['20130102',['A','B']]
'''
A 4
B 5
Name: 2013-01-02 00:00:00, dtype: int64
'''
# .iloc selects by integer position
df.iloc[3,1] # 13
df.iloc[3:5, 1:3] # rows 3-4, columns 1-2 (end excluded)
'''
B C
2013-01-04 13 14
2013-01-05 17 18
'''
df.iloc[[1,3,5],1:3] # explicit list of row positions
'''
B C
2013-01-02 5 6
2013-01-04 13 14
2013-01-06 21 22
'''
# Mixed selection: first three rows by position, columns by label.
# `.ix` was removed in pandas 1.0; turn the positions into labels and use .loc.
df.loc[df.index[:3], ['A', 'C']]
'''
            A   C
2013-01-01  0   2
2013-01-02  4   6
2013-01-03  8  10
'''
# Boolean indexing: keep the rows where column A is greater than 8
df[df.A > 8]
'''
             A   B   C   D
2013-01-04  12  13  14  15
2013-01-05  16  17  18  19
2013-01-06  20  21  22  23
'''
# Setting values
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])
df.iloc[2, 2] = 111            # set by integer position
df.loc['20130101', 'B'] = 222  # set by label
'''
             A    B    C   D
2013-01-01   0  222    2   3
2013-01-02   4    5    6   7
2013-01-03   8    9  111  11
2013-01-04  12   13   14  15
2013-01-05  16   17   18  19
2013-01-06  20   21   22  23
'''
# Conditional assignment in a single .loc call. The chained form
# `df.B[df.A > 4] = 0` may write to a temporary copy and leave df unchanged.
df.loc[df.A > 4, 'B'] = 0
'''
             A    B    C   D
2013-01-01   0  222    2   3
2013-01-02   4    5    6   7
2013-01-03   8    0  111  11
2013-01-04  12    0   14  15
2013-01-05  16    0   18  19
2013-01-06  20    0   22  23
'''
# Add a new column filled with a scalar
df['F'] = np.nan
'''
             A    B    C   D   F
2013-01-01   0  222    2   3 NaN
2013-01-02   4    5    6   7 NaN
2013-01-03   8    0  111  11 NaN
2013-01-04  12    0   14  15 NaN
2013-01-05  16    0   18  19 NaN
2013-01-06  20    0   22  23 NaN
'''
# Add a new column from a Series; values are aligned on the index
df['E'] = pd.Series([1, 2, 3, 4, 5, 6],
                    index=pd.date_range('20130101', periods=6))
'''
             A    B    C   D   F  E
2013-01-01   0  222    2   3 NaN  1
2013-01-02   4    5    6   7 NaN  2
2013-01-03   8    0  111  11 NaN  3
2013-01-04  12    0   14  15 NaN  4
2013-01-05  16    0   18  19 NaN  5
2013-01-06  20    0   22  23 NaN  6
'''
# Handling missing data
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates,
                  columns=['A', 'B', 'C', 'D'])
# NaN is a float: cast the target columns first instead of relying on the
# silent int -> float upcast (deprecated in pandas 2.1, an error in later versions).
df = df.astype({'B': float, 'C': float})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
'''
             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
df.dropna(
    axis=0,    # 0: drop rows; 1: drop columns
    how='any'  # 'any': drop if at least one NaN is present; 'all': drop only if every value is NaN
)
'''
             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
df.fillna(value=0)  # replace every NaN with 0 (returns a new DataFrame)
'''
             A     B     C   D
2013-01-01   0   0.0   2.0   3
2013-01-02   4   5.0   0.0   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
'''
df.isnull()  # boolean mask: True where the value is NaN
'''
                A      B      C      D
2013-01-01  False   True  False  False
2013-01-02  False  False   True  False
2013-01-03  False  False  False  False
2013-01-04  False  False  False  False
2013-01-05  False  False  False  False
2013-01-06  False  False  False  False
'''
np.any(df.isnull())  # True if at least one NaN exists anywhere
# read
data = pd.read_csv('students.csv') # load a CSV file into a DataFrame
# to pickle
data.to_pickle('student.pickle') # serialize the DataFrame to a pickle file
# concat
# axis, default 0
# Three 3x4 frames with identical columns, filled with 0, 1 and 2 respectively
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
# Stack vertically; each frame keeps its own row labels, so 0,1,2 repeat
pd.concat([df1, df2, df3], axis=0)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
'''
# ignore_index=True discards the original labels and renumbers rows 0..8
pd.concat([df1, df2, df3], axis=0, ignore_index=True)
'''
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
'''
# Two frames whose columns only partly overlap (b, c, d are shared)
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
# join='outer': keep the union of the columns; missing cells become NaN
pd.concat([df1, df2], axis=0, join='outer')
'''
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
'''
# join='inner': keep only the columns present in both frames
pd.concat([df1, df2], axis=0, join='inner')
'''
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
'''
# Same, with the row labels renumbered 0..5
pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
'''
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
'''
# Horizontal concat (axis=1): rows are aligned on the index
df1 = pd.DataFrame(np.ones((3, 4)),
                   columns=['A', 'B', 'C', 'D'],
                   index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)),
                   columns=['A', 'B', 'C', 'D'],
                   index=[2, 3, 4])
# `join_axes` was removed in pandas 1.0; reindex the result to df1's index
# to keep only df1's rows.
pd.concat([df1, df2], axis=1).reindex(df1.index)
#      A    B    C    D    A    B    C    D
# 1  1.0  1.0  1.0  1.0  NaN  NaN  NaN  NaN
# 2  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0
# Appending rows
df1 = pd.DataFrame(np.ones((3, 4)) * 0,
                   columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['A', 'B', 'C', 'D'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1,
                   columns=['A', 'B', 'C', 'D'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
pd.concat([df1, df2], ignore_index=True)
'''
     A    B    C    D
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
'''
# The Series index must match the frame's column labels (case-sensitive),
# otherwise the values land in new columns instead of a new row.
s1 = pd.Series([1, 2, 3, 4],
               index=['A', 'B', 'C', 'D'])
# Turn the Series into a one-row frame, then concatenate it as a new row
pd.concat([df1, s1.to_frame().T], ignore_index=True)
'''
     A    B    C    D
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
'''
# Two frames sharing a 'key' column; keys K1, K2, K3 appear in both
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
# NOTE(review): the column order in the printed tables below (A, B, key) looks
# like output from an older pandas; newer versions may print 'key' first -- confirm.
'''
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
'''
'''
C D key
0 C0 D0 K1
1 C1 D1 K2
2 C2 D2 K3
3 C3 D3 K4
'''
# Join on the 'key' column; only keys found in both frames appear in the output below
pd.merge(left, right, on='key')
'''
A B key C D
0 A1 B1 K1 C0 D0
1 A2 B2 K2 C1 D1
2 A3 B3 K3 C2 D2
'''
# Merge on a composite key (key1, key2); `how` selects which key pairs are kept
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
pd.merge(left, right, on=['key1', 'key2'], how='inner') # key pairs present in both frames (3 rows)
pd.merge(left, right, on=['key1', 'key2'], how='outer') # union of key pairs from both frames (6 rows)
pd.merge(left, right, on=['key1', 'key2'], how='left') # every row of `left`, matched where possible (5 rows)
pd.merge(left, right, on=['key1', 'key2'], how='right') # every row of `right`, matched where possible (4 rows)
# indicator adds a column recording where each output row came from
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
# indicator=True: the extra column is named '_merge'
pd.merge(df1, df2, on='col1', how='outer', indicator=True)
'''
# col1 col_left col_right _merge
# 0 0.0 a NaN left_only
# 1 1.0 b 2.0 both
# 2 2.0 NaN 2.0 right_only
# 3 2.0 NaN 2.0 right_only
'''
# indicator='<name>': same information, with a custom column name
pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
'''
col1 col_left col_right indicator_column
0 0.0 a NaN left_only
1 1.0 b 2.0 both
2 2.0 NaN 2.0 right_only
3 2.0 NaN 2.0 right_only
'''
# merge by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
# left_index/right_index=True: use the row labels as the join keys
pd.merge(left, right, left_index=True, right_index=True, how='outer') # union of the labels
# A B C D
# K0 A0 B0 C0 D0
# K1 A1 B1 NaN NaN
# K2 A2 B2 C2 D2
# K3 NaN NaN C3 D3
pd.merge(left, right, left_index=True, right_index=True, how='inner') # labels present in both
# A B C D
# K0 A0 B0 C0 D0
# K2 A2 B2 C2 D2
# Both frames have an 'age' column in addition to the join key 'k'
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
# Use suffixes to disambiguate the overlapping (non-key) column names
pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
'''
age_boy k age_girl
0 1 K0 4
1 1 K0 5
'''
# draw
import matplotlib.pyplot as plt
# 1000 random samples, indexed 0..999
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# cumsum() returns a new Series and does not modify `data` in place, so the
# result must be rebound; otherwise the raw noise is plotted instead.
data = data.cumsum()
data.plot()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment