Pandas common usage.
# Pandas display options
import pandas as pd
import numpy as np
import datetime
# ('display.height' was removed from modern pandas; 'display.max_rows' below covers it)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
# Set the float format so larger numbers are not shown in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Build a DataFrame from a tab-separated file
df = pd.read_csv('input.data', sep='\t', names=['nid', 'url', 'timestamp', 'title', 'attention', 'show', 'click', 'down'])
# Reset, set, and reorder the index and columns
df.reset_index(drop=True)
df.set_index(['state', 'year'])
df.reindex(sorted(df.columns), axis=1)  # reindex_axis was removed; reindex with axis=1 replaces it
df.reindex([datetime.datetime.strftime(d, "%Y-%m-%d") for d in date_series])
# List the distinct values of a DataFrame column
df.column_name.unique()
# Drop duplicate rows
df.drop_duplicates()
# Drop rows where all of the given columns are NaN
df.dropna(subset=['three', 'four', 'five'], how='all')
# Coerce a column to numeric, turning unparseable values into NaN
# (the old convert_objects method was removed; pd.to_numeric replaces it)
df['col'] = pd.to_numeric(df['col'], errors='coerce')
df_clean.loc[:, 'show'] = pd.to_numeric(df_clean['show'], errors='coerce')
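# A minimal sketch of the coercion (made-up values): unparseable strings become NaN
pd.to_numeric(pd.Series(['1', '2.5', 'oops']), errors='coerce')  # -> 1.0, 2.5, NaN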
# Keep rows whose column value is in a whitelist
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Keep rows whose column value is NOT in the whitelist
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]
# Delete a column
del df['column']
# Drop records with NaN in any of the listed columns
df_clean.dropna(axis=0, how='any', thresh=None, subset=['show', 'click', 'down', 'sc_ratio'], inplace=True)
# Filter on compound conditions using | & ~
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
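# A sketch of negation with ~, reusing the hypothetical columns above
newdf = df[~((df['column_one'] > 2004) & (df['column_two'] == 9))]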
# Rename columns
df = df.rename(columns = {
'col1 old name':'col1 new name',
'col2 old name':'col2 new name',
'col3 old name':'col3 new name',
})
# map, apply, applymap
df.columns = map(str.lower, df.columns)
df['str_col'].dropna().map(lambda x : 'map_' + x)
df.loc[:, ['int_col', 'float_col']].apply(np.sqrt)  # .ix was removed; use .loc
def some_fn(x):
    if type(x) is str:
        return 'applymap_' + x
    elif x:
        return 100 * x
    else:
        return
df.applymap(some_fn)
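# Usage sketch for some_fn on a small made-up frame:
# strings get the 'applymap_' prefix, truthy numbers are scaled by 100
mixed = pd.DataFrame({'a': ['x', 'y'], 'b': [1, 2]})
mixed.applymap(some_fn)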
# Another way to rename columns
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
df.rename(columns={'int_col' : 'some_other_name'})
# Iterate over the rows of a DataFrame, if you really must
for index, row in df.iterrows():
    print(index, row['some column'])
# itertuples is much faster than iterrows
for row in df.itertuples():
    print(row)
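# itertuples yields namedtuples, so columns that are valid identifiers are
# available as attributes ('nid' refers to the read_csv example above):
for row in df.itertuples():
    print(row.Index, row.nid)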
# Built-in string operations on Series
# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
df.column.str[0:2]
# Lowercase the values of a column
df.column_name = df.column_name.str.lower()
# Length of each string value in a column
df.column_name.str.len()
# Sort by index
df.sort_index(inplace=True)
# Multi-column sort (df.sort was removed; use sort_values, and note inplace=True returns None)
df.sort_values(by=['col1', 'col2', 'col3'], ascending=[True, True, False], inplace=True)
# Take the first 5 rows of each group (combine with sort_values to order the data first)
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# Select specific columns
df[['float_col','int_col']]
# Select all rows where a column is null
newdf = df[df['column'].isnull()]
# Fetch records by several level keys of a MultiIndex
df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
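# A runnable sketch of xs; the level names and values below are made up:
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
                                names=['level 1', 'level 2'])
mdf = pd.DataFrame({'val': [10, 20, 30]}, index=idx)
mdf.xs(('a', 1), level=('level 1', 'level 2'))  # the single row where both levels match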
# Replace every NaN with None, handy when loading data into a database
df = df.where((pd.notnull(df)), None)
# Quickly get the row count
len(df.index)
# Pivot table; the parameters are, in order: the values to show, rows (index), columns
df.pivot_table(['data1'], index='key1',columns='key2')
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
titanic.pivot_table('survived', index='sex', columns='class')
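# A worked sketch of pivot_table on hypothetical data:
sales = pd.DataFrame({'key1': ['a', 'a', 'b'], 'key2': ['x', 'y', 'x'],
                      'data1': [1, 2, 3]})
sales.pivot_table('data1', index='key1', columns='key2')  # each cell = mean of data1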
# Change a column's dtype
df.column_name = df.column_name.astype(np.int64)
# Apply a regex replacement to every value in every column (here: strip non-digits)
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True)
# Assign a value to filtered rows
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
# Fill missing values with a different default per column
df = df.fillna({
'col1': 'missing',
'col2': '99.999',
'col3': '999',
'col4': 'missing',
'col5': 'missing',
'col6': '99'
})
# Replace missing values with the column mean
mean = df['float_col'].mean()
df['float_col'] = df['float_col'].fillna(mean)
# Compute across columns and assign the result
df['newcol'] = df['col1'].map(str) + df['col2'].map(str)
# Computing with missing values:
# below, null values of df['col1'] are treated as 0 before the addition
df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
# Split a column's values; below, one column becomes two
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
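# The split above, sketched on made-up 'k: v' strings:
tmp = pd.DataFrame({'original_col': ['a: 1', 'b: 2']})
tmp['new_col1'], tmp['new_col2'] = zip(*tmp['original_col'].apply(lambda x: x.split(': ', 1)))
# tmp now has new_col1 = ['a', 'b'] and new_col2 = ['1', '2']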
# Merge (flatten) multi-level columns into a single level
df.columns = df.columns.get_level_values(0)
# Build a DataFrame from a Python dict
df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
# Build a dict from a DataFrame
df.T.to_dict().values()
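# The same result more directly, if a list of row dicts is what you want:
df.to_dict(orient='records')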
# Inspect duplicates on specific columns; keep can be 'first', 'last', or False
dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)]
# Pipeline: group, sum, sort
series_df.groupby(['DATA_TYPE', 'DATA_DATE', 'ADDR_ID'], as_index=False)['QUANTITY'].sum().sort_values(by='QUANTITY', ascending=False).head(2)
# groupby can be chained with aggregate, filter, transform, apply
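# (The snippets below assume a toy frame shaped like this; the names are illustrative.)
df = pd.DataFrame({'key': ['A', 'B', 'A', 'B'],
                   'data1': [1, 2, 3, 4],
                   'data2': [5, 0, 3, 8]})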
df.groupby('key').aggregate(['min', np.median, max])
df.groupby('key').aggregate({'data1': 'min',
'data2': 'max'})
def filter_func(x):
    return x['data2'].std() > 4
df.groupby('key').std()
df.groupby('key').filter(filter_func)  # keep only groups where filter_func is True
df.groupby('key').transform(lambda x: x - x.mean())
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
df.groupby('key').apply(norm_by_data2)
# Add a row to a DataFrame (append returns a new frame, so assign the result)
series_df_rslt = series_df_rslt.append({"DETAIL_ID": _id, "DATA_TYPE": _type}, ignore_index=True)
# Concatenate two DataFrames with the same columns (note: concat allocates a new frame and copies the data)
series_df = series_df.append(series_df_2, ignore_index=True)
pd.concat([df1, df2])
pd.concat([df3, df4], axis='columns')
pd.concat([x, y], ignore_index=True)
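# On pandas >= 2.0, DataFrame.append is gone; an equivalent sketch with concat
# for a single row (the dict values here are illustrative):
new_row = pd.DataFrame([{"DETAIL_ID": 1, "DATA_TYPE": "t"}])
series_df_rslt = pd.concat([series_df_rslt, new_row], ignore_index=True)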
# Join (merge) operations
pd.merge(df1, df2, on='key')
# Element-wise column arithmetic
df = pd.DataFrame(data={"A": [1, 2], "B": [1.2, 1.3], "Z": ["foo", "bar"]})
df["C"] = df["A"] + df["B"]
df["D"] = df["A"] * 3
df["E"] = np.sqrt(df["A"])
df["F"] = df.Z.str.upper()
# Descriptive statistics
df.describe()
df.cov()
df.corr()
# Plotting: line plot and histogram
plot_df = pd.DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
plot_df['y'] = plot_df['y'].map(lambda x : x + 1)
plot_df.plot()
plot_df.hist()
# eval() and query() for higher performance on large frames
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
                      for i in range(4))
np.allclose(df1 + df2 + df3 + df4,
            pd.eval('df1 + df2 + df3 + df4'))
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])  # query needs named columns
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)