Last active
August 28, 2017 08:28
-
-
Save ansvver/3d5fa4aacba4a6d1833e014dc4799bed to your computer and use it in GitHub Desktop.
Pandas common usage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pandas display / output options.
import pandas as pd
# NOTE(review): the original also set 'display.height', which was removed
# from pandas (setting it now raises OptionError); 'max_rows' and 'width'
# cover the same need.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
# Show floats with 3 decimals so large numbers are not rendered in
# scientific notation.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Build a DataFrame from a tab-separated file.
df = pd.read_csv('input.data', sep='\t',
                 names=['nid', 'url', 'timestamp', 'title',
                        'attention', 'show', 'click', 'down'])
# Flatten / set / reorder indexes and columns.
df.reset_index(drop=True)
df.set_index(['state', 'year'])
# NOTE(review): reindex_axis was removed in pandas 1.0; reindex with
# axis=1 is the modern equivalent.
df.reindex(sorted(df.columns), axis=1)
# NOTE(review): assumes `datetime` is imported and `date_series` is an
# iterable of datetimes defined elsewhere -- confirm against caller.
df.reindex(map(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"), date_series))
# Distinct values of a column.
pd.unique(df.column_name.ravel())
# Drop duplicate rows.
df.drop_duplicates()
# Drop rows where ALL of the listed columns are NaN.
df.dropna(subset=['three', 'four', 'five'], how='all')
# Convert a column to numeric, coercing unparseable values to NaN.
# NOTE(review): the original used convert_objects(convert_numeric=True),
# which was removed from pandas; pd.to_numeric(errors='coerce') is the
# documented replacement.
df['col'] = pd.to_numeric(df['col'], errors='coerce')
df_clean.loc[:, 'show'] = pd.to_numeric(df_clean['show'], errors='coerce')
# Keep rows whose column value is in a whitelist.
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
# Keep rows whose column value is NOT in the whitelist.
valuelist = ['value1', 'value2', 'value3']
df = df[~df.column.isin(valuelist)]  # fixed: original used undefined name `value_list`
# Delete a column.
del df['column']
# Drop rows with NaN in ANY of the listed columns.
df_clean.dropna(axis=0, how='any', thresh=None,
                subset=['show', 'click', 'down', 'sc_ratio'], inplace=True)
# Complex boolean filtering -- combine conditions with | & ~.
newdf = df[(df['column_one'] > 2004) & (df['column_two'] == 9)]
# Rename columns with an explicit old-name -> new-name mapping.
df = df.rename(columns={
    'col1 old name': 'col1 new name',
    'col2 old name': 'col2 new name',
    'col3 old name': 'col3 new name',
})
# map, apply, applymap.
# fixed: original assigned a bare map() iterator to df.columns; a concrete
# list is unambiguous and works on all pandas versions.
df.columns = [c.lower() for c in df.columns]
df['str_col'].dropna().map(lambda x: 'map_' + x)
# NOTE(review): the original used .ix, which was removed in pandas 1.0;
# .loc is the label-based replacement.
df.loc[:, ['int_col', 'float_col']].apply(np.sqrt)
def some_fn(x):
    """Element-wise transform for DataFrame.applymap.

    Strings get an 'applymap_' prefix, truthy non-strings are scaled by
    100, and falsy values (0, None, '', ...) map to None.
    """
    if isinstance(x, str):  # fixed: `type(x) is str` -> isinstance
        return 'applymap_' + x
    elif x:
        return 100 * x
    else:
        return None  # explicit None instead of a bare `return`
# Apply an element-wise function to every cell.
# NOTE(review): applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map -- confirm the target pandas version before switching.
df.applymap(some_fn)
# Another way to rename columns: derive each new name from the old one.
df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
df.rename(columns={'int_col': 'some_other_name'})
# Iterate over the rows of a DataFrame -- only if truly necessary (slow).
# fixed: original used Python-2 print statements, a SyntaxError on Python 3.
for index, row in df.iterrows():
    print(index, row['some column'])
# itertuples is much faster than iterrows.
for row in df.itertuples():
    print(row)
# Built-in vectorized string operations.
# Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
df.column.str[0:2]
# Lower-case every value of a column.
df.column_name = df.column_name.str.lower()
# Length of each string value in a column.
df.column_name.str.len()
# Sort by index.
df.sort_index(inplace=True)
# Multi-column sort.
# fixed: DataFrame.sort was removed in pandas 0.20 (sort_values is the
# replacement), and the original assigned the result of inplace=True,
# which binds None to df.
df = df.sort_values(by=['col1', 'col2', 'col3'], ascending=[True, True, False])
# First 5 rows of each group (combine with sort_values to rank first).
top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
# Select a subset of columns.
df[['float_col', 'int_col']]
# All rows where a column is null.
newdf = df[df['column'].isnull()]
# Select records by several keys of a MultiIndex.
df.xs(('index level 1 value', 'index level 2 value'), level=('level 1', 'level 2'))
# Replace every NaN with None -- handy before loading into a database.
df = df.where((pd.notnull(df)), None)
# Fast row count.
len(df.index)
# Pivot table: values to show, row keys, column keys.
df.pivot_table(['data1'], index='key1', columns='key2')
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
titanic.pivot_table('survived', index='sex', columns='class')
# Change a column's dtype.
df.column_name = df.column_name.astype(np.int64)
# Regex-clean every column (strip all non-digit characters).
for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True)
# Assign a value to rows matched by a filter.
df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value),
       ['column_to_change']] = new_value
# Fill missing values with a different default per column.
df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
})
# Replace missing values with the column mean.
mean = df['float_col'].mean()
# fixed: original discarded the fillna result (fillna returns a new Series).
df['float_col'] = df['float_col'].fillna(mean)
# Compute a new column from other columns.
df['newcol'] = df['col1'].map(str) + df['col2'].map(str)
# Arithmetic that treats NaN as 0: where col1 is null use 0, else col1.
df['new_col'] = np.where(pd.isnull(df['col1']), 0, df['col1']) + df['col2']
# Split one column into two at the first ': ' separator.
df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
# Flatten a column MultiIndex to its first level.
df.columns = df.columns.get_level_values(0)
# Build a DataFrame from a Python dict.
df = pd.DataFrame(list(a_dictionary.items()), columns=['column1', 'column2'])
# Build dicts (one per row) from a DataFrame.
df.T.to_dict().values()
# Rows duplicated over specific columns; keep may be 'first', 'last' or False.
dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)]
# Pipeline: group, sum, then sort descending and take the top 2.
series_df.groupby(['DATA_TYPE', 'DATA_DATE', 'ADDR_ID'], as_index=False)['QUANTITY'] \
    .sum().sort_values(by='QUANTITY', ascending=False).head(2)
# groupby can be followed by aggregate, filter, transform or apply.
df.groupby('key').aggregate(['min', np.median, max])
df.groupby('key').aggregate({'data1': 'min',
                             'data2': 'max'})
def filter_func(x):
    """Group predicate for GroupBy.filter: keep groups whose 'data2'
    standard deviation exceeds 4."""
    return x['data2'].std() > 4
# NOTE(review): display() is an IPython/Jupyter helper, not a pandas
# function -- this line only works inside a notebook environment.
display('df', "df.groupby('key').std()", "df.groupby('key').filter(filter_func)")
# Center each group around its own mean.
df.groupby('key').transform(lambda x: x - x.mean())
def norm_by_data2(x):
    """Normalize 'data1' by the group's total 'data2'.

    x is the DataFrame of one group's rows; mutates and returns it.
    """
    x['data1'] /= x['data2'].sum()
    return x
df.groupby('key').apply(norm_by_data2)
# Append one row to a DataFrame.
# fixed: DataFrame.append was removed in pandas 2.0; pd.concat is the
# documented replacement. The original also discarded the result --
# append never mutated in place.
series_df_rslt = pd.concat(
    [series_df_rslt, pd.DataFrame([{"DETAIL_ID": _id, "DATA_TYPE": _type}])],
    ignore_index=True)
# Concatenate two DataFrames with the same columns.
# NOTE(review): the original comment claimed concat does not copy data;
# in fact pd.concat copies -- the result does not share memory with its inputs.
series_df = pd.concat([series_df, series_df_2], ignore_index=True)
pd.concat([df1, df2])
pd.concat([df3, df4], axis='columns')  # fixed: 'col' is not a valid axis alias
pd.concat([x, y], ignore_index=True)
# SQL-style join on a key column.
pd.merge(df1, df2, on='key')
# Column-wise (vectorized) arithmetic.
df = pd.DataFrame(data={"A": [1, 2], "B": [1.2, 1.3]})
df["C"] = df["A"] + df["B"]
df["D"] = df["A"] * 3
df["E"] = np.sqrt(df["A"])
# fixed: the original read df.Z.str.upper(), but no column 'Z' exists and
# every column above is numeric; add a real string column to demonstrate
# the .str accessor.
df["Z"] = ["a", "b"]
df["F"] = df.Z.str.upper()
# Summary statistics.
df.describe()
df.cov()
df.corr()
# Line plot and histogram (requires matplotlib).
# fixed: bare `DataFrame` was never imported -- use pd.DataFrame.
plot_df = pd.DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
plot_df['y'] = plot_df['y'].map(lambda x: x + 1)
plot_df.plot()
plot_df.hist()
# pd.eval() / DataFrame.query() for higher performance on large frames.
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
                      for i in range(4))
np.allclose(df1 + df2 + df3 + df4,
            pd.eval('df1 + df2 + df3 + df4'))
# NOTE(review): `df` here is assumed to carry numeric columns A, B and C
# (e.g. the frame built in the vectorized-ops example) -- confirm before use.
Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
# @Cmean references the local Python variable inside the query string.
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment