Created
August 22, 2020 23:17
-
-
Save messefor/ef5a243234b7d6aa08f4af494816d4f4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
x = [1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6] | |
fig, ax = plt.subplots() | |
ax.hist(x,bins=np.arange(8) - 0.5, ec='white') | |
ax.set(xlabel='values', ylabel='Frequency', title='Distribution of x') | |
fig.savefig('hist.png') | |
# | |
# ------------------------------------------------------------ | |
# statistics | |
# ------------------------------------------------------------ | |
import statistics | |
# 最頻値のうち、出現した最初の値を返す@version3.8+ | |
statistics.mode(x) | |
# 複数の最頻値を返したい場合はmodemulti()を使う | |
statistics.multimode(x) | |
# 複数の最頻値のうち最小の値のみを返す | |
min(statistics.multimode(x)) | |
# ------------------------------------------------------------ | |
# Scipy | |
# ------------------------------------------------------------ | |
import scipy | |
mode, count = scipy.stats.mode(x) | |
mode[0] | |
# ------------------------------------------------------------ | |
# numpy | |
# ------------------------------------------------------------ | |
import numpy as np | |
uniqs, counts = np.unique(x, return_counts=True) | |
uniqs[counts == np.amax(counts)] | |
uniqs, counts = np.unique(x, return_counts=True) | |
uniqs[counts == np.amax(counts)].min() | |
# ------------------------------------------------------------ | |
# Pandas | |
# ------------------------------------------------------------ | |
# pandasで最頻値を求める | |
import pandas as pd | |
pd.Series(x, name='values').mode() | |
# 0 2 | |
# 1 3 | |
# 2 4 | |
# dtype: int64 | |
# 複数の最頻値のうち最小の値のみを返す | |
pd.Series(x, name='values').mode().min() # 2 | |
# ------------------------------------------------------------ | |
# Counter | |
# ------------------------------------------------------------ | |
from collections import Counter | |
Counter(x).most_common()[0][0] # 2 | |
# ------------------------------------------------------------- | |
# 処理速度の比較 | |
# ------------------------------------------------------------- | |
# 比較する関数の設定 | |
# 最頻値のうち最初の値を返す(statistics_modeのみ最小値を返す) | |
def mode_numpy(x): | |
uniqs, counts = np.unique(x, return_counts=True) | |
return uniqs[counts == np.amax(counts)][0] | |
methods = { | |
'scipy': lambda x: scipy.stats.mode(x).mode[0], | |
'numpy': mode_numpy, | |
'statistics_multimode': lambda x: statistics.multimode(x)[0], | |
# 'statistics_mode': lambda x: statistics.mode(x), | |
'collections': lambda x: Counter(x).most_common()[0][0], | |
'pandas': lambda x: pd.Series(x, name='values').mode().iloc[0] | |
} | |
# 処理速度を計測(遅い) | |
np.random.seed(1234) | |
niter = 50 | |
do_sort = False | |
# n_xs = [10, 1e2, 1e3, 1e4, 1e5, 1e6] | |
n_xs = np.arange(10, 10000, 1000) | |
results_list = [] | |
for n_x in n_xs: | |
# 対象の長さの1/3の種類数の乱数intを生成 | |
x = np.random.randint(n_x // 3, size=int(n_x)) | |
if do_sort: | |
x = np.sort(x) | |
for method, func in methods.items(): | |
ts = [] | |
for i in range(niter): | |
start = time.time() | |
mode = func(x) | |
t = time.time() - start | |
ts.append(t) | |
mu = np.mean(ts) | |
sd = np.std(ts) | |
result = dict(n_x=n_x, method=method, mu=mu, sd=sd) | |
results_list.append(result) | |
df_result = pd.DataFrame(results_list) | |
fig, ax = plt.subplots() | |
for k, d in df_result.groupby('method'): | |
ax.plot(d['n_x'], d['mu'], label='{} mean'.format(k)) | |
ax.set(title='process time', xlabel='Length of x', | |
ylabel='Elapsed time[sec]') | |
ax.legend() | |
fig.savefig('result1.png') | |
cmap = plt.get_cmap('tab10') | |
gs = df_result.groupby('method') | |
nrows = gs.ngroups | |
fig, axes = plt.subplots(figsize=(8, 3 * nrows), nrows=nrows, ncols=1, | |
sharey=True) | |
for i, (ax, (k, d)) in enumerate(zip(axes, gs)): | |
ax.errorbar(d['n_x'], d['mu'], yerr=d['sd'] / np.sqrt(niter), | |
color=cmap(i), label='{} $\pm$1se'.format(k)) | |
ax.set(title='process time', xlabel='Length of x', | |
ylabel='Elapsed time[sec]') | |
ax.legend() | |
fig.tight_layout() | |
fig.savefig('result2.png') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment