Skip to content
{{ message }}

Instantly share code, notes, and snippets.

# cadadr/Readme.markdown

Last active Apr 27, 2020
Comparison of while and for i in np.arange(...) in loops of various length

for loops with range() seem to be quicker than manual while loops for certain powers of 10. The inverse is true for some other numbers and, interestingly, for 10 million loops. I got this pattern fairly consistently with

• Python 3.6.9
• Linux Mint 19.3 (Linux g-X551CA 5.3.0-46-generic #38~18.04.1-Ubuntu SMP Tue Mar 31 04:17:56 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux)
• Intel(R) Core(TM) i3-3217U CPU @ 1.80GHz
• Numpy 1.18.2

The benchmark function is a dice roller that rolls a positive number of dice, each with a positive number of sides, two or more times per batch, and calculates the mean of each batch. It does this by allocating all nBatch * nRolls random integers between 1 and nSides * nDice (inclusive) up front, so the run time of the function is probably consistent between runs.

 import sys import numpy as np import matplotlib.pyplot as plt # Means of TIMES batches of LIM rolls of NDICE dice of DIM sides. def nmeans(times, lim, ndice, dim): a = np.empty(times) r = np.random.randint(1, dim * ndice + 1, (lim * times,)) # mean of each slice of lim elements (TIMES slices) i = 0 while i < times: b = i * lim a[i] = np.mean(r[b : b+lim-1]) i += 1 return (times, lim, ndice, dim, a) def nmeans_for(times, lim, ndice, dim): a = np.empty(times) r = np.random.randint(1, dim * ndice + 1, (lim * times,)) # mean of each slice of lim elements (TIMES slices) for i in range(times): b = i * lim a[i] = np.mean(r[b : b+lim-1]) return (times, lim, ndice, dim, a) def plotmeans(data): _, _, ndice, dim, dat = data plt.hist(dat) plt.xlabel('mean') plt.xticks(range(1, ndice * dim + 1)) plt.ylabel('freq') plt.show() def bench(): from timeit import timeit nfor = [] nwhile = [] runs = [10, 100, 1000, 10_000, 100_000, 500_000, 1_000_000, 5_000_000] def f(f, n): a = "_for" if f else "" return [ 'nmeans%s(%d, 10, 2, 6)' % (a, n), 'from dicerolls import nmeans%s' % (a,) ] for i in runs: sys.stdout.write("%d batches, nmeans_for: " % (i,)) sys.stdout.flush() t0 = timeit(*f(True, i), number=1) print('%f sec' % (t0,)) sys.stdout.write("%d batches, nmeans: " % (i,)) sys.stdout.flush() t1 = timeit(*f(False, i), number=1) print('%f sec' % (t1,)) nfor.append(t0) nwhile.append(t1) # the plot r = len(runs) i = np.arange(r) w = 0.35 # log comparison plt.bar(i, nfor, w, color='r', label='nmeans_for') plt.bar(i+w, nwhile, w, color='g', label='nmeans_while') plt.title('Run times') plt.xlabel('# of batches') plt.xticks(i+w/2, [str(i) for i in runs]) plt.ylabel('time (log)') plt.yscale('log') plt.legend() plt.tight_layout() plt.show() # time delta dt = [w - f for w, f in zip(nwhile, nfor)] plt.bar(i, dt, w) plt.title('$t(while)-t(for)$ (log)') plt.xlabel('# of batches') plt.xticks(i, [str(i) for i in runs]) plt.ylabel('$\log(\delta t)$') plt.yscale('log') plt.axhline(0, color='black') 
plt.tight_layout() plt.show()
 10 batches, nmeans_for: 0.000949 sec 10 batches, nmeans: 0.000706 sec 100 batches, nmeans_for: 0.004747 sec 100 batches, nmeans: 0.004600 sec 1000 batches, nmeans_for: 0.036245 sec 1000 batches, nmeans: 0.037384 sec 10000 batches, nmeans_for: 0.339179 sec 10000 batches, nmeans: 0.383801 sec 100000 batches, nmeans_for: 3.419067 sec 100000 batches, nmeans: 3.774153 sec 500000 batches, nmeans_for: 17.111042 sec 500000 batches, nmeans: 15.910373 sec 1000000 batches, nmeans_for: 30.236948 sec 1000000 batches, nmeans: 33.663940 sec 5000000 batches, nmeans_for: 177.865651 sec 5000000 batches, nmeans: 157.516901 sec

### cadadr commented Apr 26, 2020

 Times in seconds.

### cadadr commented Apr 26, 2020

 With 10 million times added: 10 batches, nmeans_for: 0.000537 sec 10 batches, nmeans: 0.000371 sec 100 batches, nmeans_for: 0.002961 sec 100 batches, nmeans: 0.003028 sec 1000 batches, nmeans_for: 0.029531 sec 1000 batches, nmeans: 0.030077 sec 10000 batches, nmeans_for: 0.292118 sec 10000 batches, nmeans: 0.298963 sec 100000 batches, nmeans_for: 2.960439 sec 100000 batches, nmeans: 3.230583 sec 500000 batches, nmeans_for: 18.042404 sec 500000 batches, nmeans: 18.034151 sec 1000000 batches, nmeans_for: 36.607472 sec 1000000 batches, nmeans: 37.622316 sec 5000000 batches, nmeans_for: 180.157668 sec 5000000 batches, nmeans: 179.575660 sec 10000000 batches, nmeans_for: 348.877258 sec 10000000 batches, nmeans: 312.674294 sec 

### cadadr commented Apr 27, 2020

 A major improvement is to use numpy.array.reshape and numpy.array.mean instead of manual slicing. import sys import numpy as np import matplotlib.pyplot as plt # Means of TIMES batches of LIM rolls of NDICE dice of DIM sides. def nmeans(times, lim, ndice, dim): a = np.empty(times) r = np.random.randint(1, dim * ndice + 1, (lim * times,)) # mean of each slice of lim elements (TIMES slices) i = 0 while i < times: b = i * lim a[i] = np.mean(r[b : b+lim-1]) i += 1 return (times, lim, ndice, dim, a) def nmeans_for(times, lim, ndice, dim): a = np.empty(times) r = np.random.randint(1, dim * ndice + 1, (lim * times,)) # mean of each slice of lim elements (TIMES slices) for i in range(times): b = i * lim a[i] = np.mean(r[b : b+lim-1]) return (times, lim, ndice, dim, a) def nmeans_reshape(times, lim, ndice, dim): r = np.random.randint(1, dim * ndice + 1, (lim * times,)) a = r.reshape((-1, ndice)).mean(axis=1) return (times, lim, ndice, dim, a) def test(): args = (100, 10, 2, 6) def t(x): len(x(*args)[4]) assert t(nmeans) == t(nmeans_for) == t(nmeans_reshape) def plotmeans(data): _, _, ndice, dim, dat = data plt.hist(dat) plt.xlabel('mean') plt.xticks(range(1, ndice * dim + 1)) plt.ylabel('freq') plt.show() def bench(): from timeit import timeit nfor = [] nwhile = [] nreshape = [] runs = [10, 100, 1000, 10_000, 100_000, 500_000, 1_000_000, 5_000_000][:-1] def f(a, n): return [ 'nmeans%s(%d, 10, 2, 6)' % (a, n), 'from dicerolls import nmeans%s' % (a,) ] for i in runs: sys.stdout.write("%d batches, nmeans_for: " % (i,)) sys.stdout.flush() t0 = timeit(*f("_for", i), number=1) print('%f sec' % (t0,)) sys.stdout.write("%d batches, nmeans: " % (i,)) sys.stdout.flush() t1 = timeit(*f("", i), number=1) print('%f sec' % (t1,)) sys.stdout.write("%d batches, nmeans_reshape: " % (i,)) sys.stdout.flush() t2 = timeit(*f("_reshape", i), number=1) print('%f sec' % (t2,)) nfor.append(t0) nwhile.append(t1) nreshape.append(t2) # the plot r = len(runs) i = np.arange(r) w = 0.20 # log comparison 
plt.bar(i, nfor, w, color='r', label='nmeans_for') plt.bar(i+w, nwhile, w, color='g', label='nmeans_while') plt.bar(i+2*w, nreshape, w, color='yellow', label='nmeans_reshape') plt.title('Run times') plt.xlabel('# of batches') plt.xticks(i+w/3, [str(i) for i in runs]) plt.ylabel('time (log)') plt.yscale('log') plt.legend() plt.tight_layout() plt.show() This improves the speed by orders of magnitude: 10 batches, nmeans_for: 0.000555 sec 10 batches, nmeans: 0.000376 sec 10 batches, nmeans_reshape: 0.000149 sec 100 batches, nmeans_for: 0.003613 sec 100 batches, nmeans: 0.003130 sec 100 batches, nmeans_reshape: 0.000175 sec 1000 batches, nmeans_for: 0.029829 sec 1000 batches, nmeans: 0.028750 sec 1000 batches, nmeans_reshape: 0.000544 sec 10000 batches, nmeans_for: 0.289810 sec 10000 batches, nmeans: 0.286741 sec 10000 batches, nmeans_reshape: 0.003652 sec 100000 batches, nmeans_for: 2.832762 sec 100000 batches, nmeans: 2.863872 sec 100000 batches, nmeans_reshape: 0.034788 sec 500000 batches, nmeans_for: 14.211868 sec 500000 batches, nmeans: 14.302791 sec 500000 batches, nmeans_reshape: 0.192003 sec 1000000 batches, nmeans_for: 28.512097 sec 1000000 batches, nmeans: 28.862442 sec 1000000 batches, nmeans_reshape: 0.412655 sec  which is 10: 3.724832 x 100: 21.252941 x 1000: 55.238889 x 10000: 79.400000 x 100000: 81.448016 x 500000: 74.020146 x 1000000: 69.095110 x  Thanks to @holger for the suggestion of using reshape and means.
to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.