Last active
March 27, 2022 21:33
-
-
Save mkschleg/300f0fb742235a20a0084dfa85912cee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Julia Version 1.7.2 | |
Commit bf53498635 (2022-02-06 15:21 UTC) | |
Platform Info: | |
OS: Linux (x86_64-pc-linux-gnu) | |
CPU: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz | |
WORD_SIZE: 64 | |
LIBM: libopenlibm | |
LLVM: libLLVM-12.0.1 (ORCJIT, skylake) | |
Environment: | |
JULIA_VERSION = 1.7 | |
JULIA_LOAD_PATH = /home/matt/Documents/upstream/Flux.jl/test/:/home/matt/Documents/upstream/Flux.jl/experiment/: | |
nothing | |
RNN Vec CPU n=2, ts=1 | |
forward | |
247.924 ns (4 allocations: 288 bytes) | |
backward | |
8.621 μs (82 allocations: 3.67 KiB) | |
forw and back | |
17.016 μs (177 allocations: 15.06 KiB) | |
RNN Vec CUDA n=2, ts=1 | |
forward | |
91.333 μs (90 allocations: 6.31 KiB) | |
backward | |
178.409 μs (272 allocations: 12.58 KiB) | |
forw and back | |
299.957 μs (509 allocations: 31.27 KiB) | |
RNN Vec CPU n=2, ts=4 | |
forward | |
628.889 ns (13 allocations: 1.00 KiB) | |
backward | |
22.644 μs (241 allocations: 11.80 KiB) | |
forw and back | |
37.727 μs (432 allocations: 39.89 KiB) | |
RNN Vec CUDA n=2, ts=4 | |
forward | |
172.301 μs (357 allocations: 25.08 KiB) | |
backward | |
445.365 μs (1208 allocations: 63.81 KiB) | |
forw and back | |
680.446 μs (1913 allocations: 119.92 KiB) | |
RNN Vec CPU n=2, ts=16 | |
forward | |
2.104 μs (49 allocations: 3.91 KiB) | |
backward | |
71.660 μs (877 allocations: 44.05 KiB) | |
forw and back | |
107.745 μs (1452 allocations: 138.80 KiB) | |
RNN Vec CUDA n=2, ts=16 | |
forward | |
461.348 μs (1425 allocations: 100.17 KiB) | |
backward | |
1.338 ms (4952 allocations: 268.44 KiB) | |
forw and back | |
1.997 ms (7529 allocations: 473.95 KiB) | |
RNN Vec CPU n=2, ts=64 | |
forward | |
8.031 μs (193 allocations: 15.55 KiB) | |
backward | |
268.009 μs (3422 allocations: 173.02 KiB) | |
forw and back | |
384.960 μs (5534 allocations: 534.33 KiB) | |
RNN Vec CUDA n=2, ts=64 | |
forward | |
1.607 ms (5697 allocations: 400.56 KiB) | |
backward | |
4.833 ms (19929 allocations: 1.06 MiB) | |
forw and back | |
7.178 ms (29995 allocations: 1.85 MiB) | |
RNN Vec CPU n=20, ts=1 | |
forward | |
2.042 μs (4 allocations: 3.73 KiB) | |
backward | |
11.603 μs (82 allocations: 10.66 KiB) | |
forw and back | |
22.048 μs (177 allocations: 28.88 KiB) | |
RNN Vec CUDA n=20, ts=1 | |
forward | |
93.403 μs (90 allocations: 6.31 KiB) | |
backward | |
178.111 μs (272 allocations: 12.58 KiB) | |
forw and back | |
304.851 μs (509 allocations: 31.27 KiB) | |
RNN Vec CPU n=20, ts=4 | |
forward | |
8.987 μs (13 allocations: 19.64 KiB) | |
backward | |
38.078 μs (241 allocations: 59.75 KiB) | |
forw and back | |
62.202 μs (432 allocations: 125.05 KiB) | |
RNN Vec CUDA n=20, ts=4 | |
forward | |
177.066 μs (357 allocations: 25.08 KiB) | |
backward | |
446.968 μs (1208 allocations: 63.81 KiB) | |
forw and back | |
685.343 μs (1913 allocations: 119.92 KiB) | |
RNN Vec CPU n=20, ts=16 | |
forward | |
37.185 μs (49 allocations: 83.30 KiB) | |
backward | |
137.436 μs (877 allocations: 255.88 KiB) | |
forw and back | |
210.404 μs (1452 allocations: 509.33 KiB) | |
RNN Vec CUDA n=20, ts=16 | |
forward | |
475.832 μs (1425 allocations: 100.17 KiB) | |
backward | |
1.346 ms (4952 allocations: 268.44 KiB) | |
forw and back | |
2.025 ms (7529 allocations: 473.95 KiB) | |
RNN Vec CPU n=20, ts=64 | |
forward | |
149.940 μs (193 allocations: 337.94 KiB) | |
backward | |
542.958 μs (3422 allocations: 1.02 MiB) | |
forw and back | |
816.599 μs (5534 allocations: 2.00 MiB) | |
RNN Vec CUDA n=20, ts=64 | |
forward | |
1.649 ms (5697 allocations: 400.56 KiB) | |
backward | |
4.858 ms (19929 allocations: 1.06 MiB) | |
forw and back | |
7.280 ms (29995 allocations: 1.85 MiB) | |
RNN Vec CPU n=200, ts=1 | |
forward | |
389.813 μs (6 allocations: 313.53 KiB) | |
backward | |
313.736 μs (86 allocations: 630.98 KiB) | |
forw and back | |
529.839 μs (185 allocations: 1.24 MiB) | |
RNN Vec CUDA n=200, ts=1 | |
forward | |
104.113 μs (91 allocations: 6.33 KiB) | |
backward | |
190.055 μs (273 allocations: 12.59 KiB) | |
forw and back | |
332.940 μs (554 allocations: 33.55 KiB) | |
RNN Vec CPU n=200, ts=4 | |
forward | |
1.600 ms (24 allocations: 1.68 MiB) | |
backward | |
1.533 ms (269 allocations: 4.29 MiB) | |
forw and back | |
2.464 ms (482 allocations: 7.67 MiB) | |
RNN Vec CUDA n=200, ts=4 | |
forward | |
205.567 μs (361 allocations: 25.14 KiB) | |
backward | |
493.036 μs (1221 allocations: 64.02 KiB) | |
forw and back | |
755.744 μs (1979 allocations: 122.53 KiB) | |
RNN Vec CPU n=200, ts=16 | |
forward | |
6.451 ms (96 allocations: 7.17 MiB) | |
backward | |
6.535 ms (1001 allocations: 18.99 MiB) | |
forw and back | |
10.322 ms (1670 allocations: 33.42 MiB) | |
RNN Vec CUDA n=200, ts=16 | |
forward | |
556.426 μs (1441 allocations: 100.42 KiB) | |
backward | |
1.517 ms (5013 allocations: 269.39 KiB) | |
forw and back | |
2.292 ms (7679 allocations: 477.88 KiB) | |
RNN Vec CPU n=200, ts=64 | |
forward | |
26.120 ms (384 allocations: 29.15 MiB) | |
backward | |
26.483 ms (3930 allocations: 77.77 MiB) | |
forw and back | |
44.487 ms (6424 allocations: 136.40 MiB) | |
RNN Vec CUDA n=200, ts=64 | |
forward | |
1.916 ms (5761 allocations: 401.56 KiB) | |
backward | |
5.437 ms (20182 allocations: 1.07 MiB) | |
forw and back | |
8.405 ms (30481 allocations: 1.85 MiB) | |
RNN Vec CPU n=1000, ts=1 | |
forward | |
7.647 ms (6 allocations: 7.63 MiB) | |
backward | |
11.449 ms (86 allocations: 15.27 MiB) | |
forw and back | |
17.773 ms (185 allocations: 30.55 MiB) | |
RNN Vec CUDA n=1000, ts=1 | |
forward | |
586.533 μs (165 allocations: 9.67 KiB) | |
backward | |
1.009 ms (368 allocations: 16.27 KiB) | |
forw and back | |
1.705 ms (723 allocations: 40.56 KiB) | |
RNN Vec CPU n=1000, ts=4 | |
forward | |
44.705 ms (24 allocations: 41.97 MiB) | |
backward | |
76.916 ms (269 allocations: 106.86 MiB) | |
forw and back | |
119.754 ms (482 allocations: 190.81 MiB) | |
RNN Vec CUDA n=1000, ts=4 | |
forward | |
2.359 ms (525 allocations: 29.89 KiB) | |
backward | |
4.385 ms (1469 allocations: 70.08 KiB) | |
forw and back | |
6.745 ms (2391 allocations: 133.34 KiB) | |
RNN Vec CPU n=1000, ts=16 | |
forward | |
262.950 ms (96 allocations: 179.30 MiB) | |
backward | |
370.010 ms (1001 allocations: 473.19 MiB) | |
forw and back | |
573.668 ms (1670 allocations: 831.87 MiB) | |
RNN Vec CUDA n=1000, ts=16 | |
forward | |
7.912 ms (1965 allocations: 110.80 KiB) | |
backward | |
18.333 ms (5873 allocations: 285.02 KiB) | |
forw and back | |
27.314 ms (9063 allocations: 503.88 KiB) | |
RNN Vec CPU n=1000, ts=64 | |
forward | |
1.064 s (384 allocations: 728.62 MiB) | |
backward | |
1.533 s (3930 allocations: 1.89 GiB) | |
forw and back | |
2.292 s (6424 allocations: 3.32 GiB) | |
RNN Vec CUDA n=1000, ts=64 | |
forward | |
30.802 ms (7725 allocations: 434.44 KiB) | |
backward | |
74.351 ms (23490 allocations: 1.12 MiB) | |
forw and back | |
112.124 ms (35753 allocations: 1.94 MiB) | |
RNN Block CPU n=2, ts=1 | |
forward | |
1.155 μs (14 allocations: 640 bytes) | |
backward | |
9.752 μs (97 allocations: 4.19 KiB) | |
forw and back | |
26.600 μs (226 allocations: 19.52 KiB) | |
RNN Block CUDA n=2, ts=1 | |
forward | |
116.712 μs (134 allocations: 7.88 KiB) | |
backward | |
210.616 μs (334 allocations: 14.95 KiB) | |
forw and back | |
373.062 μs (639 allocations: 38.55 KiB) | |
RNN Block CPU n=2, ts=4 | |
forward | |
1.577 μs (23 allocations: 1.39 KiB) | |
backward | |
24.358 μs (271 allocations: 13.16 KiB) | |
forw and back | |
49.624 μs (490 allocations: 45.84 KiB) | |
RNN Block CUDA n=2, ts=4 | |
forward | |
207.293 μs (491 allocations: 30.02 KiB) | |
backward | |
490.103 μs (1408 allocations: 71.98 KiB) | |
forw and back | |
757.963 μs (2184 allocations: 130.11 KiB) | |
RNN Block CPU n=2, ts=16 | |
forward | |
3.159 μs (59 allocations: 4.50 KiB) | |
backward | |
76.241 μs (967 allocations: 48.70 KiB) | |
forw and back | |
122.146 μs (1546 allocations: 150.66 KiB) | |
RNN Block CUDA n=2, ts=16 | |
forward | |
526.463 μs (1919 allocations: 118.61 KiB) | |
backward | |
1.429 ms (5704 allocations: 299.89 KiB) | |
forw and back | |
2.120 ms (8364 allocations: 495.92 KiB) | |
RNN Block CPU n=2, ts=64 | |
forward | |
9.480 μs (203 allocations: 16.88 KiB) | |
backward | |
282.026 μs (3752 allocations: 190.81 KiB) | |
forw and back | |
411.365 μs (5772 allocations: 569.44 KiB) | |
RNN Block CUDA n=2, ts=64 | |
forward | |
1.799 ms (7631 allocations: 473.00 KiB) | |
backward | |
5.111 ms (22889 allocations: 1.18 MiB) | |
forw and back | |
7.401 ms (33086 allocations: 1.91 MiB) | |
RNN Block CPU n=20, ts=1 | |
forward | |
3.265 μs (14 allocations: 5.77 KiB) | |
backward | |
12.391 μs (97 allocations: 12.86 KiB) | |
forw and back | |
31.334 μs (226 allocations: 36.70 KiB) | |
RNN Block CUDA n=20, ts=1 | |
forward | |
120.470 μs (134 allocations: 7.88 KiB) | |
backward | |
209.458 μs (334 allocations: 14.95 KiB) | |
forw and back | |
372.813 μs (639 allocations: 38.55 KiB) | |
RNN Block CPU n=20, ts=4 | |
forward | |
11.209 μs (23 allocations: 26.28 KiB) | |
backward | |
40.215 μs (271 allocations: 67.36 KiB) | |
forw and back | |
73.945 μs (490 allocations: 138.44 KiB) | |
RNN Block CUDA n=20, ts=4 | |
forward | |
211.320 μs (491 allocations: 30.02 KiB) | |
backward | |
493.841 μs (1408 allocations: 71.98 KiB) | |
forw and back | |
780.405 μs (2227 allocations: 132.58 KiB) | |
RNN Block CPU n=20, ts=16 | |
forward | |
42.479 μs (60 allocations: 108.61 KiB) | |
backward | |
143.415 μs (968 allocations: 285.27 KiB) | |
forw and back | |
227.558 μs (1548 allocations: 545.33 KiB) | |
RNN Block CUDA n=20, ts=16 | |
forward | |
547.352 μs (1919 allocations: 118.61 KiB) | |
backward | |
1.448 ms (5704 allocations: 299.89 KiB) | |
forw and back | |
2.157 ms (8407 allocations: 498.39 KiB) | |
RNN Block CPU n=20, ts=64 | |
forward | |
166.558 μs (204 allocations: 438.25 KiB) | |
backward | |
560.376 μs (3753 allocations: 1.13 MiB) | |
forw and back | |
843.331 μs (5774 allocations: 2.12 MiB) | |
RNN Block CUDA n=20, ts=64 | |
forward | |
1.858 ms (7631 allocations: 473.00 KiB) | |
backward | |
5.187 ms (22889 allocations: 1.18 MiB) | |
forw and back | |
7.550 ms (33129 allocations: 1.92 MiB) | |
RNN Block CPU n=200, ts=1 | |
forward | |
407.682 μs (17 allocations: 470.09 KiB) | |
backward | |
347.632 μs (102 allocations: 787.73 KiB) | |
forw and back | |
601.803 μs (236 allocations: 1.55 MiB) | |
RNN Block CUDA n=200, ts=1 | |
forward | |
125.213 μs (137 allocations: 7.92 KiB) | |
backward | |
220.922 μs (335 allocations: 14.97 KiB) | |
forw and back | |
399.846 μs (687 allocations: 41.09 KiB) | |
RNN Block CPU n=200, ts=4 | |
forward | |
1.669 ms (35 allocations: 2.29 MiB) | |
backward | |
1.658 ms (300 allocations: 4.90 MiB) | |
forw and back | |
2.669 ms (539 allocations: 8.44 MiB) | |
RNN Block CUDA n=200, ts=4 | |
forward | |
232.016 μs (503 allocations: 30.20 KiB) | |
backward | |
530.763 μs (1421 allocations: 72.19 KiB) | |
forw and back | |
842.546 μs (2256 allocations: 133.03 KiB) | |
RNN Block CPU n=200, ts=16 | |
forward | |
3.832 ms (107 allocations: 9.62 MiB) | |
backward | |
5.478 ms (1092 allocations: 21.43 MiB) | |
forw and back | |
8.418 ms (1751 allocations: 36.03 MiB) | |
RNN Block CUDA n=200, ts=16 | |
forward | |
611.755 μs (1967 allocations: 119.36 KiB) | |
backward | |
1.621 ms (5765 allocations: 300.84 KiB) | |
forw and back | |
2.406 ms (8532 allocations: 500.34 KiB) | |
RNN Block CPU n=200, ts=64 | |
forward | |
15.785 ms (395 allocations: 38.92 MiB) | |
backward | |
22.297 ms (4261 allocations: 87.55 MiB) | |
forw and back | |
36.527 ms (6601 allocations: 146.35 MiB) | |
RNN Block CUDA n=200, ts=64 | |
forward | |
2.144 ms (7823 allocations: 476.00 KiB) | |
backward | |
6.171 ms (23142 allocations: 1.19 MiB) | |
forw and back | |
9.122 ms (33638 allocations: 1.92 MiB) | |
RNN Block CPU n=1000, ts=1 | |
forward | |
7.971 ms (19 allocations: 11.45 MiB) | |
backward | |
11.804 ms (102 allocations: 19.09 MiB) | |
forw and back | |
18.380 ms (238 allocations: 38.18 MiB) | |
RNN Block CUDA n=1000, ts=1 | |
forward | |
614.054 μs (213 allocations: 11.30 KiB) | |
backward | |
1.129 ms (430 allocations: 18.64 KiB) | |
forw and back | |
1.735 ms (858 allocations: 48.14 KiB) | |
RNN Block CPU n=1000, ts=4 | |
forward | |
45.956 ms (37 allocations: 57.23 MiB) | |
backward | |
78.570 ms (300 allocations: 122.12 MiB) | |
forw and back | |
121.519 ms (541 allocations: 209.89 MiB) | |
RNN Block CUDA n=1000, ts=4 | |
forward | |
2.206 ms (669 allocations: 34.98 KiB) | |
backward | |
4.613 ms (1669 allocations: 78.25 KiB) | |
forw and back | |
6.913 ms (2670 allocations: 143.88 KiB) | |
RNN Block CPU n=1000, ts=16 | |
forward | |
271.100 ms (109 allocations: 240.33 MiB) | |
backward | |
383.031 ms (1092 allocations: 534.23 MiB) | |
forw and back | |
584.032 ms (1753 allocations: 896.73 MiB) | |
RNN Block CUDA n=1000, ts=16 | |
forward | |
8.093 ms (2493 allocations: 129.77 KiB) | |
backward | |
19.103 ms (6625 allocations: 316.47 KiB) | |
forw and back | |
28.240 ms (9918 allocations: 526.38 KiB) | |
RNN Block CPU n=1000, ts=64 | |
forward | |
811.597 ms (397 allocations: 972.76 MiB) | |
backward | |
1.420 s (4261 allocations: 2.13 GiB) | |
forw and back | |
2.484 s (6603 allocations: 3.56 GiB) | |
RNN Block CUDA n=1000, ts=64 | |
forward | |
32.477 ms (9789 allocations: 508.91 KiB) | |
backward | |
78.004 ms (26450 allocations: 1.24 MiB) | |
forw and back | |
116.131 ms (38912 allocations: 2.01 MiB) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment