Last active
August 29, 2016 23:41
-
-
Save moskewcz/bc3be55afd2d15e90100a1957ca805be to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Command lines that were run:
# boda-native case (generates out-boda-native.py; error-check results are in test_compute.txt)
boda test_compute --model-name=googlenet_conv --wins-per-image=1 --imgs='(pil_fn=%(boda_test_dir)/pascal/head_1/%%s.txt)' --run-cnet='(in_dims=(img=1,y=1024,x=2048))' --cf2='(mode=rtc,per_call_fn=out-boda-native.py,op_tune=(use_culibs=0,k1conv=1,tconv=1),enable_write_xpose=1)' --max-err=1
# cudnn-v5 (via boda) case (generates out-cudnn-v5.py; only the convolutions use cuDNN; error-check results are in test_compute.txt)
boda test_compute --model-name=googlenet_conv --wins-per-image=1 --imgs='(pil_fn=%(boda_test_dir)/pascal/head_1/%%s.txt)' --run-cnet='(in_dims=(img=1,y=1024,x=2048))' --cf2='(mode=rtc,per_call_fn=out-cudnn-v5.py,op_tune=(use_culibs=1))' --max-err=1
# To generate the profiles from the two per-call output files:
python ../../pysrc/flops.py --per-layer=1 --ai-mnk=1 --per-layer-in-info=1 --profile=1 --net-fn out-boda-native.py
python ../../pysrc/flops.py --per-layer=1 --ai-mnk=1 --per-layer-in-info=1 --profile=1 --net-fn out-cudnn-v5.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
conv1 input 1*3*1024*2048=25.2MB filts 64*3*7*7=37.6KB out 1*64*512*1024=134MB | |
conv1 FWD 9.87GF 159MB FWD_AI 61.9F/B MxNxK=524288x64x147 --- 4.58ms 2.15TF/s | |
reduction2 input 1*64*256*512=33.6MB filts 64*64*1*1=16.4KB out 1*64*256*512=33.6MB | |
reduction2 FWD 1.07GF 67.1MB FWD_AI 16.0F/B MxNxK=131072x64x64 --- 2.52ms 426GF/s | |
conv2 input 1*64*256*512=33.6MB filts 192*64*3*3=442KB out 1*192*256*512=101MB | |
conv2 FWD 29.0GF 135MB FWD_AI 215F/B MxNxK=131072x192x576 --- 13.0ms 2.22TF/s | |
icp1_reduction1 input 1*192*128*256=25.2MB filts 96*192*1*1=73.7KB out 1*96*128*256=12.6MB | |
icp1_reduction1 FWD 1.21GF 37.8MB FWD_AI 31.9F/B MxNxK=32768x96x192 --- 605us 2.00TF/s | |
icp1_out1 input 1*96*128*256=12.6MB filts 128*96*3*3=442KB out 1*128*128*256=16.8MB | |
icp1_out1 FWD 7.25GF 29.8MB FWD_AI 243F/B MxNxK=32768x128x864 --- 2.61ms 2.78TF/s | |
icp1_reduction2 input 1*192*128*256=25.2MB filts 16*192*1*1=12.3KB out 1*16*128*256=2.10MB | |
icp1_reduction2 FWD 201MF 27.3MB FWD_AI 7.38F/B MxNxK=32768x16x192 --- 330us 610GF/s | |
icp1_out2 input 1*16*128*256=2.10MB filts 32*16*5*5=51.2KB out 1*32*128*256=4.19MB | |
icp1_out2 FWD 839MF 6.34MB FWD_AI 132F/B MxNxK=32768x32x400 --- 344us 2.44TF/s | |
icp1_out3 input 1*192*128*256=25.2MB filts 32*192*1*1=24.6KB out 1*32*128*256=4.19MB | |
icp1_out3 FWD 403MF 29.4MB FWD_AI 13.7F/B MxNxK=32768x32x192 --- 768us 524GF/s | |
icp1_out0 input 1*192*128*256=25.2MB filts 64*192*1*1=49.2KB out 1*64*128*256=8.39MB | |
icp1_out0 FWD 805MF 33.6MB FWD_AI 24.0F/B MxNxK=32768x64x192 --- 339us 2.38TF/s | |
icp2_reduction1 input 1*256*128*256=33.6MB filts 128*256*1*1=131KB out 1*128*128*256=16.8MB | |
icp2_reduction1 FWD 2.15GF 50.5MB FWD_AI 42.6F/B MxNxK=32768x128x256 --- 864us 2.48TF/s | |
icp2_out1 input 1*128*128*256=16.8MB filts 192*128*3*3=885KB out 1*192*128*256=25.2MB | |
icp2_out1 FWD 14.5GF 42.8MB FWD_AI 338F/B MxNxK=32768x192x1152 --- 6.12ms 2.37TF/s | |
icp2_reduction2 input 1*256*128*256=33.6MB filts 32*256*1*1=32.8KB out 1*32*128*256=4.19MB | |
icp2_reduction2 FWD 537MF 37.8MB FWD_AI 14.2F/B MxNxK=32768x32x256 --- 937us 573GF/s | |
icp2_out2 input 1*32*128*256=4.19MB filts 96*32*5*5=307KB out 1*96*128*256=12.6MB | |
icp2_out2 FWD 5.03GF 17.1MB FWD_AI 295F/B MxNxK=32768x96x800 --- 1.76ms 2.86TF/s | |
icp2_out3 input 1*256*128*256=33.6MB filts 64*256*1*1=65.5KB out 1*64*128*256=8.39MB | |
icp2_out3 FWD 1.07GF 42.0MB FWD_AI 25.6F/B MxNxK=32768x64x256 --- 400us 2.68TF/s | |
icp2_out0 input 1*256*128*256=33.6MB filts 128*256*1*1=131KB out 1*128*128*256=16.8MB | |
icp2_out0 FWD 2.15GF 50.5MB FWD_AI 42.6F/B MxNxK=32768x128x256 --- 873us 2.46TF/s | |
icp3_reduction1 input 1*480*64*128=15.7MB filts 96*480*1*1=184KB out 1*96*64*128=3.15MB | |
icp3_reduction1 FWD 755MF 19.1MB FWD_AI 39.6F/B MxNxK=8192x96x480 --- 351us 2.15TF/s | |
icp3_out1 input 1*96*64*128=3.15MB filts 208*96*3*3=719KB out 1*208*64*128=6.82MB | |
icp3_out1 FWD 2.94GF 10.7MB FWD_AI 276F/B MxNxK=8192x208x864 --- 1.30ms 2.27TF/s | |
icp3_reduction2 input 1*480*64*128=15.7MB filts 16*480*1*1=30.7KB out 1*16*64*128=524KB | |
icp3_reduction2 FWD 126MF 16.3MB FWD_AI 7.73F/B MxNxK=8192x16x480 --- 657us 191GF/s | |
icp3_out2 input 1*16*64*128=524KB filts 48*16*5*5=76.8KB out 1*48*64*128=1.57MB | |
icp3_out2 FWD 315MF 2.17MB FWD_AI 145F/B MxNxK=8192x48x400 --- 164us 1.92TF/s | |
icp3_out3 input 1*480*64*128=15.7MB filts 64*480*1*1=123KB out 1*64*64*128=2.10MB | |
icp3_out3 FWD 503MF 17.9MB FWD_AI 28.0F/B MxNxK=8192x64x480 --- 198us 2.54TF/s | |
icp3_out0 input 1*480*64*128=15.7MB filts 192*480*1*1=369KB out 1*192*64*128=6.29MB | |
icp3_out0 FWD 1.51GF 22.4MB FWD_AI 67.4F/B MxNxK=8192x192x480 --- 634us 2.38TF/s | |
cls1_reduction input 1*512*21*42=1.81MB filts 128*512*1*1=262KB out 1*128*21*42=452KB | |
cls1_reduction FWD 116MF 2.52MB FWD_AI 45.9F/B MxNxK=882x128x512 --- 134us 860GF/s | |
cls1_fc1-conv input 1*128*21*42=452KB filts 1024*128*4*4=8.39MB out 1*1024*18*39=2.88MB | |
cls1_fc1-conv FWD 2.94GF 11.7MB FWD_AI 251F/B MxNxK=702x1024x2048 --- 1.46ms 2.02TF/s | |
cls1_fc2-conv input 1*1024*18*39=2.88MB filts 1000*1024*1*1=4.10MB out 1*1000*18*39=2.81MB | |
cls1_fc2-conv FWD 1.44GF 9.78MB FWD_AI 147F/B MxNxK=702x1000x1024 --- 447us 3.21TF/s | |
icp4_reduction1 input 1*512*64*128=16.8MB filts 112*512*1*1=229KB out 1*112*64*128=3.67MB | |
icp4_reduction1 FWD 940MF 20.7MB FWD_AI 45.4F/B MxNxK=8192x112x512 --- 379us 2.48TF/s | |
icp4_out1 input 1*112*64*128=3.67MB filts 224*112*3*3=903KB out 1*224*64*128=7.34MB | |
icp4_out1 FWD 3.70GF 11.9MB FWD_AI 311F/B MxNxK=8192x224x1008 --- 1.51ms 2.45TF/s | |
icp4_reduction2 input 1*512*64*128=16.8MB filts 24*512*1*1=49.2KB out 1*24*64*128=786KB | |
icp4_reduction2 FWD 201MF 17.6MB FWD_AI 11.4F/B MxNxK=8192x24x512 --- 894us 225GF/s | |
icp4_out2 input 1*24*64*128=786KB filts 64*24*5*5=154KB out 1*64*64*128=2.10MB | |
icp4_out2 FWD 629MF 3.04MB FWD_AI 207F/B MxNxK=8192x64x600 --- 242us 2.60TF/s | |
icp4_out3 input 1*512*64*128=16.8MB filts 64*512*1*1=131KB out 1*64*64*128=2.10MB | |
icp4_out3 FWD 537MF 19.0MB FWD_AI 28.2F/B MxNxK=8192x64x512 --- 207us 2.59TF/s | |
icp4_out0 input 1*512*64*128=16.8MB filts 160*512*1*1=328KB out 1*160*64*128=5.24MB | |
icp4_out0 FWD 1.34GF 22.3MB FWD_AI 60.1F/B MxNxK=8192x160x512 --- 652us 2.06TF/s | |
icp5_reduction1 input 1*512*64*128=16.8MB filts 128*512*1*1=262KB out 1*128*64*128=4.19MB | |
icp5_reduction1 FWD 1.07GF 21.2MB FWD_AI 50.6F/B MxNxK=8192x128x512 --- 415us 2.59TF/s | |
icp5_out1 input 1*128*64*128=4.19MB filts 256*128*3*3=1.18MB out 1*256*64*128=8.39MB | |
icp5_out1 FWD 4.83GF 13.8MB FWD_AI 351F/B MxNxK=8192x256x1152 --- 1.79ms 2.70TF/s | |
icp5_reduction2 input 1*512*64*128=16.8MB filts 24*512*1*1=49.2KB out 1*24*64*128=786KB | |
icp5_reduction2 FWD 201MF 17.6MB FWD_AI 11.4F/B MxNxK=8192x24x512 --- 891us 226GF/s | |
icp5_out2 input 1*24*64*128=786KB filts 64*24*5*5=154KB out 1*64*64*128=2.10MB | |
icp5_out2 FWD 629MF 3.04MB FWD_AI 207F/B MxNxK=8192x64x600 --- 242us 2.60TF/s | |
icp5_out3 input 1*512*64*128=16.8MB filts 64*512*1*1=131KB out 1*64*64*128=2.10MB | |
icp5_out3 FWD 537MF 19.0MB FWD_AI 28.2F/B MxNxK=8192x64x512 --- 210us 2.56TF/s | |
icp5_out0 input 1*512*64*128=16.8MB filts 128*512*1*1=262KB out 1*128*64*128=4.19MB | |
icp5_out0 FWD 1.07GF 21.2MB FWD_AI 50.6F/B MxNxK=8192x128x512 --- 416us 2.58TF/s | |
icp6_reduction1 input 1*512*64*128=16.8MB filts 144*512*1*1=295KB out 1*144*64*128=4.72MB | |
icp6_reduction1 FWD 1.21GF 21.8MB FWD_AI 55.4F/B MxNxK=8192x144x512 --- 664us 1.82TF/s | |
icp6_out1 input 1*144*64*128=4.72MB filts 288*144*3*3=1.49MB out 1*288*64*128=9.44MB | |
icp6_out1 FWD 6.12GF 15.6MB FWD_AI 391F/B MxNxK=8192x288x1296 --- 2.66ms 2.30TF/s | |
icp6_reduction2 input 1*512*64*128=16.8MB filts 32*512*1*1=65.5KB out 1*32*64*128=1.05MB | |
icp6_reduction2 FWD 268MF 17.9MB FWD_AI 15.0F/B MxNxK=8192x32x512 --- 693us 387GF/s | |
icp6_out2 input 1*32*64*128=1.05MB filts 64*32*5*5=205KB out 1*64*64*128=2.10MB | |
icp6_out2 FWD 839MF 3.35MB FWD_AI 250F/B MxNxK=8192x64x800 --- 301us 2.78TF/s | |
icp6_out3 input 1*512*64*128=16.8MB filts 64*512*1*1=131KB out 1*64*64*128=2.10MB | |
icp6_out3 FWD 537MF 19.0MB FWD_AI 28.2F/B MxNxK=8192x64x512 --- 207us 2.60TF/s | |
icp6_out0 input 1*512*64*128=16.8MB filts 112*512*1*1=229KB out 1*112*64*128=3.67MB | |
icp6_out0 FWD 940MF 20.7MB FWD_AI 45.4F/B MxNxK=8192x112x512 --- 377us 2.49TF/s | |
cls2_reduction input 1*528*21*42=1.86MB filts 128*528*1*1=270KB out 1*128*21*42=452KB | |
cls2_reduction FWD 119MF 2.59MB FWD_AI 46.1F/B MxNxK=882x128x528 --- 137us 870GF/s | |
cls2_fc1-conv input 1*128*21*42=452KB filts 1024*128*4*4=8.39MB out 1*1024*18*39=2.88MB | |
cls2_fc1-conv FWD 2.94GF 11.7MB FWD_AI 251F/B MxNxK=702x1024x2048 --- 1.47ms 2.00TF/s | |
cls2_fc2-conv input 1*1024*18*39=2.88MB filts 1000*1024*1*1=4.10MB out 1*1000*18*39=2.81MB | |
cls2_fc2-conv FWD 1.44GF 9.78MB FWD_AI 147F/B MxNxK=702x1000x1024 --- 448us 3.21TF/s | |
icp7_reduction1 input 1*528*64*128=17.3MB filts 160*528*1*1=338KB out 1*160*64*128=5.24MB | |
icp7_reduction1 FWD 1.38GF 22.9MB FWD_AI 60.5F/B MxNxK=8192x160x528 --- 678us 2.04TF/s | |
icp7_out1 input 1*160*64*128=5.24MB filts 320*160*3*3=1.84MB out 1*320*64*128=10.5MB | |
icp7_out1 FWD 7.55GF 17.6MB FWD_AI 430F/B MxNxK=8192x320x1440 --- 2.95ms 2.56TF/s | |
icp7_reduction2 input 1*528*64*128=17.3MB filts 32*528*1*1=67.6KB out 1*32*64*128=1.05MB | |
icp7_reduction2 FWD 277MF 18.4MB FWD_AI 15.0F/B MxNxK=8192x32x528 --- 708us 391GF/s | |
icp7_out2 input 1*32*64*128=1.05MB filts 128*32*5*5=410KB out 1*128*64*128=4.19MB | |
icp7_out2 FWD 1.68GF 5.65MB FWD_AI 297F/B MxNxK=8192x128x800 --- 698us 2.40TF/s | |
icp7_out3 input 1*528*64*128=17.3MB filts 128*528*1*1=270KB out 1*128*64*128=4.19MB | |
icp7_out3 FWD 1.11GF 21.8MB FWD_AI 50.9F/B MxNxK=8192x128x528 --- 400us 2.77TF/s | |
icp7_out0 input 1*528*64*128=17.3MB filts 256*528*1*1=541KB out 1*256*64*128=8.39MB | |
icp7_out0 FWD 2.21GF 26.2MB FWD_AI 84.4F/B MxNxK=8192x256x528 --- 719us 3.08TF/s | |
icp8_reduction1 input 1*832*32*64=6.82MB filts 160*832*1*1=532KB out 1*160*32*64=1.31MB | |
icp8_reduction1 FWD 545MF 8.66MB FWD_AI 63.0F/B MxNxK=2048x160x832 --- 321us 1.70TF/s | |
icp8_out1 input 1*160*32*64=1.31MB filts 320*160*3*3=1.84MB out 1*320*32*64=2.62MB | |
icp8_out1 FWD 1.89GF 5.78MB FWD_AI 327F/B MxNxK=2048x320x1440 --- 755us 2.50TF/s | |
icp8_reduction2 input 1*832*32*64=6.82MB filts 32*832*1*1=106KB out 1*32*32*64=262KB | |
icp8_reduction2 FWD 109MF 7.18MB FWD_AI 15.2F/B MxNxK=2048x32x832 --- 1.06ms 103GF/s | |
icp8_out2 input 1*32*32*64=262KB filts 128*32*5*5=410KB out 1*128*32*64=1.05MB | |
icp8_out2 FWD 419MF 1.72MB FWD_AI 244F/B MxNxK=2048x128x800 --- 250us 1.67TF/s | |
icp8_out3 input 1*832*32*64=6.82MB filts 128*832*1*1=426KB out 1*128*32*64=1.05MB | |
icp8_out3 FWD 436MF 8.29MB FWD_AI 52.6F/B MxNxK=2048x128x832 --- 247us 1.77TF/s | |
icp8_out0 input 1*832*32*64=6.82MB filts 256*832*1*1=852KB out 1*256*32*64=2.10MB | |
icp8_out0 FWD 872MF 9.77MB FWD_AI 89.3F/B MxNxK=2048x256x832 --- 316us 2.76TF/s | |
icp9_reduction1 input 1*832*32*64=6.82MB filts 192*832*1*1=639KB out 1*192*32*64=1.57MB | |
icp9_reduction1 FWD 654MF 9.03MB FWD_AI 72.5F/B MxNxK=2048x192x832 --- 313us 2.09TF/s | |
icp9_out1 input 1*192*32*64=1.57MB filts 384*192*3*3=2.65MB out 1*384*32*64=3.15MB | |
icp9_out1 FWD 2.72GF 7.37MB FWD_AI 369F/B MxNxK=2048x384x1728 --- 880us 3.09TF/s | |
icp9_reduction2 input 1*832*32*64=6.82MB filts 48*832*1*1=160KB out 1*48*32*64=393KB | |
icp9_reduction2 FWD 164MF 7.37MB FWD_AI 22.2F/B MxNxK=2048x48x832 --- 1.32ms 123GF/s | |
icp9_out2 input 1*48*32*64=393KB filts 128*48*5*5=614KB out 1*128*32*64=1.05MB | |
icp9_out2 FWD 629MF 2.06MB FWD_AI 306F/B MxNxK=2048x128x1200 --- 358us 1.76TF/s | |
icp9_out3 input 1*832*32*64=6.82MB filts 128*832*1*1=426KB out 1*128*32*64=1.05MB | |
icp9_out3 FWD 436MF 8.29MB FWD_AI 52.6F/B MxNxK=2048x128x832 --- 246us 1.77TF/s | |
icp9_out0 input 1*832*32*64=6.82MB filts 384*832*1*1=1.28MB out 1*384*32*64=3.15MB | |
icp9_out0 FWD 1.31GF 11.2MB FWD_AI 116F/B MxNxK=2048x384x832 --- 408us 3.21TF/s | |
cls3_fc-conv input 1*1024*26*58=6.18MB filts 1000*1024*1*1=4.10MB out 1*1000*26*58=6.03MB | |
cls3_fc-conv FWD 3.09GF 16.3MB FWD_AI 189F/B MxNxK=1508x1000x1024 --- 882us 3.50TF/s | |
total _inxp time: 11.1ms | |
-- INPUT: RUNTIME=0.0888052s -- | |
-- INPUT: POWER=200W -- | |
-- "data" node dims: 1*3*1024*2048=25.2MB | |
- SUM-OVER-FWD-LAYERS-TOTALS: fwd_input_bytes=796MB fwd_filt_bytes=53.5MB fwd_output_bytes=558MB | |
--- FWD TOTALS --- | |
144GF 1.62TF/s | |
1.41GB 15.8GB/s AI=103F/B | |
17.8J 8.12GF/s/W | |
PROFILE: | |
time time% cum_time cum_time% func_name | |
13.0ms 14.7% 13.0ms 14.7% conv2 | |
6.12ms 6.9% 19.2ms 21.6% icp2_out1 | |
4.58ms 5.2% 23.7ms 26.7% conv1 | |
2.95ms 3.3% 26.7ms 30.1% icp7_out1 | |
2.66ms 3.0% 29.4ms 33.1% icp6_out1 | |
2.61ms 2.9% 32.0ms 36.0% icp1_out1 | |
2.52ms 2.8% 34.5ms 38.8% reduction2 | |
1.79ms 2.0% 36.3ms 40.8% icp5_out1 | |
1.76ms 2.0% 38.0ms 42.8% icp2_out2 | |
1.51ms 1.7% 39.5ms 44.5% icp4_out1 | |
1.48ms 1.7% 41.0ms 46.2% norm2 | |
1.47ms 1.7% 42.5ms 47.8% cls2_fc1-conv | |
1.46ms 1.6% 43.9ms 49.5% cls1_fc1-conv | |
1.32ms 1.5% 45.3ms 51.0% icp9_reduction2 | |
1.30ms 1.5% 46.6ms 52.4% icp3_out1 | |
1.06ms 1.2% 47.6ms 53.6% icp8_reduction2 | |
988us 1.1% 48.6ms 54.7% icp2_in__inxp | |
946us 1.1% 49.6ms 55.8% pool2__inxp | |
937us 1.1% 50.5ms 56.9% icp2_reduction2 | |
894us 1.0% 51.4ms 57.9% icp4_reduction2 | |
891us 1.0% 52.3ms 58.9% icp5_reduction2 | |
882us 1.0% 53.2ms 59.9% cls3_fc-conv | |
880us 1.0% 54.0ms 60.9% icp9_out1 | |
873us 1.0% 54.9ms 61.8% icp2_out0 | |
864us 1.0% 55.8ms 62.8% icp2_reduction1 | |
768us 0.9% 56.6ms 63.7% icp1_out3 | |
755us 0.8% 57.3ms 64.5% icp8_out1 | |
719us 0.8% 58.0ms 65.3% icp7_out0 | |
708us 0.8% 58.7ms 66.1% icp7_reduction2 | |
698us 0.8% 59.4ms 66.9% icp7_out2 | |
693us 0.8% 60.1ms 67.7% icp6_reduction2 | |
678us 0.8% 60.8ms 68.5% icp7_reduction1 | |
664us 0.7% 61.5ms 69.2% icp6_reduction1 | |
664us 0.7% 62.1ms 70.0% reduction2__inxp | |
657us 0.7% 62.8ms 70.7% icp3_reduction2 | |
652us 0.7% 63.4ms 71.4% icp4_out0 | |
634us 0.7% 64.1ms 72.2% icp3_out0 | |
608us 0.7% 64.7ms 72.8% pool1 | |
605us 0.7% 65.3ms 73.5% icp1_reduction1 | |
528us 0.6% 65.8ms 74.1% icp1_pool__inxp | |
523us 0.6% 66.3ms 74.7% icp3_out__inxp | |
522us 0.6% 66.9ms 75.3% icp5_out__inxp | |
519us 0.6% 67.4ms 75.9% norm1 | |
498us 0.6% 67.9ms 76.4% icp2_out | |
470us 0.5% 68.3ms 77.0% norm1__inxp | |
466us 0.5% 68.8ms 77.5% icp3_in__inxp | |
459us 0.5% 69.3ms 78.0% pool2 | |
448us 0.5% 69.7ms 78.5% cls2_fc2-conv | |
447us 0.5% 70.2ms 79.0% cls1_fc2-conv | |
417us 0.5% 70.6ms 79.5% data__inxp | |
416us 0.5% 71.0ms 80.0% icp5_out0 | |
415us 0.5% 71.4ms 80.4% icp5_reduction1 | |
408us 0.5% 71.8ms 80.9% icp9_out0 | |
406us 0.5% 72.2ms 81.3% icp2_reduction1__inxp | |
400us 0.5% 72.6ms 81.8% icp2_out3 | |
400us 0.5% 73.0ms 82.2% icp7_out3 | |
379us 0.4% 73.4ms 82.7% icp4_reduction1 | |
377us 0.4% 73.8ms 83.1% icp6_out0 | |
373us 0.4% 74.2ms 83.5% icp4_out__inxp | |
370us 0.4% 74.5ms 83.9% icp2_pool | |
369us 0.4% 74.9ms 84.3% icp6_out__inxp | |
358us 0.4% 75.3ms 84.7% icp9_out2 | |
351us 0.4% 75.6ms 85.1% icp3_reduction1 | |
344us 0.4% 76.0ms 85.5% icp1_out2 | |
339us 0.4% 76.3ms 85.9% icp1_out0 | |
330us 0.4% 76.6ms 86.3% icp1_reduction2 | |
321us 0.4% 76.9ms 86.6% icp8_reduction1 | |
316us 0.4% 77.3ms 87.0% icp8_out0 | |
313us 0.4% 77.6ms 87.3% icp9_reduction1 | |
301us 0.3% 77.9ms 87.7% icp6_out2 | |
288us 0.3% 78.2ms 88.0% icp3_in | |
281us 0.3% 78.4ms 88.3% icp1_pool | |
278us 0.3% 78.7ms 88.6% icp1_reduction1__inxp | |
276us 0.3% 79.0ms 89.0% icp2_in | |
267us 0.3% 79.3ms 89.3% icp2_pool__inxp | |
252us 0.3% 79.5ms 89.5% cls2_fc1-conv_filts__inxp | |
250us 0.3% 79.8ms 89.8% icp8_out2 | |
247us 0.3% 80.0ms 90.1% icp8_out3 | |
246us 0.3% 80.3ms 90.4% icp9_out3 | |
245us 0.3% 80.5ms 90.7% cls1_fc1-conv_filts__inxp | |
242us 0.3% 80.7ms 90.9% cls3_pool | |
242us 0.3% 81.0ms 91.2% icp4_out2 | |
242us 0.3% 81.2ms 91.5% icp5_out2 | |
228us 0.3% 81.5ms 91.7% cls2_pool | |
226us 0.3% 81.7ms 92.0% icp7_out | |
216us 0.2% 81.9ms 92.2% cls1_pool | |
210us 0.2% 82.1ms 92.5% icp5_out3 | |
207us 0.2% 82.3ms 92.7% icp4_out3 | |
207us 0.2% 82.5ms 92.9% icp6_out3 | |
198us 0.2% 82.7ms 93.1% icp3_out3 | |
192us 0.2% 82.9ms 93.4% icp7_pool | |
188us 0.2% 83.1ms 93.6% icp4_pool | |
187us 0.2% 83.3ms 93.8% icp6_pool | |
186us 0.2% 83.5ms 94.0% icp5_pool | |
173us 0.2% 83.6ms 94.2% icp3_pool | |
164us 0.2% 83.8ms 94.4% icp3_out2 | |
147us 0.2% 84.0ms 94.5% icp6_out | |
147us 0.2% 84.1ms 94.7% icp7_pool__inxp | |
145us 0.2% 84.2ms 94.9% icp4_out | |
144us 0.2% 84.4ms 95.0% icp3_out | |
143us 0.2% 84.5ms 95.2% icp5_out | |
140us 0.2% 84.7ms 95.3% icp5_pool__inxp | |
137us 0.2% 84.8ms 95.5% icp4_pool__inxp | |
137us 0.2% 84.9ms 95.7% cls2_reduction | |
136us 0.2% 85.1ms 95.8% icp8_out__inxp | |
136us 0.2% 85.2ms 96.0% icp6_pool__inxp | |
134us 0.2% 85.4ms 96.1% cls1_reduction | |
129us 0.1% 85.5ms 96.3% icp8_in | |
129us 0.1% 85.6ms 96.4% icp3_pool__inxp | |
123us 0.1% 85.7ms 96.5% cls1_fc2-conv_filts__inxp | |
123us 0.1% 85.9ms 96.7% cls2_fc2-conv_filts__inxp | |
122us 0.1% 86.0ms 96.8% cls3_fc-conv_filts__inxp | |
115us 0.1% 86.1ms 97.0% icp8_in__inxp | |
98.7us 0.1% 86.2ms 97.1% icp7_reduction1__inxp | |
91.1us 0.1% 86.3ms 97.2% icp2_reduction2__inxp | |
87.3us 0.1% 86.4ms 97.3% icp6_reduction1__inxp | |
83.0us 0.1% 86.5ms 97.4% icp9_out | |
82.9us 0.1% 86.5ms 97.4% icp9_out1_filts__inxp | |
79.9us 0.1% 86.6ms 97.5% icp9_pool | |
79.0us 0.1% 86.7ms 97.6% icp8_pool | |
73.2us 0.1% 86.8ms 97.7% icp5_reduction1__inxp | |
65.8us 0.1% 86.8ms 97.8% icp8_out | |
65.4us 0.1% 86.9ms 97.9% icp4_reduction1__inxp | |
59.7us 0.1% 87.0ms 97.9% icp8_out1_filts__inxp | |
59.7us 0.1% 87.0ms 98.0% icp7_out1_filts__inxp | |
57.3us 0.1% 87.1ms 98.1% icp8_pool__inxp | |
57.1us 0.1% 87.1ms 98.1% icp3_reduction1__inxp | |
56.4us 0.1% 87.2ms 98.2% icp9_pool__inxp | |
56.3us 0.1% 87.3ms 98.2% cls3_pool__inxp | |
47.9us 0.1% 87.3ms 98.3% icp6_out1_filts__inxp | |
43.5us 0.0% 87.3ms 98.4% icp1_reduction2__inxp | |
43.3us 0.0% 87.4ms 98.4% icp9_out0_filts__inxp | |
40.6us 0.0% 87.4ms 98.4% icp5_out1_filts__inxp | |
32.2us 0.0% 87.5ms 98.5% icp2_out1_filts__inxp | |
32.1us 0.0% 87.5ms 98.5% icp4_out1_filts__inxp | |
30.6us 0.0% 87.5ms 98.6% icp8_out0_filts__inxp | |
29.7us 0.0% 87.6ms 98.6% icp9_reduction1__inxp | |
29.0us 0.0% 87.6ms 98.6% cls2_fc1__inxp | |
28.5us 0.0% 87.6ms 98.7% cls1_fc1__inxp | |
27.8us 0.0% 87.6ms 98.7% icp3_out1_filts__inxp | |
25.5us 0.0% 87.7ms 98.7% icp8_reduction1__inxp | |
24.6us 0.0% 87.7ms 98.7% icp9_reduction1_filts__inxp | |
24.1us 0.0% 87.7ms 98.8% icp7_reduction2__inxp | |
23.9us 0.0% 87.7ms 98.8% icp9_out2_filts__inxp | |
23.0us 0.0% 87.8ms 98.8% icp8_reduction1_filts__inxp | |
22.7us 0.0% 87.8ms 98.8% cls2_pool__inxp | |
21.6us 0.0% 87.8ms 98.9% icp7_out0_filts__inxp | |
21.5us 0.0% 87.8ms 98.9% conv1_filts__inxp | |
20.6us 0.0% 87.8ms 98.9% icp4_reduction2__inxp | |
20.5us 0.0% 87.9ms 98.9% icp6_reduction2__inxp | |
20.3us 0.0% 87.9ms 99.0% icp5_reduction2__inxp | |
19.5us 0.0% 87.9ms 99.0% icp1_out1_filts__inxp | |
19.3us 0.0% 87.9ms 99.0% conv2_filts__inxp | |
19.3us 0.0% 87.9ms 99.0% cls1_pool__inxp | |
18.8us 0.0% 88.0ms 99.0% icp9_out3_filts__inxp | |
18.8us 0.0% 88.0ms 99.1% icp8_out2_filts__inxp | |
18.8us 0.0% 88.0ms 99.1% icp7_out2_filts__inxp | |
18.3us 0.0% 88.0ms 99.1% icp8_out3_filts__inxp | |
17.5us 0.0% 88.0ms 99.1% cls2_reduction_filts__inxp | |
17.3us 0.0% 88.1ms 99.2% cls1_reduction__inxp | |
16.9us 0.0% 88.1ms 99.2% cls2_reduction__inxp | |
16.6us 0.0% 88.1ms 99.2% icp3_out0_filts__inxp | |
15.8us 0.0% 88.1ms 99.2% icp7_reduction1_filts__inxp | |
15.8us 0.0% 88.1ms 99.2% icp3_reduction2__inxp | |
15.0us 0.0% 88.1ms 99.2% icp4_out0_filts__inxp | |
14.7us 0.0% 88.1ms 99.3% icp6_reduction1_filts__inxp | |
14.6us 0.0% 88.2ms 99.3% icp9_reduction2__inxp | |
14.4us 0.0% 88.2ms 99.3% icp2_out2_filts__inxp | |
13.8us 0.0% 88.2ms 99.3% icp7_out3_filts__inxp | |
13.7us 0.0% 88.2ms 99.3% cls1_reduction_filts__inxp | |
13.7us 0.0% 88.2ms 99.3% icp5_out0_filts__inxp | |
13.6us 0.0% 88.2ms 99.4% icp5_reduction1_filts__inxp | |
13.1us 0.0% 88.2ms 99.4% icp6_out0_filts__inxp | |
13.0us 0.0% 88.3ms 99.4% icp4_reduction1_filts__inxp | |
12.4us 0.0% 88.3ms 99.4% icp6_out2_filts__inxp | |
10.9us 0.0% 88.3ms 99.4% icp5_out2_filts__inxp | |
10.8us 0.0% 88.3ms 99.4% icp3_reduction1_filts__inxp | |
10.7us 0.0% 88.3ms 99.4% icp4_out2_filts__inxp | |
10.3us 0.0% 88.3ms 99.4% icp9_reduction2_filts__inxp | |
10.1us 0.0% 88.3ms 99.5% icp8_reduction2__inxp | |
10.1us 0.0% 88.3ms 99.5% icp2_reduction1_filts__inxp | |
10.1us 0.0% 88.3ms 99.5% icp2_out0_filts__inxp | |
10.1us 0.0% 88.4ms 99.5% icp5_out3_filts__inxp | |
9.98us 0.0% 88.4ms 99.5% icp4_out3_filts__inxp | |
9.66us 0.0% 88.4ms 99.5% icp8_reduction2_filts__inxp | |
9.44us 0.0% 88.4ms 99.5% icp6_out3_filts__inxp | |
9.44us 0.0% 88.4ms 99.5% icp3_out3_filts__inxp | |
8.70us 0.0% 88.4ms 99.5% icp3_out2_filts__inxp | |
8.64us 0.0% 88.4ms 99.6% icp1_reduction1_filts__inxp | |
8.22us 0.0% 88.4ms 99.6% icp7_reduction2_filts__inxp | |
7.97us 0.0% 88.4ms 99.6% icp6_reduction2_filts__inxp | |
7.90us 0.0% 88.4ms 99.6% icp2_out3_filts__inxp | |
7.90us 0.0% 88.4ms 99.6% icp1_out2_filts__inxp | |
7.36us 0.0% 88.4ms 99.6% reduction2_filts__inxp | |
7.30us 0.0% 88.5ms 99.6% icp2_reduction2_filts__inxp | |
7.23us 0.0% 88.5ms 99.6% icp5_reduction2_filts__inxp | |
7.23us 0.0% 88.5ms 99.6% icp4_reduction2_filts__inxp | |
7.23us 0.0% 88.5ms 99.6% icp3_reduction2_filts__inxp | |
7.17us 0.0% 88.5ms 99.6% icp1_out0_filts__inxp | |
6.59us 0.0% 88.5ms 99.6% icp1_out3_filts__inxp | |
6.50us 0.0% 88.5ms 99.7% icp1_reduction2_filts__inxp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
conv1 input 1*3*1024*2048=25.2MB filts 64*3*7*7=37.6KB out 1*64*512*1024=134MB | |
conv1 FWD 9.87GF 159MB FWD_AI 61.9F/B MxNxK=524288x64x147 --- 4.74ms 2.08TF/s | |
reduction2 input 1*64*256*512=33.6MB filts 64*64*1*1=16.4KB out 1*64*256*512=33.6MB | |
reduction2 FWD 1.07GF 67.1MB FWD_AI 16.0F/B MxNxK=131072x64x64 --- 907us 1.18TF/s | |
conv2 input 1*64*256*512=33.6MB filts 192*64*3*3=442KB out 1*192*256*512=101MB | |
conv2 FWD 29.0GF 135MB FWD_AI 215F/B MxNxK=131072x192x576 --- 4.81ms 6.03TF/s | |
icp1_reduction1 input 1*192*128*256=25.2MB filts 96*192*1*1=73.7KB out 1*96*128*256=12.6MB | |
icp1_reduction1 FWD 1.21GF 37.8MB FWD_AI 31.9F/B MxNxK=32768x96x192 --- 610us 1.98TF/s | |
icp1_out1 input 1*96*128*256=12.6MB filts 128*96*3*3=442KB out 1*128*128*256=16.8MB | |
icp1_out1 FWD 7.25GF 29.8MB FWD_AI 243F/B MxNxK=32768x128x864 --- 1.04ms 6.97TF/s | |
icp1_reduction2 input 1*192*128*256=25.2MB filts 16*192*1*1=12.3KB out 1*16*128*256=2.10MB | |
icp1_reduction2 FWD 201MF 27.3MB FWD_AI 7.38F/B MxNxK=32768x16x192 --- 178us 1.13TF/s | |
icp1_out2 input 1*16*128*256=2.10MB filts 32*16*5*5=51.2KB out 1*32*128*256=4.19MB | |
icp1_out2 FWD 839MF 6.34MB FWD_AI 132F/B MxNxK=32768x32x400 --- 301us 2.79TF/s | |
icp1_out3 input 1*192*128*256=25.2MB filts 32*192*1*1=24.6KB out 1*32*128*256=4.19MB | |
icp1_out3 FWD 403MF 29.4MB FWD_AI 13.7F/B MxNxK=32768x32x192 --- 228us 1.77TF/s | |
icp1_out0 input 1*192*128*256=25.2MB filts 64*192*1*1=49.2KB out 1*64*128*256=8.39MB | |
icp1_out0 FWD 805MF 33.6MB FWD_AI 24.0F/B MxNxK=32768x64x192 --- 353us 2.28TF/s | |
icp2_reduction1 input 1*256*128*256=33.6MB filts 128*256*1*1=131KB out 1*128*128*256=16.8MB | |
icp2_reduction1 FWD 2.15GF 50.5MB FWD_AI 42.6F/B MxNxK=32768x128x256 --- 731us 2.94TF/s | |
icp2_out1 input 1*128*128*256=16.8MB filts 192*128*3*3=885KB out 1*192*128*256=25.2MB | |
icp2_out1 FWD 14.5GF 42.8MB FWD_AI 338F/B MxNxK=32768x192x1152 --- 1.85ms 7.85TF/s | |
icp2_reduction2 input 1*256*128*256=33.6MB filts 32*256*1*1=32.8KB out 1*32*128*256=4.19MB | |
icp2_reduction2 FWD 537MF 37.8MB FWD_AI 14.2F/B MxNxK=32768x32x256 --- 269us 2.00TF/s | |
icp2_out2 input 1*32*128*256=4.19MB filts 96*32*5*5=307KB out 1*96*128*256=12.6MB | |
icp2_out2 FWD 5.03GF 17.1MB FWD_AI 295F/B MxNxK=32768x96x800 --- 1.37ms 3.68TF/s | |
icp2_out3 input 1*256*128*256=33.6MB filts 64*256*1*1=65.5KB out 1*64*128*256=8.39MB | |
icp2_out3 FWD 1.07GF 42.0MB FWD_AI 25.6F/B MxNxK=32768x64x256 --- 406us 2.64TF/s | |
icp2_out0 input 1*256*128*256=33.6MB filts 128*256*1*1=131KB out 1*128*128*256=16.8MB | |
icp2_out0 FWD 2.15GF 50.5MB FWD_AI 42.6F/B MxNxK=32768x128x256 --- 729us 2.95TF/s | |
icp3_reduction1 input 1*480*64*128=15.7MB filts 96*480*1*1=184KB out 1*96*64*128=3.15MB | |
icp3_reduction1 FWD 755MF 19.1MB FWD_AI 39.6F/B MxNxK=8192x96x480 --- 267us 2.83TF/s | |
icp3_out1 input 1*96*64*128=3.15MB filts 208*96*3*3=719KB out 1*208*64*128=6.82MB | |
icp3_out1 FWD 2.94GF 10.7MB FWD_AI 276F/B MxNxK=8192x208x864 --- 475us 6.20TF/s | |
icp3_reduction2 input 1*480*64*128=15.7MB filts 16*480*1*1=30.7KB out 1*16*64*128=524KB | |
icp3_reduction2 FWD 126MF 16.3MB FWD_AI 7.73F/B MxNxK=8192x16x480 --- 127us 989GF/s | |
icp3_out2 input 1*16*64*128=524KB filts 48*16*5*5=76.8KB out 1*48*64*128=1.57MB | |
icp3_out2 FWD 315MF 2.17MB FWD_AI 145F/B MxNxK=8192x48x400 --- 136us 2.31TF/s | |
icp3_out3 input 1*480*64*128=15.7MB filts 64*480*1*1=123KB out 1*64*64*128=2.10MB | |
icp3_out3 FWD 503MF 17.9MB FWD_AI 28.0F/B MxNxK=8192x64x480 --- 166us 3.04TF/s | |
icp3_out0 input 1*480*64*128=15.7MB filts 192*480*1*1=369KB out 1*192*64*128=6.29MB | |
icp3_out0 FWD 1.51GF 22.4MB FWD_AI 67.4F/B MxNxK=8192x192x480 --- 442us 3.41TF/s | |
cls1_reduction input 1*512*21*42=1.81MB filts 128*512*1*1=262KB out 1*128*21*42=452KB | |
cls1_reduction FWD 116MF 2.52MB FWD_AI 45.9F/B MxNxK=882x128x512 --- 112us 1.04TF/s | |
cls1_fc1-conv input 1*128*21*42=452KB filts 1024*128*4*4=8.39MB out 1*1024*18*39=2.88MB | |
cls1_fc1-conv FWD 2.94GF 11.7MB FWD_AI 251F/B MxNxK=702x1024x2048 --- 588us 5.01TF/s | |
cls1_fc2-conv input 1*1024*18*39=2.88MB filts 1000*1024*1*1=4.10MB out 1*1000*18*39=2.81MB | |
cls1_fc2-conv FWD 1.44GF 9.78MB FWD_AI 147F/B MxNxK=702x1000x1024 --- 311us 4.62TF/s | |
icp4_reduction1 input 1*512*64*128=16.8MB filts 112*512*1*1=229KB out 1*112*64*128=3.67MB | |
icp4_reduction1 FWD 940MF 20.7MB FWD_AI 45.4F/B MxNxK=8192x112x512 --- 303us 3.10TF/s | |
icp4_out1 input 1*112*64*128=3.67MB filts 224*112*3*3=903KB out 1*224*64*128=7.34MB | |
icp4_out1 FWD 3.70GF 11.9MB FWD_AI 311F/B MxNxK=8192x224x1008 --- 535us 6.91TF/s | |
icp4_reduction2 input 1*512*64*128=16.8MB filts 24*512*1*1=49.2KB out 1*24*64*128=786KB | |
icp4_reduction2 FWD 201MF 17.6MB FWD_AI 11.4F/B MxNxK=8192x24x512 --- 136us 1.48TF/s | |
icp4_out2 input 1*24*64*128=786KB filts 64*24*5*5=154KB out 1*64*64*128=2.10MB | |
icp4_out2 FWD 629MF 3.04MB FWD_AI 207F/B MxNxK=8192x64x600 --- 181us 3.48TF/s | |
icp4_out3 input 1*512*64*128=16.8MB filts 64*512*1*1=131KB out 1*64*64*128=2.10MB | |
icp4_out3 FWD 537MF 19.0MB FWD_AI 28.2F/B MxNxK=8192x64x512 --- 169us 3.17TF/s | |
icp4_out0 input 1*512*64*128=16.8MB filts 160*512*1*1=328KB out 1*160*64*128=5.24MB | |
icp4_out0 FWD 1.34GF 22.3MB FWD_AI 60.1F/B MxNxK=8192x160x512 --- 430us 3.12TF/s | |
icp5_reduction1 input 1*512*64*128=16.8MB filts 128*512*1*1=262KB out 1*128*64*128=4.19MB | |
icp5_reduction1 FWD 1.07GF 21.2MB FWD_AI 50.6F/B MxNxK=8192x128x512 --- 372us 2.89TF/s | |
icp5_out1 input 1*128*64*128=4.19MB filts 256*128*3*3=1.18MB out 1*256*64*128=8.39MB | |
icp5_out1 FWD 4.83GF 13.8MB FWD_AI 351F/B MxNxK=8192x256x1152 --- 660us 7.32TF/s | |
icp5_reduction2 input 1*512*64*128=16.8MB filts 24*512*1*1=49.2KB out 1*24*64*128=786KB | |
icp5_reduction2 FWD 201MF 17.6MB FWD_AI 11.4F/B MxNxK=8192x24x512 --- 135us 1.49TF/s | |
icp5_out2 input 1*24*64*128=786KB filts 64*24*5*5=154KB out 1*64*64*128=2.10MB | |
icp5_out2 FWD 629MF 3.04MB FWD_AI 207F/B MxNxK=8192x64x600 --- 183us 3.44TF/s | |
icp5_out3 input 1*512*64*128=16.8MB filts 64*512*1*1=131KB out 1*64*64*128=2.10MB | |
icp5_out3 FWD 537MF 19.0MB FWD_AI 28.2F/B MxNxK=8192x64x512 --- 171us 3.15TF/s | |
icp5_out0 input 1*512*64*128=16.8MB filts 128*512*1*1=262KB out 1*128*64*128=4.19MB | |
icp5_out0 FWD 1.07GF 21.2MB FWD_AI 50.6F/B MxNxK=8192x128x512 --- 370us 2.90TF/s | |
icp6_reduction1 input 1*512*64*128=16.8MB filts 144*512*1*1=295KB out 1*144*64*128=4.72MB | |
icp6_reduction1 FWD 1.21GF 21.8MB FWD_AI 55.4F/B MxNxK=8192x144x512 --- 419us 2.88TF/s | |
icp6_out1 input 1*144*64*128=4.72MB filts 288*144*3*3=1.49MB out 1*288*64*128=9.44MB | |
icp6_out1 FWD 6.12GF 15.6MB FWD_AI 391F/B MxNxK=8192x288x1296 --- 805us 7.60TF/s | |
icp6_reduction2 input 1*512*64*128=16.8MB filts 32*512*1*1=65.5KB out 1*32*64*128=1.05MB | |
icp6_reduction2 FWD 268MF 17.9MB FWD_AI 15.0F/B MxNxK=8192x32x512 --- 136us 1.97TF/s | |
icp6_out2 input 1*32*64*128=1.05MB filts 64*32*5*5=205KB out 1*64*64*128=2.10MB | |
icp6_out2 FWD 839MF 3.35MB FWD_AI 250F/B MxNxK=8192x64x800 --- 224us 3.75TF/s | |
icp6_out3 input 1*512*64*128=16.8MB filts 64*512*1*1=131KB out 1*64*64*128=2.10MB | |
icp6_out3 FWD 537MF 19.0MB FWD_AI 28.2F/B MxNxK=8192x64x512 --- 174us 3.09TF/s | |
icp6_out0 input 1*512*64*128=16.8MB filts 112*512*1*1=229KB out 1*112*64*128=3.67MB | |
icp6_out0 FWD 940MF 20.7MB FWD_AI 45.4F/B MxNxK=8192x112x512 --- 352us 2.67TF/s | |
cls2_reduction input 1*528*21*42=1.86MB filts 128*528*1*1=270KB out 1*128*21*42=452KB | |
cls2_reduction FWD 119MF 2.59MB FWD_AI 46.1F/B MxNxK=882x128x528 --- 114us 1.04TF/s | |
cls2_fc1-conv input 1*128*21*42=452KB filts 1024*128*4*4=8.39MB out 1*1024*18*39=2.88MB | |
cls2_fc1-conv FWD 2.94GF 11.7MB FWD_AI 251F/B MxNxK=702x1024x2048 --- 584us 5.05TF/s | |
cls2_fc2-conv input 1*1024*18*39=2.88MB filts 1000*1024*1*1=4.10MB out 1*1000*18*39=2.81MB | |
cls2_fc2-conv FWD 1.44GF 9.78MB FWD_AI 147F/B MxNxK=702x1000x1024 --- 312us 4.61TF/s | |
icp7_reduction1 input 1*528*64*128=17.3MB filts 160*528*1*1=338KB out 1*160*64*128=5.24MB | |
icp7_reduction1 FWD 1.38GF 22.9MB FWD_AI 60.5F/B MxNxK=8192x160x528 --- 443us 3.12TF/s | |
icp7_out1 input 1*160*64*128=5.24MB filts 320*160*3*3=1.84MB out 1*320*64*128=10.5MB | |
icp7_out1 FWD 7.55GF 17.6MB FWD_AI 430F/B MxNxK=8192x320x1440 --- 1.06ms 7.14TF/s | |
icp7_reduction2 input 1*528*64*128=17.3MB filts 32*528*1*1=67.6KB out 1*32*64*128=1.05MB | |
icp7_reduction2 FWD 277MF 18.4MB FWD_AI 15.0F/B MxNxK=8192x32x528 --- 144us 1.93TF/s | |
icp7_out2 input 1*32*64*128=1.05MB filts 128*32*5*5=410KB out 1*128*64*128=4.19MB | |
icp7_out2 FWD 1.68GF 5.65MB FWD_AI 297F/B MxNxK=8192x128x800 --- 427us 3.93TF/s | |
icp7_out3 input 1*528*64*128=17.3MB filts 128*528*1*1=270KB out 1*128*64*128=4.19MB | |
icp7_out3 FWD 1.11GF 21.8MB FWD_AI 50.9F/B MxNxK=8192x128x528 --- 322us 3.44TF/s | |
icp7_out0 input 1*528*64*128=17.3MB filts 256*528*1*1=541KB out 1*256*64*128=8.39MB | |
icp7_out0 FWD 2.21GF 26.2MB FWD_AI 84.4F/B MxNxK=8192x256x528 --- 608us 3.64TF/s | |
icp8_reduction1 input 1*832*32*64=6.82MB filts 160*832*1*1=532KB out 1*160*32*64=1.31MB | |
icp8_reduction1 FWD 545MF 8.66MB FWD_AI 63.0F/B MxNxK=2048x160x832 --- 206us 2.65TF/s | |
icp8_out1 input 1*160*32*64=1.31MB filts 320*160*3*3=1.84MB out 1*320*32*64=2.62MB | |
icp8_out1 FWD 1.89GF 5.78MB FWD_AI 327F/B MxNxK=2048x320x1440 --- 260us 7.27TF/s | |
icp8_reduction2 input 1*832*32*64=6.82MB filts 32*832*1*1=106KB out 1*32*32*64=262KB | |
icp8_reduction2 FWD 109MF 7.18MB FWD_AI 15.2F/B MxNxK=2048x32x832 --- 165us 660GF/s | |
icp8_out2 input 1*32*32*64=262KB filts 128*32*5*5=410KB out 1*128*32*64=1.05MB | |
icp8_out2 FWD 419MF 1.72MB FWD_AI 244F/B MxNxK=2048x128x800 --- 170us 2.47TF/s | |
icp8_out3 input 1*832*32*64=6.82MB filts 128*832*1*1=426KB out 1*128*32*64=1.05MB | |
icp8_out3 FWD 436MF 8.29MB FWD_AI 52.6F/B MxNxK=2048x128x832 --- 170us 2.56TF/s | |
icp8_out0 input 1*832*32*64=6.82MB filts 256*832*1*1=852KB out 1*256*32*64=2.10MB | |
icp8_out0 FWD 872MF 9.77MB FWD_AI 89.3F/B MxNxK=2048x256x832 --- 267us 3.27TF/s | |
icp9_reduction1 input 1*832*32*64=6.82MB filts 192*832*1*1=639KB out 1*192*32*64=1.57MB | |
icp9_reduction1 FWD 654MF 9.03MB FWD_AI 72.5F/B MxNxK=2048x192x832 --- 235us 2.78TF/s | |
icp9_out1 input 1*192*32*64=1.57MB filts 384*192*3*3=2.65MB out 1*384*32*64=3.15MB | |
icp9_out1 FWD 2.72GF 7.37MB FWD_AI 369F/B MxNxK=2048x384x1728 --- 472us 5.76TF/s | |
icp9_reduction2 input 1*832*32*64=6.82MB filts 48*832*1*1=160KB out 1*48*32*64=393KB | |
icp9_reduction2 FWD 164MF 7.37MB FWD_AI 22.2F/B MxNxK=2048x48x832 --- 180us 910GF/s | |
icp9_out2 input 1*48*32*64=393KB filts 128*48*5*5=614KB out 1*128*32*64=1.05MB | |
icp9_out2 FWD 629MF 2.06MB FWD_AI 306F/B MxNxK=2048x128x1200 --- 215us 2.92TF/s | |
icp9_out3 input 1*832*32*64=6.82MB filts 128*832*1*1=426KB out 1*128*32*64=1.05MB | |
icp9_out3 FWD 436MF 8.29MB FWD_AI 52.6F/B MxNxK=2048x128x832 --- 173us 2.52TF/s | |
icp9_out0 input 1*832*32*64=6.82MB filts 384*832*1*1=1.28MB out 1*384*32*64=3.15MB | |
icp9_out0 FWD 1.31GF 11.2MB FWD_AI 116F/B MxNxK=2048x384x832 --- 285us 4.60TF/s | |
cls3_fc-conv input 1*1024*26*58=6.18MB filts 1000*1024*1*1=4.10MB out 1*1000*26*58=6.03MB | |
cls3_fc-conv FWD 3.09GF 16.3MB FWD_AI 189F/B MxNxK=1508x1000x1024 --- 643us 4.81TF/s | |
total _inxp time: 0s | |
-- INPUT: RUNTIME=0.0417357s -- | |
-- INPUT: POWER=200W -- | |
-- "data" node dims: 1*3*1024*2048=25.2MB | |
- SUM-OVER-FWD-LAYERS-TOTALS: fwd_input_bytes=796MB fwd_filt_bytes=53.5MB fwd_output_bytes=558MB | |
--- FWD TOTALS --- | |
144GF 3.46TF/s | |
1.41GB 33.7GB/s AI=103F/B | |
8.35J 17.3GF/s/W | |
PROFILE: | |
time time% cum_time cum_time% func_name | |
4.81ms 11.5% 4.81ms 11.5% conv2 | |
4.74ms 11.4% 9.55ms 22.9% conv1 | |
1.85ms 4.4% 11.4ms 27.3% icp2_out1 | |
1.37ms 3.3% 12.8ms 30.6% norm2 | |
1.37ms 3.3% 14.1ms 33.9% icp2_out2 | |
1.06ms 2.5% 15.2ms 36.4% icp7_out1 | |
1.04ms 2.5% 16.2ms 38.9% icp1_out1 | |
907us 2.2% 17.1ms 41.1% reduction2 | |
805us 1.9% 17.9ms 43.0% icp6_out1 | |
731us 1.8% 18.7ms 44.8% icp2_reduction1 | |
729us 1.7% 19.4ms 46.5% icp2_out0 | |
660us 1.6% 20.1ms 48.1% icp5_out1 | |
643us 1.5% 20.7ms 49.6% cls3_fc-conv | |
610us 1.5% 21.3ms 51.1% icp1_reduction1 | |
608us 1.5% 21.9ms 52.5% pool1 | |
608us 1.5% 22.5ms 54.0% icp7_out0 | |
588us 1.4% 23.1ms 55.4% cls1_fc1-conv | |
584us 1.4% 23.7ms 56.8% cls2_fc1-conv | |
535us 1.3% 24.2ms 58.1% icp4_out1 | |
497us 1.2% 24.7ms 59.3% icp2_out | |
475us 1.1% 25.2ms 60.4% icp3_out1 | |
472us 1.1% 25.7ms 61.5% icp9_out1 | |
465us 1.1% 26.2ms 62.7% norm1 | |
459us 1.1% 26.6ms 63.8% pool2 | |
443us 1.1% 27.1ms 64.8% icp7_reduction1 | |
442us 1.1% 27.5ms 65.9% icp3_out0 | |
430us 1.0% 27.9ms 66.9% icp4_out0 | |
427us 1.0% 28.4ms 67.9% icp7_out2 | |
419us 1.0% 28.8ms 68.9% icp6_reduction1 | |
406us 1.0% 29.2ms 69.9% icp2_out3 | |
372us 0.9% 29.6ms 70.8% icp5_reduction1 | |
370us 0.9% 29.9ms 71.7% icp5_out0 | |
353us 0.8% 30.3ms 72.5% icp1_out0 | |
352us 0.8% 30.6ms 73.4% icp6_out0 | |
341us 0.8% 31.0ms 74.2% icp2_pool | |
322us 0.8% 31.3ms 75.0% icp7_out3 | |
312us 0.7% 31.6ms 75.7% cls2_fc2-conv | |
311us 0.7% 31.9ms 76.5% cls1_fc2-conv | |
303us 0.7% 32.2ms 77.2% icp4_reduction1 | |
301us 0.7% 32.5ms 77.9% icp1_out2 | |
288us 0.7% 32.8ms 78.6% icp3_in | |
285us 0.7% 33.1ms 79.3% icp9_out0 | |
273us 0.7% 33.4ms 79.9% icp2_in | |
269us 0.6% 33.6ms 80.6% icp2_reduction2 | |
268us 0.6% 33.9ms 81.2% icp1_pool | |
267us 0.6% 34.2ms 81.9% icp8_out0 | |
267us 0.6% 34.4ms 82.5% icp3_reduction1 | |
260us 0.6% 34.7ms 83.1% icp8_out1 | |
235us 0.6% 34.9ms 83.7% icp9_reduction1 | |
228us 0.5% 35.2ms 84.2% icp1_out3 | |
225us 0.5% 35.4ms 84.8% icp7_out | |
224us 0.5% 35.6ms 85.3% icp6_out2 | |
215us 0.5% 35.8ms 85.8% icp9_out2 | |
214us 0.5% 36.0ms 86.3% cls3_pool | |
206us 0.5% 36.2ms 86.8% icp8_reduction1 | |
194us 0.5% 36.4ms 87.3% cls2_pool | |
190us 0.5% 36.6ms 87.7% cls1_pool | |
183us 0.4% 36.8ms 88.2% icp5_out2 | |
181us 0.4% 37.0ms 88.6% icp7_pool | |
181us 0.4% 37.2ms 89.1% icp4_out2 | |
180us 0.4% 37.3ms 89.5% icp9_reduction2 | |
178us 0.4% 37.5ms 89.9% icp1_reduction2 | |
177us 0.4% 37.7ms 90.3% icp6_pool | |
174us 0.4% 37.9ms 90.8% icp6_out3 | |
173us 0.4% 38.0ms 91.2% icp9_out3 | |
172us 0.4% 38.2ms 91.6% icp5_pool | |
171us 0.4% 38.4ms 92.0% icp4_pool | |
171us 0.4% 38.6ms 92.4% icp5_out3 | |
170us 0.4% 38.7ms 92.8% icp8_out3 | |
170us 0.4% 38.9ms 93.2% icp8_out2 | |
169us 0.4% 39.1ms 93.6% icp4_out3 | |
166us 0.4% 39.2ms 94.0% icp3_out3 | |
165us 0.4% 39.4ms 94.4% icp8_reduction2 | |
162us 0.4% 39.6ms 94.8% icp3_pool | |
145us 0.3% 39.7ms 95.1% icp6_out | |
144us 0.3% 39.9ms 95.5% icp7_reduction2 | |
142us 0.3% 40.0ms 95.8% icp3_out | |
141us 0.3% 40.1ms 96.2% icp4_out | |
141us 0.3% 40.3ms 96.5% icp5_out | |
136us 0.3% 40.4ms 96.8% icp6_reduction2 | |
136us 0.3% 40.6ms 97.2% icp4_reduction2 | |
136us 0.3% 40.7ms 97.5% icp3_out2 | |
135us 0.3% 40.8ms 97.8% icp5_reduction2 | |
129us 0.3% 41.0ms 98.1% icp8_in | |
127us 0.3% 41.1ms 98.4% icp3_reduction2 | |
114us 0.3% 41.2ms 98.7% cls2_reduction | |
112us 0.3% 41.3ms 99.0% cls1_reduction | |
77.8us 0.2% 41.4ms 99.2% icp9_pool | |
76.1us 0.2% 41.5ms 99.3% icp9_out | |
72.7us 0.2% 41.5ms 99.5% icp8_pool | |
62.1us 0.1% 41.6ms 99.7% icp8_out |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment