Skip to content

Instantly share code, notes, and snippets.

@maxhutch
Created February 3, 2014 14:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maxhutch/8784895 to your computer and use it in GitHub Desktop.
Save maxhutch/8784895 to your computer and use it in GitHub Desktop.
Y = D X benchmark, where D is diagonal

All time columns other than the first are relative to the first. I suspect the slow 'fresh read' time is due to address translation, as it only occurs after an allocate. I'm a bit perplexed that the scaling doesn't improve with N, as the copy time doesn't include D and its reads could be re-used...

Results

ifort

log2(M) | log2(N) | copy (s) | fresh copy | loop | forall | concurrent | gbmv | scal
----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----
16 | 1 | 0.00014 | 1.55921 | 1.16612 | 1.12500 | 1.13158 | 29.30428 | 4.53947
17 | 1 | 0.00029 | 1.56861 | 1.12325 | 1.12654 | 1.12325 | 3.87428 | 4.34593
18 | 1 | 0.00057 | 1.83123 | 1.13392 | 1.12343 | 1.12888 | 3.91604 | 4.46558
19 | 1 | 0.00112 | 1.89005 | 1.16227 | 1.17929 | 1.16674 | 4.02488 | 4.50829
20 | 1 | 0.00234 | 2.05463 | 1.12139 | 1.17908 | 1.12568 | 4.01559 | 4.36530
21 | 1 | 0.00436 | 2.58005 | 1.19295 | 1.19546 | 1.19093 | 4.36486 | 4.67126
22 | 1 | 0.00880 | 2.76917 | 1.17662 | 1.18910 | 1.18331 | 4.44385 | 4.59607
23 | 1 | 0.01751 | 2.98388 | 1.22310 | 1.21145 | 1.20037 | 4.47294 | 4.59261
24 | 1 | 0.03471 | 3.02142 | 1.20837 | 1.20964 | 1.20964 | 4.52828 | 4.62772
25 | 1 | 0.06928 | 2.96777 | 1.21227 | 1.21186 | 1.21266 | 4.47780 | 4.63916
26 | 1 | 0.13834 | 2.92824 | 1.21350 | 1.21274 | 1.21328 | 4.46550 | 4.65362
27 | 1 | 0.27649 | 2.89176 | 1.21491 | 1.21448 | 1.21440 | 4.45917 | 4.74534
28 | 1 | 0.55291 | 2.86256 | 1.21293 | 1.21254 | 1.21275 | 4.44964 | 4.79506
29 | 1 | 1.10337 | 2.86108 | 1.21571 | 1.21571 | 1.21587 | 4.46008 | 5.16229
15 | 2 | 0.00014 | 0.99338 | 1.10430 | 1.08940 | 1.10430 | 3.79305 | 3.47185
16 | 2 | 0.00029 | 0.99583 | 1.10083 | 1.09083 | 1.10417 | 3.80250 | 3.47417
17 | 2 | 0.00057 | 0.97692 | 1.11792 | 1.09694 | 1.10197 | 3.82963 | 3.47629
18 | 2 | 0.00112 | 1.00620 | 1.11301 | 1.10660 | 1.12540 | 3.94168 | 3.55458
19 | 2 | 0.00222 | 1.80402 | 1.15461 | 1.15451 | 1.16901 | 3.99925 | 3.57602
20 | 2 | 0.00439 | 2.20141 | 1.17751 | 1.18435 | 1.18294 | 3.90606 | 3.61021
21 | 2 | 0.00868 | 2.61201 | 1.18869 | 1.19309 | 1.19078 | 3.96192 | 3.63675
22 | 2 | 0.01734 | 2.83028 | 1.20614 | 1.18826 | 1.18907 | 4.05951 | 3.63720
23 | 2 | 0.03471 | 3.01913 | 1.21061 | 1.21075 | 1.21110 | 4.07892 | 3.63007
24 | 2 | 0.06934 | 2.98707 | 1.21195 | 1.21169 | 1.21260 | 4.05289 | 3.63668
25 | 2 | 0.13841 | 2.93301 | 1.21384 | 1.21398 | 1.21430 | 4.05470 | 3.65711
26 | 2 | 0.27653 | 2.89224 | 1.21555 | 1.21525 | 1.21503 | 4.05413 | 3.71380
27 | 2 | 0.55304 | 2.86636 | 1.21320 | 1.21320 | 1.21318 | 4.04503 | 3.77496
28 | 2 | 1.10395 | 2.86057 | 1.21503 | 1.21528 | 1.21539 | 4.05321 | 4.20415
14 | 3 | 0.00015 | 0.99347 | 1.05383 | 1.04078 | 1.05383 | 3.64600 | 3.55139
15 | 3 | 0.00029 | 1.00667 | 1.08417 | 1.08000 | 1.07667 | 3.70167 | 3.67333
16 | 3 | 0.00057 | 1.00167 | 1.06483 | 1.06817 | 1.07026 | 3.70807 | 4.78712
17 | 3 | 0.00112 | 0.99216 | 1.08902 | 1.07673 | 1.07567 | 3.75858 | 4.87707
18 | 3 | 0.00225 | 1.65583 | 1.08120 | 1.07061 | 1.06924 | 3.80711 | 4.93278
19 | 3 | 0.00443 | 1.95106 | 1.10063 | 1.10423 | 1.10515 | 3.60259 | 5.00210
20 | 3 | 0.00883 | 2.23069 | 1.16566 | 1.16534 | 1.16553 | 3.67929 | 5.02831
21 | 3 | 0.01741 | 2.62856 | 1.18398 | 1.18403 | 1.18766 | 3.75832 | 5.08517
22 | 3 | 0.03465 | 2.83800 | 1.18743 | 1.18680 | 1.18668 | 3.85345 | 5.59416
23 | 3 | 0.06937 | 2.99295 | 1.21041 | 1.21101 | 1.20992 | 3.83901 | 5.59352
24 | 3 | 0.13844 | 2.93239 | 1.21282 | 1.21212 | 1.21291 | 3.85071 | 5.60987
25 | 3 | 0.27666 | 2.89200 | 1.21457 | 1.21539 | 1.21444 | 3.84719 | 5.63814
26 | 3 | 0.55299 | 2.86851 | 1.21440 | 1.21466 | 1.21459 | 3.84332 | 5.71820
27 | 3 | 1.10450 | 2.85982 | 1.21540 | 1.21553 | 1.21548 | 3.84916 | 6.71723
13 | 4 | 0.00014 | 1.02685 | 1.02013 | 1.02013 | 1.03523 | 3.70805 | 5.60906
14 | 4 | 0.00029 | 0.99588 | 1.03710 | 1.03380 | 1.04369 | 3.64468 | 5.50124
15 | 4 | 0.00057 | 0.99289 | 1.06148 | 1.05437 | 1.05772 | 3.66458 | 6.35550
16 | 4 | 0.00112 | 1.00983 | 1.08438 | 1.07178 | 1.07541 | 3.76138 | 6.75881
17 | 4 | 0.00224 | 1.58088 | 1.06126 | 1.06477 | 1.06434 | 3.72285 | 7.07008
18 | 4 | 0.00444 | 1.94305 | 1.05518 | 1.05786 | 1.05716 | 3.47233 | 9.61913
19 | 4 | 0.00881 | 1.98287 | 1.09035 | 1.08580 | 1.09216 | 3.52471 | 9.69163
20 | 4 | 0.01755 | 2.24734 | 1.16900 | 1.16941 | 1.16902 | 3.60570 | 9.72971
21 | 4 | 0.03475 | 2.64783 | 1.18334 | 1.18296 | 1.18296 | 3.67540 | 11.12814
22 | 4 | 0.06932 | 2.81668 | 1.18573 | 1.18705 | 1.18586 | 3.73229 | 11.55661
23 | 4 | 0.13858 | 2.93015 | 1.21090 | 1.21317 | 1.21126 | 3.74208 | 11.50555
24 | 4 | 0.27673 | 2.89324 | 1.21439 | 1.21483 | 1.21416 | 3.74150 | 11.55177
25 | 4 | 0.55292 | 2.86956 | 1.21521 | 1.21490 | 1.21478 | 3.74286 | 11.87245
26 | 4 | 1.10449 | 2.85966 | 1.21365 | 1.21350 | 1.21365 | 3.74655 | 12.92992
12 | 5 | 0.00014 | 0.99338 | 1.00000 | 0.97848 | 0.98675 | 3.70033 | 5.91722
13 | 5 | 0.00028 | 0.99319 | 1.02215 | 1.03237 | 1.03663 | 3.76235 | 6.25213
14 | 5 | 0.00058 | 0.99834 | 1.00828 | 1.02773 | 1.03849 | 3.62500 | 6.48096
15 | 5 | 0.00113 | 1.00000 | 1.05331 | 1.05416 | 1.04887 | 3.67971 | 6.67802
16 | 5 | 0.00223 | 1.54781 | 1.05464 | 1.05635 | 1.06531 | 3.70907 | 7.15091
17 | 5 | 0.00443 | 1.95507 | 1.05300 | 1.05192 | 1.05413 | 3.41883 | 9.71811
18 | 5 | 0.00882 | 1.98381 | 1.05565 | 1.05373 | 1.05633 | 3.44008 | 9.86078
19 | 5 | 0.01764 | 1.99217 | 1.09469 | 1.08601 | 1.08340 | 3.47970 | 9.86134
20 | 5 | 0.03507 | 2.26098 | 1.16925 | 1.17117 | 1.16948 | 3.56841 | 11.62838
21 | 5 | 0.06956 | 2.62690 | 1.18224 | 1.18298 | 1.18273 | 3.62141 | 12.29127
22 | 5 | 0.13857 | 2.74787 | 1.18613 | 1.18616 | 1.18577 | 3.68431 | 12.73983
23 | 5 | 0.27665 | 2.89490 | 1.21993 | 1.21457 | 1.21518 | 3.69193 | 13.05943
24 | 5 | 0.55290 | 2.87067 | 1.21465 | 1.21457 | 1.21480 | 3.69166 | 13.17788
25 | 5 | 1.10566 | 2.85820 | 1.21481 | 1.21469 | 1.21496 | 3.69029 | 13.89242
11 | 6 | 0.00016 | 0.94954 | 0.91131 | 0.92355 | 0.90520 | 3.39297 | 5.79817
12 | 6 | 0.00029 | 0.97930 | 0.98262 | 0.97185 | 0.97185 | 3.63162 | 6.27401
13 | 6 | 0.00056 | 1.00000 | 1.01841 | 1.04324 | 1.02354 | 3.74187 | 6.83176
14 | 6 | 0.00114 | 0.99916 | 1.02451 | 1.02640 | 1.02912 | 3.65368 | 7.00817
15 | 6 | 0.00223 | 1.52080 | 1.05643 | 1.06219 | 1.06486 | 3.70077 | 7.76851
16 | 6 | 0.00443 | 1.95354 | 1.04904 | 1.05216 | 1.05039 | 3.40202 | 10.51152
17 | 6 | 0.00884 | 1.98211 | 1.05094 | 1.05102 | 1.05013 | 3.40401 | 10.65649
18 | 6 | 0.01764 | 1.99553 | 1.05056 | 1.05146 | 1.05129 | 3.41883 | 14.64367
19 | 6 | 0.03522 | 2.00705 | 1.07669 | 1.07700 | 1.07637 | 3.46513 | 14.35339
20 | 6 | 0.07014 | 2.24036 | 1.16830 | 1.16853 | 1.16919 | 3.54304 | 15.40090
21 | 6 | 0.13902 | 2.55777 | 1.18390 | 1.18405 | 1.18367 | 3.60225 | 16.03189
22 | 6 | 0.27690 | 2.71309 | 1.18691 | 1.18687 | 1.18748 | 3.65789 | 16.03118
23 | 6 | 0.55309 | 2.86994 | 1.21340 | 1.21336 | 1.21508 | 3.66403 | 16.39104

gfortran

log2(M) | log2(N) | copy (s) | fresh copy | loop | forall | concurrent | gbmv | scal
----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----
16 | 1 | 0.00014 | 1.79620 | 1.30397 | 1.54404 | 1.56477 | 18.29016 | 4.70812
17 | 1 | 0.00027 | 1.86510 | 1.39774 | 1.58399 | 1.63185 | 4.27415 | 4.59965
18 | 1 | 0.00054 | 1.94523 | 1.31360 | 1.58922 | 1.61175 | 4.25530 | 4.64443
19 | 1 | 0.00107 | 2.07375 | 1.32242 | 1.59158 | 1.66444 | 4.33734 | 4.70365
20 | 1 | 0.00210 | 2.21044 | 1.33492 | 1.61287 | 1.65511 | 4.56054 | 4.74867
21 | 1 | 0.00419 | 2.28086 | 1.36005 | 1.63665 | 1.67482 | 4.64814 | 4.77796
22 | 1 | 0.00837 | 2.31160 | 1.33871 | 1.64697 | 1.67883 | 4.85726 | 4.79017
23 | 1 | 0.01665 | 2.34700 | 1.36810 | 1.63785 | 1.65857 | 4.83426 | 4.80861
24 | 1 | 0.03333 | 2.19562 | 1.36679 | 1.63054 | 1.65820 | 4.82140 | 4.81522
25 | 1 | 0.06696 | 2.16186 | 1.36175 | 1.63502 | 1.66120 | 4.77282 | 4.80597
26 | 1 | 0.13434 | 2.14151 | 1.35870 | 1.65677 | 1.68292 | 4.75200 | 4.80339
27 | 1 | 0.26842 | 2.09992 | 1.35558 | 1.65199 | 1.67770 | 4.72239 | 4.93927
28 | 1 | 0.53566 | 2.11745 | 1.36394 | 2.01082 | 2.04317 | 4.76933 | 5.25310
29 | 1 | 1.07581 | 2.14793 | 1.35794 | 1.62986 | 1.65424 | 4.74175 | 4.81708
15 | 2 | 0.00014 | 1.00169 | 1.29103 | 1.91540 | 1.95262 | 3.87479 | 3.52623
16 | 2 | 0.00028 | 0.99574 | 1.27830 | 2.77362 | 2.81617 | 3.89106 | 3.52681
17 | 2 | 0.00055 | 1.00691 | 1.29663 | 2.82340 | 2.83420 | 3.95337 | 3.58592
18 | 2 | 0.00110 | 1.01107 | 1.29511 | 2.85081 | 2.86732 | 4.01390 | 3.60869
19 | 2 | 0.00218 | 2.08320 | 1.30586 | 2.87192 | 2.88801 | 4.06754 | 3.64335
20 | 2 | 0.00417 | 2.26407 | 1.34968 | 1.99663 | 1.99040 | 4.18904 | 3.76713
21 | 2 | 0.00833 | 2.29577 | 1.34785 | 2.00713 | 1.99428 | 4.18494 | 3.77125
22 | 2 | 0.01660 | 2.32425 | 1.35189 | 2.02007 | 2.00484 | 4.32570 | 3.78520
23 | 2 | 0.03323 | 2.33571 | 1.36490 | 2.01983 | 2.00517 | 4.33037 | 3.77212
24 | 2 | 0.06644 | 2.28641 | 1.36558 | 2.01803 | 2.00372 | 4.30084 | 3.77428
25 | 2 | 0.13319 | 2.20608 | 1.36393 | 2.02634 | 2.03086 | 4.28525 | 3.78703
26 | 2 | 0.26733 | 2.14609 | 1.36117 | 2.03863 | 2.02450 | 4.26412 | 3.82846
27 | 2 | 0.53700 | 2.10290 | 1.35515 | 2.07861 | 2.06433 | 4.24222 | 3.88281
28 | 2 | 1.07104 | 2.09837 | 1.36172 | 2.24297 | 2.23049 | 4.24484 | 4.23100
14 | 3 | 0.00014 | 0.99831 | 1.26689 | 4.23818 | 4.12500 | 3.79054 | 3.67061
15 | 3 | 0.00028 | 1.00598 | 1.29974 | 5.04355 | 5.00427 | 3.80786 | 3.72502
16 | 3 | 0.00055 | 1.01444 | 1.31890 | 5.86920 | 5.85871 | 3.88233 | 4.97944
17 | 3 | 0.00109 | 1.00000 | 1.30677 | 5.89127 | 6.19913 | 3.86572 | 5.00480
18 | 3 | 0.00218 | 1.86016 | 1.29380 | 7.55520 | 7.54601 | 3.93599 | 5.07550
19 | 3 | 0.00418 | 2.28169 | 1.33717 | 9.21018 | 9.06375 | 3.85351 | 5.24947
20 | 3 | 0.00831 | 2.31954 | 1.34899 | 9.25603 | 9.13645 | 3.92892 | 5.28644
21 | 3 | 0.01663 | 2.32210 | 1.34861 | 9.46930 | 9.42620 | 3.97258 | 5.30627
22 | 3 | 0.03323 | 2.33589 | 1.34835 | 9.60530 | 9.56034 | 4.06180 | 5.79119
23 | 3 | 0.06653 | 2.29136 | 1.36631 | 9.59456 | 9.56386 | 4.03934 | 5.77615
24 | 3 | 0.13314 | 2.20679 | 1.36479 | 9.60519 | 9.56512 | 4.04004 | 5.78333
25 | 3 | 0.26720 | 2.14893 | 1.36163 | 9.95964 | 9.92338 | 4.01777 | 5.79669
26 | 3 | 0.53659 | 2.10791 | 1.35717 | 11.39551 | 11.34358 | 3.99745 | 5.84365
27 | 3 | 1.07135 | 2.09721 | 1.36110 | 14.01157 | 13.99235 | 4.03420 | 7.43307
13 | 4 | 0.00014 | 0.99831 | 1.27534 | 4.26520 | 4.27365 | 3.76182 | 5.71115
14 | 4 | 0.00028 | 1.00343 | 1.29160 | 10.87822 | 10.87050 | 3.78388 | 5.69726
15 | 4 | 0.00055 | 1.00173 | 1.29259 | 11.28695 | 11.27005 | 3.80711 | 6.56307
16 | 4 | 0.00111 | 0.99095 | 1.27937 | 11.26471 | 11.26557 | 3.79284 | 6.77560
17 | 4 | 0.00218 | 1.75413 | 1.29150 | 15.72809 | 15.71977 | 3.82208 | 7.21053
18 | 4 | 0.00417 | 2.27890 | 1.33814 | 16.45969 | 16.44866 | 3.71063 | 10.18622
19 | 4 | 0.00833 | 2.30279 | 1.34024 | 17.85806 | 17.83953 | 3.74403 | 10.19919
20 | 4 | 0.01663 | 2.33030 | 1.34769 | 20.25812 | 20.27572 | 3.82059 | 10.21730
21 | 4 | 0.03324 | 2.33525 | 1.34930 | 20.10040 | 20.12290 | 3.86144 | 11.58577
22 | 4 | 0.06647 | 2.30523 | 1.34961 | 20.17429 | 20.20462 | 3.91057 | 11.92665
23 | 4 | 0.13313 | 2.20631 | 1.36597 | 20.38021 | 20.40630 | 3.91110 | 11.91708
24 | 4 | 0.26723 | 2.14988 | 1.36164 | 21.08453 | 21.10098 | 3.89204 | 11.91661
25 | 4 | 0.53642 | 2.10883 | 1.35647 | 22.78196 | 22.79634 | 3.87230 | 12.21752
26 | 4 | 1.07051 | 2.09825 | 1.36185 | 28.29655 | 28.29190 | 3.87737 | 13.27728
12 | 5 | 0.00014 | 1.00000 | 1.27750 | 6.16751 | 6.09645 | 3.74112 | 6.13875
13 | 5 | 0.00028 | 0.99228 | 1.28731 | 11.61921 | 11.59005 | 3.77358 | 6.32762
14 | 5 | 0.00055 | 0.99957 | 1.29136 | 11.88363 | 11.91620 | 3.79722 | 6.80417
15 | 5 | 0.00109 | 1.00175 | 1.31452 | 12.23250 | 12.24431 | 3.81999 | 6.88780
16 | 5 | 0.00218 | 1.71277 | 1.29467 | 16.31962 | 16.32203 | 3.79910 | 7.24576
17 | 5 | 0.00430 | 2.21588 | 1.29415 | 16.55839 | 16.56647 | 3.52725 | 9.96111
18 | 5 | 0.00833 | 2.31148 | 1.33786 | 24.30867 | 24.31969 | 3.64720 | 10.37504
19 | 5 | 0.01664 | 2.32376 | 1.34125 | 26.34304 | 26.33781 | 3.69677 | 10.45294
20 | 5 | 0.03322 | 2.33793 | 1.34925 | 26.14268 | 26.15607 | 3.78037 | 12.21410
21 | 5 | 0.06650 | 2.30667 | 1.34927 | 26.88993 | 26.89977 | 3.79873 | 13.02378
22 | 5 | 0.13307 | 2.20874 | 1.35026 | 26.74056 | 26.74916 | 3.84593 | 13.14075
23 | 5 | 0.26721 | 2.15319 | 1.36304 | 26.61801 | 26.62138 | 3.82844 | 13.05417
24 | 5 | 0.53640 | 2.10916 | 1.35639 | 27.79922 | 27.80702 | 3.81295 | 13.26568
25 | 5 | 1.07339 | 2.09258 | 1.35873 | 34.58359 | 34.56587 | 3.80758 | 14.08993
11 | 6 | 0.00014 | 0.98500 | 1.29333 | 6.44500 | 6.43000 | 3.69167 | 6.35500
12 | 6 | 0.00028 | 1.00343 | 1.29160 | 11.80274 | 11.80532 | 3.77358 | 6.49314
13 | 6 | 0.00055 | 1.00130 | 1.30061 | 11.89533 | 11.90960 | 3.77898 | 6.90311
14 | 6 | 0.00111 | 0.98557 | 1.27375 | 11.97588 | 11.98579 | 3.74176 | 7.16993
15 | 6 | 0.00216 | 1.67534 | 1.30393 | 16.50386 | 16.51489 | 3.80856 | 7.94916
16 | 6 | 0.00417 | 2.28241 | 1.33064 | 17.26702 | 17.28115 | 3.61185 | 11.09938
17 | 6 | 0.00832 | 2.30636 | 1.33854 | 22.68029 | 22.70516 | 3.61306 | 11.21489
18 | 6 | 0.01662 | 2.33203 | 1.33967 | 31.22985 | 31.23586 | 3.62954 | 15.44506
19 | 6 | 0.03321 | 2.34120 | 1.34207 | 31.51367 | 31.51445 | 3.67779 | 16.02625
20 | 6 | 0.06647 | 2.31100 | 1.34979 | 31.79578 | 31.79418 | 3.74650 | 16.46622
21 | 6 | 0.13324 | 2.20472 | 1.34947 | 32.15855 | 32.15592 | 3.76506 | 16.54684
22 | 6 | 0.26718 | 2.15223 | 1.34867 | 32.28857 | 32.28642 | 3.79439 | 16.55134
23 | 6 | 0.53633 | 2.11010 | 1.35657 | 32.76753 | 32.76806 | 3.78151 | 16.55881

#define CACHE_SIZE 20000000
program diag_test
implicit none
integer :: i,j
write(*,'(2(A10,A3),7(A12,A3))') "log2(M)", " | ", "log2(N)", " | ", &
"copy (s)", " | ", "fresh copy", " | ", &
"loop", " | ", "forall", " | ", "concurrent", " | ", "gbmv", " | ", "scal", ""
write(*,'(2(A10,A3),7(A12,A3))') "-----", " | ", "-----", " | ", &
"-----", " | ", "-----", " | ", &
"-----", " | ", "-----", " | ", &
"-----", " | ", "-----", " | ", &
"-----", ""
do j = 1, 32
do i = 1,32
call test(2**i, 2**j)
enddo
enddo
contains
subroutine test(M,N)
implicit none
include 'mpif.h'
integer, intent(in) :: N,M
real, allocatable :: D(:)
complex, allocatable :: Dz(:)
complex, allocatable :: X(:,:)
complex, allocatable :: Y(:,:)
real :: ref
integer :: i,j
double precision :: copy_t, copy2_t, loop_t, forall_t, concurrent_t, gbmv_t, scal_t
complex, parameter :: zero = cmplx(0.d0), one = cmplx(1.d0)
real, parameter :: eps = 1.d-6
! 1 MB < size(Y) < 32 GB
if (M * N < 2.**17 .or. M*N > 2.**32) return
allocate(D(M), Dz(M), X(M,N), Y(M,N))
call random_number(D)
X = cmplx(1.d0)
call flush_cache()
copy_t = - MPI_WTIME()
Y = X
copy_t = copy_t + MPI_WTIME()
ref = abs(sum(Y))
call flush_cache()
copy2_t = - MPI_WTIME()
Y = X
copy2_t = copy2_t + MPI_WTIME()
ref = abs(sum(Y))
call flush_cache()
loop_t = - MPI_WTIME()
do j = 1,N
do i = 1,M
Y(i,j) = D(i) * X(i,j)
enddo
enddo
loop_t = loop_t + MPI_WTIME()
ref = abs(sum(Y))
call flush_cache()
forall_t = - MPI_WTIME()
forall(j = 1:N, i = 1:M)
Y(i,j) = D(i) * X(i,j)
end forall
forall_t = forall_t + MPI_WTIME()
if (abs(abs(sum(Y)) - ref) > eps) write(*,*) "forall failed"
call flush_cache()
concurrent_t = - MPI_WTIME()
do concurrent(j = 1:N, i = 1:M)
Y(i,j) = D(i) * X(i,j)
enddo
concurrent_t = concurrent_t + MPI_WTIME()
if (abs(abs(sum(Y)) - ref) > eps) write(*,*) "concurrent failed"
call flush_cache()
gbmv_t = - MPI_WTIME()
Dz = cmplx(D)
do i = 1,N
call cgbmv('N', M, M, 0, 0, one, Dz, 1, X(1,i), 1, zero, Y(1,i), 1)
enddo
gbmv_t = gbmv_t + MPI_WTIME()
if (abs(abs(sum(Y)) - ref) > eps) write(*,*) "gbmv failed"
call flush_cache()
scal_t = - MPI_WTIME()
Y = X
do i = 1,M
call csscal(N,D(i),Y(i,1),M)
enddo
scal_t = scal_t + MPI_WTIME()
if (abs(abs(sum(Y)) - ref) > eps) write(*,*) "scal failed", abs(sum(Y)), ref
write(*,'(2(I10,A3),7(F12.5,A3))') nint(log(real(M))/log(2.d0)), " | ", &
nint(log(real(N))/log(2.d0)), " | ", &
copy2_t, " | ", &
copy_t/copy2_t, " | ", &
loop_t/copy2_t, " | ", &
forall_t/copy2_t, " | ", &
concurrent_t/copy2_t, " | ", &
gbmv_t/copy2_t, " | ", &
scal_t/copy2_t, ""
deallocate(D,Dz,X,Y)
end subroutine test
subroutine flush_cache()
implicit none
real, allocatable :: buffer(:)
real red
allocate(buffer(CACHE_SIZE))
call random_number(buffer)
red = abs(sum(buffer))
deallocate(buffer)
end subroutine flush_cache
end program diag_test
FC = ifort -I/opt/openmpi/include -I/opt/openmpi/lib -L/opt/openmpi/lib -lmpi_f90 -lmpi_f77 -lmpi -ldl -lm -Wl,--export-dynamic -lrt -lnsl -lutil
#FC = gfortran -I/opt/openmpi/include -I/opt/openmpi/lib -L/opt/openmpi/lib -lmpi_f90 -lmpi_f77 -lmpi -ldl -lm -Wl,--export-dynamic -lrt -lnsl -lutil
FFLAGS = -O2
all:
$(FC) $(FFLAGS) -o diag_test.ex diag_test.F90 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment