@hsandid
Last active January 20, 2021 11:18
#include <stdio.h>

/* Integer matrix multiply (c += a*b), implemented in the RISC-V vector assembly below. */
extern void sgemm_nn(size_t n, size_t m, size_t k,
                     const int *a, size_t lda,
                     const int *b, size_t ldb,
                     int *c, size_t ldc);

int main()
{
    int matrix1[4][4] = {
        {3, 2, 4, 4},
        {5, 2, 7, 6},
        {3, 2, 5, 8},
        {2, 1, 5, 2}
    };
    int matrix2[4][4] = {
        {6, 7, 2, 6},
        {2, 1, 5, 2},
        {3, 4, 6, 3},
        {5, 2, 7, 6}
    };
    int resultMat[4][4] = {
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0},
        {0, 0, 0, 0}
    };

    sgemm_nn(4, 4, 4, &matrix1[0][0], 4, &matrix2[0][0], 4, &resultMat[0][0], 4);
    return 0;
}
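For reference, here is a minimal scalar sketch of the operation the assembly header below documents (c += a*b on row-major matrices, with row strides given in elements). The helper name sgemm_nn_ref is illustrative and not part of the gist; calling it with the same arguments on a zeroed copy of resultMat gives an expected matrix to compare against.

#include <stddef.h>

/* Illustrative scalar reference (not part of the original gist):
 * accumulates c += a*b for row-major matrices, where a is m*k,
 * b is k*n, c is m*n, and lda/ldb/ldc are row strides in elements,
 * matching the interface documented in the assembly header below. */
static void sgemm_nn_ref(size_t n, size_t m, size_t k,
                         const int *a, size_t lda,
                         const int *b, size_t ldb,
                         int *c, size_t ldc)
{
    for (size_t i = 0; i < m; i++)            /* rows of C */
        for (size_t j = 0; j < n; j++)        /* columns of C */
            for (size_t kk = 0; kk < k; kk++) /* inner dimension */
                c[i * ldc + j] += a[i * lda + kk] * b[kk * ldb + j];
}

The objdump listing that follows is the disassembly of the linked binary built from the C driver above and the assembly source at the end of this gist.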
t-v: file format elf64-littleriscv
Disassembly of section .text:
00000000000100e8 <register_fini>:
100e8: 00000793 li a5,0
100ec: c789 beqz a5,100f6 <register_fini+0xe>
100ee: 6541 lui a0,0x10
100f0: 69850513 addi a0,a0,1688 # 10698 <__libc_fini_array>
100f4: abe1 j 106cc <atexit>
100f6: 8082 ret
00000000000100f8 <_start>:
100f8: 00002197 auipc gp,0x2
100fc: f1818193 addi gp,gp,-232 # 12010 <__global_pointer$>
10100: f6018513 addi a0,gp,-160 # 11f70 <completed.1>
10104: f9818613 addi a2,gp,-104 # 11fa8 <__BSS_END__>
10108: 8e09 sub a2,a2,a0
1010a: 4581 li a1,0
1010c: 420000ef jal ra,1052c <memset>
10110: 00000517 auipc a0,0x0
10114: 5bc50513 addi a0,a0,1468 # 106cc <atexit>
10118: c519 beqz a0,10126 <_start+0x2e>
1011a: 00000517 auipc a0,0x0
1011e: 57e50513 addi a0,a0,1406 # 10698 <__libc_fini_array>
10122: 5aa000ef jal ra,106cc <atexit>
10126: 39c000ef jal ra,104c2 <__libc_init_array>
1012a: 4502 lw a0,0(sp)
1012c: 002c addi a1,sp,8
1012e: 4601 li a2,0
10130: 054000ef jal ra,10184 <main>
10134: ae85 j 104a4 <exit>
0000000000010136 <__do_global_dtors_aux>:
10136: f601c703 lbu a4,-160(gp) # 11f70 <completed.1>
1013a: e715 bnez a4,10166 <__do_global_dtors_aux+0x30>
1013c: 1141 addi sp,sp,-16
1013e: e022 sd s0,0(sp)
10140: 843e mv s0,a5
10142: e406 sd ra,8(sp)
10144: 00000793 li a5,0
10148: cb81 beqz a5,10158 <__do_global_dtors_aux+0x22>
1014a: 6545 lui a0,0x11
1014c: 7f450513 addi a0,a0,2036 # 117f4 <__FRAME_END__>
10150: 00000097 auipc ra,0x0
10154: 000000e7 jalr zero # 0 <register_fini-0x100e8>
10158: 4785 li a5,1
1015a: 60a2 ld ra,8(sp)
1015c: f6f18023 sb a5,-160(gp) # 11f70 <completed.1>
10160: 6402 ld s0,0(sp)
10162: 0141 addi sp,sp,16
10164: 8082 ret
10166: 8082 ret
0000000000010168 <frame_dummy>:
10168: 00000793 li a5,0
1016c: cb91 beqz a5,10180 <frame_dummy+0x18>
1016e: 6545 lui a0,0x11
10170: f6818593 addi a1,gp,-152 # 11f78 <object.0>
10174: 7f450513 addi a0,a0,2036 # 117f4 <__FRAME_END__>
10178: 00000317 auipc t1,0x0
1017c: 00000067 jr zero # 0 <register_fini-0x100e8>
10180: 8082 ret
...
0000000000010184 <main>:
10184: 7151 addi sp,sp,-240
10186: f586 sd ra,232(sp)
10188: f1a2 sd s0,224(sp)
1018a: 1980 addi s0,sp,240
1018c: 4501 li a0,0
1018e: f2a43023 sd a0,-224(s0)
10192: fea42623 sw a0,-20(s0)
10196: fea42423 sw a0,-24(s0)
1019a: 00010637 lui a2,0x10
1019e: 77460593 addi a1,a2,1908 # 10774 <__errno+0x6>
101a2: 0385e703 lwu a4,56(a1)
101a6: 03c5e683 lwu a3,60(a1)
101aa: 1682 slli a3,a3,0x20
101ac: 8ed9 or a3,a3,a4
101ae: fed43023 sd a3,-32(s0)
101b2: 0305e703 lwu a4,48(a1)
101b6: 0345e683 lwu a3,52(a1)
101ba: 1682 slli a3,a3,0x20
101bc: 8ed9 or a3,a3,a4
101be: fcd43c23 sd a3,-40(s0)
101c2: 0285e703 lwu a4,40(a1)
101c6: 02c5e683 lwu a3,44(a1)
101ca: 1682 slli a3,a3,0x20
101cc: 8ed9 or a3,a3,a4
101ce: fcd43823 sd a3,-48(s0)
101d2: 0205e703 lwu a4,32(a1)
101d6: 0245e683 lwu a3,36(a1)
101da: 1682 slli a3,a3,0x20
101dc: 8ed9 or a3,a3,a4
101de: fcd43423 sd a3,-56(s0)
101e2: 0185e703 lwu a4,24(a1)
101e6: 01c5e683 lwu a3,28(a1)
101ea: 1682 slli a3,a3,0x20
101ec: 8ed9 or a3,a3,a4
101ee: fcd43023 sd a3,-64(s0)
101f2: 0105e703 lwu a4,16(a1)
101f6: 0145e683 lwu a3,20(a1)
101fa: 1682 slli a3,a3,0x20
101fc: 8ed9 or a3,a3,a4
101fe: fad43c23 sd a3,-72(s0)
10202: 0085e703 lwu a4,8(a1)
10206: 00c5e683 lwu a3,12(a1)
1020a: 1682 slli a3,a3,0x20
1020c: 8ed9 or a3,a3,a4
1020e: fad43823 sd a3,-80(s0)
10212: 77466603 lwu a2,1908(a2)
10216: 0045e583 lwu a1,4(a1)
1021a: 1582 slli a1,a1,0x20
1021c: 8dd1 or a1,a1,a2
1021e: fab43423 sd a1,-88(s0)
10222: 00010637 lui a2,0x10
10226: 7b460593 addi a1,a2,1972 # 107b4 <__errno+0x46>
1022a: 0385e703 lwu a4,56(a1)
1022e: 03c5e683 lwu a3,60(a1)
10232: 1682 slli a3,a3,0x20
10234: 8ed9 or a3,a3,a4
10236: fad43023 sd a3,-96(s0)
1023a: 0305e703 lwu a4,48(a1)
1023e: 0345e683 lwu a3,52(a1)
10242: 1682 slli a3,a3,0x20
10244: 8ed9 or a3,a3,a4
10246: f8d43c23 sd a3,-104(s0)
1024a: 0285e703 lwu a4,40(a1)
1024e: 02c5e683 lwu a3,44(a1)
10252: 1682 slli a3,a3,0x20
10254: 8ed9 or a3,a3,a4
10256: f8d43823 sd a3,-112(s0)
1025a: 0205e703 lwu a4,32(a1)
1025e: 0245e683 lwu a3,36(a1)
10262: 1682 slli a3,a3,0x20
10264: 8ed9 or a3,a3,a4
10266: f8d43423 sd a3,-120(s0)
1026a: 0185e703 lwu a4,24(a1)
1026e: 01c5e683 lwu a3,28(a1)
10272: 1682 slli a3,a3,0x20
10274: 8ed9 or a3,a3,a4
10276: f8d43023 sd a3,-128(s0)
1027a: 0105e703 lwu a4,16(a1)
1027e: 0145e683 lwu a3,20(a1)
10282: 1682 slli a3,a3,0x20
10284: 8ed9 or a3,a3,a4
10286: f6d43c23 sd a3,-136(s0)
1028a: 0085e703 lwu a4,8(a1)
1028e: 00c5e683 lwu a3,12(a1)
10292: 1682 slli a3,a3,0x20
10294: 8ed9 or a3,a3,a4
10296: f6d43823 sd a3,-144(s0)
1029a: 7b466603 lwu a2,1972(a2)
1029e: 0045e583 lwu a1,4(a1)
102a2: 1582 slli a1,a1,0x20
102a4: 8dd1 or a1,a1,a2
102a6: f6b43423 sd a1,-152(s0)
102aa: f6a43023 sd a0,-160(s0)
102ae: f4a43c23 sd a0,-168(s0)
102b2: f4a43823 sd a0,-176(s0)
102b6: f4a43423 sd a0,-184(s0)
102ba: f4a43023 sd a0,-192(s0)
102be: f2a43c23 sd a0,-200(s0)
102c2: f2a43823 sd a0,-208(s0)
102c6: f2a43423 sd a0,-216(s0)
102ca: 850a mv a0,sp
102cc: 4811 li a6,4
102ce: 01053023 sd a6,0(a0)
102d2: fa840693 addi a3,s0,-88
102d6: f6840793 addi a5,s0,-152
102da: f2840893 addi a7,s0,-216
102de: 8542 mv a0,a6
102e0: 85c2 mv a1,a6
102e2: 8642 mv a2,a6
102e4: 8742 mv a4,a6
102e6: 00000097 auipc ra,0x0
102ea: 016080e7 jalr 22(ra) # 102fc <sgemm_nn>
102ee: f2043503 ld a0,-224(s0)
102f2: 740e ld s0,224(sp)
102f4: 70ae ld ra,232(sp)
102f6: 616d addi sp,sp,240
102f8: 8082 ret
...
00000000000102fc <sgemm_nn>:
102fc: 62a2 ld t0,8(sp)
102fe: f9810113 addi sp,sp,-104
10302: f0a2 sd s0,96(sp)
10304: eca6 sd s1,88(sp)
10306: e8ca sd s2,80(sp)
10308: e4ce sd s3,72(sp)
1030a: e0d2 sd s4,64(sp)
1030c: fc56 sd s5,56(sp)
1030e: f85a sd s6,48(sp)
10310: f45e sd s7,40(sp)
10312: f062 sd s8,32(sp)
10314: e866 sd s9,16(sp)
10316: e46a sd s10,8(sp)
10318: e06e sd s11,0(sp)
1031a: 16050663 beqz a0,10486 <exit>
1031e: 16058463 beqz a1,10486 <exit>
10322: 16060263 beqz a2,10486 <exit>
10326: 070a slli a4,a4,0x2
10328: 080a slli a6,a6,0x2
1032a: 028a slli t0,t0,0x2
1032c: 0085af93 slti t6,a1,8
10330: 100f9163 bnez t6,10432 <end_c_row_loop>
0000000000010334 <c_row_loop>:
10334: 83aa mv t2,a0
10336: 8e3e mv t3,a5
10338: 8ec6 mv t4,a7
000000000001033a <c_col_loop>:
1033a: 0c83f4d7 vsetvli s1,t2,e16,m1,ta,ma,d1
1033e: 8f36 mv t5,a3
10340: 8472 mv s0,t3
10342: 020ee007 vle32.v v0,(t4)
10346: 005e8933 add s2,t4,t0
1034a: 02096087 vle32.v v1,(s2)
1034e: 9916 add s2,s2,t0
10350: 02096107 vle32.v v2,(s2)
10354: 9916 add s2,s2,t0
10356: 02096187 vle32.v v3,(s2)
1035a: 9916 add s2,s2,t0
1035c: 02096207 vle32.v v4,(s2)
10360: 9916 add s2,s2,t0
10362: 02096287 vle32.v v5,(s2)
10366: 9916 add s2,s2,t0
10368: 02096307 vle32.v v6,(s2)
1036c: 9916 add s2,s2,t0
1036e: 02096387 vle32.v v7,(s2)
10372: 8332 mv t1,a2
10374: 02046807 vle32.v v16,(s0)
0000000000010378 <k_loop>:
10378: 137d addi t1,t1,-1
1037a: 9442 add s0,s0,a6
1037c: 0f11 addi t5,t5,4
1037e: 000f2a03 lw s4,0(t5)
10382: 00ef09b3 add s3,t5,a4
10386: 0009aa83 lw s5,0(s3)
1038a: 99ba add s3,s3,a4
1038c: 0009ab03 lw s6,0(s3)
10390: 99ba add s3,s3,a4
10392: 0009ab83 lw s7,0(s3)
10396: 99ba add s3,s3,a4
10398: 0009ac03 lw s8,0(s3)
1039c: 99ba add s3,s3,a4
1039e: 0009ac83 lw s9,0(s3)
103a2: 99ba add s3,s3,a4
103a4: 0009ad03 lw s10,0(s3)
103a8: 99ba add s3,s3,a4
103aa: 0009ad83 lw s11,0(s3)
103ae: 99ba add s3,s3,a4
103b0: b70a6057 vmacc.vx v0,s4,v16
103b4: b70ae0d7 vmacc.vx v1,s5,v16
103b8: b70b6157 vmacc.vx v2,s6,v16
103bc: b70be1d7 vmacc.vx v3,s7,v16
103c0: b70c6257 vmacc.vx v4,s8,v16
103c4: b70ce2d7 vmacc.vx v5,s9,v16
103c8: b70d6357 vmacc.vx v6,s10,v16
103cc: b70de3d7 vmacc.vx v7,s11,v16
103d0: 00030663 beqz t1,103dc <k_loop+0x64>
103d4: 02046807 vle32.v v16,(s0)
103d8: fa1ff06f j 10378 <k_loop>
103dc: 020ee027 vse32.v v0,(t4)
103e0: 005e8933 add s2,t4,t0
103e4: 020960a7 vse32.v v1,(s2)
103e8: 9916 add s2,s2,t0
103ea: 02096127 vse32.v v2,(s2)
103ee: 9916 add s2,s2,t0
103f0: 020961a7 vse32.v v3,(s2)
103f4: 9916 add s2,s2,t0
103f6: 02096227 vse32.v v4,(s2)
103fa: 9916 add s2,s2,t0
103fc: 020962a7 vse32.v v5,(s2)
10400: 9916 add s2,s2,t0
10402: 02096327 vse32.v v6,(s2)
10406: 9916 add s2,s2,t0
10408: 020963a7 vse32.v v7,(s2)
1040c: 00249f93 slli t6,s1,0x2
10410: 9efe add t4,t4,t6
10412: 9e7e add t3,t3,t6
10414: 409383b3 sub t2,t2,s1
10418: f20391e3 bnez t2,1033a <c_col_loop>
1041c: 15e1 addi a1,a1,-8
1041e: 00371f93 slli t6,a4,0x3
10422: 96fe add a3,a3,t6
10424: 00329f93 slli t6,t0,0x3
10428: 98fe add a7,a7,t6
1042a: 0085af93 slti t6,a1,8
1042e: f00f83e3 beqz t6,10334 <c_row_loop>
0000000000010432 <end_c_row_loop>:
10432: 83aa mv t2,a0
10434: 8e3e mv t3,a5
10436: 8ec6 mv t4,a7
0000000000010438 <end_c_col_loop>:
10438: 0c83f4d7 vsetvli s1,t2,e16,m1,ta,ma,d1
1043c: 8f36 mv t5,a3
1043e: 8472 mv s0,t3
10440: 020ee007 vle32.v v0,(t4)
10444: 8332 mv t1,a2
10446: 02046807 vle32.v v16,(s0)
000000000001044a <end_k_loop>:
1044a: 137d addi t1,t1,-1
1044c: 9442 add s0,s0,a6
1044e: 0f11 addi t5,t5,4
10450: 000f2a03 lw s4,0(t5)
10454: 00ef09b3 add s3,t5,a4
10458: b70a6057 vmacc.vx v0,s4,v16
1045c: 00030663 beqz t1,10468 <end_k_loop+0x1e>
10460: 02046807 vle32.v v16,(s0)
10464: fe7ff06f j 1044a <end_k_loop>
10468: 020ee027 vse32.v v0,(t4)
1046c: 00249f93 slli t6,s1,0x2
10470: 9efe add t4,t4,t6
10472: 9e7e add t3,t3,t6
10474: 409383b3 sub t2,t2,s1
10478: fc0390e3 bnez t2,10438 <end_c_col_loop>
1047c: 15fd addi a1,a1,-1
1047e: 96ba add a3,a3,a4
10480: 98ba add a7,a7,a4
10482: fa0598e3 bnez a1,10432 <end_c_row_loop>
0000000000010486 <exit>:
10486: 7406 ld s0,96(sp)
10488: 64e6 ld s1,88(sp)
1048a: 6946 ld s2,80(sp)
1048c: 69a6 ld s3,72(sp)
1048e: 6a06 ld s4,64(sp)
10490: 7ae2 ld s5,56(sp)
10492: 7b42 ld s6,48(sp)
10494: 7ba2 ld s7,40(sp)
10496: 7c02 ld s8,32(sp)
10498: 6cc2 ld s9,16(sp)
1049a: 6d22 ld s10,8(sp)
1049c: 6d82 ld s11,0(sp)
1049e: 06810113 addi sp,sp,104
104a2: 8082 ret
00000000000104a4 <exit>:
104a4: 1141 addi sp,sp,-16
104a6: 4581 li a1,0
104a8: e022 sd s0,0(sp)
104aa: e406 sd ra,8(sp)
104ac: 842a mv s0,a0
104ae: 128000ef jal ra,105d6 <__call_exitprocs>
104b2: f481b503 ld a0,-184(gp) # 11f58 <_global_impure_ptr>
104b6: 6d3c ld a5,88(a0)
104b8: c391 beqz a5,104bc <exit+0x18>
104ba: 9782 jalr a5
104bc: 8522 mv a0,s0
104be: 28e000ef jal ra,1074c <_exit>
00000000000104c2 <__libc_init_array>:
104c2: 1101 addi sp,sp,-32
104c4: e822 sd s0,16(sp)
104c6: e04a sd s2,0(sp)
104c8: 6445 lui s0,0x11
104ca: 6945 lui s2,0x11
104cc: 7f840793 addi a5,s0,2040 # 117f8 <__init_array_start>
104d0: 7f890913 addi s2,s2,2040 # 117f8 <__init_array_start>
104d4: 40f90933 sub s2,s2,a5
104d8: ec06 sd ra,24(sp)
104da: e426 sd s1,8(sp)
104dc: 40395913 srai s2,s2,0x3
104e0: 00090b63 beqz s2,104f6 <__libc_init_array+0x34>
104e4: 7f840413 addi s0,s0,2040
104e8: 4481 li s1,0
104ea: 601c ld a5,0(s0)
104ec: 0485 addi s1,s1,1
104ee: 0421 addi s0,s0,8
104f0: 9782 jalr a5
104f2: fe991ce3 bne s2,s1,104ea <__libc_init_array+0x28>
104f6: 6445 lui s0,0x11
104f8: 6949 lui s2,0x12
104fa: 7f840793 addi a5,s0,2040 # 117f8 <__init_array_start>
104fe: 80890913 addi s2,s2,-2040 # 11808 <__do_global_dtors_aux_fini_array_entry>
10502: 40f90933 sub s2,s2,a5
10506: 40395913 srai s2,s2,0x3
1050a: 00090b63 beqz s2,10520 <__libc_init_array+0x5e>
1050e: 7f840413 addi s0,s0,2040
10512: 4481 li s1,0
10514: 601c ld a5,0(s0)
10516: 0485 addi s1,s1,1
10518: 0421 addi s0,s0,8
1051a: 9782 jalr a5
1051c: fe991ce3 bne s2,s1,10514 <__libc_init_array+0x52>
10520: 60e2 ld ra,24(sp)
10522: 6442 ld s0,16(sp)
10524: 64a2 ld s1,8(sp)
10526: 6902 ld s2,0(sp)
10528: 6105 addi sp,sp,32
1052a: 8082 ret
000000000001052c <memset>:
1052c: 433d li t1,15
1052e: 872a mv a4,a0
10530: 02c37163 bgeu t1,a2,10552 <memset+0x26>
10534: 00f77793 andi a5,a4,15
10538: e3c1 bnez a5,105b8 <memset+0x8c>
1053a: e1bd bnez a1,105a0 <memset+0x74>
1053c: ff067693 andi a3,a2,-16
10540: 8a3d andi a2,a2,15
10542: 96ba add a3,a3,a4
10544: e30c sd a1,0(a4)
10546: e70c sd a1,8(a4)
10548: 0741 addi a4,a4,16
1054a: fed76de3 bltu a4,a3,10544 <memset+0x18>
1054e: e211 bnez a2,10552 <memset+0x26>
10550: 8082 ret
10552: 40c306b3 sub a3,t1,a2
10556: 068a slli a3,a3,0x2
10558: 00000297 auipc t0,0x0
1055c: 9696 add a3,a3,t0
1055e: 00a68067 jr 10(a3)
10562: 00b70723 sb a1,14(a4)
10566: 00b706a3 sb a1,13(a4)
1056a: 00b70623 sb a1,12(a4)
1056e: 00b705a3 sb a1,11(a4)
10572: 00b70523 sb a1,10(a4)
10576: 00b704a3 sb a1,9(a4)
1057a: 00b70423 sb a1,8(a4)
1057e: 00b703a3 sb a1,7(a4)
10582: 00b70323 sb a1,6(a4)
10586: 00b702a3 sb a1,5(a4)
1058a: 00b70223 sb a1,4(a4)
1058e: 00b701a3 sb a1,3(a4)
10592: 00b70123 sb a1,2(a4)
10596: 00b700a3 sb a1,1(a4)
1059a: 00b70023 sb a1,0(a4)
1059e: 8082 ret
105a0: 0ff5f593 andi a1,a1,255
105a4: 00859693 slli a3,a1,0x8
105a8: 8dd5 or a1,a1,a3
105aa: 01059693 slli a3,a1,0x10
105ae: 8dd5 or a1,a1,a3
105b0: 02059693 slli a3,a1,0x20
105b4: 8dd5 or a1,a1,a3
105b6: b759 j 1053c <memset+0x10>
105b8: 00279693 slli a3,a5,0x2
105bc: 00000297 auipc t0,0x0
105c0: 9696 add a3,a3,t0
105c2: 8286 mv t0,ra
105c4: fa2680e7 jalr -94(a3)
105c8: 8096 mv ra,t0
105ca: 17c1 addi a5,a5,-16
105cc: 8f1d sub a4,a4,a5
105ce: 963e add a2,a2,a5
105d0: f8c371e3 bgeu t1,a2,10552 <memset+0x26>
105d4: b79d j 1053a <memset+0xe>
00000000000105d6 <__call_exitprocs>:
105d6: 715d addi sp,sp,-80
105d8: f052 sd s4,32(sp)
105da: f481ba03 ld s4,-184(gp) # 11f58 <_global_impure_ptr>
105de: f84a sd s2,48(sp)
105e0: e486 sd ra,72(sp)
105e2: 1f8a3903 ld s2,504(s4)
105e6: e0a2 sd s0,64(sp)
105e8: fc26 sd s1,56(sp)
105ea: f44e sd s3,40(sp)
105ec: ec56 sd s5,24(sp)
105ee: e85a sd s6,16(sp)
105f0: e45e sd s7,8(sp)
105f2: e062 sd s8,0(sp)
105f4: 02090863 beqz s2,10624 <__call_exitprocs+0x4e>
105f8: 8b2a mv s6,a0
105fa: 8bae mv s7,a1
105fc: 4a85 li s5,1
105fe: 59fd li s3,-1
10600: 00892483 lw s1,8(s2)
10604: fff4841b addiw s0,s1,-1
10608: 00044e63 bltz s0,10624 <__call_exitprocs+0x4e>
1060c: 048e slli s1,s1,0x3
1060e: 94ca add s1,s1,s2
10610: 020b8663 beqz s7,1063c <__call_exitprocs+0x66>
10614: 2084b783 ld a5,520(s1)
10618: 03778263 beq a5,s7,1063c <__call_exitprocs+0x66>
1061c: 347d addiw s0,s0,-1
1061e: 14e1 addi s1,s1,-8
10620: ff3418e3 bne s0,s3,10610 <__call_exitprocs+0x3a>
10624: 60a6 ld ra,72(sp)
10626: 6406 ld s0,64(sp)
10628: 74e2 ld s1,56(sp)
1062a: 7942 ld s2,48(sp)
1062c: 79a2 ld s3,40(sp)
1062e: 7a02 ld s4,32(sp)
10630: 6ae2 ld s5,24(sp)
10632: 6b42 ld s6,16(sp)
10634: 6ba2 ld s7,8(sp)
10636: 6c02 ld s8,0(sp)
10638: 6161 addi sp,sp,80
1063a: 8082 ret
1063c: 00892783 lw a5,8(s2)
10640: 6498 ld a4,8(s1)
10642: 37fd addiw a5,a5,-1
10644: 04878463 beq a5,s0,1068c <__call_exitprocs+0xb6>
10648: 0004b423 sd zero,8(s1)
1064c: db61 beqz a4,1061c <__call_exitprocs+0x46>
1064e: 31092783 lw a5,784(s2)
10652: 008a96bb sllw a3,s5,s0
10656: 00892c03 lw s8,8(s2)
1065a: 8ff5 and a5,a5,a3
1065c: 2781 sext.w a5,a5
1065e: ef89 bnez a5,10678 <__call_exitprocs+0xa2>
10660: 9702 jalr a4
10662: 00892703 lw a4,8(s2)
10666: 1f8a3783 ld a5,504(s4)
1066a: 01871463 bne a4,s8,10672 <__call_exitprocs+0x9c>
1066e: fb2787e3 beq a5,s2,1061c <__call_exitprocs+0x46>
10672: dbcd beqz a5,10624 <__call_exitprocs+0x4e>
10674: 893e mv s2,a5
10676: b769 j 10600 <__call_exitprocs+0x2a>
10678: 31492783 lw a5,788(s2)
1067c: 1084b583 ld a1,264(s1)
10680: 8ff5 and a5,a5,a3
10682: 2781 sext.w a5,a5
10684: e799 bnez a5,10692 <__call_exitprocs+0xbc>
10686: 855a mv a0,s6
10688: 9702 jalr a4
1068a: bfe1 j 10662 <__call_exitprocs+0x8c>
1068c: 00892423 sw s0,8(s2)
10690: bf75 j 1064c <__call_exitprocs+0x76>
10692: 852e mv a0,a1
10694: 9702 jalr a4
10696: b7f1 j 10662 <__call_exitprocs+0x8c>
0000000000010698 <__libc_fini_array>:
10698: 1101 addi sp,sp,-32
1069a: e822 sd s0,16(sp)
1069c: 67c9 lui a5,0x12
1069e: 6449 lui s0,0x12
106a0: 80840413 addi s0,s0,-2040 # 11808 <__do_global_dtors_aux_fini_array_entry>
106a4: 81078793 addi a5,a5,-2032 # 11810 <impure_data>
106a8: 8f81 sub a5,a5,s0
106aa: e426 sd s1,8(sp)
106ac: ec06 sd ra,24(sp)
106ae: 4037d493 srai s1,a5,0x3
106b2: c881 beqz s1,106c2 <__libc_fini_array+0x2a>
106b4: 17e1 addi a5,a5,-8
106b6: 943e add s0,s0,a5
106b8: 601c ld a5,0(s0)
106ba: 14fd addi s1,s1,-1
106bc: 1461 addi s0,s0,-8
106be: 9782 jalr a5
106c0: fce5 bnez s1,106b8 <__libc_fini_array+0x20>
106c2: 60e2 ld ra,24(sp)
106c4: 6442 ld s0,16(sp)
106c6: 64a2 ld s1,8(sp)
106c8: 6105 addi sp,sp,32
106ca: 8082 ret
00000000000106cc <atexit>:
106cc: 85aa mv a1,a0
106ce: 4681 li a3,0
106d0: 4601 li a2,0
106d2: 4501 li a0,0
106d4: a009 j 106d6 <__register_exitproc>
00000000000106d6 <__register_exitproc>:
106d6: f481b703 ld a4,-184(gp) # 11f58 <_global_impure_ptr>
106da: 1f873783 ld a5,504(a4)
106de: c3b1 beqz a5,10722 <__register_exitproc+0x4c>
106e0: 4798 lw a4,8(a5)
106e2: 487d li a6,31
106e4: 06e84263 blt a6,a4,10748 <__register_exitproc+0x72>
106e8: c505 beqz a0,10710 <__register_exitproc+0x3a>
106ea: 00371813 slli a6,a4,0x3
106ee: 983e add a6,a6,a5
106f0: 10c83823 sd a2,272(a6)
106f4: 3107a883 lw a7,784(a5)
106f8: 4605 li a2,1
106fa: 00e6163b sllw a2,a2,a4
106fe: 00c8e8b3 or a7,a7,a2
10702: 3117a823 sw a7,784(a5)
10706: 20d83823 sd a3,528(a6)
1070a: 4689 li a3,2
1070c: 02d50063 beq a0,a3,1072c <__register_exitproc+0x56>
10710: 00270693 addi a3,a4,2
10714: 068e slli a3,a3,0x3
10716: 2705 addiw a4,a4,1
10718: c798 sw a4,8(a5)
1071a: 97b6 add a5,a5,a3
1071c: e38c sd a1,0(a5)
1071e: 4501 li a0,0
10720: 8082 ret
10722: 20070793 addi a5,a4,512
10726: 1ef73c23 sd a5,504(a4)
1072a: bf5d j 106e0 <__register_exitproc+0xa>
1072c: 3147a683 lw a3,788(a5)
10730: 4501 li a0,0
10732: 8e55 or a2,a2,a3
10734: 00270693 addi a3,a4,2
10738: 068e slli a3,a3,0x3
1073a: 2705 addiw a4,a4,1
1073c: 30c7aa23 sw a2,788(a5)
10740: c798 sw a4,8(a5)
10742: 97b6 add a5,a5,a3
10744: e38c sd a1,0(a5)
10746: 8082 ret
10748: 557d li a0,-1
1074a: 8082 ret
000000000001074c <_exit>:
1074c: 05d00893 li a7,93
10750: 00000073 ecall
10754: 00054363 bltz a0,1075a <_exit+0xe>
10758: a001 j 10758 <_exit+0xc>
1075a: 1141 addi sp,sp,-16
1075c: e022 sd s0,0(sp)
1075e: 842a mv s0,a0
10760: e406 sd ra,8(sp)
10762: 4080043b negw s0,s0
10766: 008000ef jal ra,1076e <__errno>
1076a: c100 sw s0,0(a0)
1076c: a001 j 1076c <_exit+0x20>
000000000001076e <__errno>:
1076e: f581b503 ld a0,-168(gp) # 11f68 <_impure_ptr>
10772: 8082 ret
.text
.balign 4
.global sgemm_nn
# RV64IDV system
#
# void
# sgemm_nn(size_t n,
# size_t m,
# size_t k,
# const int*a, // m * k matrix
# size_t lda,
# const int*b, // k * n matrix
# size_t ldb,
# int*c, // m * n matrix
# size_t ldc)
#
# c += a*b (alpha=1, no transpose on input matrices)
# matrices stored in C row-major order
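# Example (the C driver at the top of this gist): n = m = k = 4 and
# lda = ldb = ldc = 4, so element (i,j) of each row-major matrix sits at
# base + (i*4 + j) elements, i.e. base + (i*4 + j)*4 bytes.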
#define n a0
#define m a1
#define k a2
#define ap a3
#define astride a4
#define bp a5
#define bstride a6
#define cp a7
#define cstride t0
#define kt t1
#define nt t2
#define bnp t3
#define cnp t4
#define akp t5
#define bkp s0
#define nvl s1
#define ccp s2
#define amp s3
# This version holds an 8*VLMAX block of the C matrix in vector registers
# in the inner loop, but otherwise does no cache or TLB tiling.
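# Register picture for the unrolled inner loop (illustration):
#   v0..v7   the eight rows of the current C block, one vector each
#   v16      the current row-slice of B, shared by all eight accumulations
#   s4..s11  eight scalars taken from one column of the current A row block
# Each vmacc.vx then does C-row += A-scalar * B-row for one of the eight rows.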
sgemm_nn:
ld cstride, 8(sp) # Get arg from stack frame
addi sp, sp, -104 # Allocate frame for the 12 saved registers (released below)
sd s0, 96(sp)
sd s1, 88(sp)
sd s2, 80(sp)
sd s3, 72(sp)
sd s4, 64(sp)
sd s5, 56(sp)
sd s6, 48(sp)
sd s7, 40(sp)
sd s8, 32(sp)
sd s9, 16(sp)
sd s10, 8(sp)
sd s11, 0(sp)
# Check for zero size matrices
beqz n, exit
beqz m, exit
beqz k, exit
# Convert element strides to byte strides. (32-bit elements, so element 0 starts at byte 0, element 1 at byte 4, element 2 at byte 8, ...)
#ld cstride, 0(sp) # Get arg from stack frame
slli astride, astride, 2
slli bstride, bstride, 2
slli cstride, cstride, 2
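# (For the 4x4 int test above each stride is 4 elements, i.e. 16 bytes.)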
slti t6, m, 8
bnez t6, end_rows
c_row_loop: # Loop across rows of C blocks
mv nt, n # Initialize n counter for next row of C blocks
mv bnp, bp # Initialize B n-loop pointer to start
mv cnp, cp # Initialize C n-loop pointer
c_col_loop: # Loop across one row of C blocks
vsetvli nvl, nt, e32, m1, ta, ma # 32-bit vectors, LMUL=1
mv akp, ap # reset pointer into A to beginning
mv bkp, bnp # step to next column in B matrix
# Initialize current C submatrix block from memory.
vle32.v v0, (cnp); add ccp, cnp, cstride;
vle32.v v1, (ccp); add ccp, ccp, cstride;
vle32.v v2, (ccp); add ccp, ccp, cstride;
vle32.v v3, (ccp); add ccp, ccp, cstride;
vle32.v v4, (ccp); add ccp, ccp, cstride;
vle32.v v5, (ccp); add ccp, ccp, cstride;
vle32.v v6, (ccp); add ccp, ccp, cstride;
vle32.v v7, (ccp);
mv kt, k # Initialize inner loop counter
# Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
# Software pipeline loads (NOT IMPLEMENTED FOR NOW)
# lw s4, (akp); add amp, akp, astride;
# lw s5, (amp); add amp, amp, astride;
# lw s6, (amp); add amp, amp, astride;
# lw s7, (amp); add amp, amp, astride;
# lw s8, (amp); add amp, amp, astride;
# Get vector from B matrix
vle32.v v16, (bkp)
# Loop on inner dimension for current C block (unscheduled)
k_loop:
addi kt, kt, -1 # Decrement k counter
add bkp, bkp, bstride # address of next vector in B matrix
addi akp, akp, 4 # Move to next column of A
lw s4, (akp); add amp, akp, astride;
lw s5, (amp); add amp, amp, astride;
lw s6, (amp); add amp, amp, astride;
lw s7, (amp); add amp, amp, astride;
lw s8, (amp); add amp, amp, astride;
lw s9, (amp); add amp, amp, astride;
lw s10, (amp); add amp, amp, astride;
lw s11, (amp); add amp, amp, astride;
vmacc.vx v0, s4, v16
vmacc.vx v1, s5, v16
vmacc.vx v2, s6, v16
vmacc.vx v3, s7, v16
vmacc.vx v4, s8, v16
vmacc.vx v5, s9, v16
vmacc.vx v6, s10, v16
vmacc.vx v7, s11, v16
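# v0..v7 now hold partially accumulated C rows: each vmacc.vx above folded
# one scalar from the A column in s4..s11 times the shared B row in v16
# into its C-row accumulator.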
beqz kt, 1f # Exit out of loop if k=0 (go forward to label '1')
vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
j k_loop
1:
# Save C matrix block back to memory
vse32.v v0, (cnp); add ccp, cnp, cstride;
vse32.v v1, (ccp); add ccp, ccp, cstride;
vse32.v v2, (ccp); add ccp, ccp, cstride;
vse32.v v3, (ccp); add ccp, ccp, cstride;
vse32.v v4, (ccp); add ccp, ccp, cstride;
vse32.v v5, (ccp); add ccp, ccp, cstride;
vse32.v v6, (ccp); add ccp, ccp, cstride;
vse32.v v7, (ccp);
# Following tail instructions should be scheduled earlier in free slots during C block save.
# Leaving here for clarity.
# Bump pointers for loop across blocks in one row
slli t6, nvl, 2
add cnp, cnp, t6 # Move C block pointer over
add bnp, bnp, t6 # Move B block pointer over
sub nt, nt, nvl # Decrement element count in n dimension
bnez nt, c_col_loop # Any more to do?
# Move to next set of rows
addi m, m, -8 # Did 8 rows above
slli t6, astride, 3 # Multiply astride by 8
add ap, ap, t6 # Move A matrix pointer down 8 rows
slli t6, cstride, 3 # Multiply cstride by 8
add cp, cp, t6 # Move C matrix pointer down 8 rows
slti t6, m, 8
beqz t6, c_row_loop
# Handle end of matrix with fewer than 8 rows.
# Can use smaller versions of above decreasing in powers-of-2 depending on code-size concerns.
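# The tail loops below keep the same n/k loop structure but hold only one
# C row in v0 and load a single A scalar (s4) per inner iteration.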
end_rows: # Need to do rows 1 by 1
end_c_row_loop: # Loop across rows of C blocks
mv nt, n # Initialize n counter for next row of C blocks
mv bnp, bp # Initialize B n-loop pointer to start
mv cnp, cp # Initialize C n-loop pointer
end_c_col_loop: # Loop across one row of C blocks
vsetvli nvl, nt, e32, m1, ta, ma # 32-bit vectors, LMUL=1
mv akp, ap # reset pointer into A to beginning
mv bkp, bnp # step to next column in B matrix
# Initialize current C submatrix block from memory.
vle32.v v0, (cnp);
mv kt, k # Initialize inner loop counter
# Get vector from B matrix
vle32.v v16, (bkp)
# Loop on inner dimension for current C block (unscheduled)
end_k_loop:
addi kt, kt, -1 # Decrement k counter
add bkp, bkp, bstride # address of next vector in B matrix
addi akp, akp, 4 # Move to next column of A
lw s4, (akp); add amp, akp, astride;
vmacc.vx v0, s4, v16
beqz kt, 1f # Exit out of loop if k=0 (go forward to label '1')
vle32.v v16, (bkp) # Get next vector from B matrix, overlap loads with jump stalls
j end_k_loop
1:
# Save C matrix block back to memory
vse32.v v0, (cnp);
# Bump pointers for loop across blocks in one row
slli t6, nvl, 2
add cnp, cnp, t6 # Move C block pointer over
add bnp, bnp, t6 # Move B block pointer over
sub nt, nt, nvl # Decrement element count in n dimension
bnez nt, end_c_col_loop # Any more to do?
# Move to next row
addi m, m, -1
add ap, ap, astride # Move A matrix pointer down 1 row
add cp, cp, cstride # Move C matrix pointer down 1 row
bnez m, end_c_row_loop
exit:
ld s0, 96(sp)
ld s1, 88(sp)
ld s2, 80(sp)
ld s3, 72(sp)
ld s4, 64(sp)
ld s5, 56(sp)
ld s6, 48(sp)
ld s7, 40(sp)
ld s8, 32(sp)
ld s9, 16(sp)
ld s10, 8(sp)
ld s11, 0(sp)
addi sp, sp, 104
ret