Skip to content

Instantly share code, notes, and snippets.

@ssvb
Created February 16, 2020 12:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ssvb/6f57b3654fc224a895d96e730196b47b to your computer and use it in GitHub Desktop.
Save ssvb/6f57b3654fc224a895d96e730196b47b to your computer and use it in GitHub Desktop.
libjpeg-turbo floating point MIPS functions
$ cat test.c
/* Element type of the coefficient block written by quantize_float(). */
typedef short JCOEF;
/* Type of a single input sample read by convsamp_float(). */
typedef unsigned char JSAMPLE;
/* Type of the start_col column offset passed to convsamp_float(). */
typedef unsigned int JDIMENSION;
typedef JCOEF *JCOEFPTR; /* useful in a couple of places */
/* Widen a sample value to int. */
#define GETJSAMPLE(value) ((int)(value))
/* Offset subtracted from every sample in convsamp_float(). */
#define CENTERJSAMPLE 128
/* Floating-point type of the workspace and divisor arrays. */
#define FAST_FLOAT float
/* Number of rows, and of samples per row, processed by convsamp_float(). */
#define DCTSIZE 8
/* Number of elements processed by quantize_float(). */
#define DCTSIZE2 64
typedef JSAMPLE *JSAMPROW; /* ptr to one image row of pixel samples. */
typedef JSAMPROW *JSAMPARRAY; /* ptr to some rows (a 2-D sample array) */
/*
 * Load one DCTSIZE x DCTSIZE block of samples into a floating-point workspace.
 *
 * For each of the first DCTSIZE rows of sample_data, DCTSIZE consecutive
 * samples starting at column start_col are read, reduced by CENTERJSAMPLE and
 * stored as FAST_FLOAT.  The converted values are written to workspace
 * contiguously, one row after another (DCTSIZE * DCTSIZE values in total).
 */
void
convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
               FAST_FLOAT *workspace)
{
  FAST_FLOAT *dst = workspace;
  int row, col;

  for (row = 0; row < DCTSIZE; row++) {
    JSAMPROW src = sample_data[row] + start_col;

    for (col = 0; col < DCTSIZE; col++) {
      /* The subtraction is carried out in int, then converted to float. */
      dst[col] = (FAST_FLOAT)(GETJSAMPLE(src[col]) - CENTERJSAMPLE);
    }
    dst += DCTSIZE;
  }
}
/*
 * Quantize a block of DCTSIZE2 floating-point values into integer
 * coefficients.
 *
 * Each workspace[k] is multiplied by the matching divisors[k] (which therefore
 * acts as a multiplier combining quantization and scaling), rounded to the
 * nearest integer and stored in coef_block[k].
 */
void
quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
               FAST_FLOAT *workspace)
{
  int k;

  for (k = 0; k < DCTSIZE2; k++) {
    /* Apply the quantization and scaling factor in one multiplication. */
    FAST_FLOAT scaled = workspace[k] * divisors[k];

    /* Round to nearest by biasing the value into the positive range first:
     * the float-to-int cast truncates toward zero, so adding 16384.5 before
     * the cast and subtracting 16384 afterwards rounds correctly for negative
     * inputs as well.  This relies on the coefficient magnitude staying
     * below 16K (the stated maximum for 12-bit data), which also keeps the
     * intermediate value small enough for either 16-bit or 32-bit ints.
     */
    int biased = (int)(scaled + (FAST_FLOAT)16384.5);

    coef_block[k] = (JCOEF)(biased - 16384);
  }
}
$ mipsel-unknown-linux-gnu-gcc -O2 -c -march=mips32r2 -funroll-loops test.c && mipsel-unknown-linux-gnu-objdump -d test.o
test.o: file format elf32-tradlittlemips
Disassembly of section .text:
00000000 <convsamp_float>:
0: 8c830000 lw v1,0(a0)
4: 8c8a0004 lw t2,4(a0)
8: 00654021 addu t0,v1,a1
c: 91020000 lbu v0,0(t0)
10: 01456821 addu t5,t2,a1
14: 2447ff80 addiu a3,v0,-128
18: 44870000 mtc1 a3,$f0
1c: 46800060 cvt.s.w $f1,$f0
20: e4c10000 swc1 $f1,0(a2)
24: 91090001 lbu t1,1(t0)
28: 252bff80 addiu t3,t1,-128
2c: 448b1000 mtc1 t3,$f2
30: 468010e0 cvt.s.w $f3,$f2
34: e4c30004 swc1 $f3,4(a2)
38: 910c0002 lbu t4,2(t0)
3c: 258eff80 addiu t6,t4,-128
40: 448e2000 mtc1 t6,$f4
44: 46802160 cvt.s.w $f5,$f4
48: e4c50008 swc1 $f5,8(a2)
4c: 910f0003 lbu t7,3(t0)
50: 25f8ff80 addiu t8,t7,-128
54: 44983000 mtc1 t8,$f6
58: 468031e0 cvt.s.w $f7,$f6
5c: e4c7000c swc1 $f7,12(a2)
60: 91190004 lbu t9,4(t0)
64: 2723ff80 addiu v1,t9,-128
68: 44834000 mtc1 v1,$f8
6c: 46804260 cvt.s.w $f9,$f8
70: e4c90010 swc1 $f9,16(a2)
74: 91020005 lbu v0,5(t0)
78: 2447ff80 addiu a3,v0,-128
7c: 44875000 mtc1 a3,$f10
80: 468052e0 cvt.s.w $f11,$f10
84: e4cb0014 swc1 $f11,20(a2)
88: 91090006 lbu t1,6(t0)
8c: 252aff80 addiu t2,t1,-128
90: 448a6000 mtc1 t2,$f12
94: 46806360 cvt.s.w $f13,$f12
98: e4cd0018 swc1 $f13,24(a2)
9c: 91080007 lbu t0,7(t0)
a0: 250bff80 addiu t3,t0,-128
a4: 448b7000 mtc1 t3,$f14
a8: 468073e0 cvt.s.w $f15,$f14
ac: e4cf001c swc1 $f15,28(a2)
b0: 91ac0000 lbu t4,0(t5)
b4: 258eff80 addiu t6,t4,-128
b8: 448e8000 mtc1 t6,$f16
bc: 46808460 cvt.s.w $f17,$f16
c0: e4d10020 swc1 $f17,32(a2)
c4: 91af0001 lbu t7,1(t5)
c8: 25f8ff80 addiu t8,t7,-128
cc: 44989000 mtc1 t8,$f18
d0: 468094e0 cvt.s.w $f19,$f18
d4: e4d30024 swc1 $f19,36(a2)
d8: 91b90002 lbu t9,2(t5)
dc: 2723ff80 addiu v1,t9,-128
e0: 44830000 mtc1 v1,$f0
e4: 46800060 cvt.s.w $f1,$f0
e8: e4c10028 swc1 $f1,40(a2)
ec: 91a20003 lbu v0,3(t5)
f0: 2447ff80 addiu a3,v0,-128
f4: 44871000 mtc1 a3,$f2
f8: 468010e0 cvt.s.w $f3,$f2
fc: e4c3002c swc1 $f3,44(a2)
100: 91a90004 lbu t1,4(t5)
104: 252aff80 addiu t2,t1,-128
108: 448a2000 mtc1 t2,$f4
10c: 46802160 cvt.s.w $f5,$f4
110: e4c50030 swc1 $f5,48(a2)
114: 91a80005 lbu t0,5(t5)
118: 250bff80 addiu t3,t0,-128
11c: 448b3000 mtc1 t3,$f6
120: 468031e0 cvt.s.w $f7,$f6
124: e4c70034 swc1 $f7,52(a2)
128: 91ac0006 lbu t4,6(t5)
12c: 258eff80 addiu t6,t4,-128
130: 448e4000 mtc1 t6,$f8
134: 46804260 cvt.s.w $f9,$f8
138: e4c90038 swc1 $f9,56(a2)
13c: 91ad0007 lbu t5,7(t5)
140: 8c8f0008 lw t7,8(a0)
144: 25b8ff80 addiu t8,t5,-128
148: 44985000 mtc1 t8,$f10
14c: 01e5c821 addu t9,t7,a1
150: 468052e0 cvt.s.w $f11,$f10
154: e4cb003c swc1 $f11,60(a2)
158: 93220000 lbu v0,0(t9)
15c: 8c83000c lw v1,12(a0)
160: 2447ff80 addiu a3,v0,-128
164: 44876000 mtc1 a3,$f12
168: 00655021 addu t2,v1,a1
16c: 46806360 cvt.s.w $f13,$f12
170: e4cd0040 swc1 $f13,64(a2)
174: 93290001 lbu t1,1(t9)
178: 2528ff80 addiu t0,t1,-128
17c: 44887000 mtc1 t0,$f14
180: 468073e0 cvt.s.w $f15,$f14
184: e4cf0044 swc1 $f15,68(a2)
188: 932b0002 lbu t3,2(t9)
18c: 256cff80 addiu t4,t3,-128
190: 448c8000 mtc1 t4,$f16
194: 46808460 cvt.s.w $f17,$f16
198: e4d10048 swc1 $f17,72(a2)
19c: 932e0003 lbu t6,3(t9)
1a0: 25cdff80 addiu t5,t6,-128
1a4: 448d9000 mtc1 t5,$f18
1a8: 468094e0 cvt.s.w $f19,$f18
1ac: e4d3004c swc1 $f19,76(a2)
1b0: 932f0004 lbu t7,4(t9)
1b4: 25f8ff80 addiu t8,t7,-128
1b8: 44980000 mtc1 t8,$f0
1bc: 46800060 cvt.s.w $f1,$f0
1c0: e4c10050 swc1 $f1,80(a2)
1c4: 93220005 lbu v0,5(t9)
1c8: 2443ff80 addiu v1,v0,-128
1cc: 44831000 mtc1 v1,$f2
1d0: 468010e0 cvt.s.w $f3,$f2
1d4: e4c30054 swc1 $f3,84(a2)
1d8: 93270006 lbu a3,6(t9)
1dc: 24e9ff80 addiu t1,a3,-128
1e0: 44892000 mtc1 t1,$f4
1e4: 46802160 cvt.s.w $f5,$f4
1e8: e4c50058 swc1 $f5,88(a2)
1ec: 93390007 lbu t9,7(t9)
1f0: 2728ff80 addiu t0,t9,-128
1f4: 44883000 mtc1 t0,$f6
1f8: 468031e0 cvt.s.w $f7,$f6
1fc: e4c7005c swc1 $f7,92(a2)
200: 914b0000 lbu t3,0(t2)
204: 256cff80 addiu t4,t3,-128
208: 448c4000 mtc1 t4,$f8
20c: 46804260 cvt.s.w $f9,$f8
210: e4c90060 swc1 $f9,96(a2)
214: 914e0001 lbu t6,1(t2)
218: 25cdff80 addiu t5,t6,-128
21c: 448d5000 mtc1 t5,$f10
220: 468052e0 cvt.s.w $f11,$f10
224: e4cb0064 swc1 $f11,100(a2)
228: 914f0002 lbu t7,2(t2)
22c: 25f8ff80 addiu t8,t7,-128
230: 44986000 mtc1 t8,$f12
234: 46806360 cvt.s.w $f13,$f12
238: e4cd0068 swc1 $f13,104(a2)
23c: 91420003 lbu v0,3(t2)
240: 2443ff80 addiu v1,v0,-128
244: 44837000 mtc1 v1,$f14
248: 468073e0 cvt.s.w $f15,$f14
24c: e4cf006c swc1 $f15,108(a2)
250: 91470004 lbu a3,4(t2)
254: 24e9ff80 addiu t1,a3,-128
258: 44898000 mtc1 t1,$f16
25c: 46808460 cvt.s.w $f17,$f16
260: e4d10070 swc1 $f17,112(a2)
264: 91590005 lbu t9,5(t2)
268: 2728ff80 addiu t0,t9,-128
26c: 44889000 mtc1 t0,$f18
270: 468094e0 cvt.s.w $f19,$f18
274: e4d30074 swc1 $f19,116(a2)
278: 914b0006 lbu t3,6(t2)
27c: 8c8c0010 lw t4,16(a0)
280: 256eff80 addiu t6,t3,-128
284: 448e0000 mtc1 t6,$f0
288: 01856821 addu t5,t4,a1
28c: 46800060 cvt.s.w $f1,$f0
290: e4c10078 swc1 $f1,120(a2)
294: 914a0007 lbu t2,7(t2)
298: 8c820014 lw v0,20(a0)
29c: 254fff80 addiu t7,t2,-128
2a0: 448f1000 mtc1 t7,$f2
2a4: 00454821 addu t1,v0,a1
2a8: 468010e0 cvt.s.w $f3,$f2
2ac: e4c3007c swc1 $f3,124(a2)
2b0: 91b80000 lbu t8,0(t5)
2b4: 2703ff80 addiu v1,t8,-128
2b8: 44832000 mtc1 v1,$f4
2bc: 46802160 cvt.s.w $f5,$f4
2c0: e4c50080 swc1 $f5,128(a2)
2c4: 91a70001 lbu a3,1(t5)
2c8: 24f9ff80 addiu t9,a3,-128
2cc: 44993000 mtc1 t9,$f6
2d0: 468031e0 cvt.s.w $f7,$f6
2d4: e4c70084 swc1 $f7,132(a2)
2d8: 91a80002 lbu t0,2(t5)
2dc: 250bff80 addiu t3,t0,-128
2e0: 448b4000 mtc1 t3,$f8
2e4: 46804260 cvt.s.w $f9,$f8
2e8: e4c90088 swc1 $f9,136(a2)
2ec: 91ac0003 lbu t4,3(t5)
2f0: 258eff80 addiu t6,t4,-128
2f4: 448e5000 mtc1 t6,$f10
2f8: 468052e0 cvt.s.w $f11,$f10
2fc: e4cb008c swc1 $f11,140(a2)
300: 91aa0004 lbu t2,4(t5)
304: 254fff80 addiu t7,t2,-128
308: 448f6000 mtc1 t7,$f12
30c: 46806360 cvt.s.w $f13,$f12
310: e4cd0090 swc1 $f13,144(a2)
314: 91b80005 lbu t8,5(t5)
318: 2702ff80 addiu v0,t8,-128
31c: 44827000 mtc1 v0,$f14
320: 468073e0 cvt.s.w $f15,$f14
324: e4cf0094 swc1 $f15,148(a2)
328: 91a30006 lbu v1,6(t5)
32c: 2467ff80 addiu a3,v1,-128
330: 44878000 mtc1 a3,$f16
334: 46808460 cvt.s.w $f17,$f16
338: e4d10098 swc1 $f17,152(a2)
33c: 91ad0007 lbu t5,7(t5)
340: 25b9ff80 addiu t9,t5,-128
344: 44999000 mtc1 t9,$f18
348: 468094e0 cvt.s.w $f19,$f18
34c: e4d3009c swc1 $f19,156(a2)
350: 91280000 lbu t0,0(t1)
354: 250bff80 addiu t3,t0,-128
358: 448b0000 mtc1 t3,$f0
35c: 46800060 cvt.s.w $f1,$f0
360: e4c100a0 swc1 $f1,160(a2)
364: 912c0001 lbu t4,1(t1)
368: 258eff80 addiu t6,t4,-128
36c: 448e1000 mtc1 t6,$f2
370: 468010e0 cvt.s.w $f3,$f2
374: e4c300a4 swc1 $f3,164(a2)
378: 912a0002 lbu t2,2(t1)
37c: 254fff80 addiu t7,t2,-128
380: 448f2000 mtc1 t7,$f4
384: 46802160 cvt.s.w $f5,$f4
388: e4c500a8 swc1 $f5,168(a2)
38c: 91380003 lbu t8,3(t1)
390: 2702ff80 addiu v0,t8,-128
394: 44823000 mtc1 v0,$f6
398: 468031e0 cvt.s.w $f7,$f6
39c: e4c700ac swc1 $f7,172(a2)
3a0: 91230004 lbu v1,4(t1)
3a4: 2467ff80 addiu a3,v1,-128
3a8: 44874000 mtc1 a3,$f8
3ac: 46804260 cvt.s.w $f9,$f8
3b0: e4c900b0 swc1 $f9,176(a2)
3b4: 912d0005 lbu t5,5(t1)
3b8: 8c990018 lw t9,24(a0)
3bc: 25a8ff80 addiu t0,t5,-128
3c0: 44885000 mtc1 t0,$f10
3c4: 8c84001c lw a0,28(a0)
3c8: 468052e0 cvt.s.w $f11,$f10
3cc: e4cb00b4 swc1 $f11,180(a2)
3d0: 912b0006 lbu t3,6(t1)
3d4: 03256021 addu t4,t9,a1
3d8: 256eff80 addiu t6,t3,-128
3dc: 448e6000 mtc1 t6,$f12
3e0: 00852821 addu a1,a0,a1
3e4: 46806360 cvt.s.w $f13,$f12
3e8: e4cd00b8 swc1 $f13,184(a2)
3ec: 91290007 lbu t1,7(t1)
3f0: 252aff80 addiu t2,t1,-128
3f4: 448a7000 mtc1 t2,$f14
3f8: 468073e0 cvt.s.w $f15,$f14
3fc: e4cf00bc swc1 $f15,188(a2)
400: 918f0000 lbu t7,0(t4)
404: 25f8ff80 addiu t8,t7,-128
408: 44988000 mtc1 t8,$f16
40c: 46808460 cvt.s.w $f17,$f16
410: e4d100c0 swc1 $f17,192(a2)
414: 91820001 lbu v0,1(t4)
418: 2443ff80 addiu v1,v0,-128
41c: 44839000 mtc1 v1,$f18
420: 468094e0 cvt.s.w $f19,$f18
424: e4d300c4 swc1 $f19,196(a2)
428: 91870002 lbu a3,2(t4)
42c: 24edff80 addiu t5,a3,-128
430: 448d0000 mtc1 t5,$f0
434: 46800060 cvt.s.w $f1,$f0
438: e4c100c8 swc1 $f1,200(a2)
43c: 91990003 lbu t9,3(t4)
440: 2728ff80 addiu t0,t9,-128
444: 44881000 mtc1 t0,$f2
448: 468010e0 cvt.s.w $f3,$f2
44c: e4c300cc swc1 $f3,204(a2)
450: 918b0004 lbu t3,4(t4)
454: 256eff80 addiu t6,t3,-128
458: 448e2000 mtc1 t6,$f4
45c: 46802160 cvt.s.w $f5,$f4
460: e4c500d0 swc1 $f5,208(a2)
464: 91890005 lbu t1,5(t4)
468: 2524ff80 addiu a0,t1,-128
46c: 44843000 mtc1 a0,$f6
470: 468031e0 cvt.s.w $f7,$f6
474: e4c700d4 swc1 $f7,212(a2)
478: 918a0006 lbu t2,6(t4)
47c: 254fff80 addiu t7,t2,-128
480: 448f4000 mtc1 t7,$f8
484: 46804260 cvt.s.w $f9,$f8
488: e4c900d8 swc1 $f9,216(a2)
48c: 918c0007 lbu t4,7(t4)
490: 2598ff80 addiu t8,t4,-128
494: 44985000 mtc1 t8,$f10
498: 468052e0 cvt.s.w $f11,$f10
49c: e4cb00dc swc1 $f11,220(a2)
4a0: 90a20000 lbu v0,0(a1)
4a4: 2443ff80 addiu v1,v0,-128
4a8: 44836000 mtc1 v1,$f12
4ac: 46806360 cvt.s.w $f13,$f12
4b0: e4cd00e0 swc1 $f13,224(a2)
4b4: 90a70001 lbu a3,1(a1)
4b8: 24edff80 addiu t5,a3,-128
4bc: 448d7000 mtc1 t5,$f14
4c0: 468073e0 cvt.s.w $f15,$f14
4c4: e4cf00e4 swc1 $f15,228(a2)
4c8: 90b90002 lbu t9,2(a1)
4cc: 2728ff80 addiu t0,t9,-128
4d0: 44888000 mtc1 t0,$f16
4d4: 46808460 cvt.s.w $f17,$f16
4d8: e4d100e8 swc1 $f17,232(a2)
4dc: 90ab0003 lbu t3,3(a1)
4e0: 256eff80 addiu t6,t3,-128
4e4: 448e9000 mtc1 t6,$f18
4e8: 468094e0 cvt.s.w $f19,$f18
4ec: e4d300ec swc1 $f19,236(a2)
4f0: 90a90004 lbu t1,4(a1)
4f4: 2524ff80 addiu a0,t1,-128
4f8: 44840000 mtc1 a0,$f0
4fc: 46800060 cvt.s.w $f1,$f0
500: e4c100f0 swc1 $f1,240(a2)
504: 90aa0005 lbu t2,5(a1)
508: 254fff80 addiu t7,t2,-128
50c: 448f1000 mtc1 t7,$f2
510: 468010e0 cvt.s.w $f3,$f2
514: e4c300f4 swc1 $f3,244(a2)
518: 90ac0006 lbu t4,6(a1)
51c: 2598ff80 addiu t8,t4,-128
520: 44982000 mtc1 t8,$f4
524: 46802160 cvt.s.w $f5,$f4
528: e4c500f8 swc1 $f5,248(a2)
52c: 90a50007 lbu a1,7(a1)
530: 24a2ff80 addiu v0,a1,-128
534: 44823000 mtc1 v0,$f6
538: 468031e0 cvt.s.w $f7,$f6
53c: 03e00008 jr ra
540: e4c700fc swc1 $f7,252(a2)
00000544 <quantize_float>:
544: 3c1c0000 lui gp,0x0
548: 279c0000 addiu gp,gp,0
54c: 0399e021 addu gp,gp,t9
550: 8f820000 lw v0,0(gp)
554: 24cc0100 addiu t4,a2,256
558: c4400000 lwc1 $f0,0(v0)
55c: c4c80000 lwc1 $f8,0(a2)
560: c4b00000 lwc1 $f16,0(a1)
564: c4c70004 lwc1 $f7,4(a2)
568: c4af0004 lwc1 $f15,4(a1)
56c: c4c60008 lwc1 $f6,8(a2)
570: c4ae0008 lwc1 $f14,8(a1)
574: c4c5000c lwc1 $f5,12(a2)
578: c4ad000c lwc1 $f13,12(a1)
57c: c4c40010 lwc1 $f4,16(a2)
580: c4ac0010 lwc1 $f12,16(a1)
584: c4c30014 lwc1 $f3,20(a2)
588: c4ab0014 lwc1 $f11,20(a1)
58c: c4c20018 lwc1 $f2,24(a2)
590: c4aa0018 lwc1 $f10,24(a1)
594: c4c1001c lwc1 $f1,28(a2)
598: c4a9001c lwc1 $f9,28(a1)
59c: 4c104460 madd.s $f17,$f0,$f8,$f16
5a0: 4c0f3ca0 madd.s $f18,$f0,$f7,$f15
5a4: 4c0e34e0 madd.s $f19,$f0,$f6,$f14
5a8: 4c0d2c20 madd.s $f16,$f0,$f5,$f13
5ac: 4c0c23e0 madd.s $f15,$f0,$f4,$f12
5b0: 4c0b1ba0 madd.s $f14,$f0,$f3,$f11
5b4: 4c0a1360 madd.s $f13,$f0,$f2,$f10
5b8: 4c090b20 madd.s $f12,$f0,$f1,$f9
5bc: 46008a0d trunc.w.s $f8,$f17
5c0: 460091cd trunc.w.s $f7,$f18
5c4: 4600998d trunc.w.s $f6,$f19
5c8: 4600814d trunc.w.s $f5,$f16
5cc: 4600790d trunc.w.s $f4,$f15
5d0: 460070cd trunc.w.s $f3,$f14
5d4: 46006acd trunc.w.s $f11,$f13
5d8: 4600608d trunc.w.s $f2,$f12
5dc: 440d4000 mfc1 t5,$f8
5e0: 440b3800 mfc1 t3,$f7
5e4: 440a3000 mfc1 t2,$f6
5e8: 44092800 mfc1 t1,$f5
5ec: 44082000 mfc1 t0,$f4
5f0: 44071800 mfc1 a3,$f3
5f4: 44035800 mfc1 v1,$f11
5f8: 440e1000 mfc1 t6,$f2
5fc: 25afc000 addiu t7,t5,-16384
600: 2578c000 addiu t8,t3,-16384
604: 2559c000 addiu t9,t2,-16384
608: 252dc000 addiu t5,t1,-16384
60c: 250bc000 addiu t3,t0,-16384
610: 24eac000 addiu t2,a3,-16384
614: 2469c000 addiu t1,v1,-16384
618: 25c2c000 addiu v0,t6,-16384
61c: 24c60020 addiu a2,a2,32
620: a48f0000 sh t7,0(a0)
624: a4980002 sh t8,2(a0)
628: a4990004 sh t9,4(a0)
62c: a48d0006 sh t5,6(a0)
630: a48b0008 sh t3,8(a0)
634: a48a000a sh t2,10(a0)
638: a489000c sh t1,12(a0)
63c: a482000e sh v0,14(a0)
640: 24a50020 addiu a1,a1,32
644: 1586ffc5 bne t4,a2,55c <quantize_float+0x18>
648: 24840010 addiu a0,a0,16
64c: 03e00008 jr ra
650: 00000000 nop
@ssvb
Copy link
Author

ssvb commented Feb 16, 2020

For comparison, here is the objdump output for jsimd_quantize_float_dspr2 from https://github.com/libjpeg-turbo/libjpeg-turbo/blob/master/simd/mips/jsimd_dspr2.S#L2816:

00004f6c <jsimd_quantize_float_dspr2>:
    4f6c:	3c094680 	lui	t1,0x4680
    4f70:	35290100 	ori	t1,t1,0x100
    4f74:	44890000 	mtc1	t1,$f0
    4f78:	2408003f 	li	t0,63
    4f7c:	c4c20000 	lwc1	$f2,0(a2)
    4f80:	c4aa0000 	lwc1	$f10,0(a1)
    4f84:	c4c40004 	lwc1	$f4,4(a2)
    4f88:	c4ac0004 	lwc1	$f12,4(a1)
    4f8c:	c4c60008 	lwc1	$f6,8(a2)
    4f90:	c4ae0008 	lwc1	$f14,8(a1)
    4f94:	c4c8000c 	lwc1	$f8,12(a2)
    4f98:	c4b0000c 	lwc1	$f16,12(a1)
    4f9c:	4c0a10a0 	madd.s	$f2,$f0,$f2,$f10
    4fa0:	4c0c2120 	madd.s	$f4,$f0,$f4,$f12
    4fa4:	4c0e31a0 	madd.s	$f6,$f0,$f6,$f14
    4fa8:	4c104220 	madd.s	$f8,$f0,$f8,$f16
    4fac:	c4aa0010 	lwc1	$f10,16(a1)
    4fb0:	c4ac0014 	lwc1	$f12,20(a1)
    4fb4:	4600108d 	trunc.w.s	$f2,$f2
    4fb8:	4600210d 	trunc.w.s	$f4,$f4
    4fbc:	4600318d 	trunc.w.s	$f6,$f6
    4fc0:	4600420d 	trunc.w.s	$f8,$f8
    4fc4:	c4ae0018 	lwc1	$f14,24(a1)
    4fc8:	c4b0001c 	lwc1	$f16,28(a1)
    4fcc:	44091000 	mfc1	t1,$f2
    4fd0:	440a2000 	mfc1	t2,$f4
    4fd4:	440b3000 	mfc1	t3,$f6
    4fd8:	440c4000 	mfc1	t4,$f8
    4fdc:	c4c20010 	lwc1	$f2,16(a2)
    4fe0:	c4c40014 	lwc1	$f4,20(a2)
    4fe4:	c4c60018 	lwc1	$f6,24(a2)
    4fe8:	c4c8001c 	lwc1	$f8,28(a2)
    4fec:	4c0a10a0 	madd.s	$f2,$f0,$f2,$f10
    4ff0:	4c0c2120 	madd.s	$f4,$f0,$f4,$f12
    4ff4:	4c0e31a0 	madd.s	$f6,$f0,$f6,$f14
    4ff8:	4c104220 	madd.s	$f8,$f0,$f8,$f16
    4ffc:	2529c000 	addiu	t1,t1,-16384
    5000:	254ac000 	addiu	t2,t2,-16384
    5004:	256bc000 	addiu	t3,t3,-16384
    5008:	258cc000 	addiu	t4,t4,-16384
    500c:	4600108d 	trunc.w.s	$f2,$f2
    5010:	4600210d 	trunc.w.s	$f4,$f4
    5014:	4600318d 	trunc.w.s	$f6,$f6
    5018:	4600420d 	trunc.w.s	$f8,$f8
    501c:	a4890000 	sh	t1,0(a0)
    5020:	a48a0002 	sh	t2,2(a0)
    5024:	a48b0004 	sh	t3,4(a0)
    5028:	a48c0006 	sh	t4,6(a0)
    502c:	44091000 	mfc1	t1,$f2
    5030:	440a2000 	mfc1	t2,$f4
    5034:	440b3000 	mfc1	t3,$f6
    5038:	440c4000 	mfc1	t4,$f8
    503c:	2508fff8 	addiu	t0,t0,-8
    5040:	24c60020 	addiu	a2,a2,32
    5044:	24a50020 	addiu	a1,a1,32
    5048:	2529c000 	addiu	t1,t1,-16384
    504c:	254ac000 	addiu	t2,t2,-16384
    5050:	256bc000 	addiu	t3,t3,-16384
    5054:	258cc000 	addiu	t4,t4,-16384
    5058:	a4890008 	sh	t1,8(a0)
    505c:	a48a000a 	sh	t2,10(a0)
    5060:	a48b000c 	sh	t3,12(a0)
    5064:	a48c000e 	sh	t4,14(a0)
    5068:	0501ffc4 	bgez	t0,4f7c <jsimd_quantize_float_dspr2+0x10>
    506c:	24840010 	addiu	a0,a0,16
    5070:	03e00008 	jr	ra
    5074:	00000000 	nop

The hand-optimized jsimd_quantize_float_dspr2 assembly function contains 67 instructions. The GCC 6.5.0 output for quantize_float contains 68 instructions, mostly the same ones in a slightly different order.

@ssvb
Copy link
Author

ssvb commented Feb 16, 2020

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment