Skip to content

Instantly share code, notes, and snippets.

@Megawats777
Created April 9, 2020 00:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Megawats777/40a3dd389c2fa9e9b9ad5494762dcecd to your computer and use it in GitHub Desktop.
Save Megawats777/40a3dd389c2fa9e9b9ad5494762dcecd to your computer and use it in GitHub Desktop.
AarcheProfileResults-01
Same use case: decoding a 1080p video 1000 times.
Profiling was done using the Linux "perf" tool.
Compiler settings:
- optimization level: -Og
- debug info: -g
------------------------------------------------------------------
Run #1:
Expensive Functions:
10.31% h264dec libc-2.27.so [.] __memcpy_generic
8.16% h264dec libc-2.27.so [.] __GI___memset_generic
6.73% h264dec h264dec [.] WelsDec::WelsResidualBlockCavlc
4.81% h264dec h264dec [.] WelsDec::CavlcGetLevelVal
__memcpy_generic - Expensive Lines:
6.16 │ ldp x6, x7, [x1, #16]
13.58 │ stp x8, x9, [x3, #32]
6.34 │ ldp x8, x9, [x1, #32]
15.58 │ stp x10, x11, [x3, #48]
7.39 │ ldp x10, x11, [x1, #48]
16.48 │ stp x12, x13, [x3, #64]!
7.19 │ ldp x12, x13, [x1, #64]!
0.04 │ subs x2, x2, #0x40
15.76 │ ↑ b.hi 110
__GI___memset_generic - Expensive Lines:
11.64 │100: dc zva, x3
52.78 │ add x3, x3, #0x40
│ subs x2, x2, #0x40
10.26 │ ↑ b.hi 100
WelsDec::WelsResidualBlockCavlc - Expensive Lines:
│ for (i = uiTotalCoeff - 1; i >= 0; --i) { //FIXME merge into rundecode?
1.11 │ sub w0, w0, #0x1
0.34 │ ↓ tbnz w0, #31, 5a0
│ iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
1.40 │ add x1, sp, #0xd8
2.11 │ ldr w1, [x1, w0, sxtw #2]
2.11 │ add w1, w1, #0x1
1.09 │ add w3, w3, w1
│ j = kpZigzagTable[ iCoeffNum ];
3.20 │ ldrb w1, [x26, w3, sxtw]
│ if (!pCtx->bUseScalingList) {
0.19 │ add x2, x22, #0x8a, lsl #12
2.83 │ ldrb w2, [x2, #3565]
2.15 │ ↑ cbnz w2, 44514c <WelsDec::WelsResidualBlockCavlc(WelsDec::TagVlcTable*, 4f0
│ pTCoeff[j] = (iLevel[i] * kpDequantCoeff[j & 0x07]);
0.03 │ add x2, sp, #0x98
2.14 │ ldr w2, [x2, w0, sxtw #2]
│ ubfiz x4, x1, #1, #3
1.15 │ ubfiz x1, x1, #1, #8
2.17 │ ldrh w4, [x20, x4]
3.56 │ mul w2, w2, w4
1.03 │ strh w2, [x23, x1]
WelsDec::CavlcGetLevelVal - Expensive Lines:
2.37 │ ldr w7, [x1]
│ GetPrefixBits():
2.46 │ ands w11, w7, #0xffff0000
0.28 │ → b.eq 442e34 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x94> // b.none
│ uiValue >>= 16;
0.81 │ lsr w8, w7, #16
│ iNumBit += 16;
0.29 │ mov w11, #0x10 // #16
│ }
│ if (uiValue & 0xff00) {
0.80 │ tst w8, #0xff00
0.82 │ → b.eq 442f8c <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x1ec> // b.none
│ uiValue >>= 8;
0.91 │ lsr w8, w8, #8
│ iNumBit += 8;
0.26 │ add w11, w11, #0x8
│ }
│ if (uiValue & 0xf0) {
0.97 │ tst w8, #0xf0
0.38 │ → b.eq 442f9c <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x1fc> // b.none
│ uiValue >>= 4;
0.86 │ lsr w8, w8, #4
│ iNumBit += 4;
0.30 │ add w11, w11, #0x4
│ }
│ iNumBit += g_kuiPrefix8BitsTable[uiValue];
1.16 │ adrp x4, WelsDec::g_ksInterBSubMbTypeInfo+0x60
0.43 │ add x4, x4, #0x28
2.90 │ ldr w4, [x4, w8, uxtw #2]
3.07 │ add w11, w4, w11
│ return (32 - iNumBit);
│ mov w4, #0x20 // #32
1.11 │ sub w4, w4, w11
│ _ZN7WelsDecL16CavlcGetLevelValEPiPNS_16TagReadBitsCacheEhh():
│ if (iPrefixBits > MAX_LEVEL_PREFIX + 1) //iPrefixBits includes leading "0"s and first "1", should +1
1.10 │ cmp w4, #0x10
0.04 │ → b.gt 44304c <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x2ac>
│ POP_BUFFER (pBitsCache, iPrefixBits);
1.11 │ lsl w8, w7, w4
0.25 │ str w8, [x1]
1.16 │ ldrb w7, [x1, #4]
2.23 │ sub w7, w7, w4
1.05 │ and w7, w7, #0xff
0.02 │ strb w7, [x1, #4]
│ iUsedBits += iPrefixBits;
0.98 │ add w0, w0, w4
│ iLevelPrefix = iPrefixBits - 1;
0.03 │ sub w4, w4, #0x1
│ iLevelCode = iLevelPrefix << iSuffixLength; //differ
1.11 │ lsl w12, w4, w6
0.25 │ str w8, [x1]
1.16 │ ldrb w7, [x1, #4]
2.23 │ sub w7, w7, w4
1.05 │ and w7, w7, #0xff
0.02 │ strb w7, [x1, #4]
│ iUsedBits += iPrefixBits;
0.98 │ add w0, w0, w4
│ iLevelPrefix = iPrefixBits - 1;
0.03 │ sub w4, w4, #0x1
│ iLevelCode = iLevelPrefix << iSuffixLength; //differ
1.11 │ lsl w12, w4, w6
│ if (iLevelPrefix >= 14) {
0.04 │ cmp w4, #0xd
1.07 │ → b.le 442e3c <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x9c>
│ if (14 == iLevelPrefix && 0 == iSuffixLength)
0.09 │ cmp w4, #0xe
0.01 │ cset w11, eq // eq = none
0.03 │ cmp w6, #0x0
0.02 │ csel w11, w11, wzr, eq // eq = none
0.02 │ ↓ cbnz w11, 443014 <WelsDec::CavlcGetLevelVal(int*, 274
│ else if (15 == iLevelPrefix) {
│ cmp w4, #0xf
0.26 │ → b.ne 442e3c <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x9c> // b.any
│ if (iSuffixLength == 0)
0.01 │ ↓ cbnz w6, 44301c <WelsDec::CavlcGetLevelVal(int*, 27c
│ iLevelCode += 15;
│ add w12, w12, #0xf
│ iSuffixLengthSize = 12;
0.02 │ mov w11, #0xc // #12
0.02 │ → b 442e48 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0xa8>
│ iSuffixLengthSize = 4;
0.04 │274: mov w11, #0x4 // #4
0.01 │ → b 442e48 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0xa8>
│ iSuffixLengthSize = 12;
│27c: mov w11, #0xc // #12
│ → b 442e48 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0xa8>
│ iLevelCode += ((i == uiTrailingOnes) && (uiTrailingOnes < 3)) << 1;
1.39 │ cmp w3, #0x2
0.15 │ → b.ls 443034 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x294> // b.plast
0.38 │ mov w4, #0x0 // #0
0.08 │ → b 442ec0 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x120>
0.76 │ mov w4, #0x1 // #1
0.15 │ → b 442ec0 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x120>
│ iSuffixLength += ((iLevel[i] > iThreshold) || (iLevel[i] < -iThreshold)) && (iSuffixLength < 6);
1.03 │ mov w4, #0x0 // #0
0.04 │ → b 442f10 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x170>
0.30 │ mov w4, #0x1 // #1
0.08 │ → b 442f10 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x170>
│ return -1;
│ mov w0, #0xffffffff // #-1
│ }
│ return iUsedBits;
│ }
2.08 │ adrp x1, __FRAME_END__+0x16798
│ ldr x1, [x1, #2776]
0.63 │ ldr x2, [sp, #24]
0.66 │ ldr x1, [x1]
0.73 │ eor x1, x2, x1
0.33 │ ↓ cbnz x1, 443070 <WelsDec::CavlcGetLevelVal(int*, 2d0
0.48 │ ldp x29, x30, [sp], #32
0.72 │ ← ret
------------------------------------------------------------------
Run #2:
Expensive Functions:
10.34% h264dec libc-2.27.so [.] __memcpy_generic
8.16% h264dec libc-2.27.so [.] __GI___memset_generic
6.76% h264dec h264dec [.] WelsDec::WelsResidualBlockCavlc
4.78% h264dec h264dec [.] WelsDec::CavlcGetLevelVal
__memcpy_generic - Expensive Lines:
6.34 │ ldp x6, x7, [x1, #16]
13.49 │ stp x8, x9, [x3, #32]
6.37 │ ldp x8, x9, [x1, #32]
15.66 │ stp x10, x11, [x3, #48]
7.38 │ ldp x10, x11, [x1, #48]
16.47 │ stp x12, x13, [x3, #64]!
7.08 │ ldp x12, x13, [x1, #64]!
0.04 │ subs x2, x2, #0x40
15.88 │ ↑ b.hi 110
__GI___memset_generic - Expensive Lines:
11.60 │100: dc zva, x3
52.57 │ add x3, x3, #0x40
│ subs x2, x2, #0x40
10.40 │ ↑ b.hi 100
WelsDec::WelsResidualBlockCavlc - Expensive Lines:
1.10 │ sub w0, w0, #0x1
0.30 │ ↓ tbnz w0, #31, 5a0
│ iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
1.35 │ add x1, sp, #0xd8
2.25 │ ldr w1, [x1, w0, sxtw #2]
2.21 │ add w1, w1, #0x1
1.14 │ add w3, w3, w1
│ j = kpZigzagTable[ iCoeffNum ];
3.33 │ ldrb w1, [x26, w3, sxtw]
│ if (!pCtx->bUseScalingList) {
0.20 │ add x2, x22, #0x8a, lsl #12
2.77 │ ldrb w2, [x2, #3565]
2.21 │ ↑ cbnz w2, 44514c <WelsDec::WelsResidualBlockCavlc(WelsDec::TagVlcTable*, 4f0
│ pTCoeff[j] = (iLevel[i] * kpDequantCoeff[j & 0x07]);
0.03 │ add x2, sp, #0x98
2.09 │ ldr w2, [x2, w0, sxtw #2]
│ ubfiz x4, x1, #1, #3
1.23 │ ubfiz x1, x1, #1, #8
2.16 │ ldrh w4, [x20, x4]
3.50 │ mul w2, w2, w4
1.03 │ strh w2, [x23, x1]
WelsDec::CavlcGetLevelVal - Expensive Lines:
1.81 │ cmp w10, w5
1.02 │ → b.eq 443024 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x284> // b.none
0.10 │ mov w4, #0x0 // #0
1.68 │ add w4, w12, w4, lsl #1
│ iLevel[i] = ((iLevelCode + 2) >> 1);
1.12 │ add w7, w4, #0x2
2.21 │ asr w7, w7, #1
│ iLevel[i] -= (iLevel[i] << 1) & (- (iLevelCode & 0x01));
│ sbfx x4, x4, #0, #1
2.26 │ and w4, w4, w7, lsl #1
1.11 │ sub w4, w7, w4
0.01 │ str w4, [x9, w5, sxtw #2]
│ iSuffixLength += !iSuffixLength;
1.11 │ cmp w6, #0x0
0.02 │ cinc w6, w6, eq // eq = none
│ iThreshold = 3 << (iSuffixLength - 1);
2.25 │ sub w8, w6, #0x1
0.01 │ mov w7, #0x3 // #3
2.18 │ lsl w7, w7, w8
│ iSuffixLength += ((iLevel[i] > iThreshold) || (iLevel[i] < -iThreshold)) && (iSuffixLength < 6);
0.02 │ cmp w4, w7
0.99 │ → b.gt 442f04 <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x164>
0.09 │ neg w7, w7
0.92 │ cmp w4, w7
0.01 │ → b.ge 44303c <WelsDec::CavlcGetLevelVal(int*, WelsDec::TagReadBitsCache*, unsigned char, unsigned char)+0x29c> // b.tcont
0.56 │ cmp w6, #0x5
3.02 │ ldr w4, [x4, w8, uxtw #2]
3.13 │ add w11, w4, w11
2.14 │ adrp x1, __FRAME_END__+0x16798
│ ldr x1, [x1, #2776]
0.53 │ ldr x2, [sp, #24]
0.57 │ ldr x1, [x1]
0.72 │ eor x1, x2, x1
0.34 │ ↓ cbnz x1, 443070 <WelsDec::CavlcGetLevelVal(int*, 2d0
0.48 │ ldp x29, x30, [sp], #32
0.70 │ ← ret
│2d0: → bl __stack_chk_fail@plt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment