bandito/icmdct.c

## icmdct.c
#include "typedefs.h"
#include "imdct.h"

/* Fixed data-memory address that MD_IMDCT_Init() points `spectrum` at. */
#define SPECTRUM_POS *(i16 *)0x1080 /* dereferenced i16 at 0x1080 */

/* Scratch array for the IDCT stages of MD_IMDCT(). */
static i16 tmp[18];

/*
 * State of the polyphase synthesis in MD_Polyphase(), indexed as
 * u[ch][div][row][start].
 */
static i16 u[2][2][17][16]; /* no v[][], it's redundant */
static int u_start[2]={0,0}; /* first element of u[][] */
static int u_div[2]={0,0}; /* which part of u[][] is currently used */

/* Base of the sample buffer; channel ch starts at spectrum[ch*576]. */
static i16 *spectrum;

/* Interleaved PCM output written by MD_Polyphase(); placed in its own section. */
#pragma DATA_SECTION(PcmBuffer, "PCM_BUFFER")
u16 PcmBuffer[6*2304];

/* Defined elsewhere; used as the granule index when addressing PcmBuffer. */
extern u16 GrannulesInBuffer;
/*
 * Second half of the last IMDCT output, added to the next block's first half.
 * NOTE(review): this is a single 18-entry array, so every subband and channel
 * processed by MD_IMDCT() shares it -- confirm that this is intended.
 */
static i16 prev[18];


/*
 * MD_IMDCT - fixed-point inverse MDCT with windowing and overlap-add.
 *
 * Works in place on the global `spectrum` buffer: for each subband
 * sb in [0, no_of_imdcts) the 18 values at spectrum[ch*576 + sb*18] are
 * replaced by 18 output samples.
 *
 *   win_type      2 selects short blocks (three 12-point IMDCTs whose
 *                 outputs are overlapped and added into out[36]); any other
 *                 value runs one 36-point IMDCT and uses it as the row index
 *                 into win[][].
 *   ch            channel, selects the 576-sample region of `spectrum`.
 *   gr            not used by this function.
 *   no_of_imdcts  number of subbands to transform.
 *
 * For odd subbands the odd-numbered output samples are negated.  The second
 * half of every transform is kept in `prev` and added to the first half of
 * the next one.
 *
 * The multiplications go through q15_mul / q15_q13_mul / q15_q11_mul, which
 * are declared elsewhere; judging by the constants the suffix names the
 * Q format of the second operand.
 *
 * NOTE(review): `prev` holds a single 18-sample overlap that is shared by all
 * subbands and channels -- confirm whether per-subband storage is required.
 */
void MD_IMDCT(i16 win_type,i16 ch,i16 gr, u16 no_of_imdcts)
{
	register i16 save;
	i16 pp1, pp2;
	i16 i, p, ss;
	i16 *in;
	i16 out[36];
	i16 n;
   	i16 sb;
	i16 tmp0,tmp1,tmp2,tmp3,tmp4,tmp0_,tmp1_,tmp2_,tmp3_;
	i16 tmp0o,tmp1o,tmp2o,tmp3o,tmp4o,tmp0_o,tmp1_o,tmp2_o,tmp3_o;
	i16 i0;
	i16 i0p12;
	i16 i6_;
	i16 e,o;

	for (sb=0; sb<no_of_imdcts; sb++)
	{

		in = &spectrum[ch*576+sb*18];

		if(win_type == 2)
	    {
			/* the three short transforms are accumulated into out[] */
			for(p=0;p<36;p++)
				out[p] = 0;

			for(ss=0;ss<18;ss+=6)
			{

				/*
				 *  12 point IMDCT
				 */

				/* Begin 12 point IDCT */

				/* Input aliasing for 12 pt IDCT */
				in[5+ss]+=in[4+ss];
				in[4+ss]+=in[3+ss];
				in[3+ss]+=in[2+ss];
				in[2+ss]+=in[1+ss];
				in[1+ss]+=in[0+ss];

				/* Input aliasing on odd indices (for 6 point IDCT) */
				in[5+ss] += in[3+ss];
				in[3+ss]  += in[1+ss];

				/* 3 point IDCT on even indices */
				/* Initially supposing that the Q15 format will never overflow */
				pp2 = q15_mul(in[4+ss],0x4000); /* 0.5f */
				pp1 = q15_mul(in[2+ss],0x6ED9); /* 0.866025403f */
				save = in[0+ss] + pp2;
				tmp[1] = in[0+ss] - in[4+ss];
				tmp[0] = save + pp1;
				tmp[2] = save - pp1;

				/* End 3 point IDCT on even indices */

				/* 3 point IDCT on odd indices (for 6 point IDCT) */

				pp2 = q15_mul(in[5+ss],0x4000); /* 0.5f */
				pp1 = q15_mul(in[3+ss],0x6ED9); /* 0.866025403f */
				save = in[1+ss] + pp2;
				tmp[4] = in[1+ss] - in[5+ss];
				tmp[5] = save + pp1;
				tmp[3] = save - pp1;

				/* End 3 point IDCT on odd indices */

				/* Twiddle factors on odd indices (for 6 point IDCT) */

				tmp[3] = q15_q13_mul(tmp[3],0x3DD1); /* 1.931851653f */
				tmp[4] = q15_mul(tmp[4],0x5A82); /* 0.707106781f */
				tmp[5] = q15_mul(tmp[5],0x4241); /* 0.517638090f */

				/* Output butterflies on 2 3 point IDCT's (for 6 point IDCT) */

				save = tmp[0];
				tmp[0] += tmp[5];
				tmp[5] = save - tmp[5];

				save = tmp[1];
				tmp[1] += tmp[4];
				tmp[4] = save - tmp[4];

				save = tmp[2];
				tmp[2] += tmp[3];
				tmp[3] = save - tmp[3];

				/* End 6 point IDCT */

				/* Twiddle factors on indices (for 12 point IDCT) */

				tmp[0] = q15_mul(tmp[0],0x408D); /* 0.504314480f */
				tmp[1] = q15_mul(tmp[1],0x4545); /* 0.541196100f */
				tmp[2] = q15_mul(tmp[2],0x50AB); /* 0.630236207f */
				tmp[3] = q15_mul(tmp[3],0x6921); /* 0.821339815f */
				tmp[4] = q15_q13_mul(tmp[4],0x29CF); /* 1.306562965f */
				/* each element is scaled by its own factor: the source is
				 * tmp[5], not the already scaled tmp[4] */
				tmp[5] = q15_q13_mul(tmp[5],0x7A94); /* 3.830648788f */

				/* End 12 point IDCT */

				/* Shift to 12 point modified IDCT, multiply by window type 2 */
				/* 0x9A73 is -0.793353340 in Q15, the same constant as used
				 * for tmp[3] below */
				tmp[8]  = q15_mul(tmp[0], 0x9A73);	/* -0.793353340f */
				tmp[9]  = q15_mul(tmp[0], 0xB214);	/* -0.608761429f */
				tmp[7]  = q15_mul(tmp[1], 0x89BE);	/* -0.923879532f */
				tmp[10] = q15_mul(tmp[1], 0xCF04);	/* -0.382683432f */
				tmp[6]  = q15_mul(tmp[2], 0x8118);	/* -0.991444861f */
				tmp[11] = q15_mul(tmp[2], 0xEF4A);	/* -0.130526192f */

				tmp[0]  = tmp[3];
				tmp[1]  = q15_mul(tmp[4],0x30FB); /* 0.382683432f */
				tmp[2]  = q15_mul(tmp[5],0x4DEB); /* 0.608761429f */

				tmp[3]  = q15_mul(tmp[5],0x9A73); /* -0.793353340f */
				tmp[4]  = q15_mul(tmp[4],0x89BE); /* -0.923879532f */
				tmp[5]  = q15_mul(tmp[0],0x8118); /* -0.991444861f */

				tmp[0] = q15_mul(tmp[0],0x10B5); /* 0.130526192f */


				/* overlap-add this short block at offset ss+6 */
				for (n=6; n<18; n++)
					out[ss + n]  += tmp[n-6];
			}
			/* overlapping with the previous block */
			if (sb&1)
			{ /* odd subband: odd samples change sign */
				for (i=0;i<18;i+=2) in[i]=out[i] + prev[i];
				for (i=1;i<18;i+=2)  in[i]=-out[i] - prev[i];
			}
			else
				for (i=0;i<18;i++)  in[i]=out[i] + prev[i];

			for (i=18;i<36;i++) prev[i-18]=out[i]; /* create new overlap array */

	    }
		else
		{
			/*
			 * 36 point IDCT ****************************************************************
			 */

			  /* input aliasing for 36 point IDCT */

			for (n=17; n>0; n--)
				in[n]+=in[n-1];

			/* 18 point IDCT for odd indices */

			/* input aliasing for 18 point IDCT */
			in[17]+=in[15];
			in[15]+=in[13];
			in[13]+=in[11];
			in[11]+=in[9];
			in[9] +=in[7];
			in[7] +=in[5];
			in[5] +=in[3];
			in[3] +=in[1];

			{

			/* Fast 9 Point Inverse Discrete Cosine Transform
			//
			// By  Francois-Raymond Boyer
			//         mailto:boyerf@iro.umontreal.ca
			//         http://www.iro.umontreal.ca/~boyerf
			//
			// The code has been optimized for Intel processors
			//  (takes a lot of time to convert float to and from internal FPU representation)
			//
			// It is a simple "factorization" of the IDCT matrix.
			*/
			/* 9 point IDCT on even indices */
				{
				/* 5 points on odd indices (not really an IDCT) */
				 	i0 = in[0]+in[0];
					i0p12 = i0 + in[12];

					tmp0 = i0p12 + q15_q13_mul(in[4],0x3C23) /*1.8793852415718f */  + q15_q13_mul(in[8],0x3106) /*1.532088886238f*/   + q15_mul(in[16],0x2C74); /* 0.34729635533386f*/
					tmp1 = i0    + in[4]                   - in[8] - in[12] - in[12] - in[16];
					tmp2 = i0p12 - q15_mul(in[4], 0x2C74) /*0.34729635533386f*/ - q15_q13_mul(in[8],0x3C23) /* 1.8793852415718f*/  + q15_q13_mul(in[16],0x3106); /* 1.532088886238f*/
					tmp3 = i0p12 - q15_q13_mul(in[4],0x3106) /*1.532088886238f*/ + q15_mul(in[8], 0x2C74) /*0.34729635533386f*/ - q15_q13_mul(in[16],0x3C23); /*1.8793852415718f */
					tmp4 = in[0] - in[4]                   + in[8] - in[12]          + in[16];
				}

				{
					/* 1.9696155060244  = 0x3F07 (Q13)
					 * 1.2855752193731  = 0x2923 (Q13)
					 * 0.68404028665134 = 0x578E (Q15) */

					i6_ = q15_q13_mul(in[6],0x376C); /*1.732050808f*/

					tmp0_ = q15_q13_mul(in[2],0x3F07) /*1.9696155060244f*/  + i6_ + q15_q13_mul(in[10],0x2923) /*1.2855752193731f*/  + q15_mul(in[14],0x578E); /*0.68404028665134f*/
					tmp1_ = q15_q13_mul((in[2]                        - in[10]                   - in[14]),0x376C); /*1.732050808f*/
					tmp2_ = q15_q13_mul(in[2],0x2923) /*1.2855752193731f*/  - i6_ - q15_mul(in[10],0x578E) /*0.68404028665134f*/ + q15_q13_mul(in[14],0x3F07); /*1.9696155060244f*/
					tmp3_ = q15_mul(in[2],0x578E) /*0.68404028665134f*/ - i6_ + q15_q13_mul(in[10],0x3F07) /*1.9696155060244f*/  - q15_q13_mul(in[14],0x2923); /*1.2855752193731f*/
				}

				/* 9 point IDCT on odd indices */
				{
				/* 5 points on odd indices (not really an IDCT) */
					i0 = in[0+1]+in[0+1];
					i0p12 = i0 + in[12+1];

					tmp0o = i0p12 + q15_q13_mul(in[4+1],0x3C23) /*1.8793852415718f */  + q15_q13_mul(in[8+1],0x3106) /*1.532088886238f*/   + q15_mul(in[16+1],0x2C74); /* 0.34729635533386f*/
					tmp1o = i0      + in[4+1]                   - in[8+1] - in[12+1] - in[12+1] - in[16+1];
					tmp2o = i0p12 - q15_mul(in[4+1], 0x2C74) /*0.34729635533386f*/ - q15_q13_mul(in[8+1],0x3C23) /* 1.8793852415718f*/  + q15_q13_mul(in[16+1],0x3106); /* 1.532088886238f*/
					tmp3o = i0p12 - q15_q13_mul(in[4+1],0x3106) /*1.532088886238f*/ + q15_mul(in[8+1], 0x2C74) /*0.34729635533386f*/ - q15_q13_mul(in[16+1],0x3C23); /*1.8793852415718f */
					tmp4o = q15_mul((in[0+1] - in[4+1]                   + in[8+1] - in[12+1]            + in[16+1]),0x5A82); /*0.707106781f Twiddled */
				}

				{
				/* 4 points on even indices */
					i6_ = q15_q13_mul(in[6+1],0x376C); /*1.732050808f*/

					tmp0_o = q15_q13_mul(in[2+1],0x3F07) /*1.9696155060244f*/  + i6_ + q15_q13_mul(in[10+1],0x2923) /*1.2855752193731f*/  + q15_mul(in[14+1],0x578E); /*0.68404028665134f*/
					tmp1_o = q15_q13_mul((in[2+1]                        - in[10+1]                   - in[14+1]),0x376C); /*1.732050808f*/
					tmp2_o = q15_q13_mul(in[2+1],0x2923) /*1.2855752193731f*/  - i6_ - q15_mul(in[10+1],0x578E) /*0.68404028665134f*/ + q15_q13_mul(in[14+1],0x3F07); /*1.9696155060244f*/
					tmp3_o = q15_mul(in[2+1],0x578E) /*0.68404028665134f*/ - i6_ + q15_q13_mul(in[10+1],0x3F07) /*1.9696155060244f*/  - q15_q13_mul(in[14+1],0x2923); /*1.2855752193731f*/
				}

				/* Twiddle factors on odd indices
				// and
				// Butterflies on 9 point IDCT's
				// and
				// twiddle factors for 36 point IDCT
				*/
				{
					e = tmp0 + tmp0_;
					o = q15_mul((tmp0o + tmp0_o),0x403E); /*0.501909918f*/
					tmp[0] = q15_mul((e + o),0xDFF8); /*(-0.500476342f*.5f)*/
					tmp[17] = q15_q11_mul((e - o),0xD226); /*(-11.46279281f*.5f)*/

					e = tmp1 + tmp1_;
					o = q15_mul((tmp1o + tmp1_o),0x4241); /*0.517638090f*/
					tmp[1] = q15_mul((e + o),0xDFB9); /*(-0.504314480f*.5f)*/
					tmp[16] = q15_q13_mul((e - o),0xC2B5); /*(-3.830648788f*.5f)*/

					e = tmp2 + tmp2_;
					o = q15_mul((tmp2o + tmp2_o),0x469D); /*0.551688959f*/
					tmp[2] = q15_mul((e + o),0xDF39); /*(-0.512139757f*.5f)*/
					tmp[15] = q15_q13_mul((e - o),0xDB09); /*(-2.310113158f*.5f)*/

					e = tmp3 + tmp3_;
					o = q15_mul((tmp3o + tmp3_o),0x4E21); /*0.610387294f*/
					tmp[3] = q15_mul((e + o),0xDE72); /*(-0.524264562f*.5f)*/
					tmp[14] = q15_mul((e - o),0x9595); /*(-1.662754762f*.5f)*/

					tmp[4] = q15_mul((tmp4 + tmp4o),0xBABA); /*(-0.541196100f)*/
					tmp[13] = q15_q13_mul((tmp4 - tmp4o),0xD630); /*(-1.306562965f)*/

					e = tmp3 - tmp3_;
					o = q15_mul((tmp3o - tmp3_o),0x6F94); /*0.871723397f*/
					tmp[5] = q15_mul((e + o),0xDBEC); /*(-0.563690973f*.5f)*/
					tmp[12] = q15_mul((e - o),0xBAB2); /*(-1.082840285f*.5f)*/

					e = tmp2 - tmp2_;
					o = q15_q13_mul((tmp2o - tmp2_o),0x25DB); /*1.183100792f*/
					tmp[6] = q15_mul((e + o), 0xDA0E); /*(-0.592844523f*.5f)*/
					tmp[11] = q15_mul((e - o),0xC471); /*(-0.930579498f*.5f)*/

					e = tmp1 - tmp1_;
					o = q15_q13_mul((tmp1o - tmp1_o),0x3DD1); /*1.931851653f*/
					tmp[7] = q15_mul((e + o),0xD7AA); /*(-0.630236207f*.5f)*/
					tmp[10] = q15_mul((e - o),0xCB6F); /*(-0.821339815f*.5f)*/

					e = tmp0 - tmp0_;
					o = q15_q11_mul((tmp0o - tmp0_o),0x2DE5); /*5.736856623f*/
					tmp[8] = q15_mul((e + o),0xD498); /*(-0.678170852f*.5f)*/
					tmp[9] = q15_mul((e - o),0xD0A2); /*(-0.740093616f*.5f)*/
				}

			}
			/* shift to modified IDCT */

			if (sb&1)
			{
				/*
				 * Odd subband: same windowed values as the even-subband
				 * branch below (source index tmp[9+n] / tmp[26-n]), with
				 * the odd-numbered samples negated after the overlap-add.
				 */

				for (n=0; n<9; n+=2)
					in[n] = q15_mul(-tmp[9+n],win[win_type][n]) + prev[n];

				for (n=1; n<9; n+=2)
					in[n] = -(q15_mul(-tmp[9+n],win[win_type][n]) + prev[n]);

				for (n=9; n<18; n+=2)
					in[n] =-(q15_mul(tmp[26-n] , win[win_type][n]) + prev[n]);

				for (n=10; n<18; n+=2)
					in[n] =(q15_mul(tmp[26-n] , win[win_type][n]) + prev[n]);

			}
			else
			{

				for (n=0; n<9; n++)
					in[n] = q15_mul(-tmp[9+n]  , win[win_type][n]) + prev[n];


				for (n=9; n<18; n++)
					in[n] = q15_mul(tmp[26-n]  , win[win_type][n]) + prev[n];

			}

			/* windowed second half becomes the overlap for the next block */
			for (n=0; n<9; n++)
				prev[n]= q15_mul(tmp[8-n]  , win[win_type][18+n]);

			for (n=9; n<18; n++)
				prev[n]= q15_mul(tmp[n-9]  , win[win_type][18+n]);

	    }
	}
}

/*
 * MD_Polyphase - one step of the polyphase synthesis for channel `ch`.
 *
 * Reads sample number `f` of each of the 32 subbands
 * (spectrum[ch*576 + sb*18 + f]), runs them through a butterfly network
 * using the coefficient table b[], stores the results in the u[][][][]
 * state buffer and then multiplies that buffer with the Dewindow table to
 * produce 32 PCM samples.  The samples are written interleaved to
 *
 *     PcmBuffer[GrannulesInBuffer*1152 + f*64 + sample*2 + cha]
 *
 * where cha is 0 for ch != 0 and 1 for ch == 0.
 *
 * u_start[ch] and u_div[ch] are advanced at the end, so consecutive calls
 * for the same channel rotate through the state buffer.
 *
 * Every output sample is a separate sum of products, so the tap
 * accumulators are cleared per sample.  Samples 0..15 come from the first
 * loop, sample 16 from the centre taps and samples 17..31 from the last
 * loop, which therefore stores at index j+1.
 */
void MD_Polyphase(u16 ch, u16 f)
{
	int start = u_start[ch];
	int div = u_div[ch];
	i16 (*u_p)[16];

	u16 j,n,k, cha;

    const i16 *dewindow = Dewindow[0] + 15 - start;
    i16 *u_ptr = (i16 *) u[ch][div];
	u16 *pcm;

	i16 outf1, outf2, outf3, outf4;
	i16 d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31;
	i16 d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15;
	i16 c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;

	cha=(ch)?0:1;
	/* first of the 32 interleaved output slots written by this call */
	pcm = &PcmBuffer[GrannulesInBuffer*1152+f*64+cha];

	/* step 1: 16-wide butterflies on subband pairs (sb, 31-sb) */
	d0 = spectrum[ch*576+f];    d16=q15_mul((d0  - spectrum[ch*576+558+f]) , b[1]); d0 += spectrum[ch*576+558+f];
	d1 = spectrum[ch*576+18+f]; d17=q15_q14_mul((d1  - spectrum[ch*576+540+f])  , b[3]); d1 += spectrum[ch*576+540+f];
	d3 = spectrum[ch*576+36+f]; d19=q15_q14_mul((d3  - spectrum[ch*576+522+f])  , b[5]); d3 += spectrum[ch*576+522+f];
	d2 = spectrum[ch*576+54+f]; d18=q15_q14_mul((d2  - spectrum[ch*576+504+f])  , b[7]); d2 += spectrum[ch*576+504+f];
	d6 = spectrum[ch*576+72+f]; d22=q15_q14_mul((d6  - spectrum[ch*576+486+f])  , b[9]); d6 += spectrum[ch*576+486+f];
	d7 = spectrum[ch*576+90+f]; d23=q15_q14_mul((d7  - spectrum[ch*576+468+f])  , b[11]); d7 += spectrum[ch*576+468+f];
	d5 = spectrum[ch*576+108+f]; d21=q15_q14_mul((d5  - spectrum[ch*576+450+f]) , b[13]); d5 += spectrum[ch*576+450+f];
	d4 = spectrum[ch*576+126+f]; d20=q15_q14_mul((d4  - spectrum[ch*576+432+f]) , b[15]); d4 += spectrum[ch*576+432+f];
	d12= spectrum[ch*576+144+f]; d28=q15_q14_mul((d12 - spectrum[ch*576+414+f]) , b[17]); d12+= spectrum[ch*576+414+f];
	d13= spectrum[ch*576+162+f]; d29=q15_q14_mul((d13 - spectrum[ch*576+396+f]) , b[19]); d13+= spectrum[ch*576+396+f];
	d15= spectrum[ch*576+180+f]; d31=q15_q14_mul((d15 - spectrum[ch*576+378+f]) , b[21]); d15+= spectrum[ch*576+378+f];
	d14= spectrum[ch*576+198+f]; d30=q15_q14_mul((d14 - spectrum[ch*576+360+f]) , b[23]); d14+= spectrum[ch*576+360+f];
	d10= spectrum[ch*576+216+f]; d26=q15_q14_mul((d10 - spectrum[ch*576+342+f]) , b[25]); d10+= spectrum[ch*576+342+f];
	d11= spectrum[ch*576+234+f]; d27=q15_q14_mul((d11 - spectrum[ch*576+324+f]) , b[27]); d11+= spectrum[ch*576+324+f];
	d9 = spectrum[ch*576+252+f]; d25=q15_q14_mul((d9  - spectrum[ch*576+306+f]) , b[29]); d9 += spectrum[ch*576+306+f];
	d8 = spectrum[ch*576+270+f]; d24=q15_q14_mul((d8  - spectrum[ch*576+288+f]) , b[31]); d8 += spectrum[ch*576+288+f];


/* a test to see what can be done with memory separation
 * first we process indexes 0-15
*/

	/* step 2: 8-wide butterflies */
	c0 = d0 + d8 ; c8 = q15_q14_mul(( d0 - d8 ) ,  b[2]);
	c1 = d1 + d9 ; c9 = q15_q14_mul(( d1 - d9 ) ,  b[6]);
	c2 = d2 + d10; c10= q15_q14_mul(( d2 - d10) , b[14]);
	c3 = d3 + d11; c11= q15_q14_mul(( d3 - d11) , b[10]);
	c4 = d4 + d12; c12= q15_q14_mul(( d4 - d12) , b[30]);
	c5 = d5 + d13; c13= q15_q14_mul(( d5 - d13) , b[26]);
	c6 = d6 + d14; c14= q15_q14_mul(( d6 - d14) , b[18]);
	c7 = d7 + d15; c15= q15_q14_mul(( d7 - d15) , b[22]);

	/* step 3: 4-wide butterflies
	*/
	d0 = c0 + c4 ; d4 = q15_q14_mul(( c0 - c4 ) ,  b[4]);
	d1 = c1 + c5 ; d5 = q15_q14_mul(( c1 - c5 ) , b[12]);
	d2 = c2 + c6 ; d6 = q15_q14_mul(( c2 - c6 ) , b[28]);
	d3 = c3 + c7 ; d7 = q15_q14_mul(( c3 - c7 ) , b[20]);

	d8 = c8 + c12; d12= q15_q14_mul(( c8 - c12) ,  b[4]);
	d9 = c9 + c13; d13= q15_q14_mul(( c9 - c13) , b[12]);
	d10= c10+ c14; d14= q15_q14_mul((c10 - c14) , b[28]);
	d11= c11+ c15; d15= q15_q14_mul((c11 - c15) , b[20]);


	/* step 4: 2-wide butterflies */
/**/	c0 = d0 + d2 ; c2 = q15_q14_mul(( d0 - d2 ) ,  b[8]);
	c1 = d1 + d3 ; c3 = q15_q14_mul(( d1 - d3 ) , b[24]);
/**/	c4 = d4 + d6 ; c6 = q15_q14_mul(( d4 - d6 ) ,  b[8]);
	c5 = d5 + d7 ; c7 = q15_q14_mul(( d5 - d7 ) , b[24]);
/**/	c8 = d8 + d10; c10= q15_q14_mul(( d8 - d10) ,  b[8]);
	c9 = d9 + d11; c11= q15_q14_mul(( d9 - d11) , b[24]);
/**/	c12= d12+ d14; c14= q15_q14_mul((d12 - d14) ,  b[8]);
	c13= d13+ d15; c15= q15_q14_mul((d13 - d15) , b[24]);

	/* step 5: 1-wide butterflies
	*/

	/* this is a little 'hacked up'
	*/
	d0 = q15_q11_mul((-c0 -c1),0x1000); d1 = q15_q14_mul(( c0 - c1 ) , b[16]);
	d2 = c2 + c3; d3 = q15_q14_mul(( c2 - c3 ) , b[16]);
	d3 -= d2;

	d4 = c4 +c5; d5 = q15_q14_mul(( c4 - c5 ),  b[16]);
	d5 += d4;
	d7 = -d5;
	d7 += q15_q14_mul(( c6 - c7 ) , b[16]); d6 = +c6 +c7;

	d8 = c8 + c9 ; d9 = q15_q14_mul(( c8 - c9 ) , b[16]);
	d11= +d8 +d9;
	d11 +=q15_q14_mul((c10 - c11) , b[16]); d10= c10+ c11;

	d12 = c12+ c13; d13 = q15_q14_mul((c12 - c13) , b[16]);
	d13 += -d8-d9+d12;
	d14 = c14+ c15; d15 = q15_q14_mul((c14 - c15) , b[16]);
	d15-=d11;
	d14 += -d8 -d10;

    u_p = (i16 (*)[16]) &u[ch][div][0][start];

/*16*/  u_p[ 0][0] =+d1 ;
        u_p[ 2][0] = +d9 -d14;
/*20*/  u_p[ 4][0] = +d5 -d6;
        u_p[ 6][0] = -d10 +d13;
/*24*/  u_p[ 8][0] =d3;
        u_p[10][0] = -d8 -d9 +d11 -d13;
/*28*/  u_p[12][0] = +d7;
        u_p[14][0] = +d15;

        /* the other 32 are stored for use with the next granule
         */

        u_p = (i16 (*)[16]) &u[ch][!div][0][start];

/*0*/   u_p[16][0] = d0;
        u_p[14][0] = -(+d8 );
/*4*/   u_p[12][0] = -(+d4 );
        u_p[10][0] = -(-d8 +d12 );
/*8*/   u_p[ 8][0] = -(+d2 );
        u_p[ 6][0] = -(+d8 +d10 -d12 );
/*12*/  u_p[ 4][0] = -(-d4 +d6 );
        u_p[ 2][0] = -d14;
        u_p[ 0][0] = -d1;


        /* same network for the difference terms d16..d31 */
        c0=d16 + d24; c8= q15_q14_mul((d16 - d24) ,  b[2]);
        c1=d17 + d25; c9= q15_q14_mul((d17 - d25) ,  b[6]);
        c2=d18 + d26; c10= q15_q14_mul((d18 - d26) , b[14]);
        c3=d19 + d27; c11= q15_q14_mul((d19 - d27) , b[10]);
        c4=d20 + d28; c12= q15_q14_mul((d20 - d28) , b[30]);
        c5=d21 + d29; c13= q15_q14_mul((d21 - d29) , b[26]);
        c6=d22 + d30; c14= q15_q14_mul((d22 - d30) , b[18]);
        c7=d23 + d31; c15= q15_q14_mul((d23 - d31) , b[22]);

/* 3
*/
        d16= c0+ c4; d20= q15_q14_mul((c0 - c4) ,  b[4]);
        d17= c1+ c5; d21= q15_q14_mul((c1 - c5) , b[12]);
        d18= c2+ c6; d22= q15_q14_mul((c2 - c6) , b[28]);
        d19= c3+ c7; d23= q15_q14_mul((c3 - c7) , b[20]);

        d24= c8+ c12; d28= q15_q14_mul((c8 - c12) ,  b[4]);
        d25= c9+ c13; d29= q15_q14_mul((c9 - c13) , b[12]);
        d26= c10+ c14; d30= q15_q14_mul((c10 - c14) , b[28]);
        d27= c11+ c15; d31= q15_q14_mul((c11 - c15) , b[20]);

/* 4
*/

/**/    c0= d16+ d18; c2= q15_q14_mul((d16 - d18) ,  b[8]);
        c1= d17+ d19; c3= q15_q14_mul((d17 - d19) , b[24]);
/**/    c4= d20+ d22; c6= q15_q14_mul((d20 - d22) ,  b[8]);
        c5= d21+ d23; c7= q15_q14_mul((d21 - d23) , b[24]);
/**/    c8= d24+ d26; c10= q15_q14_mul((d24 - d26) ,  b[8]);
        c9= d25+ d27; c11= q15_q14_mul((d25 - d27) , b[24]);
/**/    c12= d28+ d30; c14= q15_q14_mul((d28 - d30) ,  b[8]);
        c13= d29+ d31; c15= q15_q14_mul((d29 - d31) , b[24]);

/* 5
*/
        /* q15_q14_mul takes the sample and the coefficient as two
         * separate arguments, exactly as in every other stage */
        d16= c0+ c1; d17= q15_q14_mul((c0 - c1) , b[16]);
        d18= c2+ c3; d19= q15_q14_mul((c2 - c3) , b[16]);

        d20= c4+ c5; d21= q15_q14_mul((c4 - c5) , b[16]);
        d20+=d16; d21+=d17;
        d22= c6+ c7; d23= q15_q14_mul((c6 - c7) , b[16]);
        d22+=d16; d22+=d18;
        d23+=d16; d23+=d17; d23+=d19;


        d24= c8+ c9; d25= q15_q14_mul((c8 - c9) , b[16]);
        d26= c10+ c11; d27= q15_q14_mul((c10 - c11) , b[16]);
        d26+=d24;
        d27+=d24; d27+=d25;

        d28= c12+ c13; d29= q15_q14_mul((c12 - c13) , b[16]);
        d28-=d20; d29+=d28; d29-=d21;
        d30= c14+ c15; d31= q15_q14_mul((c14 - c15) , b[16]);
        d30-=d22;
        d31-=d23;

    u_p = (i16 (*)[16]) &u[ch][!div][0][start];

	u_p[ 1][0] = -(+d30 );
	u_p[ 3][0] = -(+d22 -d26 );
	u_p[ 5][0] = -(-d18 -d20 +d26 );
	u_p[ 7][0] = -(+d18 -d28 );
	u_p[ 9][0] = -(+d28 );
	u_p[11][0] = -(+d20 -d24 );
	u_p[13][0] = -(-d16 +d24 );
	u_p[15][0] = -(+d16 );

	/* the other 32 are stored for use with the next granule
	 */

	u_p = (i16 (*)[16]) &u[ch][div][0][start];

	u_p[15][0] = +d31;
	u_p[13][0] = +d23 -d27;
	u_p[11][0] = -d19 -d20 -d21 +d27;
	u_p[ 9][0] = +d19 -d29;
	u_p[ 7][0] = -d18 +d29;
	u_p[ 5][0] = +d18 +d20 +d21 -d25 -d26;
	u_p[ 3][0] = -d17 -d22 +d25 +d26;
	u_p[ 1][0] = +d17 -d30;


	/* This is tuned specifically for architectures with
	   autoincrement and -decrement. */

	u_ptr--;

	/* samples 0..15: 16 taps each, walking u[] and Dewindow forwards */
	for (j = 0; j < 16; ++j)
	{
		outf1=outf2=outf3=outf4=0;

		for (n=0; n<4; n++)
		{
			outf1 += q15_mul(*++u_ptr , *++dewindow);
			outf2 += q15_mul(*++u_ptr , *++dewindow);
			outf3 += q15_mul(*++u_ptr , *++dewindow);
			outf4 += q15_mul(*++u_ptr , *++dewindow);
		}

		pcm[j*2] = outf1 + outf2 + outf3 + outf4;

		dewindow += 16;
	}

	if (div & 0x1)
	{
		/* sample 16: only the odd taps 1,3,...,15 contribute */
		k=1;
		outf2=outf4=0;

		for (n=0; n<4; n++)
		{
			outf2 += q15_mul(u_ptr[ k] , dewindow[k]);
			k+=2;
			outf4 += q15_mul(u_ptr[ k] , dewindow[k]);
			k+=2;
		}

		pcm[j*2] = outf2 + outf4;	/* j == 16 here */

		dewindow -= 31;
		dewindow += start;
		dewindow += start;
		u_ptr -= 16;

		/* samples 17..31: u[] rows backwards, Dewindow backwards */
		for (; j < 31; ++j)
		{
			outf1=outf2=outf3=outf4=0;

			for (n=0; n<4; n++)
			{
				outf1 += q15_mul(*++u_ptr , *--dewindow);
				outf2 += q15_mul(*++u_ptr , *--dewindow);
				outf3 += q15_mul(*++u_ptr , *--dewindow);
				outf4 += q15_mul(*++u_ptr , *--dewindow);
			}

			/* slot 16 already holds the centre sample, so store at j+1 */
			pcm[(j+1)*2] = outf2 - outf1 + outf4 - outf3;

			dewindow -= 16;
			u_ptr -= 32;
		}
	}
	else
	{
		/* sample 16: only the even taps 2,4,...,16 contribute */
		k=2;
		outf2=outf4=0;

		for (n=0; n<4; n++)
		{
			outf2 += q15_mul(u_ptr[ k] , dewindow[k]);
			k+=2;
			outf4 += q15_mul(u_ptr[ k] , dewindow[k]);
			k+=2;
		}

		pcm[j*2] = outf2 + outf4;	/* j == 16 here */

		dewindow -= 31;
		dewindow += start;
		dewindow += start;
		u_ptr -= 16;

		/* samples 17..31: u[] rows backwards, Dewindow backwards */
		for (; j < 31; ++j)
		{
			outf1=outf2=outf3=outf4=0;

			for (n=0; n<4; n++)
			{
				outf1 += q15_mul(*++u_ptr , *--dewindow);
				outf2 += q15_mul(*++u_ptr , *--dewindow);
				outf3 += q15_mul(*++u_ptr , *--dewindow);
				outf4 += q15_mul(*++u_ptr , *--dewindow);
			}

			/* slot 16 already holds the centre sample, so store at j+1 */
			pcm[(j+1)*2] = outf1 - outf2 + outf3 - outf4;

			dewindow -= 16;
			u_ptr -= 32;
		}
	}

	/* rotate the state buffer for the next call on this channel */
	--u_start[ch];
	u_start[ch] &= 0xf;
	u_div[ch]=u_div[ch] ? 0 : 1;

}

void MD_IMDCT_Init()
{
	u16 *k;
	u16 i;

	spectrum=&SPECTRUM_POS;
	for (i=0; i<18; i++)
		prev[i]=0;

	k=&u[0][0][0][0];
	for (i=0; i<2*2*17*16; i++)
		*k++=0;
}

## log_exp.asm
;This code is taken from TI's ........
;Used to calculate the exponent of an integer
;from 0 to 32768 (must be in A)
		.mmregs
		.global _log
		.global _exp
		.global _descale

		.data
;Look-up table of natural logarithms for the integers 1 to 100,
;where the polynomial algorithm loses important accuracy.
;Entry k holds ln(k+1) in Q12 format (0B17h = ln(2), 49AEh = ln(100)).
;_descale indexes it with (is - 1).
exp_lup	.word   0h,0B17h,1193h,162Eh,19C0h,1CABh,1F22h,2145h,2327h
		.word  24D7h,265Dh,27C2h,290Ah,2A39h,2B54h,2C5Ch,2D54h,2E3Eh,2F1Ch
		.word  2FEEh,30B6h,3174h,322Ah,32D9h,3380h,3421h,34BBh,3550h,35E0h
		.word  366Bh,36F1h,3773h,37F1h,386Bh,38E2h,3956h,39C6h,3A33h,3A9Dh
		.word  3B05h,3B6Ah,3BCDh,3C2Dh,3C8Ch,3CE8h,3D42h,3D9Ah,3DF0h,3E44h
		.word  3E97h,3EE8h,3F38h,3F86h,3FD2h,401Eh,4067h,40B0h,40F7h,413Dh
		.word  4182h,41C6h,4208h,424Ah,428Ah,42CAh,4308h,4346h,4383h,43BEh
		.word  43F9h,4433h,446Dh,44A5h,44DDh,4514h,454Ah,4580h,45B5h,45E9h
		.word  461Ch,464Fh,4681h,46B3h,46E4h,4715h,4745h,4774h,47A3h,47D1h
		.word  47FFh,482Ch,4859h,4885h,48B1h,48DCh,4907h,4932h,495Ch,4985h
		.word  49AEh

;Q11 format 2048*n*ln2 starting from n=15 to n=0
logtbl	.int 21294, 19874, 18454,17035, 15615, 14196, 12776
		.int 11357,9937,8517,7098, 5678, 4259, 2839, 1420, 0
;Q15 format of the equation -32768/n (Taylor coefficients of ln(1-x)),
;active entries for n=10 down to 1, followed by two zero words.
;The n=3 term is -32768/3 = -10923.
a9_log	;.int -2521,-2731
		.int -3277, -3641,-4096,-4681,-5461,-6554,-8192
		.int -10923,-16384,-32768, 0, 0

;Scratch words used by _log and _descale
		.bss N,1
		.bss X,1
		.bss EXP, 1
		.bss LNIS,1
		.text
;_log: natural logarithm of the integer held in accumulator A.
;The result is returned in A in Q11 format.
;The input is normalized; if nothing but the leading bit remains the value
;is a power of two and the answer is read straight from logtbl.  Otherwise
;the remainder is fed to a polynomial (coefficients a9_log, evaluated with
;POLY) and the logtbl entry selected by the normalization count is added.
;Uses B, T, AR0, AR3, AR4 and the scratch words N and X.
_log:
		STM		N, AR4
		ADD 	#0,A,B  			;B=A=is (copy of the input)
		EXP 	B  					;T=exponent (leading redundant bits) of B
		LD 		#0x4000, 16,A 		;AH=16384, the largest supported scale
		ST 		T,*AR4 				;Store scaling number in N
		ANDM 	#0Fh, *AR4 			;compensate extra 16 leading bits
		MVDM	N,AR0				;AR0 index to segment table
		NORM	B 					;Normalize to Q15 format
		AND 	#0x3FFF, 16, B		;keep BH bits 13..0, i.e. drop the leading 0x4000
		BC		taylor_log, BNEQ	;B != 0: a fractional part remains, evaluate the polynomial.
									;B == 0 means the input can be represented in 2^N form:
									;just return the result pre-stored in the index table
		STM 	#logtbl+1, AR3
		MAR		*AR3+0				;AR3 += AR0
		LD      *AR3, A
		RET
taylor_log:
		STM		X, AR4
		SUB 	B, 0 , A 			;A=A-B. A is the X in Taylor's equation
		STH 	A, *AR4				;X is the fractional part in Q15 format
		STM		a9_log, AR3 			;AR3 points to coefficient in Taylor's equ
		LD 		*AR4 , T 			;T is the X in the polynomial equation. POLY uses the value
									;of T
		LD 		*AR3+, 16, A 		;first coefficient of the n power in A
		LD 		*AR3+, 16, B 		;second coefficient of the (n-1) power in B

		RPT		#10					;repeat the next instruction (RPT #n runs it n+1 = 11 times);
									;the original note said 13 times, enough accuracy for MP3

		POLY	*AR3+				;AH=fractional part of the polynomial in Q15 format

		SFTA	A, -16				;move AH down into AL
		SFTA	A, -4				;Convert to Q11 format

		STM 	#logtbl, AR3		;sum up scaling part, N*ln2
		MAR		*AR3+0				;AR3 += AR0
		ADD 	*AR3,A

		RET

		.data

;exptbl is generated by equation e^(-n). n starts from 0 to 10 into Q15 format
exptbl	.int 0x7FFF, 0x2F16, 0x1152, 0x065F, 0x0258, 0x00DC, 0x0051, 0x01D, 0x000A, 0x0004, 0x0001

;a9_exp is generated by the equation 1/n! in Q15. n starts from 8 down to 1 to
;facilitate the use of POLY; two zero words follow.
;a9_exp	.int 1,7,46,273,1365,5461,16384,32767,0,0
a9_exp	.int 0,0x6,0x2D,0x111,0x555,0x1555,0x4000,0x7FFF,0,0

		.text

;Scratch words used by _exp: N1 = index into exptbl, X1 = fractional part
		.bss N1,1
		.bss X1,1

;_exp: exponential of the value in A, result returned in A.
;The argument is split into an integer part, which selects an e^(-n) entry of
;exptbl, and a fractional part that goes through the a9_exp polynomial; the
;two results are multiplied.  Arguments below 0ACD2h yield 0.
;NOTE(review): _descale passes a negative Q11 value in the low 16 bits of A
;and runs with SXM cleared -- confirm the exact input and output formats.
_exp:
		SUB		#0ACD2h, A, B
		BC		exp_q15limit, BLT	;argument below the limit: result is 0
		AND		#0h, B
		OR  	#0ffffh,B			;B = 0FFFFh
		SUB		A,B 		 		;Negative number. Make positive to compare
		ADD 	#0, B, A
		ADD 	#0,A,B
		STM		N1, AR4
		AND 	#400h, B			;Check if it is larger than 0.5
		BCD 	adj, BNEQ			;If larger than 0.5 adjust
									;NOTE(review): BCD is a delayed branch, so the ADD
									;below presumably executes on both paths -- confirm
		ADD 	#400h, A, B
		STM		N1, AR4
		STL		B, -11, *AR4		;store scaling index
		AND 	#3FFh, B			;keep the fractional part only
		STM		X1, AR4				;store fractional part
		SFTA	B,4
		ADD 	#0,B,A
		LD 		#0FFFFh, 0, B
		SUB		A,B 		 		;Negative number. Make positive to compare
		STL		B, 0, *AR4			;in Q15 format

		B		taylor_exp

exp_q15limit:
		AND		#0,A				;return 0
		B		exp_exit
adj:
		STL		B, -11, *AR4		;store scaling index
		AND		#7FFh, B			;keep the fractional part only
		SUB 	#400h, B
		STM		X1, AR4				;store negative fraction
		STL		B, 4, *AR4			;in Q15 format
		LD 		*AR4, T
		MPY		#-1,B				;B = -T
		STL		B, *AR4

taylor_exp:

		STM		a9_exp, AR3			;AR3 points to coefficient in
									;Taylor's equ
		LD 		*AR4 , T 			;T is the X in the polynomial equation. POLY uses the value
									;of T
		LD 		*AR3+, 16, A 		;first coefficient of the n power in A
		LD 		*AR3+, 16, B 		;second coefficient of the (n-1) power in B

		RPT		#7					;loop 8 times, enough accuracy for MP3
		POLY	*AR3+				;AH=fractional part of the polynomial in Q14

		ADD 	#4000h, 16, A		;Taylor equation has one constant (1.0 in Q14)
		ADD		#0,A, B				;round
		AND		#0800h, 16, B
		BC		exp_mul, BEQ        ;if less than 0.5 don't round
		AND 	#0F000h,16, A       ;will not overflow. Max value 0x6F85
		ADD		#1000h,16,A			;rounding
exp_mul:
		MVDM	N1, AR0				;index into exptbl
		STM		exptbl, AR3
		MAR		*AR3+0				;AR3 += AR0
		MPYA 	*AR3				;multiply the scaling part
		SFTA	B,-14, A			;A = B >> 14
exp_exit:
		RET

;This routine calculates x = (is)^(4/3) * 2^exp
;Inputs: <is> in A, and exp on the stack.
;It is evaluated as exp(4/3*ln(is) + exp*ln2): ln(is) comes from the exp_lup
;table when is < 101 and from _log otherwise, and the sum is handed to _exp.
;ST0/ST1 are saved and restored because sign extension (SXM) is switched off.
;Uses B, T, AR0, AR3, AR4 and the scratch words EXP and LNIS.
_descale:
		STM		EXP, AR4
		MVMM	SP, AR3			;first extract exp and place in T
		LD		*AR3+, T		;this load only steps AR3 past the top-of-stack word
		LD		*AR3,T			;T = next stack word (the exp argument)
		ST 		T, *AR4
		PSHM	ST0
		PSHM	ST1
		RSBX	SXM				;we don't want sign extension
		SUB 	#101, A, B		;now check is to see if we can use the look-up table
		BC 		look_up, BLT	;is < 101: use exp_lup
		CALL 	_log			;returns ln(A) in Q11 format in A
								;now multiply with 1.333333
		SFTL	A,+15
		SFTL	A, +1
		STM     #5555h, T		;T = 1.33333 in Q14
		MPYA	B
		SFTA	B, -14, A
		B 		calc_x
look_up:
		SUB		#1, A			;subtract 1 from A in order
		STLM	A, AR0			;to use the look up table
		STM		exp_lup, AR3
		nop
		LD 		#5555h, 16, A
		MAR 	*AR3+0			;index AR3 to look up table
								;load A with 1.33333 in Q14 format
		MPYA	*AR3			;multiply and store in B
		SFTA 	B, -12, A		;return result in Q14 format
		SFTA 	A, -3			;convert to Q11
calc_x:
		STM		LNIS, AR4
		STL		A, *AR4			;LNIS = 4/3*ln(is)
		STM		EXP, AR4
		LD      *AR4, T
		MPY		#-1, B			;exp is negative. Make positive
		SFTA	B, +15, A		;before multiplying
		SFTA 	A, +1
		STM		#58B9h, T		;T=ln2
		MPYA	B
		SFTA	B, -14, A		;convert to Q11

		STL		A, *AR4
		LD		*AR4, T
		MPY	    #-1, B 			;make negative again

		STM		LNIS, AR4		;and now add with LNIS
		ADD		*AR4, B
		AND 	#0FFFFh, B, A	;A = low 16 bits of B
		CALL 	_exp
		POPM	ST1
		POPM	ST0
		RET
		.
	;This code is taken from TI's ........
	;Used to calcualate the exponent of an integer
	;from 0 to 32768 (must be in A)
	.mmregs
	.global _log
	.global _exp
	.global _descale

	.data
	;Look-up table of natural logarithms for the first 100 integers,
	;where the polynomial algorithm loses important accuracy.
	;Entries run from ln(1) to ln(100) in Q12 format
	;(e.g. 0B17h = 2839 ~ 4096*ln 2); _descale indexes it with is-1.
	exp_lup .word 0h,0B17h,1193h,162Eh,19C0h,1CABh,1F22h,2145h,2327h
	.word 24D7h,265Dh,27C2h,290Ah,2A39h,2B54h,2C5Ch,2D54h,2E3Eh,2F1Ch
	.word 2FEEh,30B6h,3174h,322Ah,32D9h,3380h,3421h,34BBh,3550h,35E0h
	.word 366Bh,36F1h,3773h,37F1h,386Bh,38E2h,3956h,39C6h,3A33h,3A9Dh
	.word 3B05h,3B6Ah,3BCDh,3C2Dh,3C8Ch,3CE8h,3D42h,3D9Ah,3DF0h,3E44h
	.word 3E97h,3EE8h,3F38h,3F86h,3FD2h,401Eh,4067h,40B0h,40F7h,413Dh
	.word 4182h,41C6h,4208h,424Ah,428Ah,42CAh,4308h,4346h,4383h,43BEh
	.word 43F9h,4433h,446Dh,44A5h,44DDh,4514h,454Ah,4580h,45B5h,45E9h
	.word 461Ch,464Fh,4681h,46B3h,46E4h,4715h,4745h,4774h,47A3h,47D1h
	.word 47FFh,482Ch,4859h,4885h,48B1h,48DCh,4907h,4932h,495Ch,4985h
	.word 49AEh

	;Q11 format 2048*n*ln2 starting from n=15 down to n=0
	;(scaling part of the logarithm, indexed by the normalisation count)
	logtbl .int 21294, 19874, 18454,17035, 15615, 14196, 12776
	.int 11357,9937,8517,7098, 5678, 4259, 2839, 1420, 0
	;Q15 format of the equation -32768/n (Taylor coefficients of
	;ln(1-x) for n=10 down to 1), followed by two zero words so the
	;table can be streamed through POLY.
	;The n=3 term is -32768/3 = -10923; it was previously mistyped
	;as -10293, which skewed the cubic term of the series.
	;The commented-out .int below holds the dropped higher-order terms.
	a9_log ;.int -2521,-2731
	.int -3277, -3641,-4096,-4681,-5461,-6554,-8192
	.int -10923,-16384,-32768, 0, 0

	;One-word scratch variables shared by _log and _descale
	.bss N,1	;_log: normalisation count (index into logtbl)
	.bss X,1	;_log: fractional part fed to the polynomial
	.bss EXP, 1	;_descale: exp argument, later reused as scratch
	.bss LNIS,1	;_descale: (4/3)*ln(is)
	.text
	;_log: natural logarithm of the integer in A.
	;Returns ln(A) in Q11 format in A (this is how _descale uses it).
	;The value is normalised; the normalisation count selects N*ln2
	;from logtbl and the remaining fraction goes through the a9_log
	;polynomial. Uses AR0, AR3, AR4, T, A, B and the N/X scratch words.
	_log:
	STM N, AR4
	ADD #0,A,B ;B=A=is
	EXP B ;T=number of leading bits of B (its exponent)
	LD #0x4000, 16,A ;AH=16384, the largest supported scale
	ST T,*AR4 ;Store scaling number in N
	ANDM #0Fh, *AR4 ;compensate extra 16 leading bits
	MVDM N,AR0 ;AR0 index to segment table
	NORM B ;Normalize to Q15 format
	AND #0x3FFF, 16, B ;BH=BH-0x4000
	BC taylor_log, BNEQ ;B != 0: a fractional part remains, evaluate the polynomial
	;otherwise (B==0) the input can be represented in 2^N form:
	;just return the result pre-stored in the index table
	STM #logtbl+1, AR3
	MAR *AR3+0
	LD *AR3, A
	RET
	taylor_log:
	STM X, AR4
	SUB B, 0 , A ;A=A-B.A is the X in taylor's equation
	STH A, *AR4 ;X is the fractional part in Q15 format
	STM a9_log, AR3 ;AR3 points to coefficient in Taylor's equ
	LD *AR4 , T ;T is the X in the polynomial equation. POLY uses the value
	;of T
	LD *AR3+, 16, A ;first coefficient of the n power in A
	LD *AR3+, 16, B ;second coefficient of the (n-1) power in B

	;RPT #10 executes the following POLY 11 times.
	;NOTE(review): a9_log now holds 12 words and two were consumed by
	;the LDs above, so the 11th POLY fetches one word past the table;
	;confirm the repeat count still matches the shortened table.
	RPT #10 ;repeat POLY, enough accuracy for MP3

	POLY *AR3+ ;AH=fractional part of the polynomial in Q15 format

	SFTA A, -16 ;move AH down into AL
	SFTA A, -4 ;Convert to Q11 format

	STM #logtbl, AR3 ;sum up scaling part, N*ln2
	MAR *AR3+0
	ADD *AR3,A

	RET

	.data

	;exptbl is generated by the equation e^(-n), n from 0 to 10, in Q15 format
	;(scaling part of the exponential, indexed by N1 in _exp)
	exptbl .int 0x7FFF, 0x2F16, 0x1152, 0x065F, 0x0258, 0x00DC, 0x0051, 0x01D, 0x000A, 0x0004, 0x0001

	;a9_exp is generated by the equation 1/n!. n starts from 8 down to 1 to
	;facilitate the use of POLY; two zero words terminate the table.
	;The commented-out line is the same table with differently rounded values.
	;a9_exp .int 1,7,46,273,1365,5461,16384,32767,0,0
	a9_exp .int 0,0x6,0x2D,0x111,0x555,0x1555,0x4000,0x7FFF,0,0

	.text

	;One-word scratch variables used by _exp
	.bss N1,1	;scaling index into exptbl
	.bss X1,1	;fractional part fed to the polynomial

	;_exp: exponential of the fixed-point value in A; result returned in A.
	;_descale passes the low 16 bits of a Q11 sum here.
	;The argument is split into an integer scaling index (stored in N1,
	;used to pick e^(-n) from exptbl) and a fraction (stored in X1) that
	;goes through the a9_exp polynomial; the two parts are multiplied.
	;Inputs that fail the first range test return 0.
	;Uses AR0, AR3, AR4, T, A, B and the N1/X1 scratch words.
	_exp:
	SUB #0ACD2h, A, B ;range test against the limit 0ACD2h
	BC exp_q15limit, BLT ;below the limit: result is 0
	AND #0h, B ;B = 0 ...
	OR #0ffffh,B ;... then B = 0FFFFh
	SUB A,B ;Negative number.Make positive to compare
	ADD #0, B, A
	ADD #0,A,B
	STM N1, AR4
	AND #400h, B ;Check if it is larger than 0.5
	BCD adj, BNEQ ;If larger than 0.5 adjust
	ADD #400h, A, B ;NOTE(review): this sits in the BCD delay slot, so it
	;presumably runs on both paths - confirm
	STM N1, AR4
	STL B, -11, *AR4 ;store scaling index
	AND #3FFh, B ;truncate fractional part
	STM X1, AR4 ;store fractional part
	SFTA B,4
	ADD #0,B,A
	LD #0FFFFh, 0, B
	SUB A,B ;Negative number.Make positive to compare
	STL B, 0, *AR4 ;in Q15 format

	B taylor_exp

	exp_q15limit:
	AND #0,A ;A = 0
	B exp_exit
	adj:
	STL B, -11, *AR4 ;store scaling index
	AND #7FFh, B ;truncate fractional part
	SUB #400h, B
	STM X1, AR4 ;store negative fraction
	STL B, 4, *AR4 ;in Q15 format
	LD *AR4, T
	MPY #-1,B ;B = -X1
	STL B, *AR4

	taylor_exp:

	STM a9_exp, AR3 ;AR3 points to coefficient in
	;Taylor's equ
	LD *AR4 , T ;T is the X in the polynomial equation. POLY uses the value
	;of T
	LD *AR3+, 16, A ;first coefficient of the n power in A
	LD *AR3+, 16, B ;second coefficient of the (n-1) power in B

	RPT #7 ;loop 8 times, enough accuracy for MP3
	POLY *AR3+ ;AH=fractional part of the polynomial in Q14

	ADD #4000h, 16, A ;taylor equation has one constant.Q14
	ADD #0,A, B ;round
	AND #0800h, 16, B
	BC exp_mul, BEQ ;if less than 0.5 don't round
	AND #0F000h,16, A ;will not overflow. Max value 0x6F85
	ADD #1000h,16,A ;rounding
	exp_mul:
	MVDM N1, AR0 ;index into exptbl
	STM exptbl, AR3
	MAR *AR3+0
	MPYA *AR3 ;multiply the scaling part
	SFTA B,-14, A ;move the scaled product from B into A
	exp_exit:
	RET

	;This routine calculates x = (is)^(4/3) * 2^exp by evaluating
	;exp( (4/3)*ln(is) + exp*ln2 ) with the _log/_exp helpers.
	;Inputs: <is> in A, and exp on the stack (second word from SP).
	;Output: whatever _exp leaves in A.
	;Uses AR0, AR3, AR4, T, A, B and the EXP/LNIS scratch words;
	;ST0/ST1 are saved and restored around the computation.
	_descale:
	STM EXP, AR4
	MVMM SP, AR3 ;first extract exp and place in T
	LD *AR3+, T ;skip the word on top of the stack (presumably the return address)
	LD *AR3,T ;T = exp argument
	ST T, *AR4 ;EXP = exp
	PSHM ST0
	PSHM ST1
	RSBX SXM ;we don't want sign extension
	SUB #101, A, B ;now check is to see if we can use the look-up table
	BC look_up, BLT ;is < 101: take ln(is) from exp_lup
	CALL _log ;returns ln(A) in Q11 format in A
	;now multiply by 1.333333 (4/3)
	SFTL A,+15 ;two shifts = shift left by 16,
	SFTL A, +1 ;so AH holds ln(is)
	STM #5555h, T ;T = 4/3 in Q14
	MPYA B ;B = T * AH
	SFTA B, -14, A ;drop the Q14 scaling again
	B calc_x
	look_up:
	SUB #1, A ;subtract 1 from A in order
	STLM A, AR0 ;to use the look up table
	STM exp_lup, AR3
	nop
	LD #5555h, 16, A ;AH = 4/3 in Q14
	MAR *AR3+0 ;index AR3 to look up table
	;(A was loaded with 1.33333 in Q14 format above)
	MPYA *AR3 ;multiply and store in B
	SFTA B, -12, A ;return result in Q14 format
	SFTA A, -3 ;convert to Q11
	calc_x:
	STM LNIS, AR4
	STL A, *AR4 ;LNIS = (4/3)*ln(is)
	STM EXP, AR4
	LD *AR4, T
	MPY #-1, B ;exp is negative. Make positive
	SFTA B, +15, A ;before multiplying
	SFTA A, +1 ;(two shifts = shift left by 16, AH = -exp)
	STM #58B9h, T ;T=ln2
	MPYA B ;B = ln2 * (-exp)
	SFTA B, -14, A ;convert to Q11
	;NOTE(review): resulting Q format depends on how the
	;caller scales exp - confirm

	STL A, *AR4 ;reuse EXP as scratch for the product
	LD *AR4, T
	MPY #-1, B ;make negative again

	STM LNIS, AR4 ;and now add with LNIS
	ADD *AR4, B
	AND #0FFFFh, B, A ;keep the low 16 bits as the argument of _exp
	CALL _exp
	POPM ST1
	POPM ST0
	RET
	.