reinhrst/dct.c

## dct.c
/* This implementation does DCT-II on iOS, using the iOS vDSP FFT methods. Supposedly this is faster than in plain C (i.e. FFTW), but I didn't do any tests to support this claim. For one thing, as far as I understand it, doing DCT takes less clock cycles than FFT, and using this code we do an FFT of size 2N for a DCT of size N (and then some post processing. Anyone is welcome to comment :)

This code was developed using the following 2 sources, and I owe great gratitude to their authors: http://developer.apple.com/library/ios/#documentation/Performance/Conceptual/vDSP_Programming_Guide/UsingFourierTransforms/UsingFourierTransforms.html and http://www.hydrogenaudio.org/forums//lofiversion/index.php/t39574.html

I don't claim I understand everything I do, but the thing is, it works -- as in, it gives me the same result as jtransform with scaling on :) And it's fast enough for my use: on the iPhone 4S I can get 320 DCTs of length 2048 per second. Perhaps I can improve this with some further tweaking (although I'm not sure I have to...)

Obviously the code below should be split between .c and .h file, just putting them all together here!

Note: This code does only 1D DCT. If you need 2D or 3D, this may be a good first step, but it's in no way done.
*/

#import <Accelerate/Accelerate.h>
#import <math.h>

struct dctII_setup_data {
    FFTSetupD fft_setup;
    double* dct_correction_factors;
    vDSP_Length fft_log2length;
};
typedef struct dctII_setup_data dctII_setup_data;

#=======================================

dctII_setup_data dctII_setup(vDSP_Length dct_log2length, FFTRadix radix) {
    vDSP_Length fft_log2length = dct_log2length + 1;
    dctII_setup_data result;
    result.fft_setup = vDSP_create_fftsetupD( fft_log2length, radix);

    int dct_length = 1 << dct_log2length;
    int fft_length = 1 << fft_log2length;
    result.dct_correction_factors = (double*) malloc(sizeof(double) * fft_length);
    for (int i=0;i<dct_length;i++) {
        result.dct_correction_factors[2*i] = cos((M_PI*(dct_length-0.5)*i)/dct_length)/sqrt(dct_length*8.0);
        result.dct_correction_factors[2*i+1] = sin((M_PI*(dct_length-0.5)*i)/dct_length)/sqrt(dct_length*8.0);
    }
    result.dct_correction_factors[0] = result.dct_correction_factors[0]/sqrt(2.0);
    result.dct_correction_factors[1] = result.dct_correction_factors[1]/sqrt(2.0); //although this one is always 0 :)
    result.fft_log2length =  fft_log2length;
    return result;
}


void dctII_1D(dctII_setup_data setup, double* ioData) {

    vDSP_Length fft_length = 1 << setup.fft_log2length;
    vDSP_Length dct_length = fft_length/2;

    double* mirroreddata = malloc(sizeof(double)*fft_length);
    for (int i=0;i<dct_length;i++) {
        mirroreddata[i] = ioData[dct_length-i-1];
    }
    for (int i=dct_length; i<fft_length; i++) {
        mirroreddata[i] = ioData[i-dct_length];
    }

    DSPDoubleSplitComplex* complexData = (DSPDoubleSplitComplex*)calloc(1, sizeof(DSPDoubleSplitComplex));

    complexData->realp = malloc(sizeof(double) * dct_length);
    complexData->imagp = malloc(sizeof(double) * dct_length);

    vDSP_ctozD((DSPDoubleComplex*) mirroreddata, (vDSP_Stride) 2, complexData, (vDSP_Stride) 1, dct_length);

    vDSP_fft_zripD( setup.fft_setup, complexData, (vDSP_Stride) 1, setup.fft_log2length, kFFTDirection_Forward);

    vDSP_ztocD(complexData, (vDSP_Stride) 1, (DSPDoubleComplex *) mirroreddata, (vDSP_Stride) 2, dct_length);

    vDSP_vmulD(mirroreddata, (vDSP_Stride) 1, setup.dct_correction_factors, (vDSP_Stride) 1, mirroreddata, (vDSP_Stride) 1, fft_length);

    for (int i=0; i<dct_length; i++){
        ioData[i] = mirroreddata[2*i]-mirroreddata[2*i+1];
    }

    free(complexData->realp);
    free(complexData->imagp);
    free(complexData);
    free(mirroreddata);
}
	/* This implementation does DCT-II on iOS, using the iOS vDSP FFT methods. Supposedly this is faster than in plain C (i.e. FFTW), but I didn't do any tests to support this claim. For one thing, as far as I understand it, doing DCT takes less clock cycles than FFT, and using this code we do an FFT of size 2N for a DCT of size N (and then some post processing. Anyone is welcome to comment :)

	This code was developed using the following 2 sources, and I owe great gratitude to their authors: http://developer.apple.com/library/ios/#documentation/Performance/Conceptual/vDSP_Programming_Guide/UsingFourierTransforms/UsingFourierTransforms.html and http://www.hydrogenaudio.org/forums//lofiversion/index.php/t39574.html

	I don't claim I understand everything I do, but the thing is, it works -- as in, it gives me the same result as jtransform with scaling on :) And it's fast enough for my use: on the iPhone 4S I can get 320 DCTs of length 2048 per second. Perhaps I can improve this with some further tweaking (although I'm not sure I have to...)

	Obviously the code below should be split between .c and .h file, just putting them all together here!

	Note: This code does only 1D DCT. If you need 2D or 3D, this may be a good first step, but it's in no way done.
	*/

	#import <Accelerate/Accelerate.h>
	#import <math.h>

	struct dctII_setup_data {
	FFTSetupD fft_setup;
	double* dct_correction_factors;
	vDSP_Length fft_log2length;
	};
	typedef struct dctII_setup_data dctII_setup_data;

	#=======================================

	dctII_setup_data dctII_setup(vDSP_Length dct_log2length, FFTRadix radix) {
	vDSP_Length fft_log2length = dct_log2length + 1;
	dctII_setup_data result;
	result.fft_setup = vDSP_create_fftsetupD( fft_log2length, radix);

	int dct_length = 1 << dct_log2length;
	int fft_length = 1 << fft_log2length;
	result.dct_correction_factors = (double) malloc(sizeof(double) fft_length);
	for (int i=0;i<dct_length;i++) {
	result.dct_correction_factors[2i] = cos((M_PI(dct_length-0.5)i)/dct_length)/sqrt(dct_length8.0);
	result.dct_correction_factors[2i+1] = sin((M_PI(dct_length-0.5)i)/dct_length)/sqrt(dct_length8.0);
	}
	result.dct_correction_factors[0] = result.dct_correction_factors[0]/sqrt(2.0);
	result.dct_correction_factors[1] = result.dct_correction_factors[1]/sqrt(2.0); //although this one is always 0 :)
	result.fft_log2length = fft_log2length;
	return result;
	}


	void dctII_1D(dctII_setup_data setup, double* ioData) {

	vDSP_Length fft_length = 1 << setup.fft_log2length;
	vDSP_Length dct_length = fft_length/2;

	double* mirroreddata = malloc(sizeof(double)*fft_length);
	for (int i=0;i<dct_length;i++) {
	mirroreddata[i] = ioData[dct_length-i-1];
	}
	for (int i=dct_length; i<fft_length; i++) {
	mirroreddata[i] = ioData[i-dct_length];
	}

	DSPDoubleSplitComplex* complexData = (DSPDoubleSplitComplex*)calloc(1, sizeof(DSPDoubleSplitComplex));

	complexData->realp = malloc(sizeof(double) * dct_length);
	complexData->imagp = malloc(sizeof(double) * dct_length);

	vDSP_ctozD((DSPDoubleComplex*) mirroreddata, (vDSP_Stride) 2, complexData, (vDSP_Stride) 1, dct_length);

	vDSP_fft_zripD( setup.fft_setup, complexData, (vDSP_Stride) 1, setup.fft_log2length, kFFTDirection_Forward);

	vDSP_ztocD(complexData, (vDSP_Stride) 1, (DSPDoubleComplex *) mirroreddata, (vDSP_Stride) 2, dct_length);

	vDSP_vmulD(mirroreddata, (vDSP_Stride) 1, setup.dct_correction_factors, (vDSP_Stride) 1, mirroreddata, (vDSP_Stride) 1, fft_length);

	for (int i=0; i<dct_length; i++){
	ioData[i] = mirroreddata[2i]-mirroreddata[2i+1];
	}

	free(complexData->realp);
	free(complexData->imagp);
	free(complexData);
	free(mirroreddata);
	}