maikxchd/cbe_spu_fatan.c

## cbe_spu_fatan.c
#ifndef CBE_SPU_FATAN
#define CBE_SPU_FATAN

#include <stdint.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>

// Helper functions to load constant values into quadwords
// These functions load pre-computed constant values into quadwords,
// which can be efficiently used in SIMD operations on the CELL SPE.
// The constant values are stored as arrays of 32-bit integers, and
// the functions use the si_lqa intrinsic to load these values into
// quadwords for use in SIMD computations.

// Global Floating-point constants (32 bit)
static const vector unsigned int _cp_f_pio4 __attribute__((aligned(16))) = { 0x3f490fdb, 0x3f490fdb, 0x3f490fdb, 0x3f490fdb };
static const vector unsigned int _cp_f_t3p8 __attribute__((aligned(16))) = { 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d };
static const vector unsigned int _cp_f_npio2 __attribute__((aligned(16))) = { 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb };
static const vector unsigned int _cp_f_pio2 __attribute__((aligned(16))) = { 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb };
static const vector unsigned int _cp_f_pt66 __attribute__((aligned(16))) = { 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab };
static const vector unsigned int _cp_f_pi __attribute__((aligned(16))) = { 0x40490fdb, 0x40490fdb, 0x40490fdb, 0x40490fdb };
static const vector unsigned int _cp_f_npi __attribute__((aligned(16))) = { 0xc0490fdb, 0xc0490fdb, 0xc0490fdb, 0xc0490fdb };
static const vector unsigned int _cp_f_morebits __attribute__((aligned(16))) = { 0x38800000, 0x38800000, 0x38800000, 0x38800000 };
static const vector unsigned int _cp_f_hmorebits __attribute__((aligned(16))) = { 0x34000000, 0x34000000, 0x34000000, 0x34000000 };

// Helper functions to load constant values into quadwords
static inline qword cp_flpio4(void) { return (qword)_cp_f_pio4; }
static inline qword cp_flt3p8(void) { return (qword)_cp_f_t3p8; }
static inline qword cp_flnpio2(void) { return (qword)_cp_f_npio2; }
static inline qword cp_flpio2(void) { return (qword)_cp_f_pio2; }
static inline qword cp_flpt66(void) { return (qword)_cp_f_pt66; }
static inline qword cp_flpi(void) { return (qword)_cp_f_pi; }
static inline qword cp_flnpi(void) { return (qword)_cp_f_npi; }
static inline qword cp_filzero(void) { return si_ilhu((int16_t)0x0000); }
static inline qword cp_filnzero(void) { return si_ilhu((int16_t)0x8000); }
static inline qword cp_filone(void) { return si_ilhu((int16_t)0x3f80); }
static inline qword cp_filtwo(void) { return si_ilhu((int16_t)0x4000); }
static inline qword cp_filinf(void) { return si_ilhu((int16_t)0x7f80); }
static inline qword cp_filninf(void) { return si_ilhu((int16_t)0xff80); }
static inline qword cp_filnan(void) { return si_ilhu((int16_t)0x7fc0); }

// Polynomial coefficients for cp_fatan approximation
static const vector unsigned int _cp_f_atan_p7 __attribute__((aligned(16))) = { 0x3c08876a, 0x3c08876a, 0x3c08876a, 0x3c08876a };
static const vector unsigned int _cp_f_atan_p6 __attribute__((aligned(16))) = { 0xbd954629, 0xbd954629, 0xbd954629, 0xbd954629 };
static const vector unsigned int _cp_f_atan_p5 __attribute__((aligned(16))) = { 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1 };
static const vector unsigned int _cp_f_atan_p4 __attribute__((aligned(16))) = { 0xbf49eee6, 0xbf49eee6, 0xbf49eee6, 0xbf49eee6 };
static const vector unsigned int _cp_f_atan_p3 __attribute__((aligned(16))) = { 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5 };
static const vector unsigned int _cp_f_atan_p2 __attribute__((aligned(16))) = { 0xbf62365c, 0xbf62365c, 0xbf62365c, 0xbf62365c };
static const vector unsigned int _cp_f_atan_p1 __attribute__((aligned(16))) = { 0x3f490965, 0x3f490965, 0x3f490965, 0x3f490965 };
static const vector unsigned int _cp_f_atan_p0 __attribute__((aligned(16))) = { 0xbf2697e0, 0xbf2697e0, 0xbf2697e0, 0xbf2697e0 };

// Higher-degree polynomial coefficients for cp_fatan approximation
static const vector unsigned int _cp_f_atan_q7 __attribute__((aligned(16))) = { 0x3c0897d0, 0x3c0897d0, 0x3c0897d0, 0x3c0897d0 };
static const vector unsigned int _cp_f_atan_q6 __attribute__((aligned(16))) = { 0xbd890e31, 0xbd890e31, 0xbd890e31, 0xbd890e31 };
static const vector unsigned int _cp_f_atan_q5 __attribute__((aligned(16))) = { 0x3f6c4616, 0x3f6c4616, 0x3f6c4616, 0x3f6c4616 };
static const vector unsigned int _cp_f_atan_q4 __attribute__((aligned(16))) = { 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2 };
static const vector unsigned int _cp_f_atan_q3 __attribute__((aligned(16))) = { 0x3eb6679b, 0x3eb6679b, 0x3eb6679b, 0x3eb6679b };
static const vector unsigned int _cp_f_atan_q2 __attribute__((aligned(16))) = { 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9 };
static const vector unsigned int _cp_f_atan_q1 __attribute__((aligned(16))) = { 0x3f7df927, 0x3f7df927, 0x3f7df927, 0x3f7df927 };
static const vector unsigned int _cp_f_atan_q0 __attribute__((aligned(16))) = { 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 };

#define BUFFER_SIZE 128 // Example buffer size
#define TAG 1 // DMA tag

typedef vector float qword;

// Polynomial approximation using FMA
//
// zdiv   =  P  + P xp2 + P xp2^2 + P xp2^3 + P xp2^4
//            0    1       2        3       4
//           ------------------------------------------
//           Q  + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5
//            0    1       2        3       4
//
// The polynomial approximation is implemented using Estrin's method, evaluating
// the numerator and denominator polynomials separately using FMA instructions,
// and then dividing the numerator by the denominator to obtain the final result.
//
// The numerator is computed as:
// znum0 = f_atan_p0
// znum1 = (znum0 * xp2) + f_atan_p1
// znum2 = (znum1 * xp2) + f_atan_p2
// znum3 = (znum2 * xp2) + f_atan_p3
// znum  = (znum3 * xp2) + f_atan_p4
//
// The denominator is computed as:
// zden0 = xp2 + f_atan_q0
// zden1 = (zden0 * xp2) + f_atan_q1
// zden2 = (zden1 * xp2) + f_atan_q2
// zden3 = (zden2 * xp2) + f_atan_q3
// zden  = (zden3 * xp2) + f_atan_q4 + xp2
//
// The final result, zdiv, is computed by dividing znum by zden using
// additional FMA instructions for improved accuracy.
static inline qword _cp_fatan(const qword x) {
    const qword f_one = (qword){0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000}; // 1.0
    const qword f_zero = (qword){0x00000000, 0x00000000, 0x00000000, 0x00000000}; // 0.0

    const qword f_atan_p0 = (qword)_cp_f_atan_p0;
    const qword f_atan_p1 = (qword)_cp_f_atan_p1;
    const qword f_atan_p2 = (qword)_cp_f_atan_p2;
    const qword f_atan_p3 = (qword)_cp_f_atan_p3;
    const qword f_atan_p4 = (qword)_cp_f_atan_p4;
    const qword f_atan_q0 = (qword)_cp_f_atan_q0;
    const qword f_atan_q1 = (qword)_cp_f_atan_q1;
    const qword f_atan_q2 = (qword)_cp_f_atan_q2;
    const qword f_atan_q3 = (qword)_cp_f_atan_q3;
    const qword f_atan_q4 = (qword)_cp_f_atan_q4;

    // Range reduction
    // ##
    // ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) )
    // ## range1_mask = ( pos_x <= 0.66 )
    // ## range2_mask = !( range0_mask || range1_mask )
    // ##
    //
    // This section performs range reduction, which maps the input values into a smaller range
    // where the arctangent function can be more accurately approximated using a polynomial.
    //
    // range0_mask identifies cases where pos_x is greater than tan(3π/8), which is a constant
    // value f_t3p8. The si_fcgt intrinsic performs a floating-point greater-than comparison.
    //
    // range1_mask identifies cases where pos_x is less than or equal to 0.66, which is a constant
    // value f_pt66. It is computed by combining two masks using si_or: range1_gt_mask (pos_x < 0.66)
    // and range1_eq_mask (pos_x == 0.66).
    //
    // range2_mask identifies the remaining cases that are not covered by range0_mask or range1_mask.
    // It is computed by performing a bitwise NOR operation (si_nor) on range0_mask and range1_mask.
    //
    // These range masks are used to select different computations and approximations for the
    // arctangent function, based on the range in which the input value falls.
    const qword pos_x = si_and(x, (qword)si_ilhu(0x7FFFFFFF));
    const qword range0_mask = si_fcgt(pos_x, (qword)_cp_f_t3p8);
    const qword range1_gt_mask = si_fcgt((qword)_cp_f_pt66, pos_x);
    const qword range1_eq_mask = si_fceq((qword)_cp_f_pt66, pos_x);
    const qword range1_mask = si_or(range1_gt_mask, range1_eq_mask);
    const qword range2_mask = si_nor(range0_mask, range1_mask);

    // Range reduction using FMA
    //
    // range0_x = -1.0
    //            -----
    //            pos_x
    //
    // The range reduction step for the range0 case involves dividing -1.0 by pos_x.
    // This is implemented using a series of FMA (Fused Multiply-Add) instructions, which perform
    // the operation (a * b) + c in a single instruction for improved performance.
    //
    // The steps to compute range0_x are:
    // range0_x0 = pos_x - (trunc(pos_x) * pos_x)  // Compute the fractional part of pos_x
    // range0_x1 = trunc(pos_x)                    // Compute the integer part of pos_x
    // range0_x2 = -range0_x1 * pos_x + 1.0        // Compute -range0_x1 * pos_x and add 1.0
    // range0_x3 = range0_x2 * range0_x1 + range0_x1  // Final result: range0_x3 = -1.0 / pos_x
    //
    // The FMA instructions are used to improve performance by reducing the number of instructions
    // and utilizing specialized hardware for faster execution.
    const qword range0_x0 = si_frest(pos_x);
    const qword range0_x1 = si_fi(pos_x, range0_x0);
    const qword range0_x2 = si_fnms(range0_x1, pos_x, f_one);
    const qword range0_x3 = si_fma(range0_x2, range0_x1, range0_x1);
    const qword range0_x = si_xor(range0_x3, si_ilhu((int16_t)0x8000)); // Negate using f_msb
    const qword range0_y = (qword)_cp_f_pio2;

    // ##
    // ## range1_x = pos_x
    // ## range1_y = 0.0
    // ##
    //
    // For the range1 case, where the input value pos_x is less than or equal to 0.66,
    // the range-specific values are simply set to:
    //
    // range1_x = pos_x
    // range1_y = 0.0
    //
    // This means that the input value itself (pos_x) is used for the arctangent approximation
    // in this range, and the corresponding y-value is set to 0.0.
    const qword range1_x = pos_x;
    const qword range1_y = f_zero;

    // ##
    // ## range2_x = (pos_x-1.0)
    // ##            -----------
    // ##            (pos_x+1.0)
    // ##
    // ## range2_y = PI
    // ##            ---
    // ##            4.0
    // ##
    //
    // For the range2 case, the range reduction involves computing (pos_x - 1.0) / (pos_x + 1.0).
    // This is done using a series of steps:
    // range2_x0num = pos_x - 1.0             // Compute the numerator
    // range2_x0den = pos_x + 1.0             // Compute the denominator
    // range2_x0 = range2_x0den - (trunc(range2_x0den) * range2_x0den)  // Compute the fractional part of the denominator
    // range2_x1 = -range2_x0 * range2_x0den + 1.0  // Compute the reciprocal of the denominator
    // range2_x2 = range2_x1 * range2_x0             // Refine the reciprocal
    // range2_x = range2_x0num * range2_x2           // Final result: range2_x = (pos_x - 1.0) / (pos_x + 1.0)
    //
    // The range2_y value is set to PI/4.0, which is used in the final arctangent computation.
    const qword range2_y = (qword)_cp_f_pio4;
    const qword range2_x0num = si_fs(pos_x, f_one);
    const qword range2_x0den = si_fa(pos_x, f_one);
    const qword range2_x0 = si_frest(range2_x0den);
    const qword range2_x1 = si_fnms(range2_x0, range2_x0den, f_one);
    const qword range2_x2 = si_fma(range2_x1, range2_x0, range2_x0);
    const qword range2_x = si_fm(range2_x0num, range2_x2);

    // ##
    // ## range_x  = range0_x { range0_mask
    // ##            range1_x { range1_mask
    // ##            range2_x { range2_mask
    // ##
    // ## range_y  = range0_y { range0_mask
    // ##            range1_y { range1_mask
    // ##            range2_y { range2_mask
    // ##
    //
    // Based on the range masks computed earlier (range0_mask, range1_mask, range2_mask),
    // this section selects the appropriate range-specific values for x (range_x) and y (range_y).
    //
    // The range-specific values (range0_x, range1_x, range2_x, range0_y, range1_y, range2_y)
    // are precomputed constants that are used in the arctangent approximation for each range.
    //
    // The si_selb intrinsic is used to select the appropriate value based on the range masks.
    // It performs a conditional selection operation, selecting the first value if the corresponding
    // mask bit is 0, and the second value if the mask bit is 1.
    //
    // For range_x:
    // 1. range_x0 is computed by selecting between range2_x and range0_x based on range0_mask.
    // 2. range_x is computed by selecting between range_x0 and range1_x based on range1_mask.
    //
    // For range_y:
    // 1. range_y0 is computed by selecting between range2_y and range0_y based on range0_mask.
    // 2. range_y is computed by selecting between range_y0 and range1_y based on range1_mask.
    //
    // These range-specific values are used in the subsequent steps of the arctangent approximation.
    const qword range_x0 = si_selb(range2_x, range0_x, range0_mask);
    const qword range_x = si_selb(range_x0, range1_x, range1_mask);
    const qword range_y0 = si_selb(range2_y, range0_y, range0_mask);
    const qword range_y = si_selb(range_y0, range1_y, range1_mask);

    // Polynomial approximation using FMA
    //
    // zdiv   =  P  + P xp2 + P xp2^2 + P xp2^3 + P xp2^4
    //            0    1       2        3       4
    //           ------------------------------------------
    //           Q  + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5
    //            0    1       2        3       4
    //
    // The polynomial approximation is implemented using Estrin's method, evaluating
    // the numerator and denominator polynomials separately using FMA instructions,
    // and then dividing the numerator by the denominator to obtain the final result.
    const qword xp2 = si_fm(range_x, range_x);
    const qword znum0 = f_atan_p0;
    const qword znum1 = si_fma(znum0, xp2, f_atan_p1); // FMA: (znum0 * xp2) + f_atan_p1
    const qword znum2 = si_fma(znum1, xp2, f_atan_p2); // FMA: (znum1 * xp2) + f_atan_p2
    const qword znum3 = si_fma(znum2, xp2, f_atan_p3); // FMA: (znum2 * xp2) + f_atan_p3
    const qword znum = si_fma(znum3, xp2, f_atan_p4); // FMA: (znum3 * xp2) + f_atan_p4

    const qword zden0 = si_fa(xp2, f_atan_q0);
    const qword zden1 = si_fma(zden0, xp2, f_atan_q1); // FMA: (zden0 * xp2) + f_atan_q1
    const qword zden2 = si_fma(zden1, xp2, f_atan_q2); // FMA: (zden1 * xp2) + f_atan_q2
    const qword zden3 = si_fma(zden2, xp2, f_atan_q3); // FMA: (zden2 * xp2) + f_atan_q3
    const qword zden = si_fma(zden3, xp2, f_atan_q4); // FMA: (zden3 * xp2) + f_atan_q4

    // Refine the denominator value
    const qword zden_r0 = si_frest(zden);
    const qword zden_r1 = si_fnms(zden_r0, zden, f_one);
    const qword zden_r = si_fma(zden_r1, zden_r0, zden_r0);

    // Final division
    const qword zdiv = si_fm(znum, zden_r);

    // Final result with range reduction
    const qword z0 = si_fm(xp2, zdiv);
    const qword z1 = si_fma(range_x, z0, range_x);
    const qword zadd0 = si_selb(f_zero, (qword)_cp_f_hmorebits, range2_mask);
    const qword zadd1 = si_selb(zadd0, (qword)_cp_f_morebits, range1_mask);
    const qword zadd = si_fa(z1, zadd1);
    const qword yaddz = si_fa(range_y, zadd);
    const qword neg_yaddz = si_xor(yaddz, si_ilhu((int16_t)0x8000));
    const qword pos_yaddz = si_selb(yaddz, neg_yaddz, si_fcgt(f_zero, x));

    // Handling special cases
    const qword x_eqz_mask = si_fceq(f_zero, x);
    const qword result_y0 = si_selb(pos_yaddz, x, x_eqz_mask);

    const qword x_eqinf_mask = si_fceq((qword)_cp_f_pi, x);
    const qword x_eqninf_mask = si_fceq((qword)_cp_f_npi, x);
    const qword result_y1 = si_selb(result_y0, (qword)_cp_f_pio2, x_eqinf_mask);
    const qword result = si_selb(result_y1, (qword)_cp_f_npio2, x_eqninf_mask);

    return result;
}

// Handles special cases for atan2
// ##   Additionally, the atan2 function handles special cases where either x or y is zero,
// ##   positive infinity, or negative infinity, as described in the special cases section.
// ##
// ##   The implementation of cp_fatan2 follows these mathematical relationships and special
// ##   cases to compute the correct angle for any given (x, y) coordinate pair.
// ##
// ##   Special cases:
// ##     - If y is +0 and x is negative, the result is +pi
// ##     - If y is -0 and x is negative, the result is -pi
// ##     - If y is +0 and x is positive, the result is +0
// ##     - If y is -0 and x is positive, the result is -0
// ##     - If y is negative and x is 0, the result is -pi or pi/2 (depending on sign of y)
// ##     - If y is positive and x is 0, the result is pi or pi/2 (depending on sign of y)
// ##     - If either y or x is NaN, the result is NaN
// ##     - If y is +0 and x is -0, the result is +pi
// ##     - If y is -0 and x is -0, the result is -pi
// ##     - If y is +0 and x is +0, the result is +0
// ##     - If y is -0 and x is +0, the result is -0
// ##     - If y is +inf and x is +inf, the result is pi/4
// ##     - If y is -inf and x is +inf, the result is -pi/4
// ##     - If y is +inf and x is -inf, the result is 3pi/4
// ##     - If y is -inf and x is -inf, the result is -3pi/4
// ##     - If y is finite and positive, and x is -inf, the result is +pi
// ##     - If y is finite and negative, and x is -inf, the result is -pi
// ##     - If y is finite and positive, and x is +inf, the result is +0
// ##     - If y is finite and negative, and x is +inf, the result is -0
// ##     - If x is finite and y is +inf, the result is pi/2
// ##     - If x is finite and y is -inf, the result is -pi/2
// ##
// ##   General cases:
// ##     - If x is negative and y is non-negative, the result is pi + atan(y/x)
// ##     - If x is negative and y is negative, the result is -pi + atan(y/x)
// ##     - Otherwise, the result is atan(y/x)
static inline qword _cp_fatan2_special_cases(qword y, qword x, qword result) {
    const qword f_zero = (qword){0x00000000, 0x00000000, 0x00000000, 0x00000000}; // 0.0
    const qword f_pi = (qword)_cp_f_pi;
    const qword f_npi = (qword)_cp_f_npi;
    const qword f_inf = (qword){0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; // inf
    const qword f_ninf = (qword){0xff800000, 0xff800000, 0xff800000, 0xff800000}; // -inf

    const qword x_eqz_mask = si_fceq(f_zero, x);
    const qword y_eqz_mask = si_fceq(f_zero, y);
    const qword x_eqinf_mask = si_fceq(f_inf, x);
    const qword x_eqninf_mask = si_fceq(f_ninf, x);
    const qword y_eqinf_mask = si_fceq(f_inf, y);
    const qword y_eqninf_mask = si_fceq(f_ninf, y);

    const qword zero_by_zero = si_and(x_eqz_mask, y_eqz_mask);
    const qword inf_by_inf = si_and(x_eqinf_mask, y_eqinf_mask);
    const qword zero_by_any = si_and(x_eqz_mask, si_or(y_eqinf_mask, y_eqninf_mask));
    const qword any_by_zero = si_and(y_eqz_mask, si_or(x_eqinf_mask, x_eqninf_mask));

    const qword zero_by_zero_res = f_zero;
    const qword inf_by_inf_res = si_selb(f_pi, f_npi, si_fcgts(y, f_zero));
    const qword zero_by_any_res = f_zero;
    const qword any_by_zero_res = si_selb(f_pi, f_npi, si_fcgts(x, f_zero));

    result = si_selb(result, zero_by_zero_res, zero_by_zero);
    result = si_selb(result, inf_by_inf_res, inf_by_inf);
    result = si_selb(result, zero_by_any_res, zero_by_any);
    result = si_selb(result, any_by_zero_res, any_by_zero);

    return result;
}

// ## cp_fatan2(y, x)
// ##
// ## Function to compute the angle (in radians) between the positive x-axis and the
// ## point (x, y) in the Cartesian plane, using the atan2 function.
// ##
// ## Input:
// ##   y: float or vector float representing the y-coordinate
// ##   x: float or vector float representing the x-coordinate
// ##
// ## Output:
// ##   The angle (theta) in radians between the positive x-axis and the point (x, y)
// ##   in the range [-pi, pi].
// ##
// ## Mathematical Description:
// ##   The atan2 function is a variation of the arctangent function, which computes
// ##   the angle (in radians) between the positive x-axis and the vector (x, y) in the
// ##   Cartesian plane. It is defined as:
// ##
// ##     atan2(y, x) = arctan(y/x)
// ##
// ##   However, the standard arctangent function (arctan) has a range of [-pi/2, pi/2],
// ##   which is not sufficient to represent all possible angles in the Cartesian plane.
// ##   The atan2 function extends the range to [-pi, pi] by considering the signs of
// ##   both x and y.
// ##
// ##   The mathematical relationships governing atan2 are as follows:
// ##
// ##     atan2(y, x) = arctan(y/x)                     (for x > 0)
// ##     atan2(y, x) = arctan(y/x) + pi                (for x < 0 and y >= 0)
// ##     atan2(y, x) = arctan(y/x) - pi                (for x < 0 and y < 0)
qword _cp_fatan2(qword y, qword x) {
    const qword f_one = (qword){0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000}; // 1.0
    const qword f_zero = (qword){0x00000000, 0x00000000, 0x00000000, 0x00000000}; // 0.0
    const qword f_pi = (qword)_cp_f_pi;
    const qword f_npi = (qword)_cp_f_npi;

    // Compute the reciprocal of x
    const qword x_r0 = si_frest(x);
    const qword x_r1 = si_fnms(x_r0, x, f_one);
    const qword x_r = si_fma(x_r1, x_r0, x_r0);

    // Compute y / x
    const qword yox = si_fm(y, x_r);

    // Compute the arctangent of y / x using polynomial approximation
    const qword atan_yox = _cp_fatan(yox);

    // Determine the quadrant and adjust the result accordingly
    const qword x_ltz_mask = si_fcgt(f_zero, x);
    const qword y_ltz_mask = si_fcgt(f_zero, y);
    const qword xy_ltz_mask = si_and(x_ltz_mask, y_ltz_mask);
    const qword zadd0 = si_selb(f_zero, f_pi, x_ltz_mask);
    const qword zadd = si_selb(zadd0, f_npi, xy_ltz_mask);
    const qword result = si_fa(zadd, atan_yox);

    // Handle special cases and return the final result
    return _cp_fatan2_special_cases(y, x, result);
}

vector float cp_fatan2(vector float arg0 /* y */, vector float arg1 /* x */) {
    const qword y = (qword)arg0;
    const qword x = (qword)arg1;
    const qword result = _cp_fatan2(y, x);
    return (vector float)(result);
}

// cp_fatan2_scalar(arg0, arg1)
//
// Computes the atan2(y, x) function using _cp_fatan2 for single-precision float inputs.
//
// Input:
//   arg0: The y-coordinate as a single-precision float.
//   arg1: The x-coordinate as a single-precision float.
//
// Output:
//   The angle (theta) in radians between the positive x-axis and the point (x, y),
//   in the range [-pi, pi], as a single-precision float.
//
// This function is a scalar wrapper around _cp_fatan2, which computes atan2(y, x) using
// SIMD operations. The steps involved are:
// 1. Convert the input floats arg0 (y) and arg1 (x) to quadwords y and x using si_from_float.
// 2. Call _cp_fatan2(y, x) to compute the atan2(y, x) value, storing the result in z.
// 3. Convert the quadword z back to a float result using si_to_float.
// 4. Return the float result.
//
// This function provides a convenient way to compute atan2(y, x) for single-precision float
// inputs while leveraging the SIMD capabilities of the _cp_fatan2 function.
float cp_fatan2_scalar(float arg0 /* y */, float arg1 /* x */) {
    const qword y = si_from_float(arg0);
    const qword x = si_from_float(arg1);
    const qword z = _cp_fatan2(y, x);
    const float result = si_to_float(z);
    return result;
}

// DMA transfer function
void dma_transfer(qword *buffer, uint32_t ea, int tag) {
    mfc_get(buffer, ea, BUFFER_SIZE * sizeof(qword), tag, 0, 0);
    mfc_write_tag_mask(1 << tag);
    mfc_read_tag_status_all();
}

// Double buffering for DMA transfers
void process_with_double_buffering(uint32_t ea_data, uint32_t ea_result, int num_elements) {
    qword buffer0[BUFFER_SIZE] __attribute__((aligned(16)));
    qword buffer1[BUFFER_SIZE] __attribute__((aligned(16)));
    qword *current_buffer = buffer0;
    qword *next_buffer = buffer1;
    int current_tag = TAG;
    int next_tag = TAG + 1;

    // Initial DMA transfer
    dma_transfer(current_buffer, ea_data, current_tag);

    for (int i = 0; i < num_elements; i += BUFFER_SIZE) {
        // Start next DMA transfer
        if (i + BUFFER_SIZE < num_elements) {
            dma_transfer(next_buffer, ea_data + (i + BUFFER_SIZE) * sizeof(qword), next_tag);
        }

        // Process current buffer
        for (int j = 0; j < BUFFER_SIZE && (i + j) < num_elements; ++j) {
            qword y = current_buffer[j * 2];
            qword x = current_buffer[j * 2 + 1];
            qword result = _cp_fatan2(y, x);
            current_buffer[j] = result;
        }

        // Write results back
        mfc_put(current_buffer, ea_result + i * sizeof(qword), BUFFER_SIZE * sizeof(qword), current_tag, 0, 0);
        mfc_write_tag_mask(1 << current_tag);
        mfc_read_tag_status_all();

        // Swap buffers and tags
        qword *temp_buffer = current_buffer;
        current_buffer = next_buffer;
        next_buffer = temp_buffer;
        int temp_tag = current_tag;
        current_tag = next_tag;
        next_tag = temp_tag;
    }
}

#endif /* CBE_SPU_FATAN */