Last active
May 16, 2024 07:45
-
-
Save maikxchd/d6797078239928663b9f0d8176b139e2 to your computer and use it in GitHub Desktop.
An implementation of a function to calculate arctangent approximations on the CELL SPU using the Estrin Method
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef CBE_SPU_FATAN | |
#define CBE_SPU_FATAN | |
#include <stdint.h> | |
#include <spu_intrinsics.h> | |
#include <spu_mfcio.h> | |
// Helper functions to load constant values into quadwords | |
// These functions load pre-computed constant values into quadwords, | |
// which can be efficiently used in SIMD operations on the CELL SPE. | |
// The constant values are stored as arrays of 32-bit integers, and | |
// the functions use the si_lqa intrinsic to load these values into | |
// quadwords for use in SIMD computations. | |
// Global Floating-point constants (32 bit) | |
static const vector unsigned int _cp_f_pio4 __attribute__((aligned(16))) = { 0x3f490fdb, 0x3f490fdb, 0x3f490fdb, 0x3f490fdb }; | |
static const vector unsigned int _cp_f_t3p8 __attribute__((aligned(16))) = { 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d, 0x3fe5ec5d }; | |
static const vector unsigned int _cp_f_npio2 __attribute__((aligned(16))) = { 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb, 0xbfc90fdb }; | |
static const vector unsigned int _cp_f_pio2 __attribute__((aligned(16))) = { 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb, 0x3fc90fdb }; | |
static const vector unsigned int _cp_f_pt66 __attribute__((aligned(16))) = { 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab }; | |
static const vector unsigned int _cp_f_pi __attribute__((aligned(16))) = { 0x40490fdb, 0x40490fdb, 0x40490fdb, 0x40490fdb }; | |
static const vector unsigned int _cp_f_npi __attribute__((aligned(16))) = { 0xc0490fdb, 0xc0490fdb, 0xc0490fdb, 0xc0490fdb }; | |
static const vector unsigned int _cp_f_morebits __attribute__((aligned(16))) = { 0x38800000, 0x38800000, 0x38800000, 0x38800000 }; | |
static const vector unsigned int _cp_f_hmorebits __attribute__((aligned(16))) = { 0x34000000, 0x34000000, 0x34000000, 0x34000000 }; | |
// Helper functions to load constant values into quadwords | |
static inline qword cp_flpio4(void) { return (qword)_cp_f_pio4; } | |
static inline qword cp_flt3p8(void) { return (qword)_cp_f_t3p8; } | |
static inline qword cp_flnpio2(void) { return (qword)_cp_f_npio2; } | |
static inline qword cp_flpio2(void) { return (qword)_cp_f_pio2; } | |
static inline qword cp_flpt66(void) { return (qword)_cp_f_pt66; } | |
static inline qword cp_flpi(void) { return (qword)_cp_f_pi; } | |
static inline qword cp_flnpi(void) { return (qword)_cp_f_npi; } | |
static inline qword cp_filzero(void) { return si_ilhu((int16_t)0x0000); } | |
static inline qword cp_filnzero(void) { return si_ilhu((int16_t)0x8000); } | |
static inline qword cp_filone(void) { return si_ilhu((int16_t)0x3f80); } | |
static inline qword cp_filtwo(void) { return si_ilhu((int16_t)0x4000); } | |
static inline qword cp_filinf(void) { return si_ilhu((int16_t)0x7f80); } | |
static inline qword cp_filninf(void) { return si_ilhu((int16_t)0xff80); } | |
static inline qword cp_filnan(void) { return si_ilhu((int16_t)0x7fc0); } | |
// Polynomial coefficients for cp_fatan approximation | |
static const vector unsigned int _cp_f_atan_p7 __attribute__((aligned(16))) = { 0x3c08876a, 0x3c08876a, 0x3c08876a, 0x3c08876a }; | |
static const vector unsigned int _cp_f_atan_p6 __attribute__((aligned(16))) = { 0xbd954629, 0xbd954629, 0xbd954629, 0xbd954629 }; | |
static const vector unsigned int _cp_f_atan_p5 __attribute__((aligned(16))) = { 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1, 0x3f8a07c1 }; | |
static const vector unsigned int _cp_f_atan_p4 __attribute__((aligned(16))) = { 0xbf49eee6, 0xbf49eee6, 0xbf49eee6, 0xbf49eee6 }; | |
static const vector unsigned int _cp_f_atan_p3 __attribute__((aligned(16))) = { 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5, 0x3ee4f8b5 }; | |
static const vector unsigned int _cp_f_atan_p2 __attribute__((aligned(16))) = { 0xbf62365c, 0xbf62365c, 0xbf62365c, 0xbf62365c }; | |
static const vector unsigned int _cp_f_atan_p1 __attribute__((aligned(16))) = { 0x3f490965, 0x3f490965, 0x3f490965, 0x3f490965 }; | |
static const vector unsigned int _cp_f_atan_p0 __attribute__((aligned(16))) = { 0xbf2697e0, 0xbf2697e0, 0xbf2697e0, 0xbf2697e0 }; | |
// Higher-degree polynomial coefficients for cp_fatan approximation | |
static const vector unsigned int _cp_f_atan_q7 __attribute__((aligned(16))) = { 0x3c0897d0, 0x3c0897d0, 0x3c0897d0, 0x3c0897d0 }; | |
static const vector unsigned int _cp_f_atan_q6 __attribute__((aligned(16))) = { 0xbd890e31, 0xbd890e31, 0xbd890e31, 0xbd890e31 }; | |
static const vector unsigned int _cp_f_atan_q5 __attribute__((aligned(16))) = { 0x3f6c4616, 0x3f6c4616, 0x3f6c4616, 0x3f6c4616 }; | |
static const vector unsigned int _cp_f_atan_q4 __attribute__((aligned(16))) = { 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2, 0xbf2bbfc2 }; | |
static const vector unsigned int _cp_f_atan_q3 __attribute__((aligned(16))) = { 0x3eb6679b, 0x3eb6679b, 0x3eb6679b, 0x3eb6679b }; | |
static const vector unsigned int _cp_f_atan_q2 __attribute__((aligned(16))) = { 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9, 0xbf56c0c9 }; | |
static const vector unsigned int _cp_f_atan_q1 __attribute__((aligned(16))) = { 0x3f7df927, 0x3f7df927, 0x3f7df927, 0x3f7df927 }; | |
static const vector unsigned int _cp_f_atan_q0 __attribute__((aligned(16))) = { 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 }; | |
#define BUFFER_SIZE 128 // Example buffer size | |
#define TAG 1 // DMA tag | |
typedef vector float qword; | |
// Polynomial approximation using FMA | |
// | |
// zdiv = P + P xp2 + P xp2^2 + P xp2^3 + P xp2^4 | |
// 0 1 2 3 4 | |
// ------------------------------------------ | |
// Q + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5 | |
// 0 1 2 3 4 | |
// | |
// The polynomial approximation is implemented using Estrin's method, evaluating | |
// the numerator and denominator polynomials separately using FMA instructions, | |
// and then dividing the numerator by the denominator to obtain the final result. | |
// | |
// The numerator is computed as: | |
// znum0 = f_atan_p0 | |
// znum1 = (znum0 * xp2) + f_atan_p1 | |
// znum2 = (znum1 * xp2) + f_atan_p2 | |
// znum3 = (znum2 * xp2) + f_atan_p3 | |
// znum = (znum3 * xp2) + f_atan_p4 | |
// | |
// The denominator is computed as: | |
// zden0 = xp2 + f_atan_q0 | |
// zden1 = (zden0 * xp2) + f_atan_q1 | |
// zden2 = (zden1 * xp2) + f_atan_q2 | |
// zden3 = (zden2 * xp2) + f_atan_q3 | |
// zden = (zden3 * xp2) + f_atan_q4 + xp2 | |
// | |
// The final result, zdiv, is computed by dividing znum by zden using | |
// additional FMA instructions for improved accuracy. | |
static inline qword _cp_fatan(const qword x) { | |
const qword f_one = (qword){0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000}; // 1.0 | |
const qword f_zero = (qword){0x00000000, 0x00000000, 0x00000000, 0x00000000}; // 0.0 | |
const qword f_atan_p0 = (qword)_cp_f_atan_p0; | |
const qword f_atan_p1 = (qword)_cp_f_atan_p1; | |
const qword f_atan_p2 = (qword)_cp_f_atan_p2; | |
const qword f_atan_p3 = (qword)_cp_f_atan_p3; | |
const qword f_atan_p4 = (qword)_cp_f_atan_p4; | |
const qword f_atan_q0 = (qword)_cp_f_atan_q0; | |
const qword f_atan_q1 = (qword)_cp_f_atan_q1; | |
const qword f_atan_q2 = (qword)_cp_f_atan_q2; | |
const qword f_atan_q3 = (qword)_cp_f_atan_q3; | |
const qword f_atan_q4 = (qword)_cp_f_atan_q4; | |
// Range reduction | |
// ## | |
// ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) ) | |
// ## range1_mask = ( pos_x <= 0.66 ) | |
// ## range2_mask = !( range0_mask || range1_mask ) | |
// ## | |
// | |
// This section performs range reduction, which maps the input values into a smaller range | |
// where the arctangent function can be more accurately approximated using a polynomial. | |
// | |
// range0_mask identifies cases where pos_x is greater than tan(3π/8), which is a constant | |
// value f_t3p8. The si_fcgt intrinsic performs a floating-point greater-than comparison. | |
// | |
// range1_mask identifies cases where pos_x is less than or equal to 0.66, which is a constant | |
// value f_pt66. It is computed by combining two masks using si_or: range1_gt_mask (pos_x < 0.66) | |
// and range1_eq_mask (pos_x == 0.66). | |
// | |
// range2_mask identifies the remaining cases that are not covered by range0_mask or range1_mask. | |
// It is computed by performing a bitwise NOR operation (si_nor) on range0_mask and range1_mask. | |
// | |
// These range masks are used to select different computations and approximations for the | |
// arctangent function, based on the range in which the input value falls. | |
const qword pos_x = si_and(x, (qword)si_ilhu(0x7FFFFFFF)); | |
const qword range0_mask = si_fcgt(pos_x, (qword)_cp_f_t3p8); | |
const qword range1_gt_mask = si_fcgt((qword)_cp_f_pt66, pos_x); | |
const qword range1_eq_mask = si_fceq((qword)_cp_f_pt66, pos_x); | |
const qword range1_mask = si_or(range1_gt_mask, range1_eq_mask); | |
const qword range2_mask = si_nor(range0_mask, range1_mask); | |
// Range reduction using FMA | |
// | |
// range0_x = -1.0 | |
// ----- | |
// pos_x | |
// | |
// The range reduction step for the range0 case involves dividing -1.0 by pos_x. | |
// This is implemented using a series of FMA (Fused Multiply-Add) instructions, which perform | |
// the operation (a * b) + c in a single instruction for improved performance. | |
// | |
// The steps to compute range0_x are: | |
// range0_x0 = pos_x - (trunc(pos_x) * pos_x) // Compute the fractional part of pos_x | |
// range0_x1 = trunc(pos_x) // Compute the integer part of pos_x | |
// range0_x2 = -range0_x1 * pos_x + 1.0 // Compute -range0_x1 * pos_x and add 1.0 | |
// range0_x3 = range0_x2 * range0_x1 + range0_x1 // Final result: range0_x3 = -1.0 / pos_x | |
// | |
// The FMA instructions are used to improve performance by reducing the number of instructions | |
// and utilizing specialized hardware for faster execution. | |
const qword range0_x0 = si_frest(pos_x); | |
const qword range0_x1 = si_fi(pos_x, range0_x0); | |
const qword range0_x2 = si_fnms(range0_x1, pos_x, f_one); | |
const qword range0_x3 = si_fma(range0_x2, range0_x1, range0_x1); | |
const qword range0_x = si_xor(range0_x3, si_ilhu((int16_t)0x8000)); // Negate using f_msb | |
const qword range0_y = (qword)_cp_f_pio2; | |
// ## | |
// ## range1_x = pos_x | |
// ## range1_y = 0.0 | |
// ## | |
// | |
// For the range1 case, where the input value pos_x is less than or equal to 0.66, | |
// the range-specific values are simply set to: | |
// | |
// range1_x = pos_x | |
// range1_y = 0.0 | |
// | |
// This means that the input value itself (pos_x) is used for the arctangent approximation | |
// in this range, and the corresponding y-value is set to 0.0. | |
const qword range1_x = pos_x; | |
const qword range1_y = f_zero; | |
// ## | |
// ## range2_x = (pos_x-1.0) | |
// ## ----------- | |
// ## (pos_x+1.0) | |
// ## | |
// ## range2_y = PI | |
// ## --- | |
// ## 4.0 | |
// ## | |
// | |
// For the range2 case, the range reduction involves computing (pos_x - 1.0) / (pos_x + 1.0). | |
// This is done using a series of steps: | |
// range2_x0num = pos_x - 1.0 // Compute the numerator | |
// range2_x0den = pos_x + 1.0 // Compute the denominator | |
// range2_x0 = range2_x0den - (trunc(range2_x0den) * range2_x0den) // Compute the fractional part of the denominator | |
// range2_x1 = -range2_x0 * range2_x0den + 1.0 // Compute the reciprocal of the denominator | |
// range2_x2 = range2_x1 * range2_x0 // Refine the reciprocal | |
// range2_x = range2_x0num * range2_x2 // Final result: range2_x = (pos_x - 1.0) / (pos_x + 1.0) | |
// | |
// The range2_y value is set to PI/4.0, which is used in the final arctangent computation. | |
const qword range2_y = (qword)_cp_f_pio4; | |
const qword range2_x0num = si_fs(pos_x, f_one); | |
const qword range2_x0den = si_fa(pos_x, f_one); | |
const qword range2_x0 = si_frest(range2_x0den); | |
const qword range2_x1 = si_fnms(range2_x0, range2_x0den, f_one); | |
const qword range2_x2 = si_fma(range2_x1, range2_x0, range2_x0); | |
const qword range2_x = si_fm(range2_x0num, range2_x2); | |
// ## | |
// ## range_x = range0_x { range0_mask | |
// ## range1_x { range1_mask | |
// ## range2_x { range2_mask | |
// ## | |
// ## range_y = range0_y { range0_mask | |
// ## range1_y { range1_mask | |
// ## range2_y { range2_mask | |
// ## | |
// | |
// Based on the range masks computed earlier (range0_mask, range1_mask, range2_mask), | |
// this section selects the appropriate range-specific values for x (range_x) and y (range_y). | |
// | |
// The range-specific values (range0_x, range1_x, range2_x, range0_y, range1_y, range2_y) | |
// are precomputed constants that are used in the arctangent approximation for each range. | |
// | |
// The si_selb intrinsic is used to select the appropriate value based on the range masks. | |
// It performs a conditional selection operation, selecting the first value if the corresponding | |
// mask bit is 0, and the second value if the mask bit is 1. | |
// | |
// For range_x: | |
// 1. range_x0 is computed by selecting between range2_x and range0_x based on range0_mask. | |
// 2. range_x is computed by selecting between range_x0 and range1_x based on range1_mask. | |
// | |
// For range_y: | |
// 1. range_y0 is computed by selecting between range2_y and range0_y based on range0_mask. | |
// 2. range_y is computed by selecting between range_y0 and range1_y based on range1_mask. | |
// | |
// These range-specific values are used in the subsequent steps of the arctangent approximation. | |
const qword range_x0 = si_selb(range2_x, range0_x, range0_mask); | |
const qword range_x = si_selb(range_x0, range1_x, range1_mask); | |
const qword range_y0 = si_selb(range2_y, range0_y, range0_mask); | |
const qword range_y = si_selb(range_y0, range1_y, range1_mask); | |
// Polynomial approximation using FMA | |
// | |
// zdiv = P + P xp2 + P xp2^2 + P xp2^3 + P xp2^4 | |
// 0 1 2 3 4 | |
// ------------------------------------------ | |
// Q + Q xp2 + Q xp2^2 + Q xp2^3 + Q xp2^4 + xp2^5 | |
// 0 1 2 3 4 | |
// | |
// The polynomial approximation is implemented using Estrin's method, evaluating | |
// the numerator and denominator polynomials separately using FMA instructions, | |
// and then dividing the numerator by the denominator to obtain the final result. | |
const qword xp2 = si_fm(range_x, range_x); | |
const qword znum0 = f_atan_p0; | |
const qword znum1 = si_fma(znum0, xp2, f_atan_p1); // FMA: (znum0 * xp2) + f_atan_p1 | |
const qword znum2 = si_fma(znum1, xp2, f_atan_p2); // FMA: (znum1 * xp2) + f_atan_p2 | |
const qword znum3 = si_fma(znum2, xp2, f_atan_p3); // FMA: (znum2 * xp2) + f_atan_p3 | |
const qword znum = si_fma(znum3, xp2, f_atan_p4); // FMA: (znum3 * xp2) + f_atan_p4 | |
const qword zden0 = si_fa(xp2, f_atan_q0); | |
const qword zden1 = si_fma(zden0, xp2, f_atan_q1); // FMA: (zden0 * xp2) + f_atan_q1 | |
const qword zden2 = si_fma(zden1, xp2, f_atan_q2); // FMA: (zden1 * xp2) + f_atan_q2 | |
const qword zden3 = si_fma(zden2, xp2, f_atan_q3); // FMA: (zden2 * xp2) + f_atan_q3 | |
const qword zden = si_fma(zden3, xp2, f_atan_q4); // FMA: (zden3 * xp2) + f_atan_q4 | |
// Refine the denominator value | |
const qword zden_r0 = si_frest(zden); | |
const qword zden_r1 = si_fnms(zden_r0, zden, f_one); | |
const qword zden_r = si_fma(zden_r1, zden_r0, zden_r0); | |
// Final division | |
const qword zdiv = si_fm(znum, zden_r); | |
// Final result with range reduction | |
const qword z0 = si_fm(xp2, zdiv); | |
const qword z1 = si_fma(range_x, z0, range_x); | |
const qword zadd0 = si_selb(f_zero, (qword)_cp_f_hmorebits, range2_mask); | |
const qword zadd1 = si_selb(zadd0, (qword)_cp_f_morebits, range1_mask); | |
const qword zadd = si_fa(z1, zadd1); | |
const qword yaddz = si_fa(range_y, zadd); | |
const qword neg_yaddz = si_xor(yaddz, si_ilhu((int16_t)0x8000)); | |
const qword pos_yaddz = si_selb(yaddz, neg_yaddz, si_fcgt(f_zero, x)); | |
// Handling special cases | |
const qword x_eqz_mask = si_fceq(f_zero, x); | |
const qword result_y0 = si_selb(pos_yaddz, x, x_eqz_mask); | |
const qword x_eqinf_mask = si_fceq((qword)_cp_f_pi, x); | |
const qword x_eqninf_mask = si_fceq((qword)_cp_f_npi, x); | |
const qword result_y1 = si_selb(result_y0, (qword)_cp_f_pio2, x_eqinf_mask); | |
const qword result = si_selb(result_y1, (qword)_cp_f_npio2, x_eqninf_mask); | |
return result; | |
} | |
// Handles special cases for atan2 | |
// ## Additionally, the atan2 function handles special cases where either x or y is zero, | |
// ## positive infinity, or negative infinity, as described in the special cases section. | |
// ## | |
// ## The implementation of cp_fatan2 follows these mathematical relationships and special | |
// ## cases to compute the correct angle for any given (x, y) coordinate pair. | |
// ## | |
// ## Special cases: | |
// ## - If y is +0 and x is negative, the result is +pi | |
// ## - If y is -0 and x is negative, the result is -pi | |
// ## - If y is +0 and x is positive, the result is +0 | |
// ## - If y is -0 and x is positive, the result is -0 | |
// ## - If y is negative and x is 0, the result is -pi or pi/2 (depending on sign of y) | |
// ## - If y is positive and x is 0, the result is pi or pi/2 (depending on sign of y) | |
// ## - If either y or x is NaN, the result is NaN | |
// ## - If y is +0 and x is -0, the result is +pi | |
// ## - If y is -0 and x is -0, the result is -pi | |
// ## - If y is +0 and x is +0, the result is +0 | |
// ## - If y is -0 and x is +0, the result is -0 | |
// ## - If y is +inf and x is +inf, the result is pi/4 | |
// ## - If y is -inf and x is +inf, the result is -pi/4 | |
// ## - If y is +inf and x is -inf, the result is 3pi/4 | |
// ## - If y is -inf and x is -inf, the result is -3pi/4 | |
// ## - If y is finite and positive, and x is -inf, the result is +pi | |
// ## - If y is finite and negative, and x is -inf, the result is -pi | |
// ## - If y is finite and positive, and x is +inf, the result is +0 | |
// ## - If y is finite and negative, and x is +inf, the result is -0 | |
// ## - If x is finite and y is +inf, the result is pi/2 | |
// ## - If x is finite and y is -inf, the result is -pi/2 | |
// ## | |
// ## General cases: | |
// ## - If x is negative and y is non-negative, the result is pi + atan(y/x) | |
// ## - If x is negative and y is negative, the result is -pi + atan(y/x) | |
// ## - Otherwise, the result is atan(y/x) | |
static inline qword _cp_fatan2_special_cases(qword y, qword x, qword result) { | |
const qword f_zero = (qword){0x00000000, 0x00000000, 0x00000000, 0x00000000}; // 0.0 | |
const qword f_pi = (qword)_cp_f_pi; | |
const qword f_npi = (qword)_cp_f_npi; | |
const qword f_inf = (qword){0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; // inf | |
const qword f_ninf = (qword){0xff800000, 0xff800000, 0xff800000, 0xff800000}; // -inf | |
const qword x_eqz_mask = si_fceq(f_zero, x); | |
const qword y_eqz_mask = si_fceq(f_zero, y); | |
const qword x_eqinf_mask = si_fceq(f_inf, x); | |
const qword x_eqninf_mask = si_fceq(f_ninf, x); | |
const qword y_eqinf_mask = si_fceq(f_inf, y); | |
const qword y_eqninf_mask = si_fceq(f_ninf, y); | |
const qword zero_by_zero = si_and(x_eqz_mask, y_eqz_mask); | |
const qword inf_by_inf = si_and(x_eqinf_mask, y_eqinf_mask); | |
const qword zero_by_any = si_and(x_eqz_mask, si_or(y_eqinf_mask, y_eqninf_mask)); | |
const qword any_by_zero = si_and(y_eqz_mask, si_or(x_eqinf_mask, x_eqninf_mask)); | |
const qword zero_by_zero_res = f_zero; | |
const qword inf_by_inf_res = si_selb(f_pi, f_npi, si_fcgts(y, f_zero)); | |
const qword zero_by_any_res = f_zero; | |
const qword any_by_zero_res = si_selb(f_pi, f_npi, si_fcgts(x, f_zero)); | |
result = si_selb(result, zero_by_zero_res, zero_by_zero); | |
result = si_selb(result, inf_by_inf_res, inf_by_inf); | |
result = si_selb(result, zero_by_any_res, zero_by_any); | |
result = si_selb(result, any_by_zero_res, any_by_zero); | |
return result; | |
} | |
// ## cp_fatan2(y, x) | |
// ## | |
// ## Function to compute the angle (in radians) between the positive x-axis and the | |
// ## point (x, y) in the Cartesian plane, using the atan2 function. | |
// ## | |
// ## Input: | |
// ## y: float or vector float representing the y-coordinate | |
// ## x: float or vector float representing the x-coordinate | |
// ## | |
// ## Output: | |
// ## The angle (theta) in radians between the positive x-axis and the point (x, y) | |
// ## in the range [-pi, pi]. | |
// ## | |
// ## Mathematical Description: | |
// ## The atan2 function is a variation of the arctangent function, which computes | |
// ## the angle (in radians) between the positive x-axis and the vector (x, y) in the | |
// ## Cartesian plane. It is defined as: | |
// ## | |
// ## atan2(y, x) = arctan(y/x) | |
// ## | |
// ## However, the standard arctangent function (arctan) has a range of [-pi/2, pi/2], | |
// ## which is not sufficient to represent all possible angles in the Cartesian plane. | |
// ## The atan2 function extends the range to [-pi, pi] by considering the signs of | |
// ## both x and y. | |
// ## | |
// ## The mathematical relationships governing atan2 are as follows: | |
// ## | |
// ## atan2(y, x) = arctan(y/x) (for x > 0) | |
// ## atan2(y, x) = arctan(y/x) + pi (for x < 0 and y >= 0) | |
// ## atan2(y, x) = arctan(y/x) - pi (for x < 0 and y < 0) | |
qword _cp_fatan2(qword y, qword x) { | |
const qword f_one = (qword){0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000}; // 1.0 | |
const qword f_zero = (qword){0x00000000, 0x00000000, 0x00000000, 0x00000000}; // 0.0 | |
const qword f_pi = (qword)_cp_f_pi; | |
const qword f_npi = (qword)_cp_f_npi; | |
// Compute the reciprocal of x | |
const qword x_r0 = si_frest(x); | |
const qword x_r1 = si_fnms(x_r0, x, f_one); | |
const qword x_r = si_fma(x_r1, x_r0, x_r0); | |
// Compute y / x | |
const qword yox = si_fm(y, x_r); | |
// Compute the arctangent of y / x using polynomial approximation | |
const qword atan_yox = _cp_fatan(yox); | |
// Determine the quadrant and adjust the result accordingly | |
const qword x_ltz_mask = si_fcgt(f_zero, x); | |
const qword y_ltz_mask = si_fcgt(f_zero, y); | |
const qword xy_ltz_mask = si_and(x_ltz_mask, y_ltz_mask); | |
const qword zadd0 = si_selb(f_zero, f_pi, x_ltz_mask); | |
const qword zadd = si_selb(zadd0, f_npi, xy_ltz_mask); | |
const qword result = si_fa(zadd, atan_yox); | |
// Handle special cases and return the final result | |
return _cp_fatan2_special_cases(y, x, result); | |
} | |
vector float cp_fatan2(vector float arg0 /* y */, vector float arg1 /* x */) { | |
const qword y = (qword)arg0; | |
const qword x = (qword)arg1; | |
const qword result = _cp_fatan2(y, x); | |
return (vector float)(result); | |
} | |
// cp_fatan2_scalar(arg0, arg1) | |
// | |
// Computes the atan2(y, x) function using _cp_fatan2 for single-precision float inputs. | |
// | |
// Input: | |
// arg0: The y-coordinate as a single-precision float. | |
// arg1: The x-coordinate as a single-precision float. | |
// | |
// Output: | |
// The angle (theta) in radians between the positive x-axis and the point (x, y), | |
// in the range [-pi, pi], as a single-precision float. | |
// | |
// This function is a scalar wrapper around _cp_fatan2, which computes atan2(y, x) using | |
// SIMD operations. The steps involved are: | |
// 1. Convert the input floats arg0 (y) and arg1 (x) to quadwords y and x using si_from_float. | |
// 2. Call _cp_fatan2(y, x) to compute the atan2(y, x) value, storing the result in z. | |
// 3. Convert the quadword z back to a float result using si_to_float. | |
// 4. Return the float result. | |
// | |
// This function provides a convenient way to compute atan2(y, x) for single-precision float | |
// inputs while leveraging the SIMD capabilities of the _cp_fatan2 function. | |
float cp_fatan2_scalar(float arg0 /* y */, float arg1 /* x */) { | |
const qword y = si_from_float(arg0); | |
const qword x = si_from_float(arg1); | |
const qword z = _cp_fatan2(y, x); | |
const float result = si_to_float(z); | |
return result; | |
} | |
// DMA transfer function | |
void dma_transfer(qword *buffer, uint32_t ea, int tag) { | |
mfc_get(buffer, ea, BUFFER_SIZE * sizeof(qword), tag, 0, 0); | |
mfc_write_tag_mask(1 << tag); | |
mfc_read_tag_status_all(); | |
} | |
// Double buffering for DMA transfers | |
void process_with_double_buffering(uint32_t ea_data, uint32_t ea_result, int num_elements) { | |
qword buffer0[BUFFER_SIZE] __attribute__((aligned(16))); | |
qword buffer1[BUFFER_SIZE] __attribute__((aligned(16))); | |
qword *current_buffer = buffer0; | |
qword *next_buffer = buffer1; | |
int current_tag = TAG; | |
int next_tag = TAG + 1; | |
// Initial DMA transfer | |
dma_transfer(current_buffer, ea_data, current_tag); | |
for (int i = 0; i < num_elements; i += BUFFER_SIZE) { | |
// Start next DMA transfer | |
if (i + BUFFER_SIZE < num_elements) { | |
dma_transfer(next_buffer, ea_data + (i + BUFFER_SIZE) * sizeof(qword), next_tag); | |
} | |
// Process current buffer | |
for (int j = 0; j < BUFFER_SIZE && (i + j) < num_elements; ++j) { | |
qword y = current_buffer[j * 2]; | |
qword x = current_buffer[j * 2 + 1]; | |
qword result = _cp_fatan2(y, x); | |
current_buffer[j] = result; | |
} | |
// Write results back | |
mfc_put(current_buffer, ea_result + i * sizeof(qword), BUFFER_SIZE * sizeof(qword), current_tag, 0, 0); | |
mfc_write_tag_mask(1 << current_tag); | |
mfc_read_tag_status_all(); | |
// Swap buffers and tags | |
qword *temp_buffer = current_buffer; | |
current_buffer = next_buffer; | |
next_buffer = temp_buffer; | |
int temp_tag = current_tag; | |
current_tag = next_tag; | |
next_tag = temp_tag; | |
} | |
} | |
#endif /* CBE_SPU_FATAN */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment