Triang3l/VECTOR_CONVERT_I2F.cc

## VECTOR_CONVERT_I2F.cc
// Emitter sequence for OPCODE_VECTOR_CONVERT_I2F: converts the packed 32-bit
// integers in src1 to packed single-precision floats in dest. The integers are
// treated as unsigned when the instruction has ARITHMETIC_UNSIGNED set and as
// signed otherwise.
struct VECTOR_CONVERT_I2F
    : Sequence<VECTOR_CONVERT_I2F,
               I<OPCODE_VECTOR_CONVERT_I2F, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    if (!(i.instr->flags & ARITHMETIC_UNSIGNED)) {
      // Signed input: a single packed conversion does the whole job.
      e.vcvtdq2ps(i.dest, i.src1);
      return;
    }

    // flags = ARITHMETIC_UNSIGNED from here on.
    // Both temporaries below are overwritten by this sequence.
    const auto& acc = e.xmm0;  // xmm0
    const auto& aux = e.xmm1;  // xmm1

#if 1
    // Elements >= 0x80000000 (negative when read as signed integers) are
    // converted by hand so that they are rounded to the nearest even with
    // 0.5 ULP precision - the only rounding mode on AltiVec.
    // TODO(Triang3l): Ignore the current rounding mode for the elements below
    // 0x80000000 as well (and ideally throughout all the AltiVec instructions).
    //
    // Such an element becomes 2.0f ^ 31 with a mantissa, or 2.0f ^ 32 when the
    // rounding carries out near the end of the range. One ULP of 2.0f ^ 31 is
    // bit 8 of the integer:
    // uint32_t(2.0f ^ 31 + ULP) == 0b10000000000000000000000100000000u,
    // so bits 0-7 are dropped and decide how bits 8 and up are rounded.
    //   Even ULP bit: ...0001111111 -> ...00
    //                 ...0010000000 -> ...00
    //                 ...0010000001 -> ...01
    //   Odd ULP bit:  ...0101111111 -> ...01
    //                 ...0110000000 -> ...10
    //                 ...0110000001 -> ...10
    // Adding 0b01111111 + ((src >> 8) & 1) to the integer before truncating
    // gives exactly this. Near UINT32_MAX that addition wraps around; in that
    // case the exponent must be one higher (the result is 2.0f ^ 32).

    // acc = (src >> 8) & 1: move bit 8 up to bit 31, then down to bit 0.
    e.vpslld(acc, i.src1, 31 - 8);
    e.vpsrld(acc, acc, 31);
    // acc = src + ((src >> 8) & 1) + 0b1111111
    e.vpaddd(acc, acc, i.src1);
    e.vpaddd(acc, acc, e.GetXmmConstPtr(XMMInt127));

    // For the >= 0x80000000 elements:
    // aux = all ones if the sum did not wrap (bit 31 still set), 0 if it did.
    e.vpsrad(aux, acc, 31);
    // acc = rounded mantissa in bits 0-22, with bit 23 (biased exponent 1) set
    // if the sum did not wrap, or 0 if it did.
    e.vpsrld(acc, acc, 8);
    // aux = -2 << 23 if not wrapped, 0 if wrapped.
    e.vpslld(aux, aux, 24);

    // acc = 2^33 plus the mantissa if not wrapped, 2^32 if wrapped.
    e.vpaddd(acc, acc, e.GetXmmConstPtr(XMM2To32));
    // acc = the unsigned integer as a float (for elements >= 0x80000000).
    e.vpaddd(acc, acc, aux);

    // aux = ordinary signed conversion, used for elements below 0x80000000.
    e.vcvtdq2ps(aux, i.src1);

    // Pick acc where the top bit of src is set and aux everywhere else.
    e.vblendvps(i.dest, aux, acc, i.src1);
#else
    // Alternative path; not compiled while the #if above is 1.

    // acc = mask of the elements that are non-negative as signed integers.
    e.vpcmpgtd(acc, i.src1, e.GetXmmConstPtr(XMMFFFF));

    // aux = src, with elements >= (unsigned)INT_MIN moved into [0, INT_MAX].
    e.vpsubd(aux, i.src1, e.GetXmmConstPtr(XMMSignMaskI32));
    e.vblendvps(aux, aux, i.src1, acc);

    // dest = aux converted to float; every element of aux is in [0, INT_MAX].
    e.vcvtdq2ps(i.dest, aux);

    // Add the offset back onto the elements that were moved down, restoring
    // the [INT_MIN, UINT_MAX] range.
    e.vpandn(acc, acc, e.GetXmmConstPtr(XMMPosIntMinPS));
    e.vaddps(i.dest, i.dest, acc);
#endif
  }
};
	// Emitter sequence for OPCODE_VECTOR_CONVERT_I2F: converts the packed 32-bit
	// integers in src1 to packed single-precision floats in dest. The integers
	// are treated as unsigned when the instruction has ARITHMETIC_UNSIGNED set
	// and as signed otherwise.
	// NOTE(review): a struct with this same name and body is already defined
	// earlier in this file; two definitions in one translation unit would not
	// compile - confirm which copy is meant to stay.
	struct VECTOR_CONVERT_I2F
	    : Sequence<VECTOR_CONVERT_I2F,
	               I<OPCODE_VECTOR_CONVERT_I2F, V128Op, V128Op>> {
	  static void Emit(X64Emitter& e, const EmitArgType& i) {
	    if (!(i.instr->flags & ARITHMETIC_UNSIGNED)) {
	      // Signed input: a single packed conversion does the whole job.
	      e.vcvtdq2ps(i.dest, i.src1);
	      return;
	    }

	    // flags = ARITHMETIC_UNSIGNED from here on.
	    // Both temporaries below are overwritten by this sequence.
	    const auto& acc = e.xmm0;  // xmm0
	    const auto& aux = e.xmm1;  // xmm1

	#if 1
	    // Elements >= 0x80000000 (negative when read as signed integers) are
	    // converted by hand so that they are rounded to the nearest even with
	    // 0.5 ULP precision - the only rounding mode on AltiVec.
	    // TODO(Triang3l): Ignore the current rounding mode for the elements
	    // below 0x80000000 as well (and ideally throughout all the AltiVec
	    // instructions).
	    //
	    // Such an element becomes 2.0f ^ 31 with a mantissa, or 2.0f ^ 32 when
	    // the rounding carries out near the end of the range. One ULP of
	    // 2.0f ^ 31 is bit 8 of the integer:
	    // uint32_t(2.0f ^ 31 + ULP) == 0b10000000000000000000000100000000u,
	    // so bits 0-7 are dropped and decide how bits 8 and up are rounded.
	    //   Even ULP bit: ...0001111111 -> ...00
	    //                 ...0010000000 -> ...00
	    //                 ...0010000001 -> ...01
	    //   Odd ULP bit:  ...0101111111 -> ...01
	    //                 ...0110000000 -> ...10
	    //                 ...0110000001 -> ...10
	    // Adding 0b01111111 + ((src >> 8) & 1) to the integer before truncating
	    // gives exactly this. Near UINT32_MAX that addition wraps around; in
	    // that case the exponent must be one higher (the result is 2.0f ^ 32).

	    // acc = (src >> 8) & 1: move bit 8 up to bit 31, then down to bit 0.
	    e.vpslld(acc, i.src1, 31 - 8);
	    e.vpsrld(acc, acc, 31);
	    // acc = src + ((src >> 8) & 1) + 0b1111111
	    e.vpaddd(acc, acc, i.src1);
	    e.vpaddd(acc, acc, e.GetXmmConstPtr(XMMInt127));

	    // For the >= 0x80000000 elements:
	    // aux = all ones if the sum did not wrap (bit 31 still set), 0 if it
	    // did.
	    e.vpsrad(aux, acc, 31);
	    // acc = rounded mantissa in bits 0-22, with bit 23 (biased exponent 1)
	    // set if the sum did not wrap, or 0 if it did.
	    e.vpsrld(acc, acc, 8);
	    // aux = -2 << 23 if not wrapped, 0 if wrapped.
	    e.vpslld(aux, aux, 24);

	    // acc = 2^33 plus the mantissa if not wrapped, 2^32 if wrapped.
	    e.vpaddd(acc, acc, e.GetXmmConstPtr(XMM2To32));
	    // acc = the unsigned integer as a float (for elements >= 0x80000000).
	    e.vpaddd(acc, acc, aux);

	    // aux = ordinary signed conversion, used for elements below 0x80000000.
	    e.vcvtdq2ps(aux, i.src1);

	    // Pick acc where the top bit of src is set and aux everywhere else.
	    e.vblendvps(i.dest, aux, acc, i.src1);
	#else
	    // Alternative path; not compiled while the #if above is 1.

	    // acc = mask of the elements that are non-negative as signed integers.
	    e.vpcmpgtd(acc, i.src1, e.GetXmmConstPtr(XMMFFFF));

	    // aux = src, with elements >= (unsigned)INT_MIN moved into [0, INT_MAX].
	    e.vpsubd(aux, i.src1, e.GetXmmConstPtr(XMMSignMaskI32));
	    e.vblendvps(aux, aux, i.src1, acc);

	    // dest = aux converted to float; every element of aux is in
	    // [0, INT_MAX].
	    e.vcvtdq2ps(i.dest, aux);

	    // Add the offset back onto the elements that were moved down, restoring
	    // the [INT_MIN, UINT_MAX] range.
	    e.vpandn(acc, acc, e.GetXmmConstPtr(XMMPosIntMinPS));
	    e.vaddps(i.dest, i.dest, acc);
	#endif
	  }
	};