/intel-intrinsics-3.3.15.xml

## intel-intrinsics-3.3.15.xml
<intrinsics_list version='3.3.15' date='09/16/2016'>
<intrinsic tech='MMX' rettype='__m64' name='_m_from_int64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__int64'/>
	<description>Copy 64-bit integer "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movq' form='mm, r64'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__int64' name='_m_to_int64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64'/>
	<description>Copy 64-bit integer "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movq' form='r64, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='void' name='_m_empty'>
	<CPUID>MMX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures.</description>
	<instruction name='emms' form=''/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_from_int'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='int'/>
	<description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper element of "dst".</description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := 0
	</operation>
	<instruction name='movd' form='mm, r32'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='int' name='_m_to_int'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64'/>
	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
	<operation>
dst[31:0] := a[31:0]
	</operation>
	<instruction name='movd' form='r32, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_packsswb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
	</description>
	<operation>
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (b[15:0])
dst[47:40] := Saturate_Int16_To_Int8 (b[31:16])
dst[55:48] := Saturate_Int16_To_Int8 (b[47:32])
dst[63:56] := Saturate_Int16_To_Int8 (b[63:48])
	</operation>
	<instruction name='packsswb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_packssdw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
	<operation>
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (b[31:0])
dst[63:48] := Saturate_Int32_To_Int16 (b[63:32])
	</operation>
	<instruction name='packssdw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_packuswb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
	<operation>
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
	</operation>
	<instruction name='packuswb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_punpckhbw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]){
	dst[7:0] := src1[39:32]
	dst[15:8] := src2[39:32]
	dst[23:16] := src1[47:40]
	dst[31:24] := src2[47:40]
	dst[39:32] := src1[55:48]
	dst[47:40] := src2[55:48]
	dst[55:48] := src1[63:56]
	dst[63:56] := src2[63:56]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0])
	</operation>
	<instruction name='punpckhbw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_punpckhwd'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]){
	dst[15:0] := src1[47:32]
	dst[31:16] := src2[47:32]
	dst[47:32] := src1[63:48]
	dst[63:48] := src2[63:48]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0])
	</operation>
	<instruction name='punpckhwd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_punpckhdq'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
	<operation>
dst[31:0] := a[63:32]
dst[63:32] := b[63:32]
	</operation>
	<instruction name='punpckhdq' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_punpcklbw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_BYTES(src1[63:0], src2[63:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0])
	</operation>
	<instruction name='punpcklbw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_punpcklwd'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_WORDS(src1[63:0], src2[63:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0])
	</operation>
	<instruction name='punpcklwd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_punpckldq'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := b[31:0]
	</operation>
	<instruction name='punpckldq' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
	</operation>
	<instruction name='paddb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
	</operation>
	<instruction name='paddw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddd'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
	</operation>
	<instruction name='paddd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddsb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='paddsb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddsw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
	</operation>
	<instruction name='paddsw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddusb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='paddusb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_paddusw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
	</operation>
	<instruction name='paddusw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubb'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
	</operation>
	<instruction name='psubb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubw'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
	</operation>
	<instruction name='psubw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubd'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
	</operation>
	<instruction name='psubd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubsb'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
	</operation>
	<instruction name='psubsb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubsw'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
	</operation>
	<instruction name='psubsw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubusb'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
	</operation>
	<instruction name='psubusb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psubusw'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
	</operation>
	<instruction name='psubusw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pmaddwd'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
	</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
	</operation>
	<instruction name='pmaddwd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pmulhw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
	</operation>
	<instruction name='pmulhw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pmullw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[15:0]
ENDFOR
	</operation>
	<instruction name='pmullw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psllw'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psllwi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllw' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pslld'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='pslld' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pslldi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='pslld' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psllq'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF count[63:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; count[63:0])
FI
	</operation>
	<instruction name='psllq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psllqi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF imm8[7:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; imm8[7:0])
FI
	</operation>
	<instruction name='psllq' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psraw'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := SignBit
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psraw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrawi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := SignBit
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psraw' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrad'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrad' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psradi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrad' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrlw'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrlwi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlw' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrld'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrld' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrldi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrld' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrlq'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF count[63:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; count[63:0])
FI
	</operation>
	<instruction name='psrlq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_psrlqi'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF imm8[7:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; imm8[7:0])
FI
	</operation>
	<instruction name='psrlq' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pand'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[63:0] AND b[63:0])
	</operation>
	<instruction name='pand' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pandn'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := ((NOT a[63:0]) AND b[63:0])
	</operation>
	<instruction name='pandn' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_por'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[63:0] OR b[63:0])
	</operation>
	<instruction name='por' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pxor'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[63:0] XOR b[63:0])
	</operation>
	<instruction name='pxor' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpeqb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpeqw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpeqd'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpgtb'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpgtw'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_m_pcmpgtd'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='void' name='_mm_empty'>
	<CPUID>MMX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures.</description>
	<instruction name='emms' form=''/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_add_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
	</operation>
	<instruction name='paddb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_add_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
	</operation>
	<instruction name='paddw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_add_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
	</operation>
	<instruction name='paddd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='paddsb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
	</operation>
	<instruction name='paddsw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pu8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='paddusb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_adds_pu16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
	</operation>
	<instruction name='paddusw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sub_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
	</operation>
	<instruction name='psubb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sub_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
	</operation>
	<instruction name='psubw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sub_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
	</operation>
	<instruction name='psubd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
	</operation>
	<instruction name='psubsb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
	</operation>
	<instruction name='psubsw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pu8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
	</operation>
	<instruction name='psubusb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_subs_pu16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
	</operation>
	<instruction name='psubusw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_madd_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
	</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
	</operation>
	<instruction name='pmaddwd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_mulhi_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
	</operation>
	<instruction name='pmulhw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_mullo_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[15:0]
ENDFOR
	</operation>
	<instruction name='pmullw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sll_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_slli_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllw' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sll_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='pslld' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_slli_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='pslld' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sll_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF count[63:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; count[63:0])
FI
	</operation>
	<instruction name='psllq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_slli_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF imm8[7:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &lt;&lt; imm8[7:0])
FI
	</operation>
	<instruction name='psllq' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sra_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0)
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psraw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srai_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0)
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psraw' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_sra_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrad' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srai_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrad' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srl_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srli_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlw' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srl_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrld' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srli_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrld' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srl_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='count' type='__m64'/>
	<description>Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF count[63:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; count[63:0])
FI
	</operation>
	<instruction name='psrlq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_srli_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". </description>
	<operation>
IF imm8[7:0] &gt; 63
	dst[63:0] := 0
ELSE
	dst[63:0] := ZeroExtend(a[63:0] &gt;&gt; imm8[7:0])
FI
	</operation>
	<instruction name='psrlq' form='mm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_and_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[63:0] AND b[63:0])
	</operation>
	<instruction name='pand' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_andnot_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := ((NOT a[63:0]) AND b[63:0])
	</operation>
	<instruction name='pandn' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_or_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[63:0] OR b[63:0])
	</operation>
	<instruction name='por' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_xor_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[63:0] XOR b[63:0])
	</operation>
	<instruction name='pxor' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpeq_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpeq_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpeq_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpgt_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpgt_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cmpgt_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cvtsi32_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='int'/>
	<description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper element of "dst".</description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := 0
	</operation>
	<instruction name='movd' form='mm, r32'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='int' name='_mm_cvtsi64_si32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64'/>
	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
	<operation>
dst[31:0] := a[31:0]
	</operation>
	<instruction name='movd' form='r32, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__int64' name='_mm_cvtm64_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64'/>
	<description>Copy 64-bit integer "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movq' form='r64, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_cvtsi64_m64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__int64'/>
	<description>Copy 64-bit integer "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movq' form='mm, r64'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_setzero_si64'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m64 with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='pxor' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='int'/>
	<parameter varname='e0' type='int'/>
	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[31:0] := e0
dst[63:32] := e1
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='short'/>
	<parameter varname='e2' type='short'/>
	<parameter varname='e1' type='short'/>
	<parameter varname='e0' type='short'/>
	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[15:0] := e0
dst[31:16] := e1
dst[47:32] := e2
dst[63:48] := e3
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='char'/>
	<parameter varname='e6' type='char'/>
	<parameter varname='e5' type='char'/>
	<parameter varname='e4' type='char'/>
	<parameter varname='e3' type='char'/>
	<parameter varname='e2' type='char'/>
	<parameter varname='e1' type='char'/>
	<parameter varname='e0' type='char'/>
	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[7:0] := e0
dst[15:8] := e1
dst[23:16] := e2
dst[31:24] := e3
dst[39:32] := e4
dst[47:40] := e5
dst[55:48] := e6
dst[63:56] := e7
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set1_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='int'/>
	<description>Broadcast 32-bit integer "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set1_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='short'/>
	<description>Broadcast 16-bit integer "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_set1_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='char'/>
	<description>Broadcast 8-bit integer "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_setr_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='int'/>
	<parameter varname='e0' type='int'/>
	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[31:0] := e1
dst[63:32] := e0
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_setr_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='short'/>
	<parameter varname='e2' type='short'/>
	<parameter varname='e1' type='short'/>
	<parameter varname='e0' type='short'/>
	<description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[15:0] := e3
dst[31:16] := e2
dst[47:32] := e1
dst[63:48] := e0
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' sequence='true' rettype='__m64' name='_mm_setr_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='char'/>
	<parameter varname='e6' type='char'/>
	<parameter varname='e5' type='char'/>
	<parameter varname='e4' type='char'/>
	<parameter varname='e3' type='char'/>
	<parameter varname='e2' type='char'/>
	<parameter varname='e1' type='char'/>
	<parameter varname='e0' type='char'/>
	<description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[7:0] := e7
dst[15:8] := e6
dst[23:16] := e5
dst[31:24] := e4
dst[39:32] := e3
dst[47:40] := e2
dst[55:48] := e1
dst[63:56] := e0
	</operation>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_packs_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".</description>
	<operation>
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (b[15:0])
dst[47:40] := Saturate_Int16_To_Int8 (b[31:16])
dst[55:48] := Saturate_Int16_To_Int8 (b[47:32])
dst[63:56] := Saturate_Int16_To_Int8 (b[63:48])
	</operation>
	<instruction name='packsswb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_packs_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
	<operation>
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (b[31:0])
dst[63:48] := Saturate_Int32_To_Int16 (b[63:32])
	</operation>
	<instruction name='packssdw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_packs_pu16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
	<operation>
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
	</operation>
	<instruction name='packuswb' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_unpackhi_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]){
	dst[7:0] := src1[39:32]
	dst[15:8] := src2[39:32]
	dst[23:16] := src1[47:40]
	dst[31:24] := src2[47:40]
	dst[39:32] := src1[55:48]
	dst[47:40] := src2[55:48]
	dst[55:48] := src1[63:56]
	dst[63:56] := src2[63:56]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0])
	</operation>
	<instruction name='punpckhbw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_unpackhi_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]){
	dst[15:0] := src1[47:32]
	dst[31:16] := src2[47:32]
	dst[47:32] := src1[63:48]
	dst[63:48] := src2[63:48]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0])
	</operation>
	<instruction name='punpckhwd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_unpackhi_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".</description>
	<operation>
dst[31:0] := a[63:32]
dst[63:32] := b[63:32]
	</operation>
	<instruction name='punpckhdq' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_unpacklo_pi8'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_BYTES(src1[63:0], src2[63:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0])
	</operation>
	<instruction name='punpcklbw' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_unpacklo_pi16'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_WORDS(src1[63:0], src2[63:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	RETURN dst[63:0]
}

dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0])
	</operation>
	<instruction name='punpcklwd' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>
<intrinsic tech='MMX' rettype='__m64' name='_mm_unpacklo_pi32'>
	<type>Integer</type>
	<CPUID>MMX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := b[31:0]
	</operation>
	<instruction name='punpckldq' form='mm, mm'/>
	<header>mmintrin.h</header>
</intrinsic>

<intrinsic tech='SSE' sequence='true' rettype='' name='_MM_TRANSPOSE4_PS'>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='row0' type='__m128' />
	<parameter varname='row1' type='__m128' />
	<parameter varname='row2' type='__m128' />
	<parameter varname='row3' type='__m128' />
	<description>Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.).</description>
	<operation>
__m128 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm_unpacklo_ps(row0, row1);
tmp2 = _mm_unpacklo_ps(row2, row3);
tmp1 = _mm_unpackhi_ps(row0, row1);
tmp3 = _mm_unpackhi_ps(row2, row3);
row0 = _mm_movelh_ps(tmp0, tmp2);
row1 = _mm_movehl_ps(tmp2, tmp0);
row2 = _mm_movelh_ps(tmp1, tmp3);
row3 = _mm_movehl_ps(tmp3, tmp1);
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='unsigned int' name='_mm_getcsr'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void' />
	<description>Get the unsigned 32-bit value of the MXCSR control and status register.</description>
	<operation>
dst[31:0] := MXCSR
	</operation>
	<instruction name='stmxcsr' form='MEMd'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_mm_setcsr'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='a' type='unsigned int' />
	<description>Set the MXCSR control and status register with the value in unsigned 32-bit integer "a".</description>
	<operation>
MXCSR := a[31:0]
	</operation>
	<instruction name='ldmxcsr' form='MEMd'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_EXCEPTION_STATE'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<description>Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT</description>
	<operation>
dst[31:0] := MXCSR &amp; _MM_EXCEPT_MASK
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_MM_SET_EXCEPTION_STATE'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='a' type='unsigned int' />
	<description>Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT</description>
	<operation>
MXCSR := (MXCSR AND ~_MM_EXCEPT_MASK) OR a[31:0]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_EXCEPTION_MASK'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<description>Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT</description>
	<operation>
dst[31:0] := MXCSR &amp; _MM_MASK_MASK
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_MM_SET_EXCEPTION_MASK'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='a' type='unsigned int' />
	<description>Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT</description>
	<operation>
MXCSR := (MXCSR AND ~_MM_MASK_MASK) OR a[31:0]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_ROUNDING_MODE'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<description>Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO</description>
	<operation>
dst[31:0] := MXCSR &amp; _MM_ROUND_MASK
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_MM_SET_ROUNDING_MODE'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='a' type='unsigned int' />
	<description>Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO</description>
	<operation>
MXCSR := (MXCSR AND ~_MM_ROUND_MASK) OR a[31:0]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='unsigned int' name='_MM_GET_FLUSH_ZERO_MODE'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<description>Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF</description>
	<operation>
dst[31:0] := MXCSR &amp; _MM_FLUSH_MASK
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_MM_SET_FLUSH_ZERO_MODE'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='a' type='unsigned int' />
	<description>Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF</description>
	<operation>
MXCSR := (MXCSR AND ~_MM_FLUSH_MASK) OR a[31:0]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_mm_prefetch'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='p' type='char const*' />
	<parameter varname='i' type='int' />
	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
	<instruction name='prefetchnta' form='mprefetch'/>
	<instruction name='prefetcht0' form='mprefetch'/>
	<instruction name='prefetcht1' form='mprefetch'/>
	<instruction name='prefetcht2' form='mprefetch'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech="KNC" rettype="void" name="_mm_prefetch">
	<CPUID>KNCNI</CPUID>
	<category>General Support</category>
	<parameter varname='p' type='char const*' />
	<parameter varname='i' type='int' />
	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
	<instruction name='vprefetch0' form='mprefetch' xed=''/>
	<instruction name='vprefetch1' form='mprefetch' xed=''/>
	<instruction name='vprefetch2' form='mprefetch' xed=''/>
	<instruction name='vprefetchnta' form='mprefetch' xed=''/>
	<instruction name='vprefetche0' form='mprefetch' xed=''/>
	<instruction name='vprefetche1' form='mprefetch' xed=''/>
	<instruction name='vprefetche2' form='mprefetch' xed=''/>
	<instruction name='vprefetchenta' form='mprefetch' xed=''/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech="Other" rettype="void" name="_mm_prefetch">
	<CPUID>PREFETCHWT1</CPUID>
	<category>General Support</category>
	<parameter varname='p' type='char const*' />
	<parameter varname='i' type='int' />
	<description>Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i".</description>
	<instruction name='prefetchwt1' form='mprefetch' xed=''/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_mm_sfence'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void' />
	<description>Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order.</description>
	<instruction name='sfence' form=''/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_max_pi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] &gt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxsw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pmaxsw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] &gt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxsw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_max_pu8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] &gt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxub' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pmaxub'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] &gt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxub' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_min_pi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] &lt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminsw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pminsw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] &lt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminsw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_min_pu8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] &lt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminub' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pminub'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] &lt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminub' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_mulhi_pu16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
	</operation>
	<instruction name='pmulhuw' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pmulhuw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
	</operation>
	<instruction name='pmulhuw' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_avg_pu8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
ENDFOR
	</operation>
	<instruction name='pavgb' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pavgb'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
ENDFOR
	</operation>
	<instruction name='pavgb' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_avg_pu16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
ENDFOR
	</operation>
	<instruction name='pavgw' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pavgw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
ENDFOR
	</operation>
	<instruction name='pavgw' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_sad_pu8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, store it in the low 16 bits of "dst", and zero the upper 48 bits of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR

dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0
	</operation>
	<instruction name='psadbw' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_psadbw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, store it in the low 16 bits of "dst", and zero the upper 48 bits of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR

dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0
	</operation>
	<instruction name='psadbw' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cvtsi32_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='int' />
	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cvtsi2ss' form='xmm, r32'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cvt_si2ss'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='int' />
	<description>Convert the 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cvtsi2ss' form='xmm, r32'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m128' name='_mm_cvtsi64_ss'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__int64' />
	<description>Convert the 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='cvtsi2ss' form='xmm, r64'/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='5'/>
	<perfdata arch='Sandy Bridge' lat='5'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m128' name='_mm_cvtpi32_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m64' />
	<description>Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". </description>
	<operation>
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
	</operation>
	<instruction name='cvtpi2ps' form='xmm, mm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m128' name='_mm_cvt_pi2ps'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m64' />
	<description>Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". </description>
	<operation>
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
	</operation>
	<instruction name='cvtpi2ps' form='xmm, mm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpi16_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64' />
	<description>Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	m := j*32
	dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpu16_ps'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64' />
	<description>Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	m := j*32
	dst[m+31:m] := Convert_UnsignedInt16_To_FP32(a[i+15:i])
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpi8_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64' />
	<description>Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*8
	m := j*32
	dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpu8_ps'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64' />
	<description>Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*8
	m := j*32
	dst[m+31:m] := Convert_UnsignedInt8_To_FP32(a[i+7:i])
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_cvtpi32x2_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='b' type='__m64' />
	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then convert the packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of "dst".</description>
	<operation>
dst[31:0] := Convert_Int32_To_FP32(a[31:0])
dst[63:32] := Convert_Int32_To_FP32(a[63:32])
dst[95:64] := Convert_Int32_To_FP32(b[31:0])
dst[127:96] := Convert_Int32_To_FP32(b[63:32])
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_mm_stream_pi'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m64*' />
	<parameter varname='a' type='__m64' />
	<description>Store 64-bits of integer data from "a" into memory using a non-temporal memory hint.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
	</operation>
	<instruction name='movntq' form='m64, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_mm_maskmove_si64'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='mask' type='__m64' />
	<parameter varname='mem_addr' type='char*' />
	<description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='maskmovq' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='void' name='_m_maskmovq'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='mask' type='__m64' />
	<parameter varname='mem_addr' type='char*' />
	<description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='maskmovq' form='mm, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_extract_pi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64' />
	<parameter varname="imm8" type='int' />
	<description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
	<operation>
dst[15:0] := (a[63:0] &gt;&gt; (imm8[1:0] * 16))[15:0]
dst[31:16] := 0
	</operation>
	<instruction name='pextrw' form='r32, mm, imm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_m_pextrw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64' />
	<parameter varname="imm8" type='int' />
	<description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
	<operation>
dst[15:0] := (a[63:0] &gt;&gt; (imm8[1:0] * 16))[15:0]
dst[31:16] := 0
	</operation>
	<instruction name='pextrw' form='r32, mm, imm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_insert_pi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='i' type='int' />
	<parameter varname="imm8" type='int' />
	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". </description>
	<operation>
dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]
	</operation>
	<instruction name='pinsrw' form='mm, r32, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pinsrw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64' />
	<parameter varname='i' type='int' />
	<parameter varname="imm8" type='int' />
	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". </description>
	<operation>
dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]
	</operation>
	<instruction name='pinsrw' form='mm, r32, imm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_movemask_pi8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64' />
	<description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0
	</operation>
	<instruction name='pmovmskb' form='r32, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_m_pmovmskb'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64' />
	<description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0
	</operation>
	<instruction name='pmovmskb' form='r32, mm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_shuffle_pi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64' />
	<parameter varname="imm8" type='int' />
	<description>Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[15:0] := src[15:0]
	1:	tmp[15:0] := src[31:16]
	2:	tmp[15:0] := src[47:32]
	3:	tmp[15:0] := src[63:48]
	ESAC
	RETURN tmp[15:0]
}

dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])
	</operation>
	<instruction name='pshufw' form='mm, mm, imm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_m_pshufw'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64' />
	<parameter varname="imm8" type='int' />
	<description>Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[15:0] := src[15:0]
	1:	tmp[15:0] := src[31:16]
	2:	tmp[15:0] := src[47:32]
	3:	tmp[15:0] := src[63:48]
	ESAC
	RETURN tmp[15:0]
}

dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])
	</operation>
	<instruction name='pshufw' form='mm, mm, imm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_add_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
	<operation>
dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
	</operation>
	<instruction name='addss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_add_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
	</operation>
	<instruction name='addps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sub_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]
	</operation>
	<instruction name='subss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sub_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
	</operation>
	<instruction name='subps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_mul_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]
	</operation>
	<instruction name='mulss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_mul_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
	</operation>
	<instruction name='mulps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_div_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". </description>
	<operation>
dst[31:0] := a[31:0] / b[31:0]
dst[127:32] := a[127:32]
	</operation>
	<instruction name='divss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='13' tpt='5'/>
	<perfdata arch='Ivy Bridge' lat='13' tpt='6'/>
	<perfdata arch='Sandy Bridge' lat='14' tpt='14'/>
	<perfdata arch='Westmere' lat='14' tpt='12'/>
	<perfdata arch='Nehalem' lat='14' tpt='12'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_div_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
	</operation>
	<instruction name='divps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='13' tpt='5'/>
	<perfdata arch='Ivy Bridge' lat='13' tpt='6'/>
	<perfdata arch='Sandy Bridge' lat='14' tpt='14'/>
	<perfdata arch='Westmere' lat='14' tpt='12'/>
	<perfdata arch='Nehalem' lat='14' tpt='12'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sqrt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128' />
	<description>Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := SQRT(a[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='sqrtss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='18' tpt='7'/>
	<perfdata arch='Ivy Bridge' lat='20' tpt='7'/>
	<perfdata arch='Sandy Bridge' lat='22' tpt='14'/>
	<perfdata arch='Westmere' lat='25' tpt='16'/>
	<perfdata arch='Nehalem' lat='25' tpt='16'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_sqrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128' />
	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='sqrtps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='18' tpt='7'/>
	<perfdata arch='Ivy Bridge' lat='20' tpt='7'/>
	<perfdata arch='Sandy Bridge' lat='22' tpt='14'/>
	<perfdata arch='Westmere' lat='25' tpt='16'/>
	<perfdata arch='Nehalem' lat='25' tpt='16'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rcp_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128' />
	<description>Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
	<operation>
dst[31:0] := APPROXIMATE(1.0/a[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='rcpss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='3'/>
	<perfdata arch='Nehalem' lat='3' tpt='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rcp_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128' />
	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
	</operation>
	<instruction name='rcpps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='2'/>
	<perfdata arch='Nehalem' lat='3' tpt='2'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rsqrt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128' />
	<description>Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
	<operation>
dst[31:0] := APPROXIMATE(1.0 / SQRT(a[31:0]))
dst[127:32] := a[127:32]
	</operation>
	<instruction name='rsqrtss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='3'/>
	<perfdata arch='Nehalem' lat='3' tpt='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_rsqrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128' />
	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
	</operation>
	<instruction name='rsqrtps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='2'/>
	<perfdata arch='Nehalem' lat='3' tpt='2'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_min_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := MIN(a[31:0], b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='minss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_min_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
	</operation>
	<instruction name='minps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_max_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := MAX(a[31:0], b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='maxss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_max_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
	</operation>
	<instruction name='maxps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_and_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
	</operation>
	<instruction name='andps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_andnot_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
	</operation>
	<instruction name='andnps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_or_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ENDFOR
	</operation>
	<instruction name='orps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_xor_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
	</operation>
	<instruction name='xorps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpeq_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpeq_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmplt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] &lt; b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmplt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &lt; b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmple_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] &lt;= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmple_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &lt;= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpgt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] &gt; b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpgt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpge_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] &gt;= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpge_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &gt;= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpneq_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpneq_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnlt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := !( a[31:0] &lt; b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnlt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] &lt; b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnle_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := !( a[31:0] &lt;= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnle_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] &lt;= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpngt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := !( a[31:0] &gt; b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpngt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] &gt; b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnge_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := !( a[31:0] &gt;= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpnge_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] &gt;= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpord_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpord_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpunord_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
	</operation>
	<instruction name='cmpss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_cmpunord_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xffffffff : 0
ENDFOR
	</operation>
	<instruction name='cmpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_comieq_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='comiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_comilt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[31:0] &lt; b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='comiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_comile_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[31:0] &lt;= b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='comiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_comigt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[31:0] &gt; b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='comiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_comige_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[31:0] &gt;= b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='comiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='int' name='_mm_comineq_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='comiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomieq_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomilt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[31:0] &lt; b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomile_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[31:0] &lt;= b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomigt_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[31:0] &gt; b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomige_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[31:0] &gt;= b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomineq_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomiss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtss_si32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
	</operation>
	<instruction name='cvtss2si' form='r32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvt_ss2si'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
	</operation>
	<instruction name='cvtss2si' form='r32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__int64' name='_mm_cvtss_si64'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
	<operation>
dst[63:0] := Convert_FP32_To_Int64(a[31:0])
	</operation>
	<instruction name='cvtss2si' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='float' name='_mm_cvtss_f32'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Copy the lower single-precision (32-bit) floating-point element of "a" to "dst".</description>
	<operation>dst[31:0] := a[31:0]</operation>
	<instruction name='movss' form='m32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_cvtps_pi32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvtps2pi' form='mm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m64' name='_mm_cvt_ps2pi'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvtps2pi' form='mm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvttss_si32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
	</operation>
	<instruction name='cvttss2si' form='r32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtt_ss2si'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
	</operation>
	<instruction name='cvttss2si' form='r32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__int64' name='_mm_cvttss_si64'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
	<operation>
dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
	</operation>
	<instruction name='cvttss2si' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_cvttps_pi32: truncating convert of the two low 32-bit floats of "a" (bits 63:0) into two packed 32-bit integers in the __m64 result. -->
<intrinsic tech='SSE' rettype='__m64' name='_mm_cvttps_pi32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvttps2pi' form='mm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_cvtt_ps2pi: same description, operation, instruction and perfdata as the _mm_cvttps_pi32 entry.
     NOTE(review): lists only "Floating Point" as a type while _mm_cvttps_pi32 also lists "Integer"; confirm this is intended. -->
<intrinsic tech='SSE' rettype='__m64' name='_mm_cvtt_ps2pi'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvttps2pi' form='mm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_cvtps_pi16: narrows all four 32-bit floats of "a" to four 16-bit integers packed into the 64-bit result.
     Marked sequence='true' and carries no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='__m64' name='_mm_cvtps_pi16'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 16*j
	k := 32*j
	dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_cvtps_pi8: narrows the four 32-bit floats of "a" to four 8-bit integers written to dst[31:0].
     The operation does not assign dst[63:32]. Marked sequence='true' and carries no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='__m64' name='_mm_cvtps_pi8'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128' />
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in lower 4 elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 8*j
	k := 32*j
	dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_set_ss: places scalar "a" in bits 31:0 of dst and zeroes bits 127:32.
     Marked sequence='true' and carries no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Set</category>
	<parameter varname='a' type='float' />
	<description>Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements.</description>
	<operation>
dst[31:0] := a[31:0]
dst[127:32] := 0
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_set1_ps: broadcasts scalar "a" into all four 32-bit lanes of dst.
     Description and operation are identical to the _mm_set_ps1 entry. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set1_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Set</category>
	<parameter varname='a' type='float' />
	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_set_ps1: broadcasts scalar "a" into all four 32-bit lanes of dst.
     Description and operation are identical to the _mm_set1_ps entry. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set_ps1'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Set</category>
	<parameter varname='a' type='float' />
	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_set_ps: parameters are listed highest element first; e3 fills bits 127:96 and e0 fills bits 31:0. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_set_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='float'/>
	<parameter varname='e2' type='float'/>
	<parameter varname='e1' type='float'/>
	<parameter varname='e0' type='float'/>
	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
	<operation>
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_setr_ps: reverse-order counterpart of _mm_set_ps. The parameter names are kept as e3..e0,
     so the first parameter (e3) fills bits 31:0 and the last (e0) fills bits 127:96. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_setr_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='float'/>
	<parameter varname='e2' type='float'/>
	<parameter varname='e1' type='float'/>
	<parameter varname='e0' type='float'/>
	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[31:0] := e3
dst[63:32] := e2
dst[95:64] := e1
dst[127:96] := e0
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_setzero_ps: takes no arguments (a void parameter with an empty varname) and clears every bit of dst; the listed instruction is xorps. -->
<intrinsic tech='SSE' rettype='__m128' name='_mm_setzero_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Set</category>
	<parameter varname='' type='void' />
	<description>Return vector of type __m128 with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='xorps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_loadh_pi: bits 127:64 of dst come from the 64 bits at mem_addr; bits 63:0 are copied from "a".
     NOTE(review): type is "Integer" although the description speaks of single-precision elements; confirm this is intended. -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_loadh_pi'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='mem_addr' type='__m64 const*' />
	<description>Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+63:mem_addr+32]
	</operation>
	<instruction name='movhps' form='xmm, m64'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_loadl_pi: bits 63:0 of dst come from the 64 bits at mem_addr; bits 127:64 are copied from "a".
     NOTE(review): type is "Integer" although the description speaks of single-precision elements; confirm this is intended. -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_loadl_pi'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='mem_addr' type='__m64 const*' />
	<description>Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+63:mem_addr+32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
	</operation>
	<instruction name='movlps' form='xmm, m64'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_load_ss: loads one 32-bit float from mem_addr into bits 31:0 of dst and zeroes bits 127:32 (movss xmm, m32). -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_load_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const*' />
	<description>Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[127:32] := 0
	</operation>
	<instruction name='movss' form='xmm, m32'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_load1_ps: replicates the 32-bit value read from mem_addr into all four lanes of dst.
     Description and operation are identical to the _mm_load_ps1 entry; sequence='true', no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_load1_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const*' />
	<description>Load a single-precision (32-bit) floating-point element from memory into all elements of "dst".</description>
	<operation>
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+31:mem_addr]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+31:mem_addr]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_load_ps1: replicates the 32-bit value read from mem_addr into all four lanes of dst.
     Description and operation are identical to the _mm_load1_ps entry; sequence='true', no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_load_ps1'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const*' />
	<description>Load a single-precision (32-bit) floating-point element from memory into all elements of "dst".</description>
	<operation>
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+31:mem_addr]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+31:mem_addr]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_load_ps: aligned 128-bit load (movaps xmm, m128). The unaligned counterpart in this file is _mm_loadu_ps. -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_load_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const*' />
	<description>Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst".
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movaps' form='xmm, m128'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_loadu_ps: unaligned 128-bit load (movups xmm, m128). Same operation text as _mm_load_ps; only the alignment requirement and instruction differ. -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_loadu_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const*' />
	<description>Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst".
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movups' form='xmm, m128'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_loadr_ps: loads four 32-bit floats and reverses their order, so the value at mem_addr lands in bits 127:96 of dst.
     Marked sequence='true' and carries no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='__m128' name='_mm_loadr_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const*' />
	<description>Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[31:0] := MEM[mem_addr+127:mem_addr+96]
dst[63:32] := MEM[mem_addr+95:mem_addr+64]
dst[95:64] := MEM[mem_addr+63:mem_addr+32]
dst[127:96] := MEM[mem_addr+31:mem_addr]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_stream_ps: aligned 128-bit store of "a" to mem_addr with a non-temporal hint (movntps m128, xmm). -->
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_stream_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movntps' form='m128, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_storeh_pi: writes bits 127:64 of "a" to the 64 bits at mem_addr (movhps m64, xmm).
     NOTE(review): type is "Integer" although the description speaks of single-precision elements; confirm this is intended. -->
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeh_pi'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m64*' />
	<parameter varname='a' type='__m128' />
	<description>Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[95:64]
MEM[mem_addr+63:mem_addr+32] := a[127:96]
	</operation>
	<instruction name='movhps' form='m64, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_storel_pi: writes bits 63:0 of "a" to the 64 bits at mem_addr (movlps m64, xmm).
     NOTE(review): type is "Integer" although the description speaks of single-precision elements; confirm this is intended. -->
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storel_pi'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m64*' />
	<parameter varname='a' type='__m128' />
	<description>Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[63:32]
	</operation>
	<instruction name='movlps' form='m64, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_store_ss: writes only bits 31:0 of "a" to the 32 bits at mem_addr (movss m32, xmm). -->
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[31:0]
	</operation>
	<instruction name='movss' form='m32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_store1_ps: writes bits 31:0 of "a" to four consecutive 32-bit slots starting at mem_addr.
     Description and operation are identical to the _mm_store_ps1 entry; sequence='true', no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='void' name='_mm_store1_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_store_ps1: writes bits 31:0 of "a" to four consecutive 32-bit slots starting at mem_addr.
     Description and operation are identical to the _mm_store1_ps entry; sequence='true', no instruction element. -->
<intrinsic tech='SSE' sequence='true' rettype='void' name='_mm_store_ps1'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]
	</operation>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_store_ps: aligned 128-bit store of "a" to mem_addr (movaps m128, xmm).
     NOTE(review): unlike _mm_storeu_ps and _mm_store_ss, this entry has no vexEq / dontShowZeroUnmodMsg attributes; confirm whether that omission is intended. -->
<intrinsic tech='SSE' rettype='void' name='_mm_store_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movaps' form='m128, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_storeu_ps: unaligned 128-bit store of "a" to mem_addr (movups m128, xmm). Same operation text as _mm_store_ps. -->
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeu_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory.
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movups' form='m128, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_storer_ps: stores the four 32-bit floats of "a" in reverse order, so bits 127:96 of "a" land at mem_addr.
     NOTE(review): this entry is marked sequence='true' yet also lists a movups instruction, and the description requires
     16-byte alignment; other sequence entries nearby list no instruction. Confirm which is intended. -->
<intrinsic tech='SSE' sequence='true' rettype='void' name='_mm_storer_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float*' />
	<parameter varname='a' type='__m128' />
	<description>Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[127:96]
MEM[mem_addr+63:mem_addr+32] := a[95:64]
MEM[mem_addr+95:mem_addr+64] := a[63:32]
MEM[mem_addr+127:mem_addr+96] := a[31:0]
	</operation>
	<instruction name='movups' form='m128, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_move_ss: dst takes bits 31:0 from "b" and bits 127:32 from "a" (movss xmm, xmm). -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_move_ss'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := b[31:0]
dst[63:32] := a[63:32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
	</operation>
	<instruction name='movss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_shuffle_ps: each 2-bit field of imm8 picks one 32-bit lane. The two low result lanes are selected
     from "a" (imm8[1:0], imm8[3:2]) and the two high result lanes from "b" (imm8[5:4], imm8[7:6]). -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_shuffle_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<parameter varname='imm8' type='unsigned int' />
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
	</operation>
	<instruction name='shufps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_unpackhi_ps: interleaves the two high 32-bit lanes of "a" and "b"; the result, low to high, is a[95:64], b[95:64], a[127:96], b[127:96]. -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_unpackhi_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='unpckhps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_unpacklo_ps: interleaves the two low 32-bit lanes of "a" and "b"; the result, low to high, is a[31:0], b[31:0], a[63:32], b[63:32]. -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_unpacklo_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='unpcklps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_movehl_ps: the low 64 bits of dst come from the high 64 bits of "b"; the high 64 bits of dst come from the high 64 bits of "a" (movhlps). -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_movehl_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst".</description>
	<operation>
dst[31:0] := b[95:64]
dst[63:32] := b[127:96]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
	</operation>
	<instruction name='movhlps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_movelh_ps: the low 64 bits of dst come from the low 64 bits of "a"; the high 64 bits of dst come from the low 64 bits of "b" (movlhps). -->
<intrinsic tech='SSE' vexEq='TRUE' rettype='__m128' name='_mm_movelh_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128' />
	<parameter varname='b' type='__m128' />
	<description>Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst".</description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := b[31:0]
dst[127:96] := b[63:32]
	</operation>
	<instruction name='movlhps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_movemask_ps: gathers the top bit (bit 31) of each of the four 32-bit lanes of "a" into dst[3:0]; every higher bit of dst is cleared. -->
<intrinsic tech='SSE' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_movemask_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128' />
	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF a[i+31]
		dst[j] := 1
	ELSE
		dst[j] := 0
	FI
ENDFOR
dst[MAX:4] := 0
	</operation>
	<instruction name='movmskps' form='r32, xmm'/>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_malloc: aligned-allocation support routine; the entry has no type, operation or instruction element.
     NOTE(review): tech is 'Other' while CPUID is SSE; presumably deliberate for a support routine, confirm. -->
<intrinsic tech='Other' rettype='void*' name='_mm_malloc'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='size' type='size_t'/>
	<parameter varname='align' type='size_t'/>
	<description>Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc".</description>
	<header>xmmintrin.h</header>
</intrinsic>
<!-- _mm_free: releases memory obtained from _mm_malloc; the entry has no type, operation or instruction element.
     NOTE(review): tech is 'Other' while CPUID is SSE; presumably deliberate for a support routine, confirm. -->
<intrinsic tech='Other' rettype='void' name='_mm_free'>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='mem_addr' type='void *'/>
	<description>Free aligned memory that was allocated with "_mm_malloc".</description>
	<header>xmmintrin.h</header>
</intrinsic>

<!-- _mm_pause: first SSE2 entry in this section (header emmintrin.h). Spin-wait hint with no arguments and no operation element; the instruction is pause. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_pause'>
	<CPUID>SSE2</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops.</description>
	<instruction name='pause' form=''/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='4'/>
	<perfdata arch='Sandy Bridge' lat='4'/>
	<perfdata arch='Westmere' lat='5'/>
	<perfdata arch='Nehalem' lat='5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_clflush: flushes the cache line containing "p"; no operation element. The instruction is clflush with operand form 'mprefetch'. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_clflush'>
	<CPUID>SSE2</CPUID>
	<category>General Support</category>
	<parameter varname='p' type='void const*'/>
	<description>Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy.</description>
	<instruction name='clflush' form='mprefetch'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_lfence: load fence with no arguments and no operation element. perfdata is listed for Westmere and Nehalem only. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_lfence'>
	<CPUID>SSE2</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order.</description>
	<instruction name='lfence' form=''/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_mfence: full load/store fence with no arguments and no operation element. perfdata is listed for Westmere and Nehalem only. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_mfence'>
	<CPUID>SSE2</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.</description>
	<instruction name='mfence' form=''/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_epi8: lane-wise add over sixteen 8-bit lanes (paddb); the operation is a plain add with no saturation helper. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
	</operation>
	<instruction name='paddb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_epi16: lane-wise add over eight 16-bit lanes (paddw); the operation is a plain add with no saturation helper. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
	</operation>
	<instruction name='paddw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_epi32: lane-wise add over four 32-bit lanes (paddd); the operation is a plain add with no saturation helper. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
	</operation>
	<instruction name='paddd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_si64: single 64-bit add on __m64 operands (paddq mm, mm). The packed 128-bit form in this file is _mm_add_epi64. -->
<intrinsic tech='SSE2' rettype='__m64' name='_mm_add_si64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Add 64-bit integers "a" and "b", and store the result in "dst".</description>
	<operation>
dst[63:0] := a[63:0] + b[63:0]
	</operation>
	<instruction name='paddq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_epi64: lane-wise add over two 64-bit lanes (paddq xmm, xmm); the operation is a plain add with no saturation helper. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_add_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
	</operation>
	<instruction name='paddq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_adds_epi8: saturating add over sixteen 8-bit lanes (paddsb); each sum passes through Saturate_To_Int8. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='paddsb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_adds_epi16: saturating add over eight 16-bit lanes (paddsw); each sum passes through Saturate_To_Int16. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
	</operation>
	<instruction name='paddsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_adds_epu8: unsigned saturating add over sixteen 8-bit lanes (paddusb); each sum passes through Saturate_To_UnsignedInt8. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epu8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='paddusb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_adds_epu16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
	</operation>
	<instruction name='paddusw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_avg_epu8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
ENDFOR
	</operation>
	<instruction name='pavgb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_avg_epu16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
ENDFOR
	</operation>
	<instruction name='pavgw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_madd_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
	</operation>
	<instruction name='pmaddwd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_max_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF a[i+15:i] &gt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_max_epu8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF a[i+7:i] &gt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxub' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_min_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF a[i+15:i] &lt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_min_epu8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF a[i+7:i] &lt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminub' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mulhi_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
	</operation>
	<instruction name='pmulhw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mulhi_epu16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
	</operation>
	<instruction name='pmulhuw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mullo_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[15:0]
ENDFOR
	</operation>
	<instruction name='pmullw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m64' name='_mm_mul_su32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". </description>
	<operation>
dst[63:0] := a[31:0] * b[31:0]
	</operation>
	<instruction name='pmuludq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_mul_epu32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
	</operation>
	<instruction name='pmuludq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sad_epu8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 1
	i := j*64
	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] +
	               tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
	dst[i+63:i+16] := 0
ENDFOR
	</operation>
	<instruction name='psadbw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='5' tpt='1'/>
	<perfdata arch='Nehalem' lat='5' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
	</operation>
	<instruction name='psubb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
	</operation>
	<instruction name='psubw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
	</operation>
	<instruction name='psubd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m64' name='_mm_sub_si64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst".</description>
	<operation>
dst[63:0] := a[63:0] - b[63:0]
	</operation>
	<instruction name='psubq' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sub_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
	</operation>
	<instruction name='psubq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed signed 8-bit integers in "b" from packed signed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
	</operation>
	<instruction name='psubsb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed signed 16-bit integers in "b" from packed signed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
	</operation>
	<instruction name='psubsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epu8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
	</operation>
	<instruction name='psubusb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_subs_epu16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
	</operation>
	<instruction name='psubusw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
	</operation>
	<instruction name='pslldq' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_bslli_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
	</operation>
	<instruction name='pslldq' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_bsrli_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
	</operation>
	<instruction name='psrldq' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllw' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sll_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllw' form='xmm, xmm'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='pslld' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sll_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='pslld' form='xmm, xmm'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_slli_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF imm8[7:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllq' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sll_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF count[63:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psllq' form='xmm, xmm'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srai_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psraw' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sra_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psraw' form='xmm, xmm'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srai_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrad' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_sra_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrad' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
	</operation>
	<instruction name='psrldq' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlw' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srl_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrld' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srl_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrld' form='xmm, xmm'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srli_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF imm8[7:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlq' form='xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_srl_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF count[63:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
	FI
ENDFOR
	</operation>
	<instruction name='psrlq' form='xmm, xmm'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_and_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[127:0] := (a[127:0] AND b[127:0])
	</operation>
	<instruction name='pand' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_andnot_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
	<operation>
dst[127:0] := ((NOT a[127:0]) AND b[127:0])
	</operation>
	<instruction name='pandn' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_or_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[127:0] := (a[127:0] OR b[127:0])
	</operation>
	<instruction name='por' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_xor_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[127:0] := (a[127:0] XOR b[127:0])
	</operation>
	<instruction name='pxor' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmplt_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := ( a[i+7:i] &lt; b[i+7:i] ) ? 0xFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmplt_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := ( a[i+15:i] &lt; b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cmplt_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &lt; b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtepi32_pd'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	m := j*64
	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvtdq2pd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtsi32_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='int'/>
	<description>Convert the 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
	<operation>
dst[63:0] := Convert_Int32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='cvtsi2sd' form='xmm, r32'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtsi64_sd'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__int64'/>
	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
	<operation>
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='cvtsi2sd' form='xmm, r64'/>
	<perfdata arch='Haswell' lat='4' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtsi64x_sd'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__int64'/>
	<description>Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
	<operation>
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='cvtsi2sd' form='xmm, r64'/>
	<perfdata arch='Haswell' lat='4' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128' name='_mm_cvtepi32_ps'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvtdq2ps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128d' name='_mm_cvtpi32_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m64'/>
	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	m := j*64
	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvtpi2pd' form='xmm, mm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtsi32_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='int'/>
	<description>Copy 32-bit integer "a" to the lower element of "dst", and zero the upper elements of "dst".</description>
	<operation>
dst[31:0] := a[31:0]
dst[127:32] := 0
	</operation>
	<instruction name='movd' form='xmm, r32'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtsi64_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__int64'/>
	<description>Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := 0
	</operation>
	<instruction name='movq' form='xmm, r64'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtsi64x_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__int64'/>
	<description>Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := 0
	</operation>
	<instruction name='movq' form='xmm, r64'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtsi128_si32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Copy the lower 32-bit integer in "a" to "dst".</description>
	<operation>
dst[31:0] := a[31:0]
	</operation>
	<instruction name='movd' form='r32, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsi128_si64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Copy the lower 64-bit integer in "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movq' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsi128_si64x'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Copy the lower 64-bit integer in "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movq' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='__m64'/>
	<parameter varname='e0' type='__m64'/>
	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[63:0] := e0
dst[127:64] := e1
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi64x'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='__int64'/>
	<parameter varname='e0' type='__int64'/>
	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[63:0] := e0
dst[127:64] := e1
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='int'/>
	<parameter varname='e2' type='int'/>
	<parameter varname='e1' type='int'/>
	<parameter varname='e0' type='int'/>
	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='short'/>
	<parameter varname='e6' type='short'/>
	<parameter varname='e5' type='short'/>
	<parameter varname='e4' type='short'/>
	<parameter varname='e3' type='short'/>
	<parameter varname='e2' type='short'/>
	<parameter varname='e1' type='short'/>
	<parameter varname='e0' type='short'/>
	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[15:0] := e0
dst[31:16] := e1
dst[47:32] := e2
dst[63:48] := e3
dst[79:64] := e4
dst[95:80] := e5
dst[111:96] := e6
dst[127:112] := e7
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e15' type='char'/>
	<parameter varname='e14' type='char'/>
	<parameter varname='e13' type='char'/>
	<parameter varname='e12' type='char'/>
	<parameter varname='e11' type='char'/>
	<parameter varname='e10' type='char'/>
	<parameter varname='e9' type='char'/>
	<parameter varname='e8' type='char'/>
	<parameter varname='e7' type='char'/>
	<parameter varname='e6' type='char'/>
	<parameter varname='e5' type='char'/>
	<parameter varname='e4' type='char'/>
	<parameter varname='e3' type='char'/>
	<parameter varname='e2' type='char'/>
	<parameter varname='e1' type='char'/>
	<parameter varname='e0' type='char'/>
	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[7:0] := e0
dst[15:8] := e1
dst[23:16] := e2
dst[31:24] := e3
dst[39:32] := e4
dst[47:40] := e5
dst[55:48] := e6
dst[63:56] := e7
dst[71:64] := e8
dst[79:72] := e9
dst[87:80] := e10
dst[95:88] := e11
dst[103:96] := e12
dst[111:104] := e13
dst[119:112] := e14
dst[127:120] := e15
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='__m64'/>
	<description>Broadcast 64-bit integer "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi64x'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='__int64'/>
	<description>Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastq".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='int'/>
	<description>Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='short'/>
	<description>Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastw".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_set1_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='char'/>
	<description>Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='__m64'/>
	<parameter varname='e0' type='__m64'/>
	<description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[63:0] := e1
dst[127:64] := e0
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='int'/>
	<parameter varname='e2' type='int'/>
	<parameter varname='e1' type='int'/>
	<parameter varname='e0' type='int'/>
	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[31:0] := e3
dst[63:32] := e2
dst[95:64] := e1
dst[127:96] := e0
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='short'/>
	<parameter varname='e6' type='short'/>
	<parameter varname='e5' type='short'/>
	<parameter varname='e4' type='short'/>
	<parameter varname='e3' type='short'/>
	<parameter varname='e2' type='short'/>
	<parameter varname='e1' type='short'/>
	<parameter varname='e0' type='short'/>
	<description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[15:0] := e7
dst[31:16] := e6
dst[47:32] := e5
dst[63:48] := e4
dst[79:64] := e3
dst[95:80] := e2
dst[111:96] := e1
dst[127:112] := e0
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128i' name='_mm_setr_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e15' type='char'/>
	<parameter varname='e14' type='char'/>
	<parameter varname='e13' type='char'/>
	<parameter varname='e12' type='char'/>
	<parameter varname='e11' type='char'/>
	<parameter varname='e10' type='char'/>
	<parameter varname='e9' type='char'/>
	<parameter varname='e8' type='char'/>
	<parameter varname='e7' type='char'/>
	<parameter varname='e6' type='char'/>
	<parameter varname='e5' type='char'/>
	<parameter varname='e4' type='char'/>
	<parameter varname='e3' type='char'/>
	<parameter varname='e2' type='char'/>
	<parameter varname='e1' type='char'/>
	<parameter varname='e0' type='char'/>
	<description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[7:0] := e15
dst[15:8] := e14
dst[23:16] := e13
dst[31:24] := e12
dst[39:32] := e11
dst[47:40] := e10
dst[55:48] := e9
dst[63:56] := e8
dst[71:64] := e7
dst[79:72] := e6
dst[87:80] := e5
dst[95:88] := e4
dst[103:96] := e3
dst[111:104] := e2
dst[119:112] := e1
dst[127:120] := e0
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128i' name='_mm_setzero_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<description>Return vector of type __m128i with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='pxor' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128i' name='_mm_loadl_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m128i const*'/>
	<description>Load 64-bit integer from memory into the first element of "dst", and zero the upper element of "dst".</description>
	<operation>
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[MAX:64] := 0
	</operation>
	<instruction name='movq' form='xmm, m64'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_load_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m128i const*'/>
	<description>Load 128-bits of integer data from memory into "dst".
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movdqa' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_loadu_si128: unaligned 128-bit integer load from "mem_addr" into dst. Listed instruction: movdqu xmm, m128. Same operation text as _mm_load_si128; only the alignment requirement and instruction differ. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_loadu_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m128i const*'/>
	<description>Load 128-bits of integer data from memory into "dst".
	"mem_addr" does not need to be aligned on any particular boundary.
	</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movdqu' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_maskmoveu_si128: byte-wise conditional store. For each of the 16 bytes of "a", the byte is written to memory at "mem_addr" only when bit 7 of the matching "mask" byte is set. Listed instruction: maskmovdqu xmm, xmm. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_maskmoveu_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='mem_addr' type='char*'/>
	<description>Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='maskmovdqu' form='xmm, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_store_si128: aligned 128-bit integer store of "a" to "mem_addr" (16-byte alignment required per the description). Listed instruction: movdqa m128, xmm. The unaligned counterpart is _mm_storeu_si128. -->
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m128i*'/>
	<parameter varname='a' type='__m128i'/>
	<description>Store 128-bits of integer data from "a" into memory.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movdqa' form='m128, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_storeu_si128: unaligned 128-bit integer store of "a" to "mem_addr". Listed instruction: movdqu m128, xmm. Same operation text as _mm_store_si128; only the alignment requirement and instruction differ. -->
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeu_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m128i*'/>
	<parameter varname='a' type='__m128i'/>
	<description>Store 128-bits of integer data from "a" into memory.
	"mem_addr" does not need to be aligned on any particular boundary.
	</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movdqu' form='m128, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_storel_epi64: stores the low 64 bits a[63:0] to memory at "mem_addr"; the upper half of "a" is not written. Listed instruction: movq m64, xmm. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_storel_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m128i*'/>
	<parameter varname='a' type='__m128i'/>
	<description>Store 64-bit integer from the first element of "a" into memory.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
	</operation>
	<instruction name='movq' form='m64, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_stream_si128: aligned 128-bit integer store of "a" to "mem_addr" with a non-temporal hint (per the description). Listed instruction: movntdq m128, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_stream_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m128i*'/>
	<parameter varname='a' type='__m128i'/>
	<description>Store 128-bits of integer data from "a" into memory using a non-temporal memory hint.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movntdq' form='m128, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_stream_si32: stores the 32-bit integer "a" to "mem_addr" with a non-temporal hint (per the description). Listed instruction: movnti m32, r32. The 64-bit variant is _mm_stream_si64. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_stream_si32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='int*'/>
	<parameter varname='a' type='int'/>
	<description>Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.</description>
	<operation>
MEM[mem_addr+31:mem_addr] := a[31:0]
	</operation>
	<instruction name='movnti' form='m32, r32'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_stream_si64: stores the 64-bit integer "a" to "mem_addr" with a non-temporal hint (per the description). Listed instruction: movnti m64, r64. The 32-bit variant is _mm_stream_si32. -->
<intrinsic tech='SSE2' rettype='void' name='_mm_stream_si64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__int64*'/>
	<parameter varname='a' type='__int64'/>
	<description>Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
	</operation>
	<instruction name='movnti' form='m64, r64'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_movepi64_pi64: copies the low 64 bits of the __m128i "a" into a __m64 result. Listed instruction: movdq2q mm, xmm. Categorised as Move to match the sibling 64-bit copy records _mm_movpi64_epi64 and _mm_move_epi64. -->
<intrinsic tech='SSE2' rettype='__m64' name='_mm_movepi64_pi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128i'/>
	<description>Copy the lower 64-bit integer in "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
	</operation>
	<instruction name='movdq2q' form='mm, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_movpi64_epi64: copies the __m64 "a" into dst[63:0] and zeroes dst[127:64]. Listed instruction: movq2dq xmm, mm. -->
<intrinsic tech='SSE2' rettype='__m128i' name='_mm_movpi64_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m64'/>
	<description>Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element.</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := 0
	</operation>
	<instruction name='movq2dq' form='xmm, mm'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_move_epi64: copies a[63:0] into dst[63:0] and zeroes dst[127:64]; source and result are both __m128i. Listed instruction: movq xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__m128i' name='_mm_move_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128i'/>
	<description>Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element.</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := 0
	</operation>
	<instruction name='movq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_packs_epi16: narrows sixteen 16-bit lanes to 8-bit with signed saturation. The eight lanes of "a" fill dst[63:0] and the eight lanes of "b" fill dst[127:64], in lane order. Listed instruction: packsswb xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_packs_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
	</description>
	<operation>
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
	</operation>
	<instruction name='packsswb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_packs_epi32: narrows eight 32-bit lanes to 16-bit with signed saturation. The four lanes of "a" fill dst[63:0] and the four lanes of "b" fill dst[127:64], in lane order. Listed instruction: packssdw xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_packs_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
	<operation>
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
	</operation>
	<instruction name='packssdw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_packus_epi16: narrows sixteen 16-bit lanes to 8-bit with unsigned saturation. The eight lanes of "a" fill dst[63:0] and the eight lanes of "b" fill dst[127:64], in lane order. Listed instruction: packuswb xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_packus_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
	<operation>
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
	</operation>
	<instruction name='packuswb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_extract_epi16: returns the 16-bit lane of "a" selected by imm8[2:0] in dst[15:0], with dst[31:16] zeroed. The shift in the operation is written with escaped greater-than signs. Listed instruction: pextrw r32, xmm, imm. -->
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
	<operation>
dst[15:0] := (a[127:0] &gt;&gt; (imm8[2:0] * 16))[15:0]
dst[31:16] := 0
	</operation>
	<instruction name='pextrw' form='r32, xmm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_insert_epi16: copies "a" to dst, then overwrites the 16-bit lane at bit offset imm8[2:0]*16 with i[15:0]. Listed instruction: pinsrw xmm, r32, imm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='i' type='int'/>
	<parameter varname="imm8" type='int'/>
	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". </description>
	<operation>
dst[127:0] := a[127:0]
sel := imm8[2:0]*16
dst[sel+15:sel] := i[15:0]
	</operation>
	<instruction name='pinsrw' form='xmm, r32, imm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_movemask_epi8: gathers bit 7 of each of the 16 bytes of "a" into dst[15:0] (byte j goes to bit j) and zeroes dst[MAX:16]. Listed instruction: pmovmskb r32, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_movemask_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<description>Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:16] := 0
	</operation>
	<instruction name='pmovmskb' form='r32, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_shuffle_epi32: each 32-bit lane of dst is chosen from "a" by a 2-bit field of "imm8" (bits 1:0 pick lane 0, 3:2 lane 1, 5:4 lane 2, 7:6 lane 3). The operation defines the helper SELECT4 for the per-lane pick. Listed instruction: pshufd xmm, xmm, imm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_shuffle_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
	</operation>
	<instruction name='pshufd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_shufflehi_epi16: the low 64 bits of dst are copied from "a"; each 16-bit lane in the high 64 bits is chosen from the high half of "a" by a 2-bit field of "imm8". Shift operators are written as escaped greater-than signs, matching _mm_extract_epi16. Listed instruction: pshufhw xmm, xmm, imm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_shufflehi_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
dst[79:64] := (a &gt;&gt; (imm8[1:0] * 16))[79:64]
dst[95:80] := (a &gt;&gt; (imm8[3:2] * 16))[79:64]
dst[111:96] := (a &gt;&gt; (imm8[5:4] * 16))[79:64]
dst[127:112] := (a &gt;&gt; (imm8[7:6] * 16))[79:64]
	</operation>
	<instruction name='pshufhw' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_shufflelo_epi16: each 16-bit lane in the low 64 bits of dst is chosen from the low half of "a" by a 2-bit field of "imm8"; the high 64 bits are copied from "a". Shift operators are written as escaped greater-than signs, matching _mm_extract_epi16. Listed instruction: pshuflw xmm, xmm, imm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_shufflelo_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst".</description>
	<operation>
dst[15:0] := (a &gt;&gt; (imm8[1:0] * 16))[15:0]
dst[31:16] := (a &gt;&gt; (imm8[3:2] * 16))[15:0]
dst[47:32] := (a &gt;&gt; (imm8[5:4] * 16))[15:0]
dst[63:48] := (a &gt;&gt; (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='pshuflw' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpackhi_epi8: interleaves the eight bytes in bits 127:64 of "a" and "b"; even result bytes come from "a", odd result bytes from "b". The operation defines the helper INTERLEAVE_HIGH_BYTES. Listed instruction: punpckhbw xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
	</operation>
	<instruction name='punpckhbw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpackhi_epi16: interleaves the four 16-bit lanes in bits 127:64 of "a" and "b"; even result lanes come from "a", odd result lanes from "b". The operation defines the helper INTERLEAVE_HIGH_WORDS. Listed instruction: punpckhwd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='punpckhwd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpackhi_epi32: interleaves the two 32-bit lanes in bits 127:64 of "a" and "b"; even result lanes come from "a", odd result lanes from "b". The operation defines the helper INTERLEAVE_HIGH_DWORDS. Listed instruction: punpckhdq xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='punpckhdq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpackhi_epi64: dst[63:0] takes the high 64 bits of "a" and dst[127:64] takes the high 64 bits of "b". The operation defines the helper INTERLEAVE_HIGH_QWORDS. Listed instruction: punpckhqdq xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpackhi_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='punpckhqdq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpacklo_epi8: interleaves the eight bytes in bits 63:0 of "a" and "b"; even result bytes come from "a", odd result bytes from "b". The operation defines the helper INTERLEAVE_BYTES. Listed instruction: punpcklbw xmm, xmm. The Haswell perfdata entry uses the same values as every other unpack record in this file. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi8'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
	</operation>
	<instruction name='punpcklbw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpacklo_epi16: interleaves the four 16-bit lanes in bits 63:0 of "a" and "b"; even result lanes come from "a", odd result lanes from "b". The operation defines the helper INTERLEAVE_WORDS. Listed instruction: punpcklwd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi16'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='punpcklwd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpacklo_epi32: interleaves the two 32-bit lanes in bits 63:0 of "a" and "b"; even result lanes come from "a", odd result lanes from "b". The operation defines the helper INTERLEAVE_DWORDS. Listed instruction: punpckldq xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi32'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='punpckldq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_unpacklo_epi64: dst[63:0] takes the low 64 bits of "a" and dst[127:64] takes the low 64 bits of "b". The operation defines the helper INTERLEAVE_QWORDS. Listed instruction: punpcklqdq xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_unpacklo_epi64'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='punpcklqdq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_sd: scalar double add. dst[63:0] is a[63:0] + b[63:0]; dst[127:64] is copied from "a". Listed instruction: addsd xmm, xmm. The packed form is _mm_add_pd. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_add_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
	<operation>
dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='addsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_add_pd: packed double add. Each of the two 64-bit lanes of dst is the sum of the matching lanes of "a" and "b". Listed instruction: addpd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_add_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
	</operation>
	<instruction name='addpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_div_sd: scalar double divide. dst[63:0] is a[63:0] divided by b[63:0]; dst[127:64] is copied from "a". The operation uses the "/" operator, as the packed form _mm_div_pd does. Listed instruction: divsd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_div_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". </description>
	<operation>
dst[63:0] := a[63:0] / b[63:0]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='divsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='20' tpt='12'/>
	<perfdata arch='Ivy Bridge' lat='20' tpt='14'/>
	<perfdata arch='Sandy Bridge' lat='21' tpt='22'/>
	<perfdata arch='Westmere' lat='21' tpt='20'/>
	<perfdata arch='Nehalem' lat='21' tpt='20'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_div_pd: packed double divide. Each of the two 64-bit lanes of dst is the matching lane of "a" divided by the matching lane of "b". Listed instruction: divpd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_div_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
	</operation>
	<instruction name='divpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='20' tpt='12'/>
	<perfdata arch='Ivy Bridge' lat='20' tpt='14'/>
	<perfdata arch='Sandy Bridge' lat='21' tpt='22'/>
	<perfdata arch='Westmere' lat='21' tpt='20'/>
	<perfdata arch='Nehalem' lat='21' tpt='20'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_max_sd: scalar double maximum. dst[63:0] is MAX of the low lanes of "a" and "b"; dst[127:64] is copied from "a". Listed instruction: maxsd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_max_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := MAX(a[63:0], b[63:0])
dst[127:64] := a[127:64]
	</operation>
	<instruction name='maxsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_max_pd: packed double maximum. Each of the two 64-bit lanes of dst is MAX of the matching lanes of "a" and "b". Listed instruction: maxpd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_max_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
	</operation>
	<instruction name='maxpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_min_sd: scalar double minimum. dst[63:0] is MIN of the low lanes of "a" and "b"; dst[127:64] is copied from "a". Listed instruction: minsd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_min_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := MIN(a[63:0], b[63:0])
dst[127:64] := a[127:64]
	</operation>
	<instruction name='minsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_min_pd: packed double minimum. Each of the two 64-bit lanes of dst is MIN of the matching lanes of "a" and "b". Listed instruction: minpd xmm, xmm. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_min_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
	</operation>
	<instruction name='minpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<!-- _mm_mul_sd: scalar double multiply. dst[63:0] is a[63:0] * b[63:0]; dst[127:64] is copied from "a". Listed instruction: mulsd xmm, xmm. The packed form is _mm_mul_pd. -->
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_mul_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := a[63:0] * b[63:0]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='mulsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='5' tpt='1'/>
	<perfdata arch='Nehalem' lat='5' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_mul_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
	</operation>
	<instruction name='mulpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='5' tpt='1'/>
	<perfdata arch='Nehalem' lat='5' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sqrt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := SQRT(b[63:0])
dst[127:64] := a[127:64]
	</operation>
	<instruction name='sqrtsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='20' tpt='13'/>
	<perfdata arch='Ivy Bridge' lat='20' tpt='14'/>
	<perfdata arch='Sandy Bridge' lat='21' tpt='22'/>
	<perfdata arch='Westmere' lat='32' tpt='30'/>
	<perfdata arch='Nehalem' lat='32' tpt='30'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sqrt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
	</operation>
	<instruction name='sqrtpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='20' tpt='13'/>
	<perfdata arch='Ivy Bridge' lat='20' tpt='14'/>
	<perfdata arch='Sandy Bridge' lat='21' tpt='22'/>
	<perfdata arch='Westmere' lat='32' tpt='30'/>
	<perfdata arch='Nehalem' lat='32' tpt='30'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sub_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := a[63:0] - b[63:0]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='subsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_sub_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
	</operation>
	<instruction name='subpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_and_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
	</operation>
	<instruction name='andpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_andnot_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
	</operation>
	<instruction name='andnpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_or_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ENDFOR
	</operation>
	<instruction name='orpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_xor_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
	</operation>
	<instruction name='xorpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpeq_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmplt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] &lt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmple_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] &lt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpgt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] &gt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpge_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] &gt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpord_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpunord_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpneq_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnlt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := !(a[63:0] &lt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnle_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := !(a[63:0] &lt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpngt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := !(a[63:0] &gt; b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnge_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := !(a[63:0] &gt;= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
	</operation>
	<instruction name='cmpsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpeq_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmplt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] &lt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmple_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] &lt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpgt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] &gt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpge_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] &gt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpord_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpunord_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpneq_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnlt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := !(a[i+63:i] &lt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnle_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := !(a[i+63:i] &lt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpngt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := !(a[i+63:i] &gt; b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cmpnge_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := !(a[i+63:i] &gt;= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='cmppd' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comieq_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='comisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comilt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[63:0] &lt; b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='comisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comile_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[63:0] &lt;= b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='comisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comigt_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[63:0] &gt; b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='comisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comige_sd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[63:0] &gt;= b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='comisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_comineq_sd'>
	<!-- Scalar comparison via comisd on bits 63:0 of "a" and "b"; the int result is 1 when a != b and 0 otherwise. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).</description>
	<operation>
RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='comisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomieq_sd'>
	<!-- Scalar comparison via ucomisd on bits 63:0 of "a" and "b"; the int result is 1 when a == b and 0 otherwise. Per the description, QNaN operands do not signal an exception. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomilt_sd'>
	<!-- Scalar comparison via ucomisd on bits 63:0 of "a" and "b"; the int result is 1 when a < b and 0 otherwise. Per the description, QNaN operands do not signal an exception. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[63:0] &lt; b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomile_sd'>
	<!-- Scalar comparison via ucomisd on bits 63:0 of "a" and "b"; the int result is 1 when a <= b and 0 otherwise. Per the description, QNaN operands do not signal an exception. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[63:0] &lt;= b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomigt_sd'>
	<!-- Scalar comparison via ucomisd on bits 63:0 of "a" and "b"; the int result is 1 when a > b and 0 otherwise. Per the description, QNaN operands do not signal an exception. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[63:0] &gt; b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomige_sd'>
	<!-- Scalar comparison via ucomisd on bits 63:0 of "a" and "b"; the int result is 1 when a >= b and 0 otherwise. Per the description, QNaN operands do not signal an exception. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[63:0] &gt;= b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_ucomineq_sd'>
	<!-- Scalar comparison via ucomisd on bits 63:0 of "a" and "b"; the int result is 1 when a != b and 0 otherwise. Per the description, QNaN operands do not signal an exception. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.</description>
	<operation>
RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
	</operation>
	<instruction name='ucomisd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128' name='_mm_cvtpd_ps'>
	<!-- Narrows the two 64-bit elements of "a" into 32-bit floats written to dst[31:0] and dst[63:32]; in the loop, i is the destination bit offset and k the source bit offset. -->
	<!-- NOTE(review): the operation assigns only dst[63:0]; the content of dst[127:64] of the __m128 result is not stated in this entry. Confirm against the cvtpd2ps instruction reference. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
	</operation>
	<instruction name='cvtpd2ps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtps_pd'>
	<!-- Widens the two lowest 32-bit floats of "a" (bits 63:0) into two 64-bit doubles filling dst[127:0]; bits 127:64 of "a" are not read by the operation. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
	</operation>
	<instruction name='cvtps2pd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtpd_epi32'>
	<!-- Converts the two 64-bit doubles of "a" into 32-bit integers written to dst[31:0] and dst[63:32]; in the loop, i is the destination bit offset and k the source bit offset. -->
	<!-- NOTE(review): the operation assigns only dst[63:0]; the content of dst[127:64] of the __m128i result is not stated in this entry. Confirm against the cvtpd2dq instruction reference. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
	</operation>
	<instruction name='cvtpd2dq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvtsd_si32'>
	<!-- Converts bits 63:0 of "a" to a 32-bit integer returned as int (instruction form 'r32, xmm'). This entry lists perfdata for Haswell, Ivy Bridge and Sandy Bridge only. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP64_To_Int32(a[63:0])
	</operation>
	<instruction name='cvtsd2si' form='r32, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsd_si64'>
	<!-- Converts bits 63:0 of "a" to a 64-bit integer returned as __int64 (instruction form 'r64, xmm'). -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
	<operation>
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
	</operation>
	<instruction name='cvtsd2si' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvtsd_si64x'>
	<!-- Converts bits 63:0 of "a" to a 64-bit integer returned as __int64. Description, operation, instruction and perfdata are the same as in the _mm_cvtsd_si64 entry; only the name differs. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".</description>
	<operation>
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
	</operation>
	<instruction name='cvtsd2si' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128' name='_mm_cvtsd_ss'>
	<!-- Narrows bits 63:0 of "b" to a 32-bit float placed in dst[31:0]; dst[127:32] (three 32-bit elements) is passed through unchanged from the same bit positions of "a". Every assignment in the operation pairs bit ranges of equal width, and the final zeroing starts above bit 127 so it does not overwrite the copied elements. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
	</description>
	<operation>
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='cvtsd2ss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='double' name='_mm_cvtsd_f64'>
	<!-- Returns bits 63:0 of "a" as a scalar double; the operation is a plain copy with no conversion. This entry carries no perfdata. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".</description>
	<operation>dst[63:0] := a[63:0]</operation>
	<instruction name='movsd' form='m64, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_cvtss_sd'>
	<!-- Widens bits 31:0 of "b" to a 64-bit double placed in dst[63:0]; dst[127:64] is passed through unchanged from "a". The final zeroing starts above bit 127 so it does not overwrite the element copied from "a". -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128'/>
	<description>Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
	</description>
	<operation>
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='cvtss2sd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='0' tpt='1'/>
	<perfdata arch='Nehalem' lat='0' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvttpd_epi32'>
	<!-- Truncating conversion of the two 64-bit doubles of "a" into 32-bit integers written to dst[31:0] and dst[63:32]; in the loop, i is the destination bit offset and k the source bit offset. -->
	<!-- NOTE(review): the operation assigns only dst[63:0]; the content of dst[127:64] of the __m128i result is not stated in this entry. Confirm against the cvttpd2dq instruction reference. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
	</operation>
	<instruction name='cvttpd2dq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cvttsd_si32'>
	<!-- Truncating conversion of bits 63:0 of "a" to a 32-bit integer returned as int (instruction form 'r32, xmm'). This entry carries no perfdata. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
	</operation>
	<instruction name='cvttsd2si' form='r32, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvttsd_si64'>
	<!-- Truncating conversion of bits 63:0 of "a" to a 64-bit integer returned as __int64 (instruction form 'r64, xmm'). -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
	<operation>
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
	</operation>
	<instruction name='cvttsd2si' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_cvttsd_si64x'>
	<!-- Truncating conversion of bits 63:0 of "a" to a 64-bit integer returned as __int64. Description, operation, instruction and perfdata are the same as in the _mm_cvttsd_si64 entry; only the name differs. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".</description>
	<operation>
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
	</operation>
	<instruction name='cvttsd2si' form='r64, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvtps_epi32'>
	<!-- Lane-wise conversion of the four 32-bit floats of "a" to 32-bit integers; source and destination share the same bit offset i, so all 128 bits of dst are written. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvtps2dq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128i' name='_mm_cvttps_epi32'>
	<!-- Lane-wise truncating conversion of the four 32-bit floats of "a" to 32-bit integers; source and destination share the same bit offset i, so all 128 bits of dst are written. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='cvttps2dq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m64' name='_mm_cvtpd_pi32'>
	<!-- Converts the two 64-bit doubles of "a" into two 32-bit integers that fill the 64-bit __m64 result (instruction form 'mm, xmm'). This entry carries no perfdata and no vexEq attribute. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
	</operation>
	<instruction name='cvtpd2pi' form='mm, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m64' name='_mm_cvttpd_pi32'>
	<!-- Truncating conversion of the two 64-bit doubles of "a" into two 32-bit integers that fill the 64-bit __m64 result (instruction form 'mm, xmm'). -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
	</operation>
	<instruction name='cvttpd2pi' form='mm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Westmere' lat='4' tpt='1'/>
	<perfdata arch='Nehalem' lat='4' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set_sd'>
	<!-- Builds a vector from scalar "a": dst[63:0] receives "a" and dst[127:64] is zeroed. The entry is marked sequence='true' and lists no instruction element. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='double'/>
	<description>Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element.</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := 0
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set1_pd'>
	<!-- Broadcast: both 64-bit elements of dst receive scalar "a". The entry is marked sequence='true' and lists no instruction element. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='double'/>
	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set_pd1'>
	<!-- Broadcast: both 64-bit elements of dst receive scalar "a". Description and operation are the same as in the _mm_set1_pd entry; only the name differs. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='a' type='double'/>
	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_set_pd'>
	<!-- Parameters are declared high element first: the first parameter "e1" fills dst[127:64] and the second parameter "e0" fills dst[63:0]. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='double'/>
	<parameter varname='e0' type='double'/>
	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
	<operation>
dst[63:0] := e0
dst[127:64] := e1
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_setr_pd'>
	<!-- Reverse-order setter: the first parameter "e1" fills dst[63:0] and the second parameter "e0" fills dst[127:64]. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='e1' type='double'/>
	<parameter varname='e0' type='double'/>
	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[63:0] := e1
dst[127:64] := e0
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128d' name='_mm_setzero_pd'>
	<!-- Takes no arguments and clears every bit of dst (dst[MAX:0]); the listed instruction is xorpd. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Set</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m128d with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='xorpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.8'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.33'/>
	<perfdata arch='Westmere' lat='1' tpt='0.33'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.33'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_load_pd'>
	<!-- Aligned 128-bit load via movapd; the description requires "mem_addr" to be 16-byte aligned. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst".
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movapd' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_load1_pd'>
	<!-- Load-and-broadcast: the same 64 bits at mem_addr are written to both dst[63:0] and dst[127:64]. -->
	<!-- NOTE(review): the operation reads only MEM[mem_addr+63:mem_addr], while the listed instruction form is 'xmm, m128'. Confirm that the instruction element is intended for this sequence entry. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
	<operation>
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := MEM[mem_addr+63:mem_addr]
	</operation>
	<instruction name='movapd' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>


<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_load_pd1'>
	<!-- Load-and-broadcast: the same 64 bits at mem_addr are written to both dst[63:0] and dst[127:64]. -->
	<!-- NOTE(review): the operation reads only MEM[mem_addr+63:mem_addr], while the listed instruction form is 'xmm, m128'. Confirm that the instruction element is intended for this sequence entry. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".</description>
	<operation>
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := MEM[mem_addr+63:mem_addr]
	</operation>
	<instruction name='movapd' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='__m128d' name='_mm_loadr_pd'>
	<!-- Reversed 128-bit load: the second 64-bit element in memory lands in dst[63:0] and the first in dst[127:64]. The description requires "mem_addr" to be 16-byte aligned; identifiers in descriptions are wrapped in double quotes, as in the other entries. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[63:0] := MEM[mem_addr+127:mem_addr+64]
dst[127:64] := MEM[mem_addr+63:mem_addr]
	</operation>
	<instruction name='movapd' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_loadu_pd'>
	<!-- Unaligned 128-bit load via movupd; per the description, "mem_addr" has no alignment requirement. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst".
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movupd' form='xmm, m128'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_load_sd'>
	<!-- Scalar load via movsd: 64 bits from mem_addr go to dst[63:0] and dst[127:64] is zeroed. Per the description, "mem_addr" has no alignment requirement. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := 0
	</operation>
	<instruction name='movsd' form='xmm, m64'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_loadh_pd'>
	<!-- Replaces only the upper element: dst[127:64] comes from memory and dst[63:0] is copied from "a". The perfdata entries here give latency only (no tpt attribute). -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := MEM[mem_addr+63:mem_addr]
	</operation>
	<instruction name='movhpd' form='xmm, m64'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_loadl_pd'>
	<!-- Replaces only the lower element: dst[63:0] comes from memory and dst[127:64] is copied from "a". The perfdata entries here give latency only (no tpt attribute). -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Load</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='movlpd' form='xmm, m64'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_stream_pd'>
	<!-- 128-bit store via movntpd; the description states a non-temporal memory hint and requires "mem_addr" to be 16-byte aligned. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movntpd' form='m128, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_sd'>
	<!-- Scalar store via movsd: writes a[63:0] to the 64 bits at mem_addr. Per the description, "mem_addr" has no alignment requirement. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
	</operation>
	<instruction name='movsd' form='m64, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='void' name='_mm_store1_pd'>
	<!-- Writes a[63:0] to both consecutive 64-bit slots starting at mem_addr. The entry is marked sequence='true' and lists no instruction element; the description requires 16-byte alignment. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
MEM[mem_addr+127:mem_addr+64] := a[63:0]
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='void' name='_mm_store_pd1'>
	<!-- Writes a[63:0] to both consecutive 64-bit slots starting at mem_addr. Description and operation are the same as in the _mm_store1_pd entry; only the name differs. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
MEM[mem_addr+127:mem_addr+64] := a[63:0]
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_store_pd'>
	<!-- Aligned 128-bit store via movapd; the description requires "mem_addr" to be 16-byte aligned. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movapd' form='m128, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeu_pd'>
	<!-- Unaligned 128-bit store via movupd; per the description, "mem_addr" has no alignment requirement. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory.
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
MEM[mem_addr+127:mem_addr] := a[127:0]
	</operation>
	<instruction name='movupd' form='m128, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' sequence='true' rettype='void' name='_mm_storer_pd'>
	<!-- Reversed store: a[127:64] is written to the first 64 bits at mem_addr and a[63:0] to the following 64 bits. The entry is marked sequence='true' and lists no instruction element. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[127:64]
MEM[mem_addr+127:mem_addr+64] := a[63:0]
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storeh_pd'>
	<!-- Stores only the upper element via movhpd: a[127:64] is written to the 64 bits at mem_addr. The description states no alignment requirement. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store the upper double-precision (64-bit) floating-point element from "a" into memory.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[127:64]
	</operation>
	<instruction name='movhpd' form='m64, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='void' name='_mm_storel_pd'>
	<!-- Stores only the lower element via movlpd: a[63:0] is written to the 64 bits at mem_addr. The description states no alignment requirement. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double*'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store the lower double-precision (64-bit) floating-point element from "a" into memory.</description>
	<operation>
MEM[mem_addr+63:mem_addr] := a[63:0]
	</operation>
	<instruction name='movlpd' form='m64, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_unpackhi_pd'>
	<!-- UNPCKHPD: low half of dst is a[127:64], high half of dst is b[127:64]; the helper in the operation spells out the interleave. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='unpckhpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_unpacklo_pd'>
	<!-- UNPCKLPD: low half of dst is a[63:0], high half of dst is b[63:0]; the helper in the operation spells out the interleave. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
	</operation>
	<instruction name='unpcklpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_movemask_pd'>
	<!-- MOVMSKPD: copies bit 63 of each of the two doubles into dst[0] and dst[1]; every higher bit of dst is cleared. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128d'/>
	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF a[i+63]
		dst[j] := 1
	ELSE
		dst[j] := 0
	FI
ENDFOR
dst[MAX:2] := 0
	</operation>
	<instruction name='movmskpd' form='r32, xmm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_shuffle_pd'>
	<!-- SHUFPD: imm8[0] selects which element of "a" becomes the low half of dst; imm8[1] selects which element of "b" becomes the high half. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='imm8' type='int'/>
	<description>Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". </description>
	<operation>
dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
	</operation>
	<instruction name='shufpd' form='xmm, xmm, imm'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' vexEq='TRUE' rettype='__m128d' name='_mm_move_sd'>
	<!-- MOVSD (register form): low half of dst comes from "b", high half from "a". The legacy SSE2 encoding takes two operands; the three-operand shape belongs to the VEX equivalent flagged by vexEq. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := b[63:0]
dst[127:64] := a[127:64]
	</operation>
	<instruction name='movsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128' name='_mm_castpd_ps'>
	<!-- Type reinterpretation only: this entry lists no operation and no instruction. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128d'/>
	<description>Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128i' name='_mm_castpd_si128'>
	<!-- Type reinterpretation only: this entry lists no operation and no instruction. Tagged with both types because it bridges floating-point and integer vectors. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128d'/>
	<description>Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128d' name='_mm_castps_pd'>
	<!-- Type reinterpretation only: this entry lists no operation and no instruction. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128'/>
	<description>Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128i' name='_mm_castps_si128'>
	<!-- Type reinterpretation only: this entry lists no operation and no instruction. Tagged with both types because it bridges floating-point and integer vectors. -->
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128'/>
	<description>Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128d' name='_mm_castsi128_pd'>
	<!-- Type reinterpretation only: this entry lists no operation and no instruction. NOTE(review): the reverse casts (_mm_castpd_si128, _mm_castps_si128) also carry an Integer type tag; confirm whether this entry should too. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128i'/>
	<description>Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128' name='_mm_castsi128_ps'>
	<!-- Type reinterpretation only: this entry lists no operation and no instruction. NOTE(review): the reverse casts (_mm_castpd_si128, _mm_castps_si128) also carry an Integer type tag; confirm whether this entry should too. -->
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128i'/>
	<description>Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' sequence='true' rettype='float' name='_cvtsh_ss'>
	<!-- Scalar FP16 to FP32 conversion, flagged as a sequence intrinsic; this entry lists no instruction and no CPUID requirement. -->
	<type>Floating Point</type>
	<category>Convert</category>
	<parameter varname='a' type='unsigned short'/>
	<description>Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst".</description>
	<operation>
dst[31:0] := Convert_FP16_To_FP32(a[15:0])
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' sequence='true' rettype='unsigned short' name='_cvtss_sh'>
	<!-- Scalar FP32 to FP16 conversion, flagged as a sequence intrinsic. NOTE(review): neither the description nor the operation says how "imm8" is used; presumably a rounding control like the "rounding" parameter of _mm_cvtps_ph, to be confirmed. -->
	<type>Floating Point</type>
	<category>Convert</category>
	<parameter varname='a' type='float'/>
	<parameter varname='imm8' type='int'/>
	<description>Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst".</description>
	<operation>
dst[15:0] := Convert_FP32_To_FP16(a[31:0])
	</operation>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='__m128' name='_mm_cvtph_ps'>
	<!-- VCVTPH2PS (FP16C): widens the four FP16 values held in a[63:0] into four FP32 values filling dst[127:0]. -->
	<type>Floating Point</type>
	<CPUID>FP16C</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	m := j*16
	dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vcvtph2ps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='__m128i' name='_mm_cvtps_ph'>
	<!-- VCVTPS2PH (FP16C): narrows the four FP32 values of "a" into four FP16 values occupying dst[63:0]; everything above bit 63 is zeroed because the loop only produces 64 bits of result. -->
	<type>Floating Point</type>
	<CPUID>FP16C</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='rounding' type='int'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst".
	[round_note]
	</description>
	<operation>
FOR j := 0 to 3
	i := 16*j
	l := 32*j
	dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ENDFOR
dst[MAX:64] := 0
	</operation>
	<instruction name='vcvtps2ph' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='10' tpt='1'/>
	<header>emmintrin.h</header>
</intrinsic>

<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_addsub_ps'>
	<!-- ADDSUBPS: even-indexed lanes (0 and 2) compute a - b; odd-indexed lanes (1 and 3) compute a + b. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF (j is even)
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='addsubps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_addsub_pd'>
	<!-- ADDSUBPD: lane 0 computes a - b; lane 1 computes a + b. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF (j is even)
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='addsubpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_hadd_pd'>
	<!-- HADDPD: the low result is the sum of both elements of "a"; the high result is the sum of both elements of "b". -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[63:0] := a[127:64] + a[63:0]
dst[127:64] := b[127:64] + b[63:0]
	</operation>
	<instruction name='haddpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Westmere' lat='5' tpt='2'/>
	<perfdata arch='Nehalem' lat='5' tpt='2'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_hadd_ps'>
	<!-- HADDPS: the two low results are the pairwise sums taken from "a"; the two high results are the pairwise sums taken from "b". -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
	</operation>
	<instruction name='haddps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Westmere' lat='5' tpt='2'/>
	<perfdata arch='Nehalem' lat='5' tpt='2'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_hsub_pd'>
	<!-- HSUBPD: each result is (lower element - upper element) of one source; "a" feeds the low result and "b" the high result. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[63:0] := a[63:0] - a[127:64]
dst[127:64] := b[63:0] - b[127:64]
	</operation>
	<instruction name='hsubpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Westmere' lat='5' tpt='2'/>
	<perfdata arch='Nehalem' lat='5' tpt='2'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_hsub_ps'>
	<!-- HSUBPS: each result is (even-indexed element - odd-indexed element) of an adjacent pair; "a" feeds the two low results and "b" the two high results. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
	</operation>
	<instruction name='hsubps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='2'/>
	<perfdata arch='Westmere' lat='5' tpt='2'/>
	<perfdata arch='Nehalem' lat='5' tpt='2'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128i' name='_mm_lddqu_si128'>
	<!-- LDDQU: 128-bit integer load from memory that is not required to be aligned. -->
	<type>Integer</type>
	<CPUID>SSE3</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m128i const*'/>
	<description>Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='lddqu' form='xmm, m128'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='void' name='_mm_monitor'>
	<!-- MONITOR: arms address monitoring for the address in "p". This entry has no type tag and no operation pseudocode; its CPUID requirement is MONITOR. -->
	<CPUID>MONITOR</CPUID>
	<category>General Support</category>
	<parameter varname='p' type='void const*'/>
	<parameter varname='extensions' type='unsigned'/>
	<parameter varname='hints' type='unsigned'/>
	<description>Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints".</description>
	<instruction name='monitor' form=''/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_movedup_pd'>
	<!-- MOVDDUP (register form): broadcasts the low double of "a" into both halves of dst. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128d'/>
	<description>Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst".
	</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
	</operation>
	<instruction name='movddup' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128d' name='_mm_loaddup_pd'>
	<!-- MOVDDUP (memory form): loads one double from mem_addr and broadcasts it into both halves of dst. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const*'/>
	<description>Load a double-precision (64-bit) floating-point element from memory into both elements of "dst".
	</description>
	<operation>
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := MEM[mem_addr+63:mem_addr]
	</operation>
	<instruction name='movddup' form='xmm, m64'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_movehdup_ps'>
	<!-- MOVSHDUP: element 1 of "a" fills dst lanes 0 and 1; element 3 of "a" fills dst lanes 2 and 3. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128'/>
	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
	</description>
	<operation>
dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
	</operation>
	<instruction name='movshdup' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE3' vexEq='TRUE' rettype='__m128' name='_mm_moveldup_ps'>
	<!-- MOVSLDUP: element 0 of "a" fills dst lanes 0 and 1; element 2 of "a" fills dst lanes 2 and 3. -->
	<type>Floating Point</type>
	<CPUID>SSE3</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m128'/>
	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
	</description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
	</operation>
	<instruction name='movsldup' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>pmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='void' name='_mm_mwait'>
	<!-- MWAIT: waits on the address range previously set up by MONITOR. This entry has no type tag and no operation pseudocode; its CPUID requirement is MONITOR. -->
	<CPUID>MONITOR</CPUID>
	<category>General Support</category>
	<parameter varname='extensions' type='unsigned'/>
	<parameter varname='hints' type='unsigned'/>
	<description>Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR.</description>
	<instruction name='mwait' form=''/>
	<header>pmmintrin.h</header>
</intrinsic>

<intrinsic tech='SSSE3' rettype='__m64' name='_mm_abs_pi8'>
	<!-- PABSB on MMX registers: absolute value applied independently to each of the 8 bytes of a 64-bit operand. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64'/>
	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
	</operation>
	<instruction name='pabsb' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_abs_epi8'>
	<!-- PABSB on XMM registers: absolute value applied independently to each of the 16 bytes of a 128-bit operand. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<description>Compute the absolute value of packed 8-bit integers in "a", and store the unsigned results in "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
	</operation>
	<instruction name='pabsb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_abs_pi16'>
	<!-- PABSW on MMX registers: absolute value applied independently to each of the 4 words of a 64-bit operand. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64'/>
	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
	</operation>
	<instruction name='pabsw' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_abs_epi16'>
	<!-- PABSW on XMM registers: absolute value applied independently to each of the 8 words of a 128-bit operand. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<description>Compute the absolute value of packed 16-bit integers in "a", and store the unsigned results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
	</operation>
	<instruction name='pabsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_abs_pi32'>
	<!-- PABSD on MMX registers: absolute value applied independently to each of the 2 doublewords of a 64-bit operand. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m64'/>
	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*32
	dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='pabsd' form='mm, mm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_abs_epi32'>
	<!-- PABSD on XMM registers: absolute value applied independently to each of the 4 doublewords of a 128-bit operand. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<description>Compute the absolute value of packed 32-bit integers in "a", and store the unsigned results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='pabsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_shuffle_epi8'>
	<!-- PSHUFB on XMM registers: for each of the 16 bytes, bit 7 of the control byte in "b" zeroes the result; otherwise its low 4 bits index a byte of "a". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF b[i+7] == 1
		dst[i+7:i] := 0
	ELSE
		index[3:0] := b[i+3:i]
		dst[i+7:i] := a[index*8+7:index*8]
	FI
ENDFOR
	</operation>
	<instruction name='pshufb' form='xmm, xmm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_shuffle_pi8'>
	<!-- PSHUFB on MMX registers: for each of the 8 bytes, bit 7 of the control byte in "b" zeroes the result; otherwise its low 3 bits index a byte of "a". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF b[i+7] == 1
		dst[i+7:i] := 0
	ELSE
		index[2:0] := b[i+2:i]
		dst[i+7:i] := a[index*8+7:index*8]
	FI
ENDFOR
	</operation>
	<instruction name='pshufb' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_alignr_epi8'>
	<!-- PALIGNR on XMM registers: dst is the low 128 bits of the 256-bit value (a in the upper half, b in the lower half) shifted right by count bytes. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='count' type='int'/>
	<description>Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in "dst". </description>
	<operation>
tmp[255:0] := ((a[127:0] &lt;&lt; 128) OR b[127:0]) &gt;&gt; (count[7:0]*8)
dst[127:0] := tmp[127:0]
	</operation>
	<instruction name='palignr' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_alignr_pi8'>
	<!-- PALIGNR on MMX registers: dst is the low 64 bits (8 bytes) of the 128-bit value (a in the upper half, b in the lower half) shifted right by count bytes. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<parameter varname='count' type='int'/>
	<description>Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "count" bytes, and store the low 8 bytes in "dst". </description>
	<operation>
tmp[127:0] := ((a[63:0] &lt;&lt; 64) OR b[63:0]) &gt;&gt; (count[7:0]*8)
dst[63:0] := tmp[63:0]
	</operation>
	<instruction name='palignr' form='mm, mm, imm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hadd_epi16'>
	<!-- PHADDW on XMM registers: the four low results are the pairwise sums taken from "a"; the four high results are the pairwise sums taken from "b". No saturation is applied. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := a[95:80] + a[79:64]
dst[63:48] := a[127:112] + a[111:96]
dst[79:64] := b[31:16] + b[15:0]
dst[95:80] := b[63:48] + b[47:32]
dst[111:96] := b[95:80] + b[79:64]
dst[127:112] := b[127:112] + b[111:96]
	</operation>
	<instruction name='phaddw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1.5'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Westmere' lat='3' tpt='1.5'/>
	<perfdata arch='Nehalem' lat='3' tpt='1.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hadds_epi16'>
	<!-- PHADDSW on XMM registers: pairwise sums saturated to signed 16 bits; the four low results come from "a" and the four high results from "b". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] := Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] := Saturate_To_Int16(a[95:80] + a[79:64])
dst[63:48] := Saturate_To_Int16(a[127:112] + a[111:96])
dst[79:64] := Saturate_To_Int16(b[31:16] + b[15:0])
dst[95:80] := Saturate_To_Int16(b[63:48] + b[47:32])
dst[111:96] := Saturate_To_Int16(b[95:80] + b[79:64])
dst[127:112] := Saturate_To_Int16(b[127:112] + b[111:96])
	</operation>
	<instruction name='phaddsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1.5'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Westmere' lat='3' tpt='1.5'/>
	<perfdata arch='Nehalem' lat='3' tpt='1.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hadd_epi32'>
	<!-- PHADDD on XMM registers: the two low results are the pairwise sums taken from "a"; the two high results are the pairwise sums taken from "b". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
	<operation>
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
	</operation>
	<instruction name='phaddd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1.5'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Westmere' lat='3' tpt='1.5'/>
	<perfdata arch='Nehalem' lat='3' tpt='1.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hadd_pi16'>
	<!-- PHADDW on MMX registers: the two low results are the pairwise sums taken from "a"; the two high results are the pairwise sums taken from "b". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := b[31:16] + b[15:0]
dst[63:48] := b[63:48] + b[47:32]
	</operation>
	<instruction name='phaddw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hadd_pi32'>
	<!-- PHADDD on MMX registers (the doubleword horizontal add): the low result is the sum of both 32-bit elements of "a"; the high result is the sum of both 32-bit elements of "b". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
	<operation>
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := b[63:32] + b[31:0]
	</operation>
	<instruction name='phaddd' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hadds_pi16'>
	<!-- PHADDSW on MMX registers: pairwise sums saturated to signed 16 bits; the two low results come from "a" and the two high results from "b". -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] := Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] := Saturate_To_Int16(b[31:16] + b[15:0])
dst[63:48] := Saturate_To_Int16(b[63:48] + b[47:32])
	</operation>
	<instruction name='phaddsw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hsub_epi16'>
	<!-- PHSUBW on XMM registers: each result is (even-indexed element - odd-indexed element) of an adjacent pair; "a" feeds the four low results and "b" the four high results. No saturation is applied. -->
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := a[79:64] - a[95:80]
dst[63:48] := a[111:96] - a[127:112]
dst[79:64] := b[15:0] - b[31:16]
dst[95:80] := b[47:32] - b[63:48]
dst[111:96] := b[79:64] - b[95:80]
dst[127:112] := b[111:96] - b[127:112]
	</operation>
	<instruction name='phsubw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1.5'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Westmere' lat='3' tpt='1.5'/>
	<perfdata arch='Nehalem' lat='3' tpt='1.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hsubs_epi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] := Saturate_To_Int16(a[79:64] - a[95:80])
dst[63:48] := Saturate_To_Int16(a[111:96] - a[127:112])
dst[79:64] := Saturate_To_Int16(b[15:0] - b[31:16])
dst[95:80] := Saturate_To_Int16(b[47:32] - b[63:48])
dst[111:96] := Saturate_To_Int16(b[79:64] - b[95:80])
dst[127:112] := Saturate_To_Int16(b[111:96] - b[127:112])
	</operation>
	<instruction name='phsubsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1.5'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Westmere' lat='3' tpt='1.5'/>
	<perfdata arch='Nehalem' lat='3' tpt='1.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_hsub_epi32'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
	<operation>
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
	</operation>
	<instruction name='phsubd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3' tpt='1.5'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1.5'/>
	<perfdata arch='Westmere' lat='3' tpt='1.5'/>
	<perfdata arch='Nehalem' lat='3' tpt='1.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hsub_pi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := b[15:0] - b[31:16]
dst[63:48] := b[47:32] - b[63:48]
	</operation>
	<instruction name='phsubw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hsub_pi32'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
	<operation>
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := b[31:0] - b[63:32]
	</operation>
	<instruction name='phsubd' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_hsubs_pi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] := Saturate_To_Int16(b[15:0] - b[31:16])
dst[63:48] := Saturate_To_Int16(b[47:32] - b[63:48])
	</operation>
	<instruction name='phsubsw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_maddubs_epi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='pmaddubsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_maddubs_pi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
	</operation>
	<instruction name='pmaddubsw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_mulhrs_epi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*16
	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
	dst[i+15:i] := tmp[16:1]
ENDFOR
	</operation>
	<instruction name='pmulhrsw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_mulhrs_pi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*16
	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
	dst[i+15:i] := tmp[16:1]
ENDFOR
	</operation>
	<instruction name='pmulhrsw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_sign_epi8'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF b[i+7:i] &lt; 0
		dst[i+7:i] := NEG(a[i+7:i])
	ELSE IF b[i+7:i] = 0
		dst[i+7:i] := 0
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='psignb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_sign_epi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF b[i+15:i] &lt; 0
		dst[i+15:i] := NEG(a[i+15:i])
	ELSE IF b[i+15:i] = 0
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='psignw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' vexEq='TRUE' rettype='__m128i' name='_mm_sign_epi32'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF b[i+31:i] &lt; 0
		dst[i+31:i] := NEG(a[i+31:i])
	ELSE IF b[i+31:i] = 0
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='psignd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_sign_pi8'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	IF b[i+7:i] &lt; 0
		dst[i+7:i] := NEG(a[i+7:i])
	ELSE IF b[i+7:i] = 0
		dst[i+7:i] := 0
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='psignb' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_sign_pi16'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 3
	i := j*16
	IF b[i+15:i] &lt; 0
		dst[i+15:i] := NEG(a[i+15:i])
	ELSE IF b[i+15:i] = 0
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='psignw' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSSE3' rettype='__m64' name='_mm_sign_pi32'>
	<type>Integer</type>
	<CPUID>SSSE3</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m64'/>
	<parameter varname='b' type='__m64'/>
	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	IF b[i+31:i] &lt; 0
		dst[i+31:i] := NEG(a[i+31:i])
	ELSE IF b[i+31:i] = 0
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='psignd' form='mm, mm'/>
	<header>tmmintrin.h</header>
</intrinsic>

<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_blend_pd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF imm8[j%8]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='blendpd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_blend_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF imm8[j%8]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='blendps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_blendv_pd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='mask' type='__m128d'/>
	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='blendvpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='2'/>
	<perfdata arch='Nehalem' lat='2' tpt='2'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_blendv_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='mask' type='__m128'/>
	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='blendvps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='2'/>
	<perfdata arch='Nehalem' lat='2' tpt='2'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_blendv_epi8'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF mask[i+7]
		dst[i+7:i] := b[i+7:i]
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pblendvb' form='xmm, xmm'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_blend_epi16'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF imm8[j%8]
		dst[i+15:i] := b[i+15:i]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pblendw' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_dp_pd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the two products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
	<operation>
DP(a[127:0], b[127:0], imm8[7:0]) {
	FOR j := 0 to 1
		i := j*64
		IF imm8[(4+j)%8]
			temp[i+63:i] := a[i+63:i] * b[i+63:i]
		ELSE
			temp[i+63:i] := 0
		FI
	ENDFOR

	sum[63:0] := temp[127:64] + temp[63:0]

	FOR j := 0 to 1
		i := j*64
		IF imm8[j%8]
			tmpdst[i+63:i] := sum[63:0]
		ELSE
			tmpdst[i+63:i] := 0
		FI
	ENDFOR
	RETURN tmpdst[127:0]
}

dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
	</operation>
	<instruction name='dppd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='9'/>
	<perfdata arch='Ivy Bridge' lat='9'/>
	<perfdata arch='Sandy Bridge' lat='9'/>
	<perfdata arch='Westmere' lat='9'/>
	<perfdata arch='Nehalem' lat='9'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_dp_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname="imm8" type='const int'/>
	<description>Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
	<operation>
DP(a[127:0], b[127:0], imm8[7:0]) {
	FOR j := 0 to 3
		i := j*32
		IF imm8[(4+j)%8]
			temp[i+31:i] := a[i+31:i] * b[i+31:i]
		ELSE
			temp[i+31:i] := 0
		FI
	ENDFOR

	sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])

	FOR j := 0 to 3
		i := j*32
		IF imm8[j%8]
			tmpdst[i+31:i] := sum[31:0]
		ELSE
			tmpdst[i+31:i] := 0
		FI
	ENDFOR
	RETURN tmpdst[127:0]
}

dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
	</operation>
	<instruction name='dpps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='14'/>
	<perfdata arch='Ivy Bridge' lat='12'/>
	<perfdata arch='Sandy Bridge' lat='12'/>
	<perfdata arch='Westmere' lat='11'/>
	<perfdata arch='Nehalem' lat='11'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
dst[31:0] := (a[127:0] &gt;&gt; (imm8[1:0] * 32))[31:0]
	</operation>
	<instruction name='extractps' form='r32, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='2'/>
	<perfdata arch='Nehalem' lat='2'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_epi8'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".</description>
	<operation>
dst[7:0] := (a[127:0] &gt;&gt; (imm8[3:0] * 8))[7:0]
dst[31:8] := 0
	</operation>
	<instruction name='pextrb' form='r32, xmm, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_extract_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
dst[31:0] := (a[127:0] &gt;&gt; (imm8[1:0] * 32))[31:0]
	</operation>
	<instruction name='pextrd' form='r32, xmm, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__int64' name='_mm_extract_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[127:0] &gt;&gt; (imm8[0] * 64))[63:0]
	</operation>
	<instruction name='pextrq' form='r64, xmm, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_insert_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname="imm8" type='const int'/>
	<description>Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). </description>
	<operation>
tmp2[127:0] := a[127:0]
CASE (imm8[7:6]) of
0: tmp1[31:0] := b[31:0]
1: tmp1[31:0] := b[63:32]
2: tmp1[31:0] := b[95:64]
3: tmp1[31:0] := b[127:96]
ESAC
CASE (imm8[5:4]) of
0: tmp2[31:0] := tmp1[31:0]
1: tmp2[63:32] := tmp1[31:0]
2: tmp2[95:64] := tmp1[31:0]
3: tmp2[127:96] := tmp1[31:0]
ESAC
FOR j := 0 to 3
	i := j*32
	IF imm8[j%8]
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := tmp2[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='insertps' form='xmm, xmm, imm'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi8'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='i' type='int'/>
	<parameter varname="imm8" type='const int'/>
	<description>Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". </description>
	<operation>
dst[127:0] := a[127:0]
sel := imm8[3:0]*8
dst[sel+7:sel] := i[7:0]
	</operation>
	<instruction name='pinsrb' form='xmm, r32, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='i' type='int'/>
	<parameter varname="imm8" type='const int'/>
	<description>Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". </description>
	<operation>
dst[127:0] := a[127:0]
sel := imm8[1:0]*32
dst[sel+31:sel] := i[31:0]
	</operation>
	<instruction name='pinsrd' form='xmm, r32, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_insert_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='i' type='__int64'/>
	<parameter varname="imm8" type='const int'/>
	<description>Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". </description>
	<operation>
dst[127:0] := a[127:0]
sel := imm8[0]*64
dst[sel+63:sel] := i[63:0]
	</operation>
	<instruction name='pinsrq' form='xmm, r64, imm'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epi8'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF a[i+7:i] &gt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxsb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF a[i+31:i] &gt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epu32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF a[i+31:i] &gt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxud' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_max_epu16'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF a[i+15:i] &gt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pmaxuw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epi8'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF a[i+7:i] &lt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminsb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF a[i+31:i] &lt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminsd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_min_epu32 (pminud): for each of the four 32-bit lanes, dst receives the lane of "a" when it compares strictly below the lane of "b", otherwise the lane of "b" (equal lanes take "b"). The description states the lanes are unsigned; the pseudo-code comparison itself does not spell that out. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epu32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF a[i+31:i] &lt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminud' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_min_epu16 (pminuw): for each of the eight 16-bit lanes, dst receives the lane of "a" when it compares strictly below the lane of "b", otherwise the lane of "b" (equal lanes take "b"). The description states the lanes are unsigned; the pseudo-code comparison itself does not spell that out. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_min_epu16'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	IF a[i+15:i] &lt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
	</operation>
	<instruction name='pminuw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_packus_epi32 (packusdw): narrows eight 32-bit inputs to 16 bits through Saturate_Int32_To_UnsignedInt16. The four lanes of "a" fill dst[63:0] and the four lanes of "b" fill dst[127:64]. The entry is listed under two categories (Convert and Miscellaneous). -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_packus_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
	<operation>
dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
	</operation>
	<instruction name='packusdw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cmpeq_epi64 (pcmpeqq): for each of the two 64-bit lanes, dst is all ones (0xFFFFFFFFFFFFFFFF) when the lanes of "a" and "b" are equal, and 0 otherwise. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cmpeq_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpeqq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='0.5'/>
	<perfdata arch='Nehalem' lat='1' tpt='0.5'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepi8_epi16 (pmovsxbw): sign-extends the eight bytes a[63:0] (source offset i = j*8) into the eight 16-bit lanes of dst (destination offset l = j*16). The upper half of "a" is not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi8_epi16'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	l := j*16
	dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
	</operation>
	<instruction name='pmovsxbw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepi8_epi32 (pmovsxbd): sign-extends the four bytes a[31:0] (source offset k = 8*j) into the four 32-bit lanes of dst (destination offset i = 32*j). Bits of "a" above bit 31 are not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi8_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
	</operation>
	<instruction name='pmovsxbd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepi8_epi64 (pmovsxbq): sign-extends the two bytes a[15:0] (source offset k = 8*j) into the two 64-bit lanes of dst (destination offset i = 64*j). Only the low 2 bytes of "a" are read, which is what the description states. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi8_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
	</operation>
	<instruction name='pmovsxbq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepi16_epi32 (pmovsxwd): sign-extends the four 16-bit values a[63:0] (source offset k = 16*j) into the four 32-bit lanes of dst (destination offset i = 32*j). The upper half of "a" is not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi16_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
	</operation>
	<instruction name='pmovsxwd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepi16_epi64 (pmovsxwq): sign-extends the two 16-bit values a[31:0] (source offset k = 16*j) into the two 64-bit lanes of dst (destination offset i = 64*j). Bits of "a" above bit 31 are not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi16_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
	</operation>
	<instruction name='pmovsxwq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepi32_epi64 (pmovsxdq): sign-extends the two 32-bit values a[63:0] (source offset k = 32*j) into the two 64-bit lanes of dst (destination offset i = 64*j). The upper half of "a" is not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepi32_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
	</operation>
	<instruction name='pmovsxdq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepu8_epi16 (pmovzxbw): zero-extends the eight bytes a[63:0] (source offset i = j*8) into the eight 16-bit lanes of dst (destination offset l = j*16). NOTE(review): this entry carries no perfdata row for Haswell; only Ivy Bridge, Sandy Bridge, Westmere and Nehalem are listed. Confirm whether that omission is intentional. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu8_epi16'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*8
	l := j*16
	dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
	</operation>
	<instruction name='pmovzxbw' form='xmm, xmm'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepu8_epi32 (pmovzxbd): zero-extends the four bytes a[31:0] (source offset k = 8*j) into the four 32-bit lanes of dst (destination offset i = 32*j). Bits of "a" above bit 31 are not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu8_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 8*j
	dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
	</operation>
	<instruction name='pmovzxbd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepu8_epi64 (pmovzxbq): zero-extends the two bytes a[15:0] (source offset k = 8*j) into the two 64-bit lanes of dst (destination offset i = 64*j). Only the low 2 bytes of "a" are read, which is what the description states. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu8_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 8*j
	dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
	</operation>
	<instruction name='pmovzxbq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepu16_epi32 (pmovzxwd): zero-extends the four 16-bit values a[63:0] (source offset k = 16*j) into the four 32-bit lanes of dst (destination offset i = 32*j). The upper half of "a" is not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu16_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 16*j
	dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
	</operation>
	<instruction name='pmovzxwd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepu16_epi64 (pmovzxwq): zero-extends the two 16-bit values a[31:0] (source offset k = 16*j) into the two 64-bit lanes of dst (destination offset i = 64*j). Bits of "a" above bit 31 are not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu16_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 16*j
	dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
	</operation>
	<instruction name='pmovzxwq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_cvtepu32_epi64 (pmovzxdq): zero-extends the two 32-bit values a[63:0] (source offset k = 32*j) into the two 64-bit lanes of dst (destination offset i = 64*j). The upper half of "a" is not read. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_cvtepu32_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	k := 32*j
	dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
	</operation>
	<instruction name='pmovzxdq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Westmere' lat='1' tpt='1'/>
	<perfdata arch='Nehalem' lat='1' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_mul_epi32 (pmuldq): for each of the two 64-bit lanes, the low 32 bits of "a" and "b" are treated as signed values, multiplied, and the full 64-bit signed product is stored in the lane. The upper 32 bits of each input lane are ignored. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_mul_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SignExtend(a[i+31:i]) * SignExtend(b[i+31:i])
ENDFOR
	</operation>
	<instruction name='pmuldq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_mullo_epi32 (pmulld): each of the four 32-bit lanes of "a" and "b" is multiplied into a 64-bit intermediate (tmp), and only tmp[31:0] is written back to the matching lane of dst. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_mullo_epi32'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	tmp[63:0] := a[i+31:i] * b[i+31:i]
	dst[i+31:i] := tmp[31:0]
ENDFOR
	</operation>
	<instruction name='pmulld' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='10' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<perfdata arch='Nehalem' lat='6' tpt='2'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_testz_si128 (ptest): the pseudo-code derives ZF from (a AND b) and CF from ((NOT a) AND b), but only ZF is returned, so the result is 1 exactly when a AND b is all zero. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_testz_si128'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
	<operation>
IF (a[127:0] AND b[127:0] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF ((NOT a[127:0]) AND b[127:0] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN ZF
	</operation>
	<instruction name='ptest' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_testc_si128 (ptest): the pseudo-code derives ZF from (a AND b) and CF from ((NOT a) AND b), but only CF is returned, so the result is 1 exactly when every bit set in "b" is also set in "a". -->
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_testc_si128'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
	<operation>
IF (a[127:0] AND b[127:0] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF ((NOT a[127:0]) AND b[127:0] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='ptest' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_testnzc_si128 (ptest): derives ZF from (a AND b) and CF from ((NOT a) AND b), then returns 1 only when both flags are 0, i.e. "b" has at least one bit that is set in "a" and at least one bit that is clear in "a". -->
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_testnzc_si128'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
IF (a[127:0] AND b[127:0] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF ((NOT a[127:0]) AND b[127:0] == 0)
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='ptest' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_test_all_zeros (ptest): returns ZF, which is 1 exactly when (a AND mask) is all zero. Unlike the _mm_testz_si128 entry, the second operand is named "mask" and the pseudo-code computes only ZF. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_test_all_zeros'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0.</description>
	<operation>
IF (a[127:0] AND mask[127:0] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
RETURN ZF
	</operation>
	<instruction name='ptest' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_test_mix_ones_zeros (ptest): derives ZF from (a AND mask) and CF from ((NOT a) AND mask), then returns 1 only when both flags are 0, i.e. the bits of "a" selected by "mask" contain at least one 1 and at least one 0. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_test_mix_ones_zeros'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
IF (a[127:0] AND mask[127:0] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF ((NOT a[127:0]) AND mask[127:0] == 0)
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='ptest' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Westmere' lat='2' tpt='1'/>
	<perfdata arch='Nehalem' lat='2' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_test_all_ones: builds a 128-bit all-ones vector (tmp), then returns CF, which is 1 exactly when ((NOT a) AND tmp) is zero, i.e. when every bit of "a" is set. Two instruction entries are listed (pcmpeqd, then ptest) and the element carries sequence='true'; presumably this marks a multi-instruction sequence. TODO confirm against the schema consumer. No perfdata rows are recorded. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' sequence='true' rettype='int' name='_mm_test_all_ones'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128i'/>
	<description>Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0.</description>
	<operation>
FOR j := 0 to 127
	tmp[j] := 1
ENDFOR

IF ((NOT a[127:0]) AND tmp[127:0] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='pcmpeqd' form='xmm, xmm'/>
	<instruction name='ptest' form='xmm, xmm'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_round_pd (roundpd): applies ROUND() to both 64-bit lanes of "a". The description says the "rounding" parameter selects the mode, but the pseudo-code does not show it being passed to ROUND(). The [round_note] token in the description looks like a placeholder expanded by the consumer; TODO confirm. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_round_pd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='rounding' type='int'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst".
	[round_note]
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
	</operation>
	<instruction name='roundpd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_floor_pd (roundpd): applies FLOOR() to both 64-bit lanes of "a". The instruction form carries an immediate, but the intrinsic takes no rounding argument and the immediate's value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_floor_pd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
	</operation>
	<instruction name='roundpd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_ceil_pd (roundpd): applies CEIL() to both 64-bit lanes of "a". The instruction form carries an immediate, but the intrinsic takes no rounding argument and the immediate's value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_ceil_pd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
	</operation>
	<instruction name='roundpd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_round_ps (roundps): applies ROUND() to all four 32-bit lanes of "a". The description says the "rounding" parameter selects the mode, but the pseudo-code does not show it being passed to ROUND(). The [round_note] token in the description looks like a placeholder expanded by the consumer; TODO confirm. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_round_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='rounding' type='int'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
	[round_note]
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='roundps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_floor_ps (roundps): applies FLOOR() to all four 32-bit lanes of "a". The instruction form carries an immediate, but the intrinsic takes no rounding argument and the immediate's value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_floor_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='roundps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_ceil_ps (roundps): applies CEIL() to all four 32-bit lanes of "a". The instruction form carries an immediate, but the intrinsic takes no rounding argument and the immediate's value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_ceil_ps'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
	</operation>
	<instruction name='roundps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_round_sd (roundsd): dst[63:0] is ROUND() of the low lane of "b"; dst[127:64] is copied unchanged from "a". The description says the "rounding" parameter selects the mode, but the pseudo-code does not show it being passed to ROUND(). The [round_note] token looks like a placeholder expanded by the consumer; TODO confirm. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_round_sd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='rounding' type='int'/>
	<description>Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
	[round_note]
	</description>
	<operation>
dst[63:0] := ROUND(b[63:0])
dst[127:64] := a[127:64]
	</operation>
	<instruction name='roundsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_floor_sd (roundsd): dst[63:0] is FLOOR() of the low lane of "b"; dst[127:64] is copied unchanged from "a". The instruction form carries an immediate whose value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_floor_sd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := FLOOR(b[63:0])
dst[127:64] := a[127:64]
	</operation>
	<instruction name='roundsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_ceil_sd (roundsd): dst[63:0] is CEIL() of the low lane of "b"; dst[127:64] is copied unchanged from "a". The instruction form carries an immediate whose value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128d' name='_mm_ceil_sd'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := CEIL(b[63:0])
dst[127:64] := a[127:64]
	</operation>
	<instruction name='roundsd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_round_ss (roundss): dst[31:0] is ROUND() of the low lane of "b"; dst[127:32] (the upper three lanes) is copied unchanged from "a". The description says the "rounding" parameter selects the mode, but the pseudo-code does not show it being passed to ROUND(). The [round_note] token looks like a placeholder expanded by the consumer; TODO confirm. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_round_ss'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='rounding' type='int'/>
	<description>Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".
	[round_note]
	</description>
	<operation>
dst[31:0] := ROUND(b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='roundss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_floor_ss (roundss): dst[31:0] is FLOOR() of the low lane of "b"; dst[127:32] (the upper three lanes) is copied unchanged from "a". The instruction form carries an immediate whose value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_floor_ss'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := FLOOR(b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='roundss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_ceil_ss (roundss): dst[31:0] is CEIL() of the low lane of "b"; dst[127:32] (the upper three lanes) is copied unchanged from "a". The instruction form carries an immediate whose value is not recorded in this entry. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128' name='_mm_ceil_ss'>
	<type>Floating Point</type>
	<CPUID>SSE4.1</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := CEIL(b[31:0])
dst[127:32] := a[127:32]
	</operation>
	<instruction name='roundss' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_minpos_epu16 (phminposuw): horizontal minimum over the eight 16-bit lanes of "a". The scan starts from lane 0 and replaces the running minimum only on a strict less-than, so when several lanes tie the lowest lane index is kept. The result packs the minimum into dst[15:0], the 3-bit lane index into dst[18:16], and zeroes dst[127:19]. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_minpos_epu16'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<description>Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst".</description>
	<operation>
index[2:0] := 0
min[15:0] := a[15:0]
FOR j := 0 to 7
	i := j*16
	IF a[i+15:i] &lt; min[15:0]
		index[2:0] := j
		min[15:0] := a[i+15:i]
	FI
ENDFOR
dst[15:0] := min[15:0]
dst[18:16] := index[2:0]
dst[127:19] := 0
	</operation>
	<instruction name='phminposuw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_mpsadbw_epu8 (mpsadbw): produces eight 16-bit sums of absolute differences. imm8[1:0]*32 selects one fixed 4-byte group from "b"; imm8[2]*32 selects the starting bit offset in "a", from which eight overlapping 4-byte windows are taken, each one byte further along than the last (k = a_offset + j*8). Result lane j is written at tmp[j*16+15:j*16], expressed in the pseudo-code as i*2 with i = j*8. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_mpsadbw_epu8'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Arithmetic</category>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
	Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8".</description>
	<operation>
MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
	a_offset := imm8[2]*32
	b_offset := imm8[1:0]*32
	FOR j := 0 to 7
		i := j*8
		k := a_offset+i
		l := b_offset
		tmp[i*2+15:i*2] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
	ENDFOR
	RETURN tmp[127:0]
}

dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
	</operation>
	<instruction name='mpsadbw' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='7' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='5' tpt='1'/>
	<perfdata arch='Nehalem' lat='5' tpt='1'/>
	<header>smmintrin.h</header>
</intrinsic>
<!-- _mm_stream_load_si128 (movntdqa): loads 128 bits starting at "mem_addr" into dst; the pseudo-code range MEM[mem_addr+127:mem_addr] is expressed in bits. The description requires 16-byte alignment and describes the load as using a non-temporal hint. This is the only form with a memory operand (xmm, m128) in this group, and no perfdata rows are recorded. -->
<intrinsic tech='SSE4.1' vexEq='TRUE' rettype='__m128i' name='_mm_stream_load_si128'>
	<type>Integer</type>
	<CPUID>SSE4.1</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m128i*'/>
	<description>Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint.
	"mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[127:0] := MEM[mem_addr+127:mem_addr]
	</operation>
	<instruction name='movntdqa' form='xmm, m128'/>
	<header>smmintrin.h</header>
</intrinsic>

<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__m128i' name='_mm_cmpistrm'>
	<!-- Implicit-length (zero-terminated) string compare that returns the result mask. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst".
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the EOS scan of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF a[m+size-1:m] == 0
			aInvalid := 1
		FI
		IF b[n+size-1:n] == 0
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			n := i*size // position of character i in "b"
			IF b[n+size-1:n] == 0
				bInvalid := 1
			FI
			IF bInvalid // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
IF imm8[6] // byte / word mask
	FOR i := 0 to UpperBound
		j := i*size
		IF IntRes2[i]
			dst[j+size-1:j] := (imm8[0] ? 0xFFFF : 0xFF) // all ones across the size-bit character
		ELSE
			dst[j+size-1:j] := 0
		FI
	ENDFOR
ELSE // bit mask
	dst[UpperBound:0] := IntRes2[UpperBound:0]
	dst[127:UpperBound+1] := 0
FI
	</operation>
	<instruction name='pcmpistrm' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='8' tpt='2'/>
	<perfdata arch='Nehalem' lat='8' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistri'>
	<!-- Implicit-length (zero-terminated) string compare that returns an index into the result mask. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst".
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the EOS scan of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF a[m+size-1:m] == 0
			aInvalid := 1
		FI
		IF b[n+size-1:n] == 0
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			n := i*size // position of character i in "b"
			IF b[n+size-1:n] == 0
				bInvalid := 1
			FI
			IF bInvalid // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output: index of a set bit in IntRes2, or the number of characters if no bit is set
IF imm8[6] // most significant bit
	tmp := UpperBound
	dst := tmp
	DO WHILE ((tmp &gt;= 0) AND IntRes2[tmp] == 0)
		tmp := tmp - 1
		dst := tmp
	OD
	IF tmp &lt; 0 // no bit set
		dst := UpperBound + 1
	FI
ELSE // least significant bit
	tmp := 0
	dst := tmp
	DO WHILE ((tmp &lt;= UpperBound) AND IntRes2[tmp] == 0)
		tmp := tmp + 1
		dst := tmp
	OD
FI
	</operation>
	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistrz'>
	<!-- Reports whether "b" contains a zero character: the pseudocode scans every character of "b" -->
	<!-- (width selected by imm8[0]) and yields 1 as soon as one is zero. "a" is not read by the pseudocode. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

bInvalid := 0
FOR j := 0 to UpperBound
	n := j*size
	IF b[n+size-1:n] == 0
		bInvalid := 1
	FI
ENDFOR

dst := bInvalid
	</operation>
	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistrc'>
	<!-- Implicit-length (zero-terminated) string compare that reports whether the result mask is non-zero. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the EOS scan of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF a[m+size-1:m] == 0
			aInvalid := 1
		FI
		IF b[n+size-1:n] == 0
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			n := i*size // position of character i in "b"
			IF b[n+size-1:n] == 0
				bInvalid := 1
			FI
			IF bInvalid // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
dst := (IntRes2 != 0)
	</operation>
	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistrs'>
	<!-- Reports whether "a" contains a zero character: the pseudocode scans every character of "a" -->
	<!-- (width selected by imm8[0]) and yields 1 if any is zero. "b" is not read by the pseudocode. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	IF a[m+size-1:m] == 0 // the result is about "a", so "a" is the string to scan
		aInvalid := 1
	FI
ENDFOR

dst := aInvalid
	</operation>
	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistro'>
	<!-- Implicit-length (zero-terminated) string compare that returns bit 0 of the result mask. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns bit 0 of the resulting bit mask.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the EOS scan of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF a[m+size-1:m] == 0
			aInvalid := 1
		FI
		IF b[n+size-1:n] == 0
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			n := i*size // position of character i in "b"
			IF b[n+size-1:n] == 0
				bInvalid := 1
			FI
			IF bInvalid // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
dst := IntRes2[0]
	</operation>
	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpistra'>
	<!-- Implicit-length (zero-terminated) string compare that returns 1 only when the result mask is zero -->
	<!-- and "b" holds no zero character. Stages: compare, aggregate, optionally negate, scan "b", output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the EOS scan of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF a[m+size-1:m] == 0
			aInvalid := 1
		FI
		IF b[n+size-1:n] == 0
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			n := i*size // position of character i in "b"
			IF b[n+size-1:n] == 0
				bInvalid := 1
			FI
			IF bInvalid // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// check whether "b" contains a null character, regardless of the negation control
bInvalid := 0
FOR j := 0 to UpperBound
	n := j*size
	IF b[n+size-1:n] == 0
		bInvalid := 1
	FI
ENDFOR

// output
dst := (IntRes2 == 0) AND !bInvalid
	</operation>
	<instruction name='pcmpistri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='__m128i' name='_mm_cmpestrm'>
	<!-- Explicit-length string compare ("la"/"lb" give the string lengths) that returns the result mask. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst".
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the end-of-string check of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS (lengths are taken as absolute values)
		IF i == ABS(la)
			aInvalid := 1
		FI
		IF j == ABS(lb)
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			IF i &gt;= ABS(lb) // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
IF imm8[6] // byte / word mask
	FOR i := 0 to UpperBound
		j := i*size
		IF IntRes2[i]
			dst[j+size-1:j] := (imm8[0] ? 0xFFFF : 0xFF) // all ones across the size-bit character
		ELSE
			dst[j+size-1:j] := 0
		FI
	ENDFOR
ELSE // bit mask
	dst[UpperBound:0] := IntRes2[UpperBound:0]
	dst[127:UpperBound+1] := 0
FI
	</operation>
	<instruction name='pcmpestrm' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='8' tpt='2'/>
	<perfdata arch='Nehalem' lat='8' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestri'>
	<!-- Explicit-length string compare ("la"/"lb" give the string lengths) that returns an index into the result mask. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst".
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the end-of-string check of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS (lengths are taken as absolute values)
		IF i == ABS(la)
			aInvalid := 1
		FI
		IF j == ABS(lb)
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
		0:  // equal any
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		1:  // ranges
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			FI
		2:  // equal each
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		3:  // equal ordered
			IF (!aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 0
			ELSE IF (aInvalid &amp;&amp; !bInvalid)
				BoolRes[i][j] := 1
			ELSE IF (aInvalid &amp;&amp; bInvalid)
				BoolRes[i][j] := 1
			FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
0:  // equal any
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound
			IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
		ENDFOR
	ENDFOR
1:  // ranges
	IntRes1 := 0
	FOR i := 0 to UpperBound
		FOR j := 0 to UpperBound, j += 2
			IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
		ENDFOR
	ENDFOR
2:  // equal each
	IntRes1 := 0
	FOR i := 0 to UpperBound
		IntRes1[i] := BoolRes[i][i]
	ENDFOR
3:  // equal ordered
	IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
	FOR i := 0 to UpperBound
		k := i
		FOR j := 0 to UpperBound-i
			IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
			k++
		ENDFOR
	ENDFOR
ESAC

// optionally negate results
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			IF i &gt;= ABS(lb) // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output: index of a set bit in IntRes2, or the number of characters if no bit is set
IF imm8[6] // most significant bit
	tmp := UpperBound
	dst := tmp
	DO WHILE ((tmp &gt;= 0) AND IntRes2[tmp] == 0)
		tmp := tmp - 1
		dst := tmp
	OD
	IF tmp &lt; 0 // no bit set
		dst := UpperBound + 1
	FI
ELSE // least significant bit
	tmp := 0
	dst := tmp
	DO WHILE ((tmp &lt;= UpperBound) AND IntRes2[tmp] == 0)
		tmp := tmp + 1
		dst := tmp
	OD
FI
	</operation>
	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestrz'>
	<!-- Explicit-length variant: the result depends only on "lb" and the character width chosen by imm8[0]. -->
	<!-- It is 1 when the length of "b" is smaller than the number of characters that fit in 128 bits. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// the length is taken as an absolute value, so a negative "lb" is treated like its magnitude
dst := (ABS(lb) &lt;= UpperBound)
	</operation>
	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestrc'>
	<!-- Explicit-length string compare ("la"/"lb" give the string lengths) that reports whether the result mask is non-zero. -->
	<!-- The pseudocode runs in four stages: compare, aggregate, optionally negate, output. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	bInvalid := 0 // restart the end-of-string check of "b" for each character of "a"
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS (lengths are taken as absolute values)
		IF i == ABS(la)
			aInvalid := 1
		FI
		IF j == ABS(lb)
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
			0:  // equal any
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				FI
			1:  // ranges
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				FI
			2:  // equal each
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 1
				FI
			3:  // equal ordered
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 1
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 1
				FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
	0:  // equal any
		IntRes1 := 0
		FOR i := 0 to UpperBound
			FOR j := 0 to UpperBound
				IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
			ENDFOR
		ENDFOR
	1:  // ranges
		IntRes1 := 0
		FOR i := 0 to UpperBound
			FOR j := 0 to UpperBound, j += 2
				IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
			ENDFOR
		ENDFOR
	2:  // equal each
		IntRes1 := 0
		FOR i := 0 to UpperBound
			IntRes1[i] := BoolRes[i][i]
		ENDFOR
	3:  // equal ordered
		IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) // one bit per character: 8 or 16 bits
		FOR i := 0 to UpperBound
			k := i
			FOR j := 0 to UpperBound-i
				IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
				k++
			ENDFOR
		ENDFOR
ESAC

// optionally negate results
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			IF i &gt;= ABS(lb) // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
dst := (IntRes2 != 0)
	</operation>
	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestrs'>
	<!-- Explicit-length variant: the result depends only on "la" and the character width chosen by imm8[0]. -->
	<!-- It is 1 when the length of "a" is smaller than the number of characters that fit in 128 bits. -->
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname='imm8' type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// the length is taken as an absolute value, so a negative "la" is treated like its magnitude
dst := (ABS(la) &lt;= UpperBound)
	</operation>
	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestro'>
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns bit 0 of the resulting bit mask.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF i == la
			aInvalid := 1
		FI
		IF j == lb
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
			0:  // equal any
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				FI
			1:  // ranges
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				FI
			2:  // equal each
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 1
				FI
			3:  // equal ordered
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 1
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 1
				FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
	0:  // equal any
		IntRes1 := 0
		FOR i := 0 to UpperBound
			FOR j := 0 to UpperBound
				IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
			ENDFOR
		ENDFOR
	1:  // ranges
		IntRes1 := 0
		FOR i := 0 to UpperBound
			FOR j := 0 to UpperBound, j += 2
				IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
			ENDFOR
		ENDFOR
	2:  // equal each
		IntRes1 := 0
		FOR i := 0 to UpperBound
			IntRes1[i] := BoolRes[i][i]
		ENDFOR
	3:  // equal ordered
		IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
		FOR i := 0 to UpperBound
			k := i
			FOR j := 0 to UpperBound-i
				IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
				k++
			ENDFOR
		ENDFOR
ESAC

// optionally negate results
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			IF i &gt;= lb // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
dst := IntRes2[0]
	</operation>
	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' dontShowZeroUnmodMsg='TRUE' rettype='int' name='_mm_cmpestra'>
	<CPUID>SSE4.2</CPUID>
	<category>String Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='la' type='int'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='lb' type='int'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise.
	[strcmp_note]
	</description>
	<operation>
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1

// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
	m := i*size
	FOR j := 0 to UpperBound
		n := j*size
		BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])

		// invalidate characters after EOS
		IF i == la
			aInvalid := 1
		FI
		IF j == lb
			bInvalid := 1
		FI

		// override comparisons for invalid characters
		CASE (imm8[3:2]) OF
			0:  // equal any
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				FI
			1:  // ranges
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				FI
			2:  // equal each
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 1
				FI
			3:  // equal ordered
				IF (!aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 0
				ELSE IF (aInvalid &amp;&amp; !bInvalid)
					BoolRes[i][j] := 1
				ELSE IF (aInvalid &amp;&amp; bInvalid)
					BoolRes[i][j] := 1
				FI
		ESAC
	ENDFOR
ENDFOR

// aggregate results
CASE (imm8[3:2]) OF
	0:  // equal any
		IntRes1 := 0
		FOR i := 0 to UpperBound
			FOR j := 0 to UpperBound
				IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
			ENDFOR
		ENDFOR
	1:  // ranges
		IntRes1 := 0
		FOR i := 0 to UpperBound
			FOR j := 0 to UpperBound, j += 2
				IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
			ENDFOR
		ENDFOR
	2:  // equal each
		IntRes1 := 0
		FOR i := 0 to UpperBound
			IntRes1[i] := BoolRes[i][i]
		ENDFOR
	3:  // equal ordered
		IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
		FOR i := 0 to UpperBound
			k := i
			FOR j := 0 to UpperBound-i
				IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
				k++
			ENDFOR
		ENDFOR
ESAC

// optionally negate results
FOR i := 0 to UpperBound
	IF imm8[4]
		IF imm8[5] // only negate valid
			IF i &gt;= lb // invalid, don't negate
				IntRes2[i] := IntRes1[i]
			ELSE // valid, negate
				IntRes2[i] := -1 XOR IntRes1[i]
			FI
		ELSE // negate all
			IntRes2[i] := -1 XOR IntRes1[i]
		FI
	ELSE // don't negate
		IntRes2[i] := IntRes1[i]
	FI
ENDFOR

// output
dst := (IntRes2 == 0) AND (lb &gt; UpperBound)
	</operation>
	<instruction name='pcmpestri' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='11' tpt='3'/>
	<perfdata arch='Ivy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Sandy Bridge' lat='11' tpt='3'/>
	<perfdata arch='Westmere' lat='7' tpt='2'/>
	<perfdata arch='Nehalem' lat='7' tpt='2'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' vexEq='TRUE' rettype='__m128i' name='_mm_cmpgt_epi64'>
	<type>Integer</type>
	<CPUID>SSE4.2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Compare packed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ( a[i+63:i] &gt; b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
	</operation>
	<instruction name='pcmpgtq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' rettype='unsigned int' name='_mm_crc32_u8'>
	<type>Integer</type>
	<CPUID>SSE4.2</CPUID>
	<category>Cryptography</category>
	<parameter varname='crc' type='unsigned int'/>
	<parameter varname='v' type='unsigned char'/>
	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst".</description>
	<operation>
tmp1[7:0] := v[0:7] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[39:0] := tmp1[7:0] &lt;&lt; 32
tmp4[39:0] := tmp2[31:0] &lt;&lt; 8
tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0]
tmp6[31:0] := tmp5[39:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
	</operation>
	<instruction name='crc32' form='r32, r8'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' rettype='unsigned int' name='_mm_crc32_u16'>
	<type>Integer</type>
	<CPUID>SSE4.2</CPUID>
	<category>Cryptography</category>
	<parameter varname='crc' type='unsigned int'/>
	<parameter varname='v' type='unsigned short'/>
	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst".</description>
	<operation>
tmp1[15:0] := v[0:15] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[47:0] := tmp1[15:0] &lt;&lt; 32
tmp4[47:0] := tmp2[31:0] &lt;&lt; 16
tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0]
tmp6[31:0] := tmp5[47:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
	</operation>
	<instruction name='crc32' form='r32, r16'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' rettype='unsigned int' name='_mm_crc32_u32'>
	<type>Integer</type>
	<CPUID>SSE4.2</CPUID>
	<category>Cryptography</category>
	<parameter varname='crc' type='unsigned int'/>
	<parameter varname='v' type='unsigned int'/>
	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst".</description>
	<operation>
tmp1[31:0] := v[0:31] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[63:0] := tmp1[31:0] &lt;&lt; 32
tmp4[63:0] := tmp2[31:0] &lt;&lt; 32
tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0]
tmp6[31:0] := tmp5[63:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
	</operation>
	<instruction name='crc32' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech='SSE4.2' rettype='unsigned __int64' name='_mm_crc32_u64'>
	<type>Integer</type>
	<CPUID>SSE4.2</CPUID>
	<category>Cryptography</category>
	<parameter varname='crc' type='unsigned __int64'/>
	<parameter varname='v' type='unsigned __int64'/>
	<description>Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst".</description>
	<operation>
tmp1[63:0] := v[0:63] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[95:0] := tmp1[63:0] &lt;&lt; 32
tmp4[95:0] := tmp2[31:0] &lt;&lt; 64
tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0]
tmp6[31:0] := tmp5[95:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
	</operation>
	<instruction name='crc32' form='r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech="Other" rettype='int' name='_mm_popcnt_u32'>
	<type>Integer</type>
	<CPUID>POPCNT</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<description>
		Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst".
	</description>
	<operation>
dst := 0
FOR i := 0 to 31
	IF a[i]
		dst := dst + 1
	FI
ENDFOR
	</operation>
	<instruction name='popcnt' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>nmmintrin.h</header>
</intrinsic>
<intrinsic tech="Other" rettype='__int64' name='_mm_popcnt_u64'>
	<type>Integer</type>
	<CPUID>POPCNT</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>
		Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst".
	</description>
	<operation>
dst := 0
FOR i := 0 to 63
	IF a[i]
		dst := dst + 1
	FI
ENDFOR
	</operation>
	<instruction name='popcnt' form='r64, r64'/>
	<perfdata arch='Westmere' lat='3' tpt='1'/>
	<perfdata arch='Nehalem' lat='3' tpt='1'/>
	<header>nmmintrin.h</header>
</intrinsic>

<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesenc_si128'>
	<type>Integer</type>
	<CPUID>AES</CPUID>
	<category>Cryptography</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='RoundKey' type='__m128i'/>
	<description>Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
	<operation>state := a
a[127:0] := ShiftRows(a[127:0])
a[127:0] := SubBytes(a[127:0])
a[127:0] := MixColumns(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
	</operation>
	<instruction name='aesenc' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='7' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<header>wmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesenclast_si128'>
	<type>Integer</type>
	<CPUID>AES</CPUID>
	<category>Cryptography</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='RoundKey' type='__m128i'/>
	<description>Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
	<operation>state := a
a[127:0] := ShiftRows(a[127:0])
a[127:0] := SubBytes(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
	</operation>
	<instruction name='aesenclast' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='7' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<header>wmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesdec_si128'>
	<type>Integer</type>
	<CPUID>AES</CPUID>
	<category>Cryptography</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='RoundKey' type='__m128i'/>
	<description>Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
	<operation>state := a
a[127:0] := InvShiftRows(a[127:0])
a[127:0] := InvSubBytes(a[127:0])
a[127:0] := InvMixColumns(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
	</operation>
	<instruction name='aesdec' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='7' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<header>wmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesdeclast_si128'>
	<type>Integer</type>
	<CPUID>AES</CPUID>
	<category>Cryptography</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='RoundKey' type='__m128i'/>
	<description>Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst".</description>
	<operation>state := a
a[127:0] := InvShiftRows(a[127:0])
a[127:0] := InvSubBytes(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
	</operation>
	<instruction name='aesdeclast' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='7' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<header>wmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aesimc_si128'>
	<type>Integer</type>
	<CPUID>AES</CPUID>
	<category>Cryptography</category>
	<parameter varname='a' type='__m128i'/>
	<description>Perform the InvMixColumns transformation on "a" and store the result in "dst".</description>
	<operation>
dst[127:0] := InvMixColumns(a[127:0])
	</operation>
	<instruction name='aesimc' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='14' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='12' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='12' tpt='2'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<header>wmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_aeskeygenassist_si128'>
	<type>Integer</type>
	<CPUID>AES</CPUID>
	<category>Cryptography</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst".
	</description>
	<operation>
X3[31:0] := a[127:96]
X2[31:0] := a[95:64]
X1[31:0] := a[63:32]
X0[31:0] := a[31:0]
RCON[31:0] := ZeroExtend(imm8[7:0]);
dst[31:0] := SubWord(X1)
dst[63:32] := RotWord(SubWord(X1)) XOR RCON;
dst[95:64] := SubWord(X3)
dst[127:96] := RotWord(SubWord(X3)) XOR RCON;
	</operation>
	<instruction name='aeskeygenassist' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='10' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='10' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='10' tpt='2'/>
	<perfdata arch='Westmere' lat='6' tpt='2'/>
	<header>wmmintrin.h</header>
</intrinsic>
<intrinsic tech='Other' vexEq='TRUE' rettype='__m128i' name='_mm_clmulepi64_si128'>
	<type>Integer</type>
	<CPUID>PCLMULQDQ</CPUID>
	<category>Application-Targeted</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst".
	</description>
	<operation>
IF (imm8[0] = 0)
	TEMP1 := a[63:0];
ELSE
	TEMP1 := a[127:64];
FI
IF (imm8[4] = 0)
	TEMP2 := b[63:0];
ELSE
	TEMP2 := b[127:64];
FI

FOR i := 0 to 63
	TEMP[i] := (TEMP1[0] AND TEMP2[i]);
	FOR j := 1 to i
		TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
	ENDFOR
	dst[i] := TEMP[i];
ENDFOR
FOR i := 64 to 127
	TEMP [i] := 0;
	FOR j := (i - 63) to 63
		TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
	ENDFOR
	dst[i] := TEMP[i];
ENDFOR
dst[127] := 0
	</operation>
	<instruction name='pclmulqdq' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='7' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='14' tpt='8'/>
	<perfdata arch='Sandy Bridge' lat='14' tpt='8'/>
	<perfdata arch='Westmere' lat='14' tpt='8'/>
	<header>wmmintrin.h</header>
</intrinsic>

<intrinsic tech='AVX' rettype='__m256d' name='_mm256_add_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vaddpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_add_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vaddps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_addsub_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF (j is even)
		dst[i+63:i] := a[i+63:i] - b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i] + b[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vaddsubpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_addsub_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF (j is even)
		dst[i+31:i] := a[i+31:i] - b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i] + b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vaddsubps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_and_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
		<description>Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vandpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_and_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vandps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_andnot_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vandnpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_andnot_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vandnps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_blend_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF imm8[j%8]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vblendpd' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_blend_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF imm8[j%8]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vblendps' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='0.5'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_blendv_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='mask' type='__m256d'/>
	<description>Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := b[i+63:i]
	ELSE
		dst[i+63:i] := a[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vblendvpd' form='ymm, ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_blendv_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='mask' type='__m256'/>
	<description>Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vblendvps' form='ymm, ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_div_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 64*j
	dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vdivpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='35' tpt='25'/>
	<perfdata arch='Ivy Bridge' lat='35' tpt='28'/>
	<perfdata arch='Sandy Bridge' lat='43' tpt='44'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_div_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vdivps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='21' tpt='13'/>
	<perfdata arch='Ivy Bridge' lat='21' tpt='14'/>
	<perfdata arch='Sandy Bridge' lat='29' tpt='28'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_dp_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname="imm8" type='const int'/>
	<description>Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8".</description>
	<operation>
DP(a[127:0], b[127:0], imm8[7:0]) {
	FOR j := 0 to 3
		i := j*32
		IF imm8[(4+j)%8]
			temp[i+31:i] := a[i+31:i] * b[i+31:i]
		ELSE
			temp[i+31:i] := 0
		FI
	ENDFOR

	sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])

	FOR j := 0 to 3
		i := j*32
		IF imm8[j%8]
			tmpdst[i+31:i] := sum[31:0]
		ELSE
			tmpdst[i+31:i] := 0
		FI
	ENDFOR
	RETURN tmpdst[127:0]
}

dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0])
dst[MAX:256] := 0
	</operation>
	<instruction name='vdpps' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='14' tpt='2'/>
	<perfdata arch='Ivy Bridge' lat='12' tpt='2'/>
	<perfdata arch='Sandy Bridge' lat='12' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_hadd_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[63:0] := a[127:64] + a[63:0]
dst[127:64] := b[127:64] + b[63:0]
dst[191:128] := a[255:192] + a[191:128]
dst[255:192] := b[255:192] + b[191:128]
dst[MAX:256] := 0
	</operation>
	<instruction name='vhaddpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='5'/>
	<perfdata arch='Sandy Bridge' lat='5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_hadd_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
dst[159:128] := a[191:160] + a[159:128]
dst[191:160] := a[255:224] + a[223:192]
dst[223:192] := b[191:160] + b[159:128]
dst[255:224] := b[255:224] + b[223:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vhaddps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='5'/>
	<perfdata arch='Sandy Bridge' lat='5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_hsub_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[63:0] := a[63:0] - a[127:64]
dst[127:64] := b[63:0] - b[127:64]
dst[191:128] := a[191:128] - a[255:192]
dst[255:192] := b[191:128] - b[255:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vhsubpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='5'/>
	<perfdata arch='Sandy Bridge' lat='5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_hsub_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst".</description>
	<operation>
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
dst[159:128] := a[159:128] - a[191:160]
dst[191:160] := a[223:192] - a[255:224]
dst[223:192] := b[159:128] - b[191:160]
dst[255:224] := b[223:192] - b[255:224]
dst[MAX:256] := 0
	</operation>
	<instruction name='vhsubps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5'/>
	<perfdata arch='Ivy Bridge' lat='5'/>
	<perfdata arch='Sandy Bridge' lat='5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_max_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vmaxpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_max_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vmaxps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_min_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vminpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_min_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vminps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_mul_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vmulpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_mul_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vmulps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<perfdata arch='Ivy Bridge' lat='5' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_or_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vorpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_or_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vorps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_shuffle_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vshufpd' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_shuffle_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(b[255:128], imm8[5:4])
dst[255:224] := SELECT4(b[255:128], imm8[7:6])
dst[MAX:256] := 0
	</operation>
	<instruction name='vshufps' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_sub_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vsubpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_sub_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vsubps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_xor_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vxorpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_xor_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vxorps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm_cmp_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
	<operation>
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vcmppd' form='xmm, xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_cmp_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
	<operation>
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcmppd' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm_cmp_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
	<operation>
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vcmpps' form='xmm, xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_cmp_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst".</description>
	<operation>
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcmpps' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm_cmp_sd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vcmpsd' form='xmm, xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm_cmp_ss'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC

dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='vcmpss' form='xmm, xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_cvtepi32_pd'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	m := j*64
	dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcvtdq2pd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_cvtepi32_ps'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m256i'/>
	<description>Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcvtdq2ps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm256_cvtpd_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m256d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vcvtpd2ps' form='xmm, ymm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_cvtps_epi32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m256'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcvtps2dq' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_cvtps_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcvtps2pd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='2' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='2' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128i' name='_mm256_cvttpd_epi32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m256d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vcvttpd2dq' form='xmm, ymm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128i' name='_mm256_cvtpd_epi32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m256d'/>
	<description>Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	k := 64*j
	dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vcvtpd2dq' form='xmm, ymm'/>
	<perfdata arch='Haswell' lat='4' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='4' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='4' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_cvttps_epi32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m256'/>
	<description>Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vcvttps2dq' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm256_extractf128_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
CASE imm8[0] OF
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
	</operation>
	<instruction name='vextractf128' form='xmm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm256_extractf128_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
CASE imm8[0] OF
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
	</operation>
	<instruction name='vextractf128' form='xmm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128i' name='_mm256_extractf128_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
CASE imm8[0] OF
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
	</operation>
	<instruction name='vextractf128' form='xmm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__int8' name='_mm256_extract_epi8'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="index" type='const int'/>
	<description>Extract an 8-bit integer from "a", selected with "index", and store the result in "dst".</description>
	<operation>
dst[7:0] := (a[255:0] &gt;&gt; (index * 8))[7:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__int16' name='_mm256_extract_epi16'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="index" type='const int'/>
	<description>Extract a 16-bit integer from "a", selected with "index", and store the result in "dst".</description>
	<operation>
dst[15:0] := (a[255:0] &gt;&gt; (index * 16))[15:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__int32' name='_mm256_extract_epi32'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="index" type='const int'/>
	<description>Extract a 32-bit integer from "a", selected with "index", and store the result in "dst".</description>
	<operation>
dst[31:0] := (a[255:0] &gt;&gt; (index * 32))[31:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__int64' name='_mm256_extract_epi64'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="index" type='const int'/>
	<description>Extract a 64-bit integer from "a", selected with "index", and store the result in "dst".</description>
	<operation>
dst[63:0] := (a[255:0] &gt;&gt; (index * 64))[63:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_zeroall'>
	<CPUID>AVX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Zero the contents of all XMM or YMM registers.</description>
	<operation>
YMM0[MAX:0] := 0
YMM1[MAX:0] := 0
YMM2[MAX:0] := 0
YMM3[MAX:0] := 0
YMM4[MAX:0] := 0
YMM5[MAX:0] := 0
YMM6[MAX:0] := 0
YMM7[MAX:0] := 0
IF 64-bit mode
	YMM8[MAX:0] := 0
	YMM9[MAX:0] := 0
	YMM10[MAX:0] := 0
	YMM11[MAX:0] := 0
	YMM12[MAX:0] := 0
	YMM13[MAX:0] := 0
	YMM14[MAX:0] := 0
	YMM15[MAX:0] := 0
FI
</operation>
	<instruction name='vzeroall' form=''/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_zeroupper'>
	<CPUID>AVX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Zero the upper 128 bits of all YMM registers; the lower 128 bits of the registers are unmodified.</description>
	<operation>
YMM0[MAX:128] := 0
YMM1[MAX:128] := 0
YMM2[MAX:128] := 0
YMM3[MAX:128] := 0
YMM4[MAX:128] := 0
YMM5[MAX:128] := 0
YMM6[MAX:128] := 0
YMM7[MAX:128] := 0
IF 64-bit mode
	YMM8[MAX:128] := 0
	YMM9[MAX:128] := 0
	YMM10[MAX:128] := 0
	YMM11[MAX:128] := 0
	YMM12[MAX:128] := 0
	YMM13[MAX:128] := 0
	YMM14[MAX:128] := 0
	YMM15[MAX:128] := 0
FI
</operation>
	<instruction name='vzeroupper' form=''/>
	<perfdata arch='Haswell' lat='0' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='0' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='0' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_permutevar_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256i'/>
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[159:128] := SELECT4(a[255:128], b[129:128])
dst[191:160] := SELECT4(a[255:128], b[161:160])
dst[223:192] := SELECT4(a[255:128], b[193:192])
dst[255:224] := SELECT4(a[255:128], b[225:224])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermilps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm_permutevar_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128i'/>
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[MAX:128] := 0
	</operation>
	<instruction name='vpermilps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_permute_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermilps' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm_permute_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[MAX:128] := 0
	</operation>
	<instruction name='vpermilps' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_permutevar_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256i'/>
	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst".</description>
	<operation>
IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
IF (b[129] == 0) dst[191:128] := a[191:128]
IF (b[129] == 1) dst[191:128] := a[255:192]
IF (b[193] == 0) dst[255:192] := a[191:128]
IF (b[193] == 1) dst[255:192] := a[255:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermilpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm_permutevar_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128i'/>
	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst".</description>
	<operation>
IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vpermilpd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_permute_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
IF (imm8[2] == 0) dst[191:128] := a[191:128]
IF (imm8[2] == 1) dst[191:128] := a[255:192]
IF (imm8[3] == 0) dst[255:192] := a[191:128]
IF (imm8[3] == 1) dst[255:192] := a[255:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermilpd' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm_permute_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst".</description>
	<operation>
IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vpermilpd' form='xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_permute2f128_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
	<operation>
SELECT4(src1, src2, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src1[127:0]
	1:	tmp[127:0] := src1[255:128]
	2:	tmp[127:0] := src2[127:0]
	3:	tmp[127:0] := src2[255:128]
	ESAC
	IF control[3]
		tmp[127:0] := 0
	FI
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
	</operation>
	<instruction name='vperm2f128' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_permute2f128_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
	<operation>
SELECT4(src1, src2, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src1[127:0]
	1:	tmp[127:0] := src1[255:128]
	2:	tmp[127:0] := src2[127:0]
	3:	tmp[127:0] := src2[255:128]
	ESAC
	IF control[3]
		tmp[127:0] := 0
	FI
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
	</operation>
	<instruction name='vperm2f128' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_permute2f128_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
	<operation>
SELECT4(src1, src2, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src1[127:0]
	1:	tmp[127:0] := src1[255:128]
	2:	tmp[127:0] := src2[127:0]
	3:	tmp[127:0] := src2[255:128]
	ESAC
	IF control[3]
		tmp[127:0] := 0
	FI
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
	</operation>
	<instruction name='vperm2f128' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_broadcast_ss'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const *'/>
	<description>Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst".</description>
	<operation>
tmp[31:0] := MEM[mem_addr+31:mem_addr]
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcastss' form='ymm, m32'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm_broadcast_ss'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<category>Swizzle</category>
	<parameter varname='mem_addr' type='float const *'/>
	<description>Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst".</description>
	<operation>
tmp[31:0] := MEM[mem_addr+31:mem_addr]
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vbroadcastss' form='xmm, m32'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_broadcast_sd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<category>Swizzle</category>
	<parameter varname='mem_addr' type='double const *'/>
	<description>Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst".</description>
	<operation>
tmp[63:0] := MEM[mem_addr+63:mem_addr]
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcastsd' form='ymm, m64'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_broadcast_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<category>Swizzle</category>
	<parameter varname='mem_addr' type='__m128 const *'/>
	<description>Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst".</description>
	<operation>
tmp[127:0] := MEM[mem_addr+127:mem_addr]
dst[127:0] := tmp[127:0]
dst[255:128] := tmp[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcastf128' form='ymm, m128'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_broadcast_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<category>Swizzle</category>
	<parameter varname='mem_addr' type='__m128d const *'/>
	<description>Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst".</description>
	<operation>
tmp[127:0] := MEM[mem_addr+127:mem_addr]
dst[127:0] := tmp[127:0]
dst[255:128] := tmp[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcastf128' form='ymm, m128'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_insertf128_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname="imm8" type='int'/>
	<description>Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
	<operation>
dst[255:0] := a[255:0]
CASE imm8[0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_insertf128_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname="imm8" type='int'/>
	<description>Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8".</description>
	<operation>
dst[255:0] := a[255:0]
CASE imm8[0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_insertf128_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname="imm8" type='int'/>
	<description>Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8".</description>
	<operation>
dst[255:0] := a[255:0]
CASE imm8[0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi8'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='i' type='__int8'/>
	<parameter varname="index" type='const int'/>
	<description>Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". </description>
	<operation>
dst[255:0] := a[255:0]
sel := index*8
dst[sel+7:sel] := i[7:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi16'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='i' type='__int16'/>
	<parameter varname="index" type='const int'/>
	<description>Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". </description>
	<operation>
dst[255:0] := a[255:0]
sel := index*16
dst[sel+15:sel] := i[15:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi32'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='i' type='__int32'/>
	<parameter varname="index" type='const int'/>
	<description>Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". </description>
	<operation>
dst[255:0] := a[255:0]
sel := index*32
dst[sel+31:sel] := i[31:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_insert_epi64'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='i' type='__int64'/>
	<parameter varname="index" type='const int'/>
	<description>Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". </description>
	<operation>
dst[255:0] := a[255:0]
sel := index*64
dst[sel+63:sel] := i[63:0]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_load_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const *'/>
	<description>Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst".
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovapd' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_store_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double *'/>
	<parameter varname='a' type='__m256d'/>
	<description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovapd' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_load_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const *'/>
	<description>Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst".
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovaps' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_store_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float *'/>
	<parameter varname='a' type='__m256'/>
	<description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovaps' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_loadu_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const *'/>
	<description>Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst".
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovupd' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_storeu_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double *'/>
	<parameter varname='a' type='__m256d'/>
	<description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory.
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovupd' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_loadu_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const *'/>
	<description>Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst".
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovups' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_storeu_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float *'/>
	<parameter varname='a' type='__m256'/>
	<description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory.
	"mem_addr" does not need to be aligned on any particular boundary.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovups' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_load_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m256i const *'/>
	<description>Load 256-bits of integer data from memory into "dst".
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovdqa' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_store_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m256i *'/>
	<parameter varname='a' type='__m256i'/>
	<description>Store 256-bits of integer data from "a" into memory.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovdqa' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_loadu_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m256i const *'/>
	<description>Load 256-bits of integer data from memory into "dst".
	"mem_addr" does not need to be aligned on any particular boundary.
	</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovdqu' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_storeu_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m256i *'/>
	<parameter varname='a' type='__m256i'/>
	<description>Store 256-bits of integer data from "a" into memory.
	"mem_addr" does not need to be aligned on any particular boundary.
	</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovdqu' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_maskload_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const *'/>
	<parameter varname='mask' type='__m256i'/>
	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vmaskmovpd' form='ymm, ymm, m256'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_maskstore_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double *'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='a' type='__m256d'/>
	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask" (elements are not stored when the high bit of the corresponding mask element is not set).</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='vmaskmovpd' form='m256, ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm_maskload_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='double const *'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vmaskmovpd' form='xmm, xmm, m128'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm_maskstore_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double *'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='a' type='__m128d'/>
	<description>Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask" (elements are not stored when the high bit of the corresponding mask element is not set).</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='vmaskmovpd' form='m128, xmm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_maskload_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const *'/>
	<parameter varname='mask' type='__m256i'/>
	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vmaskmovps' form='ymm, ymm, m256'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_maskstore_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float *'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='a' type='__m256'/>
	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask" (elements are not stored when the high bit of the corresponding element is not set).</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='vmaskmovps' form='m256, ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm_maskload_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='float const *'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set).</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vmaskmovps' form='xmm, xmm, m128'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm_maskstore_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float *'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='a' type='__m128'/>
	<description>Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask" (elements are not stored when the high bit of the corresponding element is not set).</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='vmaskmovps' form='m128, xmm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_movehdup_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m256'/>
	<description>Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
	</description>
	<operation>
dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
dst[159:128] := a[191:160]
dst[191:160] := a[191:160]
dst[223:192] := a[255:224]
dst[255:224] := a[255:224]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovshdup' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_moveldup_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m256'/>
	<description>Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst".
	</description>
	<operation>
dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
dst[159:128] := a[159:128]
dst[191:160] := a[159:128]
dst[223:192] := a[223:192]
dst[255:224] := a[223:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovsldup' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_movedup_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Move</category>
	<parameter varname='a' type='__m256d'/>
	<description>Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst".
	</description>
	<operation>
dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
dst[191:128] := a[191:128]
dst[255:192] := a[191:128]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovddup' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_lddqu_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m256i const *'/>
	<description>Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vlddqu' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_stream_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__m256i *'/>
	<parameter varname='a' type='__m256i'/>
	<description>Store 256-bits of integer data from "a" into memory using a non-temporal memory hint.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovntdq' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_stream_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='double *'/>
	<parameter varname='a' type='__m256d'/>
	<description>Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovntpd' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='void' name='_mm256_stream_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='float *'/>
	<parameter varname='a' type='__m256'/>
	<description>Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
MEM[mem_addr+255:mem_addr] := a[255:0]
	</operation>
	<instruction name='vmovntps' form='m256, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_rcp_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<description>Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vrcpps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='7' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='7' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_rsqrt_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<description>Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12.</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vrsqrtps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='7' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='7' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='7' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_sqrt_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m256d'/>
	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vsqrtpd' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='35' tpt='28'/>
	<perfdata arch='Ivy Bridge' lat='35' tpt='28'/>
	<perfdata arch='Sandy Bridge' lat='43' tpt='44'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_sqrt_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vsqrtps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='21' tpt='14'/>
	<perfdata arch='Ivy Bridge' lat='21' tpt='14'/>
	<perfdata arch='Sandy Bridge' lat='29' tpt='28'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_round_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='rounding' type='int'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst".
	[round_note]
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i], rounding)
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vroundpd' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_round_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='rounding' type='int'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst".
	[round_note]
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ROUND(a[i+31:i], rounding)
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vroundps' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_unpackhi_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vunpckhpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_unpackhi_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vunpckhps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_unpacklo_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vunpcklpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_unpacklo_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". </description>
	<operation>
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vunpcklps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testz_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
	<operation>
IF ((a[255:0] AND b[255:0]) == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF (((NOT a[255:0]) AND b[255:0]) == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN ZF
	</operation>
	<instruction name='vptest' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testc_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
	<operation>
IF ((a[255:0] AND b[255:0]) == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF (((NOT a[255:0]) AND b[255:0]) == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='vptest' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testnzc_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
IF ((a[255:0] AND b[255:0]) == 0)
	ZF := 1
ELSE
	ZF := 0
FI
IF (((NOT a[255:0]) AND b[255:0]) == 0)
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='vptest' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testz_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
	<operation>
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[255:0] := (NOT a[255:0]) AND b[255:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN ZF
	</operation>
	<instruction name='vtestpd' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testc_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
	<operation>
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[255:0] := (NOT a[255:0]) AND b[255:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='vtestpd' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testnzc_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<description>Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[255:0] := (NOT a[255:0]) AND b[255:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[255] == 0)
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='vtestpd' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm_testz_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
	<operation>
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[127:0] := (NOT a[127:0]) AND b[127:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN ZF
	</operation>
	<instruction name='vtestpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm_testc_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
	<operation>
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[127:0] := (NOT a[127:0]) AND b[127:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='vtestpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm_testnzc_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[127:0] := (NOT a[127:0]) AND b[127:0]
IF (tmp[63] == 0 &amp;&amp; tmp[127] == 0)
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='vtestpd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testz_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
	<operation>
tmp[255:0] := a[255:0] AND b[255:0]
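IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[255:0] := (NOT a[255:0]) AND b[255:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)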
	CF := 1
ELSE
	CF := 0
FI
RETURN ZF
	</operation>
	<instruction name='vtestps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testc_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
	<operation>
tmp[255:0] := a[255:0] AND b[255:0]
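IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[255:0] := (NOT a[255:0]) AND b[255:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)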
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='vtestps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_testnzc_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<description>Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
tmp[255:0] := a[255:0] AND b[255:0]
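IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[255:0] := (NOT a[255:0]) AND b[255:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0 &amp;&amp; tmp[159] == 0 &amp;&amp; tmp[191] == 0 &amp;&amp; tmp[223] == 0 &amp;&amp; tmp[255] == 0)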
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='vtestps' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm_testz_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value.</description>
	<operation>
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[127:0] := (NOT a[127:0]) AND b[127:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN ZF
	</operation>
	<instruction name='vtestps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm_testc_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value.</description>
	<operation>
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[127:0] := (NOT a[127:0]) AND b[127:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
	CF := 1
ELSE
	CF := 0
FI
RETURN CF
	</operation>
	<instruction name='vtestps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm_testnzc_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0.</description>
	<operation>
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
	ZF := 1
ELSE
	ZF := 0
FI
tmp[127:0] := (NOT a[127:0]) AND b[127:0]
IF (tmp[31] == 0 &amp;&amp; tmp[63] == 0 &amp;&amp; tmp[95] == 0 &amp;&amp; tmp[127] == 0)
	CF := 1
ELSE
	CF := 0
FI
IF (ZF == 0 &amp;&amp; CF == 0)
	RETURN 1
ELSE
	RETURN 0
FI
	</operation>
	<instruction name='vtestps' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_movemask_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256d'/>
	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF a[i+63]
		dst[j] := 1
	ELSE
		dst[j] := 0
	FI
ENDFOR
dst[MAX:4] := 0
	</operation>
	<instruction name='vmovmskpd' form='r32, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='int' name='_mm256_movemask_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256'/>
	<description>Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF a[i+31]
		dst[j] := 1
	ELSE
		dst[j] := 0
	FI
ENDFOR
dst[MAX:8] := 0
	</operation>
	<instruction name='vmovmskps' form='r32, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_setzero_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m256d with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='vxorpd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_setzero_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m256 with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='vxorps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='1' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_setzero_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m256i with all elements set to zero.</description>
	<operation>
dst[MAX:0] := 0
	</operation>
	<instruction name='vpxor' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_set_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='double'/>
	<parameter varname='e2' type='double'/>
	<parameter varname='e1' type='double'/>
	<parameter varname='e0' type='double'/>
	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.</description>
	<operation>
dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_set_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='float'/>
	<parameter varname='e6' type='float'/>
	<parameter varname='e5' type='float'/>
	<parameter varname='e4' type='float'/>
	<parameter varname='e3' type='float'/>
	<parameter varname='e2' type='float'/>
	<parameter varname='e1' type='float'/>
	<parameter varname='e0' type='float'/>
	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values.</description>
	<operation>
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi8'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e31' type='char'/>
	<parameter varname='e30' type='char'/>
	<parameter varname='e29' type='char'/>
	<parameter varname='e28' type='char'/>
	<parameter varname='e27' type='char'/>
	<parameter varname='e26' type='char'/>
	<parameter varname='e25' type='char'/>
	<parameter varname='e24' type='char'/>
	<parameter varname='e23' type='char'/>
	<parameter varname='e22' type='char'/>
	<parameter varname='e21' type='char'/>
	<parameter varname='e20' type='char'/>
	<parameter varname='e19' type='char'/>
	<parameter varname='e18' type='char'/>
	<parameter varname='e17' type='char'/>
	<parameter varname='e16' type='char'/>
	<parameter varname='e15' type='char'/>
	<parameter varname='e14' type='char'/>
	<parameter varname='e13' type='char'/>
	<parameter varname='e12' type='char'/>
	<parameter varname='e11' type='char'/>
	<parameter varname='e10' type='char'/>
	<parameter varname='e9' type='char'/>
	<parameter varname='e8' type='char'/>
	<parameter varname='e7' type='char'/>
	<parameter varname='e6' type='char'/>
	<parameter varname='e5' type='char'/>
	<parameter varname='e4' type='char'/>
	<parameter varname='e3' type='char'/>
	<parameter varname='e2' type='char'/>
	<parameter varname='e1' type='char'/>
	<parameter varname='e0' type='char'/>
	<description>Set packed 8-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[7:0] := e0
dst[15:8] := e1
dst[23:16] := e2
dst[31:24] := e3
dst[39:32] := e4
dst[47:40] := e5
dst[55:48] := e6
dst[63:56] := e7
dst[71:64] := e8
dst[79:72] := e9
dst[87:80] := e10
dst[95:88] := e11
dst[103:96] := e12
dst[111:104] := e13
dst[119:112] := e14
dst[127:120] := e15
dst[135:128] := e16
dst[143:136] := e17
dst[151:144] := e18
dst[159:152] := e19
dst[167:160] := e20
dst[175:168] := e21
dst[183:176] := e22
dst[191:184] := e23
dst[199:192] := e24
dst[207:200] := e25
dst[215:208] := e26
dst[223:216] := e27
dst[231:224] := e28
dst[239:232] := e29
dst[247:240] := e30
dst[255:248] := e31
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi16'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e15' type='short'/>
	<parameter varname='e14' type='short'/>
	<parameter varname='e13' type='short'/>
	<parameter varname='e12' type='short'/>
	<parameter varname='e11' type='short'/>
	<parameter varname='e10' type='short'/>
	<parameter varname='e9' type='short'/>
	<parameter varname='e8' type='short'/>
	<parameter varname='e7' type='short'/>
	<parameter varname='e6' type='short'/>
	<parameter varname='e5' type='short'/>
	<parameter varname='e4' type='short'/>
	<parameter varname='e3' type='short'/>
	<parameter varname='e2' type='short'/>
	<parameter varname='e1' type='short'/>
	<parameter varname='e0' type='short'/>
	<description>Set packed 16-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[15:0] := e0
dst[31:16] := e1
dst[47:32] := e2
dst[63:48] := e3
dst[79:64] := e4
dst[95:80] := e5
dst[111:96] := e6
dst[127:112] := e7
dst[143:128] := e8
dst[159:144] := e9
dst[175:160] := e10
dst[191:176] := e11
dst[207:192] := e12
dst[223:208] := e13
dst[239:224] := e14
dst[255:240] := e15
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi32'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='int'/>
	<parameter varname='e6' type='int'/>
	<parameter varname='e5' type='int'/>
	<parameter varname='e4' type='int'/>
	<parameter varname='e3' type='int'/>
	<parameter varname='e2' type='int'/>
	<parameter varname='e1' type='int'/>
	<parameter varname='e0' type='int'/>
	<description>Set packed 32-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set_epi64x'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='__int64'/>
	<parameter varname='e2' type='__int64'/>
	<parameter varname='e1' type='__int64'/>
	<parameter varname='e0' type='__int64'/>
	<description>Set packed 64-bit integers in "dst" with the supplied values.</description>
	<operation>
dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_setr_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='double'/>
	<parameter varname='e2' type='double'/>
	<parameter varname='e1' type='double'/>
	<parameter varname='e0' type='double'/>
	<description>Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[63:0] := e3
dst[127:64] := e2
dst[191:128] := e1
dst[255:192] := e0
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_setr_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='float'/>
	<parameter varname='e6' type='float'/>
	<parameter varname='e5' type='float'/>
	<parameter varname='e4' type='float'/>
	<parameter varname='e3' type='float'/>
	<parameter varname='e2' type='float'/>
	<parameter varname='e1' type='float'/>
	<parameter varname='e0' type='float'/>
	<description>Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[31:0] := e7
dst[63:32] := e6
dst[95:64] := e5
dst[127:96] := e4
dst[159:128] := e3
dst[191:160] := e2
dst[223:192] := e1
dst[255:224] := e0
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi8'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e31' type='char'/>
	<parameter varname='e30' type='char'/>
	<parameter varname='e29' type='char'/>
	<parameter varname='e28' type='char'/>
	<parameter varname='e27' type='char'/>
	<parameter varname='e26' type='char'/>
	<parameter varname='e25' type='char'/>
	<parameter varname='e24' type='char'/>
	<parameter varname='e23' type='char'/>
	<parameter varname='e22' type='char'/>
	<parameter varname='e21' type='char'/>
	<parameter varname='e20' type='char'/>
	<parameter varname='e19' type='char'/>
	<parameter varname='e18' type='char'/>
	<parameter varname='e17' type='char'/>
	<parameter varname='e16' type='char'/>
	<parameter varname='e15' type='char'/>
	<parameter varname='e14' type='char'/>
	<parameter varname='e13' type='char'/>
	<parameter varname='e12' type='char'/>
	<parameter varname='e11' type='char'/>
	<parameter varname='e10' type='char'/>
	<parameter varname='e9' type='char'/>
	<parameter varname='e8' type='char'/>
	<parameter varname='e7' type='char'/>
	<parameter varname='e6' type='char'/>
	<parameter varname='e5' type='char'/>
	<parameter varname='e4' type='char'/>
	<parameter varname='e3' type='char'/>
	<parameter varname='e2' type='char'/>
	<parameter varname='e1' type='char'/>
	<parameter varname='e0' type='char'/>
	<description>Set packed 8-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[7:0] := e31
dst[15:8] := e30
dst[23:16] := e29
dst[31:24] := e28
dst[39:32] := e27
dst[47:40] := e26
dst[55:48] := e25
dst[63:56] := e24
dst[71:64] := e23
dst[79:72] := e22
dst[87:80] := e21
dst[95:88] := e20
dst[103:96] := e19
dst[111:104] := e18
dst[119:112] := e17
dst[127:120] := e16
dst[135:128] := e15
dst[143:136] := e14
dst[151:144] := e13
dst[159:152] := e12
dst[167:160] := e11
dst[175:168] := e10
dst[183:176] := e9
dst[191:184] := e8
dst[199:192] := e7
dst[207:200] := e6
dst[215:208] := e5
dst[223:216] := e4
dst[231:224] := e3
dst[239:232] := e2
dst[247:240] := e1
dst[255:248] := e0
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi16'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e15' type='short'/>
	<parameter varname='e14' type='short'/>
	<parameter varname='e13' type='short'/>
	<parameter varname='e12' type='short'/>
	<parameter varname='e11' type='short'/>
	<parameter varname='e10' type='short'/>
	<parameter varname='e9' type='short'/>
	<parameter varname='e8' type='short'/>
	<parameter varname='e7' type='short'/>
	<parameter varname='e6' type='short'/>
	<parameter varname='e5' type='short'/>
	<parameter varname='e4' type='short'/>
	<parameter varname='e3' type='short'/>
	<parameter varname='e2' type='short'/>
	<parameter varname='e1' type='short'/>
	<parameter varname='e0' type='short'/>
	<description>Set packed 16-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[15:0] := e15
dst[31:16] := e14
dst[47:32] := e13
dst[63:48] := e12
dst[79:64] := e11
dst[95:80] := e10
dst[111:96] := e9
dst[127:112] := e8
dst[143:128] := e7
dst[159:144] := e6
dst[175:160] := e5
dst[191:176] := e4
dst[207:192] := e3
dst[223:208] := e2
dst[239:224] := e1
dst[255:240] := e0
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi32'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e7' type='int'/>
	<parameter varname='e6' type='int'/>
	<parameter varname='e5' type='int'/>
	<parameter varname='e4' type='int'/>
	<parameter varname='e3' type='int'/>
	<parameter varname='e2' type='int'/>
	<parameter varname='e1' type='int'/>
	<parameter varname='e0' type='int'/>
	<description>Set packed 32-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[31:0] := e7
dst[63:32] := e6
dst[95:64] := e5
dst[127:96] := e4
dst[159:128] := e3
dst[191:160] := e2
dst[223:192] := e1
dst[255:224] := e0
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_setr_epi64x'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='e3' type='__int64'/>
	<parameter varname='e2' type='__int64'/>
	<parameter varname='e1' type='__int64'/>
	<parameter varname='e0' type='__int64'/>
	<description>Set packed 64-bit integers in "dst" with the supplied values in reverse order.</description>
	<operation>
dst[63:0] := e3
dst[127:64] := e2
dst[191:128] := e1
dst[255:192] := e0
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_set1_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='double'/>
	<description>Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_set1_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='float'/>
	<description>Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi8'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='char'/>
	<description>Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi16'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='short'/>
	<description>Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastw".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi32'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='int'/>
	<description>Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_set1_epi64x'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='a' type='long long'/>
	<description>Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_castpd_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256d'/>
	<description>Cast vector of type __m256d to type __m256.
	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_castps_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256'/>
	<description>Cast vector of type __m256 to type __m256d.
	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_castps_si256'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256'/>
	<description>Casts vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_castpd_si256'>
	<type>Floating Point</type>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256d'/>
	<description>Casts vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_castsi256_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256i'/>
	<description>Casts vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_castsi256_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256i'/>
	<description>Casts vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128' name='_mm256_castps256_ps128'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256'/>
	<description>Casts vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128d' name='_mm256_castpd256_pd128'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256d'/>
	<description>Casts vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m128i' name='_mm256_castsi256_si128'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m256i'/>
	<description>Casts vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_castps128_ps256'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128'/>
	<description>Casts vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_castpd128_pd256'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128d'/>
	<description>Casts vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_castsi128_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Cast</category>
	<parameter varname='a' type='__m128i'/>
	<description>Casts vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
	</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_floor_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vroundps' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_ceil_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vroundps' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_floor_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vroundpd' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_ceil_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vroundpd' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='6' tpt='1'/>
	<perfdata arch='Ivy Bridge' lat='3' tpt='1'/>
	<perfdata arch='Sandy Bridge' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SSE' rettype='__m128' name='_mm_undefined_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m128 with undefined elements.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128d' name='_mm_undefined_pd'>
	<type>Floating Point</type>
	<CPUID>SSE2</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m128d with undefined elements.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SSE2' rettype='__m128i' name='_mm_undefined_si128'>
	<type>Integer</type>
	<CPUID>SSE2</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m128i with undefined elements.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_undefined_ps'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m256 with undefined elements.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_undefined_pd'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m256d with undefined elements.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_undefined_si256'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Return vector of type __m256i with undefined elements.</description>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='void' name='_mm_clflushopt'>
	<CPUID>CLFLUSHOPT</CPUID>
	<category>General Support</category>
	<parameter varname='p' type='void const *'/>
	<description>Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy.</description>
	<instruction name='clflushopt' />
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_set_m128'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='hi' type='__m128'/>
	<parameter varname='lo' type='__m128'/>
	<description>Set packed __m256 vector "dst" with the supplied values.</description>
	<operation>
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_set_m128d'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='hi' type='__m128d'/>
	<parameter varname='lo' type='__m128d'/>
	<description>Set packed __m256d vector "dst" with the supplied values.</description>
	<operation>
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_set_m128i'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='hi' type='__m128i'/>
	<parameter varname='lo' type='__m128i'/>
	<description>Set packed __m256i vector "dst" with the supplied values.</description>
	<operation>
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256' name='_mm256_setr_m128'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='lo' type='__m128'/>
	<parameter varname='hi' type='__m128'/>
	<description>Set packed __m256 vector "dst" with the supplied values.</description>
	<operation>
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256d' name='_mm256_setr_m128d'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='lo' type='__m128d'/>
	<parameter varname='hi' type='__m128d'/>
	<description>Set packed __m256d vector "dst" with the supplied values.</description>
	<operation>
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' rettype='__m256i' name='_mm256_setr_m128i'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Set</category>
	<parameter varname='lo' type='__m128i'/>
	<parameter varname='hi' type='__m128i'/>
	<description>Set packed __m256i vector "dst" with the supplied values.</description>
	<operation>
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vinsertf128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256' name='_mm256_loadu2_m128'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='hiaddr' type='float const*'/>
	<parameter varname='loaddr' type='float const*'/>
	<description>Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst".
	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
	<operation>
dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256d' name='_mm256_loadu2_m128d'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='hiaddr' type='double const*'/>
	<parameter varname='loaddr' type='double const*'/>
	<description>Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst".
	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
	<operation>
dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='__m256i' name='_mm256_loadu2_m128i'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Load</category>
	<parameter varname='hiaddr' type='__m128i const*'/>
	<parameter varname='loaddr' type='__m128i const*'/>
	<description>Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst".
	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
	<operation>
dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='void' name='_mm256_storeu2_m128'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='hiaddr' type='float*'/>
	<parameter varname='loaddr' type='float*'/>
	<parameter varname='a' type='__m256' />
	<description>Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into two different 128-bit memory locations.
	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
	<operation>
MEM[loaddr+127:loaddr] := a[127:0]
MEM[hiaddr+127:hiaddr] := a[255:128]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='void' name='_mm256_storeu2_m128d'>
	<type>Floating Point</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='hiaddr' type='double*'/>
	<parameter varname='loaddr' type='double*'/>
	<parameter varname='a' type='__m256d' />
	<description>Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into two different 128-bit memory locations.
	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
	<operation>
MEM[loaddr+127:loaddr] := a[127:0]
MEM[hiaddr+127:hiaddr] := a[255:128]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX' sequence='true' rettype='void' name='_mm256_storeu2_m128i'>
	<type>Integer</type>
	<CPUID>AVX</CPUID>
	<category>Store</category>
	<parameter varname='hiaddr' type='__m128i*'/>
	<parameter varname='loaddr' type='__m128i*'/>
	<parameter varname='a' type='__m256i' />
	<description>Store the high and low 128-bit halves (each composed of integer data) from "a" into two different 128-bit memory locations.
	"hiaddr" and "loaddr" do not need to be aligned on any particular boundary.</description>
	<operation>
MEM[loaddr+127:loaddr] := a[127:0]
MEM[hiaddr+127:hiaddr] := a[255:128]
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_abs_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<description>Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpabsb' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_abs_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<description>Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpabsw' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_abs_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<description>Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpabsd' form='ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed 32-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_add_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed 64-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddsb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epu8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddusb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_adds_epu16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpaddusw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_alignr_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname='count' type='const int'/>
	<description>For each 128-bit lane, concatenate the 16-byte blocks from "a" and "b" into a 32-byte temporary result, shift the result right by "count" bytes, and store the low 16 bytes in the corresponding lane of "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*128
	tmp[255:0] := ((a[i+127:i] &lt;&lt; 128) OR b[i+127:i]) &gt;&gt; (count[7:0]*8)
	dst[i+127:i] := tmp[127:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpalignr' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_and_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[255:0] := (a[255:0] AND b[255:0])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpand' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_andnot_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".</description>
	<operation>
dst[255:0] := ((NOT a[255:0]) AND b[255:0])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpandn' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_avg_epu8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) &gt;&gt; 1
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpavgb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_avg_epu16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) &gt;&gt; 1
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpavgw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_blend_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF imm8[j%8]
		dst[i+15:i] := b[i+15:i]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpblendw' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_blend_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF imm8[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpblendd' form='xmm, xmm, xmm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_blend_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF imm8[j]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpblendd' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_blendv_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname='mask' type='__m256i'/>
	<description>Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	IF mask[i+7]
		dst[i+7:i] := b[i+7:i]
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpblendvb' form='ymm, ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastb_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpbroadcastb' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastb_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 8-bit integer from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpbroadcastb' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastd_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpbroadcastd' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastd_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 32-bit integer from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpbroadcastd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastq_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst". </description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpbroadcastq' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastq_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 64-bit integer from "a" to all elements of "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpbroadcastq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' vexEq='TRUE' rettype='__m128d' name='_mm_broadcastsd_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='movddup' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='1'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_broadcastsd_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128d'/>
	<description>Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcastsd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastsi128_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst".</description>
	<operation>
dst[127:0] := a[127:0]
dst[255:128] := a[127:0]
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcasti128' form='ymm, m128'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm_broadcastss_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vbroadcastss' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256' name='_mm256_broadcastss_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128'/>
	<description>Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vbroadcastss' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_broadcastw_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpbroadcastw' form='xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_broadcastw_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m128i'/>
	<description>Broadcast the low packed 16-bit integer from "a" to all elements of "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpbroadcastw' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpeqb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpeqw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpeqd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpeq_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpeqq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := ( a[i+7:i] &gt; b[i+7:i] ) ? 0xFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpgtb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := ( a[i+15:i] &gt; b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpgtw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ( a[i+31:i] &gt; b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpgtd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cmpgt_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Compare</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ( a[i+63:i] &gt; b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpcmpgtq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi16_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovsxwd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi16_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovsxwq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi32_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j:= 0 to 3
	i := 64*j
	k := 32*j
	dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovsxdq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi8_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	l := j*16
	dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovsxbw' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi8_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovsxbd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepi8_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovsxbq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu16_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovzxwd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu16_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j:= 0 to 3
	i := 64*j
	k := 16*j
	dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovzxwq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu32_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j:= 0 to 3
	i := 64*j
	k := 32*j
	dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovzxdq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu8_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	l := j*16
	dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovzxbw' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu8_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovzxbd' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_cvtepu8_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Convert</category>
	<parameter varname='a' type='__m128i'/>
	<description>Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmovzxbq' form='ymm, xmm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm256_extracti128_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst".</description>
	<operation>
CASE imm8[0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
	</operation>
	<instruction name='vextracti128' form='xmm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hadd_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := a[95:80] + a[79:64]
dst[63:48] := a[127:112] + a[111:96]
dst[79:64] := b[31:16] + b[15:0]
dst[95:80] := b[63:48] + b[47:32]
dst[111:96] := b[95:80] + b[79:64]
dst[127:112] := b[127:112] + b[111:96]
dst[143:128] := a[159:144] + a[143:128]
dst[159:144] := a[191:176] + a[175:160]
dst[175:160] := a[223:208] + a[207:192]
dst[191:176] := a[255:240] + a[239:224]
dst[207:192] := b[159:144] + b[143:128]
dst[223:208] := b[191:176] + b[175:160]
dst[239:224] := b[223:208] + b[207:192]
dst[255:240] := b[255:240] + b[239:224]
dst[MAX:256] := 0
	</operation>
	<instruction name='vphaddw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hadd_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
	<operation>
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
dst[159:128] := a[191:160] + a[159:128]
dst[191:160] := a[255:224] + a[223:192]
dst[223:192] := b[191:160] + b[159:128]
dst[255:224] := b[255:224] + b[223:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vphaddd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hadds_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Horizontally add adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] := Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] := Saturate_To_Int16(a[95:80] + a[79:64])
dst[63:48] := Saturate_To_Int16(a[127:112] + a[111:96])
dst[79:64] := Saturate_To_Int16(b[31:16] + b[15:0])
dst[95:80] := Saturate_To_Int16(b[63:48] + b[47:32])
dst[111:96] := Saturate_To_Int16(b[95:80] + b[79:64])
dst[127:112] := Saturate_To_Int16(b[127:112] + b[111:96])
dst[143:128] := Saturate_To_Int16(a[159:144] + a[143:128])
dst[159:144] := Saturate_To_Int16(a[191:176] + a[175:160])
dst[175:160] := Saturate_To_Int16(a[223:208] + a[207:192])
dst[191:176] := Saturate_To_Int16(a[255:240] + a[239:224])
dst[207:192] := Saturate_To_Int16(b[159:144] + b[143:128])
dst[223:208] := Saturate_To_Int16(b[191:176] + b[175:160])
dst[239:224] := Saturate_To_Int16(b[223:208] + b[207:192])
dst[255:240] := Saturate_To_Int16(b[255:240] + b[239:224])
dst[MAX:256] := 0
	</operation>
	<instruction name='vphaddsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hsub_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := a[79:64] - a[95:80]
dst[63:48] := a[111:96] - a[127:112]
dst[79:64] := b[15:0] - b[31:16]
dst[95:80] := b[47:32] - b[63:48]
dst[111:96] := b[79:64] - b[95:80]
dst[127:112] := b[111:96] - b[127:112]
dst[143:128] := a[143:128] - a[159:144]
dst[159:144] := a[175:160] - a[191:176]
dst[175:160] := a[207:192] - a[223:208]
dst[191:176] := a[239:224] - a[255:240]
dst[207:192] := b[143:128] - b[159:144]
dst[223:208] := b[175:160] - b[191:176]
dst[239:224] := b[207:192] - b[223:208]
dst[255:240] := b[239:224] - b[255:240]
dst[MAX:256] := 0
	</operation>
	<instruction name='vphsubw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hsub_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst".</description>
	<operation>
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
dst[159:128] := a[159:128] - a[191:160]
dst[191:160] := a[223:192] - a[255:224]
dst[223:192] := b[159:128] - b[191:160]
dst[255:224] := b[223:192] - b[255:224]
dst[MAX:256] := 0
	</operation>
	<instruction name='vphsubd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_hsubs_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst".</description>
	<operation>
dst[15:0] := Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] := Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] := Saturate_To_Int16(a[79:64] - a[95:80])
dst[63:48] := Saturate_To_Int16(a[111:96] - a[127:112])
dst[79:64] := Saturate_To_Int16(b[15:0] - b[31:16])
dst[95:80] := Saturate_To_Int16(b[47:32] - b[63:48])
dst[111:96] := Saturate_To_Int16(b[79:64] - b[95:80])
dst[127:112] := Saturate_To_Int16(b[111:96] - b[127:112])
dst[143:128] := Saturate_To_Int16(a[143:128] - a[159:144])
dst[159:144] := Saturate_To_Int16(a[175:160] - a[191:176])
dst[175:160] := Saturate_To_Int16(a[207:192] - a[223:208])
dst[191:176] := Saturate_To_Int16(a[239:224] - a[255:240])
dst[207:192] := Saturate_To_Int16(b[143:128] - b[159:144])
dst[223:208] := Saturate_To_Int16(b[175:160] - b[191:176])
dst[239:224] := Saturate_To_Int16(b[207:192] - b[223:208])
dst[255:240] := Saturate_To_Int16(b[239:224] - b[255:240])
dst[MAX:256] := 0
	</operation>
	<instruction name='vphsubsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128d' name='_mm_i32gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	m := j*32
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherdpd' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_i32gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	m := j*32
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vgatherdpd' form='ymm, vm32x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm_i32gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherdps' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256' name='_mm256_i32gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vgatherdps' form='ymm, vm32x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i32gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherdd' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_i32gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpgatherdd' form='ymm, vm32x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i32gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	m := j*32
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherdq' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_i32gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	m := j*32
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpgatherdq' form='ymm, vm32x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128d' name='_mm_i64gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherqpd' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_i64gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vgatherqpd' form='ymm, vm64x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm_i64gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	m := j*64
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:64] := 0
	</operation>
	<instruction name='vgatherqps' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm256_i64gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	m := j*64
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherqps' form='ymm, vm64x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i64gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	m := j*64
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:64] := 0
	</operation>
	<instruction name='vpgatherqd' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm256_i64gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	m := j*64
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherqd' form='ymm, vm64x, ymm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_i64gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherqq' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_i64gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpgatherqq' form='ymm, vm64x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_inserti128_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m128i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8".</description>
	<operation>
dst[255:0] := a[255:0]
CASE (imm8[0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
	</operation>
	<instruction name='vinserti128' form='ymm, ymm, xmm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_madd_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaddwd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_maddubs_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst".
	</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaddubsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128d' name='_mm_mask_i32gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128d'/>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128d'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	m := j*32
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherdpd' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_mask_i32gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m256d'/>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m256d'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	m := j*32
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
	</operation>
	<instruction name='vgatherdpd' form='ymm, vm32x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm_mask_i32gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128'/>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherdps' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256' name='_mm256_mask_i32gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m256'/>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='mask' type='__m256'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
	</operation>
	<instruction name='vgatherdps' form='ymm, vm32y, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i32gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128i'/>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherdd' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mask_i32gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m256i'/>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
	</operation>
	<instruction name='vpgatherdd' form='ymm, vm32y, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i32gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128i'/>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	m := j*32
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherdq' form='xmm, vm32x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mask_i32gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m256i'/>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	m := j*32
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
	</operation>
	<instruction name='vpgatherdq' form='ymm, vm32x, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128d' name='_mm_mask_i64gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128d'/>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128d'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherqpd' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_mask_i64gather_pd'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m256d'/>
	<parameter varname='base_addr' type='double const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='mask' type='__m256d'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
	</operation>
	<instruction name='vgatherqpd' form='ymm, vm64y, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm_mask_i64gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128'/>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	m := j*64
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:64] := 0
dst[MAX:64] := 0
	</operation>
	<instruction name='vgatherqps' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128' name='_mm256_mask_i64gather_ps'>
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128'/>
	<parameter varname='base_addr' type='float const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='mask' type='__m128'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	m := j*64
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vgatherqps' form='xmm, vm64y, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i64gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128i'/>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*32
	m := j*64
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:64] := 0
dst[MAX:64] := 0
	</operation>
	<instruction name='vpgatherqd' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm256_mask_i64gather_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128i'/>
	<parameter varname='base_addr' type='int const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	m := j*64
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherqd' form='xmm, vm64y, xmm'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_mask_i64gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m128i'/>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m128i'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
	</operation>
	<instruction name='vpgatherqq' form='xmm, vm64x, xmm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mask_i64gather_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='src' type='__m256i'/>
	<parameter varname='base_addr' type='__int64 const*'/>
	<parameter varname='vindex' type='__m256i'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='scale' type='const int'/>
	<description>
	Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8.
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
	</operation>
	<instruction name='vpgatherqq' form='ymm, vm64y, ymm'/>
	<perfdata arch='Haswell' lat='6'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_maskload_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='int const*'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpmaskmovd' form='xmm, xmm, m128'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_maskload_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='int const*'/>
	<parameter varname='mask' type='__m256i'/>
	<description>Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaskmovd' form='ymm, ymm, m256'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_maskload_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__int64 const*'/>
	<parameter varname='mask' type='__m128i'/>
	<description>Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpmaskmovq' form='xmm, xmm, m128'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_maskload_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__int64 const*'/>
	<parameter varname='mask' type='__m256i'/>
	<description>Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element).</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaskmovq' form='ymm, ymm, m256'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='void' name='_mm_maskstore_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='int*'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='a' type='__m128i'/>
	<description>Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='vpmaskmovd' form='m128, xmm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='void' name='_mm256_maskstore_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='int*'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='a' type='__m256i'/>
	<description>Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
	FI
ENDFOR
	</operation>
	<instruction name='vpmaskmovd' form='m256, ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='void' name='_mm_maskstore_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__int64*'/>
	<parameter varname='mask' type='__m128i'/>
	<parameter varname='a' type='__m128i'/>
	<description>Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='vpmaskmovq' form='m128, xmm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='void' name='_mm256_maskstore_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Store</category>
	<parameter varname='mem_addr' type='__int64*'/>
	<parameter varname='mask' type='__m256i'/>
	<parameter varname='a' type='__m256i'/>
	<description>Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element).
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
	FI
ENDFOR
	</operation>
	<instruction name='vpmaskmovq' form='m256, ymm, ymm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] &gt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaxsb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst".
	</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] &gt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaxsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] &gt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaxsd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epu8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".
	</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] &gt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaxub' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epu16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst".
	</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] &gt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaxuw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_max_epu32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] &gt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmaxud' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] &lt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpminsb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] &lt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpminsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] &lt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpminsd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epu8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] &lt; b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpminub' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epu16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] &lt; b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpminuw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_min_epu32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst".
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] &lt; b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpminud' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='int' name='_mm256_movemask_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<description>
Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".
	</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[j] := a[i+7]
ENDFOR
	</operation>
	<instruction name='vpmovmskb' form='r32, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mpsadbw_epu8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst".
	Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8".</description>
	<operation>
MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
	a_offset := imm8[2]*32
	b_offset := imm8[1:0]*32
	FOR j := 0 to 7
		i := j*8
		k := a_offset+i
		l := b_offset
		tmp[i+15:i] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
	ENDFOR
	RETURN tmp[127:0]
}

dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3])
dst[MAX:256] := 0
	</operation>
	<instruction name='vmpsadbw' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='7' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mul_epi32'>
	<!-- Widening signed multiply: only bits 31:0 of each 64-bit element of "a" and "b" are read;
	     both are sign-extended before multiplying so the 64-bit product is the signed result. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := SignExtend(a[i+31:i]) * SignExtend(b[i+31:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmuldq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mul_epu32'>
	<!-- Widening unsigned multiply: only bits 31:0 of each 64-bit element of "a" and "b" are read;
	     the product fills the whole 64-bit element of "dst". -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmuludq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mulhi_epi16'>
	<!-- Signed 16x16 multiply per element, keeping bits 31:16 of the 32-bit product. The operands
	     are sign-extended first; that is what distinguishes this entry from _mm256_mulhi_epu16. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*16
	tmp[31:0] := SignExtend(a[i+15:i]) * SignExtend(b[i+15:i])
	dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmulhw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mulhi_epu16'>
	<!-- Unsigned 16x16 multiply per element, keeping bits 31:16 of the 32-bit product. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmulhuw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mulhrs_epi16'>
	<!-- Rounded, scaled multiply: shifting the 32-bit product right by 14, adding 1 and then taking
	     bits 16:1 is the same as adding 0x4000 to the product and shifting right by 15. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply packed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*16
	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) &gt;&gt; 14) + 1
	dst[i+15:i] := tmp[16:1]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmulhrsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mullo_epi16'>
	<!-- 16x16 multiply per element, keeping only bits 15:0 of the 32-bit product. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[15:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmullw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_mullo_epi32'>
	<!-- 32x32 multiply per element, keeping only bits 31:0 of the 64-bit product. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*32
	tmp[63:0] := a[i+31:i] * b[i+31:i]
	dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpmulld' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='10' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_or_si256'>
	<!-- One bitwise OR across the full 256 bits; the operation has no per-element structure. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[255:0] := (a[255:0] OR b[255:0])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpor' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.33'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packs_epi16'>
	<!-- Works per 128-bit lane: the low 64 bits of each "dst" lane hold the eight narrowed values
	     from the same lane of "a", the high 64 bits hold those from the same lane of "b".
	     Nothing crosses between the low and high lanes. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
	</description>
	<operation>
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpacksswb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packs_epi32'>
	<!-- Works per 128-bit lane: the low 64 bits of each "dst" lane hold the four narrowed values
	     from the same lane of "a", the high 64 bits hold those from the same lane of "b". -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".</description>
	<operation>
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpackssdw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packus_epi16'>
	<!-- Works per 128-bit lane: the low 64 bits of each "dst" lane hold the eight narrowed values
	     from the same lane of "a", the high 64 bits hold those from the same lane of "b".
	     Going by the helper name, the 16-bit inputs are read as signed and the 8-bit outputs
	     are unsigned. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".</description>
	<operation>
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpackuswb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_packus_epi32'>
	<!-- Works per 128-bit lane: the low 64 bits of each "dst" lane hold the four narrowed values
	     from the same lane of "a", the high 64 bits hold those from the same lane of "b".
	     Going by the helper name, the 32-bit inputs are read as signed and the 16-bit outputs
	     are unsigned. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Miscellaneous</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst".</description>
	<operation>
dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpackusdw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_permute2x128_si256'>
	<!-- SELECT4 picks one 128-bit half of "a" or "b" from control bits 1:0 and replaces it with
	     zero when control bit 3 is set; control bit 2 is not read. imm8[3:0] drives the low half
	     of "dst" and imm8[7:4] the high half. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". </description>
	<operation>
SELECT4(src1, src2, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src1[127:0]
	1:	tmp[127:0] := src1[255:128]
	2:	tmp[127:0] := src2[127:0]
	3:	tmp[127:0] := src2[255:128]
	ESAC
	IF control[3]
		tmp[127:0] := 0
	FI
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
	</operation>
	<instruction name='vperm2i128' form='ymm, ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_permute4x64_epi64'>
	<!-- Each 2-bit field of imm8 names which of the four 64-bit elements of "a" is copied into the
	     corresponding 64-bit element of "dst"; any source element may be chosen for any slot. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256d' name='_mm256_permute4x64_pd'>
	<!-- Each 2-bit field of imm8 names which of the four 64-bit elements of "a" is copied into the
	     corresponding 64-bit element of "dst". The pseudo-code is the same as for
	     _mm256_permute4x64_epi64; only the element type and instruction differ. -->
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermpd' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_permutevar8x32_epi32'>
	<!-- Only the low 3 bits of each 32-bit element of "idx" are used; they select which of the
	     eight 32-bit elements of "a" is copied into that position of "dst". -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='idx' type='__m256i'/>
	<description>Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst".
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256' name='_mm256_permutevar8x32_ps'>
	<!-- Only the low 3 bits of each 32-bit element of "idx" are used; they select which of the
	     eight 32-bit elements of "a" is copied into that position of "dst". -->
	<type>Floating Point</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='idx' type='__m256i'/>
	<description>Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpermps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='3' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sad_epu8'>
	<!-- Two passes: byte-wise absolute differences go into tmp, then every group of 8 consecutive
	     bytes of tmp is summed into the low 16 bits of the matching 64-bit element of "dst" and
	     the remaining 48 bits are cleared. 256 bits hold four 64-bit elements, so the second
	     loop runs over j = 0..3. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 3
	i := j*64
	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
	dst[i+63:i+16] := 0
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsadbw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shuffle_epi32'>
	<!-- The same four 2-bit selectors from imm8 are applied to each 128-bit lane separately:
	     the low lane of "dst" is built only from a[127:0], the high lane only from a[255:128]. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst".</description>
	<operation>
SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpshufd' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shuffle_epi8'>
	<!-- For each control byte in "b": if bit 7 is set the output byte is 0, otherwise the low
	     4 bits index a byte inside the same 128-bit lane of "a". Each loop iteration handles
	     byte j of the low lane and byte j of the high lane (offset by 128 bits). -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*8
	IF b[i+7] == 1
		dst[i+7:i] := 0
	ELSE
		index[3:0] := b[i+3:i]
		dst[i+7:i] := a[index*8+7:index*8]
	FI
	IF b[128+i+7] == 1
		dst[128+i+7:128+i] := 0
	ELSE
		index[3:0] := b[128+i+3:128+i]
		dst[128+i+7:128+i] := a[128+index*8+7:128+index*8]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpshufb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shufflehi_epi16'>
	<!-- The same four 2-bit selectors from imm8 are used in both 128-bit lanes. Each selector
	     shifts "a" right by a multiple of 16 bits and reads the lowest word position of the
	     lane's high half; the low 64 bits of each lane are copied unchanged. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
	<operation>
dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
dst[191:128] := a[191:128]
dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vpshufhw' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_shufflelo_epi16'>
	<!-- The same four 2-bit selectors from imm8 are used in both 128-bit lanes. Each selector
	     shifts "a" right by a multiple of 16 bits and reads the lowest word position of the
	     lane's low half; the high 64 bits of each lane are copied unchanged. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst".</description>
	<operation>
dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
dst[255:192] := a[255:192]
dst[MAX:256] := 0
	</operation>
	<instruction name='vpshuflw' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sign_epi8'>
	<!-- Three-way select per 8-bit element, keyed on the sign of "b": negative negates "a",
	     zero produces 0, positive passes "a" through unchanged. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	IF b[i+7:i] &lt; 0
		dst[i+7:i] := NEG(a[i+7:i])
	ELSE IF b[i+7:i] == 0
		dst[i+7:i] := 0
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsignb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sign_epi16'>
	<!-- Three-way select per 16-bit element, keyed on the sign of "b": negative negates "a",
	     zero produces 0, positive passes "a" through unchanged. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF b[i+15:i] &lt; 0
		dst[i+15:i] := NEG(a[i+15:i])
	ELSE IF b[i+15:i] == 0
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsignw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sign_epi32'>
	<!-- Three-way select per 32-bit element, keyed on the sign of "b": negative negates "a",
	     zero produces 0, positive passes "a" through unchanged. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero.</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF b[i+31:i] &lt; 0
		dst[i+31:i] := NEG(a[i+31:i])
	ELSE IF b[i+31:i] == 0
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsignd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_si256'>
	<!-- Byte-granular shift applied to each 128-bit lane separately with the same count.
	     Counts above 15 are clamped to 16, i.e. a shift of 128 bits. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
dst[MAX:256] := 0
	</operation>
	<instruction name='vpslldq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_bslli_epi128'>
	<!-- Same description, pseudo-code and instruction (vpslldq) as _mm256_slli_si256: a
	     byte-granular shift of each 128-bit lane, with counts above 15 clamped to 16. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='const int'/>
	<description>Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &lt;&lt; (tmp*8)
dst[255:128] := a[255:128] &lt;&lt; (tmp*8)
dst[MAX:256] := 0
	</operation>
	<instruction name='vpslldq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sll_epi16'>
	<!-- One shift count, taken from the low 64 bits of "count", is applied to all sixteen
	     16-bit elements; a count above 15 yields 0 in every element. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsllw' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_epi16'>
	<!-- The immediate count (low 8 bits of imm8) is applied to all sixteen 16-bit elements;
	     a count above 15 yields 0 in every element. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsllw' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sll_epi32'>
	<!-- One shift count, taken from the low 64 bits of "count", is applied to all eight
	     32-bit elements; a count above 31 yields 0 in every element. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpslld' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_epi32'>
	<!-- The immediate count (low 8 bits of imm8) is applied to all eight 32-bit elements;
	     a count above 31 yields 0 in every element. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpslld' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sll_epi64'>
	<!-- One shift count, taken from the low 64 bits of "count", is applied to all four
	     64-bit elements; a count above 63 yields 0 in every element. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF count[63:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsllq' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_slli_epi64'>
	<!-- The immediate count (low 8 bits of imm8) is applied to all four 64-bit elements;
	     a count above 63 yields 0 in every element. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname="imm8" type='int'/>
	<description>Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF imm8[7:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsllq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_sllv_epi32'>
	<!-- Variable shift: every 32-bit element of "a" is shifted by the count held in the same
	     element of "count". A count above 31 shifts every bit out, so that case is spelled
	     out as an explicit zero result, in the same form the uniform-count shifts use. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF count[i+31:i] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpsllvd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sllv_epi32'>
	<!-- Variable shift: every 32-bit element of "a" is shifted by the count held in the same
	     element of "count". A count above 31 shifts every bit out, so that case is spelled
	     out as an explicit zero result, in the same form the uniform-count shifts use. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m256i'/>
	<description>Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". </description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF count[i+31:i] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &lt;&lt; count[i+31:i])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsllvd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='2' tpt='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_sllv_epi64'>
	<!-- Each lane uses its own count; a count above 63 is handled explicitly and zeroes that lane. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF count[i+63:i] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpsllvq' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sllv_epi64'>
	<!-- Each lane uses its own count; a count above 63 is handled explicitly and zeroes that lane. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m256i'/>
	<description>Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF count[i+63:i] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &lt;&lt; count[i+63:i])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsllvq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sra_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := SignBit
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsraw' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srai_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := SignBit
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsraw' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sra_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrad' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srai_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrad' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_srav_epi32'>
	<!-- Each lane uses its own count; a count above 31 is handled explicitly and fills that lane with its sign bit. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF count[i+31:i] &gt; 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpsravd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srav_epi32'>
	<!-- Each lane uses its own count; a count above 31 is handled explicitly and fills that lane with its sign bit. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m256i'/>
	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF count[i+31:i] &gt; 31
		dst[i+31:i] := SignBit
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] &gt;&gt; count[i+31:i])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsravd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrldq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_bsrli_epi128'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='const int'/>
	<description>Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".</description>
	<operation>
tmp := imm8[7:0]
IF tmp &gt; 15
	tmp := 16
FI
dst[127:0] := a[127:0] &gt;&gt; (tmp*8)
dst[255:128] := a[255:128] &gt;&gt; (tmp*8)
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrldq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srl_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF count[63:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrlw' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='int'/>
	<description>Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	IF imm8[7:0] &gt; 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrlw' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srl_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF count[63:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrld' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='int'/>
	<description>Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF imm8[7:0] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrld' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srl_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF count[63:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrlq' form='ymm, ymm, xmm'/>
	<perfdata arch='Haswell' lat='4'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srli_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='imm8' type='int'/>
	<description>Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF imm8[7:0] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrlq' form='ymm, ymm, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_srlv_epi32'>
	<!-- Each lane uses its own count; a count above 31 is handled explicitly and zeroes that lane. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF count[i+31:i] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpsrlvd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srlv_epi32'>
	<!-- Each lane uses its own count; a count above 31 is handled explicitly and zeroes that lane. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m256i'/>
	<description>Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF count[i+31:i] &gt; 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] &gt;&gt; count[i+31:i])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrlvd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m128i' name='_mm_srlv_epi64'>
	<!-- Each lane uses its own count; a count above 63 is handled explicitly and zeroes that lane. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='count' type='__m128i'/>
	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF count[i+63:i] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vpsrlvq' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_srlv_epi64'>
	<!-- Each lane uses its own count; a count above 63 is handled explicitly and zeroes that lane. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Shift</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='count' type='__m256i'/>
	<description>Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF count[i+63:i] &gt; 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] &gt;&gt; count[i+63:i])
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsrlvq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_stream_load_si256'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Load</category>
	<parameter varname='mem_addr' type='__m256i const*'/>
	<description>Load 256 bits of integer data from memory into "dst" using a non-temporal memory hint.
	"mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated.</description>
	<operation>
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
	</operation>
	<instruction name='vmovntdqa' form='ymm, m256'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi8'>
	<!-- Lane-wise a - b over 32 independent 8-bit lanes; the pseudocode applies no saturation and zeroes dst above bit 255. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi16'>
	<!-- Lane-wise a - b over 16 independent 16-bit lanes; the pseudocode applies no saturation and zeroes dst above bit 255. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi32'>
	<!-- Lane-wise a - b over 8 independent 32-bit lanes; the pseudocode applies no saturation and zeroes dst above bit 255. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_sub_epi64'>
	<!-- Lane-wise a - b over 4 independent 64-bit lanes; the pseudocode applies no saturation and zeroes dst above bit 255. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epi8'>
	<!-- Lane-wise a - b over 32 8-bit lanes, with each difference passed through Saturate_To_Int8 before it is stored. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubsb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epi16'>
	<!-- Lane-wise a - b over 16 16-bit lanes, with each difference passed through Saturate_To_Int16 before it is stored. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubsw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epu8'>
	<!-- Lane-wise a - b over 32 8-bit lanes, with each difference passed through Saturate_To_UnsignedInt8 before it is stored. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubusb' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_subs_epu16'>
	<!-- Lane-wise a - b over 16 16-bit lanes, with each difference passed through Saturate_To_UnsignedInt16 before it is stored. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vpsubusw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_xor_si256'>
	<!-- Whole-register operation: one 256-bit XOR of a and b with no per-lane loop; dst above bit 255 is zeroed. -->
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Logical</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst".</description>
	<operation>
dst[255:0] := (a[255:0] XOR b[255:0])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpxor' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpckhbw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpckhwd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpckhdq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpackhi_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpckhqdq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi8'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpcklbw' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi16'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpcklwd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi32'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpckldq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='AVX2' rettype='__m256i' name='_mm256_unpacklo_epi64'>
	<type>Integer</type>
	<CPUID>AVX2</CPUID>
	<category>Swizzle</category>
	<parameter varname='a' type='__m256i'/>
	<parameter varname='b' type='__m256i'/>
	<description>Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst".</description>
	<operation>
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
	</operation>
	<instruction name='vpunpcklqdq' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='1' tpt='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmadd_pd'>
	<!-- Per-lane (a * b) + c over 2 64-bit lanes; three instruction forms (132/213/231) are listed for this one intrinsic. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
	</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmadd132pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd213pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd231pd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmadd_pd'>
	<!-- Per-lane (a * b) + c over 4 64-bit lanes; three instruction forms (132/213/231) are listed for this one intrinsic. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='c' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
	</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmadd132pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmadd213pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmadd231pd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fmadd_ps'>
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
	</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmadd132ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd213ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd231ps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmadd_ps'>
	<!-- Per-lane (a * b) + c over 8 32-bit lanes; three instruction forms (132/213/231) are listed for this one intrinsic. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='c' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst".
	</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmadd132ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmadd213ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmadd231ps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmadd_sd'>
	<!-- Scalar form: only bits 63:0 receive (a * b) + c; bits 127:64 of dst are copied from a, and dst above bit 127 is zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
	</description>
	<operation>
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmadd132sd' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd213sd' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd231sd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fmadd_ss'>
	<!-- Scalar form: only bits 31:0 are computed as (a * b) + c; bits 127:32 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmadd132ss' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd213ss' form='xmm, xmm, xmm'/>
	<instruction name='vfmadd231ss' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmaddsub_pd'>
	<!-- 2 x 64-bit lanes: even lanes compute (a * b) - c, odd lanes compute (a * b) + c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmaddsub132pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmaddsub213pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmaddsub231pd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmaddsub_pd'>
	<!-- 4 x 64-bit lanes: even lanes compute (a * b) - c, odd lanes compute (a * b) + c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='c' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmaddsub132pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmaddsub213pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmaddsub231pd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fmaddsub_ps'>
	<!-- 4 x 32-bit lanes: even lanes compute (a * b) - c, odd lanes compute (a * b) + c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmaddsub132ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmaddsub213ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmaddsub231ps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmaddsub_ps'>
	<!-- 8 x 32-bit lanes: even lanes compute (a * b) - c, odd lanes compute (a * b) + c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='c' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmaddsub132ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmaddsub213ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmaddsub231ps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmsub_pd'>
	<!-- 2 x 64-bit lanes: dst = (a * b) - c per lane; bits above 127 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmsub132pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub213pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub231pd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmsub_pd'>
	<!-- 4 x 64-bit lanes: dst = (a * b) - c per lane; bits above 255 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='c' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmsub132pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmsub213pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmsub231pd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fmsub_ps'>
	<!-- 4 x 32-bit lanes: dst = (a * b) - c per lane; bits above 127 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmsub132ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub213ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub231ps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmsub_ps'>
	<!-- 8 x 32-bit lanes: dst = (a * b) - c per lane; bits above 255 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='c' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmsub132ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmsub213ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmsub231ps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmsub_sd'>
	<!-- Scalar form: only bits 63:0 are computed as (a * b) - c; bits 127:64 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmsub132sd' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub213sd' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub231sd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fmsub_ss'>
	<!-- Scalar form: only bits 31:0 are computed as (a * b) - c; bits 127:32 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmsub132ss' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub213ss' form='xmm, xmm, xmm'/>
	<instruction name='vfmsub231ss' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fmsubadd_pd'>
	<!-- 2 x 64-bit lanes: even lanes compute (a * b) + c, odd lanes compute (a * b) - c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmsubadd132pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmsubadd213pd' form='xmm, xmm, xmm'/>
	<instruction name='vfmsubadd231pd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fmsubadd_pd'>
	<!-- 4 x 64-bit lanes: even lanes compute (a * b) + c, odd lanes compute (a * b) - c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='c' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	IF (j is even)
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
	ELSE
		dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmsubadd132pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmsubadd213pd' form='ymm, ymm, ymm'/>
	<instruction name='vfmsubadd231pd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fmsubadd_ps'>
	<!-- 4 x 32-bit lanes: even lanes compute (a * b) + c, odd lanes compute (a * b) - c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfmsubadd132ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmsubadd213ps' form='xmm, xmm, xmm'/>
	<instruction name='vfmsubadd231ps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256' name='_mm256_fmsubadd_ps'>
	<!-- 8 x 32-bit lanes: even lanes compute (a * b) + c, odd lanes compute (a * b) - c. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='c' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	IF (j is even)
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
	ELSE
		dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfmsubadd132ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmsubadd213ps' form='ymm, ymm, ymm'/>
	<instruction name='vfmsubadd231ps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmadd_pd'>
	<!-- 2 x 64-bit lanes: dst = -(a * b) + c per lane; bits above 127 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmadd132pd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd213pd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd231pd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fnmadd_pd'>
	<!-- 4 x 64-bit lanes: dst = -(a * b) + c per lane; bits above 255 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='c' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfnmadd132pd' form='ymm, ymm, ymm'/>
	<instruction name='vfnmadd213pd' form='ymm, ymm, ymm'/>
	<instruction name='vfnmadd231pd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmadd_ps'>
	<!-- 4 x 32-bit lanes: dst = -(a * b) + c per lane; bits above 127 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmadd132ps' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd213ps' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd231ps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256' name='_mm256_fnmadd_ps'>
	<!-- 8 x 32-bit lanes: dst = -(a * b) + c per lane; bits above 255 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='c' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfnmadd132ps' form='ymm, ymm, ymm'/>
	<instruction name='vfnmadd213ps' form='ymm, ymm, ymm'/>
	<instruction name='vfnmadd231ps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmadd_sd'>
	<!-- Scalar form: only bits 63:0 are computed as -(a * b) + c; bits 127:64 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmadd132sd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd213sd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd231sd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmadd_ss'>
	<!-- Scalar form: only bits 31:0 are computed as -(a * b) + c; bits 127:32 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmadd132ss' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd213ss' form='xmm, xmm, xmm'/>
	<instruction name='vfnmadd231ss' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmsub_pd'>
	<!-- 2 x 64-bit lanes: dst = -(a * b) - c per lane; bits above 127 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmsub132pd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub213pd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub231pd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256d' name='_mm256_fnmsub_pd'>
	<!-- 4 x 64-bit lanes: dst = -(a * b) - c per lane; bits above 255 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256d'/>
	<parameter varname='b' type='__m256d'/>
	<parameter varname='c' type='__m256d'/>
	<description>Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfnmsub132pd' form='ymm, ymm, ymm'/>
	<instruction name='vfnmsub213pd' form='ymm, ymm, ymm'/>
	<instruction name='vfnmsub231pd' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmsub_ps'>
	<!-- 4 x 32-bit lanes: dst = -(a * b) - c per lane; bits above 127 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmsub132ps' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub213ps' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub231ps' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m256' name='_mm256_fnmsub_ps'>
	<!-- 8 x 32-bit lanes: dst = -(a * b) - c per lane; bits above 255 are zeroed. -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m256'/>
	<parameter varname='b' type='__m256'/>
	<parameter varname='c' type='__m256'/>
	<description>Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:256] := 0
	</operation>
	<instruction name='vfnmsub132ps' form='ymm, ymm, ymm'/>
	<instruction name='vfnmsub213ps' form='ymm, ymm, ymm'/>
	<instruction name='vfnmsub231ps' form='ymm, ymm, ymm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128d' name='_mm_fnmsub_sd'>
	<!-- Scalar form: only bits 63:0 are computed as -(a * b) - c; bits 127:64 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<parameter varname='c' type='__m128d'/>
	<description>Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".</description>
	<operation>
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmsub132sd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub213sd' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub231sd' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='FMA' rettype='__m128' name='_mm_fnmsub_ss'>
	<!-- Scalar form: only bits 31:0 are computed as -(a * b) - c; bits 127:32 are copied from "a". -->
	<type>Floating Point</type>
	<CPUID>FMA</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<parameter varname='c' type='__m128'/>
	<description>Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst".</description>
	<operation>
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
	</operation>
	<instruction name='vfnmsub132ss' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub213ss' form='xmm, xmm, xmm'/>
	<instruction name='vfnmsub231ss' form='xmm, xmm, xmm'/>
	<perfdata arch='Haswell' lat='5' tpt='0.5'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned int' name='_bextr_u32'>
	<!-- Bit-field extract: "a" is widened so the selected range may run past bit 31; the field is taken from the low 8 bits of "start" and "len". -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<parameter varname='start' type='unsigned int'/>
	<parameter varname='len' type='unsigned int'/>
	<description>Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". Only the low 8 bits of "start" and "len" are used.</description>
	<operation>
tmp := ZERO_EXTEND_TO_512(a)
dst := ZERO_EXTEND(tmp[start[7:0]+len[7:0]-1:start[7:0]])
	</operation>
	<instruction name='bextr' form='r32, r32, r32'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned __int64' name='_bextr_u64'>
	<!-- Bit-field extract: "a" is widened so the selected range may run past bit 63; the field is taken from the low 8 bits of "start" and "len". -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<parameter varname='start' type='unsigned int'/>
	<parameter varname='len' type='unsigned int'/>
	<description>Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". Only the low 8 bits of "start" and "len" are used.</description>
	<operation>
tmp := ZERO_EXTEND_TO_512(a)
dst := ZERO_EXTEND(tmp[start[7:0]+len[7:0]-1:start[7:0]])
	</operation>
	<instruction name='bextr' form='r64, r64, r64'/>
	<perfdata arch='Haswell' lat='2'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned int' name='_blsi_u32'>
	<!-- Isolates the lowest set bit of "a" via (-a) AND a; the result is 0 when "a" is 0. -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<description>Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a".</description>
	<operation>
dst := (-a) BITWISE AND a
	</operation>
	<instruction name='blsi' form='r32, r32'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned __int64' name='_blsi_u64'>
	<!-- Isolates the lowest set bit of "a" via (-a) AND a; the result is 0 when "a" is 0. -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a".</description>
	<operation>
dst := (-a) BITWISE AND a
	</operation>
	<instruction name='blsi' form='r64, r64'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned int' name='_blsmsk_u32'>
	<!-- Builds a mask through the lowest set bit of "a", computed as (a - 1) XOR a. -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<description>Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a".</description>
	<operation>
dst := (a - 1) XOR a
	</operation>
	<instruction name='blsmsk' form='r32, r32'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned __int64' name='_blsmsk_u64'>
	<!-- Builds a mask through the lowest set bit of "a", computed as (a - 1) XOR a. -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a".</description>
	<operation>
dst := (a - 1) XOR a
	</operation>
	<instruction name='blsmsk' form='r64, r64'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned int' name='_blsr_u32'>
	<!-- Clears the lowest set bit of "a", computed as (a - 1) AND a. -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<description>Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a".</description>
	<operation>
dst := (a - 1) BITWISE AND a
	</operation>
	<instruction name='blsr' form='r32, r32'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned __int64' name='_blsr_u64'>
	<!-- Clears the lowest set bit of "a", computed as (a - 1) AND a. -->
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a".</description>
	<operation>
dst := (a - 1) BITWISE AND a
	</operation>
	<instruction name='blsr' form='r64, r64'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned int' name='_bzhi_u32'>
	<!-- Zeroes bits 31 down to n, where n is the low 8 bits of "index"; "a" is returned unchanged when n is 32 or more. -->
	<type>Integer</type>
	<CPUID>BMI2</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<parameter varname='index' type='unsigned int'/>
	<description>Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index".</description>
	<operation>
n := index[7:0]
dst := a
IF (n &lt; 32)
	dst[31:n] := 0
FI
	</operation>
	<instruction name='bzhi' form='r32, r32, r32'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='unsigned __int64' name='_bzhi_u64'>
	<!-- Zeroes bits 63 down to n, where n is the low 8 bits of "index"; "a" is returned unchanged when n is 64 or more. -->
	<type>Integer</type>
	<CPUID>BMI2</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<parameter varname='index' type='unsigned int'/>
	<description>Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index".</description>
	<operation>
n := index[7:0]
dst := a
IF (n &lt; 64)
	dst[63:n] := 0
FI
	</operation>
	<instruction name='bzhi' form='r64, r64, r64'/>
	<perfdata arch='Haswell' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='Other' rettype='void' name='_invpcid'>
	<!-- "type" selects one of four invalidation modes (0 to 3); "descriptor" supplies the PCID and linear address for the modes that use them. -->
	<CPUID>INVPCID</CPUID>
	<category>OS-Targeted</category>
	<parameter varname='type' type='unsigned int'/>
	<parameter varname='descriptor' type='void*'/>
	<description>
	Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type".
	The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved.
	The types supported are:
		0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs.
		1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well.
		2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID, including global translations.
		3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations.
	</description>
	<operation>
CASE type OF
0: // individual-address invalidation retaining global translations
	OP_PCID := descriptor[11:0]
	ADDR := descriptor[127:64]
	// invalidate mappings for ADDR tagged with OP_PCID except global translations
	BREAK
1: // single PCID invalidation retaining globals
	OP_PCID := descriptor[11:0]
	// invalidate all mappings tagged with OP_PCID except global translations
	BREAK
2: // all PCID invalidation
	// invalidate all mappings tagged with any PCID
	BREAK
3: // all PCID invalidation retaining global translations
	// invalidate all mappings tagged with any PCID except global translations
	BREAK
ESAC
	</operation>
	<instruction name='invpcid' form='r32, m128'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _lzcnt_u32 (lzcnt): scans "a" downward from bit 31 and counts zero bits until the first set bit; yields 32 when "a" is zero. -->
<intrinsic tech='Other' rettype='unsigned int' name='_lzcnt_u32'>
	<type>Integer</type>
	<CPUID>LZCNT</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<description>Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
	<operation>
tmp := 31
dst := 0
DO WHILE (tmp &gt;= 0 AND a[tmp] = 0)
	tmp := tmp - 1
	dst := dst + 1
OD
	</operation>
	<instruction name='lzcnt' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _lzcnt_u64 (lzcnt): scans "a" downward from bit 63 and counts zero bits until the first set bit; yields 64 when "a" is zero. -->
<intrinsic tech='Other' rettype='unsigned __int64' name='_lzcnt_u64'>
	<type>Integer</type>
	<CPUID>LZCNT</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
	<operation>
tmp := 63
dst := 0
DO WHILE (tmp &gt;= 0 AND a[tmp] = 0)
	tmp := tmp - 1
	dst := dst + 1
OD
	</operation>
	<instruction name='lzcnt' form='r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _pdep_u32 (pdep): walks the 32 mask bits from low to high; each set mask bit receives the next unused low bit of "a", all other dst bits stay zero. -->
<intrinsic tech='Other' rettype='unsigned int' name='_pdep_u32'>
	<type>Integer</type>
	<CPUID>BMI2</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<parameter varname='mask' type='unsigned int'/>
	<description>Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero.</description>
	<operation>
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m &lt; 32
	IF mask[m] = 1
		dst[m] := tmp[k]
		k := k + 1
	FI
	m := m + 1
OD
	</operation>
	<instruction name='pdep' form='r32, r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _pdep_u64 (pdep): walks the 64 mask bits from low to high; each set mask bit receives the next unused low bit of "a", all other dst bits stay zero. -->
<intrinsic tech='Other' rettype='unsigned __int64' name='_pdep_u64'>
	<type>Integer</type>
	<CPUID>BMI2</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<parameter varname='mask' type='unsigned __int64'/>
	<description>Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero.</description>
	<operation>
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m &lt; 64
	IF mask[m] = 1
		dst[m] := tmp[k]
		k := k + 1
	FI
	m := m + 1
OD
	</operation>
	<instruction name='pdep' form='r64, r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _pext_u32 (pext): walks the 32 mask bits from low to high; the bit of "a" under each set mask bit is packed into the next free low bit of dst. -->
<intrinsic tech='Other' rettype='unsigned int' name='_pext_u32'>
	<type>Integer</type>
	<CPUID>BMI2</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<parameter varname='mask' type='unsigned int'/>
	<description>Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero.</description>
	<operation>
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m &lt; 32
	IF mask[m] = 1
		dst[k] := tmp[m]
		k := k + 1
	FI
	m := m + 1
OD
	</operation>
	<instruction name='pext' form='r32, r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _pext_u64 (pext): walks the 64 mask bits from low to high; the bit of "a" under each set mask bit is packed into the next free low bit of dst. -->
<intrinsic tech='Other' rettype='unsigned __int64' name='_pext_u64'>
	<type>Integer</type>
	<CPUID>BMI2</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<parameter varname='mask' type='unsigned __int64'/>
	<description>Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero.</description>
	<operation>
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m &lt; 64
	IF mask[m] = 1
		dst[k] := tmp[m]
		k := k + 1
	FI
	m := m + 1
OD
	</operation>
	<instruction name='pext' form='r64, r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _tzcnt_u32 (tzcnt): scans "a" upward from bit 0 and counts zero bits until the first set bit; yields 32 when "a" is zero. -->
<intrinsic tech='Other' rettype='unsigned int' name='_tzcnt_u32'>
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned int'/>
	<description>Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst".</description>
	<operation>
tmp := 0
dst := 0
DO WHILE ((tmp &lt; 32) AND a[tmp] = 0)
	tmp := tmp + 1
	dst := dst + 1
OD
	</operation>
	<instruction name='tzcnt' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _tzcnt_u64 (tzcnt): scans "a" upward from bit 0 and counts zero bits until the first set bit; yields 64 when "a" is zero. -->
<intrinsic tech='Other' rettype='unsigned __int64' name='_tzcnt_u64'>
	<type>Integer</type>
	<CPUID>BMI1</CPUID>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst".</description>
	<operation>
tmp := 0
dst := 0
DO WHILE ((tmp &lt; 64) AND a[tmp] = 0)
	tmp := tmp + 1
	dst := dst + 1
OD
	</operation>
	<instruction name='tzcnt' form='r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _xabort (xabort): forces an RTM abort that carries "imm8" in the EAX status; a no-op when no RTM transaction is active. -->
<intrinsic tech='Other' rettype='void' name='_xabort'>
	<CPUID>RTM</CPUID>
	<category>General Support</category>
	<parameter varname='imm8' type='const unsigned int'/>
	<description>
	Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX.
	Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction.
	</description>
	<operation>
IF RTM_ACTIVE = 0
	// nop
ELSE
	// restore architectural register state
	// discard memory updates performed in transaction
	// update EAX with status and imm8 value
	RTM_NEST_COUNT := 0
	RTM_ACTIVE := 0
	IF 64-bit Mode
		RIP := fallbackRIP
	ELSE
		EIP := fallbackEIP
	FI
FI
	</operation>
	<instruction name='xabort' form='imm'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _xbegin (xbegin): starts or nests into an RTM region; the fallback address is recorded only for the outermost XBEGIN, and exceeding the nest limit aborts. -->
<intrinsic tech='Other' rettype='unsigned int' name='_xbegin'>
	<CPUID>RTM</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>
	Specify the start of an RTM code region.
	If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution.
	On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction.
	</description>
	<operation>
IF RTM_NEST_COUNT &lt; MAX_RTM_NEST_COUNT
	RTM_NEST_COUNT := RTM_NEST_COUNT + 1
	IF RTM_NEST_COUNT = 1
		IF 64-bit Mode
			fallbackRIP := RIP + SignExtend(IMM)
		ELSE IF 32-bit Mode
			fallbackEIP := EIP + SignExtend(IMM)
		ELSE // 16-bit Mode
			fallbackEIP := (EIP + SignExtend(IMM)) AND 0x0000FFFF
		FI

		RTM_ACTIVE := 1
		// enter RTM execution, record register state, start tracking memory state
	FI
ELSE
	// RTM abort (see _xabort)
FI
	</operation>
	<instruction name='xbegin' form=''/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _xend (xend): closes one RTM nesting level; at the outermost level it attempts the commit and clears RTM_ACTIVE on success, otherwise aborts. -->
<intrinsic tech='Other' rettype='void' name='_xend'>
	<CPUID>RTM</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>
	Specify the end of an RTM code region.
	If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically.
	If the commit fails, the logical processor will perform an RTM abort.
	</description>
	<operation>
IF RTM_ACTIVE = 1
	RTM_NEST_COUNT := RTM_NEST_COUNT - 1
	IF RTM_NEST_COUNT = 0
		// try to commit transaction
		IF fail to commit transaction
			// RTM abort (see _xabort)
		ELSE
			RTM_ACTIVE := 0
		FI
	FI
FI
	</operation>
	<instruction name='xend' form=''/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _xtest (xtest): reports whether execution is currently inside an RTM or HLE transactional region; the result is 1 inside a transaction and 0 outside. -->
<intrinsic tech='Other' rettype='unsigned char' name='_xtest'>
	<CPUID>RTM</CPUID>
	<category>General Support</category>
	<parameter varname='' type='void'/>
	<description>Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise.</description>
	<operation>
IF (RTM_ACTIVE = 1 OR HLE_ACTIVE = 1)
	dst := 1
ELSE
	dst := 0
FI
	</operation>
	<instruction name='xtest' form=''/>
	<header>immintrin.h</header>
</intrinsic>
<!-- __rdtscp (rdtscp): returns the 64-bit time-stamp counter and writes the low 32 bits of IA32_TSC_AUX to the location "mem_addr". -->
<intrinsic tech='Other' rettype='unsigned __int64' name='__rdtscp'>
	<CPUID>RDTSCP</CPUID>
	<category>General Support</category>
	<parameter varname='mem_addr' type='unsigned int *'/>
	<description>Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr".</description>
	<operation>
dst[63:0] := TimeStampCounter
MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0]
	</operation>
	<instruction name='rdtscp' form=''/>
	<perfdata arch='Westmere' lat='9'/>
	<perfdata arch='Nehalem' lat='9'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bit_scan_forward (bsf): index of the lowest set bit of "a". dst is assigned after the scan loop so that an input with bit 0 set yields 0. -->
<intrinsic tech='Other' rettype='int' name='_bit_scan_forward'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='int'/>
	<description>Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined.</description>
	<operation>
tmp := 0
IF a = 0
	dst := undefined
ELSE
	DO WHILE ((tmp &lt; 32) AND a[tmp] = 0)
		tmp := tmp + 1
	OD
	dst := tmp
FI
	</operation>
	<instruction name='bsf' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bit_scan_reverse (bsr): index of the highest set bit of "a". dst is assigned after the scan loop so that an input with bit 31 set yields 31. -->
<intrinsic tech='Other' rettype='int' name='_bit_scan_reverse'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='int'/>
	<description>Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined.</description>
	<operation>
tmp := 31
IF a = 0
	dst := undefined
ELSE
	DO WHILE ((tmp &gt; 0) AND a[tmp] = 0)
		tmp := tmp - 1
	OD
	dst := tmp
FI
	</operation>
	<instruction name='bsr' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _BitScanForward (bsf): writes the index of the lowest set bit of "mask" to "index" and returns 1, or returns 0 for a zero mask. index and dst are assigned after the scan loop so a mask with bit 0 set is handled. -->
<intrinsic tech='Other' rettype='unsigned char' name='_BitScanForward'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='index' type='unsigned __int32*'/>
	<parameter varname='mask' type='unsigned __int32'/>
	<description>Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
	<operation>
tmp := 0
IF mask = 0
	dst := 0
ELSE
	DO WHILE ((tmp &lt; 32) AND mask[tmp] = 0)
		tmp := tmp + 1
	OD
	index := tmp
	dst := 1
FI
	</operation>
	<instruction name='bsf' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _BitScanReverse (bsr): writes the index of the highest set bit of "mask" to "index" and returns 1, or returns 0 for a zero mask. index and dst are assigned after the scan loop so a mask with bit 31 set is handled. -->
<intrinsic tech='Other' rettype='unsigned char' name='_BitScanReverse'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='index' type='unsigned __int32*'/>
	<parameter varname='mask' type='unsigned __int32'/>
	<description>Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
	<operation>
tmp := 31
IF mask = 0
	dst := 0
ELSE
	DO WHILE ((tmp &gt; 0) AND mask[tmp] = 0)
		tmp := tmp - 1
	OD
	index := tmp
	dst := 1
FI
	</operation>
	<instruction name='bsr' form='r32, r32'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _BitScanForward64 (bsf): writes the index of the lowest set bit of the 64-bit "mask" to "index" and returns 1, or returns 0 for a zero mask. index and dst are assigned after the scan loop so a mask with bit 0 set is handled. -->
<intrinsic tech='Other' rettype='unsigned char' name='_BitScanForward64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='index' type='unsigned __int32*'/>
	<parameter varname='mask' type='unsigned __int64'/>
	<description>Set "index" to the index of the lowest set bit in 64-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
	<operation>
tmp := 0
IF mask = 0
	dst := 0
ELSE
	DO WHILE ((tmp &lt; 64) AND mask[tmp] = 0)
		tmp := tmp + 1
	OD
	index := tmp
	dst := 1
FI
	</operation>
	<instruction name='bsf' form='r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _BitScanReverse64 (bsr): writes the index of the highest set bit of the 64-bit "mask" to "index" and returns 1, or returns 0 for a zero mask. The scan starts at bit 63, and index and dst are assigned after the loop so a mask with bit 63 set is handled. -->
<intrinsic tech='Other' rettype='unsigned char' name='_BitScanReverse64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='index' type='unsigned __int32*'/>
	<parameter varname='mask' type='unsigned __int64'/>
	<description>Set "index" to the index of the highest set bit in 64-bit integer "mask". If no bits are set in "mask", then set "dst" to 0, otherwise set "dst" to 1.</description>
	<operation>
tmp := 63
IF mask = 0
	dst := 0
ELSE
	DO WHILE ((tmp &gt; 0) AND mask[tmp] = 0)
		tmp := tmp - 1
	OD
	index := tmp
	dst := 1
FI
	</operation>
	<instruction name='bsr' form='r64, r64'/>
	<perfdata arch='Haswell' lat='3'/>
	<perfdata arch='Ivy Bridge' lat='3'/>
	<perfdata arch='Sandy Bridge' lat='3'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittest (bt): reads bit "b" of "a" without modifying it. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittest'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int32*'/>
	<parameter varname='b' type='__int32'/>
	<description>Return the bit at index "b" of 32-bit integer "a".</description>
	<operation>
dst := a[b]
	</operation>
	<instruction name='bt' form='r32, r32'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittestandcomplement (btc): returns the old value of bit "b" of "a", then inverts that bit. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittestandcomplement'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int32*'/>
	<parameter varname='b' type='__int32'/>
	<description>Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement.</description>
	<operation>
dst := a[b]
a[b] := ~a[b]
	</operation>
	<instruction name='btc' form='r32, r32'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittestandreset (btr): returns the old value of bit "b" of "a", then clears that bit. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittestandreset'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int32*'/>
	<parameter varname='b' type='__int32'/>
	<description>Return the bit at index "b" of 32-bit integer "a", and set that bit to zero.</description>
	<operation>
dst := a[b]
a[b] := 0
	</operation>
	<instruction name='btr' form='r32, r32'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittestandset (bts): returns the old value of bit "b" of "a", then sets that bit. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittestandset'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int32*'/>
	<parameter varname='b' type='__int32'/>
	<description>Return the bit at index "b" of 32-bit integer "a", and set that bit to one.</description>
	<operation>
dst := a[b]
a[b] := 1
	</operation>
	<instruction name='bts' form='r32, r32'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittest64 (bt): reads bit "b" of the 64-bit "a" without modifying it. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittest64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int64*'/>
	<parameter varname='b' type='__int64'/>
	<description>Return the bit at index "b" of 64-bit integer "a".</description>
	<operation>
dst := a[b]
	</operation>
	<instruction name='bt' form='r64, r64'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittestandcomplement64 (btc): returns the old value of bit "b" of the 64-bit "a", then inverts that bit. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittestandcomplement64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int64*'/>
	<parameter varname='b' type='__int64'/>
	<description>Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement.</description>
	<operation>
dst := a[b]
a[b] := ~a[b]
	</operation>
	<instruction name='btc' form='r64, r64'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittestandreset64 (btr): returns the old value of bit "b" of the 64-bit "a", then clears that bit. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittestandreset64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int64*'/>
	<parameter varname='b' type='__int64'/>
	<description>Return the bit at index "b" of 64-bit integer "a", and set that bit to zero.</description>
	<operation>
dst := a[b]
a[b] := 0
	</operation>
	<instruction name='btr' form='r64, r64'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bittestandset64 (bts): returns the old value of bit "b" of the 64-bit "a", then sets that bit. -->
<intrinsic tech='Other' rettype='unsigned char' name='_bittestandset64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int64*'/>
	<parameter varname='b' type='__int64'/>
	<description>Return the bit at index "b" of 64-bit integer "a", and set that bit to one.</description>
	<operation>
dst := a[b]
a[b] := 1
	</operation>
	<instruction name='bts' form='r64, r64'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bswap (bswap): mirrors the four bytes of "a", so byte 0 becomes byte 3 and so on. -->
<intrinsic tech='Other' rettype='int' name='_bswap'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='int'/>
	<description>Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values.</description>
	<operation>
dst[7:0] := a[31:24]
dst[15:8] := a[23:16]
dst[23:16] := a[15:8]
dst[31:24] := a[7:0]
	</operation>
	<instruction name='bswap' form='r32'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _bswap64 (bswap): mirrors the eight bytes of "a", so byte 0 becomes byte 7 and so on. -->
<intrinsic tech='Other' rettype='__int64' name='_bswap64'>
	<type>Integer</type>
	<category>Bit Manipulation</category>
	<parameter varname='a' type='__int64'/>
	<description>Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values.</description>
	<operation>
dst[7:0] := a[63:56]
dst[15:8] := a[55:48]
dst[23:16] := a[47:40]
dst[31:24] := a[39:32]
dst[39:32] := a[31:24]
dst[47:40] := a[23:16]
dst[55:48] := a[15:8]
dst[63:56] := a[7:0]
	</operation>
	<instruction name='bswap' form='r64'/>
	<perfdata arch='Haswell' lat='2'/>
	<perfdata arch='Ivy Bridge' lat='2'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='3'/>
	<perfdata arch='Nehalem' lat='3'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _castf32_u32: reinterprets a float as unsigned __int32; compile-time only, so no instruction or operation is listed. -->
<intrinsic tech='Other' rettype='unsigned __int32' name='_castf32_u32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<category>Cast</category>
	<parameter varname='a' type='float'/>
	<description>Cast from type float to type unsigned __int32 without conversion.
	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>immintrin.h</header>
</intrinsic>
<!-- _castf64_u64: reinterprets a double as unsigned __int64; compile-time only, so no instruction or operation is listed. -->
<intrinsic tech='Other' rettype='unsigned __int64' name='_castf64_u64'>
	<type>Floating Point</type>
	<type>Integer</type>
	<category>Cast</category>
	<parameter varname='a' type='double'/>
	<description>Cast from type double to type unsigned __int64 without conversion.
	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>immintrin.h</header>
</intrinsic>
<!-- _castu32_f32: reinterprets an unsigned __int32 as float; compile-time only, so no instruction or operation is listed. -->
<intrinsic tech='Other' rettype='float' name='_castu32_f32'>
	<type>Floating Point</type>
	<type>Integer</type>
	<category>Cast</category>
	<parameter varname='a' type='unsigned __int32'/>
	<description>Cast from type unsigned __int32 to type float without conversion.
	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>immintrin.h</header>
</intrinsic>
<!-- _castu64_f64: reinterprets an unsigned __int64 as double; compile-time only, so no instruction or operation is listed. -->
<intrinsic tech='Other' rettype='double' name='_castu64_f64'>
	<type>Floating Point</type>
	<type>Integer</type>
	<category>Cast</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Cast from type unsigned __int64 to type double without conversion.
	This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.</description>
	<header>immintrin.h</header>
</intrinsic>
<!-- _fxrstor (fxrstor): reloads x87 FPU, MMX, XMM and MXCSR state from the 512-byte, 16-byte-aligned image at "mem_addr". -->
<intrinsic tech='Other' rettype='void' name='_fxrstor'>
	<CPUID>FXSR</CPUID>
	<category>OS-Targeted</category>
	<parameter varname='mem_addr' type='void *'/>
	<description>Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary.</description>
	<operation>
(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
	</operation>
	<instruction name='fxrstor' form='MEMmfpxenv'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _fxrstor64 (fxrstor64): reloads x87 FPU, MMX, XMM and MXCSR state from the 512-byte, 16-byte-aligned image at "mem_addr" written by FXSAVE64. -->
<intrinsic tech='Other' rettype='void' name='_fxrstor64'>
	<CPUID>FXSR</CPUID>
	<category>OS-Targeted</category>
	<parameter varname='mem_addr' type='void *'/>
	<description>Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary.</description>
	<operation>
(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
	</operation>
	<instruction name='fxrstor64' form='MEMmfpxenv'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _fxsave (fxsave): writes x87 FPU, MMX, XMM and MXCSR state to the 512-byte region at "mem_addr". The operation uses the file's bit-offset MEM notation, so 512 bytes span bits 4095:0. -->
<intrinsic tech='Other' rettype='void' name='_fxsave'>
	<CPUID>FXSR</CPUID>
	<category>OS-Targeted</category>
	<parameter varname='mem_addr' type='void *'/>
	<description>Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.</description>
	<operation>
MEM[mem_addr+4095:mem_addr] := Fxsave(x87 FPU, MMX, XMM7-XMM0, MXCSR)
	</operation>
	<instruction name='fxsave' form='MEMmfpxenv'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _fxsave64 (fxsave64): writes x87 FPU, MMX, XMM and MXCSR state to the 512-byte region at "mem_addr". The operation uses the file's bit-offset MEM notation, so 512 bytes span bits 4095:0. -->
<intrinsic tech='Other' rettype='void' name='_fxsave64'>
	<CPUID>FXSR</CPUID>
	<category>OS-Targeted</category>
	<parameter varname='mem_addr' type='void *'/>
	<description>Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.</description>
	<operation>
MEM[mem_addr+4095:mem_addr] := Fxsave64(x87 FPU, MMX, XMM7-XMM0, MXCSR)
	</operation>
	<instruction name='fxsave64' form='MEMmfpxenv'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _lrotl (rol): rotates "a" left by (shift AND 63) positions, modelled one bit per loop iteration with bit 63 wrapping to bit 0. -->
<intrinsic tech='Other' rettype='unsigned long' name='_lrotl'>
	<type>Integer</type>
	<category>Shift</category>
	<parameter varname='a' type='unsigned long'/>
	<parameter varname='shift' type='int'/>
	<description>Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst".</description>
	<operation>
dst := a
count := shift BITWISE AND 63
DO WHILE (count &gt; 0)
	tmp[0] := dst[63]
	dst := (dst &lt;&lt; 1) OR tmp[0]
	count := count - 1
OD
	</operation>
	<instruction name='rol' form='r64, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _lrotr (ror): rotates "a" right by (shift AND 63) positions, modelled one bit per loop iteration with bit 0 wrapping to bit 63. -->
<intrinsic tech='Other' rettype='unsigned long' name='_lrotr'>
	<type>Integer</type>
	<category>Shift</category>
	<parameter varname='a' type='unsigned long'/>
	<parameter varname='shift' type='int'/>
	<description>Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst".</description>
	<operation>
dst := a
count := shift BITWISE AND 63
DO WHILE (count &gt; 0)
	tmp[63] := dst[0]
	dst := (dst &gt;&gt; 1) OR tmp[63]
	count := count - 1
OD
	</operation>
	<instruction name='ror' form='r64, imm'/>
	<perfdata arch='Haswell' lat='1'/>
	<perfdata arch='Ivy Bridge' lat='1'/>
	<perfdata arch='Sandy Bridge' lat='2'/>
	<perfdata arch='Westmere' lat='1'/>
	<perfdata arch='Nehalem' lat='1'/>
	<header>immintrin.h</header>
</intrinsic>
<!-- _allow_cpu_features: sequence intrinsic with no single instruction listed. The operation body is a list of the accepted _FEATURE_* flags, not pseudo-code. -->
<intrinsic tech='Other' sequence='true' rettype='void' name='_allow_cpu_features'>
	<category>General Support</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below:</description>
	<operation>
_FEATURE_GENERIC_IA32
_FEATURE_FPU
_FEATURE_CMOV
_FEATURE_MMX
_FEATURE_FXSAVE
_FEATURE_SSE
_FEATURE_SSE2
_FEATURE_SSE3
_FEATURE_SSSE3
_FEATURE_SSE4_1
_FEATURE_SSE4_2
_FEATURE_MOVBE
_FEATURE_POPCNT
_FEATURE_PCLMULQDQ
_FEATURE_AES
_FEATURE_F16C
_FEATURE_AVX
_FEATURE_RDRND
_FEATURE_FMA
_FEATURE_BMI
_FEATURE_LZCNT
_FEATURE_HLE
_FEATURE_RTM
_FEATURE_AVX2
_FEATURE_KNCNI
_FEATURE_AVX512F
_FEATURE_ADX
_FEATURE_RDSEED
_FEATURE_AVX512ER
_FEATURE_AVX512PF
_FEATURE_AVX512CD
_FEATURE_SHA
_FEATURE_MPX
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _may_i_use_cpu_feature: sequence intrinsic with no single instruction listed. The operation body is a list of the accepted _FEATURE_* flags, not pseudo-code. -->
<intrinsic tech='Other' sequence='true' rettype='int' name='_may_i_use_cpu_feature'>
	<category>General Support</category>
	<parameter varname='a' type='unsigned __int64'/>
	<description>Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below:</description>
	<operation>
_FEATURE_GENERIC_IA32
_FEATURE_FPU
_FEATURE_CMOV
_FEATURE_MMX
_FEATURE_FXSAVE
_FEATURE_SSE
_FEATURE_SSE2
_FEATURE_SSE3
_FEATURE_SSSE3
_FEATURE_SSE4_1
_FEATURE_SSE4_2
_FEATURE_MOVBE
_FEATURE_POPCNT
_FEATURE_PCLMULQDQ
_FEATURE_AES
_FEATURE_F16C
_FEATURE_AVX
_FEATURE_RDRND
_FEATURE_FMA
_FEATURE_BMI
_FEATURE_LZCNT
_FEATURE_HLE
_FEATURE_RTM
_FEATURE_AVX2
_FEATURE_KNCNI
_FEATURE_AVX512F
_FEATURE_ADX
_FEATURE_RDSEED
_FEATURE_AVX512ER
_FEATURE_AVX512PF
_FEATURE_AVX512CD
_FEATURE_SHA
_FEATURE_MPX
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_acos_pd (SVML sequence, no single instruction listed): applies ACOS to each of the two 64-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_acos_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_acos_ps (SVML sequence, no single instruction listed): applies ACOS to each of the four 32-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_acos_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_acosh_pd (SVML sequence, no single instruction listed): applies ACOSH to each of the two 64-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_acosh_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_acosh_ps (SVML sequence, no single instruction listed): applies ACOSH to each of the four 32-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_acosh_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_asin_pd (SVML sequence, no single instruction listed): applies ASIN to each of the two 64-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_asin_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_asin_ps (SVML sequence, no single instruction listed): applies ASIN to each of the four 32-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_asin_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<!-- _mm_asinh_pd (SVML sequence, no single instruction listed): applies ASINH to each of the two 64-bit lanes of "a"; dst bits from 128 upward are zeroed. -->
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_asinh_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_asinh_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_atan_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_atan_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_atan2_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_atan2_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_atanh_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_atanh_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cbrt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cbrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cdfnorm_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cdfnorm_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cdfnorminv_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cdfnorminv_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cexp_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
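	<description>Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements: the lower element is the real part and the upper element is the imaginary part.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+31:i] := e^(a[i+31:i]) * COS(a[i+63:i+32])
	dst[i+63:i+32] := e^(a[i+31:i]) * SIN(a[i+63:i+32])
ENDFOR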
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_clog_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
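	<description>Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements: the lower element is the real part and the upper element is the imaginary part.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+31:i] := ln(SQRT(a[i+31:i]^2 + a[i+63:i+32]^2))
	dst[i+63:i+32] := ATAN2(a[i+63:i+32], a[i+31:i])
ENDFOR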
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cos_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cos_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cosd_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cosd_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_cosh_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_cosh_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_csqrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
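	<description>Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements: the lower element is the real part and the upper element is the imaginary part.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	mag := SQRT(a[i+31:i]^2 + a[i+63:i+32]^2)
	dst[i+31:i] := SQRT((mag + a[i+31:i]) / 2.0)
	IF a[i+63:i+32] &lt; 0.0
		dst[i+63:i+32] := -SQRT((mag - a[i+31:i]) / 2.0)
	ELSE
		dst[i+63:i+32] := SQRT((mag - a[i+31:i]) / 2.0)
	FI
ENDFOR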
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epi64'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_div_epu64'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erf_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erf_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erfc_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erfc_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erfcinv_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InverseERFC(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erfcinv_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InverseERFC(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_erfinv_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InverseERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_erfinv_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Probability/Statistics</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InverseERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_exp_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_exp_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_exp10_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_exp10_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_exp2_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_exp2_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_expm1_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_expm1_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_hypot_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_hypot_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_idiv_epi32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_idivrem_epi32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='mem_addr' type='__m128i *'/>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_invcbrt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse (reciprocal) cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InvCubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_invcbrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse (reciprocal) cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InvCubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_invsqrt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the inverse (reciprocal) square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_invsqrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the inverse (reciprocal) square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_irem_epi32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log10_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log10_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log1p_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log1p_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_log2_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_log2_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_logb_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_logb_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_pow_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<parameter varname='b' type='__m128d'/>
	<description>Raise each packed double-precision (64-bit) floating-point element in "a" to the power of the corresponding packed element in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_pow_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<parameter varname='b' type='__m128'/>
	<description>Raise each packed single-precision (32-bit) floating-point element in "a" to the power of the corresponding packed element in "b", and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epi64'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu8'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu16'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu32'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128i' name='_mm_rem_epu64'>
	<type>Integer</type>
	<CPUID>SSE</CPUID>
	<category>Arithmetic</category>
	<parameter varname='a' type='__m128i'/>
	<parameter varname='b' type='__m128i'/>
	<description>Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sin_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sin_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sincos_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='mem_addr' type='__m128d *'/>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
	MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sincos_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='mem_addr' type='__m128 *'/>
	<parameter varname='a' type='__m128'/>
	<description>Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sind_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sind_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_sinh_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_sinh_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_ceil_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_ceil_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_floor_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_floor_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_round_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction.</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_round_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Special Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction.</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_svml_sqrt_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_svml_sqrt_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Elementary Math Functions</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_tan_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_tan_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<description>Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128d' name='_mm_tand_pd'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128d'/>
	<description>Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst".</description>
	<operation>
FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
	</operation>
	<header>immintrin.h</header>
</intrinsic>
<intrinsic tech='SVML' sequence='true' rettype='__m128' name='_mm_tand_ps'>
	<type>Floating Point</type>
	<CPUID>SSE</CPUID>
	<category>Trigonometry</category>
	<parameter varname='a' type='__m128'/>
	<descriptio