BenjaminPoulain/gist:4457933

## gistfile1.diff
diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
index 48379ba..27872ed 100644
--- a/Source/WebCore/ChangeLog
+++ b/Source/WebCore/ChangeLog
@@ -1,3 +1,31 @@
+2013-01-04  Benjamin Poulain  <benjamin@webkit.org>
+
+        Optimize TransformationMatrix::multiply() for x86_64
+        https://bugs.webkit.org/show_bug.cgi?id=105719
+
+        Reviewed by NOBODY (OOPS!).
+
+        On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
+        We can use that in two ways to optimize matrix multiplications:
+        -Keep the source matrix completely in registers. Write the result directly in
+         the source matrix's memory. This avoids the memcpy at the end of the multiplication
+         and various memory operations.
+        -Use SIMD with SSE to perform 2 operations at a time.
+
+        The parameters from the second matrix are loaded one by one into XMM registers.
+        Loading them with SSE and then shuffling the values performs worse than loading
+        them one by one.
+
+        This is only enabled on 64-bit builds, as 32-bit x86 only has access to 8 XMM
+        registers and the function would need to be written differently for it.
+
+        On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.
+
+        * platform/graphics/transforms/TransformationMatrix.cpp:
+        (WebCore::TransformationMatrix::multiply):
+        * platform/graphics/transforms/TransformationMatrix.h:
+        (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
+
 2013-01-04  Zoltan Horvath  <zoltan@webkit.org>

         [CSS Regions] @region rules inside media queries are ignored
diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
index adcb48b..bfbc355 100644
--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
@@ -36,6 +36,10 @@
 #include <wtf/Assertions.h>
 #include <wtf/MathExtras.h>

+#if CPU(X86_64)
+#include <emmintrin.h>
+#endif
+
 using namespace std;

 namespace WebCore {
@@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con
                                 to.y() - from.y());
 }

-//
-// *this = mat * *this
-//
+// this = mat * this.
 TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
 {
 #if CPU(APPLE_ARMV7S)
@@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix&
     }
 #undef MATRIX_MULTIPLY_ONE_LINE

+#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
+    // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
+    __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
+    __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
+    __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
+    __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
+
+    // First row.
+    __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
+    __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
+    __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
+    __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
+
+    // output00 and output01.
+    __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
+    __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
+    __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
+    __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[0][0], accumulator);
+
+    // output02 and output03.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[0][2], accumulator);
+
+    // Second row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
+
+    // output10 and output11.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[1][0], accumulator);
+
+    // output12 and output13.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[1][2], accumulator);
+
+    // Third row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
+
+    // output20 and output21.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[2][0], accumulator);
+
+    // output22 and output23.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[2][2], accumulator);
+
+    // Fourth row.
+    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
+    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
+    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
+    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
+
+    // output30 and output31.
+    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[3][0], accumulator);
+
+    // output32 and output33.
+    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+    accumulator = _mm_add_pd(accumulator, temp1);
+    accumulator = _mm_add_pd(accumulator, temp2);
+    accumulator = _mm_add_pd(accumulator, temp3);
+    _mm_store_pd(&m_matrix[3][2], accumulator);
 #else
     Matrix4 tmp;

diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
index 181e033..f324c35 100644
--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
@@ -69,10 +69,14 @@ class LayoutRect;
 class FloatRect;
 class FloatQuad;

+#if CPU(X86_64) && !PLATFORM(WINDOWS)
+#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
+#endif
+
 class TransformationMatrix {
     WTF_MAKE_FAST_ALLOCATED;
 public:
-#if CPU(APPLE_ARMV7S)
+#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
     typedef double Matrix4[4][4] __attribute__((aligned (16)));
 #else
     typedef double Matrix4[4][4];
@@ -226,7 +230,7 @@ public:
     double f() const { return m_matrix[3][1]; }
     void setF(double f) { m_matrix[3][1] = f; }

-    // this = this * mat
+    // this = mat * this.
     TransformationMatrix& multiply(const TransformationMatrix&);

     TransformationMatrix& scale(double);
	diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
	index 48379ba..27872ed 100644
	--- a/Source/WebCore/ChangeLog
	+++ b/Source/WebCore/ChangeLog
	@@ -1,3 +1,31 @@
	+2013-01-04 Benjamin Poulain <benjamin@webkit.org>
	+
	+ Optimize TransformationMatrix::multiply() for x86_64
	+ https://bugs.webkit.org/show_bug.cgi?id=105719
	+
	+ Reviewed by NOBODY (OOPS!).
	+
	+ On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
	+ We can use that in two ways to optimize matrix multiplications:
	+ -Keep the source matrix completely in registers. Write the result directly in
	+ the source matrix's memory. This avoids the memcpy at the end of the multiplication
	+ and various memory operations.
	+ -Use SIMD with SSE to perform 2 operations at a time.
	+
	+ The parameters from the second matrix are loaded one by one into XMM registers.
	+ Loading them with SSE and then shuffling the values performs worse than loading
	+ them one by one.
	+
	+ This is only enabled on 64-bit builds, as 32-bit x86 only has access to 8 XMM
	+ registers and the function would need to be written differently for it.
	+
	+ On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.
	+
	+ * platform/graphics/transforms/TransformationMatrix.cpp:
	+ (WebCore::TransformationMatrix::multiply):
	+ * platform/graphics/transforms/TransformationMatrix.h:
	+ (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
	+
	2013-01-04 Zoltan Horvath <zoltan@webkit.org>

	[CSS Regions] @region rules inside media queries are ignored
	diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
	index adcb48b..bfbc355 100644
	--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
	+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
	@@ -36,6 +36,10 @@
	#include <wtf/Assertions.h>
	#include <wtf/MathExtras.h>

	+#if CPU(X86_64)
	+#include <emmintrin.h>
	+#endif
	+
	using namespace std;

	namespace WebCore {
	@@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con
	to.y() - from.y());
	}

	-//
	-// *this = mat * *this
	-//
	+// this = mat * this.
	TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
	{
	#if CPU(APPLE_ARMV7S)
	@@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix&
	}
	#undef MATRIX_MULTIPLY_ONE_LINE

	+#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
	+ // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
	+ __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
	+ __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
	+ __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
	+ __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
	+
	+ // First row.
	+ __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
	+ __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
	+ __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
	+ __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
	+
	+ // output00 and output01.
	+ __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
	+ __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
	+ __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
	+ __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[0][0], accumulator);
	+
	+ // output02 and output03.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[0][2], accumulator);
	+
	+ // Second row.
	+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
	+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
	+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
	+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
	+
	+ // output10 and output11.
	+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[1][0], accumulator);
	+
	+ // output12 and output13.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[1][2], accumulator);
	+
	+ // Third row.
	+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
	+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
	+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
	+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
	+
	+ // output20 and output21.
	+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[2][0], accumulator);
	+
	+ // output22 and output23.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[2][2], accumulator);
	+
	+ // Fourth row.
	+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
	+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
	+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
	+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
	+
	+ // output30 and output31.
	+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[3][0], accumulator);
	+
	+ // output32 and output33.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[3][2], accumulator);
	#else
	Matrix4 tmp;

	diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
	index 181e033..f324c35 100644
	--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
	+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
	@@ -69,10 +69,14 @@ class LayoutRect;
	class FloatRect;
	class FloatQuad;

	+#if CPU(X86_64) && !PLATFORM(WINDOWS)
	+#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
	+#endif
	+
	class TransformationMatrix {
	WTF_MAKE_FAST_ALLOCATED;
	public:
	-#if CPU(APPLE_ARMV7S)
	+#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
	typedef double Matrix4[4][4] __attribute__((aligned (16)));
	#else
	typedef double Matrix4[4][4];
	@@ -226,7 +230,7 @@ public:
	double f() const { return m_matrix[3][1]; }
	void setF(double f) { m_matrix[3][1] = f; }

	- // this = this * mat
	+ // this = mat * this.
	TransformationMatrix& multiply(const TransformationMatrix&);

	TransformationMatrix& scale(double);