Skip to content

Instantly share code, notes, and snippets.

@BenjaminPoulain
Created January 4, 2013 22:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BenjaminPoulain/4457933 to your computer and use it in GitHub Desktop.
Save BenjaminPoulain/4457933 to your computer and use it in GitHub Desktop.
SSE2 TransformationMatrix
diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
index 48379ba..27872ed 100644
--- a/Source/WebCore/ChangeLog
+++ b/Source/WebCore/ChangeLog
@@ -1,3 +1,31 @@
+2013-01-04 Benjamin Poulain <benjamin@webkit.org>
+
+ Optimize TransformationMatrix::multiply() for x86_64
+ https://bugs.webkit.org/show_bug.cgi?id=105719
+
+ Reviewed by NOBODY (OOPS!).
+
+ On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
+ We can use that in two ways to optimize matrix multiplications:
+ -Keep the source matrix completely in registers. Write the result directly in
+ the source matrix's memory. This avoids the memcpy at the end of the multiplication
+ and various memory operations.
+ -Use SIMD with SSE to perform 2 operations at a time.
+
+ The parameters from the second matrix are loaded one by one in XMM registers.
+ Loading them with SSE and then shuffling the values performs worse than loading
+ them one by one.
+
+ This is only enabled on 64-bit, as 32-bit x86 only has access to 8 XMM registers and
+ the function would have to be written differently.
+
+ On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.
+
+ * platform/graphics/transforms/TransformationMatrix.cpp:
+ (WebCore::TransformationMatrix::multiply):
+ * platform/graphics/transforms/TransformationMatrix.h:
+ (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
+
2013-01-04 Zoltan Horvath <zoltan@webkit.org>
[CSS Regions] @region rules inside media queries are ignored
diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
index adcb48b..bfbc355 100644
--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
@@ -36,6 +36,10 @@
#include <wtf/Assertions.h>
#include <wtf/MathExtras.h>
+#if CPU(X86_64)
+#include <emmintrin.h>
+#endif
+
using namespace std;
namespace WebCore {
@@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con
to.y() - from.y());
}
-//
-// *this = mat * *this
-//
+// this = mat * this.
TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
{
#if CPU(APPLE_ARMV7S)
@@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix&
}
#undef MATRIX_MULTIPLY_ONE_LINE
+#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
+ // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
+ __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
+ __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
+ __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
+ __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
+
+ // First row.
+ __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
+ __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
+ __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
+ __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
+
+ // output00 and output01.
+ __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
+ __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
+ __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
+ __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[0][0], accumulator);
+
+ // output02 and output03.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[0][2], accumulator);
+
+ // Second row.
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
+
+ // output10 and output11.
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[1][0], accumulator);
+
+ // output12 and output13.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[1][2], accumulator);
+
+ // Third row.
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
+
+ // output20 and output21.
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[2][0], accumulator);
+
+ // output22 and output23.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[2][2], accumulator);
+
+ // Fourth row.
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
+
+ // output30 and output31.
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[3][0], accumulator);
+
+ // output32 and output33.
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
+
+ accumulator = _mm_add_pd(accumulator, temp1);
+ accumulator = _mm_add_pd(accumulator, temp2);
+ accumulator = _mm_add_pd(accumulator, temp3);
+ _mm_store_pd(&m_matrix[3][2], accumulator);
#else
Matrix4 tmp;
diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
index 181e033..f324c35 100644
--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
@@ -69,10 +69,14 @@ class LayoutRect;
class FloatRect;
class FloatQuad;
+#if CPU(X86_64) && !PLATFORM(WINDOWS)
+#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
+#endif
+
class TransformationMatrix {
WTF_MAKE_FAST_ALLOCATED;
public:
-#if CPU(APPLE_ARMV7S)
+#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
typedef double Matrix4[4][4] __attribute__((aligned (16)));
#else
typedef double Matrix4[4][4];
@@ -226,7 +230,7 @@ public:
double f() const { return m_matrix[3][1]; }
void setF(double f) { m_matrix[3][1] = f; }
- // this = this * mat
+ // this = mat * this.
TransformationMatrix& multiply(const TransformationMatrix&);
TransformationMatrix& scale(double);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment