Skip to content

Instantly share code, notes, and snippets.

@laruence
Last active February 14, 2018 01:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save laruence/a4517072f30b6ca6e93a80cd6d9b3e7b to your computer and use it in GitHub Desktop.
Save laruence/a4517072f30b6ca6e93a80cd6d9b3e7b to your computer and use it in GitHub Desktop.
diff --git a/ext/opcache/ZendAccelerator.c b/ext/opcache/ZendAccelerator.c
index 1c870e9..1c2c41f 100644
--- a/ext/opcache/ZendAccelerator.c
+++ b/ext/opcache/ZendAccelerator.c
@@ -1358,10 +1358,10 @@ static zend_persistent_script *store_script_in_file_cache(zend_persistent_script
memory_used = zend_accel_script_persist_calc(new_persistent_script, NULL, 0, 0);
/* Allocate memory block */
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
- ZCG(mem) = zend_arena_alloc(&CG(arena), memory_used + 64);
- ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + 63L) & ~63L);
+ ZCG(mem) = zend_arena_alloc(&CG(arena), memory_used + FAST_COPY_PAD);
+ ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + FAST_COPY_MASK) & ~FAST_COPY_MASK);
#else
ZCG(mem) = zend_arena_alloc(&CG(arena), memory_used);
#endif
@@ -1465,12 +1465,12 @@ static zend_persistent_script *cache_script_in_shared_memory(zend_persistent_scr
memory_used = zend_accel_script_persist_calc(new_persistent_script, key, key_length, 1);
/* Allocate shared memory */
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
- ZCG(mem) = zend_shared_alloc(memory_used + 64);
+ ZCG(mem) = zend_shared_alloc(memory_used + FAST_COPY_PAD);
if (ZCG(mem)) {
- memset(ZCG(mem), 0, memory_used + 64);
- ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + 63L) & ~63L);
+ memset(ZCG(mem), 0, memory_used + FAST_COPY_PAD); /* clear the whole allocation, alignment padding included */
+ ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + FAST_COPY_MASK) & ~FAST_COPY_MASK);
}
#else
ZCG(mem) = zend_shared_alloc(memory_used);
diff --git a/ext/opcache/zend_accelerator_util_funcs.c b/ext/opcache/zend_accelerator_util_funcs.c
index bd46a86..5acbc88 100644
--- a/ext/opcache/zend_accelerator_util_funcs.c
+++ b/ext/opcache/zend_accelerator_util_funcs.c
@@ -596,8 +596,86 @@ static void zend_accel_class_hash_copy(HashTable *target, HashTable *source, uni
return;
}
-#ifdef __SSE2__
-# include <mmintrin.h>
+#if defined(__AVX__)
+# include <immintrin.h>
+/* AVX fast_memcpy(): copies `size` bytes from src to dest, 128 bytes per loop pass,
+ * with aligned 256-bit loads/stores, so src and dest must be 32-byte aligned and
+ * size must be a non-zero multiple of 128 (a partial tail is never handled).
+ * Variants: inline asm for GCC on i386 / x86-64, intrinsics for everything else. */
+# if defined(__GNUC__) && defined(__i386__)
+static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
+{
+ size_t delta = (char*)dest - (char*)src;
+
+ __asm__ volatile (
+ ".align 16\n\t"
+ ".LL0%=:\n\t"
+ "prefetchnta 0x40(%1)\n\t"
+ "vmovaps (%1), %%ymm0\n\t"
+ "vmovaps 0x20(%1), %%ymm1\n\t"
+ "vmovaps 0x40(%1), %%ymm2\n\t"
+ "vmovaps 0x60(%1), %%ymm3\n\t"
+ "vmovaps %%ymm0, (%1,%2)\n\t"
+ "vmovaps %%ymm1, 0x20(%1,%2)\n\t"
+ "vmovaps %%ymm2, 0x40(%1,%2)\n\t"
+ "vmovaps %%ymm3, 0x60(%1,%2)\n\t"
+ "addl $0x80, %1\n\t"
+ "subl $0x80, %0\n\t"
+ "ja .LL0%="
+ : "+r"(size),
+ "+r"(src)
+ : "r"(delta)
+ : "cc", "memory", "%ymm0", "%ymm1", "%ymm2", "%ymm3");
+}
+# elif defined(__GNUC__) && defined(__x86_64__)
+static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
+{
+ size_t delta = (char*)dest - (char*)src;
+
+ __asm__ volatile (
+ ".align 16\n\t"
+ ".LL0%=:\n\t"
+ "prefetchnta 0x40(%1)\n\t"
+ "vmovaps (%1), %%ymm0\n\t"
+ "vmovaps 0x20(%1), %%ymm1\n\t"
+ "vmovaps 0x40(%1), %%ymm2\n\t"
+ "vmovaps 0x60(%1), %%ymm3\n\t"
+ "vmovaps %%ymm0, (%1,%2)\n\t"
+ "vmovaps %%ymm1, 0x20(%1,%2)\n\t"
+ "vmovaps %%ymm2, 0x40(%1,%2)\n\t"
+ "vmovaps %%ymm3, 0x60(%1,%2)\n\t"
+ "addq $0x80, %1\n\t"
+ "subq $0x80, %0\n\t"
+ "ja .LL0%="
+ : "+r"(size),
+ "+r"(src)
+ : "r"(delta)
+ : "cc", "memory", "%ymm0", "%ymm1", "%ymm2", "%ymm3");
+}
+# else
+static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
+{
+ __m256i *dqdest = (__m256i*)dest;
+ const __m256i *dqsrc = (const __m256i*)src;
+ const __m256i *end = (const __m256i*)((const char*)src + size); /* loop stops only on exact equality */
+
+ do {
+ _mm_prefetch((const char *)(dqsrc + 4), _MM_HINT_NTA); /* const char* form is valid for every compiler, PHP_WIN32 included */
+
+ __m256i ymm0 = _mm256_load_si256(dqsrc + 0); /* integer-typed aligned loads match the __m256i locals */
+ __m256i ymm1 = _mm256_load_si256(dqsrc + 1);
+ __m256i ymm2 = _mm256_load_si256(dqsrc + 2);
+ __m256i ymm3 = _mm256_load_si256(dqsrc + 3);
+ dqsrc += 4;
+ _mm256_store_si256(dqdest + 0, ymm0);
+ _mm256_store_si256(dqdest + 1, ymm1);
+ _mm256_store_si256(dqdest + 2, ymm2);
+ _mm256_store_si256(dqdest + 3, ymm3);
+ dqdest += 4;
+ } while (dqsrc != end);
+}
+# endif
+#elif defined(__SSE2__)
# include <emmintrin.h>
# if defined(__GNUC__) && defined(__i386__)
static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
@@ -691,11 +769,11 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
ZCG(current_persistent_script) = persistent_script;
ZCG(arena_mem) = NULL;
if (EXPECTED(persistent_script->arena_size)) {
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Target address must be aligned to 64-byte boundary */
_mm_prefetch(persistent_script->arena_mem, _MM_HINT_NTA);
- ZCG(arena_mem) = zend_arena_alloc(&CG(arena), persistent_script->arena_size + 64);
- ZCG(arena_mem) = (void*)(((zend_uintptr_t)ZCG(arena_mem) + 63L) & ~63L);
+ ZCG(arena_mem) = zend_arena_alloc(&CG(arena), persistent_script->arena_size + FAST_COPY_PAD);
+ ZCG(arena_mem) = (void*)(((zend_uintptr_t)ZCG(arena_mem) + FAST_COPY_MASK) & ~FAST_COPY_MASK);
fast_memcpy(ZCG(arena_mem), persistent_script->arena_mem, persistent_script->arena_size);
#else
ZCG(arena_mem) = zend_arena_alloc(&CG(arena), persistent_script->arena_size);
diff --git a/ext/opcache/zend_accelerator_util_funcs.h b/ext/opcache/zend_accelerator_util_funcs.h
index 5d7834a..8fb7acd 100644
--- a/ext/opcache/zend_accelerator_util_funcs.h
+++ b/ext/opcache/zend_accelerator_util_funcs.h
@@ -41,6 +41,14 @@ unsigned int zend_adler32(unsigned int checksum, signed char *buf, uint32_t len)
unsigned int zend_accel_script_checksum(zend_persistent_script *persistent_script);
+#if defined(__AVX__)
+#define FAST_COPY_PAD 128L
+#define FAST_COPY_MASK 127L
+#elif defined(__SSE2__)
+#define FAST_COPY_PAD 64L
+#define FAST_COPY_MASK 63L
+#endif
+
#endif /* ZEND_ACCELERATOR_UTIL_FUNCS_H */
/*
diff --git a/ext/opcache/zend_file_cache.c b/ext/opcache/zend_file_cache.c
index de54949..a2c985c 100644
--- a/ext/opcache/zend_file_cache.c
+++ b/ext/opcache/zend_file_cache.c
@@ -809,10 +809,9 @@ int zend_file_cache_script_store(zend_persistent_script *script, int in_shm)
return FAILURE;
}
-#ifdef __SSE2__
- /* Align to 64-byte boundary */
- mem = emalloc(script->size + 64);
- buf = (void*)(((zend_uintptr_t)mem + 63L) & ~63L);
+#if defined(__AVX__) || defined(__SSE2__)
+ mem = emalloc(script->size + FAST_COPY_PAD);
+ buf = (void*)(((zend_uintptr_t)mem + FAST_COPY_MASK) & ~FAST_COPY_MASK);
#else
mem = buf = emalloc(script->size);
#endif
@@ -1391,10 +1390,9 @@ zend_persistent_script *zend_file_cache_script_load(zend_file_handle *file_handl
}
checkpoint = zend_arena_checkpoint(CG(arena));
-#ifdef __SSE2__
- /* Align to 64-byte boundary */
- mem = zend_arena_alloc(&CG(arena), info.mem_size + info.str_size + 64);
- mem = (void*)(((zend_uintptr_t)mem + 63L) & ~63L);
+#if defined(__AVX__) || defined(__SSE2__)
+ mem = zend_arena_alloc(&CG(arena), info.mem_size + info.str_size + FAST_COPY_PAD);
+ mem = (void*)(((zend_uintptr_t)mem + FAST_COPY_MASK) & ~FAST_COPY_MASK);
#else
mem = zend_arena_alloc(&CG(arena), info.mem_size + info.str_size);
#endif
@@ -1452,10 +1450,9 @@ zend_persistent_script *zend_file_cache_script_load(zend_file_handle *file_handl
goto use_process_mem;
}
-#ifdef __SSE2__
- /* Align to 64-byte boundary */
- buf = zend_shared_alloc(info.mem_size + 64);
- buf = (void*)(((zend_uintptr_t)buf + 63L) & ~63L);
+#if defined(__AVX__) || defined(__SSE2__)
+ buf = zend_shared_alloc(info.mem_size + FAST_COPY_PAD);
+ buf = (void*)(((zend_uintptr_t)buf + FAST_COPY_MASK) & ~FAST_COPY_MASK);
#else
buf = zend_shared_alloc(info.mem_size);
#endif
diff --git a/ext/opcache/zend_persist.c b/ext/opcache/zend_persist.c
index 8c16538..7476a7c 100644
--- a/ext/opcache/zend_persist.c
+++ b/ext/opcache/zend_persist.c
@@ -27,6 +27,7 @@
#include "zend_vm.h"
#include "zend_constants.h"
#include "zend_operators.h"
+#include "zend_accelerator_util_funcs.h"
#define zend_accel_store(p, size) \
(p = _zend_shared_memdup((void*)p, size, 1))
@@ -870,9 +871,9 @@ zend_persistent_script *zend_accel_script_persist(zend_persistent_script *script
}
zend_accel_store_interned_string(script->script.filename);
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align to 64-byte boundary */
- ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + 63L) & ~63L);
+ ZCG(mem) = (void*)(((zend_uintptr_t)ZCG(mem) + FAST_COPY_MASK) & ~FAST_COPY_MASK);
#else
ZEND_ASSERT(((zend_uintptr_t)ZCG(mem) & 0x7) == 0); /* should be 8 byte aligned */
#endif
diff --git a/ext/opcache/zend_persist_calc.c b/ext/opcache/zend_persist_calc.c
index eb80232..c925d43 100644
--- a/ext/opcache/zend_persist_calc.c
+++ b/ext/opcache/zend_persist_calc.c
@@ -25,6 +25,7 @@
#include "zend_extensions.h"
#include "zend_shared_alloc.h"
#include "zend_operators.h"
+#include "zend_accelerator_util_funcs.h"
#define ADD_DUP_SIZE(m,s) ZCG(current_persistent_script)->size += zend_shared_memdup_size((void*)m, s)
#define ADD_SIZE(m) ZCG(current_persistent_script)->size += ZEND_ALIGNED_SIZE(m)
@@ -415,9 +416,9 @@ uint32_t zend_accel_script_persist_calc(zend_persistent_script *new_persistent_s
}
ADD_STRING(new_persistent_script->script.filename);
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align size to 64-byte boundary */
- new_persistent_script->size = (new_persistent_script->size + 63) & ~63;
+ new_persistent_script->size = (new_persistent_script->size + FAST_COPY_MASK) & ~FAST_COPY_MASK;
#endif
if (new_persistent_script->script.class_table.nNumUsed != new_persistent_script->script.class_table.nNumOfElements) {
@@ -430,9 +431,9 @@ uint32_t zend_accel_script_persist_calc(zend_persistent_script *new_persistent_s
zend_hash_persist_calc(&new_persistent_script->script.function_table, zend_persist_op_array_calc);
zend_persist_op_array_calc_ex(&new_persistent_script->script.main_op_array);
-#ifdef __SSE2__
+#if defined(__AVX__) || defined(__SSE2__)
/* Align size to 64-byte boundary */
- new_persistent_script->arena_size = (new_persistent_script->arena_size + 63) & ~63;
+ new_persistent_script->arena_size = (new_persistent_script->arena_size + FAST_COPY_MASK) & ~FAST_COPY_MASK;
#endif
new_persistent_script->size += new_persistent_script->arena_size;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment