12,13c12,15 | |
< #include "ggml-cpu/unary-ops.h" | |
< #include "ggml-cpu/binary-ops.h" | |
--- | |
> #include "unary-ops.h" | |
> #include "binary-ops.h" | |
> #include "vec.h" | |
> #include "ops.h" | |
86,109d87 | |
< #if defined(GGML_USE_ACCELERATE) | |
< #include <Accelerate/Accelerate.h> | |
< #endif | |
< | |
< // floating point type used to accumulate sums | |
< typedef double ggml_float; | |
< | |
< #define GGML_GELU_FP16 | |
< #define GGML_GELU_QUICK_FP16 | |
< | |
< #define GGML_SOFT_MAX_UNROLL 4 | |
< #define GGML_VEC_DOT_UNROLL 2 | |
< #define GGML_VEC_MAD_UNROLL 32 | |
< | |
< // | |
< // global data | |
< // | |
< | |
< // precomputed gelu table for f16 (128 KB) | |
< static ggml_fp16_t ggml_table_gelu_f16[1 << 16]; | |
< | |
< // precomputed quick gelu table for f16 (128 KB) | |
< static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; | |
< | |
233,255d210 | |
< // | |
< // cache line | |
< // | |
< | |
< #if defined(__cpp_lib_hardware_interference_size) | |
< #define CACHE_LINE_SIZE hardware_destructive_interference_size | |
< #else | |
< #if defined(__POWER9_VECTOR__) | |
< #define CACHE_LINE_SIZE 128 | |
< #elif defined(__VXE__) || defined(__VXE2__) | |
< #define CACHE_LINE_SIZE 256 | |
< #else | |
< #define CACHE_LINE_SIZE 64 | |
< #endif | |
< #endif | |
< | |
< static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); | |
< | |
< | |
< static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); | |
< static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); | |
< static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); | |
< | |
428,1308d382 | |
< // simd mappings | |
< // | |
< | |
< // we define a common set of C macros which map to specific intrinsics based on the current architecture | |
< // we then implement the fundamental computation operations below using only these macros | |
< // adding support for new architectures requires to define the corresponding SIMD macros | |
< // | |
< // GGML_F32_STEP / GGML_F16_STEP | |
< // number of elements to process in a single step | |
< // | |
< // GGML_F32_EPR / GGML_F16_EPR | |
< // number of elements to fit in a single register | |
< // | |
< | |
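As the comment above says, porting the CPU backend to a new architecture boils down to supplying this macro set. Purely as an illustration (not part of the patch -- the real scalar path simply leaves GGML_SIMD undefined and uses plain loops), a hypothetical one-float-per-"register" mapping would be shaped like this:

    #define GGML_SIMD
    #define GGML_F32_STEP 4                       // elements consumed per unrolled step
    #define GGML_F32_EPR  1                       // elements per "register"
    #define GGML_F32_VEC              float
    #define GGML_F32_VEC_ZERO         0.0f
    #define GGML_F32_VEC_SET1(x)      (x)
    #define GGML_F32_VEC_LOAD(p)      (*(p))
    #define GGML_F32_VEC_STORE(p, r)  (*(p) = (r))
    #define GGML_F32_VEC_FMA(a, b, c) ((a) + (b)*(c))
    #define GGML_F32_VEC_ADD(a, b)    ((a) + (b))
    #define GGML_F32_VEC_MUL(a, b)    ((a) * (b))
    #define GGML_F32_VEC_REDUCE(res, x) { (res) = (x)[0] + (x)[1] + (x)[2] + (x)[3]; }

Every real mapping below follows the same template, just with wider vector types and the architecture's intrinsics.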
< #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 NEON | |
< | |
< #define GGML_F32_STEP 16 | |
< #define GGML_F32_EPR 4 | |
< | |
< #define GGML_F32x4 float32x4_t | |
< #define GGML_F32x4_ZERO vdupq_n_f32(0.0f) | |
< #define GGML_F32x4_SET1(x) vdupq_n_f32(x) | |
< #define GGML_F32x4_LOAD vld1q_f32 | |
< #define GGML_F32x4_STORE vst1q_f32 | |
< #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) | |
< #define GGML_F32x4_ADD vaddq_f32 | |
< #define GGML_F32x4_MUL vmulq_f32 | |
< #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) | |
< #define GGML_F32x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ | |
< } \ | |
< (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \ | |
< } | |
< | |
< #define GGML_F32_VEC GGML_F32x4 | |
< #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x4_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< // F16 NEON | |
< | |
< #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) | |
< #define GGML_F16_STEP 32 | |
< #define GGML_F16_EPR 8 | |
< | |
< #define GGML_F16x8 float16x8_t | |
< #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) | |
< #define GGML_F16x8_SET1(x) vdupq_n_f16(x) | |
< #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x)) | |
< #define GGML_F16x8_STORE vst1q_f16 | |
< #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) | |
< #define GGML_F16x8_ADD vaddq_f16 | |
< #define GGML_F16x8_MUL vmulq_f16 | |
< #define GGML_F16x8_REDUCE(res, x) \ | |
< do { \ | |
< int offset = GGML_F16_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ | |
< } \ | |
< const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ | |
< const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ | |
< (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ | |
< } while (0) | |
< | |
< #define GGML_F16_VEC GGML_F16x8 | |
< #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) | |
< #define GGML_F16_VEC_FMA GGML_F16x8_FMA | |
< #define GGML_F16_VEC_ADD GGML_F16x8_ADD | |
< #define GGML_F16_VEC_MUL GGML_F16x8_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE | |
< #else | |
< // if FP16 vector arithmetic is not supported, we use FP32 instead | |
< // and take advantage of the vcvt_ functions to convert to/from FP16 | |
< | |
< #define GGML_F16_STEP 16 | |
< #define GGML_F16_EPR 4 | |
< | |
< #define GGML_F32Cx4 float32x4_t | |
< #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) | |
< #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) | |
< #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) | |
< #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) | |
< #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) | |
< #define GGML_F32Cx4_ADD vaddq_f32 | |
< #define GGML_F32Cx4_MUL vmulq_f32 | |
< #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE | |
< | |
< #define GGML_F16_VEC GGML_F32Cx4 | |
< #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE | |
< #endif | |
< | |
< #elif defined(__AVX512F__) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 AVX512 | |
< | |
< #define GGML_F32_STEP 64 | |
< #define GGML_F32_EPR 16 | |
< | |
< #define GGML_F32x16 __m512 | |
< #define GGML_F32x16_ZERO _mm512_setzero_ps() | |
< #define GGML_F32x16_SET1(x) _mm512_set1_ps(x) | |
< #define GGML_F32x16_LOAD _mm512_loadu_ps | |
< #define GGML_F32x16_STORE _mm512_storeu_ps | |
< // _mm512_fmadd_ps is defined in AVX512F so no guard is required | |
< #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) | |
< #define GGML_F32x16_ADD _mm512_add_ps | |
< #define GGML_F32x16_MUL _mm512_mul_ps | |
< #define GGML_F32x16_REDUCE(res, x) \ | |
< do { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm512_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm512_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm512_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ | |
< } while (0) | |
< | |
< // TODO: is this optimal ? | |
< | |
< #define GGML_F32_VEC GGML_F32x16 | |
< #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x16_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x16_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x16_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x16_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x16_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE | |
< | |
< // F16 AVX512 | |
< | |
< // F16 AVX | |
< | |
< #define GGML_F16_STEP 64 | |
< #define GGML_F16_EPR 16 | |
< | |
< // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead | |
< | |
< #define GGML_F32Cx16 __m512 | |
< #define GGML_F32Cx16_ZERO _mm512_setzero_ps() | |
< #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) | |
< | |
< // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F | |
< // so F16C guard isn't required | |
< #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) | |
< #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) | |
< | |
< #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) | |
< #define GGML_F32Cx16_ADD _mm512_add_ps | |
< #define GGML_F32Cx16_MUL _mm512_mul_ps | |
< #define GGML_F32Cx16_REDUCE(res, x) \ | |
< do { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm512_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm512_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm512_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ | |
< } while (0) | |
< | |
< #define GGML_F16_VEC GGML_F32Cx16 | |
< #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL | |
< | |
< #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE | |
< #elif defined(__AVX__) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 AVX | |
< | |
< #define GGML_F32_STEP 32 | |
< #define GGML_F32_EPR 8 | |
< | |
< #define GGML_F32x8 __m256 | |
< #define GGML_F32x8_ZERO _mm256_setzero_ps() | |
< #define GGML_F32x8_SET1(x) _mm256_set1_ps(x) | |
< #define GGML_F32x8_LOAD _mm256_loadu_ps | |
< #define GGML_F32x8_STORE _mm256_storeu_ps | |
< #if defined(__FMA__) | |
< #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) | |
< #else | |
< #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) | |
< #endif | |
< #define GGML_F32x8_ADD _mm256_add_ps | |
< #define GGML_F32x8_MUL _mm256_mul_ps | |
< #define GGML_F32x8_REDUCE(res, x) \ | |
< do { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm256_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm256_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm256_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ | |
< _mm256_extractf128_ps(x[0], 1)); \ | |
< const __m128 t1 = _mm_hadd_ps(t0, t0); \ | |
< res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ | |
< } while (0) | |
< // TODO: is this optimal ? | |
< | |
< #define GGML_F32_VEC GGML_F32x8 | |
< #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x8_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x8_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x8_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x8_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x8_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE | |
< | |
< // F16 AVX | |
< | |
< #define GGML_F16_STEP 32 | |
< #define GGML_F16_EPR 8 | |
< | |
< // F16 arithmetic is not supported by AVX, so we use F32 instead | |
< | |
< #define GGML_F32Cx8 __m256 | |
< #define GGML_F32Cx8_ZERO _mm256_setzero_ps() | |
< #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) | |
< | |
< #if defined(__F16C__) | |
< // the _mm256_cvt intrinsics require F16C | |
< #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) | |
< #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) | |
< #else | |
< static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { | |
< float tmp[8]; | |
< | |
< for (int i = 0; i < 8; i++) { | |
< tmp[i] = GGML_FP16_TO_FP32(x[i]); | |
< } | |
< | |
< return _mm256_loadu_ps(tmp); | |
< } | |
< static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { | |
< float arr[8]; | |
< | |
< _mm256_storeu_ps(arr, y); | |
< | |
< for (int i = 0; i < 8; i++) | |
< x[i] = GGML_FP32_TO_FP16(arr[i]); | |
< } | |
< #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) | |
< #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) | |
< #endif | |
< | |
< #define GGML_F32Cx8_FMA GGML_F32x8_FMA | |
< #define GGML_F32Cx8_ADD _mm256_add_ps | |
< #define GGML_F32Cx8_MUL _mm256_mul_ps | |
< #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE | |
< | |
< #define GGML_F16_VEC GGML_F32Cx8 | |
< #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE | |
< | |
< #elif defined(__POWER9_VECTOR__) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 POWER9 | |
< | |
< #define GGML_F32_STEP 32 | |
< #define GGML_F32_EPR 4 | |
< | |
< #define GGML_F32x4 vector float | |
< #define GGML_F32x4_ZERO 0.0f | |
< #define GGML_F32x4_SET1 vec_splats | |
< #define GGML_F32x4_LOAD(p) vec_xl(0, p) | |
< #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) | |
< #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) | |
< #define GGML_F32x4_ADD vec_add | |
< #define GGML_F32x4_MUL vec_mul | |
< #define GGML_F32x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = vec_add(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = vec_add(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = vec_add(x[i], x[offset+i]); \ | |
< } \ | |
< res = vec_extract(x[0], 0) + \ | |
< vec_extract(x[0], 1) + \ | |
< vec_extract(x[0], 2) + \ | |
< vec_extract(x[0], 3); \ | |
< } | |
< | |
< #define GGML_F32_VEC GGML_F32x4 | |
< #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x4_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< // F16 POWER9 | |
< #define GGML_F16_STEP GGML_F32_STEP | |
< #define GGML_F16_EPR GGML_F32_EPR | |
< #define GGML_F16_VEC GGML_F32x4 | |
< #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F16_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE | |
< // Use vec_xl, not vec_ld, in case the load address is not aligned. | |
< #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ | |
< vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ | |
< vec_extract_fp32_from_shortl(vec_xl(0, p)) | |
< #define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] | |
< #define GGML_F16_VEC_STORE(p, r, i) \ | |
< if (i & 0x1) \ | |
< vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ | |
< r[i - GGML_ENDIAN_BYTE(0)]), \ | |
< 0, p - GGML_F16_EPR) | |
< | |
< #elif defined(__wasm_simd128__) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 WASM | |
< | |
< #define GGML_F32_STEP 16 | |
< #define GGML_F32_EPR 4 | |
< | |
< #define GGML_F32x4 v128_t | |
< #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) | |
< #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) | |
< #define GGML_F32x4_LOAD wasm_v128_load | |
< #define GGML_F32x4_STORE wasm_v128_store | |
< #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) | |
< #define GGML_F32x4_ADD wasm_f32x4_add | |
< #define GGML_F32x4_MUL wasm_f32x4_mul | |
< #define GGML_F32x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ | |
< } \ | |
< res = wasm_f32x4_extract_lane(x[0], 0) + \ | |
< wasm_f32x4_extract_lane(x[0], 1) + \ | |
< wasm_f32x4_extract_lane(x[0], 2) + \ | |
< wasm_f32x4_extract_lane(x[0], 3); \ | |
< } | |
< | |
< #define GGML_F32_VEC GGML_F32x4 | |
< #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x4_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< // F16 WASM | |
< | |
< #define GGML_F16_STEP 16 | |
< #define GGML_F16_EPR 4 | |
< | |
< inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { | |
< float tmp[4]; | |
< | |
< tmp[0] = GGML_FP16_TO_FP32(p[0]); | |
< tmp[1] = GGML_FP16_TO_FP32(p[1]); | |
< tmp[2] = GGML_FP16_TO_FP32(p[2]); | |
< tmp[3] = GGML_FP16_TO_FP32(p[3]); | |
< | |
< return wasm_v128_load(tmp); | |
< } | |
< | |
< inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { | |
< float tmp[4]; | |
< | |
< wasm_v128_store(tmp, x); | |
< | |
< p[0] = GGML_FP32_TO_FP16(tmp[0]); | |
< p[1] = GGML_FP32_TO_FP16(tmp[1]); | |
< p[2] = GGML_FP32_TO_FP16(tmp[2]); | |
< p[3] = GGML_FP32_TO_FP16(tmp[3]); | |
< } | |
< | |
< #define GGML_F16x4 v128_t | |
< #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) | |
< #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) | |
< #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) | |
< #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) | |
< #define GGML_F16x4_FMA GGML_F32x4_FMA | |
< #define GGML_F16x4_ADD wasm_f32x4_add | |
< #define GGML_F16x4_MUL wasm_f32x4_mul | |
< #define GGML_F16x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F16_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ | |
< } \ | |
< res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \ | |
< wasm_f32x4_extract_lane(x[0], 1) + \ | |
< wasm_f32x4_extract_lane(x[0], 2) + \ | |
< wasm_f32x4_extract_lane(x[0], 3)); \ | |
< } | |
< | |
< #define GGML_F16_VEC GGML_F16x4 | |
< #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F16x4_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F16x4_FMA | |
< #define GGML_F16_VEC_ADD GGML_F16x4_ADD | |
< #define GGML_F16_VEC_MUL GGML_F16x4_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE | |
< | |
< #elif defined(__SSE3__) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 SSE | |
< | |
< #define GGML_F32_STEP 32 | |
< #define GGML_F32_EPR 4 | |
< | |
< #define GGML_F32x4 __m128 | |
< #define GGML_F32x4_ZERO _mm_setzero_ps() | |
< #define GGML_F32x4_SET1(x) _mm_set1_ps(x) | |
< #define GGML_F32x4_LOAD _mm_loadu_ps | |
< #define GGML_F32x4_STORE _mm_storeu_ps | |
< #if defined(__FMA__) | |
< // TODO: Does this work? | |
< #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) | |
< #else | |
< #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) | |
< #endif | |
< #define GGML_F32x4_ADD _mm_add_ps | |
< #define GGML_F32x4_MUL _mm_mul_ps | |
< #define GGML_F32x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = _mm_add_ps(x[i], x[offset+i]); \ | |
< } \ | |
< const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ | |
< res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ | |
< } | |
< // TODO: is this optimal ? | |
< | |
< #define GGML_F32_VEC GGML_F32x4 | |
< #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x4_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< // F16 SSE | |
< | |
< #define GGML_F16_STEP 32 | |
< #define GGML_F16_EPR 4 | |
< | |
< static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { | |
< float tmp[4]; | |
< | |
< tmp[0] = GGML_FP16_TO_FP32(x[0]); | |
< tmp[1] = GGML_FP16_TO_FP32(x[1]); | |
< tmp[2] = GGML_FP16_TO_FP32(x[2]); | |
< tmp[3] = GGML_FP16_TO_FP32(x[3]); | |
< | |
< return _mm_loadu_ps(tmp); | |
< } | |
< | |
< static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { | |
< float arr[4]; | |
< | |
< _mm_storeu_ps(arr, y); | |
< | |
< x[0] = GGML_FP32_TO_FP16(arr[0]); | |
< x[1] = GGML_FP32_TO_FP16(arr[1]); | |
< x[2] = GGML_FP32_TO_FP16(arr[2]); | |
< x[3] = GGML_FP32_TO_FP16(arr[3]); | |
< } | |
< | |
< #define GGML_F32Cx4 __m128 | |
< #define GGML_F32Cx4_ZERO _mm_setzero_ps() | |
< #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) | |
< #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) | |
< #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) | |
< #define GGML_F32Cx4_FMA GGML_F32x4_FMA | |
< #define GGML_F32Cx4_ADD _mm_add_ps | |
< #define GGML_F32Cx4_MUL _mm_mul_ps | |
< #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE | |
< | |
< #define GGML_F16_VEC GGML_F32Cx4 | |
< #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE | |
< | |
< #elif defined(__loongarch_asx) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 LASX | |
< #define GGML_F32_STEP 32 | |
< #define GGML_F32_EPR 8 | |
< | |
< #define GGML_F32x8 __m256 | |
< #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0) | |
< #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x)) | |
< #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0) | |
< #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0) | |
< #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a) | |
< #define GGML_F32x8_ADD __lasx_xvfadd_s | |
< #define GGML_F32x8_MUL __lasx_xvfmul_s | |
< #define GGML_F32x8_REDUCE(res, x) \ | |
< do { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \ | |
< } \ | |
< float *tmp_p = (float *)&x[0]; \ | |
< res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \ | |
< } while (0) | |
< // TODO: is this optimal ? | |
< | |
< #define GGML_F32_VEC GGML_F32x8 | |
< #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x8_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x8_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x8_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x8_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x8_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE | |
< | |
< // F16 LASX | |
< | |
< #define GGML_F16_STEP 32 | |
< #define GGML_F16_EPR 8 | |
< | |
< // F16 arithmetic is not supported by LASX, so we use F32 instead | |
< | |
< #define GGML_F32Cx8 __m256 | |
< #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) | |
< #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) | |
< | |
< static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { | |
< __m256i a; | |
< memcpy(&a, x, sizeof(ggml_fp16_t) * 8); | |
< a = __lasx_xvpermi_d(a, 0 | (1 << 4)); | |
< return __lasx_xvfcvtl_s_h(a); | |
< } | |
< | |
< static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { | |
< __m256i a = __lasx_xvfcvt_h_s(y, y); | |
< a = __lasx_xvpermi_d(a, 0 | (2 << 2)); | |
< memcpy(x, &a, sizeof(ggml_fp16_t) * 8); | |
< } | |
< #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) | |
< #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y) | |
< | |
< #define GGML_F32Cx8_FMA GGML_F32x8_FMA | |
< #define GGML_F32Cx8_ADD __lasx_xvfadd_s | |
< #define GGML_F32Cx8_MUL __lasx_xvfmul_s | |
< #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE | |
< | |
< #define GGML_F16_VEC GGML_F32Cx8 | |
< #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE | |
< | |
< #elif defined(__loongarch_sx) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 LSX | |
< | |
< #define GGML_F32_STEP 32 | |
< #define GGML_F32_EPR 4 | |
< | |
< #define GGML_F32x4 __m128 | |
< #define GGML_F32x4_ZERO __lsx_vldi(0) | |
< #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) | |
< #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) | |
< #define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) | |
< #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) | |
< #define GGML_F32x4_ADD __lsx_vfadd_s | |
< #define GGML_F32x4_MUL __lsx_vfmul_s | |
< #define GGML_F32x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ | |
< } \ | |
< __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \ | |
< tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \ | |
< tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ | |
< const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ | |
< tmp = __lsx_vsrli_d((__m128i) t0, 32); \ | |
< tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \ | |
< tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ | |
< res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ | |
< } | |
< | |
< #define GGML_F32_VEC GGML_F32x4 | |
< #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x4_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< // F16 LSX | |
< | |
< #define GGML_F16_STEP 32 | |
< #define GGML_F16_EPR 4 | |
< | |
< static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { | |
< float tmp[4]; | |
< | |
< tmp[0] = GGML_FP16_TO_FP32(x[0]); | |
< tmp[1] = GGML_FP16_TO_FP32(x[1]); | |
< tmp[2] = GGML_FP16_TO_FP32(x[2]); | |
< tmp[3] = GGML_FP16_TO_FP32(x[3]); | |
< | |
< return __lsx_vld(tmp, 0); | |
< } | |
< | |
< static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { | |
< float arr[4]; | |
< | |
< __lsx_vst(y, arr, 0); | |
< | |
< x[0] = GGML_FP32_TO_FP16(arr[0]); | |
< x[1] = GGML_FP32_TO_FP16(arr[1]); | |
< x[2] = GGML_FP32_TO_FP16(arr[2]); | |
< x[3] = GGML_FP32_TO_FP16(arr[3]); | |
< } | |
< | |
< #define GGML_F32Cx4 __m128 | |
< #define GGML_F32Cx4_ZERO __lsx_vldi(0) | |
< #define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) | |
< #define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x) | |
< #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y) | |
< #define GGML_F32Cx4_FMA GGML_F32x4_FMA | |
< #define GGML_F32Cx4_ADD __lsx_vfadd_s | |
< #define GGML_F32Cx4_MUL __lsx_vfmul_s | |
< #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE | |
< | |
< #define GGML_F16_VEC GGML_F32Cx4 | |
< #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE | |
< | |
< #elif defined(__VXE__) || defined(__VXE2__) | |
< | |
< #define GGML_SIMD | |
< | |
< // F32 s390x | |
< | |
< #define GGML_F32_STEP 32 | |
< #define GGML_F32_EPR 4 | |
< | |
< #define GGML_F32x4 __vector float | |
< #define GGML_F32x4_ZERO vec_splats(0.0f) | |
< #define GGML_F32x4_SET1 vec_splats | |
< #define GGML_F32x4_LOAD(p) vec_xl(0, p) | |
< #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) | |
< #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) | |
< #define GGML_F32x4_ADD vec_add | |
< #define GGML_F32x4_MUL vec_mul | |
< #define GGML_F32x4_REDUCE(res, x) \ | |
< { \ | |
< int offset = GGML_F32_ARR >> 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = vec_add(x[i], x[offset + i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = vec_add(x[i], x[offset + i]); \ | |
< } \ | |
< offset >>= 1; \ | |
< for (int i = 0; i < offset; ++i) { \ | |
< x[i] = vec_add(x[i], x[offset + i]); \ | |
< } \ | |
< res = vec_extract(x[0], 0) + \ | |
< vec_extract(x[0], 1) + \ | |
< vec_extract(x[0], 2) + \ | |
< vec_extract(x[0], 3); \ | |
< } | |
< | |
< #define GGML_F32_VEC GGML_F32x4 | |
< #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F32_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32_VEC_STORE GGML_F32x4_STORE | |
< #define GGML_F32_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F32_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F32_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< // F16 s390x | |
< #define GGML_F16_STEP GGML_F32_STEP | |
< #define GGML_F16_EPR GGML_F32_EPR | |
< | |
< static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { | |
< float tmp[4]; | |
< | |
< for (int i = 0; i < 4; i++) { | |
< tmp[i] = GGML_FP16_TO_FP32(x[i]); | |
< } | |
< | |
< return vec_xl(0, tmp); | |
< } | |
< | |
< static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { | |
< float arr[4]; | |
< | |
< vec_xst(y, 0, arr); | |
< | |
< for (int i = 0; i < 4; i++) { | |
< x[i] = GGML_FP32_TO_FP16(arr[i]); | |
< } | |
< } | |
< | |
< #define GGML_F16_VEC GGML_F32x4 | |
< #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO | |
< #define GGML_F16_VEC_SET1 GGML_F32x4_SET1 | |
< #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p) | |
< #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i]) | |
< #define GGML_F16_VEC_FMA GGML_F32x4_FMA | |
< #define GGML_F16_VEC_ADD GGML_F32x4_ADD | |
< #define GGML_F16_VEC_MUL GGML_F32x4_MUL | |
< #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE | |
< | |
< #endif | |
< | |
< // GGML_F32_ARR / GGML_F16_ARR | |
< // number of registers to use per step | |
< #ifdef GGML_SIMD | |
< #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) | |
< #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) | |
< #endif | |
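To make the relationship concrete with the AVX mapping above: one __m256 holds GGML_F32_EPR = 8 floats, each unrolled step consumes GGML_F32_STEP = 32 of them, so GGML_F32_ARR = 32/8 = 4 independent accumulators stay live per step. That is why the *_REDUCE macros halve the accumulator array (4 -> 2 -> 1; the third halving pass is a no-op) before the final horizontal sum. The NEON and AVX-512 F32 mappings work out to the same GGML_F32_ARR of 4 (16/4 and 64/16 respectively).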
< | |
< // | |
1407,2404d480 | |
< // | |
< // fundamental operations | |
< // | |
< | |
< inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | |
< inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | |
< | |
< inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | |
< inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } | |
< | |
< inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | |
< inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } | |
< inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } | |
< inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { | |
< for (int i = 0; i < n; ++i) { | |
< z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i])); | |
< } | |
< } | |
< inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } | |
< inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } | |
< inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } | |
< inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } | |
< inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { | |
< for (int i = 0; i < n; ++i) { | |
< z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i])); | |
< } | |
< } | |
< inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } | |
< inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } | |
< inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } | |
< inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i])); | |
< } | |
< } | |
< | |
< inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } | |
< inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { | |
< for (int i = 0; i < n; ++i) { | |
< z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i])); | |
< } | |
< } | |
< inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } | |
< inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { | |
< for (int i = 0; i < n; ++i) { | |
< z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i])); | |
< } | |
< } | |
< | |
< static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { | |
< assert(nrc == 1); | |
< UNUSED(nrc); | |
< UNUSED(bx); | |
< UNUSED(by); | |
< UNUSED(bs); | |
< | |
< #if defined(GGML_SIMD) | |
< float sumf = 0.0f; | |
< const int np = (n & ~(GGML_F32_STEP - 1)); | |
< | |
< GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; | |
< | |
< GGML_F32_VEC ax[GGML_F32_ARR]; | |
< GGML_F32_VEC ay[GGML_F32_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F32_STEP) { | |
< for (int j = 0; j < GGML_F32_ARR; j++) { | |
< ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); | |
< ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); | |
< | |
< sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); | |
< } | |
< } | |
< | |
< // reduce sum0..sum3 to sum0 | |
< GGML_F32_VEC_REDUCE(sumf, sum); | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< sumf += x[i]*y[i]; | |
< } | |
< #else | |
< // scalar | |
< ggml_float sumf = 0.0; | |
< for (int i = 0; i < n; ++i) { | |
< sumf += (ggml_float)(x[i]*y[i]); | |
< } | |
< #endif | |
< | |
< *s = sumf; | |
< } | |
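A minimal usage sketch (hypothetical buffers; nrc must be 1, and the stride arguments are unused in that case). The mask n & ~(GGML_F32_STEP - 1) rounds n down to a multiple of the step size, so with the AVX mapping (GGML_F32_STEP = 32) and n = 100 the SIMD loop covers 96 elements and the scalar "leftovers" loop handles the last 4:

    float a[100], b[100], dot;
    for (int i = 0; i < 100; ++i) { a[i] = 1.0f; b[i] = 2.0f; }
    ggml_vec_dot_f32(100, &dot, 0, a, 0, b, 0, 1);  // dot == 200.0f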
< | |
< static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { | |
< assert(nrc == 1); | |
< UNUSED(nrc); | |
< UNUSED(bx); | |
< UNUSED(by); | |
< UNUSED(bs); | |
< int i = 0; | |
< ggml_float sumf = 0; | |
< | |
< #if defined(__AVX512BF16__) | |
< __m512 c1 = _mm512_setzero_ps(); | |
< __m512 c2 = _mm512_setzero_ps(); | |
< for (; i + 64 <= n; i += 64) { | |
< c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), | |
< m512bh(_mm512_loadu_si512((y + i)))); | |
< c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), | |
< m512bh(_mm512_loadu_si512((y + i + 32)))); | |
< } | |
< sumf += (ggml_float)_mm512_reduce_add_ps(c1); | |
< sumf += (ggml_float)_mm512_reduce_add_ps(c2); | |
< | |
< #elif defined(__AVX512F__) | |
< #define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) | |
< __m512 c1 = _mm512_setzero_ps(); | |
< __m512 c2 = _mm512_setzero_ps(); | |
< for (; i + 32 <= n; i += 32) { | |
< c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); | |
< c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); | |
< } | |
< sumf += (ggml_float)_mm512_reduce_add_ps(c1); | |
< sumf += (ggml_float)_mm512_reduce_add_ps(c2); | |
< | |
< #undef LOAD | |
< #elif defined(__AVX2__) || defined(__AVX__) | |
< #if defined(__AVX2__) | |
< #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) | |
< #else | |
< #define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1)) | |
< #endif | |
< __m256 c1 = _mm256_setzero_ps(); | |
< __m256 c2 = _mm256_setzero_ps(); | |
< __m256 c3 = _mm256_setzero_ps(); | |
< __m256 c4 = _mm256_setzero_ps(); | |
< for (; i + 32 <= n; i += 32) { | |
< c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); | |
< c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); | |
< c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); | |
< c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); | |
< } | |
< __m128 g; | |
< c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), | |
< _mm256_add_ps(c2, c4)); | |
< g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), | |
< _mm256_castps256_ps128(c1)); | |
< g = _mm_add_ps(g, _mm_movehl_ps(g, g)); | |
< g = _mm_add_ss(g, _mm_movehdup_ps(g)); | |
< sumf += (ggml_float)_mm_cvtss_f32(g); | |
< | |
< #undef LOAD | |
< #endif | |
< | |
< for (; i < n; ++i) { | |
< sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * | |
< GGML_BF16_TO_FP32(y[i])); | |
< } | |
< *s = sumf; | |
< } | |
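The AVX-512F and AVX/AVX2 fallbacks above rely on bf16 being exactly the upper 16 bits of an IEEE-754 binary32: each LOAD widens the 16-bit values and shifts them into the high half of a float lane. A scalar sketch of the same conversion (the helper name is illustrative, not part of ggml):

    static inline float bf16_bits_to_f32(uint16_t h) {
        union { uint32_t u; float f; } v;
        v.u = (uint32_t) h << 16;  // the bf16 pattern becomes the top half of a binary32
        return v.f;
    }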
< | |
< static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) { | |
< assert(nrc == 1); | |
< UNUSED(nrc); | |
< UNUSED(bx); | |
< UNUSED(by); | |
< UNUSED(bs); | |
< | |
< ggml_float sumf = 0.0; | |
< | |
< #if defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F16_STEP - 1)); | |
< | |
< GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; | |
< | |
< GGML_F16_VEC ax[GGML_F16_ARR]; | |
< GGML_F16_VEC ay[GGML_F16_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F16_STEP) { | |
< for (int j = 0; j < GGML_F16_ARR; j++) { | |
< ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); | |
< ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); | |
< | |
< sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); | |
< } | |
< } | |
< | |
< // reduce sum0..sum3 to sum0 | |
< GGML_F16_VEC_REDUCE(sumf, sum); | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); | |
< } | |
< #else | |
< for (int i = 0; i < n; ++i) { | |
< sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); | |
< } | |
< #endif | |
< | |
< *s = sumf; | |
< } | |
< | |
< // compute GGML_VEC_DOT_UNROLL dot products at once | |
< // xs - x row stride in bytes | |
< inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { | |
< ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; | |
< | |
< ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; | |
< | |
< for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { | |
< x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); | |
< } | |
< | |
< #if defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F16_STEP - 1)); | |
< | |
< GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; | |
< | |
< GGML_F16_VEC ax[GGML_F16_ARR]; | |
< GGML_F16_VEC ay[GGML_F16_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F16_STEP) { | |
< for (int j = 0; j < GGML_F16_ARR; j++) { | |
< ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); | |
< | |
< for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { | |
< ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); | |
< | |
< sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); | |
< } | |
< } | |
< } | |
< | |
< // reduce sum0..sum3 to sum0 | |
< for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { | |
< GGML_F16_VEC_REDUCE(sumf[k], sum[k]); | |
< } | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { | |
< sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); | |
< } | |
< } | |
< #else | |
< for (int i = 0; i < n; ++i) { | |
< for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { | |
< sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); | |
< } | |
< } | |
< #endif | |
< | |
< for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { | |
< s[i] = sumf[i]; | |
< } | |
< } | |
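Here xs is a byte stride: row k of x starts at (char *) xv + k*xs, and GGML_VEC_DOT_UNROLL (2, per the define above) dot products against the same y are produced per call. A rough usage sketch with hypothetical buffers:

    ggml_fp16_t xrows[2][64];  // two rows stored back to back
    ggml_fp16_t yvec[64];
    float s[2];
    ggml_vec_dot_f16_unroll(64, sizeof(xrows[0]), s, xrows, yvec);
    // s[0] = dot(xrows[0], yvec), s[1] = dot(xrows[1], yvec)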
< | |
< inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { | |
< #if defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F32_STEP - 1)); | |
< | |
< GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); | |
< | |
< GGML_F32_VEC ax[GGML_F32_ARR]; | |
< GGML_F32_VEC ay[GGML_F32_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F32_STEP) { | |
< for (int j = 0; j < GGML_F32_ARR; j++) { | |
< ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); | |
< ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); | |
< ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); | |
< | |
< GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); | |
< } | |
< } | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< y[i] += x[i]*v; | |
< } | |
< #else | |
< // scalar | |
< for (int i = 0; i < n; ++i) { | |
< y[i] += x[i]*v; | |
< } | |
< #endif | |
< } | |
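This is the classic axpy update, y := y + v*x, applied in place on y:

    float x8[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    float y8[8] = {0};
    ggml_vec_mad_f32(8, y8, x8, 0.5f);  // every y8[i] is now 0.5f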
< | |
< inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { | |
< #if defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F16_STEP - 1)); | |
< | |
< GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); | |
< | |
< GGML_F16_VEC ax[GGML_F16_ARR]; | |
< GGML_F16_VEC ay[GGML_F16_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F16_STEP) { | |
< for (int j = 0; j < GGML_F16_ARR; j++) { | |
< ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); | |
< ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); | |
< ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); | |
< | |
< GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); | |
< } | |
< } | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); | |
< } | |
< #else | |
< // scalar | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); | |
< } | |
< #endif | |
< } | |
< | |
< // xs and vs are byte strides of x and v | |
< inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { | |
< | |
< const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; | |
< const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; | |
< | |
< for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { | |
< x[i] = (const float *) ((const char *) xv + i*xs); | |
< v[i] = (const float *) ((const char *) vv + i*vs); | |
< } | |
< | |
< #if defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F32_STEP - 1)); | |
< | |
< GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; | |
< | |
< for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { | |
< vx[k] = GGML_F32_VEC_SET1(v[k][0]); | |
< } | |
< | |
< GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; | |
< GGML_F32_VEC ay[GGML_F32_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F32_STEP) { | |
< for (int j = 0; j < GGML_F32_ARR; j++) { | |
< ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); | |
< | |
< for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { | |
< ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); | |
< ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); | |
< } | |
< | |
< GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); | |
< } | |
< } | |
< | |
< // leftovers | |
< for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { | |
< for (int i = np; i < n; ++i) { | |
< y[i] += x[k][i]*v[k][0]; | |
< } | |
< } | |
< #else | |
< // scalar | |
< for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] += x[k][i]*v[k][0]; | |
< } | |
< } | |
< #endif | |
< } | |
< | |
< //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } | |
< inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { | |
< #if defined(GGML_USE_ACCELERATE) | |
< vDSP_vsmul(y, 1, &v, y, 1, n); | |
< #elif defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F32_STEP - 1)); | |
< | |
< GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); | |
< | |
< GGML_F32_VEC ay[GGML_F32_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F32_STEP) { | |
< for (int j = 0; j < GGML_F32_ARR; j++) { | |
< ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); | |
< ay[j] = GGML_F32_VEC_MUL(ay[j], vx); | |
< | |
< GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); | |
< } | |
< } | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< y[i] *= v; | |
< } | |
< #else | |
< // scalar | |
< for (int i = 0; i < n; ++i) { | |
< y[i] *= v; | |
< } | |
< #endif | |
< } | |
< | |
< inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { | |
< #if defined(GGML_SIMD) | |
< const int np = (n & ~(GGML_F16_STEP - 1)); | |
< | |
< GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); | |
< | |
< GGML_F16_VEC ay[GGML_F16_ARR]; | |
< | |
< for (int i = 0; i < np; i += GGML_F16_STEP) { | |
< for (int j = 0; j < GGML_F16_ARR; j++) { | |
< ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); | |
< ay[j] = GGML_F16_VEC_MUL(ay[j], vx); | |
< | |
< GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); | |
< } | |
< } | |
< | |
< // leftovers | |
< for (int i = np; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); | |
< } | |
< #else | |
< // scalar | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); | |
< } | |
< #endif | |
< } | |
< | |
< inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } | |
< inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } | |
< inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< float v = GGML_FP16_TO_FP32(x[i]); | |
< y[i] = GGML_FP32_TO_FP16(v*v); | |
< } | |
< } | |
< inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } | |
< inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } | |
< inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } | |
< inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } | |
< inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } | |
< inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } | |
< inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< float v = GGML_FP16_TO_FP32(x[i]); | |
< y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); | |
< } | |
< } | |
< inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } | |
< inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); | |
< } | |
< } | |
< inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } | |
< inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } | |
< inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } | |
< inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< float v = GGML_FP16_TO_FP32(x[i]); | |
< y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f); | |
< } | |
< } | |
< inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } | |
< inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { | |
< for (int i = 0; i < n; ++i) { | |
< float v = GGML_FP16_TO_FP32(x[i]); | |
< y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); | |
< } | |
< } | |
< inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } | |
< inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i])))); | |
< } | |
< } | |
< // TODO: optimize performance | |
< inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } | |
< inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< float v = GGML_FP16_TO_FP32(x[i]); | |
< y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); | |
< } | |
< } | |
< inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } | |
< inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); | |
< } | |
< } | |
< inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } | |
< inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i]))); | |
< } | |
< } | |
< | |
< static const float GELU_COEF_A = 0.044715f; | |
< static const float GELU_QUICK_COEF = -1.702f; | |
< static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; | |
< | |
< inline static float ggml_gelu_f32(float x) { | |
< return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); | |
< } | |
< | |
< inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< const uint16_t * i16 = (const uint16_t *) x; | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = ggml_table_gelu_f16[i16[i]]; | |
< } | |
< } | |
< | |
< #ifdef GGML_GELU_FP16 | |
< inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { | |
< uint16_t t; | |
< for (int i = 0; i < n; ++i) { | |
< if (x[i] <= -10.0f) { | |
< y[i] = 0.0f; | |
< } else if (x[i] >= 10.0f) { | |
< y[i] = x[i]; | |
< } else { | |
< ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); | |
< memcpy(&t, &fp16, sizeof(uint16_t)); | |
< y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); | |
< } | |
< } | |
< } | |
< #else | |
< inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = ggml_gelu_f32(x[i]); | |
< } | |
< } | |
< #endif | |
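The two table-backed paths above index ggml_table_gelu_f16 directly with the raw f16 bit pattern of the input, so every possible half-precision value has a precomputed GELU result. As a hedged illustration only (this is not the library's actual initialization code), a table of that shape could be filled once at startup by walking all 2^16 bit patterns; ggml_gelu_f32 and the FP16 conversion macros are the ones already defined in this file and its headers:

#include <stdint.h>
#include <string.h>

// sketch: tabulate GELU for every possible f16 bit pattern, so that a lookup
// by bit pattern (as done in ggml_vec_gelu_f16 above) returns GELU(x) in f16
static void init_gelu_table_sketch(ggml_fp16_t table[1 << 16]) {
    for (uint32_t t = 0; t < (1u << 16); ++t) {
        const uint16_t bits = (uint16_t) t;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));                    // reinterpret the bits as f16
        const float x = GGML_FP16_TO_FP32(h);            // decode to f32
        table[t] = GGML_FP32_TO_FP16(ggml_gelu_f32(x));  // store GELU(x) back as f16
    }
}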
< | |
< inline static float ggml_gelu_quick_f32(float x) { | |
< return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); | |
< } | |
< | |
< //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< // const uint16_t * i16 = (const uint16_t *) x; | |
< // for (int i = 0; i < n; ++i) { | |
< // y[i] = ggml_table_gelu_quick_f16[i16[i]]; | |
< // } | |
< //} | |
< | |
< #ifdef GGML_GELU_QUICK_FP16 | |
< inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { | |
< uint16_t t; | |
< for (int i = 0; i < n; ++i) { | |
< ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); | |
< memcpy(&t, &fp16, sizeof(uint16_t)); | |
< y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); | |
< } | |
< } | |
< #else | |
< inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = ggml_gelu_quick_f32(x[i]); | |
< } | |
< } | |
< #endif | |
< | |
< inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< float v = GGML_FP16_TO_FP32(x[i]); | |
< y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); | |
< } | |
< } | |
< | |
< // Sigmoid Linear Unit (SiLU) function | |
< inline static float ggml_silu_f32(float x) { | |
< return x/(1.0f + expf(-x)); | |
< } | |
< inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { | |
< float v = GGML_FP16_TO_FP32(x); | |
< return GGML_FP32_TO_FP16(v/(1.0f + expf(-v))); | |
< } | |
< | |
< #if __FINITE_MATH_ONLY__ | |
< #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" | |
< #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461" | |
< #endif | |
< | |
< #if defined(__ARM_NEON) && defined(__aarch64__) | |
< | |
< // adapted from arm limited optimized routine | |
< // the maximum error is 1.45358 plus 0.5 ulps | |
< // numbers above 88.38 will flush to infinity | |
< // numbers beneath -103.97 will flush to zero | |
< inline static float32x4_t ggml_v_expf(float32x4_t x) { | |
< const float32x4_t r = vdupq_n_f32(0x1.8p23f); | |
< const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f)); | |
< const float32x4_t n = vsubq_f32(z, r); | |
< const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n, | |
< vdupq_n_f32(0x1.7f7d1cp-20f)); | |
< const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23); | |
< const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1)))); | |
< const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126)); | |
< const float32x4_t u = vmulq_f32(b, b); | |
< const float32x4_t j = vfmaq_f32( | |
< vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b), | |
< vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b), | |
< vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u); | |
< if (!vpaddd_u64(vreinterpretq_u64_u32(c))) | |
< return vfmaq_f32(k, j, k); | |
< const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000)); | |
< const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000))); | |
< const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d)); | |
< return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1), | |
< vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j))); | |
< } | |
< | |
< // computes silu x/(1+exp(-x)) in single precision vector | |
< inline static float32x4_t ggml_v_silu(float32x4_t x) { | |
< const float32x4_t one = vdupq_n_f32(1.0f); | |
< const float32x4_t zero = vdupq_n_f32(0.0f); | |
< const float32x4_t neg_x = vsubq_f32(zero, x); | |
< const float32x4_t exp_neg_x = ggml_v_expf(neg_x); | |
< const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x); | |
< return vdivq_f32(x, one_plus_exp_neg_x); | |
< } | |
< | |
< #elif defined(__AVX512F__) && defined(__AVX512DQ__) | |
< | |
< // adapted from arm limited optimized routine | |
< // the maximum error is 1.45358 plus 0.5 ulps | |
< // numbers above 88.38 will flush to infinity | |
< // numbers beneath -103.97 will flush to zero | |
< inline static __m512 ggml_v_expf(__m512 x) { | |
< const __m512 r = _mm512_set1_ps(0x1.8p23f); | |
< const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r); | |
< const __m512 n = _mm512_sub_ps(z, r); | |
< const __m512 b = | |
< _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f), | |
< _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x)); | |
< const __mmask16 d = | |
< _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ); | |
< const __m512 u = _mm512_mul_ps(b, b); | |
< const __m512 j = _mm512_fmadd_ps( | |
< _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b, | |
< _mm512_set1_ps(0x1.573e2ep-5f)), | |
< u, | |
< _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b, | |
< _mm512_set1_ps(0x1.fffdb6p-2f))), | |
< u, | |
< _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F))); | |
< const __m512 res = _mm512_scalef_ps(j, n); | |
< if (_mm512_kortestz(d, d)) | |
< return res; | |
< const __m512 zero = _mm512_setzero_ps(); | |
< const __m512 alt = _mm512_mask_blend_ps( | |
< _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero); | |
< return _mm512_mask_blend_ps(d, res, alt); | |
< } | |
< | |
< // computes silu x/(1+exp(-x)) in single precision vector | |
< inline static __m512 ggml_v_silu(__m512 x) { | |
< const __m512 one = _mm512_set1_ps(1); | |
< const __m512 zero = _mm512_setzero_ps(); | |
< const __m512 neg_x = _mm512_sub_ps(zero, x); | |
< const __m512 exp_neg_x = ggml_v_expf(neg_x); | |
< const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x); | |
< return _mm512_div_ps(x, one_plus_exp_neg_x); | |
< } | |
< | |
< #elif defined(__AVX2__) && defined(__FMA__) | |
< | |
< // adapted from arm limited optimized routine | |
< // the maximum error is 1.45358 plus 0.5 ulps | |
< // numbers above 88.38 will flush to infinity | |
< // numbers beneath -103.97 will flush to zero | |
< inline static __m256 ggml_v_expf(__m256 x) { | |
< const __m256 r = _mm256_set1_ps(0x1.8p23f); | |
< const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r); | |
< const __m256 n = _mm256_sub_ps(z, r); | |
< const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f), | |
< _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x)); | |
< const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23); | |
< const __m256 k = _mm256_castsi256_ps( | |
< _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1)))); | |
< const __m256i c = _mm256_castps_si256( | |
< _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), | |
< _mm256_set1_ps(126), _CMP_GT_OQ)); | |
< const __m256 u = _mm256_mul_ps(b, b); | |
< const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b, | |
< _mm256_set1_ps(0x1.573e2ep-5f)), u, | |
< _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b, | |
< _mm256_set1_ps(0x1.fffdb6p-2f))), | |
< u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b)); | |
< if (!_mm256_movemask_ps(_mm256_castsi256_ps(c))) | |
< return _mm256_fmadd_ps(j, k, k); | |
< const __m256i g = _mm256_and_si256( | |
< _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)), | |
< _mm256_set1_epi32(0x82000000u)); | |
< const __m256 s1 = | |
< _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u))); | |
< const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g)); | |
< const __m256i d = _mm256_castps_si256( | |
< _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n), | |
< _mm256_set1_ps(192), _CMP_GT_OQ)); | |
< return _mm256_or_ps( | |
< _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)), | |
< _mm256_andnot_ps( | |
< _mm256_castsi256_ps(d), | |
< _mm256_or_ps( | |
< _mm256_and_ps(_mm256_castsi256_ps(c), | |
< _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)), | |
< _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k))))); | |
< } | |
< | |
< // computes silu x/(1+exp(-x)) in single precision vector | |
< inline static __m256 ggml_v_silu(__m256 x) { | |
< const __m256 one = _mm256_set1_ps(1); | |
< const __m256 zero = _mm256_setzero_ps(); | |
< const __m256 neg_x = _mm256_sub_ps(zero, x); | |
< const __m256 exp_neg_x = ggml_v_expf(neg_x); | |
< const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x); | |
< return _mm256_div_ps(x, one_plus_exp_neg_x); | |
< } | |
< | |
< #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON | |
< | |
< #if defined(__FMA__) | |
< #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z) | |
< #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z) | |
< #else | |
< #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z) | |
< #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y)) | |
< #endif | |
< | |
< // adapted from arm limited optimized routine | |
< // the maximum error is 1.45358 plus 0.5 ulps | |
< // numbers above 88.38 will flush to infinity | |
< // numbers beneath -103.97 will flush to zero | |
< inline static __m128 ggml_v_expf(__m128 x) { | |
< const __m128 r = _mm_set1_ps(0x1.8p23f); | |
< const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r); | |
< const __m128 n = _mm_sub_ps(z, r); | |
< const __m128 b = | |
< NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x)); | |
< const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23); | |
< const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1)))); | |
< const __m128i c = | |
< _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126))); | |
< const __m128 u = _mm_mul_ps(b, b); | |
< const __m128 j = | |
< MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u, | |
< MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))), | |
< u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b)); | |
< if (!_mm_movemask_epi8(c)) | |
< return MADD128(j, k, k); | |
< const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())), | |
< _mm_set1_epi32(0x82000000u)); | |
< const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u))); | |
< const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g)); | |
< const __m128i d = | |
< _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192))); | |
< return _mm_or_ps( | |
< _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)), | |
< _mm_andnot_ps(_mm_castsi128_ps(d), | |
< _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)), | |
< _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k))))); | |
< } | |
< | |
< // computes silu x/(1+exp(-x)) in single precision vector | |
< inline static __m128 ggml_v_silu(__m128 x) { | |
< const __m128 one = _mm_set1_ps(1); | |
< const __m128 zero = _mm_setzero_ps(); | |
< const __m128 neg_x = _mm_sub_ps(zero, x); | |
< const __m128 exp_neg_x = ggml_v_expf(neg_x); | |
< const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x); | |
< return _mm_div_ps(x, one_plus_exp_neg_x); | |
< } | |
< | |
< #endif // __ARM_NEON / __AVX2__ / __SSE2__ | |
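For readers following the SIMD branches above, here is a hedged scalar sketch of the same range-reduction idea, e^x = 2^n * e^b with n = round(x/ln 2) and b = x - n*ln 2. It mirrors the polynomial and bit-manipulation constants used above but omits the overflow/underflow fix-up paths, so it is an aid for reading the vector code rather than a replacement for it:

#include <math.h>
#include <stdint.h>
#include <string.h>

static float v_expf_scalar_sketch(float x) {
    const float shifter = 0x1.8p23f;                   // adding this rounds x/ln2 to an integer
    const float z = fmaf(x, 0x1.715476p+0f, shifter);  // x * (1/ln2) + shifter
    const float n = z - shifter;                       // n = round(x / ln2)
    const float b = fmaf(n, -0x1.7f7d1cp-20f,
                    fmaf(n, -0x1.62e4p-1f, x));        // b = x - n*ln2 (hi/lo split of ln2)
    uint32_t zbits; memcpy(&zbits, &z, sizeof(zbits));
    const uint32_t ebits = zbits << 23;                // move the integer n into the exponent field
    uint32_t kbits = ebits + 0x3f800000u;              // add the bits of 1.0f
    float k; memcpy(&k, &kbits, sizeof(k));            // k = 2^n
    const float u = b*b;
    const float j = fmaf(fmaf(fmaf(0x1.0e4020p-7f, b, 0x1.573e2ep-5f), u,
                              fmaf(0x1.555e66p-3f, b, 0x1.fffdb6p-2f)), u,
                         0x1.ffffecp-1f*b);            // degree-5 polynomial, j ~= e^b - 1
    return fmaf(j, k, k);                              // 2^n * (1 + j) ~= e^x
}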
< | |
< static void ggml_vec_silu_f32(const int n, float * y, const float * x) { | |
< int i = 0; | |
< #if defined(__AVX512F__) && defined(__AVX512DQ__) | |
< for (; i + 15 < n; i += 16) { | |
< _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i))); | |
< } | |
< #elif defined(__AVX2__) && defined(__FMA__) | |
< for (; i + 7 < n; i += 8) { | |
< _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i))); | |
< } | |
< #elif defined(__SSE2__) | |
< for (; i + 3 < n; i += 4) { | |
< _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); | |
< } | |
< #elif defined(__ARM_NEON) && defined(__aarch64__) | |
< for (; i + 3 < n; i += 4) { | |
< vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); | |
< } | |
< #endif | |
< for (; i < n; ++i) { | |
< y[i] = ggml_silu_f32(x[i]); | |
< } | |
< } | |
< | |
< inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { | |
< for (int i = 0; i < n; ++i) { | |
< y[i] = ggml_silu_f16(x[i]); | |
< } | |
< } | |
< | |
< static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { | |
< int i = 0; | |
< ggml_float sum = 0; | |
< #if defined(__AVX512F__) && defined(__AVX512DQ__) | |
< for (; i + 15 < n; i += 16) { | |
< __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i), | |
< _mm512_set1_ps(max))); | |
< _mm512_storeu_ps(y + i, val); | |
< sum += (ggml_float)_mm512_reduce_add_ps(val); | |
< } | |
< #elif defined(__AVX2__) && defined(__FMA__) | |
< for (; i + 7 < n; i += 8) { | |
< __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i), | |
< _mm256_set1_ps(max))); | |
< _mm256_storeu_ps(y + i, val); | |
< __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), | |
< _mm256_castps256_ps128(val)); | |
< val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); | |
< val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); | |
< sum += (ggml_float)_mm_cvtss_f32(val2); | |
< } | |
< #elif defined(__SSE2__) | |
< for (; i + 3 < n; i += 4) { | |
< __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), | |
< _mm_set1_ps(max))); | |
< _mm_storeu_ps(y + i, val); | |
< #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) | |
< val = _mm_add_ps(val, _mm_movehl_ps(val, val)); | |
< val = _mm_add_ss(val, _mm_movehdup_ps(val)); | |
< #else | |
< __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); | |
< val = _mm_add_ps(val, tmp); | |
< tmp = _mm_movehl_ps(tmp, val); | |
< val = _mm_add_ss(val, tmp); | |
< #endif | |
< sum += (ggml_float)_mm_cvtss_f32(val); | |
< } | |
< #elif defined(__ARM_NEON) && defined(__aarch64__) | |
< for (; i + 3 < n; i += 4) { | |
< float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), | |
< vdupq_n_f32(max))); | |
< vst1q_f32(y + i, val); | |
< sum += (ggml_float)vaddvq_f32(val); | |
< } | |
< #endif | |
< for (; i < n; ++i) { | |
< float val = expf(x[i] - max); | |
< sum += (ggml_float)val; | |
< y[i] = val; | |
< } | |
< return sum; | |
< } | |
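ggml_vec_soft_max_f32 only writes the unnormalized values exp(x[i] - max) and returns their sum; the row maximum and the final 1/sum scaling come from the caller. A hedged sketch of that calling pattern (the helper name and the surrounding loops are illustrative, not part of this file):

static void softmax_row_sketch(const int n, float * y, const float * x) {
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {                                // pass 1: row maximum for numerical stability
        max = MAX(max, x[i]);
    }
    const ggml_float sum = ggml_vec_soft_max_f32(n, y, x, max);  // pass 2: y[i] = exp(x[i] - max), returns the sum
    const float inv_sum = 1.0f/(float) sum;
    for (int i = 0; i < n; ++i) {                                // pass 3: normalize so the row sums to 1
        y[i] *= inv_sum;
    }
}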
< | |
< static ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) { | |
< // log(soft_max_i) = log(exp(logit_i - max) / soft_max_sum) = (logit_i - max) - log(soft_max_sum) | |
< | |
< int i = 0; | |
< ggml_float sum = 0; | |
< for (; i < n; ++i) { | |
< float val = x[i] - max; | |
< y[i] = val; | |
< sum += (ggml_float)expf(val); | |
< } | |
< return sum = (ggml_float)logf(sum); | |
< } | |
< | |
< inline static float ggml_silu_backward_f32(float x, float dy) { | |
< const float s = 1.0f/(1.0f + expf(-x)); | |
< return dy*s*(1.0f + x*(1.0f - s)); | |
< } | |
< | |
< inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) { | |
< const float v = GGML_FP16_TO_FP32(x); | |
< const float s = 1.0f/(1.0f + expf(-v)); | |
< return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); | |
< } | |
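The backward formulas above follow from the product rule on silu(x) = x·σ(x). In LaTeX, with \sigma(x) = 1/(1+e^{-x}) and \sigma'(x) = \sigma(x)\,(1-\sigma(x)):

    \frac{d}{dx}\bigl(x\,\sigma(x)\bigr) = \sigma(x) + x\,\sigma'(x) = \sigma(x)\,\bigl(1 + x\,(1-\sigma(x))\bigr)

so the gradient is dx = dy·s·(1 + x·(1 - s)) with s = σ(x), which is exactly what ggml_silu_backward_f32 computes.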
< | |
< inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { | |
< for (int i = 0; i < n; ++i) { | |
< dx[i] = ggml_silu_backward_f32(x[i], dy[i]); | |
< } | |
< } | |
< | |
< inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) { | |
< for (int i = 0; i < n; ++i) { | |
< dx[i] = ggml_silu_backward_f16(x[i], dy[i]); | |
< } | |
< } | |
< | |
< inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { | |
< #ifndef GGML_USE_ACCELERATE | |
< ggml_float sum = 0.0; | |
< for (int i = 0; i < n; ++i) { | |
< sum += (ggml_float)x[i]; | |
< } | |
< *s = sum; | |
< #else | |
< vDSP_sve(x, 1, s, n); | |
< #endif | |
< } | |
< | |
< inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { | |
< ggml_float sum = 0.0; | |
< for (int i = 0; i < n; ++i) { | |
< sum += (ggml_float)x[i]; | |
< } | |
< *s = sum; | |
< } | |
< | |
< inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { | |
< float sum = 0.0f; | |
< for (int i = 0; i < n; ++i) { | |
< sum += GGML_FP16_TO_FP32(x[i]); | |
< } | |
< *s = sum; | |
< } | |
< | |
< inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) { | |
< float sum = 0.0f; | |
< for (int i = 0; i < n; ++i) { | |
< sum += GGML_BF16_TO_FP32(x[i]); | |
< } | |
< *s = sum; | |
< } | |
< | |
< inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { | |
< #ifndef GGML_USE_ACCELERATE | |
< float max = -INFINITY; | |
< for (int i = 0; i < n; ++i) { | |
< max = MAX(max, x[i]); | |
< } | |
< *s = max; | |
< #else | |
< vDSP_maxv(x, 1, s, n); | |
< #endif | |
< } | |
< | |
< inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { | |
< ggml_vec_norm_f32(n, s, x); | |
< *s = 1.f/(*s); | |
< } | |
< | |
< inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { | |
< float max = -INFINITY; | |
< int idx = 0; | |
< for (int i = 0; i < n; ++i) { | |
< max = MAX(max, x[i]); | |
< if (max == x[i]) { idx = i; } | |
< } | |
< *s = idx; | |
< } | |
< | |
3098,6652d1173 | |
< // ggml_compute_forward_dup | |
< | |
< static void ggml_compute_forward_dup_same_cont( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | |
< GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); | |
< GGML_ASSERT(src0->type == dst->type); | |
< | |
< const size_t nb0 = ggml_type_size(src0->type); | |
< | |
< const int ith = params->ith; // thread index | |
< const int nth = params->nth; // number of threads | |
< | |
< // parallelize by blocks | |
< const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type); | |
< const int dr = (nk + nth - 1) / nth; | |
< const int k0 = dr * ith; | |
< const int k1 = MIN(k0 + dr, nk); | |
< | |
< if (k0 < k1) { | |
< memcpy( | |
< ((char *) dst->data + k0*nb0), | |
< ((char *) src0->data + k0*nb0), | |
< (k1 - k0) * nb0); | |
< } | |
< } | |
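A worked example of the block split above (the numbers are illustrative only): with nk = 10 blocks and nth = 4 threads, dr = (10 + 4 - 1)/4 = 3, so the threads cover the half-open ranges [0,3), [3,6), [6,9) and [9,10). The MIN() clamp stops the last thread at nk, and a thread whose k0 is already >= k1 simply copies nothing.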
< | |
< static void ggml_compute_forward_dup_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const int ith = params->ith; // thread index | |
< const int nth = params->nth; // number of threads | |
< | |
< // parallelize by rows | |
< const int nr = ne01; | |
< // number of rows per thread | |
< const int dr = (nr + nth - 1) / nth; | |
< // row range for this thread | |
< const int ir0 = dr * ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< if (src0->type == dst->type && | |
< ne00 == ne0 && | |
< nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { | |
< // copy by rows | |
< const size_t rs = ne00*nb00; | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< memcpy( | |
< ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), | |
< ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), | |
< rs); | |
< } | |
< } | |
< } | |
< return; | |
< } | |
< | |
< // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy | |
< | |
< if (ggml_is_contiguous(dst)) { | |
< if (nb00 == sizeof(ggml_fp16_t)) { | |
< if (dst->type == GGML_TYPE_F16) { | |
< size_t id = 0; | |
< const size_t rs = ne00 * nb00; | |
< char * dst_ptr = (char *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; | |
< memcpy(dst_ptr + id, src0_ptr, rs); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F32) { | |
< size_t id = 0; | |
< float * dst_ptr = (float *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (ggml_get_type_traits_cpu(dst->type)->from_float) { | |
< ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float; | |
< float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; | |
< | |
< size_t id = 0; | |
< size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); | |
< char * dst_ptr = (char *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); | |
< } | |
< | |
< quantize_row_q(src0_f32, dst_ptr + id, ne00); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } else { | |
< //printf("%s: this is not optimal - fix me\n", __func__); | |
< | |
< if (dst->type == GGML_TYPE_F32) { | |
< size_t id = 0; | |
< float * dst_ptr = (float *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F16) { | |
< size_t id = 0; | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = *src0_ptr; | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } | |
< return; | |
< } | |
< | |
< // dst counters | |
< int64_t i10 = 0; | |
< int64_t i11 = 0; | |
< int64_t i12 = 0; | |
< int64_t i13 = 0; | |
< | |
< if (dst->type == GGML_TYPE_F16) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); | |
< | |
< if (++i10 == ne00) { | |
< i10 = 0; | |
< if (++i11 == ne01) { | |
< i11 = 0; | |
< if (++i12 == ne02) { | |
< i12 = 0; | |
< if (++i13 == ne03) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F32) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); | |
< | |
< if (++i10 == ne0) { | |
< i10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_dup_bf16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const int ith = params->ith; // thread index | |
< const int nth = params->nth; // number of threads | |
< | |
< // parallelize by rows | |
< const int nr = ne01; | |
< // number of rows per thread | |
< const int dr = (nr + nth - 1) / nth; | |
< // row range for this thread | |
< const int ir0 = dr * ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< if (src0->type == dst->type && | |
< ne00 == ne0 && | |
< nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { | |
< // copy by rows | |
< const size_t rs = ne00*nb00; | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< memcpy( | |
< ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), | |
< ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), | |
< rs); | |
< } | |
< } | |
< } | |
< return; | |
< } | |
< | |
< // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy | |
< | |
< if (ggml_is_contiguous(dst)) { | |
< if (nb00 == sizeof(ggml_bf16_t)) { | |
< if (dst->type == GGML_TYPE_BF16) { | |
< size_t id = 0; | |
< const size_t rs = ne00 * nb00; | |
< char * dst_ptr = (char *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; | |
< memcpy(dst_ptr + id, src0_ptr, rs); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F16) { | |
< size_t id = 0; | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F32) { | |
< size_t id = 0; | |
< float * dst_ptr = (float *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (ggml_get_type_traits_cpu(dst->type)->from_float) { | |
< ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float; | |
< float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; | |
< | |
< size_t id = 0; | |
< size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); | |
< char * dst_ptr = (char *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]); | |
< } | |
< | |
< quantize_row_q(src0_f32, dst_ptr + id, ne00); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } else { | |
< //printf("%s: this is not optimal - fix me\n", __func__); | |
< | |
< if (dst->type == GGML_TYPE_F32) { | |
< size_t id = 0; | |
< float * dst_ptr = (float *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_BF16) { | |
< size_t id = 0; | |
< ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = *src0_ptr; | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F16) { | |
< size_t id = 0; | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } | |
< return; | |
< } | |
< | |
< // dst counters | |
< int64_t i10 = 0; | |
< int64_t i11 = 0; | |
< int64_t i12 = 0; | |
< int64_t i13 = 0; | |
< | |
< if (dst->type == GGML_TYPE_BF16) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t)); | |
< | |
< if (++i10 == ne00) { | |
< i10 = 0; | |
< if (++i11 == ne01) { | |
< i11 = 0; | |
< if (++i12 == ne02) { | |
< i12 = 0; | |
< if (++i13 == ne03) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F16) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); | |
< | |
< if (++i10 == ne0) { | |
< i10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F32) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr); | |
< | |
< if (++i10 == ne0) { | |
< i10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_dup_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const int ith = params->ith; // thread index | |
< const int nth = params->nth; // number of threads | |
< | |
< // parallelize by rows | |
< const int nr = ne01; | |
< // number of rows per thread | |
< const int dr = (nr + nth - 1) / nth; | |
< // row range for this thread | |
< const int ir0 = dr * ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< if (src0->type == dst->type && | |
< ne00 == ne0 && | |
< nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { | |
< // copy by rows | |
< const size_t rs = ne00*nb00; | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< memcpy( | |
< ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), | |
< ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), | |
< rs); | |
< } | |
< } | |
< } | |
< return; | |
< } | |
< | |
< if (ggml_is_contiguous(dst)) { | |
< // TODO: simplify | |
< if (nb00 == sizeof(float)) { | |
< if (dst->type == GGML_TYPE_F32) { | |
< size_t id = 0; | |
< const size_t rs = ne00 * nb00; | |
< char * dst_ptr = (char *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; | |
< memcpy(dst_ptr + id, src0_ptr, rs); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else if (ggml_get_type_traits_cpu(dst->type)->from_float) { | |
< ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float; | |
< | |
< size_t id = 0; | |
< size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); | |
< char * dst_ptr = (char *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< quantize_row_q(src0_ptr, dst_ptr + id, ne00); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } else { | |
< //printf("%s: this is not optimal - fix me\n", __func__); | |
< | |
< if (dst->type == GGML_TYPE_F32) { | |
< size_t id = 0; | |
< float * dst_ptr = (float *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = *src0_ptr; | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F16) { | |
< size_t id = 0; | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_BF16) { | |
< size_t id = 0; | |
< ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; | |
< | |
< for (int i03 = 0; i03 < ne03; i03++) { | |
< for (int i02 = 0; i02 < ne02; i02++) { | |
< id += ne00 * ir0; | |
< for (int i01 = ir0; i01 < ir1; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr); | |
< id++; | |
< } | |
< } | |
< id += ne00 * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } | |
< | |
< return; | |
< } | |
< | |
< // dst counters | |
< | |
< int64_t i10 = 0; | |
< int64_t i11 = 0; | |
< int64_t i12 = 0; | |
< int64_t i13 = 0; | |
< | |
< if (dst->type == GGML_TYPE_F32) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< memcpy(dst_ptr, src0_ptr, sizeof(float)); | |
< | |
< if (++i10 == ne0) { | |
< i10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_F16) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); | |
< | |
< if (++i10 == ne0) { | |
< i10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else if (dst->type == GGML_TYPE_BF16) { | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< i10 += ne00 * ir0; | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr); | |
< | |
< if (++i10 == ne0) { | |
< i10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< i10 += ne00 * (ne01 - ir1); | |
< while (i10 >= ne0) { | |
< i10 -= ne0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } else { | |
< GGML_ABORT("fatal error"); // TODO: implement | |
< } | |
< } | |
< | |
< // A simplified version of ggml_compute_forward_dup that doesn't do float upcasting and just uses plain old memcpy. | |
< static void ggml_compute_forward_dup_bytes( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | |
< GGML_ASSERT(src0->type == dst->type); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS; | |
< | |
< if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { | |
< ggml_compute_forward_dup_same_cont(params, dst); | |
< return; | |
< } | |
< | |
< const size_t type_size = ggml_type_size(src0->type); | |
< | |
< const int ith = params->ith; // thread index | |
< const int nth = params->nth; // number of threads | |
< | |
< // parallelize by rows | |
< const int nr = ne01; | |
< // number of rows per thread | |
< const int dr = (nr + nth - 1) / nth; | |
< // row range for this thread | |
< const int ir0 = dr * ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< if (src0->type == dst->type && | |
< ggml_are_same_shape(src0, dst) && | |
< nb00 == type_size && nb0 == type_size) { | |
< // copy by rows | |
< const size_t rs = ggml_row_size(src0->type, ne00); | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< memcpy( | |
< ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), | |
< ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), | |
< rs); | |
< } | |
< } | |
< } | |
< return; | |
< } | |
< | |
< if (ggml_is_contiguous(dst)) { | |
< size_t id = 0; | |
< char * dst_ptr = (char *) dst->data; | |
< const size_t rs = ne00 * type_size; | |
< | |
< if (nb00 == type_size) { | |
< // src0 is contiguous on first dimension, copy by rows | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; | |
< memcpy(dst_ptr + id, src0_ptr, rs); | |
< id += rs; | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } else { | |
< //printf("%s: this is not optimal - fix me\n", __func__); | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< id += rs * ir0; | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; | |
< memcpy(dst_ptr + id, src0_ptr, type_size); | |
< | |
< id += type_size; | |
< } | |
< } | |
< id += rs * (ne01 - ir1); | |
< } | |
< } | |
< } | |
< | |
< return; | |
< } | |
< | |
< // dst counters | |
< int64_t k10 = 0; | |
< int64_t i11 = 0; | |
< int64_t i12 = 0; | |
< int64_t i13 = 0; | |
< | |
< // number of blocks in a row | |
< const int64_t nk00 = ne00 / ggml_blck_size(src0->type); | |
< const int64_t nk0 = ne0 / ggml_blck_size(dst->type); | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< k10 += nk00 * ir0; | |
< while (k10 >= nk0) { | |
< k10 -= nk0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< for (int64_t i01 = ir0; i01 < ir1; i01++) { | |
< for (int64_t k00 = 0; k00 < nk00; k00++) { | |
< const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); | |
< | |
< memcpy(dst_ptr, src0_ptr, type_size); | |
< | |
< if (++k10 == nk0) { | |
< k10 = 0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< k10 += nk00 * (ne01 - ir1); | |
< while (k10 >= nk0) { | |
< k10 -= nk0; | |
< if (++i11 == ne1) { | |
< i11 = 0; | |
< if (++i12 == ne2) { | |
< i12 = 0; | |
< if (++i13 == ne3) { | |
< i13 = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
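The byte-wise path above also covers quantized tensors, because it never decodes elements: as an illustrative example (assuming Q4_0, i.e. a block size of 32 elements stored in an 18-byte block), a row of ne00 = 4096 elements holds nk00 = 4096/32 = 128 blocks and each inner memcpy moves type_size = 18 bytes, so the row is copied as 128 opaque blocks regardless of what they encode.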
< | |
< static void ggml_compute_forward_dup_q( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const enum ggml_type type = src0->type; | |
< ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; | |
< | |
< size_t qk = ggml_blck_size(type); | |
< const int64_t nr = ggml_nelements(src1) / qk; | |
< | |
< // destination must be contiguous in the first dimension | |
< GGML_ASSERT(nb10 == ggml_type_size(dst->type)); | |
< // must either have a first dimension large enough to hold a row, or be fully contiguous | |
< GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t ir = ir0; ir < ir1; ++ir) { | |
< | |
< uint32_t i = ir * qk; | |
< | |
< const int64_t i03 = i/(ne00 * ne01 * ne02); | |
< const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); | |
< const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; | |
< const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; | |
< const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; | |
< | |
< const int64_t i13 = i/(ne10 * ne11 * ne12); | |
< const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); | |
< const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; | |
< const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; | |
< const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; | |
< | |
< dequantize_row_q( | |
< (const void *) ((char *) src0->data + x_offset), | |
< (float *) ((char *) dst->data + dst_offset), qk); | |
< } | |
< } | |
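A small worked example of the index decomposition above (the shapes are illustrative): with ne00 = 32, ne01 = 4, ne02 = 2 and qk = 32, row ir = 9 starts at flat element i = 9*32 = 288; then i03 = 288/(32*4*2) = 1 with remainder 32, so i02 = 0, i01 = 1 and i00 = 0, and x_offset points at the second block row of the second 3-D slice. The same arithmetic against ne10..ne13 and nb10..nb13 locates the float destination row in dst.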
< | |
< static void ggml_compute_forward_dup( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (src0->type == dst->type) { | |
< ggml_compute_forward_dup_bytes(params, dst); | |
< return; | |
< } | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_dup_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_BF16: | |
< { | |
< ggml_compute_forward_dup_bf16(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_dup_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) { | |
< ggml_compute_forward_dup_q(params, dst); | |
< break; | |
< } | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_add | |
< | |
< static void ggml_compute_forward_add_q_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const enum ggml_type type = src0->type; | |
< const enum ggml_type dtype = dst->type; | |
< ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; | |
< ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dtype)->from_float; | |
< | |
< // we don't support permuted src0 or src1 | |
< GGML_ASSERT(nb00 == ggml_type_size(type)); | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< // dst cannot be transposed or permuted | |
< GGML_ASSERT(nb0 <= nb1); | |
< GGML_ASSERT(nb1 <= nb2); | |
< GGML_ASSERT(nb2 <= nb3); | |
< | |
< GGML_ASSERT(ggml_is_quantized(src0->type)); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 indices | |
< const int i03 = ir/(ne02*ne01); | |
< const int i02 = (ir - i03*ne02*ne01)/ne01; | |
< const int i01 = (ir - i03*ne02*ne01 - i02*ne01); | |
< | |
< // src1 and dst are same shape as src0 => same indices | |
< const int i13 = i03; | |
< const int i12 = i02; | |
< const int i11 = i01; | |
< | |
< const int i3 = i03; | |
< const int i2 = i02; | |
< const int i1 = i01; | |
< | |
< void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); | |
< float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); | |
< void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); | |
< | |
< assert(ne00 % 32 == 0); | |
< | |
< // dequantize row from src0 into the temp buffer | |
< dequantize_row_q(src0_row, wdata, ne00); | |
< // add src1 | |
< ggml_vec_acc_f32(ne00, wdata, src1_row); | |
< // quantize row to dst | |
< if (quantize_row_q != NULL) { | |
< quantize_row_q(wdata, dst_row, ne00); | |
< } else { | |
< memcpy(dst_row, wdata, ne0*nb0); | |
< } | |
< } | |
< } | |
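One detail worth calling out in the routine above: each thread works in its own slice of params->wdata, starting at float offset (ne00 + CACHE_LINE_SIZE_F32) * ith, so a slice holds one full dequantized row plus one cache line of padding. The padding presumably keeps neighbouring threads from touching the same cache line while they dequantize, accumulate src1, and requantize their rows.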
< | |
< static void ggml_compute_forward_add( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< case GGML_TYPE_F16: | |
< case GGML_TYPE_BF16: | |
< { | |
< ggml_compute_forward_add_non_quantized(params, dst); | |
< } break; | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< { | |
< ggml_compute_forward_add_q_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_add1 | |
< | |
< static void ggml_compute_forward_add1_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_scalar(src1)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT( nb0 == sizeof(float)); | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are same shape => same indices | |
< const int i3 = ir/(ne2*ne1); | |
< const int i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< #ifdef GGML_USE_ACCELERATE | |
< UNUSED(ggml_vec_add1_f32); | |
< | |
< vDSP_vadd( | |
< (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, | |
< (float *) ((char *) src1->data), 0, | |
< (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, | |
< ne0); | |
< #else | |
< ggml_vec_add1_f32(ne0, | |
< (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), | |
< (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), | |
< *(float *) src1->data); | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_add1_f16_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_scalar(src1)); | |
< | |
< // scalar to add | |
< const float v = *(float *) src1->data; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT(dst->type == GGML_TYPE_F16); | |
< | |
< GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are same shape => same indices | |
< const int i3 = ir/(ne2*ne1); | |
< const int i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); | |
< ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); | |
< for (int i = 0; i < ne0; i++) { | |
< dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_add1_f16_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_scalar(src1)); | |
< | |
< // scalar to add | |
< const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F16); | |
< GGML_ASSERT(dst->type == GGML_TYPE_F16); | |
< | |
< GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are same shape => same indices | |
< const int i3 = ir/(ne2*ne1); | |
< const int i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); | |
< ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); | |
< for (int i = 0; i < ne0; i++) { | |
< dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_add1_q_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_scalar(src1)); | |
< | |
< // scalar to add | |
< const float v = *(float *) src1->data; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const enum ggml_type type = src0->type; | |
< ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; | |
< ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(type)->from_float; | |
< | |
< // we don't support permuted src0 | |
< GGML_ASSERT(nb00 == ggml_type_size(type)); | |
< | |
< // dst cannot be transposed or permuted | |
< GGML_ASSERT(nb0 <= nb1); | |
< GGML_ASSERT(nb1 <= nb2); | |
< GGML_ASSERT(nb2 <= nb3); | |
< | |
< GGML_ASSERT(ggml_is_quantized(src0->type)); | |
< GGML_ASSERT(dst->type == src0->type); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are same shape => same indices | |
< const int i3 = ir/(ne2*ne1); | |
< const int i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< void * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03)); | |
<         void * dst_row  = (void *) ((char *)  dst->data +  (i1*nb1  + i2*nb2  + i3*nb3 )); | |
< | |
< assert(ne0 % 32 == 0); | |
< | |
<         // dequantize row from src0 to temp buffer | |
< dequantize_row_q(src0_row, wdata, ne0); | |
< // add src1 | |
< ggml_vec_acc1_f32(ne0, wdata, v); | |
< // quantize row to dst | |
< quantize_row_q(wdata, dst_row, ne0); | |
< } | |
< } | |
< | |
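ggml_compute_forward_add1_q_f32 above handles quantized tensors row by row: dequantize into a per-thread f32 scratch slice of params->wdata (each slice padded by CACHE_LINE_SIZE_F32 floats to keep threads off each other's cache lines), apply the op in f32, then requantize into dst. A minimal sketch of that pattern, with hypothetical function-pointer types standing in for the type traits:

// not the ggml API - illustrative types for the dequantize/requantize hooks
typedef void (*to_float_t)  (const void *quantized, float *out, int n);
typedef void (*from_float_t)(const float *in, void *quantized, int n);

// add the scalar v to one quantized row of n elements via an f32 scratch row
void add1_quantized_row(const void *src_row, void *dst_row, int n, float v,
                        float *scratch, to_float_t dequant, from_float_t quant) {
    dequant(src_row, scratch, n);   // unpack the row to f32
    for (int i = 0; i < n; ++i) {
        scratch[i] += v;            // the actual op, performed in f32
    }
    quant(scratch, dst_row, n);     // repack into the quantized type
}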
< static void ggml_compute_forward_add1_bf16_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_scalar(src1)); | |
< | |
< // scalar to add | |
< const float v = *(float *) src1->data; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_BF16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT(dst->type == GGML_TYPE_BF16); | |
< | |
< GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); | |
< GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are same shape => same indices | |
< const int i3 = ir/(ne2*ne1); | |
< const int i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); | |
< ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); | |
< for (int i = 0; i < ne0; i++) { | |
< dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_add1_bf16_bf16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_scalar(src1)); | |
< | |
< // scalar to add | |
< const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_BF16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_BF16); | |
< GGML_ASSERT(dst->type == GGML_TYPE_BF16); | |
< | |
< GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); | |
< GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are same shape => same indices | |
< const int i3 = ir/(ne2*ne1); | |
< const int i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); | |
< ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); | |
< for (int i = 0; i < ne0; i++) { | |
< dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_add1( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_add1_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< if (src1->type == GGML_TYPE_F16) { | |
< ggml_compute_forward_add1_f16_f16(params, dst); | |
< } | |
< else if (src1->type == GGML_TYPE_F32) { | |
< ggml_compute_forward_add1_f16_f32(params, dst); | |
< } | |
< else { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } break; | |
< case GGML_TYPE_BF16: | |
< { | |
< if (src1->type == GGML_TYPE_BF16) { | |
< ggml_compute_forward_add1_bf16_bf16(params, dst); | |
< } | |
< else if (src1->type == GGML_TYPE_F32) { | |
< ggml_compute_forward_add1_bf16_f32(params, dst); | |
< } | |
< else { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } break; | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q8_1: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< { | |
< ggml_compute_forward_add1_q_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_acc | |
< | |
< static void ggml_compute_forward_acc_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); | |
< | |
<     // view src0 and dst with these strides and data offset in bytes during acc | |
< // nb0 is implicitly element_size because src0 and dst are contiguous | |
< size_t nb1 = ((int32_t *) dst->op_params)[0]; | |
< size_t nb2 = ((int32_t *) dst->op_params)[1]; | |
< size_t nb3 = ((int32_t *) dst->op_params)[2]; | |
< size_t offset = ((int32_t *) dst->op_params)[3]; | |
< bool inplace = (bool) ((int32_t *) dst->op_params)[4]; | |
< | |
< if (!inplace) { | |
< if (params->ith == 0) { | |
< // memcpy needs to be synchronized across threads to avoid race conditions. | |
< // => do it in INIT phase | |
< memcpy( | |
< ((char *) dst->data), | |
< ((char *) src0->data), | |
< ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< } | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src1); | |
< const int nc = src1->ne[0]; | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) | |
< | |
< // src0 and dst as viewed during acc | |
< const size_t nb0 = ggml_element_size(src0); | |
< | |
< const size_t nb00 = nb0; | |
< const size_t nb01 = nb1; | |
< const size_t nb02 = nb2; | |
< const size_t nb03 = nb3; | |
< | |
< GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); | |
< GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); | |
< | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are viewed with shape of src1 and offset | |
< // => same indices | |
< const int i3 = ir/(ne12*ne11); | |
< const int i2 = (ir - i3*ne12*ne11)/ne11; | |
< const int i1 = (ir - i3*ne12*ne11 - i2*ne11); | |
< | |
< #ifdef GGML_USE_ACCELERATE | |
< vDSP_vadd( | |
< (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, | |
< (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, | |
< (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); | |
< #else | |
< ggml_vec_add_f32(nc, | |
< (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), | |
< (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), | |
< (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); | |
< #endif | |
< } | |
< } | |
< | |
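The ACC kernel above reads its view description out of dst->op_params: three byte strides, a byte offset, and an inplace flag, in that order. A small sketch of that unpacking, assuming the same five-int32 layout the code above reads (struct and helper names are illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct acc_view {
    size_t nb1, nb2, nb3;  // byte strides of the view of dst/src0
    size_t offset;         // starting byte offset into dst/src0
    bool   inplace;        // when false, dst is first seeded with a copy of src0
};

struct acc_view read_acc_view(const int32_t *op_params) {
    struct acc_view v;
    v.nb1     = (size_t) op_params[0];
    v.nb2     = (size_t) op_params[1];
    v.nb3     = (size_t) op_params[2];
    v.offset  = (size_t) op_params[3];
    v.inplace = op_params[4] != 0;
    return v;
}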
< static void ggml_compute_forward_acc( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_acc_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< case GGML_TYPE_BF16: | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q8_1: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_sum | |
< | |
< static void ggml_compute_forward_sum_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_scalar(dst)); | |
< assert(src0->nb[0] == sizeof(float)); | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) | |
< | |
< ggml_float sum = 0; | |
< ggml_float row_sum = 0; | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< ggml_vec_sum_f32_ggf(ne00, | |
< &row_sum, | |
< (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); | |
< sum += row_sum; | |
< } | |
< } | |
< } | |
< ((float *) dst->data)[0] = sum; | |
< } | |
< | |
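The f32 sum above accumulates into ggml_float (a double) rather than plain float: once the running sum dwarfs the addends, a float accumulator starts dropping low-order bits. A standalone demonstration (not ggml code):

#include <stdio.h>

int main(void) {
    const int n = 10000000;
    float  sum_f = 0.0f;
    double sum_d = 0.0;    // plays the role of ggml_float in the kernel above
    for (int i = 0; i < n; ++i) {
        sum_f += 0.1f;
        sum_d += 0.1f;
    }
    printf("float  accumulator: %f\n", sum_f);  // drifts visibly away from 1e6
    printf("double accumulator: %f\n", sum_d);  // stays very close to 1e6 (0.1f itself is not exact)
    return 0;
}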
< static void ggml_compute_forward_sum_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_scalar(dst)); | |
< | |
< assert(src0->nb[0] == sizeof(ggml_fp16_t)); | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) | |
< | |
< float sum = 0; | |
< float row_sum = 0; | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< ggml_vec_sum_f16_ggf(ne00, | |
< &row_sum, | |
< (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); | |
< sum += row_sum; | |
< } | |
< } | |
< } | |
< ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); | |
< } | |
< | |
< static void ggml_compute_forward_sum_bf16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_scalar(dst)); | |
< | |
< assert(src0->nb[0] == sizeof(ggml_bf16_t)); | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) | |
< | |
< float sum = 0; | |
< float row_sum = 0; | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< ggml_vec_sum_bf16_ggf(ne00, | |
< &row_sum, | |
< (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); | |
< sum += row_sum; | |
< } | |
< } | |
< } | |
< ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); | |
< } | |
< | |
< static void ggml_compute_forward_sum( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_sum_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_sum_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_BF16: | |
< { | |
< ggml_compute_forward_sum_bf16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_sum_rows | |
< | |
< static void ggml_compute_forward_sum_rows_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< GGML_ASSERT(dst->nb[0] == sizeof(float)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(ne0 == 1); | |
< GGML_ASSERT(ne1 == ne01); | |
< GGML_ASSERT(ne2 == ne02); | |
< GGML_ASSERT(ne3 == ne03); | |
< | |
< for (int64_t i3 = 0; i3 < ne03; i3++) { | |
< for (int64_t i2 = 0; i2 < ne02; i2++) { | |
< for (int64_t i1 = 0; i1 < ne01; i1++) { | |
< float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); | |
< float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); | |
< float row_sum = 0; | |
< ggml_vec_sum_f32(ne00, &row_sum, src_row); | |
< dst_row[0] = row_sum; | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_sum_rows( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_sum_rows_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_mean | |
< | |
< static void ggml_compute_forward_mean_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(src0->nb[0] == sizeof(float)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< assert(ne0 == 1); | |
< assert(ne1 == ne01); | |
< assert(ne2 == ne02); | |
< assert(ne3 == ne03); | |
< | |
< UNUSED(ne0); | |
< UNUSED(ne1); | |
< UNUSED(ne2); | |
< UNUSED(ne3); | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< ggml_vec_sum_f32(ne00, | |
< (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), | |
< (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); | |
< | |
< *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_mean( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_mean_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_argmax | |
< | |
< static void ggml_compute_forward_argmax_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(src0->nb[0] == sizeof(float)); | |
< assert(dst->nb[0] == sizeof(float)); | |
< | |
< const int64_t ne00 = src0->ne[0]; | |
< const int64_t ne01 = src0->ne[1]; | |
< | |
< const size_t nb01 = src0->nb[1]; | |
< const size_t nb0 = dst->nb[0]; | |
< | |
< for (int64_t i1 = 0; i1 < ne01; i1++) { | |
< float * src = (float *) ((char *) src0->data + i1*nb01); | |
< int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); | |
< int v = 0; | |
< ggml_vec_argmax_f32(ne00, &v, src); | |
< dst_[0] = v; | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_argmax( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_argmax_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_count_equal | |
< | |
< static void ggml_compute_forward_count_equal_i32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_I32); | |
< GGML_ASSERT(src1->type == GGML_TYPE_I32); | |
< GGML_ASSERT(ggml_are_same_shape(src0, src1)); | |
< GGML_ASSERT(ggml_is_scalar(dst)); | |
< GGML_ASSERT(dst->type == GGML_TYPE_I64); | |
< | |
< const int64_t nr = ggml_nrows(src0); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< int64_t * sums = (int64_t *) params->wdata; | |
< int64_t sum_thread = 0; | |
< | |
< // rows per thread | |
< const int64_t dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int64_t ir0 = dr*ith; | |
< const int64_t ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t ir = ir0; ir < ir1; ++ir) { | |
<         const int64_t i03 =  ir                          / (ne02*ne01); | |
<         const int64_t i02 = (ir - i03*ne02*ne01)         /  ne01; | |
<         const int64_t i01 =  ir - i03*ne02*ne01 - i02*ne01; | |
< | |
< const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; | |
< const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; | |
< | |
< for (int64_t i00 = 0; i00 < ne00; ++i00) { | |
< const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); | |
< const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); | |
< | |
< sum_thread += val0 == val1; | |
< } | |
< } | |
< if (ith != 0) { | |
< sums[ith] = sum_thread; | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< if (ith != 0) { | |
< return; | |
< } | |
< | |
< for (int ith_other = 1; ith_other < nth; ++ith_other) { | |
< sum_thread += sums[ith_other]; | |
< } | |
< *((int64_t *) dst->data) = sum_thread; | |
< } | |
< | |
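count_equal above uses a common threadpool reduction: every thread other than thread 0 publishes its partial sum into the params->wdata scratch area, a barrier makes those partials visible, and thread 0 folds them into the scalar result. A sketch of that pattern with a hypothetical barrier callback (names are illustrative, not the ggml threadpool API):

#include <stdint.h>

void reduce_partial_sums(int ith, int nth, int64_t sum_thread,
                         int64_t *sums,          // nth-entry scratch array
                         int64_t *out,           // final result, written by thread 0
                         void (*barrier)(void)) {
    if (ith != 0) {
        sums[ith] = sum_thread;   // publish this thread's partial result
    }
    barrier();                    // every partial is written before anyone reads

    if (ith != 0) {
        return;                   // only thread 0 folds and writes the output
    }
    for (int t = 1; t < nth; ++t) {
        sum_thread += sums[t];
    }
    *out = sum_thread;
}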
< static void ggml_compute_forward_count_equal( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_I32: | |
< { | |
< ggml_compute_forward_count_equal_i32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_repeat | |
< | |
< static void ggml_compute_forward_repeat_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< GGML_ASSERT(ggml_can_repeat(src0, dst)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< // guaranteed to be an integer due to the check in ggml_can_repeat | |
< const int nr0 = (int)(ne0/ne00); | |
< const int nr1 = (int)(ne1/ne01); | |
< const int nr2 = (int)(ne2/ne02); | |
< const int nr3 = (int)(ne3/ne03); | |
< | |
< // TODO: support for transposed / permuted tensors | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< // TODO: maybe this is not optimal? | |
< for (int i3 = 0; i3 < nr3; i3++) { | |
< for (int k3 = 0; k3 < ne03; k3++) { | |
< for (int i2 = 0; i2 < nr2; i2++) { | |
< for (int k2 = 0; k2 < ne02; k2++) { | |
< for (int i1 = 0; i1 < nr1; i1++) { | |
< for (int k1 = 0; k1 < ne01; k1++) { | |
< for (int i0 = 0; i0 < nr0; i0++) { | |
< ggml_vec_cpy_f32(ne00, | |
< (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), | |
< (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
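The seven nested loops in repeat_f32 above tile src0 across dst along all four dimensions; the repeat factors nr0..nr3 are exact integers because ggml_can_repeat only admits shapes where each dst dimension is a multiple of the corresponding src0 dimension. A 1-D analogue of the tiling (illustrative, not ggml code):

// dst holds ne0 = nr0*ne00 elements: nr0 back-to-back copies of the src row
void repeat_1d(float *dst, const float *src, int ne00, int nr0) {
    for (int i0 = 0; i0 < nr0; ++i0) {
        for (int k = 0; k < ne00; ++k) {
            dst[i0*ne00 + k] = src[k];
        }
    }
}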
< static void ggml_compute_forward_repeat_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< GGML_ASSERT(ggml_can_repeat(src0, dst)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< // guaranteed to be an integer due to the check in ggml_can_repeat | |
< const int nr0 = (int)(ne0/ne00); | |
< const int nr1 = (int)(ne1/ne01); | |
< const int nr2 = (int)(ne2/ne02); | |
< const int nr3 = (int)(ne3/ne03); | |
< | |
< // TODO: support for transposed / permuted tensors | |
< GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< | |
< // TODO: maybe this is not optimal? | |
< for (int i3 = 0; i3 < nr3; i3++) { | |
< for (int k3 = 0; k3 < ne03; k3++) { | |
< for (int i2 = 0; i2 < nr2; i2++) { | |
< for (int k2 = 0; k2 < ne02; k2++) { | |
< for (int i1 = 0; i1 < nr1; i1++) { | |
< for (int k1 = 0; k1 < ne01; k1++) { | |
< for (int i0 = 0; i0 < nr0; i0++) { | |
< ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); | |
< ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); | |
< // ggml_vec_cpy_f16(ne00, y, x) | |
< for (int i = 0; i < ne00; ++i) { | |
< y[i] = x[i]; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_repeat( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< case GGML_TYPE_BF16: | |
< case GGML_TYPE_I16: | |
< { | |
< ggml_compute_forward_repeat_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< case GGML_TYPE_I32: | |
< { | |
< ggml_compute_forward_repeat_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_repeat_back | |
< | |
< static void ggml_compute_forward_repeat_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< GGML_ASSERT(ggml_can_repeat(dst, src0)); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< // guaranteed to be an integer due to the check in ggml_can_repeat | |
< const int nr0 = (int)(ne00/ne0); | |
< const int nr1 = (int)(ne01/ne1); | |
< const int nr2 = (int)(ne02/ne2); | |
< const int nr3 = (int)(ne03/ne3); | |
< | |
< // TODO: support for transposed / permuted tensors | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< if (ggml_is_contiguous(dst)) { | |
< ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); | |
< } else { | |
< for (int k3 = 0; k3 < ne3; k3++) { | |
< for (int k2 = 0; k2 < ne2; k2++) { | |
< for (int k1 = 0; k1 < ne1; k1++) { | |
< ggml_vec_set_f32(ne0, | |
< (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), | |
< 0); | |
< } | |
< } | |
< } | |
< } | |
< | |
< // TODO: maybe this is not optimal? | |
< for (int i3 = 0; i3 < nr3; i3++) { | |
< for (int k3 = 0; k3 < ne3; k3++) { | |
< for (int i2 = 0; i2 < nr2; i2++) { | |
< for (int k2 = 0; k2 < ne2; k2++) { | |
< for (int i1 = 0; i1 < nr1; i1++) { | |
< for (int k1 = 0; k1 < ne1; k1++) { | |
< for (int i0 = 0; i0 < nr0; i0++) { | |
< ggml_vec_acc_f32(ne0, | |
< (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), | |
< (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_repeat_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_repeat_back_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_concat | |
< | |
< static void ggml_compute_forward_concat_any( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< const size_t len = ggml_type_size(src0->type); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int32_t dim = ggml_get_op_params_i32(dst, 0); | |
< | |
< GGML_ASSERT(dim >= 0 && dim < 4); | |
< | |
< int64_t o[4] = {0, 0, 0, 0}; | |
< o[dim] = src0->ne[dim]; | |
< | |
< const char * x; | |
< | |
<     // TODO: smarter multi-threading | |
< for (int i3 = 0; i3 < ne3; i3++) { | |
< for (int i2 = ith; i2 < ne2; i2 += nth) { | |
< for (int i1 = 0; i1 < ne1; i1++) { | |
< for (int i0 = 0; i0 < ne0; i0++) { | |
< if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { | |
< x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03; | |
< } else { | |
< x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13; | |
< } | |
< | |
< char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3; | |
< | |
< memcpy(y, x, len); | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
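concat_any above and the typed variants that follow all share the same selection rule: along the concatenation dimension, indices below src0->ne[dim] read from src0, and everything else reads from src1 shifted back by that split point (the o[] offset). A 1-D analogue (illustrative, not ggml code):

// indices below the split point ne00 come from src0, the rest from src1
float concat_read_1d(const float *src0, int ne00, const float *src1, int i0) {
    return (i0 < ne00) ? src0[i0] : src1[i0 - ne00];
}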
< static void ggml_compute_forward_concat_i8( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int32_t dim = ggml_get_op_params_i32(dst, 0); | |
< | |
< GGML_ASSERT(dim >= 0 && dim < 4); | |
< | |
< int64_t o[4] = {0, 0, 0, 0}; | |
< o[dim] = src0->ne[dim]; | |
< | |
< const int8_t * x; | |
< | |
<     // TODO: smarter multi-threading | |
< for (int i3 = 0; i3 < ne3; i3++) { | |
< for (int i2 = ith; i2 < ne2; i2 += nth) { | |
< for (int i1 = 0; i1 < ne1; i1++) { | |
< for (int i0 = 0; i0 < ne0; i0++) { | |
< if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { | |
< x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); | |
< } else { | |
< x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); | |
< } | |
< | |
< int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); | |
< | |
< *y = *x; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_concat_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int32_t dim = ggml_get_op_params_i32(dst, 0); | |
< | |
< GGML_ASSERT(dim >= 0 && dim < 4); | |
< | |
< int64_t o[4] = {0, 0, 0, 0}; | |
< o[dim] = src0->ne[dim]; | |
< | |
< const ggml_fp16_t * x; | |
< | |
<     // TODO: smarter multi-threading | |
< for (int i3 = 0; i3 < ne3; i3++) { | |
< for (int i2 = ith; i2 < ne2; i2 += nth) { | |
< for (int i1 = 0; i1 < ne1; i1++) { | |
< for (int i0 = 0; i0 < ne0; i0++) { | |
< if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { | |
< x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); | |
< } else { | |
< x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); | |
< } | |
< | |
< ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); | |
< | |
< *y = *x; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_concat_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int32_t dim = ggml_get_op_params_i32(dst, 0); | |
< | |
< GGML_ASSERT(dim >= 0 && dim < 4); | |
< | |
< int64_t o[4] = {0, 0, 0, 0}; | |
< o[dim] = src0->ne[dim]; | |
< | |
< const float * x; | |
< | |
<     // TODO: smarter multi-threading | |
< for (int i3 = 0; i3 < ne3; i3++) { | |
< for (int i2 = ith; i2 < ne2; i2 += nth) { | |
< for (int i1 = 0; i1 < ne1; i1++) { | |
< for (int i0 = 0; i0 < ne0; i0++) { | |
< if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { | |
< x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); | |
< } else { | |
< x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); | |
< } | |
< | |
< float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); | |
< | |
< *y = *x; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_concat( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< case GGML_TYPE_BF16: | |
< case GGML_TYPE_I16: | |
< { | |
< ggml_compute_forward_concat_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_I8: | |
< { | |
< ggml_compute_forward_concat_i8(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< case GGML_TYPE_I32: | |
< { | |
< ggml_compute_forward_concat_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< ggml_compute_forward_concat_any(params, dst); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_gelu | |
< | |
< static void ggml_compute_forward_gelu_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_gelu_f32(nc, | |
< (float *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (float *) ((char *) src0->data + i1*(src0->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; | |
< UNUSED(x); | |
< assert(!isnan(x)); | |
< assert(!isinf(x)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_gelu_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_gelu_f16(nc, | |
< (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; | |
< const float v = GGML_FP16_TO_FP32(x); | |
< UNUSED(v); | |
< assert(!isnan(v)); | |
< assert(!isinf(v)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_gelu( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_gelu_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_gelu_f16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
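The row helpers ggml_vec_gelu_f32/f16 are defined elsewhere (vec.h); for reference, the standard GELU they are expected to evaluate, together with its common tanh approximation, is

\[
\operatorname{GELU}(x) = x\,\Phi(x)
  = \frac{x}{2}\left(1 + \operatorname{erf}\!\left(\frac{x}{\sqrt{2}}\right)\right)
  \approx \frac{x}{2}\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right).
\]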
< // ggml_compute_forward_gelu_quick | |
< | |
< static void ggml_compute_forward_gelu_quick_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_gelu_quick_f32(nc, | |
< (float *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (float *) ((char *) src0->data + i1*(src0->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; | |
< UNUSED(x); | |
< assert(!isnan(x)); | |
< assert(!isinf(x)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_gelu_quick_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_gelu_quick_f16(nc, | |
< (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; | |
< const float v = GGML_FP16_TO_FP32(x); | |
< UNUSED(v); | |
< assert(!isnan(v)); | |
< assert(!isinf(v)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_gelu_quick( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_gelu_quick_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_gelu_quick_f16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_silu | |
< | |
< static void ggml_compute_forward_silu_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_silu_f32(nc, | |
< (float *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (float *) ((char *) src0->data + i1*(src0->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k]; | |
< UNUSED(x); | |
< assert(!isnan(x)); | |
< assert(!isinf(x)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_silu_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_silu_f16(nc, | |
< (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k]; | |
< const float v = GGML_FP16_TO_FP32(x); | |
< UNUSED(v); | |
< assert(!isnan(v)); | |
< assert(!isinf(v)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_silu( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_silu_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_silu_f16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
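ggml_vec_silu_f32/f16 (also defined in vec.h) compute the SiLU activation, i.e. per element

\[
\operatorname{SiLU}(x) = x\,\sigma(x) = \frac{x}{1 + e^{-x}}.
\]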
< | |
< // ggml_compute_forward_leaky_relu | |
< | |
< static void ggml_compute_forward_leaky_relu_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< | |
< float negative_slope; | |
< memcpy(&negative_slope, dst->op_params, sizeof(float)); | |
< | |
< assert(dst->nb[0] == sizeof(float)); | |
< assert(src0->nb[0] == sizeof(float)); | |
< | |
< for (int i = 0; i < n; i++) { | |
< ggml_vec_leaky_relu_f32(nc, | |
< (float *) ((char *) dst->data + i*( dst->nb[1])), | |
< (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_leaky_relu_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< | |
< float negative_slope; | |
< memcpy(&negative_slope, dst->op_params, sizeof(float)); | |
< | |
< assert(dst->nb[0] == sizeof(ggml_fp16_t)); | |
< assert(src0->nb[0] == sizeof(ggml_fp16_t)); | |
< | |
< for (int i = 0; i < n; i++) { | |
< ggml_vec_leaky_relu_f16(nc, | |
< (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), | |
< (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])), negative_slope); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_leaky_relu( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_leaky_relu_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_leaky_relu_f16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
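The leaky-ReLU kernels read negative_slope (α below) from op_params and hand it to ggml_vec_leaky_relu_*; the standard per-element definition this corresponds to is

\[
\operatorname{leaky\_relu}(x) =
\begin{cases}
x, & x > 0,\\
\alpha x, & x \le 0.
\end{cases}
\]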
< // ggml_compute_forward_silu_back | |
< | |
< static void ggml_compute_forward_silu_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * grad = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< assert(ggml_is_contiguous_1(grad)); | |
< assert(ggml_is_contiguous_1(src1)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src1, dst)); | |
< assert(ggml_are_same_shape(src1, grad)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src1->ne[0]; | |
< const int nr = ggml_nrows(src1); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_silu_backward_f32(nc, | |
< (float *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (float *) ((char *) src1->data + i1*(src1->nb[1])), | |
< (float *) ((char *) grad->data + i1*(grad->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
< const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; | |
< UNUSED(x); | |
< assert(!isnan(x)); | |
< assert(!isinf(x)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_silu_back_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * grad = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< assert(ggml_is_contiguous_1(grad)); | |
< assert(ggml_is_contiguous_1(src1)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src1, dst)); | |
< assert(ggml_are_same_shape(src1, grad)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src1->ne[0]; | |
< const int nr = ggml_nrows(src1); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< ggml_vec_silu_backward_f16(nc, | |
< (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), | |
< (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])), | |
< (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1]))); | |
< | |
< #ifndef NDEBUG | |
< for (int k = 0; k < nc; k++) { | |
<             const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; | |
< const float v = GGML_FP16_TO_FP32(x); | |
< UNUSED(v); | |
< assert(!isnan(v)); | |
< assert(!isinf(v)); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_silu_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_silu_back_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_silu_back_f16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
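silu_back scales the incoming gradient by the derivative of SiLU; writing σ for the logistic sigmoid, the per-element rule that ggml_vec_silu_backward_* corresponds to is

\[
\frac{d}{dx}\bigl(x\,\sigma(x)\bigr) = \sigma(x)\bigl(1 + x\,(1 - \sigma(x))\bigr),
\qquad
dx_i = dz_i\,\sigma(x_i)\bigl(1 + x_i\,(1 - \sigma(x_i))\bigr),
\]

with dz taken from dst->src[0] (grad) and x from dst->src[1] (src1) in the kernels above.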
< // ggml_compute_forward_norm | |
< | |
< static void ggml_compute_forward_norm_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< float eps; | |
< memcpy(&eps, dst->op_params, sizeof(float)); | |
< | |
< GGML_ASSERT(eps >= 0.0f); | |
< | |
< // TODO: optimize | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ith; i01 < ne01; i01 += nth) { | |
< const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< ggml_float sum = 0.0; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< sum += (ggml_float)x[i00]; | |
< } | |
< | |
< float mean = sum/ne00; | |
< | |
< float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); | |
< | |
< ggml_float sum2 = 0.0; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< float v = x[i00] - mean; | |
< y[i00] = v; | |
< sum2 += (ggml_float)(v*v); | |
< } | |
< | |
< float variance = sum2/ne00; | |
< const float scale = 1.0f/sqrtf(variance + eps); | |
< | |
< ggml_vec_scale_f32(ne00, y, scale); | |
< } | |
< } | |
< } | |
< } | |
< | |
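In formula form, norm_f32 above normalizes each row of length N = ne00 to zero mean and unit variance:

\[
\mu = \frac{1}{N}\sum_{i=1}^{N} x_i,\qquad
\sigma^2 = \frac{1}{N}\sum_{i=1}^{N}\left(x_i - \mu\right)^2,\qquad
y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}}.
\]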
< static void ggml_compute_forward_norm( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_norm_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_rms_norm | |
< | |
< static void ggml_compute_forward_rms_norm_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< float eps; | |
< memcpy(&eps, dst->op_params, sizeof(float)); | |
< | |
< GGML_ASSERT(eps >= 0.0f); | |
< | |
< // TODO: optimize | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ith; i01 < ne01; i01 += nth) { | |
< const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< ggml_float sum = 0.0; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< sum += (ggml_float)(x[i00] * x[i00]); | |
< } | |
< | |
< const float mean = sum/ne00; | |
< | |
< float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); | |
< | |
< memcpy(y, x, ne00 * sizeof(float)); | |
< // for (int i00 = 0; i00 < ne00; i00++) { | |
< // y[i00] = x[i00]; | |
< // } | |
< | |
< const float scale = 1.0f/sqrtf(mean + eps); | |
< | |
< ggml_vec_scale_f32(ne00, y, scale); | |
< } | |
< } | |
< } | |
< } | |
< | |
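rms_norm_f32 skips the mean subtraction and instead scales each row by its root mean square:

\[
y_i = \frac{x_i}{\sqrt{\dfrac{1}{N}\sum_{j=1}^{N} x_j^2 + \varepsilon}},\qquad N = \texttt{ne00}.
\]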
< static void ggml_compute_forward_rms_norm( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_rms_norm_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_rms_norm_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output | |
< const struct ggml_tensor * src1 = dst->src[1]; // src1 from forward pass | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src1->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< float eps; | |
< memcpy(&eps, dst->op_params, sizeof(float)); | |
< | |
< // TODO: optimize | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ith; i01 < ne01; i01 += nth) { | |
< // src1 is same shape as src0 => same indices | |
< const int64_t i11 = i01; | |
< const int64_t i12 = i02; | |
< const int64_t i13 = i03; | |
< | |
< const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); | |
< | |
< ggml_float sum_xx = 0.0; | |
< ggml_float sum_xdz = 0.0; | |
< | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< sum_xx += (ggml_float)(x[i00] * x[i00]); | |
< sum_xdz += (ggml_float)(x[i00] * dz[i00]); | |
< } | |
< | |
< //const float mean = (float)(sum_xx)/ne00; | |
< const float mean_eps = (float)(sum_xx)/ne00 + eps; | |
< const float sum_eps = (float)(sum_xx) + eps*ne00; | |
< //const float mean_xdz = (float)(sum_xdz)/ne00; | |
< // we could cache rms from forward pass to improve performance. | |
< // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. | |
< //const float rms = sqrtf(mean_eps); | |
< const float rrms = 1.0f / sqrtf(mean_eps); | |
< //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) | |
< | |
< { | |
< // z = rms_norm(x) | |
< // | |
< // rms_norm(src1) = | |
< // scale( | |
< // src1, | |
< // div( | |
< // 1, | |
< // sqrt( | |
< // add( | |
< // scale( | |
< // sum( | |
< // sqr( | |
< // src1)), | |
< // (1.0/N)), | |
< // eps)))); | |
< | |
< // postorder: | |
< // ## op args grad | |
< // 00 param src1 grad[#00] | |
< // 01 const 1 | |
< // 02 sqr (#00) grad[#02] | |
< // 03 sum (#02) grad[#03] | |
< // 04 const 1/N | |
< // 05 scale (#03, #04) grad[#05] | |
< // 06 const eps | |
< // 07 add (#05, #06) grad[#07] | |
< // 08 sqrt (#07) grad[#08] | |
< // 09 div (#01,#08) grad[#09] | |
< // 10 scale (#00,#09) grad[#10] | |
< // | |
< // backward pass, given grad[#10] | |
< // #10: scale | |
< // grad[#00] += scale(grad[#10],#09) | |
< // grad[#09] += sum(mul(grad[#10],#00)) | |
< // #09: div | |
< // grad[#08] += neg(mul(grad[#09], div(#09,#08))) | |
< // #08: sqrt | |
< // grad[#07] += mul(grad[#08], div(0.5, #08)) | |
< // #07: add | |
< // grad[#05] += grad[#07] | |
< // #05: scale | |
< // grad[#03] += scale(grad[#05],#04) | |
< // #03: sum | |
< // grad[#02] += repeat(grad[#03], #02) | |
< // #02: | |
< // grad[#00] += scale(mul(#00, grad[#02]), 2.0) | |
< // | |
< // substitute and simplify: | |
< // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) | |
< // grad[#02] = repeat(grad[#03], #02) | |
< // grad[#02] = repeat(scale(grad[#05],#04), #02) | |
< // grad[#02] = repeat(scale(grad[#07],#04), #02) | |
< // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02) | |
< // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02) | |
< // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02) | |
< // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02) | |
< // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02) | |
< // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02) | |
< // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02) | |
< // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0) | |
< // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0) | |
< // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N))) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N)) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N)) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps)) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps))) | |
< // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps)) | |
< // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps)) | |
< // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps)) | |
< // a = b*c + d*e | |
< // a = b*c*f/f + d*e*f/f | |
< // a = (b*c*f + d*e*f)*(1/f) | |
< // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c)) | |
< // a = (b + d*e/c)*c | |
< // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps) | |
< // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms | |
< // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms | |
< // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms | |
< // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms | |
< // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms | |
< // a = (dz + x*div(-mean_xdz,mean_eps))*rrms | |
< // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms) | |
< // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) | |
< // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) | |
< } | |
< // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms) | |
< // post-order: | |
< // dx := x | |
< // dx := scale(dx,-mean_xdz/mean_eps) | |
< // dx := add(dx, dz) | |
< // dx := scale(dx, rrms) | |
< float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); | |
< | |
< // dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps) | |
< ggml_vec_cpy_f32 (ne00, dx, x); | |
< // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); | |
< ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); | |
< ggml_vec_acc_f32 (ne00, dx, dz); | |
< ggml_vec_scale_f32(ne00, dx, rrms); | |
< } | |
< } | |
< } | |
< } | |
< | |
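As a compact cross-check of the long derivation comment above (a restatement of what the code computes, nothing new): with

$$ m_\varepsilon = \frac{1}{N}\sum_i x_i^2 + \varepsilon, \qquad \text{rrms} = \frac{1}{\sqrt{m_\varepsilon}}, $$

the RMS-norm backward pass reduces to

$$ dx = \text{rrms}\left( dz - x\,\frac{\langle x, dz\rangle}{N\, m_\varepsilon} \right), $$

which is exactly what the four vector calls compute: dx = (x * (-sum_xdz / sum_eps) + dz) * rrms, using sum_eps = sum_xx + eps*N = N * m_eps.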
< static void ggml_compute_forward_rms_norm_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_rms_norm_back_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_group_norm | |
< | |
< static void ggml_compute_forward_group_norm_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< // TODO: optimize | |
< | |
< float eps; | |
< memcpy(&eps, dst->op_params + 1, sizeof(float)); | |
< | |
< int n_channels = src0->ne[2]; | |
< int n_groups = dst->op_params[0]; | |
< int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; | |
< for (int i = ith; i < n_groups; i += nth) { | |
< int start = i * n_channels_per_group; | |
< int end = start + n_channels_per_group; | |
< if (end > n_channels) { | |
< end = n_channels; | |
< } | |
< int step = end - start; | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< ggml_float sum = 0.0; | |
< for (int64_t i02 = start; i02 < end; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); | |
< | |
< ggml_float sumr = 0.0; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< sumr += (ggml_float)x[i00]; | |
< } | |
< sum += sumr; | |
< } | |
< } | |
< const float mean = sum / (ne00 * ne01 * step); | |
< | |
< ggml_float sum2 = 0.0; | |
< for (int64_t i02 = start; i02 < end; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); | |
< | |
< float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); | |
< | |
< ggml_float sumr = 0.0; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< float v = x[i00] - mean; | |
< y[i00] = v; | |
< sumr += (ggml_float)(v * v); | |
< } | |
< sum2 += sumr; | |
< } | |
< } | |
< const float variance = sum2 / (ne00 * ne01 * step); | |
< const float scale = 1.0f / sqrtf(variance + eps); | |
< | |
< for (int64_t i02 = start; i02 < end; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); | |
< ggml_vec_scale_f32(ne00, y, scale); | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
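For reference, the two passes above implement standard group normalization (restating the code): for each group g of `step` channels,

$$ y = \frac{x - \mu_g}{\sqrt{\sigma_g^2 + \varepsilon}}, $$

where the mean and variance are taken over all ne00*ne01*step elements of the group, and the final ggml_vec_scale_f32 applies the 1/sqrt(variance + eps) factor to the already-centered values stored in dst.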
< static void ggml_compute_forward_group_norm( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_group_norm_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_l2_norm | |
< | |
< static void ggml_compute_forward_l2_norm_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< float eps; | |
< memcpy(&eps, dst->op_params, sizeof(float)); | |
< | |
< GGML_ASSERT(eps >= 0.0f); | |
< | |
< // TODO: optimize | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = ith; i01 < ne01; i01 += nth) { | |
< const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); | |
< | |
< ggml_float sum = 0.0; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< sum += (ggml_float)(x[i00] * x[i00]); | |
< } | |
< | |
< float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); | |
< | |
< memcpy(y, x, ne00 * sizeof(float)); | |
< | |
< const float scale = 1.0f/fmaxf(sqrtf(sum), eps); | |
< | |
< ggml_vec_scale_f32(ne00, y, scale); | |
< } | |
< } | |
< } | |
< } | |
< | |
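In formula form, the loop above is per-row L2 normalization with an epsilon floor (no behavior beyond the code):

$$ y = \frac{x}{\max\left(\lVert x \rVert_2,\ \varepsilon\right)} . $$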
< static void ggml_compute_forward_l2_norm( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_l2_norm_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
7199,12339d1719 | |
< // ggml_compute_forward_out_prod | |
< | |
< static void ggml_compute_forward_out_prod_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< GGML_ASSERT(dst->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src0->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_ASSERT(ne0 == ne00); | |
< GGML_ASSERT(ne1 == ne10); | |
< GGML_ASSERT(ne2 == ne12); | |
< GGML_ASSERT(ne3 == ne13); | |
< | |
< GGML_ASSERT(ne2 % ne02 == 0); | |
< GGML_ASSERT(ne3 % ne03 == 0); | |
< | |
< // we don't support permuted src0 or src1 | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< // dst cannot be transposed or permuted | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< // GGML_ASSERT(nb0 <= nb1); | |
< // GGML_ASSERT(nb1 <= nb2); | |
< // GGML_ASSERT(nb2 <= nb3); | |
< | |
< // nb01 >= nb00 - src0 is not transposed | |
< // compute by src0 rows | |
< | |
< if (ith == 0) { | |
< ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< // dst[:,:,:,:] = 0 | |
< // for i2,i3: | |
< // for i1: | |
< // for i01: | |
< // for i0: | |
< // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] | |
< | |
< // parallelize by last three dimensions | |
< | |
< // total rows in dst | |
< const int64_t nr = ne1*ne2*ne3; | |
< | |
< // rows per thread | |
< const int64_t dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int64_t ir0 = dr*ith; | |
< const int64_t ir1 = MIN(ir0 + dr, nr); | |
< | |
< // block-tiling attempt | |
< const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); | |
< const int64_t blck_1 = 16; | |
< | |
< // dps == dst per src0, used for group query attention | |
< const int64_t dps2 = ne2 / ne02; | |
< const int64_t dps3 = ne3 / ne03; | |
< | |
< for (int64_t bir = ir0; bir < ir1; bir += blck_1) { | |
< const int64_t bir1 = MIN(bir + blck_1, ir1); | |
< for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { | |
< const int64_t bne01 = MIN(bi01 + blck_0, ne01); | |
< for (int64_t ir = bir; ir < bir1; ++ir) { | |
< // dst indices | |
< const int64_t i3 = ir/(ne2*ne1); | |
< const int64_t i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< const int64_t i02 = i2 / dps2; | |
< const int64_t i03 = i3 / dps3; | |
< | |
< //const int64_t i10 = i1; | |
< const int64_t i12 = i2; | |
< const int64_t i13 = i3; | |
< | |
< #if GGML_VEC_MAD_UNROLL > 2 | |
< const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL); | |
< for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { | |
< const int64_t i11 = i01; | |
< | |
< float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); | |
< float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); | |
< float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); | |
< | |
< ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); | |
< } | |
< for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { | |
< const int64_t i11 = i01; | |
< | |
< float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); | |
< float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); | |
< float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); | |
< | |
< ggml_vec_mad_f32(ne0, d, s0, *s1); | |
< } | |
< #else | |
< for (int64_t i01 = bi01; i01 < bne01; ++i01) { | |
< const int64_t i11 = i01; | |
< | |
< float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); | |
< float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); | |
< float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); | |
< | |
< ggml_vec_mad_f32(ne0, d, s0, *s1); | |
< } | |
< #endif | |
< } | |
< } | |
< } | |
< } | |
< | |
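For readers mapping the tiled loops back to the pseudocode comment above, here is an untiled, single-threaded reference on plain row-major 2-D arrays. It is a hypothetical helper for illustration only, not a ggml API; dst is assumed pre-zeroed, as ggml does with ggml_vec_set_f32 before the tiled loops.

    #include <stdint.h>

    // Hypothetical reference helper (not part of ggml):
    // dst[i1][i0] += src0[i01][i0] * src1[i01][i1], summed over i01
    static void out_prod_ref(int64_t ne0, int64_t ne01, int64_t ne1,
                             const float * src0,   // [ne01][ne0]
                             const float * src1,   // [ne01][ne1]
                             float       * dst) {  // [ne1][ne0], zero-initialized
        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            for (int64_t i01 = 0; i01 < ne01; ++i01) {
                const float s1 = src1[i01*ne1 + i1];
                for (int64_t i0 = 0; i0 < ne0; ++i0) {
                    dst[i1*ne0 + i0] += src0[i01*ne0 + i0] * s1;
                }
            }
        }
    }

The quantized variant that follows uses the same accumulation, except each src0 row is first dequantized into the per-thread wdata buffer before the ggml_vec_mad_f32 call.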
< static void ggml_compute_forward_out_prod_q_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const enum ggml_type type = src0->type; | |
< ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; | |
< | |
< GGML_ASSERT(ne02 == ne12); | |
< GGML_ASSERT(ne03 == ne13); | |
< GGML_ASSERT(ne2 == ne12); | |
< GGML_ASSERT(ne3 == ne13); | |
< | |
< // we don't support permuted src0 dim0 | |
< GGML_ASSERT(nb00 == ggml_type_size(type)); | |
< | |
< // dst dim0 cannot be transposed or permuted | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< // GGML_ASSERT(nb0 <= nb1); | |
< // GGML_ASSERT(nb1 <= nb2); | |
< // GGML_ASSERT(nb2 <= nb3); | |
< | |
< GGML_ASSERT(ne0 == ne00); | |
< GGML_ASSERT(ne1 == ne10); | |
< GGML_ASSERT(ne2 == ne02); | |
< GGML_ASSERT(ne3 == ne03); | |
< | |
< // nb01 >= nb00 - src0 is not transposed | |
< // compute by src0 rows | |
< | |
< if (ith == 0) { | |
< ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< // parallelize by last three dimensions | |
< | |
< // total rows in dst | |
< const int64_t nr = ne1*ne2*ne3; | |
< | |
< // rows per thread | |
< const int64_t dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int64_t ir0 = dr*ith; | |
< const int64_t ir1 = MIN(ir0 + dr, nr); | |
< | |
< // dst[:,:,:,:] = 0 | |
< // for i2,i3: | |
< // for i1: | |
< // for i01: | |
< // for i0: | |
< // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] | |
< | |
< float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; | |
< | |
< for (int64_t ir = ir0; ir < ir1; ++ir) { | |
< // dst indices | |
< const int64_t i3 = ir/(ne2*ne1); | |
< const int64_t i2 = (ir - i3*ne2*ne1)/ne1; | |
< const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); | |
< | |
< const int64_t i02 = i2; | |
< const int64_t i03 = i3; | |
< | |
< //const int64_t i10 = i1; | |
< const int64_t i12 = i2; | |
< const int64_t i13 = i3; | |
< | |
< for (int64_t i01 = 0; i01 < ne01; ++i01) { | |
< const int64_t i11 = i01; | |
< | |
< float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); | |
< float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); | |
< float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); | |
< | |
< dequantize_row_q(s0, wdata, ne0); | |
< ggml_vec_mad_f32(ne0, d, wdata, *s1); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_out_prod( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< { | |
< ggml_compute_forward_out_prod_q_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< GGML_ABORT("fatal error"); // todo | |
< // ggml_compute_forward_out_prod_f16_f32(params, dst); | |
< } | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_out_prod_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_scale | |
< | |
< static void ggml_compute_forward_scale_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(ggml_is_contiguous(src0)); | |
< GGML_ASSERT(ggml_is_contiguous(dst)); | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< | |
< // scale factor | |
< float v; | |
< memcpy(&v, dst->op_params, sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< const size_t nb01 = src0->nb[1]; | |
< | |
< const size_t nb1 = dst->nb[1]; | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< if (dst->data != src0->data) { | |
< // src0 is same shape as dst => same indices | |
< memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); | |
< } | |
< ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_scale( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_scale_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_set | |
< | |
< static void ggml_compute_forward_set_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); | |
< | |
< // view src0 and dst with these strides and data offset in bytes during set | |
< // nb0 is implicitly element_size because src0 and dst are contiguous | |
< size_t nb1 = ((int32_t *) dst->op_params)[0]; | |
< size_t nb2 = ((int32_t *) dst->op_params)[1]; | |
< size_t nb3 = ((int32_t *) dst->op_params)[2]; | |
< size_t offset = ((int32_t *) dst->op_params)[3]; | |
< bool inplace = (bool) ((int32_t *) dst->op_params)[4]; | |
< | |
< if (!inplace) { | |
< if (params->ith == 0) { | |
< // memcpy needs to be synchronized across threads to avoid race conditions. | |
< // => do it in INIT phase | |
< memcpy( | |
< ((char *) dst->data), | |
< ((char *) src0->data), | |
< ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< } | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src1); | |
< const int nc = src1->ne[0]; | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) | |
< | |
< // src0 and dst as viewed during set | |
< const size_t nb0 = ggml_element_size(src0); | |
< | |
< const int im0 = (ne10 == 0 ? 0 : ne10-1); | |
< const int im1 = (ne11 == 0 ? 0 : ne11-1); | |
< const int im2 = (ne12 == 0 ? 0 : ne12-1); | |
< const int im3 = (ne13 == 0 ? 0 : ne13-1); | |
< | |
< GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); | |
< | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are viewed with shape of src1 and offset | |
< // => same indices | |
< const int i3 = ir/(ne12*ne11); | |
< const int i2 = (ir - i3*ne12*ne11)/ne11; | |
< const int i1 = (ir - i3*ne12*ne11 - i2*ne11); | |
< | |
< ggml_vec_cpy_f32(nc, | |
< (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), | |
< (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_set_i32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); | |
< | |
< // view src0 and dst with these strides and data offset in bytes during set | |
< // nb0 is implicitly element_size because src0 and dst are contiguous | |
< size_t nb1 = ((int32_t *) dst->op_params)[0]; | |
< size_t nb2 = ((int32_t *) dst->op_params)[1]; | |
< size_t nb3 = ((int32_t *) dst->op_params)[2]; | |
< size_t offset = ((int32_t *) dst->op_params)[3]; | |
< bool inplace = (bool) ((int32_t *) dst->op_params)[4]; | |
< | |
< if (!inplace) { | |
< if (params->ith == 0) { | |
< // memcpy needs to be synchronized across threads to avoid race conditions. | |
< // => do it in INIT phase | |
< memcpy( | |
< ((char *) dst->data), | |
< ((char *) src0->data), | |
< ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< } | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src1); | |
< const int nc = src1->ne[0]; | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) | |
< | |
< // src0 and dst as viewed during set | |
< const size_t nb0 = ggml_element_size(src0); | |
< | |
< const int im0 = (ne10 == 0 ? 0 : ne10-1); | |
< const int im1 = (ne11 == 0 ? 0 : ne11-1); | |
< const int im2 = (ne12 == 0 ? 0 : ne12-1); | |
< const int im3 = (ne13 == 0 ? 0 : ne13-1); | |
< | |
< GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); | |
< | |
< GGML_ASSERT(nb10 == sizeof(int32_t)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // src0 and dst are viewed with shape of src1 and offset | |
< // => same indices | |
< const int i3 = ir/(ne12*ne11); | |
< const int i2 = (ir - i3*ne12*ne11)/ne11; | |
< const int i1 = (ir - i3*ne12*ne11 - i2*ne11); | |
< | |
< ggml_vec_cpy_i32(nc, | |
< (int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), | |
< (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_set( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_set_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_I32: | |
< { | |
< ggml_compute_forward_set_i32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< case GGML_TYPE_BF16: | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q8_1: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_cpy | |
< | |
< static void ggml_compute_forward_cpy( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< ggml_compute_forward_dup(params, dst); | |
< } | |
< | |
< // ggml_compute_forward_cont | |
< | |
< static void ggml_compute_forward_cont( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< ggml_compute_forward_dup(params, dst); | |
< } | |
< | |
< // ggml_compute_forward_reshape | |
< | |
< static void ggml_compute_forward_reshape( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< // NOP | |
< UNUSED(params); | |
< UNUSED(dst); | |
< } | |
< | |
< // ggml_compute_forward_view | |
< | |
< static void ggml_compute_forward_view( | |
< const struct ggml_compute_params * params, | |
< const struct ggml_tensor * dst) { | |
< // NOP | |
< UNUSED(params); | |
< UNUSED(dst); | |
< } | |
< | |
< // ggml_compute_forward_permute | |
< | |
< static void ggml_compute_forward_permute( | |
< const struct ggml_compute_params * params, | |
< const struct ggml_tensor * dst) { | |
< // NOP | |
< UNUSED(params); | |
< UNUSED(dst); | |
< } | |
< | |
< // ggml_compute_forward_transpose | |
< | |
< static void ggml_compute_forward_transpose( | |
< const struct ggml_compute_params * params, | |
< const struct ggml_tensor * dst) { | |
< // NOP | |
< UNUSED(params); | |
< UNUSED(dst); | |
< } | |
< | |
< // ggml_compute_forward_get_rows | |
< | |
< static void ggml_compute_forward_get_rows_q( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int64_t nc = ne00; | |
< const int64_t nr = ggml_nelements(src1); | |
< | |
< const enum ggml_type type = src0->type; | |
< ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float; | |
< | |
< assert(ne0 == nc); | |
< assert(ne02 == ne11); | |
< assert(nb00 == ggml_type_size(type)); | |
< assert(ggml_nrows(dst) == nr); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t i = ir0; i < ir1; ++i) { | |
< const int64_t i12 = i/(ne11*ne10); | |
< const int64_t i11 = (i - i12*ne11*ne10)/ne10; | |
< const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); | |
< const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); | |
< | |
< GGML_ASSERT(i01 >= 0 && i01 < ne01); | |
< | |
< dequantize_row_q( | |
< (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), | |
< (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rows_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int64_t nc = ne00; | |
< const int64_t nr = ggml_nelements(src1); | |
< | |
< assert(ne0 == nc); | |
< assert(ne02 == ne11); | |
< assert(nb00 == sizeof(ggml_fp16_t)); | |
< assert(ggml_nrows(dst) == nr); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t i = ir0; i < ir1; ++i) { | |
< const int64_t i12 = i/(ne11*ne10); | |
< const int64_t i11 = (i - i12*ne11*ne10)/ne10; | |
< const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); | |
< const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); | |
< | |
< GGML_ASSERT(i01 >= 0 && i01 < ne01); | |
< | |
< ggml_fp16_to_fp32_row( | |
< (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), | |
< (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rows_bf16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int64_t nc = ne00; | |
< const int64_t nr = ggml_nelements(src1); | |
< | |
< assert(ne0 == nc); | |
< assert(ne02 == ne11); | |
< assert(nb00 == sizeof(ggml_bf16_t)); | |
< assert(ggml_nrows(dst) == nr); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t i = ir0; i < ir1; ++i) { | |
< const int64_t i12 = i/(ne11*ne10); | |
< const int64_t i11 = (i - i12*ne11*ne10)/ne10; | |
< const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); | |
< const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); | |
< | |
< GGML_ASSERT(i01 >= 0 && i01 < ne01); | |
< | |
< ggml_bf16_to_fp32_row( | |
< (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), | |
< (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rows_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int64_t nc = ne00; | |
< const int64_t nr = ggml_nelements(src1); | |
< | |
< assert(ne0 == nc); | |
< assert(ne02 == ne11); | |
< assert(nb00 == sizeof(float)); | |
< assert(ggml_nrows(dst) == nr); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t i = ir0; i < ir1; ++i) { | |
< const int64_t i12 = i/(ne11*ne10); | |
< const int64_t i11 = (i - i12*ne11*ne10)/ne10; | |
< const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); | |
< const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); | |
< | |
< GGML_ASSERT(i01 >= 0 && i01 < ne01); | |
< | |
< ggml_vec_cpy_f32(nc, | |
< (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), | |
< (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rows( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q8_1: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< { | |
< ggml_compute_forward_get_rows_q(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_get_rows_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_BF16: | |
< { | |
< ggml_compute_forward_get_rows_bf16(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< case GGML_TYPE_I32: | |
< { | |
< ggml_compute_forward_get_rows_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< | |
< //static bool first = true; | |
< //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); | |
< //if (first) { | |
< // first = false; | |
< //} else { | |
< // for (int k = 0; k < dst->ne[1]; ++k) { | |
< // for (int j = 0; j < dst->ne[0]/16; ++j) { | |
< // for (int i = 0; i < 16; ++i) { | |
< // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); | |
< // } | |
< // printf("\n"); | |
< // } | |
< // printf("\n"); | |
< // } | |
< // printf("\n"); | |
< // exit(0); | |
< //} | |
< } | |
< | |
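All four get_rows variants share the same scheme: every element of src1 is a row id into src0, and the addressed row is copied (or dequantized) into the corresponding dst row. A minimal plain-C sketch of the F32 path on contiguous row-major data, given as a hypothetical illustration rather than a ggml API:

    #include <stdint.h>
    #include <string.h>

    // Hypothetical gather helper (not part of ggml)
    static void get_rows_ref(int64_t nc, int64_t nr,
                             const float   * src0,   // [n_src_rows][nc]
                             const int32_t * rows,   // [nr] row ids
                             float         * dst) {  // [nr][nc]
        for (int64_t i = 0; i < nr; ++i) {
            memcpy(dst + i*nc, src0 + (int64_t) rows[i]*nc, nc*sizeof(float));
        }
    }

The real kernels additionally handle the extra batch dimensions (i11, i12) and strided (nb*) layouts.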
< // ggml_compute_forward_get_rows_back | |
< | |
< static void ggml_compute_forward_get_rows_back_f32_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< GGML_ASSERT(ggml_is_contiguous(dst)); | |
< | |
< // ggml_compute_forward_dup_same_cont(params, opt0, dst); | |
< | |
< memset(dst->data, 0, ggml_nbytes(dst)); | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nelements(src1); | |
< | |
< GGML_ASSERT( dst->ne[0] == nc); | |
< GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); | |
< | |
< for (int i = 0; i < nr; ++i) { | |
< const int r = ((int32_t *) src1->data)[i]; | |
< | |
< for (int j = 0; j < nc; ++j) { | |
< ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; | |
< ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rows_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< GGML_ASSERT(ggml_is_contiguous(dst)); | |
< | |
< // ggml_compute_forward_dup_same_cont(params, opt0, dst); | |
< | |
< memset(dst->data, 0, ggml_nbytes(dst)); | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nelements(src1); | |
< | |
< GGML_ASSERT( dst->ne[0] == nc); | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< for (int i = 0; i < nr; ++i) { | |
< const int r = ((int32_t *) src1->data)[i]; | |
< | |
< ggml_vec_add_f32(nc, | |
< (float *) ((char *) dst->data + r*dst->nb[1]), | |
< (float *) ((char *) dst->data + r*dst->nb[1]), | |
< (float *) ((char *) src0->data + i*src0->nb[1])); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rows_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_get_rows_back_f32_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_get_rows_back_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< | |
< //static bool first = true; | |
< //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); | |
< //if (first) { | |
< // first = false; | |
< //} else { | |
< // for (int k = 0; k < dst->ne[1]; ++k) { | |
< // for (int j = 0; j < dst->ne[0]/16; ++j) { | |
< // for (int i = 0; i < 16; ++i) { | |
< // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); | |
< // } | |
< // printf("\n"); | |
< // } | |
< // printf("\n"); | |
< // } | |
< // printf("\n"); | |
< // exit(0); | |
< //} | |
< } | |
< | |
< // ggml_compute_forward_diag | |
< | |
< static void ggml_compute_forward_diag_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< // TODO: handle transposed/permuted matrices | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(ne00 == ne0); | |
< GGML_ASSERT(ne00 == ne1); | |
< GGML_ASSERT(ne01 == 1); | |
< GGML_ASSERT(ne02 == ne2); | |
< GGML_ASSERT(ne03 == ne3); | |
< | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< | |
< for (int i3 = 0; i3 < ne3; i3++) { | |
< for (int i2 = 0; i2 < ne2; i2++) { | |
< for (int i1 = 0; i1 < ne1; i1++) { | |
< float * d = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); | |
< float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02); | |
< for (int i0 = 0; i0 < i1; i0++) { | |
< d[i0] = 0; | |
< } | |
< d[i1] = s[i1]; | |
< for (int i0 = i1+1; i0 < ne0; i0++) { | |
< d[i0] = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_diag( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_diag_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_diag_mask_inf | |
< | |
< static void ggml_compute_forward_diag_mask_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const float value) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int n_past = ((int32_t *) dst->op_params)[0]; | |
< const bool inplace = src0->data == dst->data; | |
< | |
< GGML_ASSERT(n_past >= 0); | |
< | |
< if (!inplace) { | |
< if (ith == 0) { | |
< // memcpy needs to be synchronized across threads to avoid race conditions. | |
< // => do it in INIT phase | |
< GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | |
< GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); | |
< memcpy( | |
< ((char *) dst->data), | |
< ((char *) src0->data), | |
< ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< } | |
< | |
< // TODO: handle transposed/permuted matrices | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< const int nr = src0->ne[1]; | |
< const int nz = n/nr; | |
< | |
< GGML_ASSERT( dst->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< for (int k = 0; k < nz; k++) { | |
< for (int j = ith; j < nr; j += nth) { | |
< for (int i = n_past; i < nc; i++) { | |
< if (i > n_past + j) { | |
< *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
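A small worked example of the masking condition `i > n_past + j` above (restating the loop, no new behavior): with n_past = 1 and a 3x4 row block, the entries overwritten with `value` are marked X and everything else is left untouched:

    row 0:  .  .  X  X
    row 1:  .  .  .  X
    row 2:  .  .  .  .

The two wrappers below pass value = -INFINITY (a shifted causal mask) and value = 0, respectively.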
< static void ggml_compute_forward_diag_mask_inf( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_diag_mask_f32(params, dst, -INFINITY); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_diag_mask_zero( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_diag_mask_f32(params, dst, 0); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_soft_max | |
< | |
< static void ggml_compute_forward_soft_max_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< assert(ggml_is_contiguous(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< float scale = 1.0f; | |
< float max_bias = 0.0f; | |
< | |
< memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); | |
< memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); | |
< | |
< // TODO: handle transposed/permuted matrices | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< //const int64_t ne11 = src1 ? src1->ne[1] : 1; | |
< | |
< // TODO: is this supposed to be ceil instead of floor? | |
< // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 | |
< const uint32_t n_head = ne02; | |
< const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); | |
< | |
< const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); | |
< const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; | |
< | |
< const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< // ALiBi | |
< const uint32_t h = (i1/ne01)%ne02; // head | |
< const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; | |
< | |
< float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); | |
< float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); | |
< | |
< // broadcast the mask across rows | |
< ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; | |
< float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; | |
< | |
< ggml_vec_cpy_f32 (nc, wp, sp); | |
< ggml_vec_scale_f32(nc, wp, scale); | |
< if (mp_f32) { | |
< if (use_f16) { | |
< for (int i = 0; i < nc; ++i) { | |
< wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); | |
< } | |
< } else { | |
< for (int i = 0; i < nc; ++i) { | |
< wp[i] += slope*mp_f32[i]; | |
< } | |
< } | |
< } | |
< | |
< #ifndef NDEBUG | |
< for (int i = 0; i < nc; ++i) { | |
< //printf("p[%d] = %f\n", i, p[i]); | |
< assert(!isnan(wp[i])); | |
< } | |
< #endif | |
< | |
< float max = -INFINITY; | |
< ggml_vec_max_f32(nc, &max, wp); | |
< | |
< ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max); | |
< assert(sum > 0.0); | |
< | |
< sum = 1.0/sum; | |
< ggml_vec_scale_f32(nc, dp, sum); | |
< | |
< #ifndef NDEBUG | |
< for (int i = 0; i < nc; ++i) { | |
< assert(!isnan(dp[i])); | |
< assert(!isinf(dp[i])); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
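The two powf lines above implement the usual ALiBi slope schedule (a restatement, following the mpt reference linked in the comment): for head index h and alpha = max_bias,

$$ \text{slope}(h) = \begin{cases} m_0^{\,h+1}, & h < n_{\text{head\_log2}} \\ m_1^{\,2(h - n_{\text{head\_log2}}) + 1}, & \text{otherwise} \end{cases} \qquad m_0 = 2^{-\alpha/n_{\text{head\_log2}}},\quad m_1 = 2^{-\alpha/(2\,n_{\text{head\_log2}})}, $$

after which each row is the numerically stable softmax of scale*x + slope*mask, i.e. exp(w_i - max_j w_j) normalized to sum to 1.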
< static void ggml_compute_forward_soft_max( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_soft_max_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< | |
< // ggml_compute_forward_soft_max_ext_back | |
< | |
< static void ggml_compute_forward_soft_max_ext_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(ggml_is_contiguous(src0)); | |
< GGML_ASSERT(ggml_is_contiguous(src1)); | |
< GGML_ASSERT(ggml_is_contiguous(dst)); | |
< GGML_ASSERT(ggml_are_same_shape(src0, dst)); | |
< GGML_ASSERT(ggml_are_same_shape(src1, dst)); | |
< | |
< float scale = 1.0f; | |
< float max_bias = 0.0f; | |
< | |
< memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); | |
< memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); | |
< | |
< GGML_ASSERT(max_bias == 0.0f); | |
< | |
< // TODO: handle transposed/permuted matrices | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src0->ne[0]; | |
< const int nr = ggml_nrows(src0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); | |
< float *y = (float *)((char *) src1->data + i1*src1->nb[1]); | |
< float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); | |
< | |
< #ifndef NDEBUG | |
< for (int i = 0; i < nc; ++i) { | |
< //printf("p[%d] = %f\n", i, p[i]); | |
< assert(!isnan(dy[i])); | |
< assert(!isnan(y[i])); | |
< } | |
< #endif | |
< // Jii = yi - yi*yi | |
< // Jij = -yi*yj | |
< // J = diag(y)-y.T*y | |
< // dx = J * dy | |
< // dxk = sum_i(Jki * dyi) | |
< // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk | |
< // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk | |
< // dxk = sum_i(-yk*yi * dyi) + yk*dyk | |
< // dxk = -yk * sum_i(yi * dyi) + yk*dyk | |
< // dxk = -yk * dot(y, dy) + yk*dyk | |
< // dxk = yk * (- dot(y, dy) + dyk) | |
< // dxk = yk * (dyk - dot(y, dy)) | |
< // | |
< // post-order: | |
< // dot_y_dy := dot(y, dy) | |
< // dx := dy | |
< // dx := dx - dot_y_dy | |
< // dx := dx * y | |
< | |
< // linear runtime, no additional memory | |
< float dot_y_dy = 0; | |
< ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); | |
< ggml_vec_cpy_f32 (nc, dx, dy); | |
< ggml_vec_acc1_f32 (nc, dx, -dot_y_dy); | |
< ggml_vec_mul_f32 (nc, dx, dx, y); | |
< ggml_vec_scale_f32(nc, dx, scale); | |
< | |
< #ifndef NDEBUG | |
< for (int i = 0; i < nc; ++i) { | |
< assert(!isnan(dx[i])); | |
< assert(!isinf(dx[i])); | |
< } | |
< #endif | |
< } | |
< } | |
< | |
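In one line, the Jacobian derivation in the comment above gives, for y = softmax(x), incoming gradient dy, and forward scale s:

$$ dx_k = s\, y_k \left( dy_k - \langle y, dy \rangle \right), $$

which is exactly the sequence of ggml_vec_* calls: dot(y, dy), copy dy into dx, subtract the scalar dot product, multiply elementwise by y, then scale by s.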
< static void ggml_compute_forward_soft_max_ext_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_soft_max_ext_back_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_clamp | |
< | |
< static void ggml_compute_forward_clamp_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< float min; | |
< float max; | |
< memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); | |
< memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< | |
< const size_t nb00 = src0->nb[0]; | |
< const size_t nb01 = src0->nb[1]; | |
< | |
< const size_t nb0 = dst->nb[0]; | |
< const size_t nb1 = dst->nb[1]; | |
< | |
< GGML_ASSERT( nb0 == sizeof(float)); | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< for (int j = ith; j < n; j += nth) { | |
< float * dst_ptr = (float *) ((char *) dst->data + j*nb1); | |
< float * src0_ptr = (float *) ((char *) src0->data + j*nb01); | |
< | |
< for (int i = 0; i < nc; i++) { | |
< dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_clamp_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< float min; | |
< float max; | |
< memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); | |
< memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< | |
< const size_t nb00 = src0->nb[0]; | |
< const size_t nb01 = src0->nb[1]; | |
< | |
< const size_t nb0 = dst->nb[0]; | |
< const size_t nb1 = dst->nb[1]; | |
< | |
< GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< | |
< for (int j = ith; j < n; j += nth) { | |
< ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); | |
< ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); | |
< | |
< for (int i = 0; i < nc; i++) { | |
< float v = GGML_FP16_TO_FP32(src0_ptr[i]); | |
< dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min)); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_clamp( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_clamp_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_clamp_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_BF16: | |
< case GGML_TYPE_Q4_0: | |
< case GGML_TYPE_Q4_1: | |
< case GGML_TYPE_Q5_0: | |
< case GGML_TYPE_Q5_1: | |
< case GGML_TYPE_Q8_0: | |
< case GGML_TYPE_Q8_1: | |
< case GGML_TYPE_Q2_K: | |
< case GGML_TYPE_Q3_K: | |
< case GGML_TYPE_Q4_K: | |
< case GGML_TYPE_Q5_K: | |
< case GGML_TYPE_Q6_K: | |
< case GGML_TYPE_TQ1_0: | |
< case GGML_TYPE_TQ2_0: | |
< case GGML_TYPE_IQ2_XXS: | |
< case GGML_TYPE_IQ2_XS: | |
< case GGML_TYPE_IQ3_XXS: | |
< case GGML_TYPE_IQ1_S: | |
< case GGML_TYPE_IQ1_M: | |
< case GGML_TYPE_IQ4_NL: | |
< case GGML_TYPE_IQ4_XS: | |
< case GGML_TYPE_IQ3_S: | |
< case GGML_TYPE_IQ2_S: | |
< case GGML_TYPE_Q8_K: | |
< case GGML_TYPE_I8: | |
< case GGML_TYPE_I16: | |
< case GGML_TYPE_I32: | |
< case GGML_TYPE_I64: | |
< case GGML_TYPE_F64: | |
< case GGML_TYPE_COUNT: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_rope | |
< | |
< static float rope_yarn_ramp(const float low, const float high, const int i0) { | |
< const float y = (i0 / 2 - low) / MAX(0.001f, high - low); | |
< return 1 - MIN(1, MAX(0, y)); | |
< } | |
< | |
< // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn | |
< // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. | |
< static void rope_yarn( | |
< float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, | |
< float * cos_theta, float * sin_theta) { | |
< // Get n-d rotational scaling corrected for extrapolation | |
< float theta_interp = freq_scale * theta_extrap; | |
< float theta = theta_interp; | |
< if (ext_factor != 0.0f) { | |
< float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; | |
< theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; | |
< | |
< // Get n-d magnitude scaling corrected for interpolation | |
< mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); | |
< } | |
< *cos_theta = cosf(theta) * mscale; | |
< *sin_theta = sinf(theta) * mscale; | |
< } | |
< | |
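Summarizing rope_yarn above (no behavior beyond the code): with the per-dimension ramp r = rope_yarn_ramp(...) * ext_factor,

$$ \theta = (1 - r)\,\theta_{\text{interp}} + r\,\theta_{\text{extrap}}, \qquad \theta_{\text{interp}} = \text{freq\_scale} \cdot \theta_{\text{extrap}}, $$

and when ext_factor != 0 the YaRN magnitude correction mscale *= 1 + 0.1 * ln(1/freq_scale) is applied before taking cos/sin.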
< static void ggml_rope_cache_init( | |
< float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, | |
< float * cache, float sin_sign, float theta_scale) { | |
< // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py | |
< float theta = theta_base; | |
< for (int64_t i0 = 0; i0 < ne0; i0 += 2) { | |
< const float ff = freq_factors ? freq_factors[i0/2] : 1.0f; | |
< rope_yarn( | |
< theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] | |
< ); | |
< cache[i0 + 1] *= sin_sign; | |
< | |
< theta *= theta_scale; | |
< } | |
< } | |
< | |
< static void ggml_mrope_cache_init( | |
< float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects, | |
< float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, | |
< float * cache, float sin_sign, float theta_scale) { | |
< // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py | |
< float theta_t = theta_base_t; | |
< float theta_h = theta_base_h; | |
< float theta_w = theta_base_w; | |
< float theta_e = theta_base_e; // extra position id for vision encoder | |
< int sect_dims = sections[0] + sections[1] + sections[2] + sections[3]; | |
< int sec_w = sections[1] + sections[0]; | |
< int sec_e = sections[2] + sec_w; | |
< GGML_ASSERT(sect_dims <= ne0); | |
< | |
< for (int64_t i0 = 0; i0 < ne0; i0 += 2) { | |
< const float ff = freq_factors ? freq_factors[i0/2] : 1.0f; | |
< | |
< int sector = (i0 / 2) % sect_dims; | |
< if (indep_sects) { | |
< // compute theta independently for each dim section | |
< // (i.e. reset the corresponding theta when `i0` goes from one section to another) | |
< if (sector == 0) { | |
< theta_t = theta_base_t; | |
< } | |
< else if (sector == sections[0]) { | |
< theta_h = theta_base_h; | |
< } | |
< else if (sector == sec_w) { | |
< theta_w = theta_base_w; | |
< } | |
< else if (sector == sec_e) { | |
< theta_e = theta_base_e; | |
< } | |
< } | |
< | |
< float theta = theta_t; | |
< if (sector >= sections[0] && sector < sec_w) { | |
< theta = theta_h; | |
< } | |
< else if (sector >= sec_w && sector < sec_w + sections[2]) { | |
< theta = theta_w; | |
< } | |
< else if (sector >= sec_w + sections[2]) { | |
< theta = theta_e; | |
< } | |
< | |
< rope_yarn( | |
< theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] | |
< ); | |
< cache[i0 + 1] *= sin_sign; | |
< | |
< theta_t *= theta_scale; | |
< theta_w *= theta_scale; | |
< theta_h *= theta_scale; | |
< theta_e *= theta_scale; | |
< } | |
< } | |
< | |
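A quick worked example of the sector layout in ggml_mrope_cache_init (restating the index arithmetic): with sections = {2, 2, 2, 0}, sect_dims = 6, so sector = (i0/2) % 6 walks the dimension pairs as

    pair index (i0/2) % 6 :  0  1 | 2  3 | 4  5
    theta used            :  t  t | h  h | w  w

i.e. the first sections[0] pairs rotate by the temporal position, the next sections[1] by height, the next sections[2] by width, and sections[3] (empty here) by the extra/vision position id; with indep_sects each theta is reset to its base value at the start of its section.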
< static void ggml_compute_forward_rope_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const bool forward) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< const struct ggml_tensor * src2 = dst->src[2]; | |
< | |
< float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; | |
< int sections[4]; | |
< | |
< //const int n_past = ((int32_t *) dst->op_params)[0]; | |
< const int n_dims = ((int32_t *) dst->op_params)[1]; | |
< const int mode = ((int32_t *) dst->op_params)[2]; | |
< //const int n_ctx = ((int32_t *) dst->op_params)[3]; | |
< const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; | |
< | |
< memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); | |
< memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); | |
< memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); | |
< memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); | |
< memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); | |
< memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); | |
< memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int)*4); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); | |
< //printf("n_past = %d, ne2 = %d\n", n_past, ne2); | |
< | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(dst); | |
< | |
< GGML_ASSERT(n_dims <= ne0); | |
< GGML_ASSERT(n_dims % 2 == 0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< // row index used to determine which thread to use | |
< int ir = 0; | |
< | |
< const float theta_scale = powf(freq_base, -2.0f/n_dims); | |
< | |
< float corr_dims[2]; | |
< ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); | |
< | |
< const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; | |
< const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding | |
< const bool is_vision = mode == GGML_ROPE_TYPE_VISION; | |
< | |
< if (is_mrope) { | |
< GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); | |
< } | |
< | |
< if (is_vision) { | |
< GGML_ASSERT(n_dims == ne0/2); | |
< } | |
< | |
< const float * freq_factors = NULL; | |
< if (src2 != NULL) { | |
< GGML_ASSERT(src2->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src2->ne[0] >= n_dims / 2); | |
< freq_factors = (const float *) src2->data; | |
< } | |
< | |
< // backward process uses inverse rotation by cos and sin. | |
< // cos and sin build a rotation matrix, where the inverse is the transpose. | |
< // this essentially just switches the sign of sin. | |
< const float sin_sign = forward ? 1.0f : -1.0f; | |
< | |
< const int32_t * pos = (const int32_t *) src1->data; | |
< | |
< for (int64_t i3 = 0; i3 < ne3; i3++) { // batch | |
< for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len | |
< | |
< float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; | |
< if (!is_mrope) { | |
< const int64_t p = pos[i2]; | |
< ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); | |
< } | |
< else { | |
< const int64_t p_t = pos[i2]; | |
< const int64_t p_h = pos[i2 + ne2]; | |
< const int64_t p_w = pos[i2 + ne2 * 2]; | |
< const int64_t p_e = pos[i2 + ne2 * 3]; | |
< ggml_mrope_cache_init( | |
< p_t, p_h, p_w, p_e, sections, is_vision, | |
< freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); | |
< } | |
< | |
< for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads | |
< if (ir++ < ir0) continue; | |
< if (ir > ir1) break; | |
< | |
< if (is_neox || is_mrope) { | |
< if (is_vision){ | |
< for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { | |
< const int64_t ic = i0/2; | |
< | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); | |
< float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); | |
< | |
< const float x0 = src[0]; | |
< const float x1 = src[n_dims]; | |
< | |
< dst_data[0] = x0*cos_theta - x1*sin_theta; | |
< dst_data[n_dims] = x0*sin_theta + x1*cos_theta; | |
< } | |
< } else { | |
< for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { | |
< const int64_t ic = i0/2; | |
< | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); | |
< float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); | |
< | |
< const float x0 = src[0]; | |
< const float x1 = src[n_dims/2]; | |
< | |
< dst_data[0] = x0*cos_theta - x1*sin_theta; | |
< dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; | |
< } | |
< } | |
< } else { | |
< for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); | |
< float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); | |
< | |
< const float x0 = src[0]; | |
< const float x1 = src[1]; | |
< | |
< dst_data[0] = x0*cos_theta - x1*sin_theta; | |
< dst_data[1] = x0*sin_theta + x1*cos_theta; | |
< } | |
< } | |
< | |
< if (is_vision) { | |
< for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { | |
< const int64_t ic = i0/2; | |
< | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); | |
< float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); | |
< | |
< const float x0 = src[0]; | |
< const float x1 = src[n_dims]; | |
< | |
< dst_data[0] = x0*cos_theta - x1*sin_theta; | |
< dst_data[n_dims] = x0*sin_theta + x1*cos_theta; | |
< } | |
< } else { | |
<                     // fill the remaining channels with data from src tensor | |
< for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { | |
< const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); | |
< float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); | |
< | |
< dst_data[0] = src[0]; | |
< dst_data[1] = src[1]; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
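Aside (not part of the patch): per pair, the kernel applies a plain 2-D rotation, with the cache holding cos/sin of theta_i = p * freq_base^(-2i/n_dims) (before the YaRN correction and the optional freq_factors division). A minimal sketch of that update, ignoring ext_factor/attn_factor and freq_factors; the function name is illustrative:

#include <math.h>

// Rotate one element pair of a row by the RoPE angle for pair index i and position p.
static void rope_pair(float p, int i, int n_dims, float freq_base, float * x0, float * x1) {
    const float theta = p * powf(freq_base, -2.0f*i/n_dims); // theta_i = p * base^(-2i/d)
    const float c = cosf(theta);
    const float s = sinf(theta);
    const float v0 = *x0;
    const float v1 = *x1;
    *x0 = v0*c - v1*s;
    *x1 = v0*s + v1*c;
}

Only the pairing differs between modes: the default path rotates adjacent elements (i0, i0+1), the NeoX/mrope path rotates (ic, ic + n_dims/2), and the vision path rotates (ic, ic + n_dims), matching the three branches above.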
< // TODO: deduplicate f16/f32 code | |
< static void ggml_compute_forward_rope_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const bool forward) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< const struct ggml_tensor * src2 = dst->src[2]; | |
< | |
< float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; | |
< int sections[4]; | |
< | |
< //const int n_past = ((int32_t *) dst->op_params)[0]; | |
< const int n_dims = ((int32_t *) dst->op_params)[1]; | |
< const int mode = ((int32_t *) dst->op_params)[2]; | |
< //const int n_ctx = ((int32_t *) dst->op_params)[3]; | |
< const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; | |
< memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); | |
< memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); | |
< memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); | |
< memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); | |
< memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); | |
< memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); | |
<     memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4); | |
< | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); | |
< //printf("n_past = %d, ne2 = %d\n", n_past, ne2); | |
< | |
< GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(dst); | |
< | |
< GGML_ASSERT(n_dims <= ne0); | |
< GGML_ASSERT(n_dims % 2 == 0); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< // row index used to determine which thread to use | |
< int ir = 0; | |
< | |
< const float theta_scale = powf(freq_base, -2.0f/n_dims); | |
< | |
< float corr_dims[2]; | |
< ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); | |
< | |
< const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; | |
< const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; | |
< const bool is_vision = mode == GGML_ROPE_TYPE_VISION; | |
< | |
< if (is_mrope) { | |
< GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); | |
< } | |
< | |
< if (is_vision) { | |
< GGML_ASSERT(n_dims == ne0/2); | |
< } | |
< | |
< const float * freq_factors = NULL; | |
< if (src2 != NULL) { | |
< GGML_ASSERT(src2->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src2->ne[0] >= n_dims / 2); | |
< freq_factors = (const float *) src2->data; | |
< } | |
< | |
< // backward process uses inverse rotation by cos and sin. | |
< // cos and sin build a rotation matrix, where the inverse is the transpose. | |
< // this essentially just switches the sign of sin. | |
< const float sin_sign = forward ? 1.0f : -1.0f; | |
< | |
< const int32_t * pos = (const int32_t *) src1->data; | |
< | |
< for (int64_t i3 = 0; i3 < ne3; i3++) { | |
< for (int64_t i2 = 0; i2 < ne2; i2++) { | |
< | |
< float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; | |
< if (!is_mrope) { | |
< const int64_t p = pos[i2]; | |
< ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); | |
< } | |
< else { | |
< const int64_t p_t = pos[i2]; | |
< const int64_t p_h = pos[i2 + ne2]; | |
< const int64_t p_w = pos[i2 + ne2 * 2]; | |
< const int64_t p_e = pos[i2 + ne2 * 3]; | |
< ggml_mrope_cache_init( | |
< p_t, p_h, p_w, p_e, sections, is_vision, | |
< freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); | |
< } | |
< | |
< for (int64_t i1 = 0; i1 < ne1; i1++) { | |
< if (ir++ < ir0) continue; | |
< if (ir > ir1) break; | |
< | |
< if (is_neox || is_mrope) { | |
< if (is_vision) { | |
< for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { | |
< const int64_t ic = i0/2; | |
< | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); | |
< ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); | |
< | |
< const float x0 = GGML_FP16_TO_FP32(src[0]); | |
< const float x1 = GGML_FP16_TO_FP32(src[n_dims]); | |
< | |
< dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); | |
< dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); | |
< } | |
< } else { | |
< for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { | |
< const int64_t ic = i0/2; | |
< | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); | |
< ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); | |
< | |
< const float x0 = GGML_FP16_TO_FP32(src[0]); | |
< const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); | |
< | |
< dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); | |
< dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); | |
< } | |
< } | |
< } else { | |
< for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); | |
< ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); | |
< | |
< const float x0 = GGML_FP16_TO_FP32(src[0]); | |
< const float x1 = GGML_FP16_TO_FP32(src[1]); | |
< | |
< dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); | |
< dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); | |
< } | |
< } | |
< | |
< if (is_vision) { | |
< for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { | |
< const int64_t ic = i0/2; | |
< | |
< const float cos_theta = cache[i0 + 0]; | |
< const float sin_theta = cache[i0 + 1]; | |
< | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); | |
< ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); | |
< | |
< const float x0 = GGML_FP16_TO_FP32(src[0]); | |
< const float x1 = GGML_FP16_TO_FP32(src[n_dims]); | |
< | |
< dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); | |
< dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); | |
< } | |
< } else { | |
< for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); | |
< ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); | |
< | |
< dst_data[0] = src[0]; | |
< dst_data[1] = src[1]; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_rope( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_rope_f16(params, dst, true); | |
< } break; | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_rope_f32(params, dst, true); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_rope_back | |
< | |
< static void ggml_compute_forward_rope_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_rope_f16(params, dst, false); | |
< } break; | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_rope_f32(params, dst, false); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
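Aside: reusing the forward kernels with sin_sign = -1.0f for the backward pass is exact because each pair update is an orthogonal 2x2 rotation, whose inverse is its transpose:

$$ R(\theta) = \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}, \qquad R(\theta)^{-1} = R(\theta)^{\top} = R(-\theta) = \begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix}, $$

i.e. flipping the sign of the cached sin values is all that is needed.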
< // ggml_compute_forward_conv_transpose_1d | |
< | |
< static void ggml_compute_forward_conv_transpose_1d_f16_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F32); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nk = ne00*ne01*ne02; | |
< | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< if (ith == 0) { | |
< memset(params->wdata, 0, params->wsize); | |
< | |
< // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) | |
< { | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; | |
< | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); | |
< ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< dst_data[i00*ne02 + i02] = src[i00]; | |
< } | |
< } | |
< } | |
< } | |
< | |
< // permute source data (src1) from (L x Cin) to (Cin x L) | |
< { | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; | |
< ggml_fp16_t * dst_data = wdata; | |
< | |
< for (int64_t i11 = 0; i11 < ne11; i11++) { | |
< const float * const src = (float *)((char *) src1->data + i11*nb11); | |
< for (int64_t i10 = 0; i10 < ne10; i10++) { | |
< dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); | |
< } | |
< } | |
< } | |
< | |
< // need to zero dst since we are accumulating into it | |
< memset(dst->data, 0, ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; | |
< | |
< // total rows in dst | |
< const int nr = ne1; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; | |
< ggml_fp16_t * const wdata_src = wdata + nk; | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< float * dst_data = (float *)((char *) dst->data + i1*nb1); | |
< ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; | |
< for (int i10 = 0; i10 < ne10; i10++) { | |
< const int i1n = i10*ne11; | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< float v = 0; | |
< ggml_vec_dot_f16(ne02, &v, 0, | |
< (ggml_fp16_t *) wdata_src + i1n, 0, | |
< (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); | |
< dst_data[i10*s0 + i00] += v; | |
< } | |
< } | |
< } | |
< } | |
< | |
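Aside (not part of the patch): the two permutations above make Cin the contiguous dimension for both operands, so every output tap becomes a single ggml_vec_dot_f16 over ne02 elements. A plain-C reference of the same computation for comparison (a sketch only: dense F32 layouts, no threading, no F16 packing; the function name is illustrative):

// out[co][l*stride + k] += sum_ci in(l, ci) * kernel(k, co, ci)
static void conv_transpose_1d_ref(
        const float * in,      // src1, ggml dims (L x Cin):        element (l, ci) at in[ci*L + l]
        const float * kernel,  // src0, ggml dims (K x Cout x Cin): element (k, co, ci) at kernel[(ci*Cout + co)*K + k]
        float       * out,     // dst, [Cout][Lout], pre-zeroed, Lout = (L - 1)*stride + K
        int L, int K, int Cin, int Cout, int stride) {
    const int Lout = (L - 1)*stride + K;
    for (int co = 0; co < Cout; ++co) {
        for (int l = 0; l < L; ++l) {
            for (int k = 0; k < K; ++k) {
                float v = 0.0f;
                for (int ci = 0; ci < Cin; ++ci) {
                    v += in[ci*L + l] * kernel[(ci*Cout + co)*K + k];
                }
                // each input position scatters K taps into the output, spaced by `stride`
                out[co*Lout + l*stride + k] += v;
            }
        }
    }
}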
< static void ggml_compute_forward_conv_transpose_1d_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F32); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nk = ne00*ne01*ne02; | |
< | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< if (ith == 0) { | |
< memset(params->wdata, 0, params->wsize); | |
< | |
< // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) | |
< { | |
< float * const wdata = (float *) params->wdata + 0; | |
< | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); | |
< float * dst_data = wdata + i01*ne00*ne02; | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< dst_data[i00*ne02 + i02] = src[i00]; | |
< } | |
< } | |
< } | |
< } | |
< | |
< // prepare source data (src1) | |
< { | |
< float * const wdata = (float *) params->wdata + nk; | |
< float * dst_data = wdata; | |
< | |
< for (int64_t i11 = 0; i11 < ne11; i11++) { | |
< const float * const src = (float *)((char *) src1->data + i11*nb11); | |
< for (int64_t i10 = 0; i10 < ne10; i10++) { | |
< dst_data[i10*ne11 + i11] = src[i10]; | |
< } | |
< } | |
< } | |
< | |
< // need to zero dst since we are accumulating into it | |
< memset(dst->data, 0, ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; | |
< | |
< // total rows in dst | |
< const int nr = ne1; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< float * const wdata = (float *) params->wdata + 0; | |
< float * const wdata_src = wdata + nk; | |
< | |
< for (int i1 = ir0; i1 < ir1; i1++) { | |
< float * dst_data = (float *)((char *) dst->data + i1*nb1); | |
< float * wdata_kernel = wdata + i1*ne02*ne00; | |
< for (int i10 = 0; i10 < ne10; i10++) { | |
< const int i1n = i10*ne11; | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< float v = 0; | |
< ggml_vec_dot_f32(ne02, &v, 0, | |
< wdata_src + i1n, 0, | |
< wdata_kernel + i00*ne02, 0, 1); | |
< dst_data[i10*s0 + i00] += v; | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_conv_transpose_1d( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_conv_transpose_1d_f16_f32(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_conv_transpose_1d_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_im2col_f32 | |
< // src0: kernel [OC, IC, KH, KW] | |
< // src1: image [N, IC, IH, IW] | |
< // dst: result [N, OH, OW, IC*KH*KW] | |
< static void ggml_compute_forward_im2col_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F32); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS; | |
< | |
< const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; | |
< const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; | |
< const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; | |
< const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; | |
< const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; | |
< const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; | |
< const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t N = is_2D ? ne13 : ne12; | |
< const int64_t IC = is_2D ? ne12 : ne11; | |
< const int64_t IH = is_2D ? ne11 : 1; | |
< const int64_t IW = ne10; | |
< | |
< const int64_t KH = is_2D ? ne01 : 1; | |
< const int64_t KW = ne00; | |
< | |
< const int64_t OH = is_2D ? ne2 : 1; | |
< const int64_t OW = ne1; | |
< | |
< int ofs0 = is_2D ? nb13 : nb12; | |
< int ofs1 = is_2D ? nb12 : nb11; | |
< | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] | |
< { | |
< float * const wdata = (float *) dst->data; | |
< | |
< for (int64_t in = 0; in < N; in++) { | |
< for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 | |
< for (int64_t iow = 0; iow < OW; iow++) { | |
< for (int64_t iic = ith; iic < IC; iic += nth) { | |
< | |
< // micro kernel | |
< float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] | |
< const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] | |
< | |
< for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 | |
< for (int64_t ikw = 0; ikw < KW; ikw++) { | |
< const int64_t iiw = iow*s0 + ikw*d0 - p0; | |
< const int64_t iih = ioh*s1 + ikh*d1 - p1; | |
< | |
< if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { | |
< dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; | |
< } else { | |
< dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]); | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
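Aside: each dst row gathers the IC*KH*KW inputs feeding one output pixel via iih = ioh*s1 + ikh*d1 - p1 and iiw = iow*s0 + ikw*d0 - p0, writing zeros for out-of-range taps. OH/OW themselves are taken from dst->ne; for reference, they are normally chosen by the usual convolution output-size formula (sketch, helper name illustrative):

#include <stdint.h>

// Standard conv output extent for input size `in`, kernel `k`, stride `s`, padding `p`, dilation `d`.
static inline int64_t conv_out_size(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
    return (in + 2*p - d*(k - 1) - 1)/s + 1;
}
// e.g. IW = 5, KW = 3, s0 = 1, p0 = 1, d0 = 1  ->  OW = (5 + 2 - 2 - 1)/1 + 1 = 5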
< | |
< // ggml_compute_forward_im2col_f16 | |
< // src0: kernel [OC, IC, KH, KW] | |
< // src1: image [N, IC, IH, IW] | |
< // dst: result [N, OH, OW, IC*KH*KW] | |
< static void ggml_compute_forward_im2col_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F16); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS; | |
< | |
< const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; | |
< const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; | |
< const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; | |
< const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; | |
< const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; | |
< const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; | |
< const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t N = is_2D ? ne13 : ne12; | |
< const int64_t IC = is_2D ? ne12 : ne11; | |
< const int64_t IH = is_2D ? ne11 : 1; | |
< const int64_t IW = ne10; | |
< | |
< const int64_t KH = is_2D ? ne01 : 1; | |
< const int64_t KW = ne00; | |
< | |
< const int64_t OH = is_2D ? ne2 : 1; | |
< const int64_t OW = ne1; | |
< | |
< int ofs0 = is_2D ? nb13 : nb12; | |
< int ofs1 = is_2D ? nb12 : nb11; | |
< | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] | |
< { | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; | |
< | |
< for (int64_t in = 0; in < N; in++) { | |
< for (int64_t ioh = 0; ioh < OH; ioh++) { // 1 | |
< for (int64_t iow = 0; iow < OW; iow++) { | |
< for (int64_t iic = ith; iic < IC; iic += nth) { | |
< | |
< // micro kernel | |
< ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] | |
< const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW] | |
< | |
< for (int64_t ikh = 0; ikh < KH; ikh++) { // 1 | |
< for (int64_t ikw = 0; ikw < KW; ikw++) { | |
< const int64_t iiw = iow*s0 + ikw*d0 - p0; | |
< const int64_t iih = ioh*s1 + ikh*d1 - p1; | |
< | |
< if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { | |
< dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0; | |
< } else { | |
< dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_im2col( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< switch (dst->type) { | |
< case GGML_TYPE_F16: | |
< { | |
< ggml_compute_forward_im2col_f16(params, dst); | |
< } break; | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_im2col_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_im2col_back_f32 | |
< | |
< static void ggml_compute_forward_im2col_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output | |
< const struct ggml_tensor * src1 = dst->src[1]; // convolution kernel | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F32); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS; | |
< | |
< const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; | |
< const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; | |
< const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; | |
< const int32_t p1 = ((const int32_t *)(dst->op_params))[3]; | |
< const int32_t d0 = ((const int32_t *)(dst->op_params))[4]; | |
< const int32_t d1 = ((const int32_t *)(dst->op_params))[5]; | |
< const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t N = is_2D ? ne3 : ne2; | |
< const int64_t IC = is_2D ? ne2 : ne1; | |
< const int64_t IH = is_2D ? ne1 : 1; | |
< const int64_t IW = ne0; | |
< | |
< const int64_t KH = is_2D ? ne11 : 1; | |
< const int64_t KW = ne10; | |
< | |
< const int64_t OH = is_2D ? ne02 : 1; | |
< const int64_t OW = ne01; | |
< | |
< int ofs0 = is_2D ? nb3 : nb2; | |
< int ofs1 = is_2D ? nb2 : nb1; | |
< | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< | |
< // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] | |
< { | |
< float * const wdata = (float *) dst->data; | |
< | |
< for (int64_t in = 0; in < N; in++) { | |
< for (int64_t iic = ith; iic < IC; iic += nth) { | |
< for (int64_t iih = 0; iih < IH; iih++) { | |
< for (int64_t iiw = 0; iiw < IW; iiw++) { | |
< | |
< // micro kernel | |
< float grad = 0.0f; | |
< for (int64_t ikh = 0; ikh < KH; ikh++) { | |
< for (int64_t ikw = 0; ikw < KW; ikw++) { | |
< // For s0 > 1 some values were skipped over in the forward pass. | |
< // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well. | |
< const int64_t tmpw = (iiw + p0 - ikw*d0); | |
< if (tmpw % s0 != 0) { | |
< continue; | |
< } | |
< const int64_t iow = tmpw / s0; | |
< | |
< // Equivalent logic as above except for s1. | |
< int64_t ioh; | |
< if (is_2D) { | |
< const int64_t tmph = iih + p1 - ikh*d1; | |
< | |
< if (tmph % s1 != 0) { | |
< continue; | |
< } | |
< | |
< ioh = tmph / s1; | |
< } else { | |
< ioh = 0; | |
< } | |
< | |
< if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) { | |
< continue; | |
< } | |
< | |
< const float * const grad_in = (const float *) src0->data | |
< + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] | |
< grad += grad_in[iic*(KH*KW) + ikh*KW + ikw]; | |
< } | |
< } | |
< float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW] | |
< dst_data[iih*IW + iiw] = grad; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
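Aside: the backward pass inverts the im2col index map. A forward tap (iow, ikw) read input column iiw = iow*s0 + ikw*d0 - p0, so for a fixed input column the contributing outputs satisfy iow = (iiw + p0 - ikw*d0)/s0, valid only when the division is exact (and likewise for the vertical axis). For example, with s0 = 2, d0 = 1, p0 = 0, KW = 3 and iiw = 5, the candidates ikw = 0, 1, 2 give tmpw = 5, 4, 3; only ikw = 1 (iow = 2) contributes, since 5 and 3 are not multiples of the stride.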
< // ggml_compute_forward_conv_transpose_2d | |
< | |
< static void ggml_compute_forward_conv_transpose_2d( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F16); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F32); | |
< | |
< GGML_TENSOR_BINARY_OP_LOCALS | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nk = ne00*ne01*ne02*ne03; | |
< | |
< GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); | |
< GGML_ASSERT(nb10 == sizeof(float)); | |
< | |
< if (ith == 0) { | |
< memset(params->wdata, 0, params->wsize); | |
< | |
< // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) | |
< { | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; | |
< | |
< for (int64_t i03 = 0; i03 < ne03; i03++) { | |
< for (int64_t i02 = 0; i02 < ne02; i02++) { | |
< const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); | |
< ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; | |
< for (int64_t i01 = 0; i01 < ne01; i01++) { | |
< for (int64_t i00 = 0; i00 < ne00; i00++) { | |
< dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) | |
< { | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; | |
< for (int i12 = 0; i12 < ne12; i12++) { | |
< for (int i11 = 0; i11 < ne11; i11++) { | |
< const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); | |
< ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; | |
< for (int i10 = 0; i10 < ne10; i10++) { | |
< dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); | |
< } | |
< } | |
< } | |
< } | |
< | |
< memset(dst->data, 0, ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< const int32_t stride = ggml_get_op_params_i32(dst, 0); | |
< | |
< // total patches in dst | |
< const int np = ne2; | |
< | |
< // patches per thread | |
< const int dp = (np + nth - 1)/nth; | |
< | |
< // patch range for this thread | |
< const int ip0 = dp*ith; | |
< const int ip1 = MIN(ip0 + dp, np); | |
< | |
< ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; | |
< ggml_fp16_t * const wdata_src = wdata + nk; | |
< | |
< for (int i2 = ip0; i2 < ip1; i2++) { // Cout | |
< float * dst_data = (float *)((char *) dst->data + i2*nb2); | |
< ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; | |
< for (int i11 = 0; i11 < ne11; i11++) { | |
< for (int i10 = 0; i10 < ne10; i10++) { | |
< const int i1n = i11*ne10*ne12 + i10*ne12; | |
< for (int i01 = 0; i01 < ne01; i01++) { | |
< for (int i00 = 0; i00 < ne00; i00++) { | |
< float v = 0; | |
< ggml_vec_dot_f16(ne03, &v, 0, | |
< wdata_src + i1n, 0, | |
< wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); | |
< dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
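Aside: with Cin made innermost for both operands, the inner ggml_vec_dot_f16 over ne03 accumulates, for each output channel i2 and kernel tap (i00, i01), the contribution of input pixel (i10, i11); every input pixel scatters a KW x KH patch into the output at stride offsets:

    dst[co][ih*stride + kh][iw*stride + kw] += sum_ci src1(iw, ih, ci) * src0(kw, kh, co, ci)

(illustrative index names only: co = i2, kh = i01, kw = i00, ih = i11, iw = i10, ci = i03).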
< // ggml_compute_forward_pool_1d_sk_p0 | |
< | |
< static void ggml_compute_forward_pool_1d_sk_p0( | |
< const struct ggml_compute_params * params, | |
< const enum ggml_op_pool op, | |
< const int k, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src = dst->src[0]; | |
< | |
< assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< const char * cdata = (const char *)src->data; | |
< const char * const data_end = cdata + ggml_nbytes(src); | |
< float * drow = (float *)dst->data; | |
< | |
< const int64_t rs = dst->ne[0]; | |
< | |
< while (cdata < data_end) { | |
< const void * srow = (const void *)cdata; | |
< int j = 0; | |
< for (int64_t i = 0; i < rs; ++i) { | |
< switch (op) { | |
< case GGML_OP_POOL_AVG: drow[i] = 0; break; | |
< case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; | |
< case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); | |
< } | |
< for (int ki = 0; ki < k; ++ki) { | |
< const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); | |
< switch (op) { | |
< case GGML_OP_POOL_AVG: drow[i] += srow_j; break; | |
< case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; | |
< case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); | |
< } | |
< ++j; | |
< } | |
< switch (op) { | |
< case GGML_OP_POOL_AVG: drow[i] /= k; break; | |
< case GGML_OP_POOL_MAX: break; | |
< case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); | |
< } | |
< } | |
< | |
< cdata += src->nb[1]; | |
< drow += rs; | |
< } | |
< } | |
< | |
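Aside: because only s == k and p == 0 are supported, the pooling windows are disjoint and output element i reduces srow[i*k .. i*k + k). For example, with k = 2 over the row [1, 3, 2, 8], AVG produces [2, 5] and MAX produces [3, 8].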
< // ggml_compute_forward_pool_1d | |
< | |
< static void ggml_compute_forward_pool_1d( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const int32_t * opts = (const int32_t *)dst->op_params; | |
< enum ggml_op_pool op = opts[0]; | |
< const int k0 = opts[1]; | |
< const int s0 = opts[2]; | |
< const int p0 = opts[3]; | |
< GGML_ASSERT(p0 == 0); // padding not supported | |
< GGML_ASSERT(k0 == s0); // only s = k supported | |
< | |
< ggml_compute_forward_pool_1d_sk_p0(params, op, k0, dst); | |
< } | |
< | |
< // ggml_compute_forward_pool_2d | |
< | |
< static void ggml_compute_forward_pool_2d( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src = dst->src[0]; | |
< | |
< assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< const int32_t * opts = (const int32_t *)dst->op_params; | |
< enum ggml_op_pool op = opts[0]; | |
< const int k0 = opts[1]; | |
< const int k1 = opts[2]; | |
< const int s0 = opts[3]; | |
< const int s1 = opts[4]; | |
< const int p0 = opts[5]; | |
< const int p1 = opts[6]; | |
< const char * cdata = (const char*)src->data; | |
< const char * const data_end = cdata + ggml_nbytes(src); | |
< | |
< const int64_t px = dst->ne[0]; | |
< const int64_t py = dst->ne[1]; | |
< const int64_t pa = px * py; | |
< | |
< float * dplane = (float *)dst->data; | |
< | |
< const int ka = k0 * k1; | |
< const int offset0 = -p0; | |
< const int offset1 = -p1; | |
< | |
< while (cdata < data_end) { | |
< for (int oy = 0; oy < py; ++oy) { | |
< float * const drow = dplane + oy * px; | |
< for (int ox = 0; ox < px; ++ox) { | |
< float * const out = drow + ox; | |
< switch (op) { | |
< case GGML_OP_POOL_AVG: *out = 0; break; | |
< case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; | |
< case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); | |
< } | |
< | |
< const int ix = offset0 + ox * s0; | |
< const int iy = offset1 + oy * s1; | |
< | |
< for (int ky = 0; ky < k1; ++ky) { | |
< if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; | |
< const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); | |
< for (int kx = 0; kx < k0; ++kx) { | |
< int j = ix + kx; | |
< if (j < 0 || j >= src->ne[0]) continue; | |
< const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); | |
< switch (op) { | |
< case GGML_OP_POOL_AVG: *out += srow_j; break; | |
< case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; | |
< case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< switch (op) { | |
< case GGML_OP_POOL_AVG: *out /= ka; break; | |
< case GGML_OP_POOL_MAX: break; | |
< case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< cdata += src->nb[2]; | |
< dplane += pa; | |
< } | |
< } | |
< | |
< // ggml_compute_forward_pool_2d_back | |
< | |
< static void ggml_compute_forward_pool_2d_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src = dst->src[0]; | |
< const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst | |
< | |
< assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< const int32_t * opts = (const int32_t *)dst->op_params; | |
< enum ggml_op_pool op = opts[0]; | |
< const int k0 = opts[1]; | |
< const int k1 = opts[2]; | |
< const int s0 = opts[3]; | |
< const int s1 = opts[4]; | |
< const int p0 = opts[5]; | |
< const int p1 = opts[6]; | |
< | |
< char * cdata = (char *) dst->data; | |
< const char * cdataf = (const char *) dstf->data; | |
< const char * const data_end = cdata + ggml_nbytes(dst); | |
< | |
< GGML_ASSERT(params->ith == 0); | |
< memset(cdata, 0, ggml_nbytes(dst)); | |
< | |
< const int64_t px = src->ne[0]; | |
< const int64_t py = src->ne[1]; | |
< const int64_t pa = px * py; | |
< | |
< const float * splane = (const float *) src->data; | |
< | |
< const int ka = k0 * k1; | |
< const int offset0 = -p0; | |
< const int offset1 = -p1; | |
< | |
< while (cdata < data_end) { | |
< for (int oy = 0; oy < py; ++oy) { | |
< const float * const srow = splane + oy * px; | |
< for (int ox = 0; ox < px; ++ox) { | |
< const float grad0 = srow[ox]; | |
< | |
< const int ix = offset0 + ox * s0; | |
< const int iy = offset1 + oy * s1; | |
< | |
< if (op == GGML_OP_POOL_MAX) { | |
< float maxval = -FLT_MAX; | |
< int kxmax = -1; | |
< int kymax = -1; | |
< | |
< for (int ky = 0; ky < k1; ++ky) { | |
< if (iy + ky < 0 || iy + ky >= dst->ne[1]) { | |
< continue; | |
< } | |
< const void * drowf = (const void *)(cdataf + dst->nb[1] * (iy + ky)); | |
< for (int kx = 0; kx < k0; ++kx) { | |
< int j = ix + kx; | |
< if (j < 0 || j >= dst->ne[0]) { | |
< continue; | |
< } | |
< | |
< const float val = dst->type == GGML_TYPE_F32 ? | |
< ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); | |
< if (val <= maxval) { | |
< continue; | |
< } | |
< | |
< maxval = val; | |
< kxmax = kx; | |
< kymax = ky; | |
< } | |
< } | |
< | |
< if (kxmax == -1 || kymax == -1) { | |
< continue; | |
< } | |
< | |
< void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax)); | |
< const int j = ix + kxmax; | |
< if (dst->type == GGML_TYPE_F32) { | |
< ((float *) drow)[j] += grad0; | |
< } else { | |
< ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); | |
< } | |
< } else if (op == GGML_OP_POOL_AVG) { | |
< const float grad = grad0 / ka; | |
< | |
< for (int ky = 0; ky < k1; ++ky) { | |
< if (iy + ky < 0 || iy + ky >= dst->ne[1]) { | |
< continue; | |
< } | |
< void * drow = (void *)(cdata + dst->nb[1] * (iy + ky)); | |
< for (int kx = 0; kx < k0; ++kx) { | |
< int j = ix + kx; | |
< if (j < 0 || j >= dst->ne[0]) { | |
< continue; | |
< } | |
< | |
< if (dst->type == GGML_TYPE_F32) { | |
< ((float *) drow)[j] += grad; | |
< } else { | |
< ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad); | |
< } | |
< } | |
< } | |
< } else { | |
< GGML_ASSERT(false); | |
< } | |
< } | |
< } | |
< | |
< cdata += dst->nb[2]; | |
< cdataf += dst->nb[2]; | |
< splane += pa; | |
< } | |
< } | |
< | |
< // ggml_compute_forward_upscale | |
< | |
< static void ggml_compute_forward_upscale_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F32); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const float sf0 = (float)ne0/src0->ne[0]; | |
< const float sf1 = (float)ne1/src0->ne[1]; | |
< const float sf2 = (float)ne2/src0->ne[2]; | |
< const float sf3 = (float)ne3/src0->ne[3]; | |
< | |
< // TODO: optimize | |
< | |
< for (int64_t i3 = 0; i3 < ne3; i3++) { | |
< const int64_t i03 = i3 / sf3; | |
< for (int64_t i2 = ith; i2 < ne2; i2 += nth) { | |
< const int64_t i02 = i2 / sf2; | |
< for (int64_t i1 = 0; i1 < ne1; i1++) { | |
< const int64_t i01 = i1 / sf1; | |
< for (int64_t i0 = 0; i0 < ne0; i0++) { | |
< const int64_t i00 = i0 / sf0; | |
< | |
< const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); | |
< float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); | |
< | |
< *y = *x; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
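Aside: this is nearest-neighbour (floor) upscaling: each destination index maps back as i00 = (int64_t)(i0 / sf0) with sf0 = ne0/ne00, and the source value is copied. For example, upscaling ne00 = 3 to ne0 = 6 gives sf0 = 2 and the index mapping 0,1,2,3,4,5 -> 0,0,1,1,2,2.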
< static void ggml_compute_forward_upscale( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_upscale_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< | |
< // ggml_compute_forward_pad | |
< | |
< static void ggml_compute_forward_pad_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< GGML_ASSERT( dst->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< float * dst_ptr = (float *) dst->data; | |
< | |
< // TODO: optimize | |
< | |
< for (int64_t i2 = 0; i2 < ne2; ++i2) { | |
< for (int64_t i1 = ith; i1 < ne1; i1 += nth) { | |
< for (int64_t i0 = 0; i0 < ne0; ++i0) { | |
< for (int64_t i3 = 0; i3 < ne3; ++i3) { | |
< const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; | |
< | |
< const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); | |
< | |
< if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { | |
< dst_ptr[dst_idx] = *src_ptr; | |
< } else { | |
< dst_ptr[dst_idx] = 0; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_pad( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_pad_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_pad_reflect_1d | |
< | |
< static void ggml_compute_forward_pad_reflect_1d( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F32); | |
< GGML_ASSERT( dst->type == GGML_TYPE_F32); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int32_t * opts = (const int32_t *) dst->op_params; | |
< const int p0 = opts[0]; | |
< const int p1 = opts[1]; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< for (int64_t i3 = 0; i3 < ne3; i3++) { | |
< for (int64_t i2 = 0; i2 < ne2; i2++) { | |
< for (int64_t i1 = ith; i1 < ne1; i1 += nth) { | |
< float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0); | |
< float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0); | |
< | |
< ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01)); | |
< | |
< for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; } | |
< for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } | |
< } | |
< } | |
< } | |
< } | |
< | |
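Aside, a worked example of the reflection: a row [a, b, c, d] (ne00 = 4) padded with p0 = 2 and p1 = 1 (ne0 = 7) is first copied into dst at offset p0 and then mirrored around its endpoints, giving [c, b, a, b, c, d, c].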
< // ggml_compute_forward_arange | |
< | |
< static void ggml_compute_forward_arange_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< GGML_ASSERT(dst->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const float start = ggml_get_op_params_f32(dst, 0); | |
< const float stop = ggml_get_op_params_f32(dst, 1); | |
< const float step = ggml_get_op_params_f32(dst, 2); | |
< | |
< const int64_t steps = (int64_t) ceilf((stop - start) / step); | |
< | |
< GGML_ASSERT(ggml_nelements(dst) == steps); | |
< | |
< for (int64_t i = ith; i < steps; i+= nth) { | |
< float value = start + step * i; | |
< ((float *)dst->data)[i] = value; | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_arange( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< switch (dst->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_arange_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_timestep_embedding_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const int dim = ggml_get_op_params_i32(dst, 0); | |
< const int max_period = ggml_get_op_params_i32(dst, 1); | |
< | |
< int half = dim / 2; | |
< | |
< for (int64_t i = 0; i < ne00; i++) { | |
< float * embed_data = (float *)((char *) dst->data + i*nb1); | |
< for (int64_t j = ith; j < half; j += nth) { | |
< float timestep = ((float *)src0->data)[i]; | |
< float freq = (float)expf(-logf(max_period) * j / half); | |
< float arg = timestep * freq; | |
< embed_data[j] = cosf(arg); | |
< embed_data[j + half] = sinf(arg); | |
< } | |
< if (dim % 2 != 0 && ith == 0) { | |
< embed_data[dim] = 0.f; | |
< } | |
< } | |
< } | |
< | |
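Aside: in formula form, with half = dim/2 and t = src0[i], the loop above computes

$$ \mathrm{freq}_j = \exp\!\Big(-\frac{j\,\ln(\mathrm{max\_period})}{\mathrm{half}}\Big) = \mathrm{max\_period}^{-j/\mathrm{half}}, \qquad e_j = \cos(t\cdot\mathrm{freq}_j), \quad e_{j+\mathrm{half}} = \sin(t\cdot\mathrm{freq}_j), $$

with a single trailing zero appended when dim is odd.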
< static void ggml_compute_forward_timestep_embedding( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_timestep_embedding_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_argsort | |
< | |
< static void ggml_compute_forward_argsort_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t nr = ggml_nrows(src0); | |
< | |
< enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0); | |
< | |
< for (int64_t i = ith; i < nr; i += nth) { | |
< int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1); | |
< const float * src_data = (float *)((char *) src0->data + i*nb01); | |
< | |
< for (int64_t j = 0; j < ne0; j++) { | |
< dst_data[j] = j; | |
< } | |
< | |
< // C doesn't have a functional sort, so we do a bubble sort instead | |
< for (int64_t j = 0; j < ne0; j++) { | |
< for (int64_t k = j + 1; k < ne0; k++) { | |
< if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || | |
< (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { | |
< int32_t tmp = dst_data[j]; | |
< dst_data[j] = dst_data[k]; | |
< dst_data[k] = tmp; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_argsort( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_argsort_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_flash_attn_ext | |
< | |
< static void ggml_compute_forward_flash_attn_ext_f16( | |
< const struct ggml_compute_params * params, | |
< const struct ggml_tensor * q, | |
< const struct ggml_tensor * k, | |
< const struct ggml_tensor * v, | |
< const struct ggml_tensor * mask, | |
< struct ggml_tensor * dst) { | |
< | |
< GGML_TENSOR_LOCALS(int64_t, neq, q, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbq, q, nb) | |
< GGML_TENSOR_LOCALS(int64_t, nek, k, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbk, k, nb) | |
< GGML_TENSOR_LOCALS(int64_t, nev, v, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbv, v, nb) | |
< GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb, dst, nb) | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t DK = nek0; | |
< const int64_t DV = nev0; | |
< const int64_t N = neq1; | |
< | |
< GGML_ASSERT(ne0 == DV); | |
< GGML_ASSERT(ne2 == N); | |
< | |
< // input tensor rows must be contiguous | |
< GGML_ASSERT(nbq0 == ggml_type_size(q->type)); | |
< GGML_ASSERT(nbk0 == ggml_type_size(k->type)); | |
< GGML_ASSERT(nbv0 == ggml_type_size(v->type)); | |
< | |
< GGML_ASSERT(neq0 == DK); | |
< GGML_ASSERT(nek0 == DK); | |
< GGML_ASSERT(nev0 == DV); | |
< | |
< GGML_ASSERT(neq1 == N); | |
< | |
< // dst cannot be transposed or permuted | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< GGML_ASSERT(nb0 <= nb1); | |
< GGML_ASSERT(nb1 <= nb2); | |
< GGML_ASSERT(nb2 <= nb3); | |
< | |
< // broadcast factors | |
< const int64_t rk2 = neq2/nek2; | |
< const int64_t rk3 = neq3/nek3; | |
< | |
< const int64_t rv2 = neq2/nev2; | |
< const int64_t rv3 = neq3/nev3; | |
< | |
< // parallelize by q rows using ggml_vec_dot_f32 | |
< | |
< // total rows in q | |
< const int nr = neq1*neq2*neq3; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< float scale = 1.0f; | |
< float max_bias = 0.0f; | |
< float logit_softcap = 0.0f; | |
< | |
< memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); | |
< memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); | |
< memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float)); | |
< | |
< if (logit_softcap != 0) { | |
< scale /= logit_softcap; | |
< } | |
< | |
< const uint32_t n_head = neq2; | |
< const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); | |
< | |
< const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); | |
< const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); | |
< | |
< enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type; | |
< ggml_from_float_t const q_to_vec_dot = type_traits_cpu[k_vec_dot_type].from_float; | |
< ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot; | |
< ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float; | |
< | |
< GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type"); | |
< GGML_ASSERT(v_to_float && "fattn: unsupported V-type"); | |
< | |
< // loop over n_batch and n_head | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // q indices | |
< const int iq3 = ir/(neq2*neq1); | |
< const int iq2 = (ir - iq3*neq2*neq1)/neq1; | |
< const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); | |
< | |
< const uint32_t h = iq2; // head index | |
< const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f; | |
< | |
< float S = 0.0f; // sum | |
< float M = -INFINITY; // maximum KQ value | |
< | |
< float * VKQ32 = (float *) params->wdata + ith*(1*DK + 2*DV + CACHE_LINE_SIZE_F32); // FP32 VKQ accumulator | |
< float * V32 = (VKQ32 + 1*DV); // (temporary) FP32 V buffer | |
< ggml_fp16_t * VKQ16 = (ggml_fp16_t *) (VKQ32 + 1*DV); // (temporary) FP16 VKQ accumulator | |
< ggml_fp16_t * Q_q = (ggml_fp16_t *) (VKQ32 + 2*DV); // (temporary) buffer for Q converted to quantized/FP16 | |
< | |
< if (v->type == GGML_TYPE_F16) { | |
< memset(VKQ16, 0, DV*sizeof(ggml_fp16_t)); | |
< } else { | |
< memset(VKQ32, 0, DV*sizeof(float)); | |
< } | |
< | |
< const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL; | |
< | |
< // k indices | |
< const int ik3 = iq3 / rk3; | |
< const int ik2 = iq2 / rk2; | |
< | |
< // v indices | |
< const int iv3 = iq3 / rv3; | |
< const int iv2 = iq2 / rv2; | |
< | |
< const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); | |
< q_to_vec_dot(pq, Q_q, DK); | |
< | |
< // online softmax / attention | |
< // loop over n_kv and n_head_kv | |
< // ref: https://arxiv.org/pdf/2112.05682.pdf | |
< for (int64_t ic = 0; ic < nek1; ++ic) { | |
< const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f; | |
< if (mv == -INFINITY) { | |
< continue; | |
< } | |
< | |
< float s; // KQ value | |
< | |
< const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3); | |
< kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1); | |
< | |
< s = s*scale; // scale KQ value | |
< | |
< if (logit_softcap != 0.0f) { | |
< s = logit_softcap*tanhf(s); | |
< } | |
< | |
< s += mv; // apply mask | |
< | |
< const float Mold = M; | |
< | |
< float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value | |
< float vs = 1.0f; // post-softmax KQ value, expf(s - M) | |
< | |
< const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); | |
< | |
< if (v->type == GGML_TYPE_F16) { | |
< if (s > M) { | |
< // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f | |
< M = s; | |
< ms = expf(Mold - M); | |
< | |
< // V = V*expf(Mold - M) | |
< ggml_vec_scale_f16(DV, VKQ16, ms); | |
< } else { | |
< // no new maximum, ms == 1.0f, vs != 1.0f | |
< vs = expf(s - M); | |
< } | |
< | |
< // V += v*expf(s - M) | |
< ggml_vec_mad_f16(DV, VKQ16, (const ggml_fp16_t *) v_data, vs); | |
< } else { | |
< if (s > M) { | |
< // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f | |
< M = s; | |
< ms = expf(Mold - M); | |
< | |
< // V = V*expf(Mold - M) | |
< ggml_vec_scale_f32(DV, VKQ32, ms); | |
< } else { | |
< // no new maximum, ms == 1.0f, vs != 1.0f | |
< vs = expf(s - M); | |
< } | |
< | |
< v_to_float(v_data, V32, DV); | |
< | |
< // V += v*expf(s - M) | |
< ggml_vec_mad_f32(DV, VKQ32, V32, vs); | |
< } | |
< | |
< S = S*ms + vs; // scale and increment sum with partial sum | |
< } | |
< | |
< if (v->type == GGML_TYPE_F16) { | |
< for (int64_t d = 0; d < DV; ++d) { | |
< VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]); | |
< } | |
< } | |
< | |
< // V /= S | |
< const float S_inv = 1.0f/S; | |
< ggml_vec_scale_f32(DV, VKQ32, S_inv); | |
< | |
< // dst indices | |
< const int i1 = iq1; | |
< const int i2 = iq2; | |
< const int i3 = iq3; | |
< | |
< // original | |
< //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); | |
< | |
< // permute(0, 2, 1, 3) | |
< memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1); | |
< } | |
< } | |
< | |
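Aside (not part of the patch): the loop over ic above is the single-pass "online" softmax from the referenced paper: the running maximum M, denominator S and the value accumulator are rescaled whenever a larger score appears, so softmax(QK^T)V is produced without materializing the score row. A minimal F32-only sketch of that update, leaving out the mask, the per-head ALiBi slope built from m0/m1, the logit softcap and the F16 accumulator path; the function name is illustrative:

#include <math.h>

// Accumulate softmax(scores) @ v in one pass.
// scores: [n_kv], v: [n_kv][DV] row-major, out: [DV].
static void online_attn_accum(const float * scores, const float * v,
                              int n_kv, int DV, float * out) {
    float M = -INFINITY; // running maximum of the scores seen so far
    float S = 0.0f;      // running softmax denominator
    for (int d = 0; d < DV; ++d) out[d] = 0.0f;
    for (int ic = 0; ic < n_kv; ++ic) {
        const float s  = scores[ic];
        const float Mn = s > M ? s : M;
        const float ms = expf(M - Mn); // rescale factor for the old accumulator (1 if no new max)
        const float vs = expf(s - Mn); // weight of the new value row
        for (int d = 0; d < DV; ++d) {
            out[d] = out[d]*ms + v[ic*DV + d]*vs;
        }
        S = S*ms + vs;
        M = Mn;
    }
    for (int d = 0; d < DV; ++d) out[d] /= S; // V /= S, as in the kernel
}

The kernel above additionally scales masked scores by the ALiBi slope, optionally applies logit_softcap*tanhf(s), and keeps the accumulator in F16 when V is F16.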
< static void ggml_compute_forward_flash_attn_ext( | |
< const struct ggml_compute_params * params, | |
< const struct ggml_tensor * q, | |
< const struct ggml_tensor * k, | |
< const struct ggml_tensor * v, | |
< const struct ggml_tensor * mask, | |
< struct ggml_tensor * dst) { | |
< switch (dst->op_params[3]) { | |
< case GGML_PREC_DEFAULT: | |
< case GGML_PREC_F32: | |
< { | |
< // uses F32 accumulators | |
< ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_flash_attn_back | |
< | |
< static void ggml_compute_forward_flash_attn_back_f32( | |
< const struct ggml_compute_params * params, | |
< const bool masked, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * q = dst->src[0]; | |
< const struct ggml_tensor * k = dst->src[1]; | |
< const struct ggml_tensor * v = dst->src[2]; | |
< const struct ggml_tensor * d = dst->src[3]; | |
< | |
< GGML_TENSOR_LOCALS(int64_t, neq, q, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbq, q, nb) | |
< GGML_TENSOR_LOCALS(int64_t, nek, k, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbk, k, nb) | |
< GGML_TENSOR_LOCALS(int64_t, nev, v, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbv, v, nb) | |
< GGML_TENSOR_LOCALS(int64_t, ned, d, ne) | |
< GGML_TENSOR_LOCALS(size_t, nbd, d, nb) | |
< GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) | |
< GGML_TENSOR_LOCALS(size_t, nb, dst, nb) | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t D = neq0; | |
< const int64_t N = neq1; | |
< const int64_t P = nek1 - N; | |
< const int64_t M = P + N; | |
< | |
< const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); | |
< const int mxDM = MAX(D, Mup); | |
< | |
< // GGML_ASSERT(ne0 == D); | |
< // GGML_ASSERT(ne1 == N); | |
< GGML_ASSERT(P >= 0); | |
< | |
< GGML_ASSERT(nbq0 == sizeof(float)); | |
< GGML_ASSERT(nbk0 == sizeof(float)); | |
< GGML_ASSERT(nbv0 == sizeof(float)); | |
< | |
< GGML_ASSERT(neq0 == D); | |
< GGML_ASSERT(nek0 == D); | |
< GGML_ASSERT(nev1 == D); | |
< GGML_ASSERT(ned0 == D); | |
< | |
< GGML_ASSERT(neq1 == N); | |
< GGML_ASSERT(nek1 == N + P); | |
< GGML_ASSERT(nev1 == D); | |
< GGML_ASSERT(ned1 == N); | |
< | |
< // dst cannot be transposed or permuted | |
< GGML_ASSERT(nb0 == sizeof(float)); | |
< GGML_ASSERT(nb0 <= nb1); | |
< GGML_ASSERT(nb1 <= nb2); | |
< GGML_ASSERT(nb2 <= nb3); | |
< | |
< if (ith == 0) { | |
< memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< const int64_t elem_q = ggml_nelements(q); | |
< const int64_t elem_k = ggml_nelements(k); | |
< | |
< enum ggml_type result_type = dst->type; | |
< GGML_ASSERT(ggml_blck_size(result_type) == 1); | |
< const size_t tsize = ggml_type_size(result_type); | |
< | |
< const size_t offs_q = 0; | |
< const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); | |
< const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); | |
< | |
< void * grad_q = (char *) dst->data; | |
< void * grad_k = (char *) dst->data + offs_k; | |
< void * grad_v = (char *) dst->data + offs_v; | |
< | |
< const size_t nbgq1 = nb0*neq0; | |
< const size_t nbgq2 = nb0*neq0*neq1; | |
< const size_t nbgq3 = nb0*neq0*neq1*neq2; | |
< | |
< const size_t nbgk1 = nb0*nek0; | |
< const size_t nbgk2 = nb0*nek0*nek1; | |
< const size_t nbgk3 = nb0*nek0*nek1*neq2; | |
< | |
< const size_t nbgv1 = nb0*nev0; | |
< const size_t nbgv2 = nb0*nev0*nev1; | |
< const size_t nbgv3 = nb0*nev0*nev1*neq2; | |
< | |
< // parallelize by k rows using ggml_vec_dot_f32 | |
< | |
< // total rows in k | |
< const int nr = nek2*nek3; | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< const float scale = 1.0f/sqrtf(D); | |
< | |
< //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); | |
< | |
< // how often k2 (and v2) is repeated in q2 | |
< int nrep = neq2/nek2; | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< // q indices | |
< const int ik3 = ir/(nek2); | |
< const int ik2 = ir - ik3*nek2; | |
< | |
< const int iq3 = ik3; | |
< const int id3 = ik3; | |
< const int iv3 = ik3; | |
< const int iv2 = ik2; | |
< | |
< for (int irep = 0; irep < nrep; ++irep) { | |
< const int iq2 = ik2 + irep*nek2; | |
< const int id2 = iq2; | |
< | |
< // (ik2 + irep*nek2) % nek2 == ik2 | |
< for (int iq1 = 0; iq1 < neq1; ++iq1) { | |
< const int id1 = iq1; | |
< | |
< // NOTE: unsure about CACHE_LINE_SIZE_F32 here -- | |
< //       possibly it should not be multiplied by 2 and should be excluded from the 1*(..) offset used for SM? | |
< float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); | |
< float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); | |
< | |
< for (int i = M; i < Mup; ++i) { | |
< S[i] = -INFINITY; | |
< } | |
< | |
< const int64_t masked_begin = masked ? (P + iq1 + 1) : M; | |
< for (int64_t ic = 0; ic < masked_begin; ++ic) { | |
< // k indices | |
< const int ik1 = ic; | |
< | |
< // S indices | |
< const int i1 = ik1; | |
< | |
< ggml_vec_dot_f32(neq0, | |
< S + i1, 0, | |
< (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, | |
< (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); | |
< } | |
< | |
< // scale | |
< ggml_vec_scale_f32(masked_begin, S, scale); | |
< | |
< for (int64_t i = masked_begin; i < M; i++) { | |
< S[i] = -INFINITY; | |
< } | |
< | |
< // softmax | |
< // exclude known -INF S[..] values from max and loop | |
< // don't forget to set their SM values to zero | |
< { | |
< float max = -INFINITY; | |
< ggml_vec_max_f32(masked_begin, &max, S); | |
< | |
< ggml_float sum = 0.0; | |
< { | |
< #ifdef GGML_SOFT_MAX_ACCELERATE | |
< max = -max; | |
< vDSP_vsadd(SM, 1, &max, SM, 1, Mup); | |
< vvexpf(SM, SM, &Mup); | |
< ggml_vec_sum_f32(Mup, &sum, SM); | |
< #else | |
< sum = ggml_vec_soft_max_f32(Mup, SM, S, max); | |
< #endif | |
< } | |
< | |
< assert(sum > 0.0); | |
< | |
< sum = 1.0/sum; | |
< ggml_vec_scale_f32(masked_begin, SM, sum); | |
< | |
< } | |
< | |
< // step-by-step explanation | |
< { | |
< // forward-process shape grads from backward process | |
< // parallel_for ik2,ik3: | |
< // for irep: | |
< // iq2 = ik2 + irep*nek2 | |
< // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] | |
< // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] | |
< // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] | |
< // for iq1: | |
< // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur | |
< // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur | |
< // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 | |
< // S0 = -Inf [D,1,1,1] | |
< // ~S1[i] = dot(kcur[:D,i], qcur) | |
< // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale | |
< // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) | |
< // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) | |
< // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur | |
< // ~S5[i] = dot(vcur[:,i], S4) | |
< // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] | |
< // ~dst[i,iq1,iq2,iq3] = S5[i] ^ | |
< // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] | |
< // dst backward-/ grad[dst] = d | |
< // | |
< // output gradients with their dependencies: | |
< // | |
< // grad[kcur] = grad[S1].T @ qcur | |
< // grad[S1] = diag_mask_zero(grad[S3], P) * scale | |
< // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) | |
< // grad[S4] = grad[S5] @ vcur | |
< // grad[S4] = d[:D,id1,id2,id3] @ vcur | |
< // grad[qcur] = grad[S1] @ kcur | |
< // grad[vcur] = grad[S5].T @ S4 | |
< // grad[vcur] = d[:D,id1,id2,id3].T @ S4 | |
< // | |
< // in post-order: | |
< // | |
< // S1 = qcur @ kcur.T | |
< // S2 = S1 * scale | |
< // S3 = diag_mask_inf(S2, P) | |
< // S4 = softmax(S3) | |
< // grad[S4] = d[:D,id1,id2,id3] @ vcur | |
< // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) | |
< // grad[S1] = diag_mask_zero(grad[S3], P) * scale | |
< // grad[qcur] = grad[S1] @ kcur | |
< // grad[kcur] = grad[S1].T @ qcur | |
< // grad[vcur] = d[:D,id1,id2,id3].T @ S4 | |
< // | |
< // using less variables (SM=S4): | |
< // | |
< // S = diag_mask_inf(qcur @ kcur.T * scale, P) | |
< // SM = softmax(S) | |
< // S = d[:D,iq1,iq2,iq3] @ vcur | |
< // dot_SM_gradSM = dot(SM, S) | |
< // S = SM * (S - dot(SM, S)) | |
< // S = diag_mask_zero(S, P) * scale | |
< // | |
< // grad[q][:D,iq1,iq2,iq3] += S @ kcur | |
< // grad[k][:D,:M,ik2,ik3] += S.T @ qcur | |
< // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM | |
< } | |
< | |
< // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] | |
< // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] | |
< // for ic: | |
< // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] | |
< // exclude known future zero S[..] values from operation | |
< ggml_vec_set_f32(masked_begin, S, 0); | |
< for (int64_t ic = 0; ic < D; ++ic) { | |
< ggml_vec_mad_f32(masked_begin, | |
< S, | |
< (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), | |
< *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); | |
< } | |
< | |
< // S = SM * (S - dot(SM, S)) | |
< float dot_SM_gradSM = 0; | |
< ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); | |
< ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); | |
< ggml_vec_mul_f32 (masked_begin, S, S, SM); | |
< | |
< // S = diag_mask_zero(S, P) * scale | |
< // already done by above ggml_vec_set_f32 | |
< | |
< // exclude known zero S[..] values from operation | |
< ggml_vec_scale_f32(masked_begin, S, scale); | |
< | |
< // S shape [M,1] | |
< // SM shape [M,1] | |
< // kcur shape [D,M] | |
< // qcur shape [D,1] | |
< // vcur shape [M,D] | |
< | |
< // grad[q][:D,iq1,iq2,iq3] += S @ kcur | |
< // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] | |
< // for ic: | |
< // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] | |
< // exclude known zero S[..] values from loop | |
< for (int64_t ic = 0; ic < masked_begin; ++ic) { | |
< ggml_vec_mad_f32(D, | |
< (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), | |
< (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), | |
< S[ic]); | |
< } | |
< | |
< // grad[k][:D,:M,iq2,iq3] += S.T @ qcur | |
< // for ic: | |
< // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] | |
< // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] | |
< // exclude known zero S[..] values from loop | |
< for (int64_t ic = 0; ic < masked_begin; ++ic) { | |
< ggml_vec_mad_f32(D, | |
< (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), | |
< (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), | |
< S[ic]); | |
< } | |
< | |
< // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM | |
< // for ic: | |
< // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] | |
< // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] | |
< // exclude known zero SM[..] values from mad | |
< for (int64_t ic = 0; ic < D; ++ic) { | |
< ggml_vec_mad_f32(masked_begin, | |
< (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), | |
< SM, | |
< *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); | |
< } | |
< } | |
< } | |
< } | |
< } | |
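The softmax part of the gradient chain above, grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])), is the standard softmax Jacobian-vector product. Below is a minimal scalar sketch of just that step, written without the ggml vector helpers; the function name and signature are illustrative only, not part of ggml.

static void softmax_backward_sketch(int n, float * dx, const float * y, const float * dy) {
    // y  = softmax(x)  (S4 above),  dy = dL/dy  (grad[S4]),  dx = dL/dx  (grad[S3])
    float dot = 0.0f;
    for (int i = 0; i < n; ++i) {
        dot += y[i]*dy[i];              // dot(S4, grad[S4])
    }
    for (int i = 0; i < n; ++i) {
        dx[i] = y[i]*(dy[i] - dot);     // S4 * (grad[S4] - dot(S4, grad[S4]))
    }
}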
< | |
< static void ggml_compute_forward_flash_attn_back( | |
< const struct ggml_compute_params * params, | |
< const bool masked, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * q = dst->src[0]; | |
< | |
< switch (q->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_flash_attn_back_f32(params, masked, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_ssm_conv | |
< | |
< static void ggml_compute_forward_ssm_conv_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< const struct ggml_tensor * src0 = dst->src[0]; // conv_x | |
< const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nc = src1->ne[0]; // d_conv | |
< const int ncs = src0->ne[0]; // d_conv - 1 + n_t | |
< const int nr = src0->ne[1]; // d_inner | |
< const int n_t = dst->ne[1]; // tokens per sequence | |
< const int n_s = dst->ne[2]; // number of sequences in the batch | |
< | |
< GGML_ASSERT( dst->ne[0] == nr); | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src1->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< const int ir = ir1 - ir0; | |
< | |
< for (int i3 = 0; i3 < n_s; ++i3) { | |
< for (int i2 = 0; i2 < n_t; ++i2) { | |
< // {d_conv - 1 + n_t, d_inner, n_seqs} | |
< // sliding window | |
< const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s} | |
< const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner} | |
< float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s} | |
< | |
< // TODO: transpose the output for smaller strides for big batches? | |
< // d_inner | |
< for (int i1 = 0; i1 < ir; ++i1) { | |
< // rowwise dot product | |
< // NOTE: not using ggml_vec_dot_f32, because its sum is in double precision | |
< float sumf = 0.0f; | |
< | |
< // d_conv | |
< for (int i0 = 0; i0 < nc; ++i0) { | |
< sumf += s[i0 + i1*ncs] * c[i0 + i1*nc]; | |
< } | |
< x[i1] = sumf; | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_ssm_conv( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< switch (dst->src[0]->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_ssm_conv_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_ssm_scan | |
< | |
< static void ggml_compute_forward_ssm_scan_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< const struct ggml_tensor * src0 = dst->src[0]; // s | |
< const struct ggml_tensor * src1 = dst->src[1]; // x | |
< const struct ggml_tensor * src2 = dst->src[2]; // dt | |
< const struct ggml_tensor * src3 = dst->src[3]; // A | |
< const struct ggml_tensor * src4 = dst->src[4]; // B | |
< const struct ggml_tensor * src5 = dst->src[5]; // C | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int64_t nc = src0->ne[0]; // d_state | |
< const int64_t nr = src0->ne[1]; // d_inner | |
< const int64_t n_t = src1->ne[1]; // number of tokens per sequence | |
< const int64_t n_s = src0->ne[2]; // number of sequences in the batch | |
< | |
< GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); | |
< GGML_ASSERT(src0->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src1->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src2->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src3->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src4->nb[0] == sizeof(float)); | |
< GGML_ASSERT(src5->nb[0] == sizeof(float)); | |
< // required for the dot product between s and C | |
< GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); | |
< // required for per-sequence offsets for states | |
< GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); | |
< // required to get correct offset for state destination (i.e. src1->nb[3]) | |
< GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< const int ir = ir1 - ir0; | |
< | |
< for (int i3 = 0; i3 < n_s; ++i3) { | |
< for (int i2 = 0; i2 < n_t; ++i2) { | |
< const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} | |
< const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} | |
< const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} | |
< const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} | |
< const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} | |
< const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} | |
< float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} | |
< float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} | |
< | |
< // use the output as the source for the next token-wise iterations | |
< if (i2 > 0) { s0 = s; } | |
< | |
< // d_inner | |
< for (int i1 = 0; i1 < ir; ++i1) { | |
< // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 | |
< float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; | |
< float x_dt = x[i1] * dt_soft_plus; | |
< float sumf = 0.0f; | |
< // d_state | |
< for (int i0 = 0; i0 < nc; ++i0) { | |
< int i = i0 + i1*nc; | |
< // state = prev_state * dA + dB * x | |
< float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); | |
< // y = rowwise_dotprod(state, C) | |
< sumf += state * C[i0]; | |
< s[i] = state; | |
< } | |
< y[i1] = sumf; | |
< } | |
< } | |
< } | |
< } | |
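For reference, the per-channel work done in the inner loops above can be written as a single-channel sketch. The function below is illustrative only (not ggml API) and assumes expf/log1pf from <math.h>; it also shows why the softplus switches to the identity for dt > 20: expf would overflow there, while softplus(dt) is already approximately dt.

static float ssm_scan_step_sketch(
        float * state, int d_state,         // d_state entries for one inner channel
        float x, float dt,
        const float * A, const float * B, const float * C) {
    // numerically stable softplus of dt
    const float dt_sp = dt <= 20.0f ? log1pf(expf(dt)) : dt;
    const float x_dt  = x * dt_sp;

    float y = 0.0f;
    for (int i = 0; i < d_state; ++i) {
        // discretized state update: state = state * exp(dt*A) + B * (x*dt)
        state[i] = state[i]*expf(dt_sp*A[i]) + B[i]*x_dt;
        // output is the dot product of the updated state with C
        y += state[i]*C[i];
    }
    return y;
}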
< | |
< static void ggml_compute_forward_ssm_scan( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< switch (dst->src[0]->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_ssm_scan_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_win_part | |
< | |
< static void ggml_compute_forward_win_part_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< UNUSED(params); | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) | |
< GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) | |
< | |
< const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; | |
< const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; | |
< const int32_t w = ((const int32_t *)(dst->op_params))[2]; | |
< | |
< assert(ne00 == ne0); | |
< assert(ne3 == nep0*nep1); | |
< | |
< // TODO: optimize / multi-thread | |
< for (int py = 0; py < nep1; ++py) { | |
< for (int px = 0; px < nep0; ++px) { | |
< const int64_t i3 = py*nep0 + px; | |
< for (int64_t i2 = 0; i2 < ne2; ++i2) { | |
< for (int64_t i1 = 0; i1 < ne1; ++i1) { | |
< for (int64_t i0 = 0; i0 < ne0; ++i0) { | |
< const int64_t i02 = py*w + i2; | |
< const int64_t i01 = px*w + i1; | |
< const int64_t i00 = i0; | |
< | |
< const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; | |
< const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; | |
< | |
< if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { | |
< ((float *) dst->data)[i] = 0.0f; | |
< } else { | |
< ((float *) dst->data)[i] = ((float *) src0->data)[j]; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_win_part( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_win_part_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_win_unpart | |
< | |
< static void ggml_compute_forward_win_unpart_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< UNUSED(params); | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) | |
< GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) | |
< | |
< const int32_t w = ((const int32_t *)(dst->op_params))[0]; | |
< | |
< // padding | |
< const int px = (w - ne1%w)%w; | |
< //const int py = (w - ne2%w)%w; | |
< | |
< const int npx = (px + ne1)/w; | |
< //const int npy = (py + ne2)/w; | |
< | |
< assert(ne0 == ne00); | |
< | |
< // TODO: optimize / multi-thread | |
< for (int64_t i2 = 0; i2 < ne2; ++i2) { | |
< for (int64_t i1 = 0; i1 < ne1; ++i1) { | |
< for (int64_t i0 = 0; i0 < ne0; ++i0) { | |
< const int ip2 = i2/w; | |
< const int ip1 = i1/w; | |
< | |
< const int64_t i02 = i2%w; | |
< const int64_t i01 = i1%w; | |
< const int64_t i00 = i0; | |
< | |
< const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; | |
< const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; | |
< | |
< ((float *) dst->data)[j] = ((float *) src0->data)[i]; | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_win_unpart( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_win_unpart_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_unary | |
< | |
< static void ggml_compute_forward_unary( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const enum ggml_unary_op op = ggml_get_unary_op(dst); | |
< | |
< switch (op) { | |
< case GGML_UNARY_OP_ABS: | |
< { | |
< ggml_compute_forward_abs(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_SGN: | |
< { | |
< ggml_compute_forward_sgn(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_NEG: | |
< { | |
< ggml_compute_forward_neg(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_STEP: | |
< { | |
< ggml_compute_forward_step(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_TANH: | |
< { | |
< ggml_compute_forward_tanh(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_ELU: | |
< { | |
< ggml_compute_forward_elu(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_RELU: | |
< { | |
< ggml_compute_forward_relu(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_SIGMOID: | |
< { | |
< ggml_compute_forward_sigmoid(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_GELU: | |
< { | |
< ggml_compute_forward_gelu(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_GELU_QUICK: | |
< { | |
< ggml_compute_forward_gelu_quick(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_SILU: | |
< { | |
< ggml_compute_forward_silu(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_HARDSWISH: | |
< { | |
< ggml_compute_forward_hardswish(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_HARDSIGMOID: | |
< { | |
< ggml_compute_forward_hardsigmoid(params, dst); | |
< } break; | |
< case GGML_UNARY_OP_EXP: | |
< { | |
< ggml_compute_forward_exp(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_get_rel_pos | |
< | |
< static void ggml_compute_forward_get_rel_pos_f16( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< UNUSED(params); | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< | |
< const int64_t w = ne1; | |
< | |
< ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; | |
< ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; | |
< | |
< for (int64_t i2 = 0; i2 < ne2; ++i2) { | |
< for (int64_t i1 = 0; i1 < ne1; ++i1) { | |
< const int64_t pos = (w - i1 - 1) + i2; | |
< for (int64_t i0 = 0; i0 < ne0; ++i0) { | |
< dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_get_rel_pos( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F16: | |
< case GGML_TYPE_BF16: | |
< { | |
< ggml_compute_forward_get_rel_pos_f16(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_add_rel_pos | |
< | |
< static void ggml_compute_forward_add_rel_pos_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< const struct ggml_tensor * src2 = dst->src[2]; | |
< | |
< const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; | |
< if (!inplace) { | |
< if (params->ith == 0) { | |
< memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< } | |
< // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 | |
< | |
< float * src1_data = (float *) src1->data; | |
< float * src2_data = (float *) src2->data; | |
< float * dst_data = (float *) dst->data; | |
< | |
< const int64_t ne10 = src1->ne[0]; | |
< const int64_t ne11 = src1->ne[1]; | |
< const int64_t ne12 = src1->ne[2]; | |
< const int64_t ne13 = src1->ne[3]; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< // total patches in dst | |
< const int np = ne13; | |
< | |
< // patches per thread | |
< const int dp = (np + nth - 1)/nth; | |
< | |
< // patch range for this thread | |
< const int ip0 = dp*ith; | |
< const int ip1 = MIN(ip0 + dp, np); | |
< | |
< for (int64_t i13 = ip0; i13 < ip1; ++i13) { | |
< for (int64_t i12 = 0; i12 < ne12; ++i12) { | |
< for (int64_t i11 = 0; i11 < ne11; ++i11) { | |
< const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; | |
< for (int64_t i10 = 0; i10 < ne10; ++i10) { | |
< const int64_t jp0 = jp1 + i10; | |
< const float src1_e = src1_data[jp0]; | |
< const float src2_e = src2_data[jp0]; | |
< | |
< const int64_t jdh = jp0 * ne10; | |
< const int64_t jdw = jdh - (ne10 - 1) * i10; | |
< | |
< for (int64_t j = 0; j < ne10; ++j) { | |
< dst_data[jdh + j ] += src2_e; | |
< dst_data[jdw + j*ne10] += src1_e; | |
< } | |
< } | |
< } | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_add_rel_pos( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_add_rel_pos_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_rwkv_wkv6 | |
< | |
< static void ggml_compute_forward_rwkv_wkv6_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< const int64_t T = dst->src[1]->ne[2]; | |
< const int64_t C = dst->ne[0]; | |
< const int64_t HEADS = dst->src[1]->ne[1]; | |
< const int64_t n_seqs = dst->src[5]->ne[1]; | |
< const int64_t head_size = C / HEADS; | |
< | |
< float * dst_data = (float *) dst->data; | |
< float * state = ((float *) dst->data) + C * T; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< if (ith >= HEADS) { | |
< return; | |
< } | |
< | |
< const int h_start = (HEADS * ith) / nth; | |
< const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? | |
< (HEADS * (ith + 1)) / nth : HEADS; | |
< | |
< float * k = (float *) dst->src[0]->data; | |
< float * v = (float *) dst->src[1]->data; | |
< float * r = (float *) dst->src[2]->data; | |
< float * time_faaaa = (float *) dst->src[3]->data; | |
< float * time_decay = (float *) dst->src[4]->data; | |
< | |
< size_t t_stride = HEADS * head_size; // Same as C | |
< | |
< size_t h_stride = C / HEADS; | |
< GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS | |
< size_t h_stride_2d = head_size * head_size; | |
< | |
< if (ith == 0) { | |
< memset(dst_data, 0, T * C * sizeof(float)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< | |
< #if defined(__AVX__) && !defined(__AVX512F__) | |
< #define GGML_F32X GGML_F32x8 | |
< #define GGML_F32X_SET1 GGML_F32x8_SET1 | |
< #define GGML_F32X_LOAD GGML_F32x8_LOAD | |
< #define GGML_F32X_STORE GGML_F32x8_STORE | |
< #define GGML_F32X_MUL GGML_F32x8_MUL | |
< #define GGML_F32X_FMA GGML_F32x8_FMA | |
< #define WKV_VECTOR_SIZE 8 | |
< #elif defined(__AVX512F__) | |
< #define GGML_F32X GGML_F32x16 | |
< #define GGML_F32X_SET1 GGML_F32x16_SET1 | |
< #define GGML_F32X_LOAD GGML_F32x16_LOAD | |
< #define GGML_F32X_STORE GGML_F32x16_STORE | |
< #define GGML_F32X_MUL GGML_F32x16_MUL | |
< #define GGML_F32X_FMA GGML_F32x16_FMA | |
< #define WKV_VECTOR_SIZE 16 | |
< #elif defined(__ARM_NEON) && defined(__aarch64__) | |
< #define GGML_F32X GGML_F32x4 | |
< #define GGML_F32X_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32X_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32X_STORE GGML_F32x4_STORE | |
< #define GGML_F32X_MUL GGML_F32x4_MUL | |
< #define GGML_F32X_FMA GGML_F32x4_FMA | |
< #define WKV_VECTOR_SIZE 4 | |
< #endif | |
< | |
< #ifdef WKV_VECTOR_SIZE | |
< const int64_t vec_count = head_size / WKV_VECTOR_SIZE; | |
< | |
< for (int64_t t = 0; t < T; t++) { | |
< size_t t_offset = t * t_stride; | |
< size_t state_offset = head_size * C * (t / (T / n_seqs)); | |
< float * state_cur = state + state_offset; | |
< float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; | |
< | |
< for (int64_t h = h_start; h < h_end; h++) { | |
< size_t h_offset = h * h_stride; | |
< size_t t_h_offset = t_offset + h_offset; | |
< size_t h_2d_offset = h * h_stride_2d; | |
< | |
< for (int64_t i = 0; i < head_size; i++) { | |
< size_t t_h_i_offset = t_h_offset + i; | |
< size_t h_i_offset = h_offset + i; | |
< size_t h_2d_i_offset = h_2d_offset + i * h_stride; | |
< | |
< float k_val = k[t_h_i_offset]; | |
< float r_val = r[t_h_i_offset]; | |
< float time_faaaa_val = time_faaaa[h_i_offset]; | |
< float time_decay_val = time_decay[t_h_i_offset]; | |
< | |
< // Broadcast scalar values to vectors | |
< GGML_F32X k_vec = GGML_F32X_SET1(k_val); | |
< GGML_F32X r_vec = GGML_F32X_SET1(r_val); | |
< GGML_F32X time_faaaa_vec = GGML_F32X_SET1(time_faaaa_val); | |
< GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val); | |
< | |
< for (int64_t j = 0; j < vec_count; j++) { | |
< size_t base_j = j * WKV_VECTOR_SIZE; | |
< size_t t_h_j_offset = t_h_offset + base_j; | |
< size_t h_2d_i_j_offset = h_2d_i_offset + base_j; | |
< | |
< // Load WKV_VECTOR_SIZE elements at once | |
< GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]); | |
< GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]); | |
< GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]); | |
< | |
< // Compute kv = v * k | |
< GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec); | |
< | |
< // Compute temp = kv * time_faaaa + prev_state | |
< GGML_F32X temp_vec = GGML_F32X_FMA(prev_state_vec, kv_vec, time_faaaa_vec); | |
< | |
< // Update dst: dst += temp * r | |
< dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, r_vec); | |
< GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec); | |
< | |
< // Update state: state = prev_state * time_decay + kv | |
< GGML_F32X new_state_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, time_decay_vec); | |
< GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], new_state_vec); | |
< } | |
< | |
< // Handle any remaining elements; head_size is expected to be a multiple of WKV_VECTOR_SIZE, so this loop normally does not run. | |
< for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) { | |
< size_t t_h_j_offset = t_h_offset + j; | |
< size_t h_2d_i_j_offset = h_2d_i_offset + j; | |
< float v_val = v[t_h_j_offset]; | |
< float kv_val = v_val * k_val; | |
< float prev_state_val = state_prev[h_2d_i_j_offset]; | |
< float temp_val = kv_val * time_faaaa_val + prev_state_val; | |
< dst_data[t_h_j_offset] += temp_val * r_val; | |
< state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; | |
< } | |
< } | |
< } | |
< } | |
< | |
< #else | |
< // basically fused operations: | |
< // dst = r @ (time_faaaa * (k @ v) + state), | |
< // state = time_decay * state + (k @ v), | |
< // recursive through each token | |
< for (int64_t t = 0; t < T; t++) { | |
< size_t t_offset = t * t_stride; | |
< size_t state_offset = head_size * C * (t / (T / n_seqs)); | |
< float * state_cur = state + state_offset; | |
< float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[5]->data + state_offset; | |
< | |
< for (int64_t h = h_start; h < h_end; h++) { | |
< size_t h_offset = h * h_stride; | |
< size_t t_h_offset = t_offset + h_offset; | |
< size_t h_2d_offset = h * h_stride_2d; | |
< | |
< for (int64_t i = 0; i < head_size; i++) { | |
< size_t t_h_i_offset = t_h_offset + i; | |
< size_t h_i_offset = h_offset + i; | |
< size_t h_2d_i_offset = h_2d_offset + i * h_stride; | |
< | |
< float k_val = k[t_h_i_offset]; | |
< float r_val = r[t_h_i_offset]; | |
< float time_faaaa_val = time_faaaa[h_i_offset]; | |
< // RWKV v6: different time_decay for each token. | |
< float time_decay_val = time_decay[t_h_i_offset]; | |
< | |
< for (int64_t j = 0; j < head_size; j++) { | |
< size_t t_h_j_offset = t_h_offset + j; | |
< size_t h_2d_i_j_offset = h_2d_i_offset + j; | |
< | |
< float v_val = v[t_h_j_offset]; | |
< float kv_val = v_val * k_val; | |
< float prev_state_val = state_prev[h_2d_i_j_offset]; | |
< float temp_val = kv_val * time_faaaa_val + prev_state_val; | |
< dst_data[t_h_j_offset] += temp_val * r_val; | |
< state_cur[h_2d_i_j_offset] = prev_state_val * time_decay_val + kv_val; | |
< } | |
< } | |
< } | |
< } | |
< #endif | |
< } | |
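The fused recurrence described in the comment above (dst = r @ (time_faaaa * (k @ v) + state), state = time_decay * state + (k @ v)) can be summarized per head by the following scalar sketch; the function name is illustrative only.

static void wkv6_step_sketch(
        int n,                               // head_size
        float * out,                         // [n], accumulated in place
        float * S,                           // [n*n] state, updated in place
        const float * r, const float * k, const float * v,
        const float * u,                     // time_faaaa (per-channel bonus)
        const float * w) {                   // time_decay for this token
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            const float kv = k[i]*v[j];
            out[j]     += r[i]*(u[i]*kv + S[i*n + j]);  // receptance-weighted readout
            S[i*n + j]  = w[i]*S[i*n + j] + kv;         // decayed state plus new kv
        }
    }
}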
< | |
< | |
< static void ggml_compute_forward_rwkv_wkv6( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_rwkv_wkv6_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_gla | |
< | |
< static void ggml_compute_forward_gla_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< const int64_t T = dst->src[1]->ne[2]; | |
< const int64_t C = dst->ne[0]; | |
< const int64_t HEADS = dst->src[1]->ne[1]; | |
< const int64_t n_seqs = dst->src[4]->ne[1]; | |
< const int64_t head_size = C / HEADS; | |
< const float scale = ggml_get_op_params_f32(dst, 0); | |
< | |
< float * dst_data = (float *) dst->data; | |
< float * state = ((float *) dst->data) + C * T; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< if (ith >= HEADS) { | |
< return; | |
< } | |
< | |
< const int h_start = (HEADS * ith) / nth; | |
< const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? | |
< (HEADS * (ith + 1)) / nth : HEADS; | |
< | |
< float * k = (float *) dst->src[0]->data; | |
< float * v = (float *) dst->src[1]->data; | |
< float * q = (float *) dst->src[2]->data; | |
< float * g = (float *) dst->src[3]->data; | |
< | |
< size_t t_stride = HEADS * head_size; // Same as C | |
< | |
< size_t h_stride = C / HEADS; | |
< GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS | |
< size_t h_stride_2d = head_size * head_size; | |
< | |
< if (ith == 0) { | |
< memset(dst_data, 0, T * C * sizeof(float)); | |
< } | |
< ggml_barrier(params->threadpool); | |
< | |
< | |
< #if defined(__AVX__) && !defined(__AVX512F__) | |
< #define GGML_F32X GGML_F32x8 | |
< #define GGML_F32X_SET1 GGML_F32x8_SET1 | |
< #define GGML_F32X_LOAD GGML_F32x8_LOAD | |
< #define GGML_F32X_STORE GGML_F32x8_STORE | |
< #define GGML_F32X_MUL GGML_F32x8_MUL | |
< #define GGML_F32X_FMA GGML_F32x8_FMA | |
< #define GLA_VECTOR_SIZE 8 | |
< #elif defined(__AVX512F__) | |
< #define GGML_F32X GGML_F32x16 | |
< #define GGML_F32X_SET1 GGML_F32x16_SET1 | |
< #define GGML_F32X_LOAD GGML_F32x16_LOAD | |
< #define GGML_F32X_STORE GGML_F32x16_STORE | |
< #define GGML_F32X_MUL GGML_F32x16_MUL | |
< #define GGML_F32X_FMA GGML_F32x16_FMA | |
< #define GLA_VECTOR_SIZE 16 | |
< #elif defined(__ARM_NEON) && defined(__aarch64__) | |
< #define GGML_F32X GGML_F32x4 | |
< #define GGML_F32X_SET1 GGML_F32x4_SET1 | |
< #define GGML_F32X_LOAD GGML_F32x4_LOAD | |
< #define GGML_F32X_STORE GGML_F32x4_STORE | |
< #define GGML_F32X_MUL GGML_F32x4_MUL | |
< #define GGML_F32X_FMA GGML_F32x4_FMA | |
< #define GLA_VECTOR_SIZE 4 | |
< #endif | |
< | |
< #ifdef GLA_VECTOR_SIZE | |
< const int64_t vec_count = head_size / GLA_VECTOR_SIZE; | |
< | |
< for (int64_t t = 0; t < T; t++) { | |
< size_t t_offset = t * t_stride; | |
< size_t state_offset = head_size * C * (t / (T / n_seqs)); | |
< float * state_cur = state + state_offset; | |
< float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset; | |
< | |
< for (int64_t h = h_start; h < h_end; h++) { | |
< size_t h_offset = h * h_stride; | |
< size_t t_h_offset = t_offset + h_offset; | |
< size_t h_2d_offset = h * h_stride_2d; | |
< | |
< for (int64_t i = 0; i < head_size; i++) { | |
< size_t t_h_i_offset = t_h_offset + i; | |
< size_t h_2d_i_offset = h_2d_offset + i * h_stride; | |
< | |
< float k_val = k[t_h_i_offset]; | |
< float q_val = q[t_h_i_offset] * scale; | |
< float g_val = g[t_h_i_offset]; | |
< | |
< // Broadcast scalar values to vectors | |
< GGML_F32X k_vec = GGML_F32X_SET1(k_val); | |
< GGML_F32X q_vec = GGML_F32X_SET1(q_val); | |
< GGML_F32X g_vec = GGML_F32X_SET1(g_val); | |
< | |
< for (int64_t j = 0; j < vec_count; j++) { | |
< size_t base_j = j * GLA_VECTOR_SIZE; | |
< size_t t_h_j_offset = t_h_offset + base_j; | |
< size_t h_2d_i_j_offset = h_2d_i_offset + base_j; | |
< | |
< // Load GLA_VECTOR_SIZE elements at once | |
< GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]); | |
< GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]); | |
< GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]); | |
< | |
< // Compute kv = v * k | |
< GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec); | |
< | |
< // Compute temp = prev_state * g + kv | |
< GGML_F32X temp_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, g_vec); | |
< | |
< // Update dst: dst += temp * q | |
< dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, q_vec); | |
< GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec); | |
< | |
< // Update state | |
< GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], temp_vec); | |
< } | |
< | |
< // Handle any remaining elements; head_size is expected to be a multiple of GLA_VECTOR_SIZE, so this loop normally does not run. | |
< for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) { | |
< size_t t_h_j_offset = t_h_offset + j; | |
< size_t h_2d_i_j_offset = h_2d_i_offset + j; | |
< float v_val = v[t_h_j_offset]; | |
< float kv_val = v_val * k_val; | |
< float prev_state_val = state_prev[h_2d_i_j_offset]; | |
< float temp_val = kv_val + prev_state_val * g_val; | |
< dst_data[t_h_j_offset] += temp_val * q_val; | |
< state_cur[h_2d_i_j_offset] = temp_val; | |
< } | |
< } | |
< } | |
< } | |
< | |
< #else | |
< for (int64_t t = 0; t < T; t++) { | |
< size_t t_offset = t * t_stride; | |
< size_t state_offset = head_size * C * (t / (T / n_seqs)); | |
< float * state_cur = state + state_offset; | |
< float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset; | |
< | |
< for (int64_t h = h_start; h < h_end; h++) { | |
< size_t h_offset = h * h_stride; | |
< size_t t_h_offset = t_offset + h_offset; | |
< size_t h_2d_offset = h * h_stride_2d; | |
< | |
< for (int64_t i = 0; i < head_size; i++) { | |
< size_t t_h_i_offset = t_h_offset + i; | |
< size_t h_2d_i_offset = h_2d_offset + i * h_stride; | |
< | |
< float k_val = k[t_h_i_offset]; | |
< float q_val = q[t_h_i_offset] * scale; | |
< float g_val = g[t_h_i_offset]; | |
< | |
< for (int64_t j = 0; j < head_size; j++) { | |
< size_t t_h_j_offset = t_h_offset + j; | |
< size_t h_2d_i_j_offset = h_2d_i_offset + j; | |
< | |
< float v_val = v[t_h_j_offset]; | |
< float kv_val = v_val * k_val; | |
< float prev_state_val = state_prev[h_2d_i_j_offset]; | |
< float temp_val = prev_state_val * g_val + kv_val; | |
< dst_data[t_h_j_offset] += temp_val * q_val; | |
< state_cur[h_2d_i_j_offset] = temp_val; | |
< } | |
< } | |
< } | |
< } | |
< #endif | |
< } | |
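Unlike the WKV6 recurrence, the gated-linear-attention update above has no per-channel bonus term: the state is decayed by the gate g, accumulates k*v, and the same updated state is read out with the scaled query. A minimal single-head scalar sketch with illustrative names only:

static void gla_step_sketch(
        int n, float * out, float * S,
        const float * q, const float * k, const float * v,
        const float * g, float scale) {
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            const float s = g[i]*S[i*n + j] + k[i]*v[j];  // gated decay + new kv
            out[j]     += (q[i]*scale)*s;                 // readout with scaled query
            S[i*n + j]  = s;                              // store updated state
        }
    }
}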
< | |
< | |
< static void ggml_compute_forward_gla( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_gla_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_rwkv_wkv7 | |
< | |
< static void ggml_compute_forward_rwkv_wkv7_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< const int64_t T = dst->src[1]->ne[2]; | |
< const int64_t C = dst->ne[0]; | |
< const int64_t HEADS = dst->src[1]->ne[1]; | |
< const int64_t n_seqs = dst->src[6]->ne[1]; | |
< const int64_t head_size = C / HEADS; | |
< | |
< float * dst_data = (float *) dst->data; | |
< float * state = ((float *) dst->data) + C * T; | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< if (ith >= HEADS) { | |
< return; | |
< } | |
< | |
< const int h_start = (HEADS * ith) / nth; | |
< const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? | |
< (HEADS * (ith + 1)) / nth : HEADS; | |
< | |
< float * r = (float *) dst->src[0]->data; | |
< float * w = (float *) dst->src[1]->data; | |
< float * k = (float *) dst->src[2]->data; | |
< float * v = (float *) dst->src[3]->data; | |
< float * a = (float *) dst->src[4]->data; | |
< float * b = (float *) dst->src[5]->data; | |
< | |
< int64_t t_stride = HEADS * head_size; // Same as C | |
< | |
< int64_t h_stride = C / HEADS; | |
< GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS | |
< int64_t h_stride_2d = head_size * head_size; | |
< | |
< #if defined(GGML_SIMD) | |
< for (int64_t t = 0; t < T; t++) { | |
< int64_t t_offset = t * t_stride; | |
< int64_t state_offset = head_size * C * (t / (T / n_seqs)); | |
< float * state_cur = state + state_offset; | |
< float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; | |
< | |
< for (int64_t h = h_start; h < h_end; h++) { | |
< int64_t h_offset = h * h_stride; | |
< int64_t t_h_offset = t_offset + h_offset; | |
< int64_t h_2d_offset = h * h_stride_2d; | |
< | |
< for (int64_t ii = 0; ii < head_size; ii++) { | |
< int64_t t_h_i_offset = t_h_offset + ii; | |
< int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; | |
< | |
< GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); | |
< | |
< float sa = 0; | |
< { | |
< GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; | |
< GGML_F32_VEC ax[GGML_F32_ARR]; | |
< GGML_F32_VEC ay[GGML_F32_ARR]; | |
< for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) { | |
< for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { | |
< ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]); | |
< ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]); | |
< sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]); | |
< } | |
< } | |
< GGML_F32_VEC_REDUCE(sa, sum); | |
< } | |
< | |
< GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); | |
< | |
< int64_t j = 0; | |
< GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; | |
< for (; j < head_size; j += GGML_F32_STEP) { | |
< for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { | |
< int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; | |
< int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; | |
< | |
< GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); | |
< GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]); | |
< GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]); | |
< GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]); | |
< | |
< k_vec = GGML_F32_VEC_MUL(v_vec, k_vec); | |
< | |
< GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]); | |
< // kv + s * decay + sa * b | |
< state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec); | |
< state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec); | |
< GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec); | |
< | |
< result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec); | |
< } | |
< } | |
< GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec); | |
< | |
< // There should be no leftovers, since head_size is expected to be a multiple of GGML_F32_STEP. | |
< for (; j < head_size; j++) { | |
< int64_t t_h_j_offset = t_h_offset + j; | |
< int64_t h_2d_i_j_offset = h_2d_i_offset + j; | |
< | |
< float r_val = r[t_h_j_offset]; | |
< float w_val = w[t_h_j_offset]; | |
< float k_val = k[t_h_j_offset]; | |
< float b_val = b[t_h_j_offset]; | |
< float kv_val = v[t_h_i_offset] * k_val; | |
< | |
< float prev_state_val = state_prev[h_2d_i_j_offset]; | |
< state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; | |
< dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val; | |
< } | |
< } | |
< } | |
< } | |
< #else | |
< for (int64_t t = 0; t < T; t++) { | |
< int64_t t_offset = t * t_stride; | |
< int64_t state_offset = head_size * C * (t / (T / n_seqs)); | |
< float * state_cur = state + state_offset; | |
< float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; | |
< | |
< for (int64_t h = h_start; h < h_end; h++) { | |
< int64_t h_offset = h * h_stride; | |
< int64_t t_h_offset = t_offset + h_offset; | |
< int64_t h_2d_offset = h * h_stride_2d; | |
< | |
< for (int64_t i = 0; i < head_size; i++) { | |
< int64_t t_h_i_offset = t_h_offset + i; | |
< int64_t h_2d_i_offset = h_2d_offset + i * h_stride; | |
< | |
< float v_val = v[t_h_i_offset]; | |
< | |
< float sa = 0, result = 0; | |
< for (int64_t j = 0; j < head_size; j++) { | |
< sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j]; | |
< } | |
< | |
< for (int64_t j = 0; j < head_size; j++) { | |
< int64_t t_h_j_offset = t_h_offset + j; | |
< int64_t h_2d_i_j_offset = h_2d_i_offset + j; | |
< | |
< float r_val = r[t_h_j_offset]; | |
< float w_val = w[t_h_j_offset]; | |
< float k_val = k[t_h_j_offset]; | |
< float b_val = b[t_h_j_offset]; | |
< float kv_val = v_val * k_val; | |
< float prev_state_val = state_prev[h_2d_i_j_offset]; | |
< state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; | |
< result += state_cur[h_2d_i_j_offset] * r_val; | |
< } | |
< dst_data[t_h_i_offset] = result; | |
< } | |
< } | |
< } | |
< #endif | |
< } | |
< | |
< | |
< static void ggml_compute_forward_rwkv_wkv7( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_rwkv_wkv7_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_map_unary | |
< | |
< static void ggml_compute_forward_map_unary_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_unary_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, dst)); | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< | |
< for (int i = 0; i < n; i++) { | |
< fun(nc, | |
< (float *) ((char *) dst->data + i*( dst->nb[1])), | |
< (float *) ((char *) src0->data + i*(src0->nb[1]))); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_map_unary( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_unary_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_map_unary_f32(params, dst, fun); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_map_binary | |
< | |
< static void ggml_compute_forward_map_binary_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_binary_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< assert(ggml_is_contiguous_1(src0)); | |
< assert(ggml_is_contiguous_1(src1)); | |
< assert(ggml_is_contiguous_1(dst)); | |
< assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); | |
< | |
< const int n = ggml_nrows(src0); | |
< const int nc = src0->ne[0]; | |
< | |
< for (int i = 0; i < n; i++) { | |
< fun(nc, | |
< (float *) ((char *) dst->data + i*( dst->nb[1])), | |
< (float *) ((char *) src0->data + i*(src0->nb[1])), | |
< (float *) ((char *) src1->data + i*(src1->nb[1]))); | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_map_binary( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_binary_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_map_binary_f32(params, dst, fun); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_map_custom1 | |
< | |
< static void ggml_compute_forward_map_custom1_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_custom1_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * a = dst->src[0]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< fun(dst, a); | |
< } | |
< | |
< // ggml_compute_forward_map_custom2 | |
< | |
< static void ggml_compute_forward_map_custom2_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_custom2_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * a = dst->src[0]; | |
< const struct ggml_tensor * b = dst->src[1]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< fun(dst, a, b); | |
< } | |
< | |
< // ggml_compute_forward_map_custom3 | |
< | |
< static void ggml_compute_forward_map_custom3_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst, | |
< const ggml_custom3_op_f32_t fun) { | |
< | |
< const struct ggml_tensor * a = dst->src[0]; | |
< const struct ggml_tensor * b = dst->src[1]; | |
< const struct ggml_tensor * c = dst->src[2]; | |
< | |
< if (params->ith != 0) { | |
< return; | |
< } | |
< | |
< fun(dst, a, b, c); | |
< } | |
< | |
< // ggml_compute_forward_map_custom1 | |
< | |
< static void ggml_compute_forward_map_custom1( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * a = dst->src[0]; | |
< | |
< struct ggml_map_custom1_op_params p; | |
< memcpy(&p, dst->op_params, sizeof(p)); | |
< | |
< p.fun(dst, a, params->ith, params->nth, p.userdata); | |
< } | |
< | |
< // ggml_compute_forward_map_custom2 | |
< | |
< static void ggml_compute_forward_map_custom2( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * a = dst->src[0]; | |
< const struct ggml_tensor * b = dst->src[1]; | |
< | |
< struct ggml_map_custom2_op_params p; | |
< memcpy(&p, dst->op_params, sizeof(p)); | |
< | |
< p.fun(dst, a, b, params->ith, params->nth, p.userdata); | |
< } | |
< | |
< // ggml_compute_forward_map_custom3 | |
< | |
< static void ggml_compute_forward_map_custom3( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * a = dst->src[0]; | |
< const struct ggml_tensor * b = dst->src[1]; | |
< const struct ggml_tensor * c = dst->src[2]; | |
< | |
< struct ggml_map_custom3_op_params p; | |
< memcpy(&p, dst->op_params, sizeof(p)); | |
< | |
< p.fun(dst, a, b, c, params->ith, params->nth, p.userdata); | |
< } | |
< | |
< // ggml_compute_forward_cross_entropy_loss | |
< | |
< static void ggml_compute_forward_cross_entropy_loss_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src1 = dst->src[1]; | |
< | |
< GGML_ASSERT(src0->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src1->type == GGML_TYPE_F32); | |
< GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); | |
< GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); | |
< GGML_ASSERT(ggml_are_same_shape(src0, src1)); | |
< GGML_ASSERT(ggml_is_scalar(dst)); | |
< GGML_ASSERT(dst->type == GGML_TYPE_F32); | |
< | |
< // TODO: handle transposed/permuted matrices | |
< const int64_t nc = src0->ne[0]; | |
< const int64_t nr = ggml_nrows(src0); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< float * sums = (float *) params->wdata; | |
< float * st = ((float *) params->wdata) + nth + ith*nc; | |
< float sum_thread = 0.0f; | |
< | |
< GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); | |
< | |
< // rows per thread | |
< const int64_t dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int64_t ir0 = dr*ith; | |
< const int64_t ir1 = MIN(ir0 + dr, nr); | |
< | |
< for (int64_t i1 = ir0; i1 < ir1; ++i1) { | |
< const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]); | |
< const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]); | |
< | |
< #ifndef NDEBUG | |
< for (int64_t i = 0; i < nc; ++i) { | |
< //printf("p[%d] = %f\n", i, p[i]); | |
< assert(!isnan(s0[i])); | |
< assert(!isnan(s1[i])); | |
< } | |
< #endif | |
< | |
< float max = -INFINITY; | |
< ggml_vec_max_f32(nc, &max, s0); | |
< const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max); | |
< assert(sum_softmax >= 0.0); | |
< | |
< ggml_vec_add1_f32(nc, st, st, -sum_softmax); | |
< ggml_vec_mul_f32(nc, st, st, s1); | |
< | |
< float sum_st = 0.0f; | |
< ggml_vec_sum_f32(nc, &sum_st, st); | |
< sum_thread += sum_st; | |
< | |
< #ifndef NDEBUG | |
< for (int64_t i = 0; i < nc; ++i) { | |
< assert(!isnan(st[i])); | |
< assert(!isinf(st[i])); | |
< } | |
< #endif | |
< } | |
< sums[ith] = sum_thread; | |
< ggml_barrier(params->threadpool); | |
< | |
< if (ith == 0) { | |
< float * dp = (float *) dst->data; | |
< ggml_vec_sum_f32(nth, dp, sums); | |
< dp[0] *= -1.0f / (float) nr; | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_cross_entropy_loss( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_cross_entropy_loss_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< // ggml_compute_forward_cross_entropy_loss_back | |
< | |
< static void ggml_compute_forward_cross_entropy_loss_back_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * grad = dst->src[0]; // gradient of forward pass output | |
< const struct ggml_tensor * src0f = dst->src[1]; // src0 of forward pass | |
< const struct ggml_tensor * src1f = dst->src[2]; // src1 of forward pass | |
< | |
< GGML_ASSERT(ggml_is_contiguous(dst)); | |
< GGML_ASSERT(ggml_is_contiguous(src0f)); | |
< GGML_ASSERT(ggml_is_contiguous(src1f)); | |
< GGML_ASSERT(ggml_is_contiguous(grad)); | |
< GGML_ASSERT(ggml_are_same_shape(src0f, src1f) && ggml_are_same_shape(src0f, dst)); | |
< | |
< const int64_t ith = params->ith; | |
< const int64_t nth = params->nth; | |
< | |
< // TODO: handle transposed/permuted matrices | |
< const int64_t nc = src0f->ne[0]; | |
< const int64_t nr = ggml_nrows(src0f); | |
< | |
< // rows per thread | |
< const int64_t dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int64_t ir0 = dr*ith; | |
< const int64_t ir1 = MIN(ir0 + dr, nr); | |
< | |
< const float d_by_nr = ((const float *) grad->data)[0] / (float) nr; | |
< | |
< for (int64_t i1 = ir0; i1 < ir1; i1++) { | |
< float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); | |
< const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]); | |
< const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]); | |
< | |
< #ifndef NDEBUG | |
< for (int64_t i = 0; i < nc; ++i) { | |
< //printf("p[%d] = %f\n", i, p[i]); | |
< assert(!isnan(s0[i])); | |
< assert(!isnan(s1[i])); | |
< } | |
< #endif | |
< | |
< // soft_max | |
< float max = -INFINITY; | |
< ggml_vec_max_f32(nc, &max, s0); | |
< const ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); | |
< assert(sum > 0.0); | |
< ggml_vec_scale_f32(nc, ds0, 1.0/sum); | |
< | |
< // grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr | |
< ggml_vec_sub_f32(nc, ds0, ds0, s1); | |
< ggml_vec_scale_f32(nc, ds0, d_by_nr); | |
< | |
< #ifndef NDEBUG | |
< for (int64_t i = 0; i < nc; ++i) { | |
< assert(!isnan(ds0[i])); | |
< assert(!isinf(ds0[i])); | |
< } | |
< #endif | |
< } | |
< } | |
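Per row, the gradient computed above is grad = (softmax(src0f) - src1f) * d/nr, with the softmax evaluated in max-shifted form for numerical stability. A minimal per-row sketch without the ggml vector helpers, with an illustrative name only and assuming expf/INFINITY from <math.h>:

static void cross_entropy_grad_row_sketch(
        int nc, float * ds, const float * s0, const float * s1, float d_by_nr) {
    float max = -INFINITY;
    for (int i = 0; i < nc; ++i) {
        if (s0[i] > max) { max = s0[i]; }
    }
    float sum = 0.0f;
    for (int i = 0; i < nc; ++i) {
        ds[i] = expf(s0[i] - max);              // unnormalized, max-shifted softmax
        sum  += ds[i];
    }
    for (int i = 0; i < nc; ++i) {
        ds[i] = (ds[i]/sum - s1[i]) * d_by_nr;  // (softmax - target) scaled by d/nr
    }
}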
< | |
< static void ggml_compute_forward_cross_entropy_loss_back( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_cross_entropy_loss_back_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } | |
< | |
< static void ggml_compute_forward_opt_step_adamw_f32( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< const struct ggml_tensor * src0_grad = dst->src[1]; | |
< const struct ggml_tensor * src0_grad_m = dst->src[2]; | |
< const struct ggml_tensor * src0_grad_v = dst->src[3]; | |
< const struct ggml_tensor * adamw_params = dst->src[4]; | |
< | |
< GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); | |
< GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m)); | |
< GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); | |
< GGML_ASSERT(ggml_nelements(adamw_params) == 7); | |
< | |
< const int ith = params->ith; | |
< const int nth = params->nth; | |
< | |
< const int nr = ggml_nrows(src0); | |
< | |
< GGML_TENSOR_UNARY_OP_LOCALS | |
< GGML_ASSERT(nb00 == sizeof(float)); | |
< | |
< // rows per thread | |
< const int dr = (nr + nth - 1)/nth; | |
< | |
< // row range for this thread | |
< const int ir0 = dr*ith; | |
< const int ir1 = MIN(ir0 + dr, nr); | |
< | |
< const float * adamw_params_ptr = ggml_get_data_f32(adamw_params); | |
< const float alpha = adamw_params_ptr[0]; | |
< const float beta1 = adamw_params_ptr[1]; | |
< const float beta2 = adamw_params_ptr[2]; | |
< const float eps = adamw_params_ptr[3]; | |
< const float wd = adamw_params_ptr[4]; | |
< const float beta1h = adamw_params_ptr[5]; | |
< const float beta2h = adamw_params_ptr[6]; | |
< | |
< for (int ir = ir0; ir < ir1; ++ir) { | |
< const int64_t i03 = ir/(ne02*ne01); | |
< const int64_t i02 = (ir - i03*ne02*ne01)/ne01; | |
< const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); | |
< | |
< const size_t offset = i03*nb03 + i02*nb02 + i01*nb01; | |
< | |
< float * w = (float *) ((char *) src0->data + offset); // weight | |
< const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad | |
< float * m = (float *) ((char *) src0_grad_m->data + offset); | |
< float * v = (float *) ((char *) src0_grad_v->data + offset); | |
< | |
< for (int i00 = 0; i00 < ne00; ++i00) { | |
< m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); | |
< v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); | |
< | |
< const float mh = m[i00]*beta1h; | |
< const float vh = sqrtf(v[i00]*beta2h) + eps; | |
< | |
< // The weight decay is applied independently of the Adam momenta m and v. | |
< // This is NOT equivalent to L2 regularization, which would add w[i00]*w[i00] to the loss. | |
< // See: https://arxiv.org/pdf/1711.05101v3.pdf | |
< w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh; | |
< } | |
< } | |
< } | |
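For a single parameter, the update above reduces to the following sketch. beta1h and beta2h are the caller-provided bias-correction factors (presumably 1/(1 - beta1^t) and 1/(1 - beta2^t)), and the decoupled weight decay multiplies the weight directly instead of entering the loss. Illustrative name only, assuming sqrtf from <math.h>:

static float adamw_step_sketch(
        float w, float g, float * m, float * v,
        float alpha, float beta1, float beta2, float eps,
        float wd, float beta1h, float beta2h) {
    *m = *m*beta1 + g*(1.0f - beta1);        // first moment
    *v = *v*beta2 + g*g*(1.0f - beta2);      // second moment

    const float mh = *m*beta1h;              // bias-corrected first moment
    const float vh = sqrtf(*v*beta2h) + eps; // bias-corrected RMS of the gradients

    // decoupled weight decay, then the Adam step
    return w*(1.0f - alpha*wd) - alpha*mh/vh;
}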
< | |
< static void ggml_compute_forward_opt_step_adamw( | |
< const struct ggml_compute_params * params, | |
< struct ggml_tensor * dst) { | |
< | |
< const struct ggml_tensor * src0 = dst->src[0]; | |
< | |
< switch (src0->type) { | |
< case GGML_TYPE_F32: | |
< { | |
< ggml_compute_forward_opt_step_adamw_f32(params, dst); | |
< } break; | |
< default: | |
< { | |
< GGML_ABORT("fatal error"); | |
< } | |
< } | |
< } |