// Horizontal sum of an AVX register: adds the eight float lanes of `v`
// together and returns the scalar total.
//
// The reduction is pairwise (8 -> 4 -> 2 -> 1 lanes), so the rounding of the
// result can differ from a strict left-to-right scalar summation of the lanes.
//
// v: 256-bit vector holding eight packed single-precision floats.
// Returns: v[0] + v[1] + ... + v[7] as a float.
39[[nodiscard]]
inline float hsum(__m256 v)
noexcept {
// Split into the two 128-bit halves: lanes 0..3 (cast only reinterprets the
// low half) and lanes 4..7 (extract with index 1).
40 __m128 lo = _mm256_castps256_ps128(v);
41 __m128 hi = _mm256_extractf128_ps(v, 1);
// lo = [s0, s1, s2, s3] where s_k = v[k] + v[k + 4].
42 lo = _mm_add_ps(lo, hi);
// Duplicate the odd lanes: shuf = [s1, s1, s3, s3].
43 __m128 shuf = _mm_movehdup_ps(lo);
// After this add, lane 0 holds s0 + s1 and lane 2 holds s2 + s3; the odd
// lanes hold doubled values that are never read.
44 lo = _mm_add_ps(lo, shuf);
// movehl brings the upper two lanes of `lo` down, so its lane 0 is s2 + s3;
// add_ss adds lane 0 only, and cvtss extracts that lane as the final sum.
45 return _mm_cvtss_f32(_mm_add_ss(lo, _mm_movehl_ps(shuf, lo)));
55 float32x4_t acc = vdupq_n_f32(0.0f);
56 for (; i + 4 <= n; i += 4) {
57 float32x4_t d = vsubq_f32(vld1q_f32(a + i), vld1q_f32(b + i));
58 acc = vmlaq_f32(acc, d, d);
61#elif defined(VANE_AVX2)
62 __m256 acc = _mm256_setzero_ps();
63 for (; i + 8 <= n; i += 8) {
64 __m256 d = _mm256_sub_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i));
65 acc = _mm256_fmadd_ps(d, d, acc);
71 float d = a[i] - b[i];
83 float32x4_t acc = vdupq_n_f32(0.0f);
84 for (; i + 4 <= n; i += 4)
85 acc = vmlaq_f32(acc, vld1q_f32(a + i), vld1q_f32(b + i));
87#elif defined(VANE_AVX2)
88 __m256 acc = _mm256_setzero_ps();
89 for (; i + 8 <= n; i += 8)
90 acc = _mm256_fmadd_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i), acc);
101 float dot = 0.0f, na = 0.0f, nb = 0.0f;
105 float32x4_t vdot = vdupq_n_f32(0.0f), vna = vdupq_n_f32(0.0f), vnb = vdupq_n_f32(0.0f);
106 for (; i + 4 <= n; i += 4) {
107 float32x4_t va = vld1q_f32(a + i), vb = vld1q_f32(b + i);
108 vdot = vmlaq_f32(vdot, va, vb);
109 vna = vmlaq_f32(vna, va, va);
110 vnb = vmlaq_f32(vnb, vb, vb);
113#elif defined(VANE_AVX2)
114 __m256 vdot = _mm256_setzero_ps(), vna = _mm256_setzero_ps(), vnb = _mm256_setzero_ps();
115 for (; i + 8 <= n; i += 8) {
116 __m256 va = _mm256_loadu_ps(a + i), vb = _mm256_loadu_ps(b + i);
117 vdot = _mm256_fmadd_ps(va, vb, vdot);
118 vna = _mm256_fmadd_ps(va, va, vna);
119 vnb = _mm256_fmadd_ps(vb, vb, vnb);
130 float denom = na * nb;
132 float sim = dot / sqrtf(denom);
133 return 1.0f - std::clamp(sim, -1.0f, 1.0f);