#include <immintrin.h>

// Refuse to build without AVX support.
#ifndef __AVX__
#error Advanced Vector Extensions required.
#endif
// Some older compilers do not provide _mm256_set_m128(hi, lo); emulate it.
#ifndef _mm256_set_m128
#define _mm256_set_m128(va, vb) \
    _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
#endif
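
// Multiply-add helpers: return a*b + c using separate multiply and add
// instructions (plain AVX has no fused multiply-add, so the result is
// rounded twice).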
inline __m256 madd256_ps(__m256 a, __m256 b, __m256 c) {
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}

inline __m128 madd128_ps(__m128 a, __m128 b, __m128 c) {
    return _mm_add_ps(_mm_mul_ps(a, b), c);
}

inline __m128 madd128_ss(__m128 a, __m128 b, __m128 c) {
    return _mm_add_ss(_mm_mul_ss(a, b), c);
}

inline __m256d madd256_pd(__m256d a, __m256d b, __m256d c) {
    return _mm256_add_pd(_mm256_mul_pd(a, b), c);
}

inline __m128d madd128_pd(__m128d a, __m128d b, __m128d c) {
    return _mm_add_pd(_mm_mul_pd(a, b), c);
}

inline __m128d madd128_sd(__m128d a, __m128d b, __m128d c) {
    return _mm_add_sd(_mm_mul_sd(a, b), c);
}
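
// Horizontal sum of the eight floats in x; the total ends up in the lowest
// lane of the returned __m128.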
inline __m128 hsum256_ps(__m256 x) {
    const __m128 hiQuad  = _mm256_extractf128_ps(x, 1);      // ( x7, x6, x5, x4 )
    const __m128 loQuad  = _mm256_castps256_ps128(x);        // ( x3, x2, x1, x0 )
    const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);       // ( x3+x7, x2+x6, x1+x5, x0+x4 )
    const __m128 loDual  = sumQuad;
    const __m128 hiDual  = _mm_movehl_ps(sumQuad, sumQuad);  // ( -, -, x3+x7, x2+x6 )
    const __m128 sumDual = _mm_add_ps(loDual, hiDual);       // ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 )
    const __m128 lo      = sumDual;
    const __m128 hi      = _mm_shuffle_ps(sumDual, sumDual, 0x1); // ( -, -, -, x1+x3+x5+x7 )
    const __m128 sum     = _mm_add_ss(lo, hi);                // lowest lane = full sum
    return sum;
}
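
// Horizontal sums of two vectors at once: the lowest lane of the returned
// __m128 holds the sum of a, the next lane the sum of b.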
inline __m128 hsum2x256_ps(__m256 a, __m256 b) {
    __m256 x = _mm256_permute2f128_ps(a, b, 0x20);    // ( b3..b0, a3..a0 )
    __m256 y = _mm256_permute2f128_ps(a, b, 0x31);    // ( b7..b4, a7..a4 )
    x = _mm256_add_ps(x, y);                          // low half: partial sums of a, high half: of b
    y = _mm256_permute_ps(x, _MM_SHUFFLE(3, 2, 3, 2));
    x = _mm256_add_ps(x, y);
    y = _mm256_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1));
    x = _mm256_add_ps(x, y);                          // element 0 of each half = full sum
    __m128 upper = _mm256_extractf128_ps(x, 1);
    __m128 ret = _mm_unpacklo_ps(_mm256_castps256_ps128(x), upper); // ( -, -, sum(b), sum(a) )
    return ret;
}
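
// Horizontal sum of the four doubles in x; the total ends up in the lower
// lane of the returned __m128d.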
inline __m128d hsum256_pd(__m256d x) {
    const __m128d hiDual  = _mm256_extractf128_pd(x, 1);        // ( x3, x2 )
    const __m128d loDual  = _mm256_castpd256_pd128(x);          // ( x1, x0 )
    const __m128d sumDual = _mm_add_pd(loDual, hiDual);         // ( x1+x3, x0+x2 )
    const __m128d sum = _mm_hadd_pd(sumDual, _mm_setzero_pd()); // ( 0, x0+x1+x2+x3 )
    return sum;
}
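
// leftShift<N>: shift the whole 256-bit register left by N bytes (N a
// multiple of 4, 0 < N < 32), filling vacated lanes with zeros, i.e.
// element i of the result is element i - N/4 of the input.  Implemented
// with AVX1 lane permutes and blends, since plain AVX has no cross-lane
// byte shift.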
template <int n>
inline __m256 leftShift(__m256 a) {
    // Generic case; only the specializations below (n = 4, 8, ..., 28 bytes)
    // are provided.  (Body not shown in this extract.)
}

template <>
inline __m256 leftShift<4>(__m256 x) {
    // ( x7, ..., x0 ) -> ( x6, x5, x4, x3, x2, x1, x0, 0 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(2, 1, 0, 3));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x08);
    __m256 y = _mm256_blend_ps(t0, t1, 0x11);
    return y;
}

template <>
inline __m256 leftShift<8>(__m256 x) {
    // ( x7, ..., x0 ) -> ( x5, x4, x3, x2, x1, x0, 0, 0 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x08);
    __m256 y = _mm256_blend_ps(t0, t1, 0x33);
    return y;
}

template <>
inline __m256 leftShift<12>(__m256 x) {
    // ( x7, ..., x0 ) -> ( x4, x3, x2, x1, x0, 0, 0, 0 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(0, 3, 2, 1));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x08);
    __m256 y = _mm256_blend_ps(t0, t1, 0x77);
    return y;
}

template <>
inline __m256 leftShift<16>(__m256 x) {
    __m256 y = _mm256_permute2f128_ps(x, x, 0x08);  // ( x3, x2, x1, x0, 0, 0, 0, 0 )
    return y;
}

template <>
inline __m256 leftShift<20>(__m256 x) {
    // ( x7, ..., x0 ) -> ( x2, x1, x0, 0, 0, 0, 0, 0 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(2, 1, 0, 3));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x08);
    __m256 y = _mm256_blend_ps(t1, _mm256_setzero_ps(), 0x10);
    return y;
}

template <>
inline __m256 leftShift<24>(__m256 x) {
    // ( x7, ..., x0 ) -> ( x1, x0, 0, 0, 0, 0, 0, 0 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x08);
    __m256 y = _mm256_blend_ps(_mm256_setzero_ps(), t1, 0xC0);
    return y;
}

template <>
inline __m256 leftShift<28>(__m256 x) {
    // ( x7, ..., x0 ) -> ( x0, 0, 0, 0, 0, 0, 0, 0 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(0, 3, 2, 1));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x08);
    __m256 y = _mm256_blend_ps(_mm256_setzero_ps(), t1, 0x80);
    return y;
}
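
// rightShift<N>: shift the whole 256-bit register right by N bytes (N a
// multiple of 4, 0 < N < 32), filling with zeros, i.e. element i of the
// result is element i + N/4 of the input.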
template <int n>
inline __m256 rightShift(__m256 a) {
    // Generic case; only the specializations below (n = 4, 8, ..., 28 bytes)
    // are provided.  (Body not shown in this extract.)
}

template <>
inline __m256 rightShift<4>(__m256 x) {
    // ( x7, ..., x0 ) -> ( 0, x7, x6, x5, x4, x3, x2, x1 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(0, 3, 2, 1));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x81);
    __m256 y = _mm256_blend_ps(t0, t1, 0x88);
    return y;
}

template <>
inline __m256 rightShift<8>(__m256 x) {
    // ( x7, ..., x0 ) -> ( 0, 0, x7, x6, x5, x4, x3, x2 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x81);
    __m256 y = _mm256_blend_ps(t0, t1, 0xCC);
    return y;
}

template <>
inline __m256 rightShift<12>(__m256 x) {
    // ( x7, ..., x0 ) -> ( 0, 0, 0, x7, x6, x5, x4, x3 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(2, 1, 0, 3));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x81);
    __m256 y = _mm256_blend_ps(t0, t1, 0xEE);
    return y;
}

template <>
inline __m256 rightShift<16>(__m256 x) {
    __m256 y = _mm256_permute2f128_ps(x, x, 0x81);  // ( 0, 0, 0, 0, x7, x6, x5, x4 )
    return y;
}

template <>
inline __m256 rightShift<20>(__m256 x) {
    // ( x7, ..., x0 ) -> ( 0, 0, 0, 0, 0, x7, x6, x5 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(0, 3, 2, 1));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x81);
    __m256 y = _mm256_blend_ps(t1, _mm256_setzero_ps(), 0xF8);
    return y;
}

template <>
inline __m256 rightShift<24>(__m256 x) {
    // ( x7, ..., x0 ) -> ( 0, 0, 0, 0, 0, 0, x7, x6 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(1, 0, 3, 2));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x81);
    __m256 y = _mm256_blend_ps(t1, _mm256_setzero_ps(), 0xFC);
    return y;
}

template <>
inline __m256 rightShift<28>(__m256 x) {
    // ( x7, ..., x0 ) -> ( 0, 0, 0, 0, 0, 0, 0, x7 )
    __m256 t0 = _mm256_permute_ps(x, _MM_SHUFFLE(2, 1, 0, 3));
    __m256 t1 = _mm256_permute2f128_ps(t0, t0, 0x81);
    __m256 y = _mm256_blend_ps(t1, _mm256_setzero_ps(), 0xFE);
    return y;
}
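
// ---------------------------------------------------------------------------
// Usage sketch: a plain-AVX dot product built from the helpers above.  This
// example is illustrative only; the name dot_ps_avx and the assumption that
// n is a multiple of 8 are not part of the original header.
inline float dot_ps_avx(const float* a, const float* b, int n) {
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < n; i += 8) {
        // acc += a[i..i+7] * b[i..i+7] (mul + add, rounded twice: no FMA)
        acc = madd256_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i), acc);
    }
    // Collapse the eight partial sums; the total sits in the lowest lane.
    return _mm_cvtss_f32(hsum256_ps(acc));
}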