28 #if defined(CNN_USE_SSE) || defined(CNN_USE_AVX)
29 #include <immintrin.h>
36 #define VECTORIZE_ALIGN(x) __declspec(align(x))
37 #elif defined(__GNUC__)
38 #define VECTORIZE_ALIGN(x) __attribute__((aligned(x)))
40 #define VECTORIZE_ALIGN(x) __attribute__((aligned(x)))
// Returns true when p satisfies the alignment requirement of vec-trait T.
// The generic (scalar) fallback has no alignment requirement, so every
// pointer qualifies; the SSE/AVX traits provide stricter overloads below.
template<typename T>
inline bool is_aligned(T, const typename T::value_type*) {
    return true;
}

// Two-pointer convenience overload: both pointers must be aligned.
template<typename T>
inline bool is_aligned(T, const typename T::value_type* p1,
                          const typename T::value_type* p2) {
    return is_aligned(T(), p1) && is_aligned(T(), p2);
}
// Scalar fallback "vector" trait: each register holds exactly one element,
// so every SIMD-style operation degenerates to plain scalar arithmetic.
// Used when neither CNN_USE_SSE nor CNN_USE_AVX is defined.
template<typename T>
struct generic_vec_type {
    typedef T register_type;
    typedef T value_type;
    enum {
        unroll_size = 1  // elements processed per "register"
    };
    // broadcast x into a register (identity for a single scalar lane)
    static register_type set1(const value_type& x) { return x; }
    static register_type zero() { return register_type(0); }
    static register_type mul(const register_type& v1, const register_type& v2) { return v1 * v2; }
    static register_type add(const register_type& v1, const register_type& v2) { return v1 + v2; }
    // aligned and unaligned access are identical for scalars
    static register_type load(const value_type* px) { return *px; }
    static register_type loadu(const value_type* px) { return *px; }
    static void store(value_type* px, const register_type& v) { *px = v; }
    static void storeu(value_type* px, const register_type& v) { *px = v; }
    // horizontal sum of all lanes (identity for one lane)
    static value_type resemble(const register_type& x) { return x; }
};
80 typedef __m128 register_type;
81 typedef float value_type;
85 static register_type set1(
const value_type& x) {
return _mm_set1_ps(x); }
86 static register_type zero() { register_type v = {};
return v; }
87 static register_type mul(
const register_type& v1,
const register_type& v2) {
return _mm_mul_ps(v1, v2); }
88 static register_type add(
const register_type& v1,
const register_type& v2) {
return _mm_add_ps(v1, v2); }
89 static register_type load(
const value_type* px) {
return _mm_load_ps(px); }
90 static register_type loadu(
const value_type* px) {
return _mm_loadu_ps(px); }
91 static void store(value_type* px,
const register_type& v) { _mm_store_ps(px, v); }
92 static void storeu(value_type* px,
const register_type& v) { _mm_storeu_ps(px, v); }
93 static value_type resemble(
const register_type& x) {
94 VECTORIZE_ALIGN(16) float tmp[4];
96 return tmp[0] + tmp[1] + tmp[2] + tmp[3];
101 typedef __m128d register_type;
102 typedef double value_type;
106 static register_type set1(
const value_type& x) {
return _mm_set1_pd(x); }
107 static register_type zero() { register_type v = {};
return v; }
108 static register_type mul(
const register_type& v1,
const register_type& v2) {
return _mm_mul_pd(v1, v2); }
109 static register_type add(
const register_type& v1,
const register_type& v2) {
return _mm_add_pd(v1, v2); }
110 static register_type load(
const value_type* px) {
return _mm_load_pd(px); }
111 static register_type loadu(
const value_type* px) {
return _mm_loadu_pd(px); }
112 static void store(value_type* px,
const register_type& v) { _mm_store_pd(px, v); }
113 static void storeu(value_type* px,
const register_type& v) { _mm_storeu_pd(px, v); }
114 static value_type resemble(
const register_type& x) {
115 VECTORIZE_ALIGN(16) double tmp[2];
116 _mm_store_pd(tmp, x);
117 return tmp[0] + tmp[1];
124 struct sse<float> :
public float_sse {};
126 struct sse<double> :
public double_sse {};
129 inline bool is_aligned(sse<T>,
const typename sse<T>::value_type* p) {
130 return reinterpret_cast<std::size_t
>(p) % 16 == 0;
138 typedef __m256 register_type;
139 typedef float value_type;
143 static register_type set1(
const value_type& x) {
return _mm256_set1_ps(x); }
144 static register_type zero() { register_type v = {};
return v; }
145 static register_type mul(
const register_type& v1,
const register_type& v2) {
return _mm256_mul_ps(v1, v2); }
146 static register_type add(
const register_type& v1,
const register_type& v2) {
return _mm256_add_ps(v1, v2); }
147 static register_type load(
const value_type* px) {
return _mm256_load_ps(px); }
148 static register_type loadu(
const value_type* px) {
return _mm256_loadu_ps(px); }
149 static void store(value_type* px,
const register_type& v) { _mm256_store_ps(px, v); }
150 static void storeu(value_type* px,
const register_type& v) { _mm256_storeu_ps(px, v); }
151 static value_type resemble(
const register_type& x) {
152 VECTORIZE_ALIGN(32) float tmp[8];
153 _mm256_store_ps(tmp, x);
154 return std::accumulate(tmp, tmp + 8, 0.0f);
159 typedef __m256d register_type;
160 typedef double value_type;
164 static register_type set1(
const value_type& x) {
return _mm256_set1_pd(x); }
165 static register_type zero() { register_type v = {};
return v; }
166 static register_type mul(
const register_type& v1,
const register_type& v2) {
return _mm256_mul_pd(v1, v2); }
167 static register_type add(
const register_type& v1,
const register_type& v2) {
return _mm256_add_pd(v1, v2); }
168 static register_type load(
const value_type* px) {
return _mm256_load_pd(px); }
169 static register_type loadu(
const value_type* px) {
return _mm256_loadu_pd(px); }
170 static void store(value_type* px,
const register_type& v) { _mm256_store_pd(px, v); }
171 static void storeu(value_type* px,
const register_type& v) { _mm256_storeu_pd(px, v); }
172 static value_type resemble(
const register_type& x) {
173 VECTORIZE_ALIGN(32) double tmp[4];
174 _mm256_store_pd(tmp, x);
175 return std::accumulate(tmp, tmp + 4, 0.0);
182 struct avx<float> :
public float_avx {};
184 struct avx<double> :
public double_avx {};
187 inline bool is_aligned(avx<T>,
const typename avx<T>::value_type* p) {
188 return reinterpret_cast<std::size_t
>(p) % 32 == 0;
// Dot product of f1 and f2 (length `size`) using unaligned SIMD loads.
// T is a vec-trait (generic_vec_type / sse / avx). The main loop consumes
// T::unroll_size elements per iteration; a scalar tail handles the rest.
// Returns sum(f1[i] * f2[i]) for i in [0, size).
template<typename T>
inline typename T::value_type dot_product_nonaligned(const typename T::value_type* f1,
                                                     const typename T::value_type* f2,
                                                     std::size_t size) {
    typename T::register_type result = T::zero();

    for (std::size_t i = 0; i < size / T::unroll_size; i++)
        result = T::add(result, T::mul(T::loadu(&f1[i * T::unroll_size]),
                                       T::loadu(&f2[i * T::unroll_size])));

    typename T::value_type sum = T::resemble(result);

    // scalar tail: elements that don't fill a whole register
    for (std::size_t i = (size / T::unroll_size) * T::unroll_size; i < size; i++)
        sum += f1[i] * f2[i];

    return sum;
}
// Dot product of f1 and f2 (length `size`) using aligned SIMD loads.
// Precondition (checked in debug builds): both pointers satisfy T's
// alignment requirement — callers dispatch here via is_aligned().
// Returns sum(f1[i] * f2[i]) for i in [0, size).
template<typename T>
inline typename T::value_type dot_product_aligned(const typename T::value_type* f1,
                                                  const typename T::value_type* f2,
                                                  std::size_t size) {
    typename T::register_type result = T::zero();

    assert(is_aligned(T(), f1));
    assert(is_aligned(T(), f2));

    for (std::size_t i = 0; i < size / T::unroll_size; i++)
        result = T::add(result, T::mul(T::load(&f1[i * T::unroll_size]),
                                       T::load(&f2[i * T::unroll_size])));

    typename T::value_type sum = T::resemble(result);

    // scalar tail: elements that don't fill a whole register
    for (std::size_t i = (size / T::unroll_size) * T::unroll_size; i < size; i++)
        sum += f1[i] * f2[i];

    return sum;
}
// dst[i] += src[i] * c for i in [0, size), using aligned loads/stores.
// T is a vec-trait; callers must ensure both pointers satisfy T's
// alignment requirement.
template<typename T>
inline void muladd_aligned(const typename T::value_type* src,
                           typename T::value_type c,
                           std::size_t size,
                           typename T::value_type* dst) {
    typename T::register_type factor = T::set1(c);  // broadcast the scalar once

    for (std::size_t i = 0; i < size / T::unroll_size; i++) {
        typename T::register_type d = T::load(&dst[i * T::unroll_size]);
        typename T::register_type s = T::load(&src[i * T::unroll_size]);
        T::store(&dst[i * T::unroll_size], T::add(d, T::mul(s, factor)));
    }

    // scalar tail: elements that don't fill a whole register
    for (std::size_t i = (size / T::unroll_size) * T::unroll_size; i < size; i++)
        dst[i] += src[i] * c;
}
// dst[i] += src[i] * c for i in [0, size), using unaligned loads/stores.
// Fallback path used when either pointer fails T's alignment check.
template<typename T>
inline void muladd_nonaligned(const typename T::value_type* src,
                              typename T::value_type c,
                              std::size_t size,
                              typename T::value_type* dst) {
    typename T::register_type factor = T::set1(c);  // broadcast the scalar once

    for (std::size_t i = 0; i < size / T::unroll_size; i++) {
        typename T::register_type d = T::loadu(&dst[i * T::unroll_size]);
        typename T::register_type s = T::loadu(&src[i * T::unroll_size]);
        T::storeu(&dst[i * T::unroll_size], T::add(d, T::mul(s, factor)));
    }

    // scalar tail: elements that don't fill a whole register
    for (std::size_t i = (size / T::unroll_size) * T::unroll_size; i < size; i++)
        dst[i] += src[i] * c;
}
// dst[i] += src[i] for i in [0, size), using unaligned loads/stores.
// Fallback path used when either pointer fails T's alignment check.
template<typename T>
inline void reduce_nonaligned(const typename T::value_type* src,
                              std::size_t size,
                              typename T::value_type* dst) {
    for (std::size_t i = 0; i < size / T::unroll_size; i++) {
        typename T::register_type d = T::loadu(&dst[i * T::unroll_size]);
        typename T::register_type s = T::loadu(&src[i * T::unroll_size]);
        T::storeu(&dst[i * T::unroll_size], T::add(d, s));
    }

    // scalar tail: elements that don't fill a whole register
    for (std::size_t i = (size / T::unroll_size) * T::unroll_size; i < size; i++)
        dst[i] += src[i];
}
// dst[i] += src[i] for i in [0, size), using aligned loads/stores.
// Callers dispatch here only after is_aligned() passes for both pointers
// (see reduce() below), so the aligned intrinsics are safe. The visible
// original used loadu/storeu here — a copy-paste from the nonaligned
// variant that defeated the purpose of the aligned fast path.
template<typename T>
inline void reduce_aligned(const typename T::value_type* src,
                           std::size_t size,
                           typename T::value_type* dst) {
    for (std::size_t i = 0; i < size / T::unroll_size; i++) {
        typename T::register_type d = T::load(&dst[i * T::unroll_size]);
        typename T::register_type s = T::load(&src[i * T::unroll_size]);
        T::store(&dst[i * T::unroll_size], T::add(d, s));
    }

    // scalar tail: elements that don't fill a whole register
    for (std::size_t i = (size / T::unroll_size) * T::unroll_size; i < size; i++)
        dst[i] += src[i];
}
283 #if defined(CNN_USE_AVX)
284 #define VECTORIZE_TYPE(T) detail::avx<T>
285 #elif defined(CNN_USE_SSE)
286 #define VECTORIZE_TYPE(T) detail::sse<T>
288 #define VECTORIZE_TYPE(T) detail::generic_vec_type<T>
293 void muladd(
const T* src, T c, std::size_t size, T* dst) {
294 if (detail::is_aligned(VECTORIZE_TYPE(T)(), src, dst))
295 detail::muladd_aligned<VECTORIZE_TYPE(T)>(src, c, size, dst);
297 detail::muladd_nonaligned<VECTORIZE_TYPE(T)>(src, c, size, dst);
302 T dot(
const T* s1,
const T* s2, std::size_t size) {
303 if (detail::is_aligned(VECTORIZE_TYPE(T)(), s1, s2))
304 return detail::dot_product_aligned<VECTORIZE_TYPE(T)>(s1, s2, size);
306 return detail::dot_product_nonaligned<VECTORIZE_TYPE(T)>(s1, s2, size);
311 void reduce(
const T* src, std::size_t size, T* dst) {
312 if (detail::is_aligned(VECTORIZE_TYPE(T)(), src, dst))
313 return detail::reduce_aligned<VECTORIZE_TYPE(T)>(src, size, dst);
315 return detail::reduce_nonaligned<VECTORIZE_TYPE(T)>(src, size, dst);