tiny_dnn 1.0.0
A header-only, dependency-free deep learning framework in C++11
product.h
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if defined(CNN_USE_SSE) || defined(CNN_USE_AVX)
#include <immintrin.h>
#endif
#include <cstddef>   // std::size_t, used throughout this header
#include <cstdint>
#include <cassert>
#include <numeric>

#if defined(_MSC_VER)
#define VECTORIZE_ALIGN(x) __declspec(align(x))
#elif defined(__GNUC__)
#define VECTORIZE_ALIGN(x) __attribute__((aligned(x)))
#else
#define VECTORIZE_ALIGN(x) __attribute__((aligned(x)))
#endif

namespace vectorize {
namespace detail {

template<typename T>
inline bool is_aligned(T, const typename T::value_type* /*p*/) {
    return true;
}

template<typename T>
inline bool is_aligned(T, const typename T::value_type* p1, const typename T::value_type* p2) {
    return is_aligned(T(), p1) && is_aligned(T(), p2);
}

// traits

template <typename T>
struct generic_vec_type {
    typedef T register_type;
    typedef T value_type;
    enum {
        unroll_size = 1
    };
    static register_type set1(const value_type& x) { return x; }
    static register_type zero() { return register_type(0); }
    static register_type mul(const register_type& v1, const register_type& v2) { return v1 * v2; }
    static register_type add(const register_type& v1, const register_type& v2) { return v1 + v2; }
    static register_type load(const value_type* px) { return *px; }
    static register_type loadu(const value_type* px) { return *px; }
    static void store(value_type* px, const register_type& v) { *px = v; }
    static void storeu(value_type* px, const register_type& v) { *px = v; }
    static value_type resemble(const register_type& x) { return x; }
};
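
// The SIMD traits below (sse<T>, avx<T>, compiled in when CNN_USE_SSE /
// CNN_USE_AVX are defined) expose the same interface as the scalar fallback
// above, but operate on 128-bit / 256-bit registers, so the generic kernels
// further down work unchanged with any of them.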

#ifdef CNN_USE_SSE

struct float_sse {
    typedef __m128 register_type;
    typedef float value_type;
    enum {
        unroll_size = 4
    };
    static register_type set1(const value_type& x) { return _mm_set1_ps(x); }
    static register_type zero() { register_type v = {}; return v; }
    static register_type mul(const register_type& v1, const register_type& v2) { return _mm_mul_ps(v1, v2); }
    static register_type add(const register_type& v1, const register_type& v2) { return _mm_add_ps(v1, v2); }
    static register_type load(const value_type* px) { return _mm_load_ps(px); }
    static register_type loadu(const value_type* px) { return _mm_loadu_ps(px); }
    static void store(value_type* px, const register_type& v) { _mm_store_ps(px, v); }
    static void storeu(value_type* px, const register_type& v) { _mm_storeu_ps(px, v); }
    static value_type resemble(const register_type& x) {
        VECTORIZE_ALIGN(16) float tmp[4];
        _mm_store_ps(tmp, x);
        return tmp[0] + tmp[1] + tmp[2] + tmp[3];
    }
};

struct double_sse {
    typedef __m128d register_type;
    typedef double value_type;
    enum {
        unroll_size = 2
    };
    static register_type set1(const value_type& x) { return _mm_set1_pd(x); }
    static register_type zero() { register_type v = {}; return v; }
    static register_type mul(const register_type& v1, const register_type& v2) { return _mm_mul_pd(v1, v2); }
    static register_type add(const register_type& v1, const register_type& v2) { return _mm_add_pd(v1, v2); }
    static register_type load(const value_type* px) { return _mm_load_pd(px); }
    static register_type loadu(const value_type* px) { return _mm_loadu_pd(px); }
    static void store(value_type* px, const register_type& v) { _mm_store_pd(px, v); }
    static void storeu(value_type* px, const register_type& v) { _mm_storeu_pd(px, v); }
    static value_type resemble(const register_type& x) {
        VECTORIZE_ALIGN(16) double tmp[2];
        _mm_store_pd(tmp, x);
        return tmp[0] + tmp[1];
    }
};

template<typename T>
struct sse {};
template<>
struct sse<float> : public float_sse {};
template<>
struct sse<double> : public double_sse {};

template<typename T>
inline bool is_aligned(sse<T>, const typename sse<T>::value_type* p) {
    return reinterpret_cast<std::size_t>(p) % 16 == 0;
}

#endif // CNN_USE_SSE

#ifdef CNN_USE_AVX

struct float_avx {
    typedef __m256 register_type;
    typedef float value_type;
    enum {
        unroll_size = 8
    };
    static register_type set1(const value_type& x) { return _mm256_set1_ps(x); }
    static register_type zero() { register_type v = {}; return v; }
    static register_type mul(const register_type& v1, const register_type& v2) { return _mm256_mul_ps(v1, v2); }
    static register_type add(const register_type& v1, const register_type& v2) { return _mm256_add_ps(v1, v2); }
    static register_type load(const value_type* px) { return _mm256_load_ps(px); }
    static register_type loadu(const value_type* px) { return _mm256_loadu_ps(px); }
    static void store(value_type* px, const register_type& v) { _mm256_store_ps(px, v); }
    static void storeu(value_type* px, const register_type& v) { _mm256_storeu_ps(px, v); }
    static value_type resemble(const register_type& x) {
        VECTORIZE_ALIGN(32) float tmp[8];
        _mm256_store_ps(tmp, x);
        return std::accumulate(tmp, tmp + 8, 0.0f);
    }
};

struct double_avx {
    typedef __m256d register_type;
    typedef double value_type;
    enum {
        unroll_size = 4
    };
    static register_type set1(const value_type& x) { return _mm256_set1_pd(x); }
    static register_type zero() { register_type v = {}; return v; }
    static register_type mul(const register_type& v1, const register_type& v2) { return _mm256_mul_pd(v1, v2); }
    static register_type add(const register_type& v1, const register_type& v2) { return _mm256_add_pd(v1, v2); }
    static register_type load(const value_type* px) { return _mm256_load_pd(px); }
    static register_type loadu(const value_type* px) { return _mm256_loadu_pd(px); }
    static void store(value_type* px, const register_type& v) { _mm256_store_pd(px, v); }
    static void storeu(value_type* px, const register_type& v) { _mm256_storeu_pd(px, v); }
    static value_type resemble(const register_type& x) {
        VECTORIZE_ALIGN(32) double tmp[4];
        _mm256_store_pd(tmp, x);
        return std::accumulate(tmp, tmp + 4, 0.0);
    }
};

template<typename T>
struct avx {};
template<>
struct avx<float> : public float_avx {};
template<>
struct avx<double> : public double_avx {};

template<typename T>
inline bool is_aligned(avx<T>, const typename avx<T>::value_type* p) {
    return reinterpret_cast<std::size_t>(p) % 32 == 0;
}

#endif // CNN_USE_AVX

// generic dot-product
template<typename T>
inline typename T::value_type dot_product_nonaligned(const typename T::value_type* f1, const typename T::value_type* f2, std::size_t size) {
    typename T::register_type result = T::zero();

    for (std::size_t i = 0; i < size/T::unroll_size; i++)
        result = T::add(result, T::mul(T::loadu(&f1[i*T::unroll_size]), T::loadu(&f2[i*T::unroll_size])));

    typename T::value_type sum = T::resemble(result);

    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        sum += f1[i] * f2[i];

    return sum;
}

// generic dot-product(aligned)
template<typename T>
inline typename T::value_type dot_product_aligned(const typename T::value_type* f1, const typename T::value_type* f2, std::size_t size) {
    typename T::register_type result = T::zero();

    assert(is_aligned(T(), f1));
    assert(is_aligned(T(), f2));

    for (std::size_t i = 0; i < size/T::unroll_size; i++)
        result = T::add(result, T::mul(T::load(&f1[i*T::unroll_size]), T::load(&f2[i*T::unroll_size])));

    typename T::value_type sum = T::resemble(result);

    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        sum += f1[i] * f2[i];

    return sum;
}
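
// Both dot-product variants above split the work into a vectorized main loop
// and a scalar tail. For example, with float_avx (unroll_size = 8) and
// size = 21, the main loop runs 21 / 8 = 2 iterations covering elements
// 0..15, and the tail loop accumulates the remaining elements 16..20.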

template<typename T>
inline void muladd_aligned(const typename T::value_type* src, typename T::value_type c, std::size_t size, typename T::value_type* dst) {
    typename T::register_type factor = T::set1(c);

    for (std::size_t i = 0; i < size/T::unroll_size; i++) {
        typename T::register_type d = T::load(&dst[i*T::unroll_size]);
        typename T::register_type s = T::load(&src[i*T::unroll_size]);
        T::store(&dst[i*T::unroll_size], T::add(d, T::mul(s, factor)));
    }

    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        dst[i] += src[i] * c;
}


template<typename T>
inline void muladd_nonaligned(const typename T::value_type* src, typename T::value_type c, std::size_t size, typename T::value_type* dst) {
    typename T::register_type factor = T::set1(c);

    for (std::size_t i = 0; i < size/T::unroll_size; i++) {
        typename T::register_type d = T::loadu(&dst[i*T::unroll_size]);
        typename T::register_type s = T::loadu(&src[i*T::unroll_size]);
        T::storeu(&dst[i*T::unroll_size], T::add(d, T::mul(s, factor)));
    }

    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        dst[i] += src[i] * c;
}

template<typename T>
inline void reduce_nonaligned(const typename T::value_type* src, std::size_t size, typename T::value_type* dst) {
    for (std::size_t i = 0; i < size/T::unroll_size; i++) {
        typename T::register_type d = T::loadu(&dst[i*T::unroll_size]);
        typename T::register_type s = T::loadu(&src[i*T::unroll_size]);
        T::storeu(&dst[i*T::unroll_size], T::add(d, s));
    }

    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        dst[i] += src[i];
}

template<typename T>
inline void reduce_aligned(const typename T::value_type* src, std::size_t size, typename T::value_type* dst) {
    // use the aligned load/store kernels here; callers reach this path only
    // after is_aligned() has verified both pointers
    for (std::size_t i = 0; i < size/T::unroll_size; i++) {
        typename T::register_type d = T::load(&dst[i*T::unroll_size]);
        typename T::register_type s = T::load(&src[i*T::unroll_size]);
        T::store(&dst[i*T::unroll_size], T::add(d, s));
    }

    for (std::size_t i = (size/T::unroll_size)*T::unroll_size; i < size; i++)
        dst[i] += src[i];
}

} // namespace detail

#if defined(CNN_USE_AVX)
#define VECTORIZE_TYPE(T) detail::avx<T>
#elif defined(CNN_USE_SSE)
#define VECTORIZE_TYPE(T) detail::sse<T>
#else
#define VECTORIZE_TYPE(T) detail::generic_vec_type<T>
#endif

// dst[i] += c * src[i]
template<typename T>
void muladd(const T* src, T c, std::size_t size, T* dst) {
    if (detail::is_aligned(VECTORIZE_TYPE(T)(), src, dst))
        detail::muladd_aligned<VECTORIZE_TYPE(T)>(src, c, size, dst);
    else
        detail::muladd_nonaligned<VECTORIZE_TYPE(T)>(src, c, size, dst);
}

// sum(s1[i] * s2[i])
template<typename T>
T dot(const T* s1, const T* s2, std::size_t size) {
    if (detail::is_aligned(VECTORIZE_TYPE(T)(), s1, s2))
        return detail::dot_product_aligned<VECTORIZE_TYPE(T)>(s1, s2, size);
    else
        return detail::dot_product_nonaligned<VECTORIZE_TYPE(T)>(s1, s2, size);
}

// dst[i] += src[i]
template<typename T>
void reduce(const T* src, std::size_t size, T* dst) {
    if (detail::is_aligned(VECTORIZE_TYPE(T)(), src, dst))
        return detail::reduce_aligned<VECTORIZE_TYPE(T)>(src, size, dst);
    else
        return detail::reduce_nonaligned<VECTORIZE_TYPE(T)>(src, size, dst);
}

} // namespace vectorize
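
A minimal usage sketch, not part of product.h: it assumes the header is reachable on the include path and that any CNN_USE_SSE / CNN_USE_AVX flags come from the build. The muladd / dot / reduce wrappers dispatch to the aligned kernels only when is_aligned() reports suitable 16- or 32-byte alignment, so ordinary std::vector storage also works via the unaligned fallbacks.

#include <cstddef>
#include <vector>
#include "product.h" // include path depends on the project layout

int main() {
    std::vector<float> a(100, 1.0f), b(100, 2.0f), acc(100, 0.0f);

    // sum of element-wise products: 100 * (1.0f * 2.0f) = 200.0f
    float d = vectorize::dot(a.data(), b.data(), a.size());

    // acc[i] += 0.5f * a[i]
    vectorize::muladd(a.data(), 0.5f, a.size(), acc.data());

    // acc[i] += b[i]
    vectorize::reduce(b.data(), b.size(), acc.data());

    return d == 200.0f ? 0 : 1;
}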