tiny_dnn  1.0.0
A header only, dependency-free deep learning framework in C++11
tiny_quantized_fully_connected_kernel.h
/*
    Copyright (c) 2016, Taiga Nomi, Edgar Riba
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
    names of its contributors may be used to endorse or promote products
    derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once

#include "tiny_dnn/core/params/fully_params.h"
#include "tiny_dnn/core/kernels/tiny_quantization_kernel.h"
#include "tiny_dnn/core/kernels/tiny_quantized_matmul_kernel.h"

namespace tiny_dnn {
namespace core {
namespace kernels {

inline void tiny_quantized_fully_connected_kernel(const fully_params& params,
                                                   const vec_t& in,
                                                   const vec_t& W,
                                                   const vec_t& b,
                                                   vec_t&       a,
                                                   const bool   layer_parallelize) {
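    // Quantized forward pass (gemmlowp/TensorFlow-style affine quantization):
    // 1. find the float min/max range of the input, weights and bias,
    // 2. quantize each tensor to uint8 over its own [min, max] range,
    // 3. accumulate W * in in int32, subtracting the offsets that represent
    //    real 0.0 in each quantized space,
    // 4. requantize the int32 accumulator down to uint8 with a tightened range,
    // 5. dequantize back to float for the next layer.
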
    // input quantization
    float_t min_input(in[0]);
    float_t max_input(in[0]);
    for (serial_size_t c = 0; c < params.in_size_; c++) {
        min_input = std::min(min_input, in[c]);
        max_input = std::max(max_input, in[c]);
    }
    std::vector<uint8_t> in_quantized =
        float_tensor_to_quantized<uint8_t>(in, min_input, max_input);
    // filter quantization
    float_t min_filter(W[0]);
    float_t max_filter(W[0]);
    for (serial_size_t c = 0; c < W.size(); c++) {
        min_filter = std::min(min_filter, W[c]);
        max_filter = std::max(max_filter, W[c]);
    }
    if (min_filter == max_filter) {
        max_filter = W[0] + 1e-3f;
        min_filter = W[0] - 1e-3f;
    }
    std::vector<uint8_t> W_quantized =
        float_tensor_to_quantized<uint8_t>(W, min_filter, max_filter);
    // output range
    float_t min_output_value;
    float_t max_output_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_input, max_input, min_filter, max_filter, &min_output_value,
        &max_output_value);
    // bias quantization
    float_t min_bias(0);
    float_t max_bias(0);
    std::vector<uint8_t> bias_quantized;
    if (params.has_bias_) {
        for (serial_size_t inc = 0; inc < b.size(); inc++) {
            min_bias = std::min(min_bias, b[inc]);
            max_bias = std::max(max_bias, b[inc]);
        }
        if (min_bias == max_bias) {
            max_bias = b[0] + 1e-3f;
            min_bias = b[0] - 1e-3f;
        }
        bias_quantized =
            float_tensor_to_quantized<uint8_t>(b, min_bias, max_bias);
    }
    min_output_value += min_bias;
    max_output_value += max_bias;

    std::vector<int32_t> a_quantized(a.size(), static_cast<int32_t>(0));

    // calculating offset
    const int32_t offset_input =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_input, max_input);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t zero_in_total_space =
        float_to_quantized<int32_t>(0.0f, min_output_value, max_output_value);

    const int32_t offset_output = 0;
    const int32_t mult_output = 1;
    const int32_t shift_output = 0;
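
    // Note: the GEMM path below is disabled (use_gemm = false). With an identity
    // output offset/multiplier/shift, tiny_quantized_matmul presumably leaves the
    // int32 accumulators unscaled.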
    bool use_gemm = false;
    if (use_gemm) {
        std::vector<size_t> shape{params.in_size_, 1, params.out_size_, params.in_size_};
        tiny_quantized_matmul(in_quantized,
                              W_quantized,
                              a_quantized,
                              shape,
                              offset_input,
                              offset_filter,
                              offset_output,
                              mult_output,
                              shift_output);
        if (params.has_bias_) {
            for_i(layer_parallelize, params.out_size_, [&](int i) {
                a[i] += b[i];
            });
        }
    } else {
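        // reference path: a[i] = sum_c W[c * out_size_ + i] * in[c], computed in
        // the quantized domain; subtracting the zero-point offsets keeps each
        // quantized factor proportional to its real value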
        for_i(layer_parallelize, params.out_size_, [&](int i) {
            for (serial_size_t c = 0; c < params.in_size_; c++) {
                a_quantized[i] += static_cast<int32_t>(W_quantized[c * params.out_size_ + i] - offset_filter) *
                                  static_cast<int32_t>(in_quantized[c] - offset_input);
            }
            if (params.has_bias_) {
                a_quantized[i] += (bias_quantized[i] - zero_in_total_space);
            }
        });
    }

    float_t min_output_requantized;
    float_t max_output_requantized;
    std::vector<uint8_t> a_requantized(a_quantized.size(), static_cast<uint8_t>(0));

    // Requantize from 32bits to 8 bits for next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(a_quantized, min_output_value, max_output_value,
        &min_output_requantized, &max_output_requantized, &a_requantized);

    // dequantize to float; this could be removed within a concatenated quantized network
    a = quantized_tensor_to_float<uint8_t>(a_requantized, min_output_requantized, max_output_requantized);
}

inline void tiny_quantized_fully_connected_back_kernel(const fully_params& params,
                                                        const vec_t& prev_out,
                                                        const vec_t& W,
                                                        vec_t&       dW,
                                                        vec_t&       prev_delta,
                                                        vec_t&       curr_delta,
                                                        vec_t&       db,
                                                        const bool   layer_parallelize) {
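    // Quantized backward pass: prev_out, W and curr_delta are quantized to uint8,
    // prev_delta and dW are accumulated in int32 with the zero-point offsets
    // subtracted, then requantized to uint8 and dequantized back to float.
    // The bias gradient db is accumulated directly in float.
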
    // previous output quantization
    float_t min_prev_out(prev_out[0]);
    float_t max_prev_out(prev_out[0]);
    for (serial_size_t inc = 0; inc < prev_out.size(); inc++) {
        min_prev_out = std::min(min_prev_out, prev_out[inc]);
        max_prev_out = std::max(max_prev_out, prev_out[inc]);
    }
    std::vector<uint8_t> prev_out_quantized =
        float_tensor_to_quantized<uint8_t>(prev_out, min_prev_out, max_prev_out);

    // filter quantization
    float_t min_filter(W[0]);
    float_t max_filter(W[0]);
    for (serial_size_t c = 0; c < W.size(); c++) {
        min_filter = std::min(min_filter, W[c]);
        max_filter = std::max(max_filter, W[c]);
    }
    if (min_filter == max_filter) {
        max_filter = W[0] + 1e-3f;
        min_filter = W[0] - 1e-3f;
    }
    std::vector<uint8_t> W_quantized =
        float_tensor_to_quantized<uint8_t>(W, min_filter, max_filter);

    // current delta quantization
    float_t min_curr_delta(curr_delta[0]);
    float_t max_curr_delta(curr_delta[0]);
    for (serial_size_t inc = 0; inc < curr_delta.size(); inc++) {
        min_curr_delta = std::min(min_curr_delta, curr_delta[inc]);
        max_curr_delta = std::max(max_curr_delta, curr_delta[inc]);
    }
    std::vector<uint8_t> curr_delta_quantized =
        float_tensor_to_quantized<uint8_t>(curr_delta, min_curr_delta, max_curr_delta);

    // output range for previous delta
    float_t min_prev_delta_value;
    float_t max_prev_delta_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_curr_delta, max_curr_delta, min_filter, max_filter, &min_prev_delta_value,
        &max_prev_delta_value);

    std::vector<int32_t> prev_delta_quantized(prev_delta.size(), static_cast<int32_t>(0));

    // output range for dW
    float_t min_dW_value;
    float_t max_dW_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_curr_delta, max_curr_delta, min_prev_out, max_prev_out, &min_dW_value,
        &max_dW_value);

    std::vector<int32_t> dW_quantized(dW.size(), static_cast<int32_t>(0));

    // calculating offset
    const int32_t offset_prev_out =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_prev_out, max_prev_out);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t offset_curr_delta =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_curr_delta, max_curr_delta);
    //const int32_t zero_in_prev_delta =
    //    float_to_quantized<int32_t>(0.0f, min_prev_delta_value, max_prev_delta_value);

    for (serial_size_t c = 0; c < params.in_size_; c++) {
        // propagate delta to previous layer
        // prev_delta[c] += curr_delta[io] * W[c * out_size_ + io]
        for (serial_size_t io = 0; io < params.out_size_; io++) {
            prev_delta_quantized[c] += (static_cast<int32_t>(curr_delta_quantized[io]) - offset_curr_delta)
                * (static_cast<int32_t>(W_quantized[c * params.out_size_ + io]) - offset_filter);
        }
    }

    float_t min_prev_delta_requantized;
    float_t max_prev_delta_requantized;
    std::vector<uint8_t> prev_delta_requantized(prev_delta_quantized.size(), static_cast<uint8_t>(0));

    // Requantize from 32bits to 8 bits for next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(prev_delta_quantized, min_prev_delta_value, max_prev_delta_value,
        &min_prev_delta_requantized, &max_prev_delta_requantized, &prev_delta_requantized);

    // dequantize to float; this could be removed within a concatenated quantized network
    prev_delta = quantized_tensor_to_float<uint8_t>(prev_delta_requantized, min_prev_delta_requantized, max_prev_delta_requantized);

    for_(layer_parallelize, 0, size_t(params.out_size_), [&](const blocked_range& r) {
        // accumulate weight-step using delta
        // dW[c * out_size + io] += curr_delta[io] * prev_out[c]
        // restrict io to [r.begin(), r.end()) so that parallel blocks do not
        // accumulate the same dW entries more than once
        for (serial_size_t c = 0; c < params.in_size_; c++) {
            for (int io = r.begin(); io < r.end(); io++) {
                dW_quantized[c * params.out_size_ + io] += (static_cast<int32_t>(curr_delta_quantized[io]) - offset_curr_delta)
                    * (static_cast<int32_t>(prev_out_quantized[c]) - offset_prev_out);
            }
        }

        if (params.has_bias_) {
            // vec_t& db = *in_grad[2];
            for (int i = r.begin(); i < r.end(); i++) {
                db[i] += curr_delta[i];
            }
        }
    });

    float_t min_dW_requantized;
    float_t max_dW_requantized;
    std::vector<uint8_t> dW_requantized(dW_quantized.size(), static_cast<uint8_t>(0));

    // requantize from 32bits to 8 bits for next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(dW_quantized, min_dW_value, max_dW_value,
        &min_dW_requantized, &max_dW_requantized, &dW_requantized);

    // dequantize to float; this could be removed within a concatenated quantized network
    dW = quantized_tensor_to_float<uint8_t>(dW_requantized, min_dW_requantized, max_dW_requantized);
}

inline void tiny_quantized_fully_connected_kernel(const fully_params& params,
                                                   const vec_t& in,
                                                   const vec_t& W,
                                                   const vec_t& b,
                                                   const vec_t& in_r,
                                                   const vec_t& W_r,
                                                   const vec_t& b_r,
                                                   vec_t&       a,
                                                   vec_t&       a_r,
                                                   const bool   layer_parallelize) {
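    // Variant for tensors that are already uint8-quantized but stored in float
    // containers: in_r, W_r and b_r hold the {min, max} float ranges of in, W
    // and b. The quantized result is written to a and its range to a_r.
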
    // filter range
    float_t min_filter(W_r[0]);
    float_t max_filter(W_r[1]);
    if (min_filter == max_filter) {
        max_filter = W_r[1] + 1e-3f;
        min_filter = W_r[0] - 1e-3f;
    }
    // bias range
    float_t min_bias(b_r[0]);
    float_t max_bias(b_r[1]);
    if (params.has_bias_) {
        if (min_bias == max_bias) {
            max_bias = b_r[1] + 1e-3f;
            min_bias = b_r[0] - 1e-3f;
        }
    }
    // output range
    float_t min_output_value;
    float_t max_output_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        in_r[0], in_r[1], min_filter, max_filter, &min_output_value,
        &max_output_value);
    // data type restore
    std::vector<uint8_t> in_quantized, W_quantized, bias_quantized;
    for (size_t i = 0; i < in.size(); i++) {
        in_quantized.push_back(static_cast<uint8_t>(in[i]));
    }
    for (size_t i = 0; i < W.size(); i++) {
        W_quantized.push_back(static_cast<uint8_t>(W[i]));
    }
    for (size_t i = 0; i < b.size(); i++) {
        bias_quantized.push_back(static_cast<uint8_t>(b[i]));
    }
    min_output_value += min_bias;
    max_output_value += max_bias;

    std::vector<int32_t> a_quantized(a.size(), static_cast<int32_t>(0));

    // calculating offset
    const int32_t offset_input =
        float_to_quantized_unclamped<uint8_t>(0.0f, in_r[0], in_r[1]);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t zero_in_total_space =
        float_to_quantized<int32_t>(0.0f, min_output_value, max_output_value);

    const int32_t offset_output = 0;
    const int32_t mult_output = 1;
    const int32_t shift_output = 0;

    bool use_gemm = false;
    if (use_gemm) {
        std::vector<size_t> shape{params.in_size_, 1, params.out_size_, params.in_size_};
        tiny_quantized_matmul(in_quantized,
                              W_quantized,
                              a_quantized,
                              shape,
                              offset_input,
                              offset_filter,
                              offset_output,
                              mult_output,
                              shift_output);
        if (params.has_bias_) {
            for_i(layer_parallelize, params.out_size_, [&](int i) {
                a[i] += b[i];
            });
        }
    } else {
        for_i(layer_parallelize, params.out_size_, [&](int i) {
            for (serial_size_t c = 0; c < params.in_size_; c++) {
                a_quantized[i] += static_cast<int32_t>(W_quantized[c * params.out_size_ + i] - offset_filter) *
                                  static_cast<int32_t>(in_quantized[c] - offset_input);
            }
            if (params.has_bias_) {
                a_quantized[i] += (bias_quantized[i] - zero_in_total_space);
            }
        });
    }

    float_t min_output_requantized;
    float_t max_output_requantized;
    std::vector<uint8_t> a_requantized(a_quantized.size(), static_cast<uint8_t>(0));

    // Requantize from 32bits to 8 bits for next layer
    quantize_down_and_shrink_range<int32_t, uint8_t>(a_quantized, min_output_value, max_output_value,
        &min_output_requantized, &max_output_requantized, &a_requantized);
    // store directly in float datatype
    for (size_t i = 0; i < a_requantized.size(); i++) {
        a[i] = static_cast<float>(a_requantized[i]);
    }
    a_r[0] = min_output_requantized;
    a_r[1] = max_output_requantized;
}

}  // namespace kernels
}  // namespace core
}  // namespace tiny_dnn
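
Usage note (not part of the header above): the kernels rely on a standard affine
min/max quantization scheme. The sketch below is a minimal, self-contained
illustration of that scheme; the helper names quantize_affine and dequantize_affine
are hypothetical and are not tiny_dnn APIs, and the exact rounding may differ from
what tiny_quantization_kernel.h implements.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Map a real value in [min, max] to uint8, assuming q = round((x - min) / scale)
// with scale = (max - min) / 255.
uint8_t quantize_affine(float x, float min, float max) {
    const float scale = (max - min) / 255.0f;
    const float q = (x - min) / scale + 0.5f;
    return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
}

// Map a uint8 value back to a real value in [min, max].
float dequantize_affine(uint8_t q, float min, float max) {
    const float scale = (max - min) / 255.0f;
    return min + q * scale;
}

int main() {
    const std::vector<float> in = { 0.5f, -1.0f,  2.0f };
    const std::vector<float> w  = { 1.0f,  0.5f, -0.25f };
    const float in_min = -1.0f, in_max = 2.0f;
    const float w_min = -0.25f, w_max = 1.0f;

    // Offsets are the quantized representation of real 0.0 in each space,
    // mirroring the role of float_to_quantized_unclamped<uint8_t>(0.0f, min, max).
    const int32_t in_offset = static_cast<int32_t>((0.0f - in_min) / (in_max - in_min) * 255.0f + 0.5f);
    const int32_t w_offset  = static_cast<int32_t>((0.0f - w_min)  / (w_max  - w_min)  * 255.0f + 0.5f);

    // Quantized dot product with offset subtraction, accumulated in int32,
    // as in the reference (non-GEMM) path of the forward kernel.
    int32_t acc = 0;
    float ref = 0.0f;
    for (size_t i = 0; i < in.size(); i++) {
        const int32_t qi = quantize_affine(in[i], in_min, in_max);
        const int32_t qw = quantize_affine(w[i], w_min, w_max);
        acc += (qw - w_offset) * (qi - in_offset);
        ref += w[i] * in[i];
    }

    // Each quantized factor carries its own scale, so the int32 accumulator is
    // rescaled by the product of the two scales to recover a real value.
    const float in_scale = (in_max - in_min) / 255.0f;
    const float w_scale  = (w_max - w_min) / 255.0f;
    std::cout << "quantized dot: " << acc * in_scale * w_scale
              << "  float dot: " << ref << std::endl;
    // the two values should agree up to quantization error

    // Round-trip of a single value through quantize/dequantize.
    const uint8_t q = quantize_affine(0.5f, in_min, in_max);
    std::cout << "0.5 -> " << static_cast<int>(q) << " -> "
              << dequantize_affine(q, in_min, in_max) << std::endl;
}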