#include "tiny_dnn/core/params/fully_params.h"
#include "tiny_dnn/core/kernels/tiny_quantization_kernel.h"
#include "tiny_dnn/core/kernels/tiny_quantized_matmul_kernel.h"

namespace tiny_dnn {
namespace core {
namespace kernels {
inline void tiny_quantized_fully_connected_kernel(const fully_params& params,
                                                  const vec_t&    in,
                                                  const vec_t&    W,
                                                  const vec_t&    b,
                                                  vec_t&          a,
                                                  const bool      layer_parallelize) {
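    // quantize the input over its observed min/max range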
    float_t min_input(in[0]);
    float_t max_input(in[0]);
    for (serial_size_t c = 0; c < params.in_size_; c++) {
        min_input = std::min(min_input, in[c]);
        max_input = std::max(max_input, in[c]);
    }
    std::vector<uint8_t> in_quantized =
        float_tensor_to_quantized<uint8_t>(in, min_input, max_input);
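    // quantize the weights; widen a degenerate range so the scale stays nonzero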
    float_t min_filter(W[0]);
    float_t max_filter(W[0]);
    for (serial_size_t c = 0; c < W.size(); c++) {
        min_filter = std::min(min_filter, W[c]);
        max_filter = std::max(max_filter, W[c]);
    }
    if (min_filter == max_filter) {
        max_filter = W[0] + 1e-3f;
        min_filter = W[0] - 1e-3f;
    }
    std::vector<uint8_t> W_quantized =
        float_tensor_to_quantized<uint8_t>(W, min_filter, max_filter);
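    // real-valued range of the 32-bit accumulator, derived from the input and filter ranges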
    float_t min_output_value;
    float_t max_output_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_input, max_input, min_filter, max_filter, &min_output_value,
        &max_output_value);
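    // quantize the bias over its observed range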
    float_t min_bias(0);
    float_t max_bias(0);
    std::vector<uint8_t> bias_quantized;
    if (params.has_bias_) {
        for (serial_size_t inc = 0; inc < b.size(); inc++) {
            min_bias = std::min(min_bias, b[inc]);
            max_bias = std::max(max_bias, b[inc]);
        }
        if (min_bias == max_bias) {
            max_bias = b[0] + 1e-3f;
            min_bias = b[0] - 1e-3f;
        }
        bias_quantized =
            float_tensor_to_quantized<uint8_t>(b, min_bias, max_bias);
    }
    min_output_value += min_bias;
    max_output_value += max_bias;
    std::vector<int32_t> a_quantized(a.size(), static_cast<int32_t>(0));
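    // compute the quantized values that represent real 0.0f in each space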
    const int32_t offset_input =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_input, max_input);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t zero_in_total_space =
        float_to_quantized<int32_t>(0.0f, min_output_value, max_output_value);

    const int32_t offset_output = 0;
    const int32_t mult_output = 1;
    const int32_t shift_output = 0;
    bool use_gemm = false;
    if (use_gemm) {
        std::vector<size_t> shape{params.in_size_, 1, params.out_size_, params.in_size_};
        tiny_quantized_matmul(in_quantized,
                              W_quantized,
                              a_quantized,
                              shape,
                              offset_input,
                              offset_filter,
                              offset_output,
                              mult_output,
                              shift_output);
        if (params.has_bias_) {
            for_i(layer_parallelize, params.out_size_, [&](int i) {
                a[i] += b[i];
            });
        }
    } else {
        for_i(layer_parallelize, params.out_size_, [&](int i) {
            for (serial_size_t c = 0; c < params.in_size_; c++) {
                a_quantized[i] += static_cast<int32_t>(W_quantized[c * params.out_size_ + i] - offset_filter) *
                                  static_cast<int32_t>(in_quantized[c] - offset_input);
            }
            if (params.has_bias_) {
                a_quantized[i] += (bias_quantized[i] - zero_in_total_space);
            }
        });
    }
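    // requantize the 32-bit accumulators down to 8 bits for the next layer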
    float_t min_output_requantized;
    float_t max_output_requantized;
    std::vector<uint8_t> a_requantized(a_quantized.size(), static_cast<uint8_t>(0));

    quantize_down_and_shrink_range<int32_t, uint8_t>(a_quantized, min_output_value, max_output_value,
        &min_output_requantized, &max_output_requantized, &a_requantized);
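    // dequantize to float; this could be removed within a concatenated quantized network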
    a = quantized_tensor_to_float<uint8_t>(a_requantized, min_output_requantized, max_output_requantized);
}
inline void tiny_quantized_fully_connected_back_kernel(const fully_params& params,
                                                       const vec_t& prev_out,
                                                       const vec_t& W,
                                                       vec_t&       dW,
                                                       vec_t&       db,
                                                       vec_t&       curr_delta,
                                                       vec_t&       prev_delta,
                                                       const bool   layer_parallelize) {
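    // quantize the previous layer's output over its observed range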
    float_t min_prev_out(prev_out[0]);
    float_t max_prev_out(prev_out[0]);
    for (serial_size_t inc = 0; inc < prev_out.size(); inc++) {
        min_prev_out = std::min(min_prev_out, prev_out[inc]);
        max_prev_out = std::max(max_prev_out, prev_out[inc]);
    }
    std::vector<uint8_t> prev_out_quantized =
        float_tensor_to_quantized<uint8_t>(prev_out, min_prev_out, max_prev_out);
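    // quantize the weights; widen a degenerate range so the scale stays nonzero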
    float_t min_filter(W[0]);
    float_t max_filter(W[0]);
    for (serial_size_t c = 0; c < W.size(); c++) {
        min_filter = std::min(min_filter, W[c]);
        max_filter = std::max(max_filter, W[c]);
    }
    if (min_filter == max_filter) {
        max_filter = W[0] + 1e-3f;
        min_filter = W[0] - 1e-3f;
    }
    std::vector<uint8_t> W_quantized =
        float_tensor_to_quantized<uint8_t>(W, min_filter, max_filter);
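    // quantize the current delta over its observed range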
    float_t min_curr_delta(curr_delta[0]);
    float_t max_curr_delta(curr_delta[0]);
    for (serial_size_t inc = 0; inc < curr_delta.size(); inc++) {
        min_curr_delta = std::min(min_curr_delta, curr_delta[inc]);
        max_curr_delta = std::max(max_curr_delta, curr_delta[inc]);
    }
    std::vector<uint8_t> curr_delta_quantized =
        float_tensor_to_quantized<uint8_t>(curr_delta, min_curr_delta, max_curr_delta);
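    // real-valued range of the 32-bit prev_delta accumulator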
    float_t min_prev_delta_value;
    float_t max_prev_delta_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_curr_delta, max_curr_delta, min_filter, max_filter, &min_prev_delta_value,
        &max_prev_delta_value);

    std::vector<int32_t> prev_delta_quantized(prev_delta.size(), static_cast<int32_t>(0));
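    // real-valued range of the 32-bit dW accumulator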
    float_t min_dW_value;
    float_t max_dW_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        min_curr_delta, max_curr_delta, min_prev_out, max_prev_out, &min_dW_value,
        &max_dW_value);

    std::vector<int32_t> dW_quantized(dW.size(), static_cast<int32_t>(0));
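    // compute the quantized values that represent real 0.0f in each space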
    const int32_t offset_prev_out =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_prev_out, max_prev_out);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t offset_curr_delta =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_curr_delta, max_curr_delta);
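    // propagate delta to the previous layer:
    // prev_delta[c] += curr_delta[io] * W[c * out_size_ + io]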
    for (serial_size_t c = 0; c < params.in_size_; c++) {
        for (serial_size_t io = 0; io < params.out_size_; io++) {
            prev_delta_quantized[c] += (static_cast<int32_t>(curr_delta_quantized[io]) - offset_curr_delta)
                * (static_cast<int32_t>(W_quantized[c * params.out_size_ + io]) - offset_filter);
        }
    }
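    // requantize prev_delta from 32 bits down to 8 bits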
    float_t min_prev_delta_requantized;
    float_t max_prev_delta_requantized;
    std::vector<uint8_t> prev_delta_requantized(prev_delta_quantized.size(), static_cast<uint8_t>(0));

    quantize_down_and_shrink_range<int32_t, uint8_t>(prev_delta_quantized, min_prev_delta_value, max_prev_delta_value,
        &min_prev_delta_requantized, &max_prev_delta_requantized, &prev_delta_requantized);
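    // dequantize prev_delta back to float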
    prev_delta = quantized_tensor_to_float<uint8_t>(prev_delta_requantized, min_prev_delta_requantized, max_prev_delta_requantized);
    for_(layer_parallelize, 0, size_t(params.out_size_), [&](const blocked_range& r) {
        // accumulate the weight gradient:
        // dW[c * out_size_ + io] += curr_delta[io] * prev_out[c]
        // io is restricted to this block's range so parallel blocks do not
        // accumulate the same entries twice
        for (serial_size_t c = 0; c < params.in_size_; c++) {
            for (int io = r.begin(); io < r.end(); io++) {
                dW_quantized[c * params.out_size_ + io] += (static_cast<int32_t>(curr_delta_quantized[io]) - offset_curr_delta)
                    * (static_cast<int32_t>(prev_out_quantized[c]) - offset_prev_out);
            }
        }

        if (params.has_bias_) {
            for (int i = r.begin(); i < r.end(); i++) {
                db[i] += curr_delta[i];
            }
        }
    });
    float_t min_dW_requantized;
    float_t max_dW_requantized;
    std::vector<uint8_t> dW_requantized(dW_quantized.size(), static_cast<uint8_t>(0));

    quantize_down_and_shrink_range<int32_t, uint8_t>(dW_quantized, min_dW_value, max_dW_value,
        &min_dW_requantized, &max_dW_requantized, &dW_requantized);
    dW = quantized_tensor_to_float<uint8_t>(dW_requantized, min_dW_requantized, max_dW_requantized);
}
inline void tiny_quantized_fully_connected_kernel(const fully_params& params,
                                                  const vec_t&    in,
                                                  const vec_t&    W,
                                                  const vec_t&    b,
                                                  const vec_t&    in_r,
                                                  const vec_t&    W_r,
                                                  const vec_t&    b_r,
                                                  vec_t&          a,
                                                  vec_t&          a_r,
                                                  const bool      layer_parallelize) {
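    // input, filter, and bias ranges come precomputed from the caller as {min, max} pairs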
    float_t min_filter(W_r[0]);
    float_t max_filter(W_r[1]);
    if (min_filter == max_filter) {
        max_filter = W_r[1] + 1e-3f;
        min_filter = W_r[0] - 1e-3f;
    }
    float_t min_bias(b_r[0]);
    float_t max_bias(b_r[1]);
    if (params.has_bias_) {
        if (min_bias == max_bias) {
            max_bias = b_r[1] + 1e-3f;
            min_bias = b_r[0] - 1e-3f;
        }
    }
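    // real-valued range of the 32-bit accumulator, derived from the caller-supplied ranges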
    float_t min_output_value;
    float_t max_output_value;
    quantization_range_for_multiplication<uint8_t, uint8_t, int32_t>(
        in_r[0], in_r[1], min_filter, max_filter, &min_output_value,
        &max_output_value);
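    // the tensors already hold quantized values; restore the 8-bit representation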
    std::vector<uint8_t> in_quantized, W_quantized, bias_quantized;
    for (size_t i = 0; i < in.size(); i++) {
        in_quantized.push_back(static_cast<uint8_t>(in[i]));
    }
    for (size_t i = 0; i < W.size(); i++) {
        W_quantized.push_back(static_cast<uint8_t>(W[i]));
    }
    for (size_t i = 0; i < b.size(); i++) {
        bias_quantized.push_back(static_cast<uint8_t>(b[i]));
    }

    min_output_value += min_bias;
    max_output_value += max_bias;
    std::vector<int32_t> a_quantized(a.size(), static_cast<int32_t>(0));
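    // compute the quantized values that represent real 0.0f in each space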
    const int32_t offset_input =
        float_to_quantized_unclamped<uint8_t>(0.0f, in_r[0], in_r[1]);
    const int32_t offset_filter =
        float_to_quantized_unclamped<uint8_t>(0.0f, min_filter, max_filter);
    const int32_t zero_in_total_space =
        float_to_quantized<int32_t>(0.0f, min_output_value, max_output_value);

    const int32_t offset_output = 0;
    const int32_t mult_output = 1;
    const int32_t shift_output = 0;
    bool use_gemm = false;
    if (use_gemm) {
        std::vector<size_t> shape{params.in_size_, 1, params.out_size_, params.in_size_};
        tiny_quantized_matmul(in_quantized,
                              W_quantized,
                              a_quantized,
                              shape,
                              offset_input,
                              offset_filter,
                              offset_output,
                              mult_output,
                              shift_output);
        if (params.has_bias_) {
            for_i(layer_parallelize, params.out_size_, [&](int i) {
                a[i] += b[i];
            });
        }
    } else {
        for_i(layer_parallelize, params.out_size_, [&](int i) {
            for (serial_size_t c = 0; c < params.in_size_; c++) {
                a_quantized[i] += static_cast<int32_t>(W_quantized[c * params.out_size_ + i] - offset_filter) *
                                  static_cast<int32_t>(in_quantized[c] - offset_input);
            }
            if (params.has_bias_) {
                a_quantized[i] += (bias_quantized[i] - zero_in_total_space);
            }
        });
    }
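    // requantize the 32-bit accumulators down to 8 bits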
    float_t min_output_requantized;
    float_t max_output_requantized;
    std::vector<uint8_t> a_requantized(a_quantized.size(), static_cast<uint8_t>(0));

    quantize_down_and_shrink_range<int32_t, uint8_t>(a_quantized, min_output_value, max_output_value,
        &min_output_requantized, &max_output_requantized, &a_requantized);
    // hand the still-quantized activations back as floats and expose their range via a_r
    for (size_t i = 0; i < a_requantized.size(); i++) {
        a[i] = static_cast<float>(a_requantized[i]);
    }
    a_r[0] = min_output_requantized;
    a_r[1] = max_output_requantized;
}

}  // namespace kernels
}  // namespace core
}  // namespace tiny_dnn