tiny_dnn  1.0.0
A header only, dependency-free deep learning framework in C++11
tiny_quantization_kernel.h
1 /*
2  Copyright (c) 2016, Taiga Nomi, Edgar Riba
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are met:
7  * Redistributions of source code must retain the above copyright
8  notice, this list of conditions and the following disclaimer.
9  * Redistributions in binary form must reproduce the above copyright
10  notice, this list of conditions and the following disclaimer in the
11  documentation and/or other materials provided with the distribution.
12  * Neither the name of the <organization> nor the
13  names of its contributors may be used to endorse or promote products
14  derived from this software without specific prior written permission.
15 
16  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
17  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
20  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 #pragma once
28 
29 namespace tiny_dnn {
30 namespace core {
31 namespace kernels {
32 
// Returns the largest finite value representable by T, as reported by
// std::numeric_limits<T>::max().
template <class T>
T highest() {
  typedef std::numeric_limits<T> limits;
  // The extra parentheses keep a function-like `max` macro (if one happens to
  // be defined) from being expanded here.
  return (limits::max)();
}
37 
// Returns the smallest value of T: std::numeric_limits<T>::min() for integer
// types, and the negated maximum for every other type (for floating-point
// types min() is the smallest positive value, not the most negative one).
template <class T>
T lowest() {
  typedef std::numeric_limits<T> limits;
  if (limits::is_integer) {
    return (limits::min)();
  }
  return -(limits::max)();
}
42 
43 // We have to be able to detect and handle overflows in int32, so this function
44 // uses doubles and int64's to make sure we have enough room.
45 template <class T>
46 int64_t float_to_quantized_unclamped(float_t input, float_t range_min, float_t range_max) {
47  if (range_min == range_max) {
48  return 0;
49  }
50  const int number_of_bits = sizeof(T) * 8;
51  const int64_t number_of_steps = static_cast<int64_t>(1) << number_of_bits;
52  const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
53  const double range = ((range_max - range_min) * range_adjust);
54  const double range_scale = (number_of_steps / range);
55  int64_t quantized =
56  static_cast<int64_t>(round(input * range_scale) - round(range_min * range_scale));
57  const int64_t lowest_quantized =
58  static_cast<int64_t>(lowest<T>());
59  quantized += lowest_quantized;
60  return quantized;
61 }
62 
// Narrows a 64-bit value to 32 bits. The value is asserted to lie within the
// int32_t range; when assertions are compiled out no check is performed.
inline int32_t int64_to_int32(int64_t src) {
  assert(src >= (std::numeric_limits<int32_t>::min)() &&
         src <= (std::numeric_limits<int32_t>::max)());
  const int32_t narrowed = static_cast<int32_t>(src);
  return narrowed;
}
67 
68 // This converts the float into the final quantized type, clamping/saturating
69 // any over or underflows.
70 template <class T>
71 T float_to_quantized(float_t input, float_t range_min, float_t range_max) {
72  int64_t quantized = float_to_quantized_unclamped<T>(input, range_min, range_max);
73  const int64_t lowest_quantized =
74  static_cast<int64_t>(lowest<T>());
75  const int64_t highest_quantized =
76  static_cast<int64_t>(highest<T>());
77  quantized = std::max<int64_t>(quantized, lowest_quantized);
78  quantized = std::min<int64_t>(quantized, highest_quantized);
79  return static_cast<T>(static_cast<int32_t>(quantized));
80 }
81 
82 template <class T>
83 float quantized_to_float(T input, float_t range_min, float_t range_max) {
84  if (range_min == range_max) {
85  return range_min;
86  }
87  const int number_of_bits = sizeof(T) * 8;
88  const int64_t number_of_steps = static_cast<int64_t>(1) << number_of_bits;
89  const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
90  const double range = ((range_max - range_min) * range_adjust);
91  const double range_scale = (range / number_of_steps);
92  const int64_t lowest_quantized =
93  static_cast<int64_t>(lowest<T>());
94  const double offset_input = static_cast<double>(input) - lowest_quantized;
95  const double result = range_min + (offset_input * range_scale);
96  return static_cast<float_t>(result);
97 }
98 
99 template <class T>
100 float float_for_one_quantized_level(float_t range_min, float_t range_max) {
101  const int64_t highest_ = static_cast<int64_t>(highest<T>());
102  const int64_t lowest_ = static_cast<int64_t>(lowest<T>());
103  const float float_for_one_quantized_level =
104  (range_max - range_min) / (highest_ - lowest_);
105  return float_for_one_quantized_level;
106 }
107 
108 template <class T1, class T2, class T3>
109 void quantization_range_for_multiplication(float_t min_a, float_t max_a, float_t min_b,
110  float_t max_b, float_t* min_c,
111  float_t* max_c) {
112  const float_t a_float_for_one_quant_level =
113  float_for_one_quantized_level<T1>(min_a, max_a);
114  const float_t b_float_for_one_quant_level =
115  float_for_one_quantized_level<T2>(min_b, max_b);
116 
117  const int64_t c_highest = static_cast<int64_t>(highest<T3>());
118  const int64_t c_lowest = static_cast<int64_t>(lowest<T3>());
119  const float c_float_for_one_quant_level =
120  a_float_for_one_quant_level * b_float_for_one_quant_level;
121 
122  *min_c = c_float_for_one_quant_level * c_lowest;
123  *max_c = c_float_for_one_quant_level * c_highest;
124 }
125 
126 template <class T1, class T2>
127 inline T2 requantize_in_new_range(T1 input, float_t min_input, float_t max_input,
128  float_t min_new, float_t max_new) {
129  const float_t input_float = quantized_to_float<T1>(input, min_input, max_input);
130  return float_to_quantized<T2>(input_float, min_new, max_new);
131 }
132 
133 template <class T1, class T2>
134 inline void requantize_many_in_new_range(T1* input, size_t count, float_t min_input,
135  float_t max_input, float_t min_output,
136  float_t max_output, T2* output) {
137  for (size_t index = 0; index < count; ++index) {
138  const float_t input_float =
139  quantized_to_float<T1>(input[index], min_input, max_input);
140  output[index] = float_to_quantized<T2>(input_float, min_output, max_output);
141  }
142 }
143 
144 // Because converting 32-bit accumulated results down to eight bit is a common
145 // case, we have a specialized code path to handle it as efficiently as
146 // possible using only fixed-point math for the inner loop.
147 template <>
148 inline void requantize_many_in_new_range<int32_t, uint8_t>(
149  int32_t* input, size_t count, float_t min_input, float_t max_input,
150  float_t min_output, float_t max_output, uint8_t* output) {
151  // Initially we calculate all the constants we need once, before we go into
152  // the inner loop.
153  const int fp_shift = 16;
154  const float input_range = max_input - min_input;
155  const float output_range = max_output - min_output;
156  const float recip_output_range = (255.0f / output_range);
157  const int64_t recip_output_range_fp =
158  static_cast<int64_t>(recip_output_range * (1 << fp_shift));
159  const int64_t range_scale_fp =
160  static_cast<int64_t>(255.0f * (1 << fp_shift) * input_range / output_range);
161  const int64_t input_offset_fp =
162  static_cast<int64_t>((min_input * recip_output_range_fp) + (range_scale_fp >> 1));
163  const int64_t output_offset_fp = static_cast<int64_t>(round((min_output * 255.0f) / output_range));
164  const int64_t rounding_delta = 1 << (fp_shift - 1);
165  // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
166  // that could be easily adapted for a SIMD implementation. It should also be
167  // possible to perform all the calculations in 32-bit rather than 64, but
168  // that's not been implemented yet.
169  for (size_t index = 0; index < count; ++index) {
170  const int64_t input_value = static_cast<int64_t>(input[index]);
171  const int64_t fp_value =
172  ((input_value * range_scale_fp) >> 32) + input_offset_fp;
173  const int64_t round_intermediate =
174  ((fp_value >= 0) ? (fp_value + rounding_delta)
175  : (fp_value - rounding_delta)) >>
176  fp_shift;
177  int64_t quantized_int64 = (round_intermediate - output_offset_fp);
178  quantized_int64 = std::max<int64_t>(quantized_int64, 0LL);
179  quantized_int64 = std::min<int64_t>(quantized_int64, 255LL);
180  output[index] = static_cast<uint8_t>(static_cast<int32_t>(quantized_int64));
181  }
182 }
183 
184 // REQUIRES: 'result->NumElements() == input.NumElements()'
185 template <class T>
186 void float_tensor_to_quantized_in_place(const vec_t& input, float_t min, float_t max,
187  std::vector<T>* result) {
188  const size_t data_size = input.size();
189  for (size_t i = 0; i < data_size; ++i) {
190  (*result)[i] = float_to_quantized<T>(input[i], min, max);
191  }
192 }
193 
194 template <class T>
195 std::vector<T> float_tensor_to_quantized(const vec_t& input, float_t min, float_t max) {
196  std::vector<T> result(input.size(), static_cast<T>(0));
197  float_tensor_to_quantized_in_place<T>(input, min, max, &result);
198  return result;
199 }
200 
201 // REQUIRES: 'result->NumElements() == input.NumElements()'
202 template <class T>
203 void quantized_tensor_to_float_in_place(const std::vector<T>& input, float_t min, float_t max,
204  vec_t* result) {
205  const size_t data_size = input.size();
206  for (size_t i = 0; i < data_size; ++i) {
207  (*result)[i] = quantized_to_float<T>(input[i], min, max);
208  }
209 }
210 
211 template <class T>
212 vec_t quantized_tensor_to_float(const std::vector<T>& input, float_t min, float_t max) {
213  vec_t result(input.size(), static_cast<float_t>(0));
214  quantized_tensor_to_float_in_place<T>(input, min, max, &result);
215  return result;
216 }
217 
218 template <class T1, class T2>
219 void quantize_down_and_shrink_range( std::vector<T1>& input, float_t min_input, float_t max_input,
220  float_t* min_new, float_t* max_new, std::vector<T2>* output){
221  const int32_t input_lowest_quantized = static_cast<int32_t>(lowest<T1>());
222  const int32_t input_highest_quantized = static_cast<int32_t>(highest<T1>());
223  T1 actual_min_quantized = input_highest_quantized;
224  T1 actual_max_quantized = input_lowest_quantized;
225  for (serial_size_t i = 0; i < input.size(); ++i) {
226  const T1 value = input[i];
227  actual_min_quantized = std::min(actual_min_quantized, value);
228  actual_max_quantized = std::max(actual_max_quantized, value);
229  }
230  // We want to make sure that the minimum is no larger than zero, so that the
231  // convolution operation can run efficiently.
232  *min_new = std::min(0.0f, quantized_to_float(actual_min_quantized, min_input,
233  max_input));
234  *max_new = quantized_to_float(actual_max_quantized, min_input, max_input);
235  requantize_many_in_new_range<int32_t, uint8_t>(&input[0], input.size(),
236  min_input, max_input, *min_new,
237  *max_new, &(*output)[0]);
238 }
239 
240 } // namespace kernels
241 } // namespace core
242 } // namespace tiny_dnn