35 return (std::numeric_limits<T>::max)();
40 return std::numeric_limits<T>::is_integer ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)());
46 int64_t float_to_quantized_unclamped(float_t input, float_t range_min, float_t range_max) {
47 if (range_min == range_max) {
50 const int number_of_bits =
sizeof(T) * 8;
51 const int64_t number_of_steps =
static_cast<int64_t
>(1) << number_of_bits;
52 const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
53 const double range = ((range_max - range_min) * range_adjust);
54 const double range_scale = (number_of_steps / range);
56 static_cast<int64_t
>(round(input * range_scale) - round(range_min * range_scale));
57 const int64_t lowest_quantized =
58 static_cast<int64_t
>(lowest<T>());
59 quantized += lowest_quantized;
// Narrows a 64-bit integer to 32 bits.
//
// In builds where assert() is active, a value outside the int32_t range
// aborts; otherwise the value is converted with static_cast.
//
// The numeric_limits calls are parenthesized so that function-like min/max
// macros (e.g. from <windows.h>) cannot expand them.
inline int32_t int64_to_int32(int64_t src) {
  assert(src <= (std::numeric_limits<int32_t>::max)() &&
         src >= (std::numeric_limits<int32_t>::min)());
  return static_cast<int32_t>(src);
}
71 T float_to_quantized(float_t input, float_t range_min, float_t range_max) {
72 int64_t quantized = float_to_quantized_unclamped<T>(input, range_min, range_max);
73 const int64_t lowest_quantized =
74 static_cast<int64_t
>(lowest<T>());
75 const int64_t highest_quantized =
76 static_cast<int64_t
>(highest<T>());
77 quantized = std::max<int64_t>(quantized, lowest_quantized);
78 quantized = std::min<int64_t>(quantized, highest_quantized);
79 return static_cast<T
>(
static_cast<int32_t
>(quantized));
83 float quantized_to_float(T input, float_t range_min, float_t range_max) {
84 if (range_min == range_max) {
87 const int number_of_bits =
sizeof(T) * 8;
88 const int64_t number_of_steps =
static_cast<int64_t
>(1) << number_of_bits;
89 const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
90 const double range = ((range_max - range_min) * range_adjust);
91 const double range_scale = (range / number_of_steps);
92 const int64_t lowest_quantized =
93 static_cast<int64_t
>(lowest<T>());
94 const double offset_input =
static_cast<double>(input) - lowest_quantized;
95 const double result = range_min + (offset_input * range_scale);
96 return static_cast<float_t
>(result);
100 float float_for_one_quantized_level(float_t range_min, float_t range_max) {
101 const int64_t highest_ =
static_cast<int64_t
>(highest<T>());
102 const int64_t lowest_ =
static_cast<int64_t
>(lowest<T>());
103 const float float_for_one_quantized_level =
104 (range_max - range_min) / (highest_ - lowest_);
105 return float_for_one_quantized_level;
108 template <
class T1,
class T2,
class T3>
109 void quantization_range_for_multiplication(float_t min_a, float_t max_a, float_t min_b,
110 float_t max_b, float_t* min_c,
112 const float_t a_float_for_one_quant_level =
113 float_for_one_quantized_level<T1>(min_a, max_a);
114 const float_t b_float_for_one_quant_level =
115 float_for_one_quantized_level<T2>(min_b, max_b);
117 const int64_t c_highest =
static_cast<int64_t
>(highest<T3>());
118 const int64_t c_lowest =
static_cast<int64_t
>(lowest<T3>());
119 const float c_float_for_one_quant_level =
120 a_float_for_one_quant_level * b_float_for_one_quant_level;
122 *min_c = c_float_for_one_quant_level * c_lowest;
123 *max_c = c_float_for_one_quant_level * c_highest;
126 template <
class T1,
class T2>
127 inline T2 requantize_in_new_range(T1 input, float_t min_input, float_t max_input,
128 float_t min_new, float_t max_new) {
129 const float_t input_float = quantized_to_float<T1>(input, min_input, max_input);
130 return float_to_quantized<T2>(input_float, min_new, max_new);
133 template <
class T1,
class T2>
134 inline void requantize_many_in_new_range(T1* input,
size_t count, float_t min_input,
135 float_t max_input, float_t min_output,
136 float_t max_output, T2* output) {
137 for (
size_t index = 0; index < count; ++index) {
138 const float_t input_float =
139 quantized_to_float<T1>(input[index], min_input, max_input);
140 output[index] = float_to_quantized<T2>(input_float, min_output, max_output);
148 inline void requantize_many_in_new_range<int32_t, uint8_t>(
149 int32_t* input,
size_t count, float_t min_input, float_t max_input,
150 float_t min_output, float_t max_output, uint8_t* output) {
153 const int fp_shift = 16;
154 const float input_range = max_input - min_input;
155 const float output_range = max_output - min_output;
156 const float recip_output_range = (255.0f / output_range);
157 const int64_t recip_output_range_fp =
158 static_cast<int64_t
>(recip_output_range * (1 << fp_shift));
159 const int64_t range_scale_fp =
160 static_cast<int64_t
>(255.0f * (1 << fp_shift) * input_range / output_range);
161 const int64_t input_offset_fp =
162 static_cast<int64_t
>((min_input * recip_output_range_fp) + (range_scale_fp >> 1));
163 const int64_t output_offset_fp =
static_cast<int64_t
>(round((min_output * 255.0f) / output_range));
164 const int64_t rounding_delta = 1 << (fp_shift - 1);
169 for (
size_t index = 0; index < count; ++index) {
170 const int64_t input_value =
static_cast<int64_t
>(input[index]);
171 const int64_t fp_value =
172 ((input_value * range_scale_fp) >> 32) + input_offset_fp;
173 const int64_t round_intermediate =
174 ((fp_value >= 0) ? (fp_value + rounding_delta)
175 : (fp_value - rounding_delta)) >>
177 int64_t quantized_int64 = (round_intermediate - output_offset_fp);
178 quantized_int64 = std::max<int64_t>(quantized_int64, 0LL);
179 quantized_int64 = std::min<int64_t>(quantized_int64, 255LL);
180 output[index] =
static_cast<uint8_t
>(
static_cast<int32_t
>(quantized_int64));
186 void float_tensor_to_quantized_in_place(
const vec_t& input, float_t min, float_t max,
187 std::vector<T>* result) {
188 const size_t data_size = input.size();
189 for (
size_t i = 0; i < data_size; ++i) {
190 (*result)[i] = float_to_quantized<T>(input[i], min, max);
195 std::vector<T> float_tensor_to_quantized(
const vec_t& input, float_t min, float_t max) {
196 std::vector<T> result(input.size(),
static_cast<T
>(0));
197 float_tensor_to_quantized_in_place<T>(input, min, max, &result);
203 void quantized_tensor_to_float_in_place(
const std::vector<T>& input, float_t min, float_t max,
205 const size_t data_size = input.size();
206 for (
size_t i = 0; i < data_size; ++i) {
207 (*result)[i] = quantized_to_float<T>(input[i], min, max);
212 vec_t quantized_tensor_to_float(
const std::vector<T>& input, float_t min, float_t max) {
213 vec_t result(input.size(),
static_cast<float_t
>(0));
214 quantized_tensor_to_float_in_place<T>(input, min, max, &result);
218 template <
class T1,
class T2>
219 void quantize_down_and_shrink_range( std::vector<T1>& input, float_t min_input, float_t max_input,
220 float_t* min_new, float_t* max_new, std::vector<T2>* output){
221 const int32_t input_lowest_quantized =
static_cast<int32_t
>(lowest<T1>());
222 const int32_t input_highest_quantized =
static_cast<int32_t
>(highest<T1>());
223 T1 actual_min_quantized = input_highest_quantized;
224 T1 actual_max_quantized = input_lowest_quantized;
225 for (serial_size_t i = 0; i < input.size(); ++i) {
226 const T1 value = input[i];
227 actual_min_quantized = std::min(actual_min_quantized, value);
228 actual_max_quantized = std::max(actual_max_quantized, value);
232 *min_new = std::min(0.0f, quantized_to_float(actual_min_quantized, min_input,
234 *max_new = quantized_to_float(actual_max_quantized, min_input, max_input);
235 requantize_many_in_new_range<int32_t, uint8_t>(&input[0], input.size(),
236 min_input, max_input, *min_new,
237 *max_new, &(*output)[0]);