tiny_dnn  1.0.0
A header-only, dependency-free deep learning framework in C++11
batch_normalization_layer.h
/*
 Copyright (c) 2016, Taiga Nomi
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 * Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 * Neither the name of the <organization> nor the
 names of its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "tiny_dnn/util/util.h"
#include "tiny_dnn/util/math_functions.h"
#include "tiny_dnn/layers/layer.h"

#include <algorithm>
#include <cmath>    // std::sqrt, used in calc_stddev()

namespace tiny_dnn {

/**
 * Batch Normalization
 *
 * Normalize the activations of the previous layer at each batch.
 **/
class batch_normalization_layer : public layer {
public:
    typedef layer Base;

    /**
     * @param prev_layer [in] previous layer to be connected with this layer
     * @param epsilon    [in] small positive value to avoid zero-division
     * @param momentum   [in] momentum in the computation of the exponential
     *                        average of the mean/stddev of the data
     * @param phase      [in] specify the current context (train/test)
     **/
    batch_normalization_layer(const layer& prev_layer,
                              float_t epsilon  = 1e-5,
                              float_t momentum = 0.999,
                              net_phase phase  = net_phase::train)
        : Base({ vector_type::data }, { vector_type::data }),
          in_channels_(prev_layer.out_shape()[0].depth_),
          in_spatial_size_(prev_layer.out_shape()[0].area()),
          phase_(phase),
          momentum_(momentum),
          eps_(epsilon),
          update_immidiately_(false)
    {
        init();
    }

    /**
     * @param in_spatial_size [in] spatial size (width x height) of the input
     * @param in_channels     [in] channels of the input
     * @param epsilon         [in] small positive value to avoid zero-division
     * @param momentum        [in] momentum in the computation of the exponential
     *                             average of the mean/stddev of the data
     * @param phase           [in] specify the current context (train/test)
     **/
    batch_normalization_layer(serial_size_t in_spatial_size,
                              serial_size_t in_channels,
                              float_t epsilon  = 1e-5,
                              float_t momentum = 0.999,
                              net_phase phase  = net_phase::train)
        : Base({ vector_type::data }, { vector_type::data }),
          in_channels_(in_channels),
          in_spatial_size_(in_spatial_size),
          phase_(phase),
          momentum_(momentum),
          eps_(epsilon),
          update_immidiately_(false)
    {
        init();
    }

    virtual ~batch_normalization_layer() {}

    /// number of incoming connections for each output unit
    serial_size_t fan_in_size() const override {
        return 1;
    }

    /// number of outgoing connections for each input unit; used only by
    /// weight/bias initialization methods (this layer has no trainable weights)
    serial_size_t fan_out_size() const override {
        return 1;
    }

    /// array of input shapes (width x height x depth)
    std::vector<index3d<serial_size_t>> in_shape() const override {
        return { index3d<serial_size_t>(in_spatial_size_, 1, in_channels_) };
    }

    /// array of output shapes (width x height x depth)
    std::vector<index3d<serial_size_t>> out_shape() const override {
        return { index3d<serial_size_t>(in_spatial_size_, 1, in_channels_) };
    }

    /// return delta of previous layer (delta = dE/da, a = wx in fully-connected layer)
    void back_propagation(const std::vector<tensor_t*>& in_data,
                          const std::vector<tensor_t*>& out_data,
                          std::vector<tensor_t*>& out_grad,
                          std::vector<tensor_t*>& in_grad) override {
        tensor_t& prev_delta      = *in_grad[0];
        tensor_t& curr_delta      = *out_grad[0];
        const tensor_t& curr_out  = *out_data[0];
        serial_size_t num_samples = static_cast<serial_size_t>(curr_out.size());

        CNN_UNREFERENCED_PARAMETER(in_data);

        // elementwise product dE/dY . Y, needed for the variance part of the gradient
        tensor_t delta_dot_y = curr_out;
        vec_t mean_delta_dot_y, mean_delta;

        for (serial_size_t i = 0; i < num_samples; i++) {
            for (serial_size_t j = 0; j < curr_out[0].size(); j++) {
                delta_dot_y[i][j] *= curr_delta[i][j];
            }
        }
        // per-channel means over the batch and spatial dimensions
        moments(delta_dot_y, in_spatial_size_, in_channels_, &mean_delta_dot_y, nullptr);
        moments(curr_delta,  in_spatial_size_, in_channels_, &mean_delta, nullptr);

        // if Y = (X - mean(X)) / sqrt(var(X) + eps), then
        //
        // dE/dX = (dE/dY - mean(dE/dY) - mean(dE/dY \cdot Y) \cdot Y)
        //         ./ sqrt(var(X) + eps)
        //
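        // derivation note (added for clarity): with Y = (X - mu) / sigma computed
        // per channel, differentiating through the batch mean mu contributes the
        // -mean(dE/dY) term, and differentiating through the batch variance
        // contributes the -mean(dE/dY \cdot Y) \cdot Y term; this layer has no
        // learnable scale/shift (gamma/beta), so no parameter gradients are needed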
        for_i(num_samples, [&](int i) {
            for (serial_size_t j = 0; j < in_channels_; j++) {
                for (serial_size_t k = 0; k < in_spatial_size_; k++) {
                    serial_size_t index = j * in_spatial_size_ + k;

                    prev_delta[i][index]
                        = curr_delta[i][index] - mean_delta[j] - mean_delta_dot_y[j] * curr_out[i][index];

                    // stddev_ is calculated in the forward pass
                    prev_delta[i][index] /= stddev_[j];
                }
            }
        });
    }

    void forward_propagation(const std::vector<tensor_t*>& in_data,
                             std::vector<tensor_t*>& out_data) override {
        vec_t* mean     = nullptr;
        vec_t* variance = nullptr;
        tensor_t& in    = *in_data[0];
        tensor_t& out   = *out_data[0];

        if (phase_ == net_phase::train) {
            // calculate mean/variance from this batch in train phase
            mean     = &mean_current_;
            variance = &variance_current_;
            moments(*in_data[0], in_spatial_size_, in_channels_, mean, variance);
        }
        else {
            // use the stored running mean/variance in test phase
            mean     = &mean_;
            variance = &variance_;
        }

        // y = (x - mean) ./ sqrt(variance + eps)
        calc_stddev(*variance);

        for_i(parallelize_, in_data[0]->size(), [&](int i) {
            const float_t* inptr = &in[i][0];
            float_t* outptr      = &out[i][0];

            for (serial_size_t j = 0; j < in_channels_; j++) {
                float_t m = (*mean)[j];

                for (serial_size_t k = 0; k < in_spatial_size_; k++) {
                    *outptr++ = (*inptr++ - m) / stddev_[j];
                }
            }
        });

        if (phase_ == net_phase::train && update_immidiately_) {
            mean_     = mean_current_;
            variance_ = variance_current_;
        }
    }

    /// notify changing context (train <=> test)
    void set_context(net_phase ctx) override {
        phase_ = ctx;
    }

    /// name of layer; should be unique for each concrete class
    std::string layer_type() const override { return "batch-norm"; }

    // fold the statistics of the current mini-batch into the exponential
    // moving averages that are used in the test phase
    virtual void post_update() override {
        for (serial_size_t i = 0; i < mean_.size(); i++) {
            mean_[i]     = momentum_ * mean_[i]     + (1 - momentum_) * mean_current_[i];
            variance_[i] = momentum_ * variance_[i] + (1 - momentum_) * variance_current_[i];
        }
    }
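    // (with the default momentum = 0.999 the running estimates effectively
    //  average over roughly 1 / (1 - 0.999) = 1000 recent mini-batches)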

    virtual void save(std::ostream& os) const override {
        Base::save(os);
        for (auto m : mean_)     os << m << " ";
        for (auto v : variance_) os << v << " ";
    }

    virtual void load(std::istream& is) override {
        Base::load(is);
        for (auto& m : mean_)     is >> m;
        for (auto& v : variance_) is >> v;
    }

    virtual void load(const std::vector<float_t>& src, int& idx) override {
        Base::load(src, idx);
        for (auto& m : mean_)     m = src[idx++];
        for (auto& v : variance_) v = src[idx++];
    }

    // when enabled, each batch's statistics are copied straight into the
    // stored mean/variance during the forward pass (used for testing)
    void update_immidiately(bool update) {
        update_immidiately_ = update;
    }

    void set_stddev(const vec_t& stddev) {
        stddev_ = stddev;
    }

    void set_mean(const vec_t& mean) {
        mean_ = mean;
    }

    void set_variance(const vec_t& variance) {
        variance_ = variance;
        calc_stddev(variance);
    }

    template <class Archive>
    static void load_and_construct(Archive& ar, cereal::construct<batch_normalization_layer>& construct) {
        serial_size_t in_spatial_size, in_channels;
        float_t eps, momentum;
        net_phase phase;
        vec_t mean, variance;

        ar(cereal::make_nvp("in_spatial_size", in_spatial_size),
           cereal::make_nvp("in_channels",     in_channels),
           cereal::make_nvp("epsilon",         eps),
           cereal::make_nvp("momentum",        momentum),
           cereal::make_nvp("phase",           phase),
           cereal::make_nvp("mean",            mean),
           cereal::make_nvp("variance",        variance));
        construct(in_spatial_size, in_channels, eps, momentum, phase);
        construct->set_mean(mean);
        construct->set_variance(variance);
    }

    template <class Archive>
    void serialize(Archive& ar) {
        layer::serialize_prolog(ar);
        ar(cereal::make_nvp("in_spatial_size", in_spatial_size_),
           cereal::make_nvp("in_channels",     in_channels_),
           cereal::make_nvp("epsilon",         eps_),
           cereal::make_nvp("momentum",        momentum_),
           cereal::make_nvp("phase",           phase_),
           cereal::make_nvp("mean",            mean_),
           cereal::make_nvp("variance",        variance_));
    }

    float_t epsilon() const {
        return eps_;
    }

    float_t momentum() const {
        return momentum_;
    }

private:
    // precompute sqrt(variance + eps) per channel; epsilon guards against
    // division by zero
    void calc_stddev(const vec_t& variance) {
        for (size_t i = 0; i < in_channels_; i++) {
            stddev_[i] = std::sqrt(variance[i] + eps_);
        }
    }

    void init() {
        mean_current_.resize(in_channels_);
        mean_.resize(in_channels_);
        variance_current_.resize(in_channels_);
        variance_.resize(in_channels_);
        tmp_mean_.resize(in_channels_);
        stddev_.resize(in_channels_);
    }

    serial_size_t in_channels_;
    serial_size_t in_spatial_size_;

    net_phase phase_;
    float_t momentum_;
    float_t eps_;

    // mean/variance for the current mini-batch
    vec_t mean_current_;
    vec_t variance_current_;

    vec_t tmp_mean_;

    // moving average of mean/variance, used in the test phase
    vec_t mean_;
    vec_t variance_;
    vec_t stddev_;

    // for test
    bool update_immidiately_;
};

} // namespace tiny_dnn
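
A minimal usage sketch (not part of the header), exercising only the interface shown above; the shape used here (4 spatial elements, 1 channel) is illustrative:

// example.cpp
#include "tiny_dnn/layers/batch_normalization_layer.h"

using namespace tiny_dnn;

int main() {
    // normalize 1 channel of 4 spatial elements per sample, with the
    // defaults epsilon = 1e-5 and momentum = 0.999
    batch_normalization_layer bn(4, 1);

    // during training, forward_propagation() computes per-batch statistics
    // and post_update() folds them into the running mean/variance
    bn.set_context(net_phase::train);

    // at inference time the stored running averages are used instead
    bn.set_context(net_phase::test);

    return 0;
}

In a full model the layer is wired into a tiny_dnn network container, which is expected to drive set_context() and post_update() as part of training and inference.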