tiny_dnn  1.0.0
A header-only, dependency-free deep learning framework in C++11
optimizer.h
/*
 Copyright (c) 2013, Taiga Nomi
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 * Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.
 * Neither the name of the <organization> nor the
 names of its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "tiny_dnn/util/util.h"
#include <unordered_map>

namespace tiny_dnn {

/**
 * base class of optimizer
 * usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function)
 **/
struct optimizer {
    optimizer() = default;
    optimizer(const optimizer &) = default;
#ifndef CNN_DEFAULT_MOVE_CONSTRUCTOR_UNAVAILABLE
    optimizer(optimizer &&) = default;
#endif
    optimizer &operator =(const optimizer &) = default;
#ifndef CNN_DEFAULT_ASSIGNMENT_OPERATOR_UNAVAILABLE
    optimizer &operator =(optimizer &&) = default;
#endif
    virtual ~optimizer() = default;
    virtual void update(const vec_t& dW, vec_t &W, bool parallelize) = 0;
    virtual void reset() {} // override to implement pre-learning action
};

// helper class to hold N values for each weight
template <int N>
struct stateful_optimizer : public optimizer {
    void reset() override {
        for (auto& e : E_) e.clear();
    }

protected:
    template <int Index>
    vec_t& get(const vec_t& key) {
        static_assert(Index < N, "index out of range");
        if (E_[Index][&key].empty())
            E_[Index][&key].resize(key.size(), float_t());
        return E_[Index][&key];
    }
    // state is stored per weight vector, keyed by its address
    std::unordered_map<const vec_t*, vec_t> E_[N];
};

/**
 * adaptive gradient method
 **/
struct adagrad : public stateful_optimizer<1> {
    adagrad() : alpha(float_t(0.01)), eps(float_t(1e-8)) {}

    void update(const vec_t& dW, vec_t &W, bool parallelize) {
        vec_t& g = get<0>(W);
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            g[i] += dW[i] * dW[i];
            W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps);
        });
    }

    float_t alpha; // learning rate
private:
    float_t eps;
};

/**
 * RMSprop
 **/
struct RMSprop : public stateful_optimizer<1> {
    RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        vec_t& g = get<0>(W);

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i];
            W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps);
        });
    }

    float_t alpha; // learning rate
    float_t mu;    // decay term
private:
    float_t eps;   // constant value to avoid zero-division
};

/**
 * [a new optimizer (2015)]
 **/
struct adam : public stateful_optimizer<2> {
    adam() : alpha(float_t(0.001)), b1(float_t(0.9)), b2(float_t(0.999)),
             b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        vec_t& mt = get<0>(W);
        vec_t& vt = get<1>(W);

        b1_t *= b1;
        b2_t *= b2;

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i];
            vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i];

            W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) /
                    std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps);
        });
    }
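    // For reference: the loop above applies the bias-corrected Adam step, written here
    // with shorthand m_hat / v_hat (names used only in this comment):
    //   m_hat = mt / (1 - b1_t),   v_hat = vt / (1 - b2_t)
    //   W    -= alpha * m_hat / sqrt(v_hat + eps)
    // Note that eps sits inside the square root in this implementation.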

    float_t alpha; // learning rate
    float_t b1;    // decay term
    float_t b2;    // decay term
    float_t b1_t;  // decay term power t
    float_t b2_t;  // decay term power t
private:
    float_t eps;   // constant value to avoid zero-division
};

/**
 * SGD without momentum
 **/
struct gradient_descent : public optimizer {
    gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            W[i] = W[i] - alpha * (dW[i] + lambda * W[i]);
        });
    }

    float_t alpha;  // learning rate
    float_t lambda; // weight decay
};

/**
 * SGD with momentum
 **/
struct momentum : public stateful_optimizer<1> {
public:
    momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {}

    void update(const vec_t& dW, vec_t& W, bool parallelize) {
        vec_t& dWprev = get<0>(W);

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda);
            W[i] += V;
            dWprev[i] = V;
        });
    }

    float_t alpha;  // learning rate
    float_t lambda; // weight decay
    float_t mu;     // momentum
};

} // namespace tiny_dnn
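Usage sketch (not part of optimizer.h): the optimizers above can be driven directly on
raw parameter/gradient vectors. The snippet below is a minimal illustration, assuming
vec_t behaves like a std::vector of float_t as provided by tiny_dnn/util/util.h; the
include path and the numeric values are placeholders, not part of the library listing.

    // Illustrative only -- adjust the include path to wherever optimizer.h lives in your tree.
    #include "tiny_dnn/optimizers/optimizer.h"

    int main() {
        using namespace tiny_dnn;

        vec_t W  = {float_t(0.5), float_t(-0.3), float_t(0.1)};    // parameters, updated in place
        vec_t dW = {float_t(0.02), float_t(-0.01), float_t(0.03)}; // gradient of the loss w.r.t. W

        gradient_descent sgd;       // plain SGD; alpha and lambda are public members
        sgd.alpha = float_t(0.01);  // learning rate
        sgd.update(dW, W, false);   // one step, serial (parallelize == false)

        adam adm;                   // stateful optimizer: keeps mt/vt per weight vector
        adm.update(dW, W, false);   // first Adam step for this weight vector
        adm.reset();                // drops the accumulated per-weight state
        return 0;
    }

Because stateful optimizers key their internal state on the address of the weight vector
passed to update(), one optimizer instance can be reused across several distinct weight
vectors, but each vector should keep a stable address between calls.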