#include "tiny_dnn/util/util.h"

#include <algorithm>
#include <cmath>
#include <unordered_map>
// base class of optimizer
// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function)
struct optimizer {
    optimizer() = default;
    optimizer(const optimizer &) = default;
#ifndef CNN_DEFAULT_MOVE_CONSTRUCTOR_UNAVAILABLE
    optimizer(optimizer &&) = default;
#endif
    optimizer &operator=(const optimizer &) = default;
#ifndef CNN_DEFAULT_ASSIGNMENT_OPERATOR_UNAVAILABLE
    optimizer &operator=(optimizer &&) = default;
#endif
    virtual ~optimizer() = default;

    // update the weight vector W using its gradient dW
    virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0;

    // hook for clearing any per-weight state; the default does nothing
    virtual void reset() {}
};
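// --- illustrative sketch, not part of the original header ---
// Any new update rule only needs to derive from optimizer and implement
// update(). The name clipped_sgd and its clip parameter are invented here
// purely for illustration, assuming vec_t is a std::vector-like container
// of float_t:
struct clipped_sgd : public optimizer {
    clipped_sgd() : alpha(float_t(0.01)), clip(float_t(1)) {}

    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            // clamp each gradient component, then take a plain SGD step
            float_t g = std::max(-clip, std::min(clip, dW[i]));
            W[i] -= alpha * g;
        });
    }

    float_t alpha;  // learning rate
    float_t clip;   // per-component gradient clip threshold
};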
// helper class that stores N auxiliary vectors per weight vector
template <int N>
struct stateful_optimizer : public optimizer {
    void reset() override {
        for (auto &e : E_) e.clear();
    }

protected:
    // return the Index-th state vector associated with key, lazily
    // allocated with the same size as key and zero-initialized
    template <int Index>
    vec_t &get(const vec_t &key) {
        static_assert(Index < N, "index out of range");
        if (E_[Index][&key].empty())
            E_[Index][&key].resize(key.size(), float_t());
        return E_[Index][&key];
    }

    std::unordered_map<const vec_t *, vec_t> E_[N];
};
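// --- illustrative sketch, not part of the original header ---
// stateful_optimizer keys its state on the address of each weight vector,
// so every parameter blob gets independent, lazily-allocated slots. A toy
// subclass (the name running_sum is invented for illustration):
struct running_sum : public stateful_optimizer<1> {
    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        vec_t &sum = get<0>(W);  // created on first use, sized like W
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            sum[i] += dW[i];                  // persists across calls
            W[i]   -= float_t(0.01) * sum[i];
        });
    }
};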
// adaptive gradient method
struct adagrad : public stateful_optimizer<1> {
    adagrad() : alpha(float_t(0.01)), eps(float_t(1e-8)) {}

    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        vec_t &g = get<0>(W);  // accumulated squared gradients
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            g[i] += dW[i] * dW[i];
            W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps);
        });
    }

    float_t alpha;  // learning rate
private:
    float_t eps;    // small constant to avoid division by zero
};
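// --- usage sketch, assuming vec_t behaves like std::vector<float_t> ---
// One adagrad step: g accumulates dW*dW, so frequently-updated components
// receive progressively smaller steps.
inline void adagrad_example() {
    adagrad opt;  // alpha = 0.01, eps = 1e-8 by default
    vec_t W  = {float_t(0.5), float_t(-0.3)};
    vec_t dW = {float_t(0.1), float_t(0.2)};
    opt.update(dW, W, /*parallelize=*/false);
    // W[0] ~ 0.5 - 0.01 * 0.1 / (sqrt(0.01) + 1e-8) ~ 0.49
}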
// RMSprop
struct RMSprop : public stateful_optimizer<1> {
    RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {}

    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        vec_t &g = get<0>(W);  // leaky average of squared gradients

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i];
            W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps);
        });
    }

    float_t alpha;  // learning rate
    float_t mu;     // decay term
private:
    float_t eps;    // small constant to avoid division by zero
};
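// --- usage sketch, assuming vec_t behaves like std::vector<float_t> ---
// Unlike adagrad's ever-growing accumulator, g here is a leaky average
// (decay mu), so step sizes can recover after a burst of large gradients.
inline void rmsprop_example() {
    RMSprop opt;  // alpha = 0.0001, mu = 0.99 by default
    vec_t W(2, float_t(0)), dW(2, float_t(1));
    for (int step = 0; step < 3; ++step)
        opt.update(dW, W, false);  // g approaches 1 - mu^t under constant dW
    opt.reset();                   // drops the per-weight state entirely
}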
// adam: Kingma & Ba, "Adam: a method for stochastic optimization" (2015)
struct adam : public stateful_optimizer<2> {
    adam() : alpha(float_t(0.001)), b1(float_t(0.9)), b2(float_t(0.999)),
             b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {}

    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        vec_t &mt = get<0>(W);  // first moment estimate
        vec_t &vt = get<1>(W);  // second (raw) moment estimate
        b1_t *= b1;  // advance the bias-correction powers b1^t, b2^t
        b2_t *= b2;
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i];
            vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i];
            // bias-corrected step
            W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) /
                    std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps);
        });
    }

    float_t alpha;       // learning rate
    float_t b1, b2;      // decay terms for the first/second moments
    float_t b1_t, b2_t;  // running powers of b1, b2
private:
    float_t eps;         // small constant to avoid division by zero
};
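// --- usage sketch, assuming vec_t behaves like std::vector<float_t> ---
// Dividing by (1 - b1_t) and (1 - b2_t) rescales the zero-initialized
// moment estimates, so early steps are not artificially small.
inline void adam_example() {
    adam opt;  // alpha = 0.001, b1 = 0.9, b2 = 0.999 by default
    vec_t W(1, float_t(0)), dW(1, float_t(0.5));
    opt.update(dW, W, false);
    // without bias correction the first step would be scaled by roughly
    // (1 - b1), i.e. an order of magnitude too small
}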
// SGD without momentum (optional L2 weight decay via lambda)
struct gradient_descent : public optimizer {
    gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {}

    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            W[i] = W[i] - alpha * (dW[i] + lambda * W[i]);
        });
    }

    float_t alpha;   // learning rate
    float_t lambda;  // weight decay
};
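// --- usage sketch, assuming vec_t behaves like std::vector<float_t> ---
// With lambda > 0 the term dW + lambda * W is plain L2 weight decay,
// equivalent to adding (lambda / 2) * ||W||^2 to the loss.
inline void sgd_example() {
    gradient_descent opt;
    opt.alpha  = float_t(0.1);
    opt.lambda = float_t(0.01);  // shrinks weights toward zero each step
    vec_t W(1, float_t(1)), dW(1, float_t(0));
    opt.update(dW, W, false);    // W[0] becomes 1 - 0.1 * 0.01 = 0.999
}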
// SGD with momentum (optional L2 weight decay via lambda)
struct momentum : public stateful_optimizer<1> {
    momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {}

    void update(const vec_t &dW, vec_t &W, bool parallelize) override {
        vec_t &dWprev = get<0>(W);  // previous update (velocity)

        for_i(parallelize, static_cast<int>(W.size()), [&](int i) {
            float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda);
            W[i]      += V;  // apply the velocity
            dWprev[i]  = V;  // and remember it for the next step
        });
    }

    float_t alpha;   // learning rate
    float_t lambda;  // weight decay
    float_t mu;      // momentum coefficient
};
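// --- usage sketch, assuming vec_t behaves like std::vector<float_t> ---
// Under a constant gradient the velocity V converges to
// -alpha * dW / (1 - mu), a 10x larger effective step for the default mu = 0.9.
inline void momentum_example() {
    momentum opt;  // alpha = 0.01, mu = 0.9 by default
    vec_t W(1, float_t(0)), dW(1, float_t(1));
    for (int step = 0; step < 100; ++step)
        opt.update(dW, W, false);  // per-step displacement tends to -0.1
}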