tiny_dnn 1.0.0
A header-only, dependency-free deep learning framework in C++11
loss_function.h
/*
    Copyright (c) 2013, Taiga Nomi
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
    names of its contributors may be used to endorse or promote products
    derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
    EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "tiny_dnn/util/util.h"

namespace tiny_dnn {

// mean-squared-error loss function for regression
class mse {
public:
    static float_t f(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        float_t d = 0.0;

        for(serial_size_t i = 0; i < y.size(); ++i)
            d += (y[i] - t[i]) * (y[i] - t[i]);

        return d/y.size();
    }

    static vec_t df(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        vec_t d(t.size());
        float_t factor = float_t(2) / static_cast<float_t>(t.size());

        for(serial_size_t i = 0; i < y.size(); ++i)
            d[i] = factor * (y[i] - t[i]);

        return d;
    }
};
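
// Annotation (not part of the original header): mse computes the mean squared
// error E = (1/N) * sum_i (y[i] - t[i])^2, and df returns its derivative with
// respect to each prediction, dE/dy[i] = (2/N) * (y[i] - t[i]). A minimal
// usage sketch with hypothetical values:
//
//   vec_t y = { float_t(0.2), float_t(0.8) };  // network output
//   vec_t t = { float_t(0.0), float_t(1.0) };  // target
//   float_t loss = mse::f(y, t);   // (0.04 + 0.04) / 2 = 0.04
//   vec_t   grad = mse::df(y, t);  // { 0.2, -0.2 }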

// absolute loss function for regression
class absolute {
public:
    static float_t f(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        float_t d = float_t(0);

        for(serial_size_t i = 0; i < y.size(); ++i)
            d += std::abs(y[i] - t[i]);

        return d/y.size();
    }

    static vec_t df(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        vec_t d(t.size());
        float_t factor = float_t(1) / static_cast<float_t>(t.size());

        for(serial_size_t i = 0; i < y.size(); ++i) {
            float_t sign = y[i] - t[i];
            if(sign < 0.f)
                d[i] = -float_t(1) * factor;
            else if(sign > 0.f)
                d[i] = float_t(1) * factor;
            else
                d[i] = float_t(0);
        }

        return d;
    }
};
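
// Annotation (not part of the original header): absolute computes the mean
// absolute error E = (1/N) * sum_i |y[i] - t[i]|; its derivative is
// sign(y[i] - t[i]) / N, and zero where prediction and target coincide.
// A minimal usage sketch with hypothetical values:
//
//   vec_t y = { float_t(0.5), float_t(1.0) };
//   vec_t t = { float_t(0.0), float_t(1.0) };
//   float_t loss = absolute::f(y, t);   // (0.5 + 0.0) / 2 = 0.25
//   vec_t   grad = absolute::df(y, t);  // { 0.5, 0.0 }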

// absolute loss with epsilon range for regression
// epsilon range [-eps, eps] with eps = 1./fraction
template<int fraction>
class absolute_eps {
public:
    static float_t f(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        float_t d = float_t(0);
        const float_t eps = float_t(1) / fraction;

        for(serial_size_t i = 0; i < y.size(); ++i) {
            float_t diff = std::abs(y[i] - t[i]);
            if(diff > eps)
                d += diff;
        }
        return d / y.size();
    }

    static vec_t df(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        vec_t d(t.size());
        const float_t factor = float_t(1) / static_cast<float_t>(t.size());
        const float_t eps = float_t(1) / fraction;

        for(serial_size_t i = 0; i < y.size(); ++i) {
            float_t sign = y[i] - t[i];
            if(sign < -eps)
                d[i] = -float_t(1) * factor;
            else if(sign > eps)
                d[i] = float_t(1) * factor;
            else
                d[i] = 0.f;
        }
        return d;
    }
};
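
// Annotation (not part of the original header): absolute_eps behaves like the
// absolute loss, but differences whose magnitude is within eps = 1/fraction
// are ignored by both f and df, so small residuals produce no loss and no
// gradient. Sketch with a hypothetical tolerance of 0.01:
//
//   using loss_t = absolute_eps<100>;   // eps = 1/100 = 0.01
//   float_t loss = loss_t::f(y, t);     // sums only the |y[i]-t[i]| > 0.01 terms
//   vec_t   grad = loss_t::df(y, t);    // +-1/N outside [-eps, eps], else 0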

// cross-entropy loss function for (multiple independent) binary classifications
class cross_entropy {
public:
    static float_t f(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        float_t d = float_t(0);

        for(serial_size_t i = 0; i < y.size(); ++i)
            d += -t[i] * std::log(y[i]) - (float_t(1) - t[i]) * std::log(float_t(1) - y[i]);

        return d;
    }

    static vec_t df(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        vec_t d(t.size());

        for(serial_size_t i = 0; i < y.size(); ++i)
            d[i] = (y[i] - t[i]) / (y[i] * (float_t(1) - y[i]));

        return d;
    }
};
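
// Annotation (not part of the original header): cross_entropy implements the
// binary cross-entropy E = -sum_i [ t[i]*log(y[i]) + (1 - t[i])*log(1 - y[i]) ],
// summed over independent binary outputs; df returns
// dE/dy[i] = (y[i] - t[i]) / (y[i] * (1 - y[i])). The predictions y[i] must lie
// strictly inside (0, 1) -- e.g. the output of a sigmoid activation -- or the
// log terms and the division are undefined.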

// cross-entropy loss function for multi-class classification
class cross_entropy_multiclass {
public:
    static float_t f(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        float_t d = 0.0;

        for(serial_size_t i = 0; i < y.size(); ++i)
            d += -t[i] * std::log(y[i]);

        return d;
    }

    static vec_t df(const vec_t& y, const vec_t& t) {
        assert(y.size() == t.size());
        vec_t d(t.size());

        for(serial_size_t i = 0; i < y.size(); ++i)
            d[i] = - t[i] / y[i];

        return d;
    }
};
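
// Annotation (not part of the original header): cross_entropy_multiclass
// implements E = -sum_i t[i]*log(y[i]) with gradient dE/dy[i] = -t[i]/y[i].
// It expects y to be a probability distribution over classes (e.g. a softmax
// output) and t to be the one-hot target vector; every y[i] must be strictly
// positive for both f and df to stay finite.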

template <typename E>
vec_t gradient(const vec_t& y, const vec_t& t) {
    assert(y.size() == t.size());
    return E::df(y, t);
}

template <typename E>
std::vector<vec_t> gradient(const std::vector<vec_t>& y, const std::vector<vec_t>& t) {
    std::vector<vec_t> grads;

    assert(y.size() == t.size());

    for (serial_size_t i = 0; i < y.size(); i++)
        grads.push_back(gradient<E>(y[i], t[i]));

    return grads;
}
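
// Annotation (not part of the original header): the two gradient<E> overloads
// dispatch to E::df for a single sample and for a vector of samples; the loss
// class is selected at compile time via the template argument. A minimal
// sketch with hypothetical data:
//
//   vec_t y = { float_t(0.3), float_t(0.7) };   // predicted
//   vec_t t = { float_t(0.0), float_t(1.0) };   // expected
//   vec_t g = gradient<mse>(y, t);              // same as mse::df(y, t)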

inline void apply_cost_if_defined(std::vector<vec_t>& sample_gradient,
                                  const std::vector<vec_t>& sample_cost) {
    if (sample_gradient.size() == sample_cost.size()) {
        // @todo consider adding parallelism
        const serial_size_t channel_count = static_cast<serial_size_t>(sample_gradient.size());
        for (size_t channel = 0; channel < channel_count; ++channel) {
            if (sample_gradient[channel].size() == sample_cost[channel].size()) {
                const size_t element_count = sample_gradient[channel].size();

                // @todo optimize? (use AVX or so)
                for (size_t element = 0; element < element_count; ++element) {
                    sample_gradient[channel][element] *= sample_cost[channel][element];
                }
            }
        }
    }
}
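
// Annotation (not part of the original header): apply_cost_if_defined scales a
// per-sample gradient element-wise by user-supplied cost coefficients. The
// multiplication is applied only where the shapes match (same number of
// channels, and per channel the same number of elements); otherwise the
// gradient is left untouched, which is how an empty cost tensor means
// "no reweighting".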

// gradient for a minibatch
template <typename E>
std::vector<tensor_t> gradient(const std::vector<tensor_t>& y,
                               const std::vector<tensor_t>& t,
                               const std::vector<tensor_t>& t_cost) {

    const serial_size_t sample_count = static_cast<serial_size_t>(y.size());
    const serial_size_t channel_count = static_cast<serial_size_t>(y[0].size());

    std::vector<tensor_t> gradients(sample_count);

    CNN_UNREFERENCED_PARAMETER(channel_count);
    assert(y.size() == t.size());
    assert(t_cost.empty() || t_cost.size() == t.size());

    // @todo add parallelism
    for (serial_size_t sample = 0; sample < sample_count; ++sample) {
        assert(y[sample].size() == channel_count);
        assert(t[sample].size() == channel_count);
        assert(t_cost.empty() || t_cost[sample].empty() ||
               t_cost[sample].size() == channel_count);

        gradients[sample] = gradient<E>(y[sample], t[sample]);

        if (sample < t_cost.size()) {
            apply_cost_if_defined(gradients[sample], t_cost[sample]);
        }
    }

    return gradients;
}
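
// Annotation (not part of the original header): the minibatch overload applies
// gradient<E> sample by sample and, when a matching entry exists in t_cost,
// reweights that sample's gradient. A minimal sketch with hypothetical tensors:
//
//   std::vector<tensor_t> y = ...;   // network outputs, one tensor per sample
//   std::vector<tensor_t> t = ...;   // targets, same shape as y
//   std::vector<tensor_t> no_cost;   // empty: no per-element reweighting
//   std::vector<tensor_t> grads = gradient<cross_entropy>(y, t, no_cost);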

} // namespace tiny_dnn