#include "tiny_dnn/core/framework/device.fwd.h"

#if defined(USE_OPENCL) || defined(USE_CUDA)
#ifdef USE_OPENCL
#include "third_party/CLCudaAPI/clpp11.h"
#else
#include "third_party/CLCudaAPI/cupp11.h"
#endif
#endif

namespace tiny_dnn {
template<typename U = float_t>
class Tensor {
 public:
    // Creates a tensor of the given dimensions; elements are zero-initialized
    // by reshape().
    explicit Tensor(const size_t d0,
                    const size_t d1,
                    const size_t d2,
                    const size_t d3) {
        reshape(d0, d1, d2, d3);
    }
    explicit Tensor(const std::array<size_t, 4>& shape) {
        reshape(shape[0], shape[1], shape[2], shape[3]);
    }
    explicit Tensor(const std::vector<size_t>& shape) {
        assert(shape.size() == 4);
        reshape(shape[0], shape[1], shape[2], shape[3]);
    }
    Tensor(const Tensor& other) {
        shape_ = other.shape_;
        host_data_ = other.host_data_;
        data_is_on_host_ = true;
        data_dirty_ = true;
        // device_data_ is intentionally left uninitialized; it is recreated
        // lazily the next time the data is moved to the device.
    }
    Tensor &operator = (const Tensor& other) {
        shape_ = other.shape_;
        data_is_on_host_ = true;
        data_dirty_ = true;
        host_data_ = other.host_data_;
        // device_data_ is kept as-is; it is overwritten lazily the next time
        // the data is moved to the device.
        return *this;
    }
#ifdef CNN_USE_DEFAULT_MOVE_CONSTRUCTORS
    Tensor(Tensor&& other) = default;        // move constructor
    Tensor &operator = (Tensor&&) = default; // move assignment
#else
    // Compilers without defaulted move members (e.g. VS2013) need them
    // written out by hand.
    Tensor(Tensor&& other) {
        shape_ = std::move(other.shape_);
        host_data_ = std::move(other.host_data_);
#if defined(USE_OPENCL) || defined(USE_CUDA)
        device_data_ = std::move(other.device_data_);
#endif
        data_is_on_host_ = other.data_is_on_host_;
        data_dirty_ = other.data_dirty_;
    }
    Tensor &operator = (Tensor&& other) {
        shape_ = std::move(other.shape_);
        host_data_ = std::move(other.host_data_);
#if defined(USE_OPENCL) || defined(USE_CUDA)
        device_data_ = std::move(other.device_data_);
#endif
        data_is_on_host_ = other.data_is_on_host_;
        data_dirty_ = other.data_dirty_;
        return *this;
    }
#endif  // CNN_USE_DEFAULT_MOVE_CONSTRUCTORS
    // Returns the tensor shape as { d0, d1, d2, d3 }.
    const std::array<size_t, 4>& shape() const { return shape_; }
    // Checked element access: throws nn_error for out-of-range indices.
    U& host_at(const size_t d0,
               const size_t d1,
               const size_t d2,
               const size_t d3) {
        return *host_ptr(d0, d1, d2, d3);
    }
    U host_at(const size_t d0,
              const size_t d1,
              const size_t d2,
              const size_t d3) const {
        return *host_ptr(d0, d1, d2, d3);
    }
    // Checked pointer access: returns the address of element (d0, d1, d2, d3)
    // in the host buffer, throwing nn_error for out-of-range indices.
    const U* host_ptr(const size_t d0,
                      const size_t d1,
                      const size_t d2,
                      const size_t d3) const {
        if (d0 >= shape_[0] || d1 >= shape_[1] ||
            d2 >= shape_[2] || d3 >= shape_[3]) {
            throw nn_error("Access tensor out of range.");
        }

        return host_data() + (
            shape_[1] * shape_[2] * shape_[3] * d0 +
            shape_[1] * shape_[2] * d3 +
            shape_[1] * d2 +
            d1);
    }
    U* host_ptr(const size_t d0,
                const size_t d1,
                const size_t d2,
                const size_t d3) {
        if (d0 >= shape_[0] || d1 >= shape_[1] ||
            d2 >= shape_[2] || d3 >= shape_[3]) {
            throw nn_error("Access tensor out of range.");
        }

        return mutable_host_data() + (
            shape_[1] * shape_[2] * shape_[3] * d0 +
            shape_[1] * shape_[2] * d3 +
            shape_[1] * d2 +
            d1);
    }
    const U* host_data() const {
        fromDevice();  // sync back from the device before exposing host memory
        return host_data_.data();
    }

    U* mutable_host_data() {
        fromDevice();
        data_dirty_ = true;
        return host_data_.data();
    }
#if defined(USE_OPENCL) || defined(USE_CUDA)
    const void *device_data() const {
        toDevice();  // push the host copy to the device if it is stale
        return (*device_data_)();
    }

    void *mutable_device_data() {
        toDevice();
        data_dirty_ = true;
        return (*device_data_)();
    }
#endif
    size_t size() const {
        return host_data_.size();
    }

    void fill(U value) {
        data_is_on_host_ = true;
        data_dirty_ = true;
        std::fill(std::begin(host_data_), std::end(host_data_), value);
    }
    void reshape(const size_t d0,
                 const size_t d1,
                 const size_t d2,
                 const size_t d3) {
        shape_[0] = d0;
        shape_[1] = d1;
        shape_[2] = d2;
        shape_[3] = d3;
        host_data_.resize(calcSize(), U(0));
    }

    void reshape(const std::array<size_t, 4> &sz) {
        shape_ = sz;
        host_data_.resize(calcSize(), U(0));
    }
 private:
    size_t calcSize() const {
        return std::accumulate(std::begin(shape_), std::end(shape_),
                               size_t(1), std::multiplies<size_t>());
    }
    void toDevice() const {
        if (data_is_on_host_ && data_dirty_) {
#if defined(USE_OPENCL) || defined(USE_CUDA)
            CLCudaAPI::Queue queue = device_->queue();
            if (device_data_ && device_data_->GetSize() >= host_data_.size()) {
                device_data_->Write(queue, host_data_.size(), host_data_.data(), 0);
            } else {
                CLCudaAPI::Context ctx = device_->context();
                device_data_ = make_unique<CLCudaAPI::Buffer<U> >(
                    ctx, queue, host_data_.begin(), host_data_.end());
            }
#endif
            data_is_on_host_ = false;
            data_dirty_ = false;
        }
    }
    void fromDevice() const {
        if (!data_is_on_host_ && data_dirty_) {
#if defined(USE_OPENCL) || defined(USE_CUDA)
            assert(device_);
            assert(device_data_);
            device_data_->Read(device_->queue(), host_data_.size(),
                // const_cast avoids having to make host_data_ itself mutable
                const_cast<U*>(host_data_.data()));
#endif
            data_is_on_host_ = true;
            data_dirty_ = false;
        }
    }
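    /**
     * Synchronization sketch (editor illustration, not original documentation):
     * host_data()/mutable_host_data() call fromDevice() and
     * device_data()/mutable_device_data() call toDevice(), so copies happen
     * only when the dirty flag says the other side is stale. The example
     * assumes the tensor has already been bound to a Device; that binding
     * mechanism is outside this header.
     *
     * @code
     * Tensor<float_t> t(1, 2, 2, 1);
     * t.mutable_host_data()[0] = float_t(1);  // marks the host copy dirty
     * #if defined(USE_OPENCL) || defined(USE_CUDA)
     * t.device_data();                        // one host->device copy
     * t.device_data();                        // no copy: device copy is current
     * #endif
     * @endcode
     */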
 private:
    /** A tensor holds data in a C-style nD array, i.e. in row-major order:
     *  the rightmost index varies the fastest. */
    std::array<size_t, 4> shape_;

    /** Tensor data in host memory, aligned for vectorized access */
    std::vector<U, aligned_allocator<U, 64> > host_data_;

#if defined(USE_OPENCL) || defined(USE_CUDA)
    /** Tensor data in device memory; mutable so the const toDevice()/
     *  fromDevice() synchronization can manage it lazily */
    mutable std::unique_ptr<CLCudaAPI::Buffer<U> > device_data_;
#endif
    mutable bool data_is_on_host_;  //< true if the authoritative copy is on the host
    mutable bool data_dirty_;       //< true if that copy has unmirrored modifications

    /** Device on which the buffer lives when it is not on the host */
    Device* device_;
};
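/**
 * Usage sketch (editor illustration, not part of the original header): builds
 * a small batch tensor and exercises the checked accessors above. The
 * dimension order (batch, height, width, channels) is inferred from the
 * operator<< printer below and may differ elsewhere in the library.
 *
 * @code
 * Tensor<float_t> t(2, 3, 3, 1);          // 2 samples, 3x3 spatial, 1 channel
 * t.fill(float_t(0.5));                   // every element becomes 0.5
 * t.host_at(0, 1, 2, 0) = float_t(42);    // checked write through host_ptr()
 * assert(t.size() == 2 * 3 * 3 * 1);
 * t.reshape(1, 3, 3, 2);                  // same element count, new shape
 * @endcode
 */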
// Overloaded operator to print a Tensor to a standard output stream.
template<typename T>
inline std::ostream& operator<< (std::ostream &os,
                                 const Tensor<T>& tensor) {
    const std::array<size_t, 4>& shape = tensor.shape();
    for (size_t i = 0; i < shape[0]; ++i) {
        os << "-- Batch: " << i << "\n";
        for (size_t j = 0; j < shape[3]; ++j) {
            os << "-- Channel: " << j << "\n";
            os << "-- Data:\n";
            for (size_t k = 0; k < shape[1]; ++k) {
                for (size_t l = 0; l < shape[2]; ++l) {
                    os << " " << tensor.host_at(i, k, l, j) << " ";
                }
                os << "\n";
            }
        }
    }
    os << "----------------\n"
       << "--> Tensor size: [ "
       << shape[0] << " x " << shape[1] << " x "
       << shape[2] << " x " << shape[3] << " ]\n";
    return os;
}
template<typename TD, typename TS1, typename TS2, typename F>
void binary_tensor_tensor_elementwise_operation(Tensor<TD> &dst,
                                                const Tensor<TS1> &src1,
                                                const Tensor<TS2> &src2,
                                                F f) {
    if (src1.shape() != src2.shape()) {
        throw nn_error("Tensor must have same shape");
    }

    dst.reshape(src1.shape());

    TD* pdst = dst.mutable_host_data();
    const TS1* psrc1 = src1.host_data();
    const TS2* psrc2 = src2.host_data();

    // for_i runs the loop body over [0, size), optionally in parallel.
    for_i(true, dst.size(), [pdst, psrc1, psrc2, &f](size_t i) {
        pdst[i] = f(psrc1[i], psrc2[i]);
    });
}
template<typename TD, typename TS, typename F>
void unary_tensor_elementwise_operation(Tensor<TD> &dst,
                                        const Tensor<TS> &src,
                                        F f) {
    dst.reshape(src.shape());

    TD* pdst = dst.mutable_host_data();
    const TS* psrc = src.host_data();

    for_i(true, dst.size(), [pdst, psrc, &f](size_t i) {
        pdst[i] = f(psrc[i]);
    });
}
template<typename TD, typename TS1, typename TS2, typename F>
void binary_tensor_scalar_operation(Tensor<TD> &dst,
                                    const Tensor<TS1> &src1,
                                    TS2 src2,
                                    F f) {
    dst.reshape(src1.shape());

    TD* pdst = dst.mutable_host_data();
    const TS1* psrc1 = src1.host_data();

    for_i(true, dst.size(), [pdst, psrc1, src2, &f](size_t i) {
        pdst[i] = f(psrc1[i], src2);
    });
}
template<typename TD, typename TS1, typename TS2, typename F>
void binary_scalar_tensor_operation(Tensor<TD> &dst,
                                    TS1 src1,
                                    const Tensor<TS2> &src2,
                                    F f) {
    dst.reshape(src2.shape());

    TD* pdst = dst.mutable_host_data();
    const TS2* psrc2 = src2.host_data();

    for_i(true, dst.size(), [pdst, src1, psrc2, &f](size_t i) {
        pdst[i] = f(src1, psrc2[i]);
    });
}
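/**
 * Sketch of calling the generic helpers directly (editor illustration): any
 * callable taking the two element types can be plugged in, which is how the
 * layer_* wrappers below are built. The saturating-add lambda is a made-up
 * example, not part of the library; it assumes <algorithm> for std::min.
 *
 * @code
 * Tensor<float_t> a(1, 2, 2, 1), b(1, 2, 2, 1), out;
 * a.fill(float_t(0.75));
 * b.fill(float_t(0.50));
 * binary_tensor_tensor_elementwise_operation(out, a, b,
 *     [](float_t x, float_t y) { return std::min(x + y, float_t(1)); });
 * @endcode
 */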
namespace details {

template<typename TS1, typename TS2>
auto plus(TS1 s1, TS2 s2) -> decltype(s1 + s2) { return s1 + s2; }
template<typename TS1, typename TS2>
auto minus(TS1 s1, TS2 s2) -> decltype(s1 - s2) { return s1 - s2; }
template<typename TS1, typename TS2>
auto multiplies(TS1 s1, TS2 s2) -> decltype(s1 * s2) { return s1 * s2; }
template<typename TS1, typename TS2>
auto divides_checked(TS1 s1, TS2 s2) -> decltype(s1 / s2) {
    typedef decltype(s1 / s2) result_type;
    return (s2 == result_type{}) ? std::numeric_limits<result_type>::quiet_NaN()
                                 : s1 / s2;
}
template<typename TS1, typename TS2>
auto divides_unchecked(TS1 s1, TS2 s2) -> decltype(s1 / s2) {
    return s1 / s2;
}
template<typename T>
T sqrt_checked(T s1) {
    return (s1 <= T{}) ? std::numeric_limits<T>::quiet_NaN() : sqrt(s1);
}
template<typename T>
T exp(T s1) {
    return std::exp(s1);
}

}  // namespace details
template<typename TD, typename TS1, typename TS2>
void layer_add(Tensor<TD> &dst, TS1 src1, const Tensor<TS2> &src2) {
    binary_scalar_tensor_operation(dst, src1, src2, details::plus<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_add(Tensor<TD> &dst, const Tensor<TS1> &src1, TS2 src2) {
    binary_tensor_scalar_operation(dst, src1, src2, details::plus<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_add(Tensor<TD> &dst, const Tensor<TS1> &src1, const Tensor<TS2> &src2) {
    binary_tensor_tensor_elementwise_operation(dst, src1, src2,
                                               details::plus<TS1, TS2>);
}
template<typename TD, typename TS1, typename TS2>
void layer_sub(Tensor<TD> &dst, TS1 src1, const Tensor<TS2> &src2) {
    binary_scalar_tensor_operation(dst, src1, src2, details::minus<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_sub(Tensor<TD> &dst, const Tensor<TS1> &src1, TS2 src2) {
    binary_tensor_scalar_operation(dst, src1, src2, details::minus<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_sub(Tensor<TD> &dst, const Tensor<TS1> &src1, const Tensor<TS2> &src2) {
    binary_tensor_tensor_elementwise_operation(dst, src1, src2,
                                               details::minus<TS1, TS2>);
}
template<typename TD, typename TS1, typename TS2>
void layer_mul(Tensor<TD> &dst, TS1 src1, const Tensor<TS2> &src2) {
    binary_scalar_tensor_operation(dst, src1, src2, details::multiplies<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_mul(Tensor<TD> &dst, const Tensor<TS1> &src1, TS2 src2) {
    binary_tensor_scalar_operation(dst, src1, src2, details::multiplies<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_mul(Tensor<TD> &dst, const Tensor<TS1> &src1, const Tensor<TS2> &src2) {
    binary_tensor_tensor_elementwise_operation(dst, src1, src2,
                                               details::multiplies<TS1, TS2>);
}
template<typename TD, typename TS1, typename TS2>
void layer_div(Tensor<TD> &dst, TS1 src1, const Tensor<TS2> &src2) {
    binary_scalar_tensor_operation(dst, src1, src2, details::divides_checked<TS1, TS2>);
}

template<typename TD, typename TS1, typename TS2>
void layer_div(Tensor<TD> &dst, const Tensor<TS1> &src1, TS2 src2) {
    if (src2 == TS2(0.0)) {
        dst.reshape(src1.shape());
        dst.fill(std::numeric_limits<TD>::quiet_NaN());
    } else {
        // The zero case is handled above, so the per-element check can be skipped.
        binary_tensor_scalar_operation(dst, src1, src2, details::divides_unchecked<TS1, TS2>);
    }
}

template<typename TD, typename TS1, typename TS2>
void layer_div(Tensor<TD> &dst, const Tensor<TS1> &src1, const Tensor<TS2> &src2) {
    binary_tensor_tensor_elementwise_operation(dst, src1, src2,
                                               details::divides_checked<TS1, TS2>);
}
template<typename TD, typename TS>
void layer_sqrt(Tensor<TD> &dst, const Tensor<TS> &src1) {
    return unary_tensor_elementwise_operation(dst, src1, details::sqrt_checked<TS>);
}

template<typename TD, typename TS>
void layer_exp(Tensor<TD> &dst, const Tensor<TS> &src1) {
    return unary_tensor_elementwise_operation(dst, src1, details::exp<TS>);
}

}  // namespace tiny_dnn
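/**
 * End-to-end sketch of the layer_* wrappers (editor illustration, not part of
 * the original header). Assumes this file is included as
 * "tiny_dnn/core/framework/tensor.h".
 *
 * @code
 * using tiny_dnn::Tensor;
 * using tiny_dnn::float_t;
 *
 * Tensor<float_t> a(1, 2, 2, 1), b(1, 2, 2, 1), sum, ratio, root;
 * a.fill(float_t(2));
 * b.fill(float_t(4));
 *
 * tiny_dnn::layer_add(sum, a, b);              // every element == 6
 * tiny_dnn::layer_div(ratio, a, float_t(0));   // division by zero -> NaN fill
 * tiny_dnn::layer_sqrt(root, b);               // every element == 2
 * @endcode
 */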