Actual source code: veckok.kokkos.cxx

  1: /*
  2:    Implements the sequential Kokkos vectors.
  3: */
  4: #include <petscvec_kokkos.hpp>

  6: #include <petsc/private/sfimpl.h>
  7: #include <petsc/private/petscimpl.h>
  8: #include <petscmath.h>
  9: #include <petscviewer.h>
 10: #include <KokkosBlas.hpp>

 12: #include <petscerror.h>
 13: #include <../src/vec/vec/impls/dvecimpl.h>
 14: #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>

 16: #if defined(PETSC_USE_DEBUG)
 17: #define VecErrorIfNotKokkos(v)                                                                 \
 18:   do {                                                                                         \
 19:     PetscBool isKokkos = PETSC_FALSE;                                                          \
 20:     PetscObjectTypeCompareAny((PetscObject)(v),&isKokkos,VECSEQKOKKOS,VECMPIKOKKOS,VECKOKKOS,""); \
 22:   } while (0)
 23: #else
 24: #define VecErrorIfNotKokkos(v) do {(void)(v);} while (0)
 25: #endif

 27: template<class MemorySpace>
 28: PetscErrorCode VecGetKokkosView_Private(Vec v,PetscScalarKokkosViewType<MemorySpace>* kv,PetscBool overwrite)
 29: {
 30:   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);

 32:   VecErrorIfNotKokkos(v);
 33:   if (!overwrite) veckok->v_dual.sync<MemorySpace>(); /* If overwrite=true, no need to sync the space, since caller will overwrite the data */
 34:   *kv = veckok->v_dual.view<MemorySpace>();
 35:   return 0;
 36: }

 38: template<class MemorySpace>
 39: PetscErrorCode VecRestoreKokkosView_Private(Vec v,PetscScalarKokkosViewType<MemorySpace>* kv,PetscBool overwrite)
 40: {
 41:   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);

 43:   VecErrorIfNotKokkos(v);
 44:   if (overwrite) veckok->v_dual.clear_sync_state(); /* If overwrite=true, clear the old sync state since user forced an overwrite */
 45:   veckok->v_dual.modify<MemorySpace>();
 46:   PetscObjectStateIncrease((PetscObject)v);
 47:   return 0;
 48: }

 50: template<class MemorySpace>
 51: PetscErrorCode VecGetKokkosView(Vec v,ConstPetscScalarKokkosViewType<MemorySpace>* kv)
 52: {
 53:   Vec_Kokkos *veckok = static_cast<Vec_Kokkos*>(v->spptr);
 54:   VecErrorIfNotKokkos(v);
 55:   veckok->v_dual.sync<MemorySpace>(); /* Sync the space for caller to read */
 56:   *kv = veckok->v_dual.view<MemorySpace>();
 57:   return 0;
 58: }

 60: /* Function template explicit instantiation */
 61: template   PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView         (Vec,ConstPetscScalarKokkosView*);
 62: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView         (Vec v,PetscScalarKokkosView* kv) {return VecGetKokkosView_Private(v,kv,PETSC_FALSE);}
 63: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView     (Vec v,PetscScalarKokkosView* kv) {return VecRestoreKokkosView_Private(v,kv,PETSC_FALSE);}
 64: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite    (Vec v,PetscScalarKokkosView* kv) {return VecGetKokkosView_Private(v,kv,PETSC_TRUE);}
 65: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v,PetscScalarKokkosView* kv) {return VecRestoreKokkosView_Private(v,kv,PETSC_TRUE);}

 67: #if !defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) /* Get host views if the default memory space is not host space */
 68: template   PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView         (Vec,ConstPetscScalarKokkosViewHost*);
 69: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView         (Vec v,PetscScalarKokkosViewHost* kv) {return VecGetKokkosView_Private(v,kv,PETSC_FALSE);}
 70: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView     (Vec v,PetscScalarKokkosViewHost* kv) {return VecRestoreKokkosView_Private(v,kv,PETSC_FALSE);}
 71: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite    (Vec v,PetscScalarKokkosViewHost* kv) {return VecGetKokkosView_Private(v,kv,PETSC_TRUE);}
 72: template<> PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v,PetscScalarKokkosViewHost* kv) {return VecRestoreKokkosView_Private(v,kv,PETSC_TRUE);}
 73: #endif

 75: PetscErrorCode VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)
 76: {
 77:   const PetscInt n = xin->map->n;
 78:   PetscScalar    *xx;

 80:   VecGetArrayWrite(xin,&xx); /* TODO: generate randoms directly on device */
 81:   for (PetscInt i=0; i<n; i++) PetscRandomGetValue(r,&xx[i]);
 82:   VecRestoreArrayWrite(xin,&xx);
 83:   return 0;
 84: }

 86: /* x = |x| */
 87: PetscErrorCode VecAbs_SeqKokkos(Vec xin)
 88: {
 89:   PetscScalarKokkosView xv;

 91:   PetscLogGpuTimeBegin();
 92:   VecGetKokkosView(xin,&xv);
 93:   KokkosBlas::abs(xv,xv);
 94:   VecRestoreKokkosView(xin,&xv);
 95:   PetscLogGpuTimeEnd();
 96:   return 0;
 97: }

 99: /* x = 1/x */
100: PetscErrorCode VecReciprocal_SeqKokkos(Vec xin)
101: {
102:   PetscScalarKokkosView xv;

104:   PetscLogGpuTimeBegin();
105:   VecGetKokkosView(xin,&xv);
106:   Kokkos::parallel_for(xv.extent(0),KOKKOS_LAMBDA(const int64_t i) {if (xv(i) != (PetscScalar)0.0) xv(i) = (PetscScalar)1.0/xv(i);});
107:   VecRestoreKokkosView(xin,&xv);
108:   PetscLogGpuTimeEnd();
109:   return 0;
110: }

112: PetscErrorCode VecMin_SeqKokkos(Vec xin,PetscInt *p,PetscReal *val)
113: {
114:   typedef Kokkos::MinLoc<PetscReal,PetscInt>::value_type MinLocValue_t;
115:   ConstPetscScalarKokkosView xv;
116:   MinLocValue_t              minloc;

118:   PetscLogGpuTimeBegin();
119:   VecGetKokkosView(xin,&xv);
120:   Kokkos::parallel_reduce("VecMin",xin->map->n,KOKKOS_LAMBDA(PetscInt i,MinLocValue_t& lminloc) {
121:     if (PetscRealPart(xv(i)) < lminloc.val) {
122:       lminloc.val = PetscRealPart(xv(i));
123:       lminloc.loc = i;
124:     }
125:   },Kokkos::MinLoc<PetscReal,PetscInt>(minloc)); /* Kokkos will set minloc properly even if xin is zero-lengthed */
126:   if (p) *p = minloc.loc;
127:   *val = minloc.val;
128:   VecRestoreKokkosView(xin,&xv);
129:   PetscLogGpuTimeEnd();
130:   return 0;
131: }

133: PetscErrorCode VecMax_SeqKokkos(Vec xin,PetscInt *p,PetscReal *val)
134: {
135:   typedef Kokkos::MaxLoc<PetscReal,PetscInt>::value_type MaxLocValue_t;
136:   ConstPetscScalarKokkosView xv;
137:   MaxLocValue_t              maxloc;

139:   PetscLogGpuTimeBegin();
140:   VecGetKokkosView(xin,&xv);
141:   Kokkos::parallel_reduce("VecMax",xin->map->n,KOKKOS_LAMBDA(PetscInt i,MaxLocValue_t& lmaxloc) {
142:     if (PetscRealPart(xv(i)) > lmaxloc.val) {
143:       lmaxloc.val = PetscRealPart(xv(i));
144:       lmaxloc.loc = i;
145:     }
146:   },Kokkos::MaxLoc<PetscReal,PetscInt>(maxloc));
147:   if (p) *p = maxloc.loc;
148:   *val = maxloc.val;
149:   VecRestoreKokkosView(xin,&xv);
150:   PetscLogGpuTimeEnd();
151:   return 0;
152: }

154: PetscErrorCode VecSum_SeqKokkos(Vec xin,PetscScalar* sum)
155: {
156:   ConstPetscScalarKokkosView xv;

158:   PetscLogGpuTimeBegin();
159:   VecGetKokkosView(xin,&xv);
160:   *sum = KokkosBlas::sum(xv);
161:   VecRestoreKokkosView(xin,&xv);
162:   PetscLogGpuTimeEnd();
163:   return 0;
164: }

166: PetscErrorCode VecShift_SeqKokkos(Vec xin,PetscScalar shift)
167: {
168:   PetscScalarKokkosView xv;

170:   PetscLogGpuTimeBegin();
171:   VecGetKokkosView(xin,&xv);
172:   Kokkos::parallel_for("VecShift",xin->map->n,KOKKOS_LAMBDA(PetscInt i) {xv(i) += shift;});
173:   VecRestoreKokkosView(xin,&xv);
174:   PetscLogGpuTimeEnd();
175:   return 0;
176: }

178: /* y = alpha x + y */
179: PetscErrorCode VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)
180: {
181:   PetscBool                  xiskok,yiskok;
182:   PetscScalarKokkosView      yv;
183:   ConstPetscScalarKokkosView xv;

185:   if (alpha == (PetscScalar)0.0) return 0;
186:   if (yin == xin) {
187:     VecScale_SeqKokkos(yin,alpha+1);
188:     return 0;
189:   }
190:   PetscObjectTypeCompareAny((PetscObject)xin,&xiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");
191:   PetscObjectTypeCompareAny((PetscObject)yin,&yiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");
192:   if (xiskok && yiskok) {
193:     PetscLogGpuTimeBegin();
194:     VecGetKokkosView(xin,&xv);
195:     VecGetKokkosView(yin,&yv);
196:     KokkosBlas::axpy(alpha,xv,yv);
197:     VecRestoreKokkosView(xin,&xv);
198:     VecRestoreKokkosView(yin,&yv);
199:     PetscLogGpuTimeEnd();
200:     PetscLogGpuFlops(2.0*yin->map->n);
201:   } else {
202:     VecAXPY_Seq(yin,alpha,xin);
203:   }
204:   return 0;
205: }

207: /* y = x + beta y */
208: PetscErrorCode VecAYPX_SeqKokkos(Vec yin,PetscScalar beta,Vec xin)
209: {
210:   /* One needs to define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 to have optimizations for cases alpha/beta = 0,+/-1 */
211:   VecAXPBY_SeqKokkos(yin,1.0,beta,xin);
212:   return 0;
213: }

215: /* z = y^T x */
216: PetscErrorCode VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar *z)
217: {
218:   ConstPetscScalarKokkosView xv,yv;

220:   PetscLogGpuTimeBegin();
221:   VecGetKokkosView(xin,&xv);
222:   VecGetKokkosView(yin,&yv);
223:   Kokkos::parallel_reduce("VecTDot",xin->map->n,KOKKOS_LAMBDA(int64_t i, PetscScalar& update) {
224:     update += yv(i)*xv(i);
225:   },*z); /* Kokkos always overwrites z, so no need to init it */
226:   VecRestoreKokkosView(yin,&yv);
227:   VecRestoreKokkosView(xin,&xv);
228:   PetscLogGpuTimeEnd();
229:   if (xin->map->n > 0) PetscLogGpuFlops(2.0*xin->map->n);
230:   return 0;
231: }

233: struct TransposeDotTag {};
234: struct ConjugateDotTag {};

236: struct MDotFunctor {
237:   /* Note the C++ notation for an array typedef */
238:   typedef PetscScalar value_type[];
239:   typedef ConstPetscScalarKokkosView::size_type size_type;

241:   /* Tell Kokkos the result array's number of entries. This must be a public value in the functor */
242:   const size_type              value_count;
243:   ConstPetscScalarKokkosView   xv,yv[8];

245:   MDotFunctor(ConstPetscScalarKokkosView& xv,
246:               const PetscInt ny, /* Number of valid entries in yv[8]. 1 <= ny <= 8 */
247:               ConstPetscScalarKokkosView& yv0, ConstPetscScalarKokkosView& yv1,
248:               ConstPetscScalarKokkosView& yv2, ConstPetscScalarKokkosView& yv3,
249:               ConstPetscScalarKokkosView& yv4, ConstPetscScalarKokkosView& yv5,
250:               ConstPetscScalarKokkosView& yv6, ConstPetscScalarKokkosView& yv7)
251:     : value_count(ny),xv(xv)
252:   {
253:     yv[0] = yv0; yv[1] = yv1;
254:     yv[2] = yv2; yv[3] = yv3;
255:     yv[4] = yv4; yv[5] = yv5;
256:     yv[6] = yv6; yv[7] = yv7;
257:   }

259:   KOKKOS_INLINE_FUNCTION void operator() (TransposeDotTag,const size_type i,value_type sum) const
260:   {
261:     PetscScalar xval = xv(i);
262:     for (size_type j = 0; j<value_count; ++j) sum[j] += yv[j](i)*xval;
263:   }

265:   KOKKOS_INLINE_FUNCTION void operator() (ConjugateDotTag,const size_type i,value_type sum) const
266:   {
267:     PetscScalar xval = xv(i);
268:     for (size_type j = 0; j<value_count; ++j) sum[j] += PetscConj(yv[j](i))*xval;
269:   }

271:   KOKKOS_INLINE_FUNCTION void join (volatile value_type dst,const volatile value_type src) const
272:   {
273:     for (size_type j = 0; j<value_count; j++) dst[j] += src[j];
274:   }

276:   KOKKOS_INLINE_FUNCTION void init (value_type sum) const
277:   {
278:     for (size_type j = 0; j<value_count; j++) sum[j] = 0.0;
279:   }
280: };

282: template<class WorkTag>
283: PetscErrorCode VecMultiDot_Private(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
284: {
285:   PetscInt                        i,j,cur=0,ngroup=nv/8,rem=nv%8,N=xin->map->n;
286:   ConstPetscScalarKokkosView      xv,yv[8];
287:   PetscScalarKokkosViewHost       zv(z,nv);

289:   VecGetKokkosView(xin,&xv);
290:   for (i=0; i<ngroup; i++) { /* 8 y's per group */
291:     for (j=0; j<8; j++) VecGetKokkosView(yin[cur+j],&yv[j]);
292:     MDotFunctor mdot(xv,8,yv[0],yv[1],yv[2],yv[3],yv[4],yv[5],yv[6],yv[7]); /* Hope Kokkos make it asynchronous */
293:     Kokkos::parallel_reduce(Kokkos::RangePolicy<WorkTag>(0,N),mdot,Kokkos::subview(zv,Kokkos::pair<PetscInt,PetscInt>(cur,cur+8)));
294:     for (j=0; j<8; j++) VecRestoreKokkosView(yin[cur+j],&yv[j]);
295:     cur += 8;
296:   }

298:   if (rem) { /* The remaining */
299:     for (j=0; j<rem; j++) VecGetKokkosView(yin[cur+j],&yv[j]);
300:     MDotFunctor mdot(xv,rem,yv[0],yv[1],yv[2],yv[3],yv[4],yv[5],yv[6],yv[7]);
301:     Kokkos::parallel_reduce(Kokkos::RangePolicy<WorkTag>(0,N),mdot,Kokkos::subview(zv,Kokkos::pair<PetscInt,PetscInt>(cur,cur+rem)));
302:     for (j=0; j<rem; j++) VecRestoreKokkosView(yin[cur+j],&yv[j]);
303:   }
304:   VecRestoreKokkosView(xin,&xv);
305:   Kokkos::fence(); /* If reduce is async, then we need this fence to make sure z is ready for use on host */
306:   return 0;
307: }

309: /* z[i] = (x,y_i) = y_i^H x */
310: PetscErrorCode VecMDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
311: {
312:   VecMultiDot_Private<ConjugateDotTag>(xin,nv,yin,z);
313:   return 0;
314: }

316: /* z[i] = (x,y_i) = y_i^T x */
317: PetscErrorCode VecMTDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
318: {
319:   VecMultiDot_Private<TransposeDotTag>(xin,nv,yin,z);
320:   return 0;
321: }

323: /* x[:] = alpha */
324: PetscErrorCode VecSet_SeqKokkos(Vec xin,PetscScalar alpha)
325: {
326:   PetscScalarKokkosView     xv;

328:   PetscLogGpuTimeBegin();
329:   VecGetKokkosViewWrite(xin,&xv);
330:   KokkosBlas::fill(xv,alpha);
331:   VecRestoreKokkosViewWrite(xin,&xv);
332:   PetscLogGpuTimeEnd();
333:   return 0;
334: }

336: /* x = alpha x */
337: PetscErrorCode VecScale_SeqKokkos(Vec xin,PetscScalar alpha)
338: {
339:   PetscScalarKokkosView     xv;

341:   if (alpha == (PetscScalar)0.0) {
342:     VecSet_SeqKokkos(xin,alpha);
343:   } else if (alpha != (PetscScalar)1.0) {
344:     PetscLogGpuTimeBegin();
345:     VecGetKokkosView(xin,&xv);
346:     KokkosBlas::scal(xv,alpha,xv);
347:     VecRestoreKokkosView(xin,&xv);
348:     PetscLogGpuTimeEnd();
349:     PetscLogGpuFlops(xin->map->n);
350:   }
351:   return 0;
352: }

354: /* z = y^H x */
355: PetscErrorCode VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar *z)
356: {
357:   ConstPetscScalarKokkosView   xv,yv;

359:   PetscLogGpuTimeBegin();
360:   VecGetKokkosView(xin,&xv);
361:   VecGetKokkosView(yin,&yv);
362:   *z = KokkosBlas::dot(yv,xv); /* KokkosBlas::dot(a,b) takes conjugate of a */
363:   VecRestoreKokkosView(xin,&xv);
364:   VecRestoreKokkosView(yin,&yv);
365:   PetscLogGpuTimeEnd();
366:   if (xin->map->n > 0) PetscLogGpuFlops(2.0*xin->map->n-1);
367:   return 0;
368: }

370: /* y = x, where x is VECKOKKOS, but y may be not */
371: PetscErrorCode VecCopy_SeqKokkos(Vec xin,Vec yin)
372: {
373:   PetscLogGpuTimeBegin();
374:   if (xin != yin) {
375:     Vec_Kokkos *xkok = static_cast<Vec_Kokkos*>(xin->spptr);
376:     if (yin->offloadmask == PETSC_OFFLOAD_KOKKOS) {
377:       /* y is also a VecKokkos */
378:       Vec_Kokkos *ykok = static_cast<Vec_Kokkos*>(yin->spptr);
379:       /* Kokkos rule: if x's host has newer data, it will copy to y's host view; otherwise to y's device view
380:         In case x's host is newer, y's device is newer, it will error (though should not, I think). So we just
381:         clear y's sync state.
382:        */
383:       ykok->v_dual.clear_sync_state();
384:       Kokkos::deep_copy(ykok->v_dual,xkok->v_dual);
385:     } else {
386:       PetscScalar *yarray;
387:       VecGetArrayWrite(yin,&yarray);
388:       PetscScalarKokkosViewHost yv(yarray,yin->map->n);
389:       if (xkok->v_dual.need_sync_host()) { /* x's device has newer data */
390:         Kokkos::deep_copy(yv,xkok->v_dual.view_device());
391:       } else {
392:         Kokkos::deep_copy(yv,xkok->v_dual.view_host());
393:       }
394:       VecRestoreArrayWrite(yin,&yarray);
395:     }
396:   }
397:   PetscLogGpuTimeEnd();
398:   return 0;
399: }

401: /* y[i] <--> x[i] */
402: PetscErrorCode VecSwap_SeqKokkos(Vec xin,Vec yin)
403: {
404:   PetscScalarKokkosView           xv,yv;

406:   if (xin != yin) {
407:     PetscLogGpuTimeBegin();
408:     VecGetKokkosView(xin,&xv);
409:     VecGetKokkosView(yin,&yv);
410:     Kokkos::parallel_for(xin->map->n,KOKKOS_LAMBDA(const int64_t i) {
411:       PetscScalar tmp = xv(i);
412:       xv(i) = yv(i);
413:       yv(i) = tmp;
414:     });
415:     VecRestoreKokkosView(xin,&xv);
416:     VecRestoreKokkosView(yin,&yv);
417:     PetscLogGpuTimeEnd();
418:   }
419:   return 0;
420: }

422: /*  w = alpha x + y */
423: PetscErrorCode VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin, Vec yin)
424: {
425:   ConstPetscScalarKokkosView      xv,yv;
426:   PetscScalarKokkosView           wv;

428:   if (alpha == (PetscScalar)0.0) {
429:     VecCopy_SeqKokkos(yin,win);
430:   } else {
431:     PetscLogGpuTimeBegin();
432:     VecGetKokkosViewWrite(win,&wv);
433:     VecGetKokkosView(xin,&xv);
434:     VecGetKokkosView(yin,&yv);
435:     Kokkos::parallel_for(win->map->n,KOKKOS_LAMBDA(const int64_t i) {wv(i) = alpha*xv(i) + yv(i);});
436:     VecRestoreKokkosView(xin,&xv);
437:     VecRestoreKokkosView(yin,&yv);
438:     VecRestoreKokkosViewWrite(win,&wv);
439:     PetscLogGpuTimeEnd();
440:     PetscLogGpuFlops(2*win->map->n);
441:   }
442:   return 0;
443: }

445: struct MAXPYFunctor {
446:   typedef ConstPetscScalarKokkosView::size_type size_type;

448:   PetscScalarKokkosView         yv;
449:   PetscInt                      nx;   /* Significent entries in a[8] and xv[8] */
450:   PetscScalar                   a[8];
451:   ConstPetscScalarKokkosView    xv[8];

453:   MAXPYFunctor(PetscScalarKokkosView yv,
454:                PetscInt nx,
455:                PetscScalar a0,PetscScalar a1,PetscScalar a2,PetscScalar a3,
456:                PetscScalar a4,PetscScalar a5,PetscScalar a6,PetscScalar a7,
457:                ConstPetscScalarKokkosView xv0, ConstPetscScalarKokkosView xv1,
458:                ConstPetscScalarKokkosView xv2, ConstPetscScalarKokkosView xv3,
459:                ConstPetscScalarKokkosView xv4, ConstPetscScalarKokkosView xv5,
460:                ConstPetscScalarKokkosView xv6, ConstPetscScalarKokkosView xv7)
461:     : yv(yv),nx(nx)
462:   {
463:     a[0]  = a0;  a[1]  = a1;
464:     a[2]  = a2;  a[3]  = a3;
465:     a[4]  = a4;  a[5]  = a5;
466:     a[6]  = a6;  a[7]  = a7;
467:     xv[0] = xv0; xv[1] = xv1;
468:     xv[2] = xv2; xv[3] = xv3;
469:     xv[4] = xv4; xv[5] = xv5;
470:     xv[6] = xv6; xv[7] = xv7;
471:   }

473:   KOKKOS_INLINE_FUNCTION void operator() (const size_type i) const
474:   {
475:     for (PetscInt j = 0; j<nx; ++j) yv(i) += a[j]*xv[j](i);
476:   }
477: };

479: /*  y = y + sum alpha[i] x[i] */
480: PetscErrorCode VecMAXPY_SeqKokkos(Vec yin, PetscInt nv,const PetscScalar *alpha,Vec *xin)
481: {
482:   PetscInt                        i,j,cur=0,ngroup=nv/8,rem=nv%8;
483:   PetscScalarKokkosView           yv;
484:   PetscScalar                     a[8];
485:   ConstPetscScalarKokkosView      xv[8];

487:   VecGetKokkosView(yin,&yv);
488:   for (i=0; i<ngroup; i++) { /* 8 x's per group */
489:     for (j=0; j<8; j++) { /* Fill the parameters */
490:       a[j] = alpha[cur+j];
491:       VecGetKokkosView(xin[cur+j],&xv[j]);
492:     }
493:     MAXPYFunctor maxpy(yv,8,a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],xv[0],xv[1],xv[2],xv[3],xv[4],xv[5],xv[6],xv[7]);
494:     Kokkos::parallel_for(yin->map->n,maxpy);
495:     for (j=0; j<8; j++) VecRestoreKokkosView(xin[cur+j],&xv[j]);
496:     cur += 8;
497:   }

499:   if (rem) { /* The remaining */
500:     for (j=0; j<rem; j++) {
501:       a[j] = alpha[cur+j];
502:       VecGetKokkosView(xin[cur+j],&xv[j]);
503:     }
504:     MAXPYFunctor maxpy(yv,rem,a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7],xv[0],xv[1],xv[2],xv[3],xv[4],xv[5],xv[6],xv[7]);
505:     Kokkos::parallel_for(yin->map->n,maxpy);
506:     for (j=0; j<rem; j++) VecRestoreKokkosView(xin[cur+j],&xv[j]);
507:   }
508:   VecRestoreKokkosView(yin,&yv);
509:   PetscLogGpuFlops(nv*2.0*yin->map->n);
510:   return 0;
511: }

513: /* y = alpha x + beta y */
514: PetscErrorCode VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
515: {
516:   ConstPetscScalarKokkosView   xv;
517:   PetscScalarKokkosView        yv;
518:   PetscBool                    xiskok,yiskok;

520:   PetscObjectTypeCompareAny((PetscObject)xin,&xiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");
521:   PetscObjectTypeCompareAny((PetscObject)yin,&yiskok,VECSEQKOKKOS,VECMPIKOKKOS,"");
522:   if (xiskok && yiskok) {
523:     PetscLogGpuTimeBegin();
524:     VecGetKokkosView(xin,&xv);
525:     VecGetKokkosView(yin,&yv);
526:     KokkosBlas::axpby(alpha,xv,beta,yv);
527:     VecRestoreKokkosView(xin,&xv);
528:     VecRestoreKokkosView(yin,&yv);
529:     PetscLogGpuTimeEnd();
530:     if (alpha == (PetscScalar)0.0 || beta == (PetscScalar)0.0) {
531:       PetscLogGpuFlops(xin->map->n);
532:     } else if (beta == (PetscScalar)1.0 || alpha == (PetscScalar)1.0) {
533:       PetscLogGpuFlops(2.0*xin->map->n);
534:     } else {
535:       PetscLogGpuFlops(3.0*xin->map->n);
536:     }
537:   } else {
538:     VecAXPBY_Seq(yin,alpha,beta,xin);
539:   }
540:   return 0;
541: }

543: /* z = alpha x + beta y + gamma z */
544: PetscErrorCode VecAXPBYPCZ_SeqKokkos(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
545: {
546:   ConstPetscScalarKokkosView    xv,yv;
547:   PetscScalarKokkosView         zv;

549:   PetscLogGpuTimeBegin();
550:   VecGetKokkosView(zin,&zv);
551:   VecGetKokkosView(xin,&xv);
552:   VecGetKokkosView(yin,&yv);
553:   KokkosBlas::update(alpha,xv,beta,yv,gamma,zv);
554:   VecRestoreKokkosView(xin,&xv);
555:   VecRestoreKokkosView(yin,&yv);
556:   VecRestoreKokkosView(zin,&zv);
557:   PetscLogGpuTimeEnd();
558:   PetscLogGpuFlops(zin->map->n*5);
559:   return 0;
560: }

562: /* w = x*y. Any subset of the x, y, and w may be the same vector.

564:   w is of type VecKokkos, but x, y may be not.
565: */
566: PetscErrorCode VecPointwiseMult_SeqKokkos(Vec win,Vec xin,Vec yin)
567: {
568:   PetscInt       n;

570:   PetscLogGpuTimeBegin();
571:   VecGetLocalSize(win,&n);
572:   if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
573:     PetscScalarKokkosViewHost  wv;
574:     const PetscScalar          *xp,*yp;
575:     VecGetArrayRead(xin,&xp);
576:     VecGetArrayRead(yin,&yp);
577:     VecGetKokkosViewWrite(win,&wv);

579:     ConstPetscScalarKokkosViewHost xv(xp,n),yv(yp,n);
580:     Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,n),KOKKOS_LAMBDA(const PetscInt i) {wv(i) = xv(i)*yv(i);});

582:     VecRestoreArrayRead(xin,&xp);
583:     VecRestoreArrayRead(yin,&yp);
584:     VecRestoreKokkosViewWrite(win,&wv);
585:   } else {
586:     ConstPetscScalarKokkosView      xv,yv;
587:     PetscScalarKokkosView           wv;

589:     VecGetKokkosViewWrite(win,&wv);
590:     VecGetKokkosView(xin,&xv);
591:     VecGetKokkosView(yin,&yv);
592:     Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {wv(i) = xv(i)*yv(i);});
593:     VecRestoreKokkosView(yin,&yv);
594:     VecRestoreKokkosView(xin,&xv);
595:     VecRestoreKokkosViewWrite(win,&wv);
596:   }
597:   PetscLogGpuTimeEnd();
598:   PetscLogGpuFlops(n);
599:   return 0;
600: }

602: /* w = x/y */
603: PetscErrorCode VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)
604: {
605:   PetscInt       n;

607:   PetscLogGpuTimeBegin();
608:   VecGetLocalSize(win,&n);
609:   if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
610:     PetscScalarKokkosViewHost  wv;
611:     const PetscScalar          *xp,*yp;
612:     VecGetArrayRead(xin,&xp);
613:     VecGetArrayRead(yin,&yp);
614:     VecGetKokkosViewWrite(win,&wv);

616:     ConstPetscScalarKokkosViewHost xv(xp,n),yv(yp,n);
617:     Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,n),KOKKOS_LAMBDA(const PetscInt i) {
618:       if (yv(i) != 0.0) wv(i) = xv(i)/yv(i);
619:       else wv(i) = 0.0;
620:     });

622:     VecRestoreArrayRead(xin,&xp);
623:     VecRestoreArrayRead(yin,&yp);
624:     VecRestoreKokkosViewWrite(win,&wv);
625:   } else {
626:     ConstPetscScalarKokkosView      xv,yv;
627:     PetscScalarKokkosView           wv;

629:     VecGetKokkosViewWrite(win,&wv);
630:     VecGetKokkosView(xin,&xv);
631:     VecGetKokkosView(yin,&yv);
632:     Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {
633:       if (yv(i) != 0.0) wv(i) = xv(i)/yv(i);
634:       else wv(i) = 0.0;
635:     });
636:     VecRestoreKokkosView(yin,&yv);
637:     VecRestoreKokkosView(xin,&xv);
638:     VecRestoreKokkosViewWrite(win,&wv);
639:   }
640:   PetscLogGpuTimeEnd();
641:   PetscLogGpuFlops(win->map->n);
642:   return 0;
643: }

645: PetscErrorCode VecNorm_SeqKokkos(Vec xin,NormType type,PetscReal *z)
646: {
647:   const PetscInt                n = xin->map->n;
648:   ConstPetscScalarKokkosView    xv;

650:   if (type == NORM_1_AND_2) {
651:     VecNorm_SeqKokkos(xin,NORM_1,z);
652:     VecNorm_SeqKokkos(xin,NORM_2,z+1);
653:   } else {
654:     PetscLogGpuTimeBegin();
655:     VecGetKokkosView(xin,&xv);
656:     if (type == NORM_2 || type == NORM_FROBENIUS) {
657:       *z   = KokkosBlas::nrm2(xv);
658:       PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
659:     } else if (type == NORM_1) {
660:       *z   = KokkosBlas::nrm1(xv);
661:       PetscLogGpuFlops(PetscMax(n-1.0,0.0));
662:     } else if (type == NORM_INFINITY) {
663:       *z = KokkosBlas::nrminf(xv);
664:     }
665:     VecRestoreKokkosView(xin,&xv);
666:     PetscLogGpuTimeEnd();
667:   }
668:   return 0;
669: }

671: /* A functor for DotNorm2 so that we can compute dp and nm in one kernel */
672: struct DotNorm2 {
673:   typedef PetscScalar                            value_type[];
674:   typedef ConstPetscScalarKokkosView::size_type  size_type;

676:   size_type                    value_count;
677:   ConstPetscScalarKokkosView   xv_, yv_; /* first and second vectors in VecDotNorm2. The order matters. */

679:   DotNorm2(ConstPetscScalarKokkosView& xv,ConstPetscScalarKokkosView& yv) :
680:     value_count(2), xv_(xv), yv_(yv) {}

682:   KOKKOS_INLINE_FUNCTION void operator() (const size_type i, value_type result) const
683:   {
684:     result[0] += PetscConj(yv_(i))*xv_(i);
685:     result[1] += PetscConj(yv_(i))*yv_(i);
686:   }

688:   KOKKOS_INLINE_FUNCTION void join (volatile value_type dst, const volatile value_type src) const
689:   {
690:     dst[0] += src[0];
691:     dst[1] += src[1];
692:   }

694:   KOKKOS_INLINE_FUNCTION void init (value_type result) const
695:   {
696:     result[0] = 0.0;
697:     result[1] = 0.0;
698:   }
699: };

701: /* dp = y^H x, nm = y^H y */
702: PetscErrorCode VecDotNorm2_SeqKokkos(Vec xin, Vec yin, PetscScalar *dp, PetscScalar *nm)
703: {
704:   ConstPetscScalarKokkosView      xv,yv;
705:   PetscScalar                     result[2];

707:   PetscLogGpuTimeBegin();
708:   VecGetKokkosView(xin,&xv);
709:   VecGetKokkosView(yin,&yv);
710:   DotNorm2 dn(xv,yv);
711:   Kokkos::parallel_reduce(xin->map->n,dn,result);
712:   *dp  = result[0];
713:   *nm  = result[1];
714:   VecRestoreKokkosView(yin,&yv);
715:   VecRestoreKokkosView(xin,&xv);
716:   PetscLogGpuTimeEnd();
717:   PetscLogGpuFlops(4.0*xin->map->n);
718:   return 0;
719: }

721: PetscErrorCode VecConjugate_SeqKokkos(Vec xin)
722: {
723: #if defined(PETSC_USE_COMPLEX)
724:   PetscScalarKokkosView     xv;

726:   PetscLogGpuTimeBegin();
727:   VecGetKokkosView(xin,&xv);
728:   Kokkos::parallel_for(xin->map->n,KOKKOS_LAMBDA(int64_t i) {xv(i) = PetscConj(xv(i));});
729:   VecRestoreKokkosView(xin,&xv);
730:   PetscLogGpuTimeEnd();
731: #else
732: #endif
733:   return 0;
734: }

736: /* Temporarily replace the array in vin with a[]. Return to the original array with a call to VecResetArray() */
737: PetscErrorCode VecPlaceArray_SeqKokkos(Vec vin,const PetscScalar *a)
738: {
739:   Vec_Seq        *vecseq = (Vec_Seq*)vin->data;
740:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(vin->spptr);

742:   VecPlaceArray_Seq(vin,a);
743:   veckok->UpdateArray<Kokkos::HostSpace>(vecseq->array);
744:   return 0;
745: }

747: PetscErrorCode VecResetArray_SeqKokkos(Vec vin)
748: {
749:   Vec_Seq        *vecseq = (Vec_Seq*)vin->data;
750:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(vin->spptr);

752:   veckok->v_dual.sync_host(); /* User wants to unhook the provided host array. Sync it so that user can get the latest */
753:   VecResetArray_Seq(vin); /* Swap back the old host array, assuming its has the latest value */
754:   veckok->UpdateArray<Kokkos::HostSpace>(vecseq->array);
755:   return 0;
756: }

758: /* Replace the array in vin with a[] that must be allocated by PetscMalloc. a[] is owned by vin afterwords. */
759: PetscErrorCode VecReplaceArray_SeqKokkos(Vec vin,const PetscScalar *a)
760: {
761:   Vec_Seq        *vecseq = (Vec_Seq*)vin->data;
762:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(vin->spptr);

764:   /* Make sure the users array has the latest values */
765:   if (vecseq->array != vecseq->array_allocated) veckok->v_dual.sync_host();
766:   VecReplaceArray_Seq(vin,a);
767:   veckok->UpdateArray<Kokkos::HostSpace>(vecseq->array);
768:   return 0;
769: }

771: /* Maps the local portion of vector v into vector w */
772: PetscErrorCode VecGetLocalVector_SeqKokkos(Vec v,Vec w)
773: {
774:   Vec_Seq          *vecseq = static_cast<Vec_Seq*>(w->data);
775:   Vec_Kokkos       *veckok = static_cast<Vec_Kokkos*>(w->spptr);

778:   /* Destroy w->data, w->spptr */
779:   if (vecseq) {
780:     PetscFree(vecseq->array_allocated);
781:     PetscFree(w->data);
782:   }
783:   delete veckok;

785:   /* Replace with v's */
786:   w->data  = v->data;
787:   w->spptr = v->spptr;
788:   PetscObjectStateIncrease((PetscObject)w);
789:   return 0;
790: }

792: PetscErrorCode VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)
793: {
795:   v->data  = w->data;
796:   v->spptr = w->spptr;
797:   PetscObjectStateIncrease((PetscObject)v);
798:   /* TODO: need to think if setting w->data/spptr to NULL is safe */
799:   w->data  = NULL;
800:   w->spptr = NULL;
801:   return 0;
802: }

804: PetscErrorCode VecGetArray_SeqKokkos(Vec v,PetscScalar **a)
805: {
806:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);

808:   veckok->v_dual.sync_host();
809:   *a = *((PetscScalar**)v->data);
810:   return 0;
811: }

813: PetscErrorCode VecRestoreArray_SeqKokkos(Vec v,PetscScalar **a)
814: {
815:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);

817:   veckok->v_dual.modify_host();
818:   return 0;
819: }

821: /* Get array on host to overwrite, so no need to sync host. In VecRestoreArrayWrite() we will mark host is modified. */
822: PetscErrorCode VecGetArrayWrite_SeqKokkos(Vec v,PetscScalar **a)
823: {
824:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);

826:   veckok->v_dual.clear_sync_state();
827:   *a = veckok->v_dual.view_host().data();
828:   return 0;
829: }

831: PetscErrorCode VecGetArrayAndMemType_SeqKokkos(Vec v,PetscScalar** a,PetscMemType *mtype)
832: {
833:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);

835:   if (std::is_same<DefaultMemorySpace,Kokkos::HostSpace>::value) {
836:     *a = veckok->v_dual.view_host().data();
837:     if (mtype) *mtype = PETSC_MEMTYPE_HOST;
838:   } else {
839:     /* When there is device, we always return up-to-date device data */
840:     veckok->v_dual.sync_device();
841:     *a = veckok->v_dual.view_device().data();
842:     if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
843:   }
844:   return 0;
845: }

847: PetscErrorCode VecRestoreArrayAndMemType_SeqKokkos(Vec v,PetscScalar** a)
848: {
849:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);

851:   if (std::is_same<DefaultMemorySpace,Kokkos::HostSpace>::value) {
852:     veckok->v_dual.modify_host();
853:   } else {
854:     veckok->v_dual.modify_device();
855:   }
856:   return 0;
857: }

859: PetscErrorCode VecGetArrayWriteAndMemType_SeqKokkos(Vec v,PetscScalar** a,PetscMemType *mtype)
860: {
861:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);

863:   if (std::is_same<DefaultMemorySpace,Kokkos::HostSpace>::value) {
864:     *a = veckok->v_dual.view_host().data();
865:     if (mtype) *mtype = PETSC_MEMTYPE_HOST;
866:   } else {
867:     /* When there is device, we always return device data (but no need to sync the device) */
868:     veckok->v_dual.clear_sync_state(); /* So that in restore, we can safely modify_device() */
869:     *a = veckok->v_dual.view_device().data();
870:     if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
871:   }
872:   return 0;
873: }

875: /* Copy xin's sync state to y */
876: static PetscErrorCode VecCopySyncState_Kokkos_Private(Vec xin,Vec yout)
877: {
878:   Vec_Kokkos   *xkok = static_cast<Vec_Kokkos*>(xin->spptr);
879:   Vec_Kokkos   *ykok = static_cast<Vec_Kokkos*>(yout->spptr);

881:   ykok->v_dual.clear_sync_state();
882:   if (xkok->v_dual.need_sync_host()) {
883:     ykok->v_dual.modify_device();
884:   } else if (xkok->v_dual.need_sync_device()) {
885:     ykok->v_dual.modify_host();
886:   }
887:   return 0;
888: }

890: /* Interal routine shared by VecGetSubVector_{SeqKokkos,MPIKokkos} */
891: PetscErrorCode VecGetSubVector_Kokkos_Private(Vec x,PetscBool xIsMPI,IS is,Vec *y)
892: {
893:   PetscBool      contig;
894:   PetscInt       n,N,start,bs;
895:   MPI_Comm       comm;
896:   Vec            z;

898:   PetscObjectGetComm((PetscObject)x,&comm);
899:   ISGetLocalSize(is,&n);
900:   ISGetSize(is,&N);
901:   VecGetSubVectorContiguityAndBS_Private(x,is,&contig,&start,&bs);

903:   if (contig) { /* We can do a no-copy (in-place) implementation with y sharing x's arrays */
904:     Vec_Kokkos        *xkok = static_cast<Vec_Kokkos*>(x->spptr);
905:     const PetscScalar *array_h = xkok->v_dual.view_host().data() + start;
906:     const PetscScalar *array_d = xkok->v_dual.view_device().data() + start;

908:     /* These calls assume the input arrays are synced */
909:     if (xIsMPI) VecCreateMPIKokkosWithArrays_Private(comm,bs,n,N,array_h,array_d,&z); /* x could be MPI even when x's comm size = 1 */
910:     else VecCreateSeqKokkosWithArrays_Private(comm,bs,n,array_h,array_d,&z);

912:     VecCopySyncState_Kokkos_Private(x,z); /* Copy x's sync state to z */

914:     /* This is relevant only in debug mode */
915:     PetscInt state = 0;
916:     VecLockGet(x,&state);
917:     if (state) { /* x is either in read or read/write mode, therefore z, overlapped with x, can only be in read mode */
918:       VecLockReadPush(z);
919:     }

921:     z->ops->placearray   = NULL; /* z's arrays can't be replaced, because z does not own them */
922:     z->ops->replacearray = NULL;

924:   } else { /* Have to create a VecScatter and a stand-alone vector */
925:     VecGetSubVectorThroughVecScatter_Private(x,is,bs,&z);
926:   }
927:   *y = z;
928:   return 0;
929: }

931: PetscErrorCode VecGetSubVector_SeqKokkos(Vec x,IS is,Vec *y)
932: {
933:   VecGetSubVector_Kokkos_Private(x,PETSC_FALSE,is,y);
934:   return 0;
935: }

937: /* Restore subvector y to x */
938: PetscErrorCode VecRestoreSubVector_SeqKokkos(Vec x,IS is,Vec *y)
939: {
940:   VecScatter                    vscat;
941:   PETSC_UNUSED PetscObjectState dummystate = 0;
942:   PetscBool                     unchanged;

944:   PetscObjectComposedDataGetInt((PetscObject)*y,VecGetSubVectorSavedStateId,dummystate,unchanged);
945:   if (unchanged) return 0; /* If y's state has not changed since VecGetSubVector(), we only need to destroy it */

947:   PetscObjectQuery((PetscObject)*y,"VecGetSubVector_Scatter",(PetscObject*)&vscat);
948:   if (vscat) {
949:     VecScatterBegin(vscat,*y,x,INSERT_VALUES,SCATTER_REVERSE);
950:     VecScatterEnd(vscat,*y,x,INSERT_VALUES,SCATTER_REVERSE);
951:   } else { /* y and x's (host and device) arrays overlap */
952:     Vec_Kokkos *xkok = static_cast<Vec_Kokkos*>(x->spptr);
953:     Vec_Kokkos *ykok = static_cast<Vec_Kokkos*>((*y)->spptr);
954:     PetscInt   state;

956:     VecLockGet(x,&state);

959:     /* The tricky part: one has to carefully sync the arrays */
960:     if (xkok->v_dual.need_sync_device()) { /* x's host has newer data */
961:       ykok->v_dual.sync_host(); /* Move y's latest values to host (since y is just a subset of x) */
962:     } else if (xkok->v_dual.need_sync_host()) { /* x's device has newer data */
963:       ykok->v_dual.sync_device(); /* Move y's latest data to device */
964:     } else { /* x's host and device data is already sync'ed; Copy y's sync state to x */
965:       VecCopySyncState_Kokkos_Private(*y,x);
966:     }
967:     PetscObjectStateIncrease((PetscObject)x); /* Since x is updated */
968:   }
969:   return 0;
970: }

972: static PetscErrorCode VecSetOps_SeqKokkos(Vec v)
973: {
974:   v->ops->abs                    = VecAbs_SeqKokkos;
975:   v->ops->reciprocal             = VecReciprocal_SeqKokkos;
976:   v->ops->pointwisemult          = VecPointwiseMult_SeqKokkos;
977:   v->ops->min                    = VecMin_SeqKokkos;
978:   v->ops->max                    = VecMax_SeqKokkos;
979:   v->ops->sum                    = VecSum_SeqKokkos;
980:   v->ops->shift                  = VecShift_SeqKokkos;
981:   v->ops->norm                   = VecNorm_SeqKokkos;
982:   v->ops->scale                  = VecScale_SeqKokkos;
983:   v->ops->copy                   = VecCopy_SeqKokkos;
984:   v->ops->set                    = VecSet_SeqKokkos;
985:   v->ops->swap                   = VecSwap_SeqKokkos;
986:   v->ops->axpy                   = VecAXPY_SeqKokkos;
987:   v->ops->axpby                  = VecAXPBY_SeqKokkos;
988:   v->ops->axpbypcz               = VecAXPBYPCZ_SeqKokkos;
989:   v->ops->pointwisedivide        = VecPointwiseDivide_SeqKokkos;
990:   v->ops->setrandom              = VecSetRandom_SeqKokkos;

992:   v->ops->dot                    = VecDot_SeqKokkos;
993:   v->ops->tdot                   = VecTDot_SeqKokkos;
994:   v->ops->mdot                   = VecMDot_SeqKokkos;
995:   v->ops->mtdot                  = VecMTDot_SeqKokkos;

997:   v->ops->dot_local              = VecDot_SeqKokkos;
998:   v->ops->tdot_local             = VecTDot_SeqKokkos;
999:   v->ops->mdot_local             = VecMDot_SeqKokkos;
1000:   v->ops->mtdot_local            = VecMTDot_SeqKokkos;

1002:   v->ops->norm_local             = VecNorm_SeqKokkos;
1003:   v->ops->maxpy                  = VecMAXPY_SeqKokkos;
1004:   v->ops->aypx                   = VecAYPX_SeqKokkos;
1005:   v->ops->waxpy                  = VecWAXPY_SeqKokkos;
1006:   v->ops->dotnorm2               = VecDotNorm2_SeqKokkos;
1007:   v->ops->placearray             = VecPlaceArray_SeqKokkos;
1008:   v->ops->replacearray           = VecReplaceArray_SeqKokkos;
1009:   v->ops->resetarray             = VecResetArray_SeqKokkos;
1010:   v->ops->destroy                = VecDestroy_SeqKokkos;
1011:   v->ops->duplicate              = VecDuplicate_SeqKokkos;
1012:   v->ops->conjugate              = VecConjugate_SeqKokkos;
1013:   v->ops->getlocalvector         = VecGetLocalVector_SeqKokkos;
1014:   v->ops->restorelocalvector     = VecRestoreLocalVector_SeqKokkos;
1015:   v->ops->getlocalvectorread     = VecGetLocalVector_SeqKokkos;
1016:   v->ops->restorelocalvectorread = VecRestoreLocalVector_SeqKokkos;
1017:   v->ops->getarraywrite          = VecGetArrayWrite_SeqKokkos;
1018:   v->ops->getarray               = VecGetArray_SeqKokkos;
1019:   v->ops->restorearray           = VecRestoreArray_SeqKokkos;

1021:   v->ops->getarrayandmemtype     = VecGetArrayAndMemType_SeqKokkos;
1022:   v->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqKokkos;
1023:   v->ops->getarraywriteandmemtype= VecGetArrayWriteAndMemType_SeqKokkos;
1024:   v->ops->getsubvector           = VecGetSubVector_SeqKokkos;
1025:   v->ops->restoresubvector       = VecRestoreSubVector_SeqKokkos;
1026:   return 0;
1027: }

1029: /*MC
1030:    VECSEQKOKKOS - VECSEQKOKKOS = "seqkokkos" - The basic sequential vector, modified to use Kokkos

1032:    Options Database Keys:
1033: . -vec_type seqkokkos - sets the vector type to VECSEQKOKKOS during a call to VecSetFromOptions()

1035:   Level: beginner

1037: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI()
1038: M*/
1039: PetscErrorCode VecCreate_SeqKokkos(Vec v)
1040: {
1041:   Vec_Seq        *vecseq;
1042:   Vec_Kokkos     *veckok;

1044:   PetscKokkosInitializeCheck();
1045:   PetscLayoutSetUp(v->map);
1046:   VecCreate_Seq(v);  /* Build a sequential vector, allocate array */
1047:   PetscObjectChangeTypeName((PetscObject)v,VECSEQKOKKOS);
1048:   VecSetOps_SeqKokkos(v);

1051:   vecseq   = static_cast<Vec_Seq*>(v->data);
1052:   veckok   = new Vec_Kokkos(v->map->n,vecseq->array,NULL); /* Let host claim it has the latest data (zero) */
1053:   v->spptr = static_cast<void*>(veckok);
1054:   v->offloadmask = PETSC_OFFLOAD_KOKKOS;
1055:   return 0;
1056: }

1058: /*@C
1059:    VecCreateSeqKokkosWithArray - Creates a Kokkos sequential array-style vector,
1060:    where the user provides the array space to store the vector values. The array
1061:    provided must be a device array.

1063:    Collective

1065:    Input Parameters:
1066: +  comm - the communicator, should be PETSC_COMM_SELF
1067: .  bs - the block size
1068: .  n - the vector length
1069: -  array - device memory where the vector elements are to be stored.

1071:    Output Parameter:
1072: .  v - the vector

1074:    Notes:
1075:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1076:    same type as an existing vector.

1078:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1079:    The user should not free the array until the vector is destroyed.

1081:    Level: intermediate

1083: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
1084:           VecCreateGhost(), VecCreateSeq(), VecCreateSeqWithArray(),
1085:           VecCreateMPIWithArray()
1086: @*/
1087: PetscErrorCode  VecCreateSeqKokkosWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar darray[],Vec *v)
1088: {
1089:   PetscMPIInt    size;
1090:   Vec            w;
1091:   Vec_Kokkos     *veckok = NULL;
1092:   PetscScalar    *harray;

1094:   MPI_Comm_size(comm,&size);

1097:   PetscKokkosInitializeCheck();
1098:   VecCreate(comm,&w);
1099:   VecSetSizes(w,n,n);
1100:   VecSetBlockSize(w,bs);
1101:   if (!darray) { /* Allocate memory ourself if user provided NULL */
1102:     VecSetType(w,VECSEQKOKKOS);
1103:   } else {
1104:     /* Build a VECSEQ, get its harray, and then build Vec_Kokkos along with darray */
1105:     if (std::is_same<DefaultMemorySpace,Kokkos::HostSpace>::value) {
1106:       harray = const_cast<PetscScalar*>(darray);
1107:       VecCreate_Seq_Private(w,harray); /* Build a sequential vector with harray */
1108:     } else {
1109:       VecSetType(w,VECSEQ);
1110:       harray = static_cast<Vec_Seq*>(w->data)->array;
1111:     }
1112:     PetscObjectChangeTypeName((PetscObject)w,VECSEQKOKKOS); /* Change it to Kokkos */
1113:     VecSetOps_SeqKokkos(w);
1114:     veckok = new Vec_Kokkos(n,harray,const_cast<PetscScalar*>(darray));
1115:     veckok->v_dual.modify_device(); /* Mark the device is modified */
1116:     w->offloadmask = PETSC_OFFLOAD_KOKKOS;
1117:     w->spptr = static_cast<void*>(veckok);
1118:   }
1119:   *v       = w;
1120:   return 0;
1121: }

1123: /*
1124:    VecCreateSeqKokkosWithArrays_Private - Creates a Kokkos sequential array-style vector
1125:    with user-provided arrays on host and device.

1127:    Collective

1129:    Input Parameter:
1130: +  comm - the communicator, should be PETSC_COMM_SELF
1131: .  bs - the block size
1132: .  n - the vector length
1133: .  harray - host memory where the vector elements are to be stored.
1134: -  darray - device memory where the vector elements are to be stored.

1136:    Output Parameter:
1137: .  v - the vector

1139:    Notes:
1140:    Unlike VecCreate{Seq,MPI}CUDAWithArrays(), this routine is private since we do not expect users to use it directly.

1142:    If there is no device, then harray and darray must be the same.
1143:    If n is not zero, then harray and darray must be allocated.
1144:    After the call, the created vector is supposed to be in a synchronized state, i.e.,
1145:    we suppose harray and darray have the same data.

1147:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1148:    Caller should not free the array until the vector is destroyed.
1149: */
1150: PetscErrorCode  VecCreateSeqKokkosWithArrays_Private(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar harray[],const PetscScalar darray[],Vec *v)
1151: {
1152:   PetscMPIInt size;
1153:   Vec         w;

1155:   PetscKokkosInitializeCheck();
1156:   MPI_Comm_size(comm,&size);
1158:   if (n) {
1161:   }

1164:   VecCreateSeqWithArray(comm,bs,n,harray,&w);
1165:   PetscObjectChangeTypeName((PetscObject)w,VECSEQKOKKOS); /* Change it to Kokkos */
1166:   VecSetOps_SeqKokkos(w);
1167:   w->spptr = new Vec_Kokkos(n,const_cast<PetscScalar*>(harray),const_cast<PetscScalar*>(darray));
1168:   w->offloadmask = PETSC_OFFLOAD_KOKKOS;
1169:   *v = w;
1170:   return 0;
1171: }

1173: /* TODO: ftn-auto generates veckok.kokkosf.c */
1174: /*@C
1175:  VecCreateSeqKokkos - Creates a standard, sequential array-style vector.

1177:  Collective

1179:  Input Parameter:
1180:  +  comm - the communicator, should be PETSC_COMM_SELF
1181:  -  n - the vector length

1183:  Output Parameter:
1184:  .  v - the vector

1186:  Notes:
1187:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1188:  same type as an existing vector.

1190:  Level: intermediate

1192:  .seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
1193:  @*/
1194: PetscErrorCode VecCreateSeqKokkos(MPI_Comm comm,PetscInt n,Vec *v)
1195: {
1196:   PetscKokkosInitializeCheck();
1197:   VecCreate(comm,v);
1198:   VecSetSizes(*v,n,n);
1199:   VecSetType(*v,VECSEQKOKKOS); /* Calls VecCreate_SeqKokkos */
1200:   return 0;
1201: }

1203: /* Duplicate layout etc but not the values in the input vector */
1204: PetscErrorCode VecDuplicate_SeqKokkos(Vec win,Vec *v)
1205: {
1206:   VecDuplicate_Seq(win,v); /* It also dups ops of win */
1207:   return 0;
1208: }

1210: PetscErrorCode VecDestroy_SeqKokkos(Vec v)
1211: {
1212:   Vec_Kokkos     *veckok = static_cast<Vec_Kokkos*>(v->spptr);
1213:   Vec_Seq        *vecseq = static_cast<Vec_Seq*>(v->data);

1215:   delete veckok;
1216:   v->spptr = NULL;
1217:   if (vecseq) VecDestroy_Seq(v);
1218:   return 0;
1219: }