Actual source code: vechip2.hip.cpp

/*
   Implements the sequential HIP vectors.
*/

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/hipvecimpl.h>

#include <hip/hip_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issues, need to unroll division and multiplication with complex numbers */
struct PetscDivideComplex
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    PetscReal n  = rx*rx + ry*ry;
    return PetscComplex((lx*rx + ly*ry)/n,(rx*ly - lx*ry)/n);
  }
};

struct PetscMultiplyComplex
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    return PetscComplex(lx*rx-ly*ry,ly*rx+lx*ry);
  }
};
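
/* For reference, the two functors above implement the standard identities for
   l = lx + i*ly and r = rx + i*ry:

     l / r = ((lx*rx + ly*ry) + i*(rx*ly - lx*ry)) / (rx*rx + ry*ry)
     l * r = (lx*rx - ly*ry) + i*(ly*rx + lx*ry)

   i.e. division multiplies by the conjugate of r and scales by 1/|r|^2. */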
#endif

/*
    Allocates space for the vector array on the GPU if it does not exist.
    Does NOT change the vector's offload mask (v->offloadmask).
    Does NOT zero the HIP array.
 */
PetscErrorCode VecHIPAllocateCheck(Vec v)
{
  Vec_HIP        *vechip;
  PetscBool      option_set;

  if (!v->spptr) {
    PetscReal      pinned_memory_min;

    PetscCalloc(sizeof(Vec_HIP),&v->spptr);
    vechip = (Vec_HIP*)v->spptr;
    hipMalloc((void**)&vechip->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));
    vechip->GPUarray = vechip->GPUarray_allocated;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      if (v->data && ((Vec_Seq*)v->data)->array) {
        v->offloadmask = PETSC_OFFLOAD_CPU;
      } else {
        v->offloadmask = PETSC_OFFLOAD_GPU;
      }
    }
    pinned_memory_min = 0;

    /* Need to parse the command line here for the minimum size at which pinned memory is used for host allocations.
       Note: this same code is duplicated in VecCreate_SeqHIP_Private() and VecCreate_MPIHIP_Private(). Is there a good way to avoid this? */
    PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECHIP Options","Vec");
    PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
    if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
    PetscOptionsEnd();
  }
  return 0;
}
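
/* Example invocation (the executable name is hypothetical): request pinned host
   memory only for vector allocations of at least 10^6 bytes:

     ./app -vec_type seqhip -vec_pinned_memory_min 1000000
*/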

/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecHIPCopyToGPU(Vec v)
{
  Vec_HIP        *vechip;
  PetscScalar    *varray;

  VecHIPAllocateCheck(v);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscLogEventBegin(VEC_HIPCopyToGPU,v,0,0,0);
    vechip         = (Vec_HIP*)v->spptr;
    varray         = vechip->GPUarray;
    hipMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),hipMemcpyHostToDevice);
    PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyToGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}
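
/* Minimal usage sketch, assuming v is a VECSEQHIP vector assembled on the host
   with the usual PETSc calls:

     VecSetValue(v,0,1.0,INSERT_VALUES);   // host-side write; offloadmask ends up PETSC_OFFLOAD_CPU
     VecAssemblyBegin(v);
     VecAssemblyEnd(v);
     VecHIPCopyToGPU(v);                   // mirror to the device; offloadmask becomes PETSC_OFFLOAD_BOTH
*/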

/*
     VecHIPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecHIPCopyFromGPU(Vec v)
{
  Vec_HIP        *vechip;
  PetscScalar    *varray;

  VecHIPAllocateCheckHost(v);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(VEC_HIPCopyFromGPU,v,0,0,0);
    vechip         = (Vec_HIP*)v->spptr;
    varray         = vechip->GPUarray;
    hipMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);
    PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyFromGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return 0;
}

/*MC
   VECSEQHIP - VECSEQHIP = "seqhip" - The basic sequential vector, modified to use HIP

   Options Database Keys:
. -vec_type seqhip - sets the vector type to VECSEQHIP during a call to VecSetFromOptions()

  Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
M*/
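
/* Typical creation of a VECSEQHIP vector with standard PETSc calls:

     Vec v;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,n,n);
     VecSetType(v,VECSEQHIP);   // or VecSetFromOptions(v) combined with -vec_type seqhip
*/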

PetscErrorCode VecAYPX_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn = 0;
  PetscScalar       sone = 1.0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArray(yin,&yarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    hipMemcpy(yarray,xarray,bn*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
  } else if (alpha == (PetscScalar)1.0) {
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);
    PetscLogGpuFlops(1.0*yin->map->n);
  } else {
    hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);
    hipblasXaxpy(hipblasv2handle,bn,&sone,xarray,one,yarray,one);
    PetscLogGpuFlops(2.0*yin->map->n);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArray(yin,&yarray);
  PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  return 0;
}

PetscErrorCode VecAXPY_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  PetscBool         xiship;

  if (alpha == (PetscScalar)0.0) return 0;
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscObjectTypeCompareAny((PetscObject)xin,&xiship,VECSEQHIP,VECMPIHIP,"");
  if (xiship) {
    PetscBLASIntCast(yin->map->n,&bn);
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
    PetscLogGpuFlops(2.0*yin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  } else {
    VecAXPY_Seq(yin,alpha,xin);
  }
  return 0;
}

PetscErrorCode VecPointwiseDivide_SeqHIP(Vec win, Vec xin, Vec yin)
{
  PetscInt                              n = xin->map->n;
  const PetscScalar                     *xarray=NULL,*yarray=NULL;
  PetscScalar                           *warray=NULL;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseDivide_Seq(win,xin,yin);
    return 0;
  }
  VecHIPGetArrayWrite(win,&warray);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr,xptr+n,yptr,wptr,PetscDivideComplex());
#else
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(n);
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayWrite(win,&warray);
  return 0;
}

PetscErrorCode VecWAXPY_SeqHIP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  const PetscScalar *xarray=NULL,*yarray=NULL;
  PetscScalar       *warray=NULL;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(win->map->n,&bn);
  if (alpha == (PetscScalar)0.0) {
    VecCopy_SeqHIP(yin,win);
  } else {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArrayRead(yin,&yarray);
    VecHIPGetArrayWrite(win,&warray);
    PetscLogGpuTimeBegin();
    hipMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,warray,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(2*win->map->n);
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArrayRead(yin,&yarray);
    VecHIPRestoreArrayWrite(win,&warray);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  return 0;
}

PetscErrorCode VecMAXPY_SeqHIP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscInt          n = xin->map->n,j;
  PetscScalar       *xarray;
  const PetscScalar *yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscLogGpuFlops(nv*2.0*n);
  PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n,&bn);
  VecHIPGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  for (j=0; j<nv; j++) {
    VecHIPGetArrayRead(y[j],&yarray);
    hipblasXaxpy(hipblasv2handle,bn,alpha+j,yarray,one,xarray,one);
    VecHIPRestoreArrayRead(y[j],&yarray);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin,&xarray);
  return 0;
}

PetscErrorCode VecDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
  PetscLogGpuTimeBegin();
  hipblasXdot(hipblasv2handle,bn,yarray,one,xarray,one,z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  return 0;
}

//
// HIP kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128
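
// Scratch layout used by the kernels below: within one launch, the partial sum of
// dot product k computed by block b is stored at group_results[k*gridDim.x + b];
// the host side (VecMDot_SeqHIP) then adds up the MDOT_WORKGROUP_NUM partial
// results per dot product.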

#if !defined(PETSC_USE_COMPLEX)
// M = 2:
__global__ void VecMDot_SeqHIP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3:
__global__ void VecMDot_SeqHIP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqHIP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                        PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqHIP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                          const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                          PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i];   // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#endif /* !defined(PETSC_USE_COMPLEX) */

PetscErrorCode VecMDot_SeqHIP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscInt          i,n = xin->map->n,current_y_index = 0;
  const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt          nv1 = ((nv % 4) == 1) ? nv-1 : nv,j;
  PetscScalar       *group_results_gpu,*group_results_cpu;
#endif
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    return 0;
  }

#if !defined(PETSC_USE_COMPLEX)
  // allocate scratchpad memory for the results of individual work groups:
  hipMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
  PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
#endif
  VecHIPGetArrayRead(xin,&xptr);
  PetscLogGpuTimeBegin();

  while (current_y_index < nv)
  {
    switch (nv - current_y_index) {

      case 7:
      case 6:
      case 5:
      case 4:
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
        hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel4, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
        current_y_index += 4;
        break;

      case 3:
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);

#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel3, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        current_y_index += 3;
        break;

      case 2:
        VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel2, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        current_y_index += 2;
        break;

      case 1:
        VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
        current_y_index += 1;
        break;

      default: // 8 or more vectors left
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
        VecHIPGetArrayRead(yin[current_y_index+4],&y4ptr);
        VecHIPGetArrayRead(yin[current_y_index+5],&y5ptr);
        VecHIPGetArrayRead(yin[current_y_index+6],&y6ptr);
        VecHIPGetArrayRead(yin[current_y_index+7],&y7ptr);
#if defined(PETSC_USE_COMPLEX)
        hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);
        hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);
        hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);
        hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);
        hipblasXdot(hipblasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);
        hipblasXdot(hipblasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);
        hipblasXdot(hipblasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);
        hipblasXdot(hipblasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel8, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+4],&y4ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+5],&y5ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+6],&y6ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+7],&y7ptr);
        current_y_index += 8;
        break;
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xptr);

#if defined(PETSC_USE_COMPLEX)
  PetscLogGpuToCpu(nv*sizeof(PetscScalar));
#else
  // copy results to CPU
  hipMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,hipMemcpyDeviceToHost);

  // sum group results into z
  for (j=0; j<nv1; ++j) {
    z[j] = 0;
    for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
  }
  PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
  hipFree(group_results_gpu);
  PetscFree(group_results_cpu);
  PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
#endif
  PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
  return 0;
}
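
/* Usage sketch via the public API, which dispatches here for VECSEQHIP vectors
   (x and the entries of y are assumed to be assembled vectors of equal length):

     Vec         x,y[3];
     PetscScalar z[3];
     VecMDot(x,3,y,z);   // z[k] = y[k]^H x
*/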

#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM

PetscErrorCode VecSet_SeqHIP(Vec xin,PetscScalar alpha)
{
  PetscInt                        n = xin->map->n;
  PetscScalar                     *xarray = NULL;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArrayWrite(xin,&xarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    hipMemset(xarray,0,n*sizeof(PetscScalar));
  } else {
    try {
      xptr = thrust::device_pointer_cast(xarray);
      thrust::fill(xptr,xptr+n,alpha);
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayWrite(xin,&xarray);
  return 0;
}

struct PetscScalarReciprocal
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& s)
  {
#if defined(PETSC_USE_COMPLEX)
    /* SPOCK compilation issue, need to unroll division */
    PetscReal sx = PetscRealPart(s);
    PetscReal sy = PetscImaginaryPart(s);
    PetscReal n  = sx*sx + sy*sy;
    return n != 0.0 ? PetscComplex(sx/n,-sy/n) : 0.0;
#else
    return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
#endif
  }
};
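
/* The complex branch above uses 1/s = conj(s)/|s|^2 = (sx - i*sy)/(sx*sx + sy*sy),
   and returns 0 for s == 0, matching the real branch. */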

PetscErrorCode VecReciprocal_SeqHIP(Vec v)
{
  PetscInt       n;
  PetscScalar    *x;

  VecGetLocalSize(v,&n);
  VecHIPGetArray(v,&x);
  PetscLogGpuTimeBegin();
  try {
    auto xptr = thrust::device_pointer_cast(x);
    thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v,&x);
  return 0;
}

PetscErrorCode VecScale_SeqHIP(Vec xin,PetscScalar alpha)
{
  PetscScalar     *xarray;
  PetscBLASInt    one = 1,bn = 0;
  hipblasHandle_t hipblasv2handle;

  if (alpha == (PetscScalar)0.0) {
    VecSet_SeqHIP(xin,alpha);
  } else if (alpha != (PetscScalar)1.0) {
    PetscHIPBLASGetHandle(&hipblasv2handle);
    PetscBLASIntCast(xin->map->n,&bn);
    VecHIPGetArray(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasXscal(hipblasv2handle,bn,&alpha,xarray,one);
    VecHIPRestoreArray(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    PetscLogGpuFlops(xin->map->n);
  }
  return 0;
}

PetscErrorCode VecTDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  hipblasXdotu(hipblasv2handle,bn,xarray,one,yarray,one,z);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayRead(xin,&xarray);
  return 0;
}

PetscErrorCode VecCopy_SeqHIP(Vec xin,Vec yin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;

  if (xin != yin) {
    if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
      PetscBool yiship;

      PetscObjectTypeCompareAny((PetscObject)yin,&yiship,VECSEQHIP,VECMPIHIP,"");
      VecHIPGetArrayRead(xin,&xarray);
      if (yiship) {
        VecHIPGetArrayWrite(yin,&yarray);
      } else {
        VecGetArrayWrite(yin,&yarray);
      }
      PetscLogGpuTimeBegin();
      if (yiship) {
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
      } else {
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);
      }
      PetscLogGpuTimeEnd();
      VecHIPRestoreArrayRead(xin,&xarray);
      if (yiship) {
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else {
        VecRestoreArrayWrite(yin,&yarray);
      }
    } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
      /* copy in CPU if we are on the CPU */
      VecCopy_SeqHIP_Private(xin,yin);
    } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
        /* copy in CPU */
        VecCopy_SeqHIP_Private(xin,yin);
      } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
        /* copy in GPU */
        VecHIPGetArrayRead(xin,&xarray);
        VecHIPGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin,&xarray);
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to copy in GPU (this is an arbitrary choice) */
        VecHIPGetArrayRead(xin,&xarray);
        VecHIPGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin,&xarray);
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else {
        VecCopy_SeqHIP_Private(xin,yin);
      }
    }
  }
  return 0;
}
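
/* Summary of the copy-location choices above (x = xin->offloadmask, y = yin->offloadmask):

     x = GPU             -> device-to-device copy (device-to-host if yin is not a HIP vector)
     x = CPU             -> copy on the host
     x = BOTH, y = CPU   -> copy on the host
     x = BOTH, y = GPU   -> copy on the device
     x = BOTH, y = BOTH  -> copy on the device (arbitrary choice)
*/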

PetscErrorCode VecSwap_SeqHIP(Vec xin,Vec yin)
{
  PetscBLASInt    one = 1,bn;
  PetscScalar     *xarray,*yarray;
  hipblasHandle_t hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  if (xin != yin) {
    VecHIPGetArray(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipblasXswap(hipblasv2handle,bn,xarray,one,yarray,one);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArray(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
  }
  return 0;
}

PetscErrorCode VecAXPBY_SeqHIP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscScalar       a = alpha,b = beta;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1, bn = 0;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  if (a == (PetscScalar)0.0) {
    VecScale_SeqHIP(yin,beta);
  } else if (b == (PetscScalar)1.0) {
    VecAXPY_SeqHIP(yin,alpha,xin);
  } else if (a == (PetscScalar)1.0) {
    VecAYPX_SeqHIP(yin,beta,xin);
  } else if (b == (PetscScalar)0.0) {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);
    hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(xin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
  } else {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hipblasXscal(hipblasv2handle,bn,&beta,yarray,one);
    hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(3.0*xin->map->n);
    PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
  }
  return 0;
}

PetscErrorCode VecAXPBYPCZ_SeqHIP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscInt       n = zin->map->n;

  if (gamma == (PetscScalar)1.0) {
    /* z = a*x + b*y + z */
    VecAXPY_SeqHIP(zin,alpha,xin);
    VecAXPY_SeqHIP(zin,beta,yin);
    PetscLogGpuFlops(4.0*n);
  } else {
    /* z = a*x + b*y + c*z */
    VecScale_SeqHIP(zin,gamma);
    VecAXPY_SeqHIP(zin,alpha,xin);
    VecAXPY_SeqHIP(zin,beta,yin);
    PetscLogGpuFlops(5.0*n);
  }
  return 0;
}
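
/* Flop counts above: each AXPY costs 2*n flops, so two AXPYs give 4*n; the
   general path adds n flops for scaling z by gamma, giving 5*n. */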

PetscErrorCode VecPointwiseMult_SeqHIP(Vec win,Vec xin,Vec yin)
{
  PetscInt                              n = win->map->n;
  const PetscScalar                     *xarray,*yarray;
  PetscScalar                           *warray;
  thrust::device_ptr<const PetscScalar> xptr,yptr;
  thrust::device_ptr<PetscScalar>       wptr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseMult_Seq(win,xin,yin);
    return 0;
  }
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  VecHIPGetArrayWrite(win,&warray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr,xptr+n,yptr,wptr,PetscMultiplyComplex());
#else
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayWrite(win,&warray);
  PetscLogGpuFlops(n);
  return 0;
}

/* TODO: the infinity norm should be computed directly on the GPU */

PetscErrorCode VecNorm_SeqHIP(Vec xin,NormType type,PetscReal *z)
{
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = 0;
  const PetscScalar *xarray;
  hipblasHandle_t   hipblasv2handle;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n,&bn);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasXnrm2(hipblasv2handle,bn,xarray,one,z);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin,&xarray);
    PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
  } else if (type == NORM_INFINITY) {
    int  i;
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasIXamax(hipblasv2handle,bn,xarray,one,&i);
    PetscLogGpuTimeEnd();
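    /* BLAS Iamax returns a 1-based index, hence the xarray+i-1 below */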
    if (bn) {
      PetscScalar zs;
      hipMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),hipMemcpyDeviceToHost);
      *z = PetscAbsScalar(zs);
    } else *z = 0.0;
    VecHIPRestoreArrayRead(xin,&xarray);
  } else if (type == NORM_1) {
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hipblasXasum(hipblasv2handle,bn,xarray,one,z);
    VecHIPRestoreArrayRead(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(PetscMax(n-1.0,0.0));
  } else if (type == NORM_1_AND_2) {
    VecNorm_SeqHIP(xin,NORM_1,z);
    VecNorm_SeqHIP(xin,NORM_2,z+1);
  }
  PetscLogGpuToCpu(sizeof(PetscReal));
  return 0;
}

PetscErrorCode VecDotNorm2_SeqHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  VecDot_SeqHIP(s,t,dp);
  VecDot_SeqHIP(t,t,nm);
  return 0;
}

PetscErrorCode VecDestroy_SeqHIP(Vec v)
{
  if (v->spptr) {
    if (((Vec_HIP*)v->spptr)->GPUarray_allocated) {
      hipFree(((Vec_HIP*)v->spptr)->GPUarray_allocated);
      ((Vec_HIP*)v->spptr)->GPUarray_allocated = NULL;
    }
    if (((Vec_HIP*)v->spptr)->stream) {
      hipStreamDestroy(((Vec_HIP*)v->spptr)->stream);
    }
  }
  VecDestroy_SeqHIP_Private(v);
  PetscFree(v->spptr);
  return 0;
}

#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issue, need to do conjugation ourselves */
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(const PetscScalar& x)
    {
      return PetscScalar(PetscRealPart(x),-PetscImaginaryPart(x));
    }
};
#endif

PetscErrorCode VecConjugate_SeqHIP(Vec xin)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar                     *xarray;
  PetscInt                        n = xin->map->n;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  try {
    xptr = thrust::device_pointer_cast(xarray);
    thrust::transform(xptr,xptr+n,xptr,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin,&xarray);
#else
#endif
  return 0;
}

static inline PetscErrorCode VecGetLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
{
  PetscBool      wisseqhip;

  PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
  if (w->data && wisseqhip) {
    if (((Vec_Seq*)w->data)->array_allocated) {
      if (w->pinned_memory) {
        PetscMallocSetHIPHost();
      }
      PetscFree(((Vec_Seq*)w->data)->array_allocated);
      if (w->pinned_memory) {
        PetscMallocResetHIPHost();
        w->pinned_memory = PETSC_FALSE;
      }
    }
    ((Vec_Seq*)w->data)->array = NULL;
    ((Vec_Seq*)w->data)->unplacedarray = NULL;
  }
  if (w->spptr && wisseqhip) {
    if (((Vec_HIP*)w->spptr)->GPUarray) {
      hipFree(((Vec_HIP*)w->spptr)->GPUarray);
      ((Vec_HIP*)w->spptr)->GPUarray = NULL;
    }
    if (((Vec_HIP*)w->spptr)->stream) {
      hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);
    }
    PetscFree(w->spptr);
  }

  if (v->petscnative && wisseqhip) {
    PetscFree(w->data);
    w->data = v->data;
    w->offloadmask = v->offloadmask;
    w->pinned_memory = v->pinned_memory;
    w->spptr = v->spptr;
    PetscObjectStateIncrease((PetscObject)w);
  } else {
    if (read) {
      VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecGetArray(v,&((Vec_Seq*)w->data)->array);
    }
    w->offloadmask = PETSC_OFFLOAD_CPU;
    if (wisseqhip) {
      VecHIPAllocateCheck(w);
    }
  }
  return 0;
}

static inline PetscErrorCode VecRestoreLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
{
  PetscBool      wisseqhip;

  PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
  if (v->petscnative && wisseqhip) {
    v->data = w->data;
    v->offloadmask = w->offloadmask;
    v->pinned_memory = w->pinned_memory;
    v->spptr = w->spptr;
    w->data = NULL;
    w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
    w->spptr = NULL;
  } else {
    if (read) {
      VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
    }
    if (w->spptr && wisseqhip) {
      hipFree(((Vec_HIP*)w->spptr)->GPUarray);
      ((Vec_HIP*)w->spptr)->GPUarray = NULL;
      if (((Vec_HIP*)w->spptr)->stream) {
        hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);
      }
      PetscFree(w->spptr);
    }
  }
  return 0;
}

PetscErrorCode VecGetLocalVector_SeqHIP(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
  return 0;
}

PetscErrorCode VecGetLocalVectorRead_SeqHIP(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
  return 0;
}

PetscErrorCode VecRestoreLocalVector_SeqHIP(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
  return 0;
}

PetscErrorCode VecRestoreLocalVectorRead_SeqHIP(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
  return 0;
}

struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscScalar& x) {
    return PetscRealPart(x);
  }
};

struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
    return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
  }
};

struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? y : x;
  }
};

struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};

struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? x : y;
  }
};

struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};
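
/* petscmaxi/petscmini reduce (value,index) pairs: the pair with the larger
   (respectively smaller) value wins, and on equal values the smaller index is
   kept. */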

PetscErrorCode VecMax_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar> avpt;

  if (!n) {
    *m = PETSC_MIN_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecHIPGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
#else
      res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
#else
      *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&av);
  return 0;
}

PetscErrorCode VecMin_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *av;
  thrust::device_ptr<const PetscScalar> avpt;

  if (!n) {
    *m = PETSC_MAX_REAL;
    if (p) *p = -1;
    return 0;
  }
  VecHIPGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
#else
      res = thrust::reduce(zibit,zibit+n,res,petscmini());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
#else
      *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
#endif
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&av);
  return 0;
}

PetscErrorCode VecSum_SeqHIP(Vec v,PetscScalar *sum)
{
  PetscInt                              n = v->map->n;
  const PetscScalar                     *a;
  thrust::device_ptr<const PetscScalar> dptr;

  VecHIPGetArrayRead(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&a);
  return 0;
}

struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
{
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift){}
  __host__ __device__
  PetscScalar operator()(PetscScalar x) {return x + shift_;}
};

PetscErrorCode VecShift_SeqHIP(Vec v,PetscScalar shift)
{
  PetscInt                              n = v->map->n;
  PetscScalar                           *a;
  thrust::device_ptr<PetscScalar>       dptr;

  VecHIPGetArray(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v,&a);
  return 0;
}