Actual source code: ex1cu.cu

  1: static char help[] = "Benchmarking CUDA kernel launch time\n";
  2: /*
  3:   Running example on Summit at OLCF:
  4:   # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
  5:   $ jsrun -n1 -a1 -c7 -g1 -r1  ./ex1cu
  6:   Average asynchronous CUDA kernel launch time = 9.48 microseconds
  7:   Average synchronous  CUDA kernel launch time = 12.83 microseconds
  8: */
  9: #include <petscsys.h>
 10: #include <petscdevice.h>

 12: __global__ void NullKernel(){}

 14: int main(int argc,char **argv)
 15: {
 16:   PetscInt       i,n=100000;
 17:   PetscLogDouble tstart,tend,time;

 19:   PetscInitialize(&argc,&argv,(char*)0,help);
 20:   PetscOptionsGetInt(NULL,NULL,"-n",&n,NULL);

 22:   /* Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one */
 23:   PetscTime(&tstart);
 24:   for (i=0; i<n; i++) {NullKernel<<<1,1,0,NULL>>>();}
 25:   PetscTime(&tend);
 26:   cudaStreamSynchronize(NULL); /* Sync after tend since we don't want to count kernel execution time */
 27:   time = (tend-tstart)*1e6/n;
 28:   PetscPrintf(PETSC_COMM_WORLD,"Average asynchronous CUDA kernel launch time = %.2f microseconds\n",time);

 30:   /* Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed */
 31:   PetscTime(&tstart);
 32:   for (i=0; i<n; i++) {
 33:     NullKernel<<<1,1,0,NULL>>>();
 34:     cudaStreamSynchronize(NULL);
 35:   }
 36:   PetscTime(&tend);
 37:   time = (tend-tstart)*1e6/n;
 38:   PetscPrintf(PETSC_COMM_WORLD,"Average synchronous  CUDA kernel launch time = %.2f microseconds\n",time);

 40:   PetscFinalize();
 41:   return 0;
 42: }

 44: /*TEST
 45:   build:
 46:     requires: cuda

 48:   test:
 49:     requires: cuda
 50:     args: -n 2
 51:     output_file: output/empty.out
 52:     filter: grep "DOES_NOT_EXIST"

 54: TEST*/