Actual source code: fnsqrt.c
slepc-3.17.0 2022-03-31
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Square root function  sqrt(x)
*/

#include <slepc/private/fnimpl.h>
#include <slepcblaslapack.h>
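
/*
   Illustrative usage sketch (not part of this file): evaluating
   f(x) = beta*sqrt(alpha*x) through the public FN interface.
   Error checking is omitted for brevity.

     FN          fn;
     PetscScalar y;
     FNCreate(PETSC_COMM_SELF,&fn);
     FNSetType(fn,FNSQRT);
     FNSetScale(fn,2.0,1.0);          // evaluate sqrt(2*x)
     FNEvaluateFunction(fn,8.0,&y);   // y = sqrt(16) = 4
     FNDestroy(&fn);
*/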

PetscErrorCode FNEvaluateFunction_Sqrt(FN fn,PetscScalar x,PetscScalar *y)
{
  PetscFunctionBegin;
#if !defined(PETSC_USE_COMPLEX)
  PetscCheck(x>=0.0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Function not defined in the requested value");
#endif
  *y = PetscSqrtScalar(x);
  PetscFunctionReturn(0);
}

PetscErrorCode FNEvaluateDerivative_Sqrt(FN fn,PetscScalar x,PetscScalar *y)
{
  PetscFunctionBegin;
#if !defined(PETSC_USE_COMPLEX)
  PetscCheck(x>0.0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Derivative not defined in the requested value");
#endif
  *y = 1.0/(2.0*PetscSqrtScalar(x));
  PetscFunctionReturn(0);
}
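
/*
   The FNEvaluateFunctionMat_Sqrt_* dispatchers below share one pattern:
   copy A into B (when they are different matrices), expose B's dense
   array, and call an in-place kernel (FNSqrtm*) that overwrites it with
   the principal matrix square root. This first one uses the Schur method.
*/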
PetscErrorCode FNEvaluateFunctionMat_Sqrt_Schur(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&T));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmSchur(fn,n,T,n,PETSC_FALSE));
  PetscCall(MatDenseRestoreArray(B,&T));
  PetscFunctionReturn(0);
}
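
/*
   Variant that returns only v = sqrtm(A)*e_1, the first column of the
   matrix square root, computed in a temporary work matrix (note the
   PETSC_TRUE final argument to the kernel, absent from the full-matrix
   version above).
*/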
PetscErrorCode FNEvaluateFunctionMatVec_Sqrt_Schur(FN fn,Mat A,Vec v)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;
  Mat          B;

  PetscFunctionBegin;
  PetscCall(FN_AllocateWorkMat(fn,A,&B));
  PetscCall(MatDenseGetArray(B,&T));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmSchur(fn,n,T,n,PETSC_TRUE));
  PetscCall(MatDenseRestoreArray(B,&T));
  PetscCall(MatGetColumnVector(B,v,0));
  PetscCall(FN_FreeWorkMat(fn,&B));
  PetscFunctionReturn(0);
}
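
/* Same dispatch pattern, with the Denman-Beavers (product form) kernel. */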
PetscErrorCode FNEvaluateFunctionMat_Sqrt_DBP(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&T));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmDenmanBeavers(fn,n,T,n,PETSC_FALSE));
  PetscCall(MatDenseRestoreArray(B,&T));
  PetscFunctionReturn(0);
}
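
/* Same dispatch pattern, with the inverse-free Newton-Schulz iteration. */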
PetscErrorCode FNEvaluateFunctionMat_Sqrt_NS(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&Ba));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmNewtonSchulz(fn,n,Ba,n,PETSC_FALSE));
  PetscCall(MatDenseRestoreArray(B,&Ba));
  PetscFunctionReturn(0);
}

#define MAXIT 50

/*
   Computes the principal square root of the matrix A using the
   Sadeghi iteration. A is overwritten with sqrtm(A).
*/
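/*
   Each pass of the loop below computes
     G_k = (5/16)*I + (1/16)*M_k*(15*I - 5*M_k + M_k^2)
     X_{k+1} = X_k*G_k,   M_{k+1} = M_k*inv(G_k^2)
   starting from X_0 = I and M_0 = A (suitably scaled), and stops when
   ||I - M_k||_F <= tol, at which point X_k approximates sqrtm(A).
*/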
PetscErrorCode FNSqrtmSadeghi(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld)
{
  PetscScalar  *M,*M2,*G,*X=A,*work,work1,sqrtnrm;
  PetscScalar  szero=0.0,sone=1.0,smfive=-5.0,s1d16=1.0/16.0;
  PetscReal    tol,Mres=0.0,nrm,rwork[1],done=1.0;
  PetscInt     i,it;
  PetscBLASInt N,*piv=NULL,info,lwork=0,query=-1,one=1,zero=0;
  PetscBool    converged=PETSC_FALSE;
  unsigned int ftz;

  PetscFunctionBegin;
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
  PetscCall(SlepcSetFlushToZero(&ftz));

  /* query work size */
  PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,A,&ld,piv,&work1,&query,&info));
  PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork));

  PetscCall(PetscMalloc5(N,&M,N,&M2,N,&G,lwork,&work,n,&piv));
  PetscCall(PetscArraycpy(M,A,N));

  /* scale M */
  nrm = LAPACKlange_("fro",&n,&n,M,&n,rwork);
  if (nrm>1.0) {
    sqrtnrm = PetscSqrtReal(nrm);
    PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,M,&N,&info));
    SlepcCheckLapackInfo("lascl",info);
    tol *= nrm;
  }
  PetscCall(PetscInfo(fn,"||A||_F = %g, new tol: %g\n",(double)nrm,(double)tol));

  /* X = I */
  PetscCall(PetscArrayzero(X,N));
  for (i=0;i<n;i++) X[i+i*ld] = 1.0;

  for (it=0;it<MAXIT && !converged;it++) {

    /* G = (5/16)*I + (1/16)*M*(15*I-5*M+M*M) */
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,M,&ld,M,&ld,&szero,M2,&ld));
    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smfive,M,&one,M2,&one));
    for (i=0;i<n;i++) M2[i+i*ld] += 15.0;
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&s1d16,M,&ld,M2,&ld,&szero,G,&ld));
    for (i=0;i<n;i++) G[i+i*ld] += 5.0/16.0;

    /* X = X*G */
    PetscCall(PetscArraycpy(M2,X,N));
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,M2,&ld,G,&ld,&szero,X,&ld));

    /* M = M*inv(G*G) */
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,G,&ld,G,&ld,&szero,M2,&ld));
    PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,M2,&ld,piv,&info));
    SlepcCheckLapackInfo("getrf",info);
    PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M2,&ld,piv,work,&lwork,&info));
    SlepcCheckLapackInfo("getri",info);

    PetscCall(PetscArraycpy(G,M,N));
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,G,&ld,M2,&ld,&szero,M,&ld));

    /* check ||I-M|| */
    PetscCall(PetscArraycpy(M2,M,N));
    for (i=0;i<n;i++) M2[i+i*ld] -= 1.0;
    Mres = LAPACKlange_("fro",&n,&n,M2,&n,rwork);

    if (Mres<=tol) converged = PETSC_TRUE;
    PetscCall(PetscInfo(fn,"it: %" PetscInt_FMT " res: %g\n",it,(double)Mres));
    PetscCall(PetscLogFlops(8.0*n*n*n+2.0*n*n+2.0*n*n*n/3.0+4.0*n*n*n/3.0+2.0*n*n*n+2.0*n*n));
  }

  PetscCheck(Mres<=tol,PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",MAXIT);

  /* undo scaling */
  if (nrm>1.0) PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sqrtnrm,A,&one));

  PetscCall(PetscFree5(M,M2,G,work,piv));
  PetscCall(SlepcResetFlushToZero(&ftz));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_CUDA)
#include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
#include <slepccublas.h>

#if defined(PETSC_HAVE_MAGMA)
#include <slepcmagma.h>

/*
 * Matrix square root by Sadeghi iteration. CUDA version.
 * Computes the principal square root of the matrix A using the
 * Sadeghi iteration. A is overwritten with sqrtm(A).
 */
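/*
 * Scaling note (applies to the host version as well): since
 * sqrtm(A/s) = sqrtm(A)/sqrt(s), M is normalized by 1/||A||_F whenever
 * that norm exceeds 1, and the result is rescaled by sqrt(||A||_F)
 * once the iteration has converged.
 */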
PetscErrorCode FNSqrtmSadeghi_CUDAm(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld)
{
  PetscScalar        *d_X,*d_M,*d_M2,*d_G,*d_work,alpha;
  const PetscScalar  szero=0.0,sone=1.0,smfive=-5.0,s15=15.0,s1d16=1.0/16.0;
  PetscReal          tol,Mres=0.0,nrm,sqrtnrm;
  PetscInt           it,nb,lwork;
  PetscBLASInt       *piv,N;
  const PetscBLASInt one=1,zero=0;
  PetscBool          converged=PETSC_FALSE;
  cublasHandle_t     cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* For CUDA event timers */
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  magma_init();
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

  PetscCall(PetscMalloc1(n,&piv));
  PetscCallCUDA(cudaMalloc((void **)&d_X,sizeof(PetscScalar)*N));
  PetscCallCUDA(cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N));
  PetscCallCUDA(cudaMalloc((void **)&d_M2,sizeof(PetscScalar)*N));
  PetscCallCUDA(cudaMalloc((void **)&d_G,sizeof(PetscScalar)*N));

  nb = magma_get_xgetri_nb(n);
  lwork = nb*n;
  PetscCallCUDA(cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork));
  PetscCall(PetscLogGpuTimeBegin());

  /* M = A */
  PetscCallCUDA(cudaMemcpy(d_M,A,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice));

  /* scale M */
  PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_M,one,&nrm));
  if (nrm>1.0) {
    sqrtnrm = PetscSqrtReal(nrm);
    alpha = 1.0/nrm;
    PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_M,one));
    tol *= nrm;
  }
  PetscCall(PetscInfo(fn,"||A||_F = %g, new tol: %g\n",(double)nrm,(double)tol));

  /* X = I */
  PetscCallCUDA(cudaMemset(d_X,zero,sizeof(PetscScalar)*N));
  PetscCall(set_diagonal(n,d_X,ld,sone));

  for (it=0;it<MAXIT && !converged;it++) {

    /* G = (5/16)*I + (1/16)*M*(15*I-5*M+M*M) */
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_M,ld,d_M,ld,&szero,d_M2,ld));
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,N,&smfive,d_M,one,d_M2,one));
    PetscCall(shift_diagonal(n,d_M2,ld,s15));
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&s1d16,d_M,ld,d_M2,ld,&szero,d_G,ld));
    PetscCall(shift_diagonal(n,d_G,ld,5.0/16.0));

    /* X = X*G */
    PetscCallCUDA(cudaMemcpy(d_M2,d_X,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice));
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_M2,ld,d_G,ld,&szero,d_X,ld));

    /* M = M*inv(G*G) */
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_G,ld,d_G,ld,&szero,d_M2,ld));
    PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_M2,ld,piv);
    PetscCallMAGMA(magma_xgetri_gpu,n,d_M2,ld,piv,d_work,lwork);
    PetscCallCUDA(cudaMemcpy(d_G,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice));
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_G,ld,d_M2,ld,&szero,d_M,ld));

    /* check ||I-M|| */
    PetscCallCUDA(cudaMemcpy(d_M2,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice));
    PetscCall(shift_diagonal(n,d_M2,ld,-1.0));
    PetscCallCUBLAS(cublasXnrm2(cublasv2handle,N,d_M2,one,&Mres));

    if (Mres<=tol) converged = PETSC_TRUE;
    PetscCall(PetscInfo(fn,"it: %" PetscInt_FMT " res: %g\n",it,(double)Mres));
    PetscCall(PetscLogGpuFlops(8.0*n*n*n+2.0*n*n+2.0*n*n*n/3.0+4.0*n*n*n/3.0+2.0*n*n*n+2.0*n*n));
  }

  PetscCheck(Mres<=tol,PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",MAXIT);

  /* undo scaling */
  if (nrm>1.0) {
    alpha = sqrtnrm;
    PetscCallCUBLAS(cublasXscal(cublasv2handle,N,&alpha,d_X,one));
  }
  PetscCallCUDA(cudaMemcpy(A,d_X,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCUDA(cudaFree(d_X));
  PetscCallCUDA(cudaFree(d_M));
  PetscCallCUDA(cudaFree(d_M2));
  PetscCallCUDA(cudaFree(d_G));
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(PetscFree(piv));

  magma_finalize();
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
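
/*
   Host dispatcher for the Sadeghi iteration, following the same pattern
   as the other matrix evaluators above.
*/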
PetscErrorCode FNEvaluateFunctionMat_Sqrt_Sadeghi(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&Ba));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmSadeghi(fn,n,Ba,n));
  PetscCall(MatDenseRestoreArray(B,&Ba));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_CUDA)
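/* GPU dispatchers: same pattern again, delegating to the CUDA/MAGMA kernels. */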
PetscErrorCode FNEvaluateFunctionMat_Sqrt_NS_CUDA(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&Ba));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmNewtonSchulz_CUDA(fn,n,Ba,n,PETSC_FALSE));
  PetscCall(MatDenseRestoreArray(B,&Ba));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MAGMA)
PetscErrorCode FNEvaluateFunctionMat_Sqrt_DBP_CUDAm(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&T));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmDenmanBeavers_CUDAm(fn,n,T,n,PETSC_FALSE));
  PetscCall(MatDenseRestoreArray(B,&T));
  PetscFunctionReturn(0);
}

PetscErrorCode FNEvaluateFunctionMat_Sqrt_Sadeghi_CUDAm(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  PetscFunctionBegin;
  if (A!=B) PetscCall(MatCopy(A,B,SAME_NONZERO_PATTERN));
  PetscCall(MatDenseGetArray(B,&Ba));
  PetscCall(MatGetSize(A,&m,NULL));
  PetscCall(PetscBLASIntCast(m,&n));
  PetscCall(FNSqrtmSadeghi_CUDAm(fn,n,Ba,n));
  PetscCall(MatDenseRestoreArray(B,&Ba));
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
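
/*
   Prints the definition f(x) = beta*sqrt(alpha*x), omitting factors equal
   to one, followed by the matrix-function method currently selected.
*/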
PetscErrorCode FNView_Sqrt(FN fn,PetscViewer viewer)
{
  PetscBool  isascii;
  char       str[50];
  const char *methodname[] = {
      "Schur method for the square root",
      "Denman-Beavers (product form)",
      "Newton-Schulz iteration",
      "Sadeghi iteration"
#if defined(PETSC_HAVE_CUDA)
     ,"Newton-Schulz iteration CUDA"
#if defined(PETSC_HAVE_MAGMA)
     ,"Denman-Beavers (product form) CUDA/MAGMA",
      "Sadeghi iteration CUDA/MAGMA"
#endif
#endif
  };
  const int  nmeth=sizeof(methodname)/sizeof(methodname[0]);

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
  if (isascii) {
    if (fn->beta==(PetscScalar)1.0) {
      if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer,"  Square root: sqrt(x)\n"));
      else {
        PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
        PetscCall(PetscViewerASCIIPrintf(viewer,"  Square root: sqrt(%s*x)\n",str));
      }
    } else {
      PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE));
      if (fn->alpha==(PetscScalar)1.0) PetscCall(PetscViewerASCIIPrintf(viewer,"  Square root: %s*sqrt(x)\n",str));
      else {
        PetscCall(PetscViewerASCIIPrintf(viewer,"  Square root: %s",str));
        PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
        PetscCall(SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE));
        PetscCall(PetscViewerASCIIPrintf(viewer,"*sqrt(%s*x)\n",str));
        PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
      }
    }
    if (fn->method<nmeth) PetscCall(PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]));
  }
  PetscFunctionReturn(0);
}
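
/*
   Registration of the callbacks: index i of evaluatefunctionmat[] matches
   entry i of the methodname[] table printed by FNView_Sqrt above.
*/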
SLEPC_EXTERN PetscErrorCode FNCreate_Sqrt(FN fn)
{
  PetscFunctionBegin;
  fn->ops->evaluatefunction          = FNEvaluateFunction_Sqrt;
  fn->ops->evaluatederivative        = FNEvaluateDerivative_Sqrt;
  fn->ops->evaluatefunctionmat[0]    = FNEvaluateFunctionMat_Sqrt_Schur;
  fn->ops->evaluatefunctionmat[1]    = FNEvaluateFunctionMat_Sqrt_DBP;
  fn->ops->evaluatefunctionmat[2]    = FNEvaluateFunctionMat_Sqrt_NS;
  fn->ops->evaluatefunctionmat[3]    = FNEvaluateFunctionMat_Sqrt_Sadeghi;
#if defined(PETSC_HAVE_CUDA)
  fn->ops->evaluatefunctionmat[4]    = FNEvaluateFunctionMat_Sqrt_NS_CUDA;
#if defined(PETSC_HAVE_MAGMA)
  fn->ops->evaluatefunctionmat[5]    = FNEvaluateFunctionMat_Sqrt_DBP_CUDAm;
  fn->ops->evaluatefunctionmat[6]    = FNEvaluateFunctionMat_Sqrt_Sadeghi_CUDAm;
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
  fn->ops->evaluatefunctionmatvec[0] = FNEvaluateFunctionMatVec_Sqrt_Schur;
  fn->ops->view                      = FNView_Sqrt;
  PetscFunctionReturn(0);
}