Actual source code: agmresorthog.c
#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 * This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors distributed across several processes. These processes are organized in a virtual ring.
 * References: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 *
 * Initial author: R. B. SIDJE
 * Modified: G.-A. Atenekeng-Kahou, 2008
 * Modified: D. NUENTSA WAKAM, 2011
 *
 */
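
/*
 * Usage sketch (illustrative, not part of this file): the routines below are internal kernels that
 * are invoked when the AGMRES Krylov method is selected for a standard PETSc linear solve, e.g.
 *
 *   KSPCreate(PETSC_COMM_WORLD, &ksp);
 *   KSPSetOperators(ksp, A, A);
 *   KSPSetType(ksp, KSPAGMRES);   // or -ksp_type agmres on the command line
 *   KSPSetFromOptions(ksp);
 *   KSPSolve(ksp, b, x);
 *
 * where ksp, A, b and x are placeholder names for previously created KSP, Mat and Vec objects.
 */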
/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal*, PetscReal*, PetscReal*, PetscInt);

PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES *agmres = (KSP_AGMRES*)(ksp->data);
  PetscMPIInt First, Last, rank, size;

  PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm);
  MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm);
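
  /* Record this process's neighbours on the virtual ring: ranks are ordered First..Last, the left
     neighbour of First wraps around to Last and the right neighbour of Last wraps around to First */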
  if ((rank != Last) && (rank != First)) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  } else {
    if (rank == Last) {
      agmres->Ileft  = rank - 1;
      agmres->Iright = First;
    } else {
      agmres->Ileft  = Last;
      agmres->Iright = rank + 1;
    }
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  return 0;
}
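
/*
 * KSPAGMRESRoddecGivens - builds (make_r == 1) or recovers (make_r == 0) a Givens rotation.
 * With make_r == 1 the input pair (*c, *s) holds the two entries (a, b) to be rotated; on output the
 * rotation annihilates b and *r holds a single-scalar encoding of the rotation (|*r| < 1 encodes the
 * sine, |*r| >= 1 encodes the cosine, *r == 1 means c = 0), so one scalar per rotation can be kept
 * in Qloc. With make_r == 0, (*c, *s) are recovered from the stored value *r.
 */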
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  return 0;
}

/*
 * Compute the QR factorization of the Krylov basis vectors
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output:
 *  - agmres->Qloc : product of elementary reflectors for the QR of the (local part) of the vectors.
 *  - agmres->sgn : sign of the rotations
 *  - agmres->tloc : scalar factors of the elementary reflectors.
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES  *agmres  = (KSP_AGMRES*)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc    = agmres->Qloc;
  PetscScalar *sgn     = agmres->sgn;
  PetscScalar *tloc    = agmres->tloc;
  PetscReal   *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank    = agmres->rank;
  PetscMPIInt  First   = agmres->First;
  PetscMPIInt  Last    = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar *col;
  MPI_Status   status;
  PetscBLASInt N = MAXKSPSIZE + 1;

  PetscObjectGetComm((PetscObject)ksp, &comm);
  PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0);
  PetscArrayzero(agmres->Rloc, N*N);
  /* check input arguments */
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscBLASIntCast(nloc, &bnloc);
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    VecGetArray(VEC_V(j), &col);
    PetscStackCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    VecRestoreArray(VEC_V(j), &col);
  }
  /* Each process performs a local QR on its own block */
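  /* Column j is reduced with a Householder reflector: rho is the signed norm of the active part of
     the column, the reflector vector is stored below the diagonal of Qloc (scaled so its first entry
     is 1) with its scalar factor in tloc[j], the reflector is applied to the remaining columns, and
     the diagonal entry of Qloc receives rho, i.e. the corresponding diagonal entry of the local R */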
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    PetscStackCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal", BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len            = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /* annihilate undesirable Rloc, diagonal by diagonal */
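  /* Ring pipeline: for each diagonal d, the First process sends row d of its local triangular factor
     to its right neighbour; every other process receives the partially reduced row in wbufptr, uses
     Givens rotations to eliminate its own local entries against it (keeping the compact rotation
     encodings in Qloc) and forwards the updated row; the Last process ends up with row d of the global R */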
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy", BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
    } else {
      MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d*nloc];
      KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   = c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] = s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
      }
      /* zero-out the d-th diagonal of Rloc ... */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j*nloc+i-1];
        s = Qloc[j*nloc+i];
        KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   = s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy", BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }
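
  /* On the last process, make the diagonal of R nonnegative; the signs are stored in sgn and
     accounted for again when Q is applied in KSPAGMRESRodvec */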
  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscBLASIntCast(pos, &bpos);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal", BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD : should not be needed
   */
  MPI_Bcast(agmres->Rloc, N*N, MPIU_SCALAR, Last, comm);
  PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0);
  return 0;
}

/*
 * Computes Out <-- Q * In, where Q is the orthogonal matrix computed by KSPAGMRESRoddec
 * Input:
 *  - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *  - In : input vector (size nvec)
 * Output:
 *  - Out : PETSc vector (distributed as the basis vectors)
 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES*)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc   = agmres->Qloc;
  PetscScalar *sgn    = agmres->sgn;
  PetscScalar *tloc   = agmres->tloc;
  PetscMPIInt  rank   = agmres->rank;
  PetscMPIInt  First  = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscBLASIntCast(nvec, &bnvec);
  PetscObjectGetComm((PetscObject)ksp, &comm);
  pas = 1;
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscMalloc1(nvec, &y);
  PetscArraycpy(y, In, nvec);
  VecGetArray(Out, &zloc);
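
  /* Undo the sign normalization performed at the end of KSPAGMRESRoddec: on the last process the
     entries of the input vector are flipped according to sgn before Q is applied */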
  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
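    /* Apply the Givens rotations of the ring elimination in reverse order: each process recovers the
       rotations from the compact encodings stored in Qloc (make_r == 0) and rotates its local entries,
       while the entries being assembled are passed along the ring from Last towards First */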
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j*nloc+i]), 0);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] = c * zp + s * zq;
          zloc[i]   = -s * zp + c * zq;
        }
        KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d*nloc]), 0);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        } else {
          MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        }
      }
    }
  }
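  /* Apply the local Householder reflectors from the initial QR factorization, in reverse order */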
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscBLASIntCast(len, &blen);
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  VecRestoreArray(Out, &zloc);
  PetscFree(y);
  return 0;
}