Actual source code: matstash.c
2: #include <petsc/private/matimpl.h>
4: #define DEFAULT_STASH_SIZE 10000
6: static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
7: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
8: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
9: #if !defined(PETSC_HAVE_MPIUNI)
10: static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
11: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
12: static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
13: #endif
15: /*
16: MatStashCreate_Private - Creates a stash, currently used for all the parallel
17: matrix implementations. The stash is where elements of a matrix destined
18: to be stored on other processors are kept until matrix assembly is done.
20: This is a simple-minded stash. It simply adds entries to the end of the stash.
22: Input Parameters:
23: comm - communicator, required for scatters.
24: bs - stash block size, used when stashing blocks of values
26: Output Parameters:
27: stash - the newly created stash
28: */
29: PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
30: {
31: PetscInt max,*opt,nopt,i;
32: PetscBool flg;
34: /* Require 2 tags, get the second using PetscCommGetNewTag() */
35: stash->comm = comm;
37: PetscCommGetNewTag(stash->comm,&stash->tag1);
38: PetscCommGetNewTag(stash->comm,&stash->tag2);
39: MPI_Comm_size(stash->comm,&stash->size);
40: MPI_Comm_rank(stash->comm,&stash->rank);
41: PetscMalloc1(2*stash->size,&stash->flg_v);
42: for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
44: nopt = stash->size;
45: PetscMalloc1(nopt,&opt);
46: PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg);
47: if (flg) {
48: if (nopt == 1) max = opt[0];
49: else if (nopt == stash->size) max = opt[stash->rank];
50: else if (stash->rank < nopt) max = opt[stash->rank];
51: else max = 0; /* Use default */
52: stash->umax = max;
53: } else {
54: stash->umax = 0;
55: }
56: PetscFree(opt);
57: if (bs <= 0) bs = 1;
59: stash->bs = bs;
60: stash->nmax = 0;
61: stash->oldnmax = 0;
62: stash->n = 0;
63: stash->reallocs = -1;
64: stash->space_head = NULL;
65: stash->space = NULL;
67: stash->send_waits = NULL;
68: stash->recv_waits = NULL;
69: stash->send_status = NULL;
70: stash->nsends = 0;
71: stash->nrecvs = 0;
72: stash->svalues = NULL;
73: stash->rvalues = NULL;
74: stash->rindices = NULL;
75: stash->nprocessed = 0;
76: stash->reproduce = PETSC_FALSE;
77: stash->blocktype = MPI_DATATYPE_NULL;
79: PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL);
80: #if !defined(PETSC_HAVE_MPIUNI)
81: flg = PETSC_FALSE;
82: PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL);
83: if (!flg) {
84: stash->ScatterBegin = MatStashScatterBegin_BTS;
85: stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
86: stash->ScatterEnd = MatStashScatterEnd_BTS;
87: stash->ScatterDestroy = MatStashScatterDestroy_BTS;
88: } else {
89: #endif
90: stash->ScatterBegin = MatStashScatterBegin_Ref;
91: stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
92: stash->ScatterEnd = MatStashScatterEnd_Ref;
93: stash->ScatterDestroy = NULL;
94: #if !defined(PETSC_HAVE_MPIUNI)
95: }
96: #endif
97: return 0;
98: }
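/*
   Illustrative usage sketch: how a hypothetical parallel matrix constructor might create
   and later destroy its stash. The variables 'mat' and 'bs' are assumed caller context,
   not defined in this file.

     MatStash stash;
     MatStashCreate_Private(PetscObjectComm((PetscObject)mat),bs,&stash);
     ... stash values, scatter, assemble ...
     MatStashDestroy_Private(&stash);

   The initial size can also be tuned at run time with -matstash_initial_size <n>, either
   as a single value used on every rank or as one value per rank.
*/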
100: /*
101: MatStashDestroy_Private - Destroy the stash
102: */
103: PetscErrorCode MatStashDestroy_Private(MatStash *stash)
104: {
105: PetscMatStashSpaceDestroy(&stash->space_head);
106: if (stash->ScatterDestroy) (*stash->ScatterDestroy)(stash);
108: stash->space = NULL;
110: PetscFree(stash->flg_v);
111: return 0;
112: }
114: /*
115: MatStashScatterEnd_Private - This is called as the final stage of
116: scatter. The final stages of message passing are done here, and
117: all the memory used for message passing is cleaned up. This
118: routine also resets the stash, and deallocates the memory used
119: for the stash. It also keeps track of the current memory usage
120: so that the same value can be used the next time through.
121: */
122: PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
123: {
124: (*stash->ScatterEnd)(stash);
125: return 0;
126: }
128: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
129: {
130: PetscInt nsends=stash->nsends,bs2,oldnmax,i;
131: MPI_Status *send_status;
133: for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
134: /* wait on sends */
135: if (nsends) {
136: PetscMalloc1(2*nsends,&send_status);
137: MPI_Waitall(2*nsends,stash->send_waits,send_status);
138: PetscFree(send_status);
139: }
141: /* Now update oldnmax to be approximately 10% more than the maximum n used, so that
142: the wasted space is reduced the next time this stash is used.
143: Only update oldnmax if it increases */
144: if (stash->n) {
145: bs2 = stash->bs*stash->bs;
146: oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
147: if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
148: }
150: stash->nmax = 0;
151: stash->n = 0;
152: stash->reallocs = -1;
153: stash->nprocessed = 0;
155: PetscMatStashSpaceDestroy(&stash->space_head);
157: stash->space = NULL;
159: PetscFree(stash->send_waits);
160: PetscFree(stash->recv_waits);
161: PetscFree2(stash->svalues,stash->sindices);
162: PetscFree(stash->rvalues[0]);
163: PetscFree(stash->rvalues);
164: PetscFree(stash->rindices[0]);
165: PetscFree(stash->rindices);
166: return 0;
167: }
169: /*
170: MatStashGetInfo_Private - Gets the relevant statistics of the stash
172: Input Parameter:
173: stash - the stash
Output Parameters:
174: nstash - the size of the stash, i.e., the number of values stored.
175: reallocs - the number of additional mallocs incurred.
177: */
178: PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
179: {
180: PetscInt bs2 = stash->bs*stash->bs;
182: if (nstash) *nstash = stash->n*bs2;
183: if (reallocs) {
184: if (stash->reallocs < 0) *reallocs = 0;
185: else *reallocs = stash->reallocs;
186: }
187: return 0;
188: }
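/*
   Illustrative sketch: a hypothetical caller reporting stash statistics after assembly
   ('mat' is assumed caller context).

     PetscInt nstash,reallocs;
     MatStashGetInfo_Private(&mat->stash,&nstash,&reallocs);
     PetscInfo(NULL,"Stash holds %" PetscInt_FMT " entries after %" PetscInt_FMT " reallocations\n",nstash,reallocs);
*/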
190: /*
191: MatStashSetInitialSize_Private - Sets the initial size of the stash
193: Input Parameters:
194: stash - the stash
195: max - the value that is used as the max size of the stash.
196: This value is used while allocating memory.
197: */
198: PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
199: {
200: stash->umax = max;
201: return 0;
202: }
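/*
   Illustrative sketch: a hypothetical caller that can estimate how many off-process scalar
   entries it will generate may pre-size the stash to avoid reallocations. Note that 'max'
   is counted in scalar entries; MatStashExpand_Private() converts it to blocks by dividing
   by bs*bs.

     MatStashSetInitialSize_Private(&mat->stash,nest);   /* nest = estimated number of stashed scalar entries */
*/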
204: /* MatStashExpand_Private - Expand the stash. This function is called
205: when the space in the stash is not sufficient to add the new values
206: being inserted into the stash.
208: Input Parameters:
209: stash - the stash
210: incr - the minimum increase requested
212: Notes:
213: This routine doubles the currently used memory.
214: */
215: static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
216: {
217: PetscInt newnmax,bs2= stash->bs*stash->bs;
219: /* allocate a larger stash */
220: if (!stash->oldnmax && !stash->nmax) { /* new stash */
221: if (stash->umax) newnmax = stash->umax/bs2;
222: else newnmax = DEFAULT_STASH_SIZE/bs2;
223: } else if (!stash->nmax) { /* reusing stash */
224: if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
225: else newnmax = stash->oldnmax/bs2;
226: } else newnmax = stash->nmax*2;
227: if (newnmax < (stash->nmax + incr)) newnmax += 2*incr;
229: /* Get a MatStashSpace and attach it to stash */
230: PetscMatStashSpaceGet(bs2,newnmax,&stash->space);
231: if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
232: stash->space_head = stash->space;
233: }
235: stash->reallocs++;
236: stash->nmax = newnmax;
237: return 0;
238: }
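/*
   Worked example of the sizing logic above (illustrative): for a brand new stash with
   bs = 2 (so bs2 = 4) and no user-supplied -matstash_initial_size, the first expansion
   gives newnmax = DEFAULT_STASH_SIZE/bs2 = 10000/4 = 2500 blocks. A later expansion
   doubles the current nmax, and if that still cannot hold nmax + incr blocks, another
   2*incr blocks are added on top.
*/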
239: /*
240: MatStashValuesRow_Private - inserts values into the stash. This function
241: expects the values to be row-oriented. Multiple columns belonging to the same row
242: can be inserted with a single call to this function.
244: Input Parameters:
245: stash - the stash
246: row - the global row corresponding to the values
247: n - the number of elements inserted. All elements belong to the above row.
248: idxn - the global column indices corresponding to each of the values.
249: values - the values inserted
250: */
251: PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
252: {
253: PetscInt i,k,cnt = 0;
254: PetscMatStashSpace space=stash->space;
256: /* Check and see if we have sufficient memory */
257: if (!space || space->local_remaining < n) {
258: MatStashExpand_Private(stash,n);
259: }
260: space = stash->space;
261: k = space->local_used;
262: for (i=0; i<n; i++) {
263: if (ignorezeroentries && values && values[i] == 0.0) continue;
264: space->idx[k] = row;
265: space->idy[k] = idxn[i];
266: space->val[k] = values ? values[i] : 0.0;
267: k++;
268: cnt++;
269: }
270: stash->n += cnt;
271: space->local_used += cnt;
272: space->local_remaining -= cnt;
273: return 0;
274: }
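/*
   Illustrative sketch: stashing three entries of one off-process row ('mat' and 'grow'
   are assumed caller context).

     PetscInt    gcols[3] = {10,11,12};
     PetscScalar vals[3]  = {1.0,2.0,3.0};
     MatStashValuesRow_Private(&mat->stash,grow,3,gcols,vals,PETSC_FALSE);
*/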
276: /*
277: MatStashValuesCol_Private - inserts values into the stash. This function
278: expects the values to be column-oriented. Multiple columns belonging to the same row
279: can be inserted with a single call to this function.
281: Input Parameters:
282: stash - the stash
283: row - the global row corresponding to the values
284: n - the number of elements inserted. All elements belong to the above row.
285: idxn - the global column indices corresponding to each of the values.
286: values - the values inserted
287: stepval - consecutive values are separated by a distance of stepval.
288: This happens because the input is column-oriented.
289: */
290: PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
291: {
292: PetscInt i,k,cnt = 0;
293: PetscMatStashSpace space=stash->space;
295: /* Check and see if we have sufficient memory */
296: if (!space || space->local_remaining < n) {
297: MatStashExpand_Private(stash,n);
298: }
299: space = stash->space;
300: k = space->local_used;
301: for (i=0; i<n; i++) {
302: if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
303: space->idx[k] = row;
304: space->idy[k] = idxn[i];
305: space->val[k] = values ? values[i*stepval] : 0.0;
306: k++;
307: cnt++;
308: }
309: stash->n += cnt;
310: space->local_used += cnt;
311: space->local_remaining -= cnt;
312: return 0;
313: }
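/*
   Illustrative sketch: the caller holds an m-by-n dense block in column-major order, so
   the entries of local row i are v[i], v[i+m], v[i+2*m], ... Passing v+i with stepval = m
   stashes exactly that row ('mat', 'grow', 'gcols', 'v', 'm', 'n' and 'i' are assumed
   caller context).

     MatStashValuesCol_Private(&mat->stash,grow,n,gcols,v+i,m,PETSC_FALSE);
*/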
315: /*
316: MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
317: This function expects the values to be row-oriented. Multiple columns belonging
318: to the same block-row can be inserted with a single call to this function.
319: This function extracts the sub-block of values based on the dimensions of
320: the original input block, and the row,col values corresponding to the blocks.
322: Input Parameters:
323: stash - the stash
324: row - the global block-row corresponding to the values
325: n - the number of elements inserted. All elements belong to the above row.
326: idxn - the global block-column indices corresponding to each of the blocks of
327: values. Each block is of size bs*bs.
328: values - the values inserted
329: rmax - the number of block-rows in the original block.
330: cmax - the number of block-columns in the original block.
331: idx - the index of the current block-row in the original block.
332: */
333: PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
334: {
335: PetscInt i,j,k,bs2,bs=stash->bs,l;
336: const PetscScalar *vals;
337: PetscScalar *array;
338: PetscMatStashSpace space=stash->space;
340: if (!space || space->local_remaining < n) {
341: MatStashExpand_Private(stash,n);
342: }
343: space = stash->space;
344: l = space->local_used;
345: bs2 = bs*bs;
346: for (i=0; i<n; i++) {
347: space->idx[l] = row;
348: space->idy[l] = idxn[i];
349: /* Now copy over the block of values. Store the values column-oriented.
350: This enables inserting multiple blocks belonging to a row with a single
351: function call */
352: array = space->val + bs2*l;
353: vals = values + idx*bs2*n + bs*i;
354: for (j=0; j<bs; j++) {
355: for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
356: array++;
357: vals += cmax*bs;
358: }
359: l++;
360: }
361: stash->n += n;
362: space->local_used += n;
363: space->local_remaining -= n;
364: return 0;
365: }
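/*
   Illustrative layout sketch: with bs = 2 and a row-oriented input holding rmax-by-cmax
   blocks, e.g. rmax = cmax = n = 2,

        a00 a01 | a02 a03
        a10 a11 | a12 a13
        --------+--------
        a20 a21 | a22 a23
        a30 a31 | a32 a33

   a call with idx = 1 and block-column i = 0 copies the lower-left 2x2 block and stores
   it column-oriented in the stash as a20,a30,a21,a31.
*/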
367: /*
368: MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
369: This function expects the values to be column-oriented. Multiple columns belonging
370: to the same block-row can be inserted with a single call to this function.
371: This function extracts the sub-block of values based on the dimensions of
372: the original input block, and the row,col values corresponding to the blocks.
374: Input Parameters:
375: stash - the stash
376: row - the global block-row corresponding to the values
377: n - the number of elements inserted. All elements belong to the above row.
378: idxn - the global block-column indices corresponding to each of the blocks of
379: values. Each block is of size bs*bs.
380: values - the values inserted
381: rmax - the number of block-rows in the original block.
382: cmax - the number of block-columns in the original block.
383: idx - the index of the current block-row in the original block.
384: */
385: PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
386: {
387: PetscInt i,j,k,bs2,bs=stash->bs,l;
388: const PetscScalar *vals;
389: PetscScalar *array;
390: PetscMatStashSpace space=stash->space;
392: if (!space || space->local_remaining < n) {
393: MatStashExpand_Private(stash,n);
394: }
395: space = stash->space;
396: l = space->local_used;
397: bs2 = bs*bs;
398: for (i=0; i<n; i++) {
399: space->idx[l] = row;
400: space->idy[l] = idxn[i];
401: /* Now copy over the block of values. Store the values column-oriented.
402: This enables inserting multiple blocks belonging to a row with a single
403: function call */
404: array = space->val + bs2*l;
405: vals = values + idx*bs2*n + bs*i;
406: for (j=0; j<bs; j++) {
407: for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
408: array += bs;
409: vals += rmax*bs;
410: }
411: l++;
412: }
413: stash->n += n;
414: space->local_used += n;
415: space->local_remaining -= n;
416: return 0;
417: }
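/*
   Illustrative note on the copy loop above: because the input here is column-oriented,
   each column of a bs-by-bs block is already contiguous in 'values', so the inner loop
   copies it verbatim (array[k] = vals[k]) and advances by rmax*bs to the next column.
   The row-oriented variant above instead transposes while copying (array[k*bs] = vals[k]).
*/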
418: /*
419: MatStashScatterBegin_Private - Initiates the transfer of values to the
420: correct owners. This function goes through the stash, checks the
421: owner of each stashed value, and sends the values off to the owner
422: processors.
424: Input Parameters:
425: stash - the stash
426: owners - an array of size 'no-of-procs'+1 which gives the ownership range
427: for each process.
429: Notes:
430: In the case of the blocked stash, the 'owners' array has the
431: ranges specified in blocked global indices, and for the regular stash in
432: regular global indices.
433: */
434: PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
435: {
436: (*stash->ScatterBegin)(mat,stash,owners);
437: return 0;
438: }
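/*
   Illustrative sketch of the 'owners' argument: for three processes owning rows
   [0,4), [4,9) and [9,12), the caller passes owners[] = {0,4,9,12} (size+1 entries);
   a stashed entry with global row 6 is then routed to process 1 because
   owners[1] <= 6 < owners[2]. For the blocked stash the same array is given in
   block-row indices.
*/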
440: static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
441: {
442: PetscInt *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
443: PetscInt size=stash->size,nsends;
444: PetscInt count,*sindices,**rindices,i,j,idx,lastidx,l;
445: PetscScalar **rvalues,*svalues;
446: MPI_Comm comm = stash->comm;
447: MPI_Request *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
448: PetscMPIInt *sizes,*nlengths,nreceives;
449: PetscInt *sp_idx,*sp_idy;
450: PetscScalar *sp_val;
451: PetscMatStashSpace space,space_next;
453: { /* make sure all processors are either in INSERTMODE or ADDMODE */
454: InsertMode addv;
455: MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));
457: mat->insertmode = addv; /* in case this processor had no cache */
458: }
460: bs2 = stash->bs*stash->bs;
462: /* first count number of contributors to each processor */
463: PetscCalloc1(size,&nlengths);
464: PetscMalloc1(stash->n+1,&owner);
466: i = j = 0;
467: lastidx = -1;
468: space = stash->space_head;
469: while (space) {
470: space_next = space->next;
471: sp_idx = space->idx;
472: for (l=0; l<space->local_used; l++) {
473: /* if indices are NOT locally sorted, need to start search at the beginning */
474: if (lastidx > (idx = sp_idx[l])) j = 0;
475: lastidx = idx;
476: for (; j<size; j++) {
477: if (idx >= owners[j] && idx < owners[j+1]) {
478: nlengths[j]++; owner[i] = j; break;
479: }
480: }
481: i++;
482: }
483: space = space_next;
484: }
486: /* Now check what procs get messages - and compute nsends. */
487: PetscCalloc1(size,&sizes);
488: for (i=0, nsends=0; i<size; i++) {
489: if (nlengths[i]) {
490: sizes[i] = 1; nsends++;
491: }
492: }
494: {PetscMPIInt *onodes,*olengths;
495: /* Determine the number of messages to expect, their lengths, and their from-ids */
496: PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);
497: PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);
498: /* since the row and col indices are clubbed together, lengths are multiplied by 2 */
499: for (i=0; i<nreceives; i++) olengths[i] *=2;
500: PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);
501: /* values are of size 'bs2' per entry (and remove the earlier factor of 2) */
502: for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
503: PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);
504: PetscFree(onodes);
505: PetscFree(olengths);}
507: /* do sends:
508: 1) startv[i]/starti[i] give the starting indices in svalues/sindices for entries going to
509: the ith processor
510: */
511: PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);
512: PetscMalloc1(2*nsends,&send_waits);
513: PetscMalloc2(size,&startv,size,&starti);
514: /* use 2 sends per process: one with all the values (all_a), the other with all the row and column indices (all_i and all_j) */
515: startv[0] = 0; starti[0] = 0;
516: for (i=1; i<size; i++) {
517: startv[i] = startv[i-1] + nlengths[i-1];
518: starti[i] = starti[i-1] + 2*nlengths[i-1];
519: }
521: i = 0;
522: space = stash->space_head;
523: while (space) {
524: space_next = space->next;
525: sp_idx = space->idx;
526: sp_idy = space->idy;
527: sp_val = space->val;
528: for (l=0; l<space->local_used; l++) {
529: j = owner[i];
530: if (bs2 == 1) {
531: svalues[startv[j]] = sp_val[l];
532: } else {
533: PetscInt k;
534: PetscScalar *buf1,*buf2;
535: buf1 = svalues+bs2*startv[j];
536: buf2 = space->val + bs2*l;
537: for (k=0; k<bs2; k++) buf1[k] = buf2[k];
538: }
539: sindices[starti[j]] = sp_idx[l];
540: sindices[starti[j]+nlengths[j]] = sp_idy[l];
541: startv[j]++;
542: starti[j]++;
543: i++;
544: }
545: space = space_next;
546: }
547: startv[0] = 0;
548: for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];
550: for (i=0,count=0; i<size; i++) {
551: if (sizes[i]) {
552: MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);
553: MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);
554: }
555: }
556: #if defined(PETSC_USE_INFO)
557: PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends);
558: for (i=0; i<size; i++) {
559: if (sizes[i]) {
560: PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt))));
561: }
562: }
563: #endif
564: PetscFree(nlengths);
565: PetscFree(owner);
566: PetscFree2(startv,starti);
567: PetscFree(sizes);
569: /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
570: PetscMalloc1(2*nreceives,&recv_waits);
572: for (i=0; i<nreceives; i++) {
573: recv_waits[2*i] = recv_waits1[i];
574: recv_waits[2*i+1] = recv_waits2[i];
575: }
576: stash->recv_waits = recv_waits;
578: PetscFree(recv_waits1);
579: PetscFree(recv_waits2);
581: stash->svalues = svalues;
582: stash->sindices = sindices;
583: stash->rvalues = rvalues;
584: stash->rindices = rindices;
585: stash->send_waits = send_waits;
586: stash->nsends = nsends;
587: stash->nrecvs = nreceives;
588: stash->reproduce_count = 0;
589: return 0;
590: }
592: /*
593: MatStashScatterGetMesg_Private - This function waits on the receives posted
594: in the function MatStashScatterBegin_Private() and returns one message at
595: a time to the calling function. If no messages are left, it indicates this
596: by setting flg = 0, else it sets flg = 1.
598: Input Parameters:
599: stash - the stash
601: Output Parameters:
602: nvals - the number of entries in the current message.
603: rows - an array of row indices (or blocked indices) corresponding to the values
604: cols - an array of column indices (or blocked indices) corresponding to the values
605: vals - the values
606: flg - 0 indicates no more messages are left, and the current call has no values associated.
607: 1 indicates that the current call successfully received a message, and the
608: other output parameters nvals,rows,cols,vals are set appropriately.
609: */
610: PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
611: {
612: (*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg);
613: return 0;
614: }
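/*
   Illustrative driver sketch: how a hypothetical assembly routine consumes the stash
   ('mat' is assumed caller context and 'owners' is its ownership-range array).

     PetscMPIInt n;
     PetscInt    *row,*col,flg;
     PetscScalar *val;

     MatStashScatterBegin_Private(mat,&mat->stash,owners);
     MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);
     while (flg) {
       ... insert the n received (row[j],col[j],val) entries into the local matrix ...
       MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg);
     }
     MatStashScatterEnd_Private(&mat->stash);
*/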
616: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
617: {
618: PetscMPIInt i,*flg_v = stash->flg_v,i1,i2;
619: PetscInt bs2;
620: MPI_Status recv_status;
621: PetscBool match_found = PETSC_FALSE;
623: *flg = 0; /* When a message is discovered this is reset to 1 */
624: /* Return if no more messages to process */
625: if (stash->nprocessed == stash->nrecvs) return 0;
627: bs2 = stash->bs*stash->bs;
628: /* If a matching pair of receives is found, process them, and return the data to
629: the calling function. Until then keep receiving messages */
630: while (!match_found) {
631: if (stash->reproduce) {
632: i = stash->reproduce_count++;
633: MPI_Wait(stash->recv_waits+i,&recv_status);
634: } else {
635: MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status);
636: }
639: /* Now pack the received message into a structure which is usable by others */
640: if (i % 2) {
641: MPI_Get_count(&recv_status,MPIU_SCALAR,nvals);
643: flg_v[2*recv_status.MPI_SOURCE] = i/2;
645: *nvals = *nvals/bs2;
646: } else {
647: MPI_Get_count(&recv_status,MPIU_INT,nvals);
649: flg_v[2*recv_status.MPI_SOURCE+1] = i/2;
651: *nvals = *nvals/2; /* This message has both row indices and col indices */
652: }
654: /* Check if we have both messages from this proc */
655: i1 = flg_v[2*recv_status.MPI_SOURCE];
656: i2 = flg_v[2*recv_status.MPI_SOURCE+1];
657: if (i1 != -1 && i2 != -1) {
658: *rows = stash->rindices[i2];
659: *cols = *rows + *nvals;
660: *vals = stash->rvalues[i1];
661: *flg = 1;
662: stash->nprocessed++;
663: match_found = PETSC_TRUE;
664: }
665: }
666: return 0;
667: }
669: #if !defined(PETSC_HAVE_MPIUNI)
670: typedef struct {
671: PetscInt row;
672: PetscInt col;
673: PetscScalar vals[1]; /* Actually an array of length bs2 */
674: } MatStashBlock;
676: static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
677: {
678: PetscMatStashSpace space;
679: PetscInt n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
680: PetscScalar **valptr;
682: PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm);
683: for (space=stash->space_head,cnt=0; space; space=space->next) {
684: for (i=0; i<space->local_used; i++) {
685: row[cnt] = space->idx[i];
686: col[cnt] = space->idy[i];
687: valptr[cnt] = &space->val[i*bs2];
688: perm[cnt] = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
689: cnt++;
690: }
691: }
693: PetscSortIntWithArrayPair(n,row,col,perm);
694: /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
695: for (rowstart=0,cnt=0,i=1; i<=n; i++) {
696: if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
697: PetscInt colstart;
698: PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]);
699: for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
700: PetscInt j,l;
701: MatStashBlock *block;
702: PetscSegBufferGet(stash->segsendblocks,1,&block);
703: block->row = row[rowstart];
704: block->col = col[colstart];
705: PetscArraycpy(block->vals,valptr[perm[colstart]],bs2);
706: for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
707: if (insertmode == ADD_VALUES) {
708: for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
709: } else {
710: PetscArraycpy(block->vals,valptr[perm[j]],bs2);
711: }
712: }
713: colstart = j;
714: }
715: rowstart = i;
716: }
717: }
718: PetscFree4(row,col,valptr,perm);
719: return 0;
720: }
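/*
   Illustrative example of the compression above: if the stash holds (2,5,1.0), (2,5,2.0)
   and (2,7,4.0) with bs = 1, then with ADD_VALUES two send blocks are produced,
   (2,5,3.0) and (2,7,4.0); with INSERT_VALUES one of the duplicate contributions to
   (2,5) simply overwrites the other.
*/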
722: static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
723: {
724: if (stash->blocktype == MPI_DATATYPE_NULL) {
725: PetscInt bs2 = PetscSqr(stash->bs);
726: PetscMPIInt blocklens[2];
727: MPI_Aint displs[2];
728: MPI_Datatype types[2],stype;
729: /* Note that DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
730: std::complex itself has standard layout, so does DummyBlock, recursively.
731: To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
732: though they can have different alignment, e.g., 16 bytes for double complex, instead of 8 bytes as in GCC libstdc++.
733: offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.
735: We can test if std::complex has standard layout with the following code:
736: #include <iostream>
737: #include <type_traits>
738: #include <complex>
739: int main() {
740: std::cout << std::boolalpha;
741: std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
742: }
743: Output: true
744: */
745: struct DummyBlock {PetscInt row,col; PetscScalar vals;};
747: stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
748: if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
749: stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
750: }
751: PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);
752: PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);
753: PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);
754: blocklens[0] = 2;
755: blocklens[1] = bs2;
756: displs[0] = offsetof(struct DummyBlock,row);
757: displs[1] = offsetof(struct DummyBlock,vals);
758: types[0] = MPIU_INT;
759: types[1] = MPIU_SCALAR;
760: MPI_Type_create_struct(2,blocklens,displs,types,&stype);
761: MPI_Type_commit(&stype);
762: MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);
763: MPI_Type_commit(&stash->blocktype);
764: MPI_Type_free(&stype);
765: }
766: return 0;
767: }
769: /* Callback invoked after the target rank has initiated the receive of the rendezvous message.
770: * Here we post the main sends.
771: */
772: static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
773: {
774: MatStash *stash = (MatStash*)ctx;
775: MatStashHeader *hdr = (MatStashHeader*)sdata;
778: MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);
779: stash->sendframes[rankid].count = hdr->count;
780: stash->sendframes[rankid].pending = 1;
781: return 0;
782: }
784: /* Callback invoked by target after receiving rendezvous message.
785: * Here we post the main recvs.
786: */
787: static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
788: {
789: MatStash *stash = (MatStash*)ctx;
790: MatStashHeader *hdr = (MatStashHeader*)rdata;
791: MatStashFrame *frame;
793: PetscSegBufferGet(stash->segrecvframe,1,&frame);
794: PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer);
795: MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);
796: frame->count = hdr->count;
797: frame->pending = 1;
798: return 0;
799: }
801: /*
802: * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
803: */
804: static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
805: {
807: size_t nblocks;
808: char *sendblocks;
810: if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
811: InsertMode addv;
812: MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));
814: }
816: MatStashBlockTypeSetUp(stash);
817: MatStashSortCompress_Private(stash,mat->insertmode);
818: PetscSegBufferGetSize(stash->segsendblocks,&nblocks);
819: PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks);
820: if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
821: PetscInt i;
822: size_t b;
823: for (i=0,b=0; i<stash->nsendranks; i++) {
824: stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
825: /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
826: stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
827: for (; b<nblocks; b++) {
828: MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
830: if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
831: stash->sendhdr[i].count++;
832: }
833: }
834: } else { /* Dynamically count and pack (first time) */
835: PetscInt sendno;
836: size_t i,rowstart;
838: /* Count number of send ranks and allocate for sends */
839: stash->nsendranks = 0;
840: for (rowstart=0; rowstart<nblocks;) {
841: PetscInt owner;
842: MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
843: PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);
844: if (owner < 0) owner = -(owner+2);
845: for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
846: MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
847: if (sendblock_i->row >= owners[owner+1]) break;
848: }
849: stash->nsendranks++;
850: rowstart = i;
851: }
852: PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes);
854: /* Set up sendhdrs and sendframes */
855: sendno = 0;
856: for (rowstart=0; rowstart<nblocks;) {
857: PetscInt owner;
858: MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
859: PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);
860: if (owner < 0) owner = -(owner+2);
861: stash->sendranks[sendno] = owner;
862: for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
863: MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
864: if (sendblock_i->row >= owners[owner+1]) break;
865: }
866: stash->sendframes[sendno].buffer = sendblock_rowstart;
867: stash->sendframes[sendno].pending = 0;
868: stash->sendhdr[sendno].count = i - rowstart;
869: sendno++;
870: rowstart = i;
871: }
873: }
875: /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
876: * message or a dummy entry of some sort. */
877: if (mat->insertmode == INSERT_VALUES) {
878: size_t i;
879: for (i=0; i<nblocks; i++) {
880: MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
881: sendblock_i->row = -(sendblock_i->row+1);
882: }
883: }
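  /*
     Illustrative round trip of the encoding above: global row r is sent as -(r+1), so
     r = 0 becomes -1 and r = 7 becomes -8; the receiver in MatStashScatterGetMesg_BTS()
     recovers r via row = -(row+1), and a negative incoming row also tells it that the
     sender used INSERT_VALUES rather than ADD_VALUES.
  */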
885: if (stash->first_assembly_done) {
886: PetscMPIInt i,tag;
887: PetscCommGetNewTag(stash->comm,&tag);
888: for (i=0; i<stash->nrecvranks; i++) {
889: MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash);
890: }
891: for (i=0; i<stash->nsendranks; i++) {
892: MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash);
893: }
894: stash->use_status = PETSC_TRUE; /* Use count from message status. */
895: } else {
896: PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
897: &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
898: MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);
899: PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses);
900: stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
901: }
903: PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes);
904: stash->recvframe_active = NULL;
905: stash->recvframe_i = 0;
906: stash->some_i = 0;
907: stash->some_count = 0;
908: stash->recvcount = 0;
909: stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
910: stash->insertmode = &mat->insertmode;
911: return 0;
912: }
914: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
915: {
916: MatStashBlock *block;
918: *flg = 0;
919: while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
920: if (stash->some_i == stash->some_count) {
921: if (stash->recvcount == stash->nrecvranks) return 0; /* Done */
922: MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE);
923: stash->some_i = 0;
924: }
925: stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
926: stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
927: if (stash->use_status) { /* Count what was actually sent */
928: MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count);
929: }
930: if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
931: block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
932: if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
935: }
936: stash->some_i++;
937: stash->recvcount++;
938: stash->recvframe_i = 0;
939: }
940: *n = 1;
941: block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
942: if (block->row < 0) block->row = -(block->row + 1);
943: *row = &block->row;
944: *col = &block->col;
945: *val = block->vals;
946: stash->recvframe_i++;
947: *flg = 1;
948: return 0;
949: }
951: static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
952: {
953: MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE);
954: if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks */
955: void *dummy;
956: PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy);
957: } else { /* No reuse, so collect everything. */
958: MatStashScatterDestroy_BTS(stash);
959: }
961: /* Now update oldnmax to be approximately 10% more than the maximum n used, so that
962: the wasted space is reduced the next time this stash is used.
963: Only update oldnmax if it increases */
964: if (stash->n) {
965: PetscInt bs2 = stash->bs*stash->bs;
966: PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
967: if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
968: }
970: stash->nmax = 0;
971: stash->n = 0;
972: stash->reallocs = -1;
973: stash->nprocessed = 0;
975: PetscMatStashSpaceDestroy(&stash->space_head);
977: stash->space = NULL;
979: return 0;
980: }
982: PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
983: {
984: PetscSegBufferDestroy(&stash->segsendblocks);
985: PetscSegBufferDestroy(&stash->segrecvframe);
986: stash->recvframes = NULL;
987: PetscSegBufferDestroy(&stash->segrecvblocks);
988: if (stash->blocktype != MPI_DATATYPE_NULL) {
989: MPI_Type_free(&stash->blocktype);
990: }
991: stash->nsendranks = 0;
992: stash->nrecvranks = 0;
993: PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes);
994: PetscFree(stash->sendreqs);
995: PetscFree(stash->recvreqs);
996: PetscFree(stash->recvranks);
997: PetscFree(stash->recvhdr);
998: PetscFree2(stash->some_indices,stash->some_statuses);
999: return 0;
1000: }
1001: #endif