Actual source code: baijfact11.c


  2: /*
  3:     Factorization code for BAIJ format.
  4: */
  5: #include <../src/mat/impls/baij/seq/baij.h>
  6: #include <petsc/private/kernels/blockinvert.h>

  8: /* ------------------------------------------------------------*/
  9: /*
 10:       Version for when blocks are 4 by 4
 11: */
 12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
 13: {
 14:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
 15:   IS             isrow = b->row,isicol = b->icol;
 16:   const PetscInt *r,*ic;
 17:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 18:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 19:   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
 20:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 21:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 22:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 23:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 24:   MatScalar      m13,m14,m15,m16;
 25:   MatScalar      *ba           = b->a,*aa = a->a;
 26:   PetscBool      pivotinblocks = b->pivotinblocks;
 27:   PetscReal      shift         = info->shiftamount;
 28:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

 30:   ISGetIndices(isrow,&r);
 31:   ISGetIndices(isicol,&ic);
 32:   PetscMalloc1(16*(n+1),&rtmp);
 33:   allowzeropivot = PetscNot(A->erroriffailure);

 35:   for (i=0; i<n; i++) {
 36:     nz    = bi[i+1] - bi[i];
 37:     ajtmp = bj + bi[i];
 38:     for  (j=0; j<nz; j++) {
 39:       x     = rtmp+16*ajtmp[j];
 40:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 41:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 42:     }
 43:     /* load in initial (unfactored row) */
 44:     idx      = r[i];
 45:     nz       = ai[idx+1] - ai[idx];
 46:     ajtmpold = aj + ai[idx];
 47:     v        = aa + 16*ai[idx];
 48:     for (j=0; j<nz; j++) {
 49:       x     = rtmp+16*ic[ajtmpold[j]];
 50:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 51:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 52:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 53:       x[14] = v[14]; x[15] = v[15];
 54:       v    += 16;
 55:     }
 56:     row = *ajtmp++;
 57:     while (row < i) {
 58:       pc  = rtmp + 16*row;
 59:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 60:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 61:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 62:       p15 = pc[14]; p16 = pc[15];
 63:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 64:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 65:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 66:           || p16 != 0.0) {
 67:         pv    = ba + 16*diag_offset[row];
 68:         pj    = bj + diag_offset[row] + 1;
 69:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 70:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 71:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 72:         x15   = pv[14]; x16 = pv[15];
 73:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 74:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 75:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 76:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 78:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 79:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 80:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 81:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 83:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 84:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 85:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 86:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 88:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 89:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 90:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 91:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

 93:         nz  = bi[row+1] - diag_offset[row] - 1;
 94:         pv += 16;
 95:         for (j=0; j<nz; j++) {
 96:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
 97:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
 98:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
 99:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
100:           x     = rtmp + 16*pj[j];
101:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
102:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
103:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
104:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

106:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
107:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
108:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
109:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

111:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
112:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
113:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
114:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

116:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
117:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
118:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
119:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

121:           pv += 16;
122:         }
123:         PetscLogFlops(128.0*nz+112.0);
124:       }
125:       row = *ajtmp++;
126:     }
127:     /* finished row so stick it into b->a */
128:     pv = ba + 16*bi[i];
129:     pj = bj + bi[i];
130:     nz = bi[i+1] - bi[i];
131:     for (j=0; j<nz; j++) {
132:       x      = rtmp+16*pj[j];
133:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
134:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
135:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
136:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
137:       pv    += 16;
138:     }
139:     /* invert diagonal block */
140:     w = ba + 16*diag_offset[i];
141:     if (pivotinblocks) {
142:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
143:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
144:     } else {
145:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
146:     }
147:   }

149:   PetscFree(rtmp);
150:   ISRestoreIndices(isicol,&ic);
151:   ISRestoreIndices(isrow,&r);

153:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
154:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
155:   C->assembled           = PETSC_TRUE;

157:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
158:   return 0;
159: }

161: /* MatLUFactorNumeric_SeqBAIJ_4 -
162:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), with the following generic kernels replaced by hand-written block-size-4 versions:
163:        PetscKernel_A_gets_A_times_B()
164:        PetscKernel_A_gets_A_minus_B_times_C()
165:        PetscKernel_A_gets_inverse_A()
166: */
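
For reference, a minimal non-unrolled sketch of what the two fixed-size multiply kernels compute on 4x4 blocks is shown below. The index arithmetic matches the unrolled code in MatLUFactorNumeric_SeqBAIJ_4_inplace() above, which treats each 16-entry block as column-major (entry (r,c) stored at index 4*c+r). The Sketch_* names are illustrative only; they are not PETSc API.

/* Illustrative sketch, not the PETSc kernels themselves. */
static void Sketch_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt i,j,k;
  for (i=0; i<16; i++) W[i] = A[i];                 /* save A, like the mwork scratch block */
  for (j=0; j<4; j++) {
    for (i=0; i<4; i++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += W[4*k+i]*B[4*j+k];   /* (A_old*B)(i,j) */
      A[4*j+i] = s;
    }
  }
}

static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt i,j,k;
  for (j=0; j<4; j++)
    for (i=0; i<4; i++)
      for (k=0; k<4; k++) A[4*j+i] -= B[4*k+i]*C[4*j+k];   /* A -= B*C */
}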

168: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
169: {
170:   Mat            C     = B;
171:   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
172:   IS             isrow = b->row,isicol = b->icol;
173:   const PetscInt *r,*ic;
174:   PetscInt       i,j,k,nz,nzL,row;
175:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
176:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
177:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
178:   PetscInt       flg;
179:   PetscReal      shift;
180:   PetscBool      allowzeropivot,zeropivotdetected;

182:   allowzeropivot = PetscNot(A->erroriffailure);
183:   ISGetIndices(isrow,&r);
184:   ISGetIndices(isicol,&ic);

186:   if (info->shifttype == MAT_SHIFT_NONE) {
187:     shift = 0;
188:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
189:     shift = info->shiftamount;
190:   }

192:   /* generate work space needed by the factorization */
193:   PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
194:   PetscArrayzero(rtmp,bs2*n);

196:   for (i=0; i<n; i++) {
197:     /* zero rtmp */
198:     /* L part */
199:     nz    = bi[i+1] - bi[i];
200:     bjtmp = bj + bi[i];
201:     for  (j=0; j<nz; j++) {
202:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
203:     }

205:     /* U part */
206:     nz    = bdiag[i]-bdiag[i+1];
207:     bjtmp = bj + bdiag[i+1]+1;
208:     for  (j=0; j<nz; j++) {
209:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
210:     }

212:     /* load in initial (unfactored row) */
213:     nz    = ai[r[i]+1] - ai[r[i]];
214:     ajtmp = aj + ai[r[i]];
215:     v     = aa + bs2*ai[r[i]];
216:     for (j=0; j<nz; j++) {
217:       PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);
218:     }

220:     /* elimination */
221:     bjtmp = bj + bi[i];
222:     nzL   = bi[i+1] - bi[i];
223:     for (k=0; k < nzL; k++) {
224:       row = bjtmp[k];
225:       pc  = rtmp + bs2*row;
226:       for (flg=0,j=0; j<bs2; j++) {
227:         if (pc[j]!=0.0) {
228:           flg = 1;
229:           break;
230:         }
231:       }
232:       if (flg) {
233:         pv = b->a + bs2*bdiag[row];
234:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
235:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

237:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
238:         pv = b->a + bs2*(bdiag[row+1]+1);
239:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
240:         for (j=0; j<nz; j++) {
241:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
242:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
243:           v    = rtmp + bs2*pj[j];
244:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
245:           pv  += bs2;
246:         }
247:         PetscLogFlops(128.0*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
248:       }
249:     }

251:     /* finished row so stick it into b->a */
252:     /* L part */
253:     pv = b->a + bs2*bi[i];
254:     pj = b->j + bi[i];
255:     nz = bi[i+1] - bi[i];
256:     for (j=0; j<nz; j++) {
257:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
258:     }

260:     /* Mark diagonal and invert diagonal for simpler triangular solves */
261:     pv   = b->a + bs2*bdiag[i];
262:     pj   = b->j + bdiag[i];
263:     PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
264:     PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
265:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

267:     /* U part */
268:     pv = b->a + bs2*(bdiag[i+1]+1);
269:     pj = b->j + bdiag[i+1]+1;
270:     nz = bdiag[i] - bdiag[i+1] - 1;
271:     for (j=0; j<nz; j++) {
272:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
273:     }
274:   }

276:   PetscFree2(rtmp,mwork);
277:   ISRestoreIndices(isicol,&ic);
278:   ISRestoreIndices(isrow,&r);

280:   C->ops->solve          = MatSolve_SeqBAIJ_4;
281:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
282:   C->assembled           = PETSC_TRUE;

284:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
285:   return 0;
286: }
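
These numeric routines are installed as function pointers by the symbolic factorization and are normally reached through PETSc's generic factorization interface rather than called directly. A minimal usage sketch follows (error checking omitted; A is assumed to be an assembled block-size-4 MATSEQBAIJ matrix and b, x compatible vectors created elsewhere):

/* Illustrative usage sketch, assuming A is an assembled bs=4 MATSEQBAIJ matrix
   and b, x are compatible vectors created elsewhere. */
Mat           F;
IS            rowperm,colperm;
MatFactorInfo info;

MatGetOrdering(A,MATORDERINGNATURAL,&rowperm,&colperm);
MatFactorInfoInitialize(&info);
MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
MatLUFactorSymbolic(F,A,rowperm,colperm,&info);
MatLUFactorNumeric(F,A,&info);   /* dispatches to a block-size-4 routine such as the one above */
MatSolve(F,b,x);

The shift handling above (MAT_SHIFT_NONE versus MAT_SHIFT_INBLOCKS) is driven by info.shifttype and info.shiftamount; when the factorization is set up through a preconditioner these are normally controlled with PCFactorSetShiftType()/PCFactorSetShiftAmount() or the corresponding -pc_factor_shift_type and -pc_factor_shift_amount options.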

288: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
289: {
290: /*
291:     Default version for when blocks are 4 by 4, using natural ordering
292: */
293:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
294:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
295:   PetscInt       *ajtmpold,*ajtmp,nz,row;
296:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
297:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
298:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
299:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
300:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
301:   MatScalar      m13,m14,m15,m16;
302:   MatScalar      *ba           = b->a,*aa = a->a;
303:   PetscBool      pivotinblocks = b->pivotinblocks;
304:   PetscReal      shift         = info->shiftamount;
305:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

307:   allowzeropivot = PetscNot(A->erroriffailure);
308:   PetscMalloc1(16*(n+1),&rtmp);

310:   for (i=0; i<n; i++) {
311:     nz    = bi[i+1] - bi[i];
312:     ajtmp = bj + bi[i];
313:     for  (j=0; j<nz; j++) {
314:       x     = rtmp+16*ajtmp[j];
315:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
316:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
317:     }
318:     /* load in initial (unfactored row) */
319:     nz       = ai[i+1] - ai[i];
320:     ajtmpold = aj + ai[i];
321:     v        = aa + 16*ai[i];
322:     for (j=0; j<nz; j++) {
323:       x     = rtmp+16*ajtmpold[j];
324:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
325:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
326:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
327:       x[14] = v[14]; x[15] = v[15];
328:       v    += 16;
329:     }
330:     row = *ajtmp++;
331:     while (row < i) {
332:       pc  = rtmp + 16*row;
333:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
334:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
335:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
336:       p15 = pc[14]; p16 = pc[15];
337:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
338:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
339:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
340:           || p16 != 0.0) {
341:         pv    = ba + 16*diag_offset[row];
342:         pj    = bj + diag_offset[row] + 1;
343:         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
344:         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
345:         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
346:         x15   = pv[14]; x16 = pv[15];
347:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
348:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
349:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
350:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

352:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
353:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
354:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
355:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

357:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
358:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
359:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
360:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

362:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
363:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
364:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
365:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
366:         nz     = bi[row+1] - diag_offset[row] - 1;
367:         pv    += 16;
368:         for (j=0; j<nz; j++) {
369:           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
370:           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
371:           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
372:           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
373:           x     = rtmp + 16*pj[j];
374:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
375:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
376:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
377:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

379:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
380:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
381:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
382:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

384:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
385:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
386:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
387:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

389:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
390:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
391:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
392:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

394:           pv += 16;
395:         }
396:         PetscLogFlops(128.0*nz+112.0);
397:       }
398:       row = *ajtmp++;
399:     }
400:     /* finished row so stick it into b->a */
401:     pv = ba + 16*bi[i];
402:     pj = bj + bi[i];
403:     nz = bi[i+1] - bi[i];
404:     for (j=0; j<nz; j++) {
405:       x      = rtmp+16*pj[j];
406:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
407:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
408:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
409:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
410:       pv    += 16;
411:     }
412:     /* invert diagonal block */
413:     w = ba + 16*diag_offset[i];
414:     if (pivotinblocks) {
415:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
416:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
417:     } else {
418:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
419:     }
420:   }

422:   PetscFree(rtmp);

424:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
425:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
426:   C->assembled           = PETSC_TRUE;

428:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
429:   return 0;
430: }

432: /*
433:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
434:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
435: */
436: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
437: {
438:   Mat            C =B;
439:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
440:   PetscInt       i,j,k,nz,nzL,row;
441:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
442:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
443:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
444:   PetscInt       flg;
445:   PetscReal      shift;
446:   PetscBool      allowzeropivot,zeropivotdetected;

448:   allowzeropivot = PetscNot(A->erroriffailure);

450:   /* generate work space needed by the factorization */
451:   PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
452:   PetscArrayzero(rtmp,bs2*n);

454:   if (info->shifttype == MAT_SHIFT_NONE) {
455:     shift = 0;
456:   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
457:     shift = info->shiftamount;
458:   }

460:   for (i=0; i<n; i++) {
461:     /* zero rtmp */
462:     /* L part */
463:     nz    = bi[i+1] - bi[i];
464:     bjtmp = bj + bi[i];
465:     for  (j=0; j<nz; j++) {
466:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
467:     }

469:     /* U part */
470:     nz    = bdiag[i] - bdiag[i+1];
471:     bjtmp = bj + bdiag[i+1]+1;
472:     for  (j=0; j<nz; j++) {
473:       PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
474:     }

476:     /* load in initial (unfactored row) */
477:     nz    = ai[i+1] - ai[i];
478:     ajtmp = aj + ai[i];
479:     v     = aa + bs2*ai[i];
480:     for (j=0; j<nz; j++) {
481:       PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);
482:     }

484:     /* elimination */
485:     bjtmp = bj + bi[i];
486:     nzL   = bi[i+1] - bi[i];
487:     for (k=0; k < nzL; k++) {
488:       row = bjtmp[k];
489:       pc  = rtmp + bs2*row;
490:       for (flg=0,j=0; j<bs2; j++) {
491:         if (pc[j]!=0.0) {
492:           flg = 1;
493:           break;
494:         }
495:       }
496:       if (flg) {
497:         pv = b->a + bs2*bdiag[row];
498:         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
499:         PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);

501:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
502:         pv = b->a + bs2*(bdiag[row+1]+1);
503:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
504:         for (j=0; j<nz; j++) {
505:           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
506:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
507:           v    = rtmp + bs2*pj[j];
508:           PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
509:           pv  += bs2;
510:         }
511:         PetscLogFlops(128.0*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
512:       }
513:     }

515:     /* finished row so stick it into b->a */
516:     /* L part */
517:     pv = b->a + bs2*bi[i];
518:     pj = b->j + bi[i];
519:     nz = bi[i+1] - bi[i];
520:     for (j=0; j<nz; j++) {
521:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
522:     }

524:     /* Mark diagonal and invert diagonal for simpler triangular solves */
525:     pv   = b->a + bs2*bdiag[i];
526:     pj   = b->j + bdiag[i];
527:     PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
528:     PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
529:     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

531:     /* U part */
532:     pv = b->a + bs2*(bdiag[i+1]+1);
533:     pj = b->j + bdiag[i+1]+1;
534:     nz = bdiag[i] - bdiag[i+1] - 1;
535:     for (j=0; j<nz; j++) {
536:       PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
537:     }
538:   }
539:   PetscFree2(rtmp,mwork);

541:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
542:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
543:   C->assembled           = PETSC_TRUE;

545:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
546:   return 0;
547: }
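
The layout written above, L blocks at bj[bi[i]..bi[i+1]), off-diagonal U blocks at bj[bdiag[i+1]+1..bdiag[i]), and the already-inverted diagonal block at bdiag[i], is what keeps the triangular solves cheap: back substitution needs only 4x4 block matrix-vector products and never a block solve. The sketch below illustrates that recurrence for the natural-ordering case; it is an illustration only, not the source of MatSolve_SeqBAIJ_4_NaturalOrdering().

/* Illustrative sketch of a forward/back solve over the factor layout above
   (natural ordering, so no row/column permutation is applied). */
static void Sketch_Solve_BS4(const Mat_SeqBAIJ *b,const PetscScalar *rhs,PetscScalar *x)
{
  const PetscInt  n = b->mbs,*bi = b->i,*bj = b->j,*bdiag = b->diag;
  const MatScalar *ba = b->a;
  PetscScalar     t[4];
  PetscInt        i,j,k,l;

  /* forward solve L*t = rhs (unit block diagonal in L), storing t in x */
  for (i=0; i<n; i++) {
    for (k=0; k<4; k++) t[k] = rhs[4*i+k];
    for (j=bi[i]; j<bi[i+1]; j++)                    /* L(i,:) blocks */
      for (l=0; l<4; l++)
        for (k=0; k<4; k++) t[k] -= ba[16*j+4*l+k]*x[4*bj[j]+l];
    for (k=0; k<4; k++) x[4*i+k] = t[k];
  }
  /* back solve U*x = t, using the stored inv(D_i) at bdiag[i] */
  for (i=n-1; i>=0; i--) {
    for (k=0; k<4; k++) t[k] = x[4*i+k];
    for (j=bdiag[i+1]+1; j<bdiag[i]; j++)            /* off-diagonal U(i,:) blocks */
      for (l=0; l<4; l++)
        for (k=0; k<4; k++) t[k] -= ba[16*j+4*l+k]*x[4*bj[j]+l];
    for (k=0; k<4; k++) x[4*i+k] = 0.0;
    for (l=0; l<4; l++)                              /* x_i = inv(D_i)*t */
      for (k=0; k<4; k++) x[4*i+k] += ba[16*bdiag[i]+4*l+k]*t[l];
  }
}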

549: #if defined(PETSC_HAVE_SSE)

551: #include PETSC_HAVE_SSE
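/* Note (assumption about the build system): when SSE support is enabled, PETSc's configure
   defines PETSC_HAVE_SSE as the quoted path of the SSE macro header, so this include pulls
   in the SSE_*, XMM*, MOVEMASK, and PREFETCH_L1 macros used below. */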

553: /* SSE version for when blocks are 4 by 4, using natural ordering */
554: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
555: {
556:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
557:   int            i,j,n = a->mbs;
558:   int            *bj = b->j,*bjtmp,*pj;
559:   int            row;
560:   int            *ajtmpold,nz,*bi=b->i;
561:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
562:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
563:   MatScalar      *ba    = b->a,*aa = a->a;
564:   int            nonzero=0;
565:   PetscBool      pivotinblocks = b->pivotinblocks;
566:   PetscReal      shift         = info->shiftamount;
567:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

569:   allowzeropivot = PetscNot(A->erroriffailure);
570:   SSE_SCOPE_BEGIN;

574:   PetscMalloc1(16*(n+1),&rtmp);
576: /*    if ((unsigned long)bj==(unsigned long)aj) { */
577: /*      colscale = 4; */
578: /*    } */
579:   for (i=0; i<n; i++) {
580:     nz    = bi[i+1] - bi[i];
581:     bjtmp = bj + bi[i];
582:     /* zero out the 4x4 block accumulators */
583:     /* zero out one register */
584:     XOR_PS(XMM7,XMM7);
585:     for  (j=0; j<nz; j++) {
586:       x = rtmp+16*bjtmp[j];
587: /*        x = rtmp+4*bjtmp[j]; */
588:       SSE_INLINE_BEGIN_1(x)
589:       /* Copy zero register to memory locations */
590:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
591:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
592:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
593:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
594:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
595:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
596:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
597:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
598:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
599:       SSE_INLINE_END_1;
600:     }
601:     /* load in initial (unfactored row) */
602:     nz       = ai[i+1] - ai[i];
603:     ajtmpold = aj + ai[i];
604:     v        = aa + 16*ai[i];
605:     for (j=0; j<nz; j++) {
606:       x = rtmp+16*ajtmpold[j];
607: /*        x = rtmp+colscale*ajtmpold[j]; */
608:       /* Copy v block into x block */
609:       SSE_INLINE_BEGIN_2(v,x)
610:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
611:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
612:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

614:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
615:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

617:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
618:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

620:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
621:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

623:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
624:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

626:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
627:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

629:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
630:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

632:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
633:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
634:       SSE_INLINE_END_2;

636:       v += 16;
637:     }
638: /*      row = (*bjtmp++)/4; */
639:     row = *bjtmp++;
640:     while (row < i) {
641:       pc = rtmp + 16*row;
642:       SSE_INLINE_BEGIN_1(pc)
643:       /* Load block from lower triangle */
644:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
645:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
646:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

648:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
649:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

651:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
652:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

654:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
655:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

657:       /* Compare block to zero block */

659:       SSE_COPY_PS(XMM4,XMM7)
660:       SSE_CMPNEQ_PS(XMM4,XMM0)

662:       SSE_COPY_PS(XMM5,XMM7)
663:       SSE_CMPNEQ_PS(XMM5,XMM1)

665:       SSE_COPY_PS(XMM6,XMM7)
666:       SSE_CMPNEQ_PS(XMM6,XMM2)

668:       SSE_CMPNEQ_PS(XMM7,XMM3)

670:       /* Reduce the comparisons to one SSE register */
671:       SSE_OR_PS(XMM6,XMM7)
672:       SSE_OR_PS(XMM5,XMM4)
673:       SSE_OR_PS(XMM5,XMM6)
674:       SSE_INLINE_END_1;

676:       /* Reduce the one SSE register to an integer register for branching */
677:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
678:       MOVEMASK(nonzero,XMM5);

680:       /* If block is nonzero ... */
681:       if (nonzero) {
682:         pv = ba + 16*diag_offset[row];
683:         PREFETCH_L1(&pv[16]);
684:         pj = bj + diag_offset[row] + 1;

686:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
687:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
688:         /* but the diagonal was inverted already */
689:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

691:         SSE_INLINE_BEGIN_2(pv,pc)
692:         /* Column 0, product is accumulated in XMM4 */
693:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
694:         SSE_SHUFFLE(XMM4,XMM4,0x00)
695:         SSE_MULT_PS(XMM4,XMM0)

697:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
698:         SSE_SHUFFLE(XMM5,XMM5,0x00)
699:         SSE_MULT_PS(XMM5,XMM1)
700:         SSE_ADD_PS(XMM4,XMM5)

702:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
703:         SSE_SHUFFLE(XMM6,XMM6,0x00)
704:         SSE_MULT_PS(XMM6,XMM2)
705:         SSE_ADD_PS(XMM4,XMM6)

707:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
708:         SSE_SHUFFLE(XMM7,XMM7,0x00)
709:         SSE_MULT_PS(XMM7,XMM3)
710:         SSE_ADD_PS(XMM4,XMM7)

712:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
713:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

715:         /* Column 1, product is accumulated in XMM5 */
716:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
717:         SSE_SHUFFLE(XMM5,XMM5,0x00)
718:         SSE_MULT_PS(XMM5,XMM0)

720:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
721:         SSE_SHUFFLE(XMM6,XMM6,0x00)
722:         SSE_MULT_PS(XMM6,XMM1)
723:         SSE_ADD_PS(XMM5,XMM6)

725:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
726:         SSE_SHUFFLE(XMM7,XMM7,0x00)
727:         SSE_MULT_PS(XMM7,XMM2)
728:         SSE_ADD_PS(XMM5,XMM7)

730:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
731:         SSE_SHUFFLE(XMM6,XMM6,0x00)
732:         SSE_MULT_PS(XMM6,XMM3)
733:         SSE_ADD_PS(XMM5,XMM6)

735:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
736:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

738:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

740:         /* Column 2, product is accumulated in XMM6 */
741:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
742:         SSE_SHUFFLE(XMM6,XMM6,0x00)
743:         SSE_MULT_PS(XMM6,XMM0)

745:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
746:         SSE_SHUFFLE(XMM7,XMM7,0x00)
747:         SSE_MULT_PS(XMM7,XMM1)
748:         SSE_ADD_PS(XMM6,XMM7)

750:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
751:         SSE_SHUFFLE(XMM7,XMM7,0x00)
752:         SSE_MULT_PS(XMM7,XMM2)
753:         SSE_ADD_PS(XMM6,XMM7)

755:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
756:         SSE_SHUFFLE(XMM7,XMM7,0x00)
757:         SSE_MULT_PS(XMM7,XMM3)
758:         SSE_ADD_PS(XMM6,XMM7)

760:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
761:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

763:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
764:         /* Column 3, product is accumulated in XMM0 */
765:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
766:         SSE_SHUFFLE(XMM7,XMM7,0x00)
767:         SSE_MULT_PS(XMM0,XMM7)

769:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
770:         SSE_SHUFFLE(XMM7,XMM7,0x00)
771:         SSE_MULT_PS(XMM1,XMM7)
772:         SSE_ADD_PS(XMM0,XMM1)

774:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
775:         SSE_SHUFFLE(XMM1,XMM1,0x00)
776:         SSE_MULT_PS(XMM1,XMM2)
777:         SSE_ADD_PS(XMM0,XMM1)

779:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
780:         SSE_SHUFFLE(XMM7,XMM7,0x00)
781:         SSE_MULT_PS(XMM3,XMM7)
782:         SSE_ADD_PS(XMM0,XMM3)

784:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
785:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

787:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
788:         /* This is code to be maintained and read by humans after all. */
789:         /* Copy Multiplier Col 3 into XMM3 */
790:         SSE_COPY_PS(XMM3,XMM0)
791:         /* Copy Multiplier Col 2 into XMM2 */
792:         SSE_COPY_PS(XMM2,XMM6)
793:         /* Copy Multiplier Col 1 into XMM1 */
794:         SSE_COPY_PS(XMM1,XMM5)
795:         /* Copy Multiplier Col 0 into XMM0 */
796:         SSE_COPY_PS(XMM0,XMM4)
797:         SSE_INLINE_END_2;

799:         /* Update the row: */
800:         nz  = bi[row+1] - diag_offset[row] - 1;
801:         pv += 16;
802:         for (j=0; j<nz; j++) {
803:           PREFETCH_L1(&pv[16]);
804:           x = rtmp + 16*pj[j];
805: /*            x = rtmp + 4*pj[j]; */

807:           /* X:=X-M*PV, One column at a time */
808:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
809:           SSE_INLINE_BEGIN_2(x,pv)
810:           /* Load First Column of X*/
811:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
812:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

814:           /* Matrix-Vector Product: */
815:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
816:           SSE_SHUFFLE(XMM5,XMM5,0x00)
817:           SSE_MULT_PS(XMM5,XMM0)
818:           SSE_SUB_PS(XMM4,XMM5)

820:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
821:           SSE_SHUFFLE(XMM6,XMM6,0x00)
822:           SSE_MULT_PS(XMM6,XMM1)
823:           SSE_SUB_PS(XMM4,XMM6)

825:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
826:           SSE_SHUFFLE(XMM7,XMM7,0x00)
827:           SSE_MULT_PS(XMM7,XMM2)
828:           SSE_SUB_PS(XMM4,XMM7)

830:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
831:           SSE_SHUFFLE(XMM5,XMM5,0x00)
832:           SSE_MULT_PS(XMM5,XMM3)
833:           SSE_SUB_PS(XMM4,XMM5)

835:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
836:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

838:           /* Second Column */
839:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
840:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

842:           /* Matrix-Vector Product: */
843:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
844:           SSE_SHUFFLE(XMM6,XMM6,0x00)
845:           SSE_MULT_PS(XMM6,XMM0)
846:           SSE_SUB_PS(XMM5,XMM6)

848:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
849:           SSE_SHUFFLE(XMM7,XMM7,0x00)
850:           SSE_MULT_PS(XMM7,XMM1)
851:           SSE_SUB_PS(XMM5,XMM7)

853:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
854:           SSE_SHUFFLE(XMM4,XMM4,0x00)
855:           SSE_MULT_PS(XMM4,XMM2)
856:           SSE_SUB_PS(XMM5,XMM4)

858:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
859:           SSE_SHUFFLE(XMM6,XMM6,0x00)
860:           SSE_MULT_PS(XMM6,XMM3)
861:           SSE_SUB_PS(XMM5,XMM6)

863:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
864:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

866:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

868:           /* Third Column */
869:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
870:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

872:           /* Matrix-Vector Product: */
873:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
874:           SSE_SHUFFLE(XMM7,XMM7,0x00)
875:           SSE_MULT_PS(XMM7,XMM0)
876:           SSE_SUB_PS(XMM6,XMM7)

878:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
879:           SSE_SHUFFLE(XMM4,XMM4,0x00)
880:           SSE_MULT_PS(XMM4,XMM1)
881:           SSE_SUB_PS(XMM6,XMM4)

883:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
884:           SSE_SHUFFLE(XMM5,XMM5,0x00)
885:           SSE_MULT_PS(XMM5,XMM2)
886:           SSE_SUB_PS(XMM6,XMM5)

888:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
889:           SSE_SHUFFLE(XMM7,XMM7,0x00)
890:           SSE_MULT_PS(XMM7,XMM3)
891:           SSE_SUB_PS(XMM6,XMM7)

893:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
894:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

896:           /* Fourth Column */
897:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
898:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

900:           /* Matrix-Vector Product: */
901:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
902:           SSE_SHUFFLE(XMM5,XMM5,0x00)
903:           SSE_MULT_PS(XMM5,XMM0)
904:           SSE_SUB_PS(XMM4,XMM5)

906:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
907:           SSE_SHUFFLE(XMM6,XMM6,0x00)
908:           SSE_MULT_PS(XMM6,XMM1)
909:           SSE_SUB_PS(XMM4,XMM6)

911:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
912:           SSE_SHUFFLE(XMM7,XMM7,0x00)
913:           SSE_MULT_PS(XMM7,XMM2)
914:           SSE_SUB_PS(XMM4,XMM7)

916:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
917:           SSE_SHUFFLE(XMM5,XMM5,0x00)
918:           SSE_MULT_PS(XMM5,XMM3)
919:           SSE_SUB_PS(XMM4,XMM5)

921:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
922:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
923:           SSE_INLINE_END_2;
924:           pv += 16;
925:         }
926:         PetscLogFlops(128.0*nz+112.0);
927:       }
928:       row = *bjtmp++;
929: /*        row = (*bjtmp++)/4; */
930:     }
931:     /* finished row so stick it into b->a */
932:     pv = ba + 16*bi[i];
933:     pj = bj + bi[i];
934:     nz = bi[i+1] - bi[i];

936:     /* Copy x block back into pv block */
937:     for (j=0; j<nz; j++) {
938:       x = rtmp+16*pj[j];
939: /*        x  = rtmp+4*pj[j]; */

941:       SSE_INLINE_BEGIN_2(x,pv)
942:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
943:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
944:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

946:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
947:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

949:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
950:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

952:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
953:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

955:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
956:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

958:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
959:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

961:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
962:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

964:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
965:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
966:       SSE_INLINE_END_2;
967:       pv += 16;
968:     }
969:     /* invert diagonal block */
970:     w = ba + 16*diag_offset[i];
971:     if (pivotinblocks) {
972:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
973:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
974:     } else {
975:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
976:     }
977: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
978:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
979:   }

981:   PetscFree(rtmp);

983:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
984:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
985:   C->assembled           = PETSC_TRUE;

987:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
988:   /* Flop Count from inverting diagonal blocks */
989:   SSE_SCOPE_END;
990:   return 0;
991: }
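
The SSE_* and XMM* names above are macros supplied by the header included through PETSC_HAVE_SSE, not raw intrinsics. As a rough orientation only (an assumed correspondence, not taken from this file), the "compare block to zero" test at the top of the elimination loop maps onto standard single-precision <xmmintrin.h> intrinsics roughly as follows:

/* Rough, assumed intrinsics equivalent of the nonzero-block test; the real macros
   may differ (e.g. they use paired LOADL/LOADH 64-bit moves where this sketch uses
   unaligned 128-bit loads). */
#include <xmmintrin.h>

static int Sketch_Block4x4_IsNonzero(const float *pc)
{
  __m128 zero = _mm_setzero_ps();                        /* XOR_PS(XMM7,XMM7)      */
  __m128 cmp  = _mm_cmpneq_ps(_mm_loadu_ps(pc),zero);    /* SSE_CMPNEQ_PS          */
  cmp = _mm_or_ps(cmp,_mm_cmpneq_ps(_mm_loadu_ps(pc+4), zero));
  cmp = _mm_or_ps(cmp,_mm_cmpneq_ps(_mm_loadu_ps(pc+8), zero));
  cmp = _mm_or_ps(cmp,_mm_cmpneq_ps(_mm_loadu_ps(pc+12),zero));  /* SSE_OR_PS      */
  return _mm_movemask_ps(cmp);                           /* MOVEMASK(nonzero,XMM5) */
}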

993: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
994: {
995:   Mat            A  =C;
996:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
997:   int            i,j,n = a->mbs;
998:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
999:   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1000:   unsigned int   row;
1001:   int            nz,*bi=b->i;
1002:   int            *diag_offset = b->diag,*ai=a->i;
1003:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1004:   MatScalar      *ba    = b->a,*aa = a->a;
1005:   int            nonzero=0;
1006:   PetscBool      pivotinblocks = b->pivotinblocks;
1007:   PetscReal      shift = info->shiftamount;
1008:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

1010:   allowzeropivot = PetscNot(A->erroriffailure);
1011:   SSE_SCOPE_BEGIN;

1015:   PetscMalloc1(16*(n+1),&rtmp);
1017: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1018: /*      colscale = 4; */
1019: /*    } */

1021:   for (i=0; i<n; i++) {
1022:     nz    = bi[i+1] - bi[i];
1023:     bjtmp = bj + bi[i];
1024:     /* zero out the 4x4 block accumulators */
1025:     /* zero out one register */
1026:     XOR_PS(XMM7,XMM7);
1027:     for  (j=0; j<nz; j++) {
1028:       x = rtmp+16*((unsigned int)bjtmp[j]);
1029: /*        x = rtmp+4*bjtmp[j]; */
1030:       SSE_INLINE_BEGIN_1(x)
1031:       /* Copy zero register to memory locations */
1032:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1033:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1034:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1035:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1036:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1037:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1038:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1039:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1040:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1041:       SSE_INLINE_END_1;
1042:     }
1043:     /* load in initial (unfactored row) */
1044:     nz    = ai[i+1] - ai[i];
1045:     ajtmp = aj + ai[i];
1046:     v     = aa + 16*ai[i];
1047:     for (j=0; j<nz; j++) {
1048:       x = rtmp+16*((unsigned int)ajtmp[j]);
1049: /*        x = rtmp+colscale*ajtmp[j]; */
1050:       /* Copy v block into x block */
1051:       SSE_INLINE_BEGIN_2(v,x)
1052:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1053:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1054:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1056:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1057:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1059:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1060:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1062:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1063:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1065:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1066:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1068:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1069:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1071:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1072:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1074:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1075:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1076:       SSE_INLINE_END_2;

1078:       v += 16;
1079:     }
1080: /*      row = (*bjtmp++)/4; */
1081:     row = (unsigned int)(*bjtmp++);
1082:     while (row < i) {
1083:       pc = rtmp + 16*row;
1084:       SSE_INLINE_BEGIN_1(pc)
1085:       /* Load block from lower triangle */
1086:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1087:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1088:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1090:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1091:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1093:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1094:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1096:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1097:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1099:       /* Compare block to zero block */

1101:       SSE_COPY_PS(XMM4,XMM7)
1102:       SSE_CMPNEQ_PS(XMM4,XMM0)

1104:       SSE_COPY_PS(XMM5,XMM7)
1105:       SSE_CMPNEQ_PS(XMM5,XMM1)

1107:       SSE_COPY_PS(XMM6,XMM7)
1108:       SSE_CMPNEQ_PS(XMM6,XMM2)

1110:       SSE_CMPNEQ_PS(XMM7,XMM3)

1112:       /* Reduce the comparisons to one SSE register */
1113:       SSE_OR_PS(XMM6,XMM7)
1114:       SSE_OR_PS(XMM5,XMM4)
1115:       SSE_OR_PS(XMM5,XMM6)
1116:       SSE_INLINE_END_1;

1118:       /* Reduce the one SSE register to an integer register for branching */
1119:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1120:       MOVEMASK(nonzero,XMM5);

1122:       /* If block is nonzero ... */
1123:       if (nonzero) {
1124:         pv = ba + 16*diag_offset[row];
1125:         PREFETCH_L1(&pv[16]);
1126:         pj = bj + diag_offset[row] + 1;

1128:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1129:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1130:         /* but the diagonal was inverted already */
1131:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1133:         SSE_INLINE_BEGIN_2(pv,pc)
1134:         /* Column 0, product is accumulated in XMM4 */
1135:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1136:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1137:         SSE_MULT_PS(XMM4,XMM0)

1139:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1140:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1141:         SSE_MULT_PS(XMM5,XMM1)
1142:         SSE_ADD_PS(XMM4,XMM5)

1144:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1145:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1146:         SSE_MULT_PS(XMM6,XMM2)
1147:         SSE_ADD_PS(XMM4,XMM6)

1149:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1150:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1151:         SSE_MULT_PS(XMM7,XMM3)
1152:         SSE_ADD_PS(XMM4,XMM7)

1154:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1155:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1157:         /* Column 1, product is accumulated in XMM5 */
1158:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1159:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1160:         SSE_MULT_PS(XMM5,XMM0)

1162:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1163:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1164:         SSE_MULT_PS(XMM6,XMM1)
1165:         SSE_ADD_PS(XMM5,XMM6)

1167:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1168:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1169:         SSE_MULT_PS(XMM7,XMM2)
1170:         SSE_ADD_PS(XMM5,XMM7)

1172:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1173:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1174:         SSE_MULT_PS(XMM6,XMM3)
1175:         SSE_ADD_PS(XMM5,XMM6)

1177:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1178:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1180:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1182:         /* Column 2, product is accumulated in XMM6 */
1183:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1184:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1185:         SSE_MULT_PS(XMM6,XMM0)

1187:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1188:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1189:         SSE_MULT_PS(XMM7,XMM1)
1190:         SSE_ADD_PS(XMM6,XMM7)

1192:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1193:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1194:         SSE_MULT_PS(XMM7,XMM2)
1195:         SSE_ADD_PS(XMM6,XMM7)

1197:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1198:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1199:         SSE_MULT_PS(XMM7,XMM3)
1200:         SSE_ADD_PS(XMM6,XMM7)

1202:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1203:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1205:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1206:         /* Column 3, product is accumulated in XMM0 */
1207:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1208:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1209:         SSE_MULT_PS(XMM0,XMM7)

1211:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1212:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1213:         SSE_MULT_PS(XMM1,XMM7)
1214:         SSE_ADD_PS(XMM0,XMM1)

1216:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1217:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1218:         SSE_MULT_PS(XMM1,XMM2)
1219:         SSE_ADD_PS(XMM0,XMM1)

1221:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1222:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1223:         SSE_MULT_PS(XMM3,XMM7)
1224:         SSE_ADD_PS(XMM0,XMM3)

1226:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1227:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1229:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1230:         /* This is code to be maintained and read by humans after all. */
1231:         /* Copy Multiplier Col 3 into XMM3 */
1232:         SSE_COPY_PS(XMM3,XMM0)
1233:         /* Copy Multiplier Col 2 into XMM2 */
1234:         SSE_COPY_PS(XMM2,XMM6)
1235:         /* Copy Multiplier Col 1 into XMM1 */
1236:         SSE_COPY_PS(XMM1,XMM5)
1237:         /* Copy Multiplier Col 0 into XMM0 */
1238:         SSE_COPY_PS(XMM0,XMM4)
1239:         SSE_INLINE_END_2;

1241:         /* Update the row: */
1242:         nz  = bi[row+1] - diag_offset[row] - 1;
1243:         pv += 16;
1244:         for (j=0; j<nz; j++) {
1245:           PREFETCH_L1(&pv[16]);
1246:           x = rtmp + 16*((unsigned int)pj[j]);
1247: /*            x = rtmp + 4*pj[j]; */

1249:           /* X:=X-M*PV, One column at a time */
1250:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1251:           SSE_INLINE_BEGIN_2(x,pv)
1252:           /* Load First Column of X*/
1253:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1254:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1256:           /* Matrix-Vector Product: */
1257:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1258:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1259:           SSE_MULT_PS(XMM5,XMM0)
1260:           SSE_SUB_PS(XMM4,XMM5)

1262:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1263:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1264:           SSE_MULT_PS(XMM6,XMM1)
1265:           SSE_SUB_PS(XMM4,XMM6)

1267:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1268:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1269:           SSE_MULT_PS(XMM7,XMM2)
1270:           SSE_SUB_PS(XMM4,XMM7)

1272:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1273:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1274:           SSE_MULT_PS(XMM5,XMM3)
1275:           SSE_SUB_PS(XMM4,XMM5)

1277:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1278:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1280:           /* Second Column */
1281:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1282:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1284:           /* Matrix-Vector Product: */
1285:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1286:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1287:           SSE_MULT_PS(XMM6,XMM0)
1288:           SSE_SUB_PS(XMM5,XMM6)

1290:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1291:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1292:           SSE_MULT_PS(XMM7,XMM1)
1293:           SSE_SUB_PS(XMM5,XMM7)

1295:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1296:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1297:           SSE_MULT_PS(XMM4,XMM2)
1298:           SSE_SUB_PS(XMM5,XMM4)

1300:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1301:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1302:           SSE_MULT_PS(XMM6,XMM3)
1303:           SSE_SUB_PS(XMM5,XMM6)

1305:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1306:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1308:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1310:           /* Third Column */
1311:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1312:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1314:           /* Matrix-Vector Product: */
1315:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1316:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1317:           SSE_MULT_PS(XMM7,XMM0)
1318:           SSE_SUB_PS(XMM6,XMM7)

1320:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1321:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1322:           SSE_MULT_PS(XMM4,XMM1)
1323:           SSE_SUB_PS(XMM6,XMM4)

1325:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1326:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1327:           SSE_MULT_PS(XMM5,XMM2)
1328:           SSE_SUB_PS(XMM6,XMM5)

1330:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1331:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1332:           SSE_MULT_PS(XMM7,XMM3)
1333:           SSE_SUB_PS(XMM6,XMM7)

1335:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1336:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1338:           /* Fourth Column */
1339:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1340:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1342:           /* Matrix-Vector Product: */
1343:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1344:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1345:           SSE_MULT_PS(XMM5,XMM0)
1346:           SSE_SUB_PS(XMM4,XMM5)

1348:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1349:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1350:           SSE_MULT_PS(XMM6,XMM1)
1351:           SSE_SUB_PS(XMM4,XMM6)

1353:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1354:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1355:           SSE_MULT_PS(XMM7,XMM2)
1356:           SSE_SUB_PS(XMM4,XMM7)

1358:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1359:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1360:           SSE_MULT_PS(XMM5,XMM3)
1361:           SSE_SUB_PS(XMM4,XMM5)

1363:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1364:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1365:           SSE_INLINE_END_2;
1366:           pv += 16;
1367:         }
1368:         PetscLogFlops(128.0*nz+112.0);
1369:       }
1370:       row = (unsigned int)(*bjtmp++);
1371: /*        row = (*bjtmp++)/4; */
1372: /*        bjtmp++; */
1373:     }
1374:     /* finished row so stick it into b->a */
1375:     pv = ba + 16*bi[i];
1376:     pj = bj + bi[i];
1377:     nz = bi[i+1] - bi[i];

1379:     /* Copy x block back into pv block */
1380:     for (j=0; j<nz; j++) {
1381:       x = rtmp+16*((unsigned int)pj[j]);
1382: /*        x  = rtmp+4*pj[j]; */

1384:       SSE_INLINE_BEGIN_2(x,pv)
1385:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1386:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1387:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1389:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1390:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1392:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1393:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1395:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1396:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1398:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1399:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1401:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1402:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1404:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1405:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1407:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1408:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1409:       SSE_INLINE_END_2;
1410:       pv += 16;
1411:     }
1412:     /* invert diagonal block */
1413:     w = ba + 16*diag_offset[i];
1414:     if (pivotinblocks) {
1415:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1416:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1417:     } else {
1418:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1419:     }
1420:     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1421:   }

1423:   PetscFree(rtmp);

1425:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1426:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1427:   C->assembled           = PETSC_TRUE;

1429:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1430:   /* Flop Count from inverting diagonal blocks */
1431:   SSE_SCOPE_END;
1432:   return 0;
1433: }

1435: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1436: {
1437:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1438:   int            i,j,n = a->mbs;
1439:   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1440:   unsigned int   row;
1441:   int            *ajtmpold,nz,*bi=b->i;
1442:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1443:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1444:   MatScalar      *ba    = b->a,*aa = a->a;
1445:   int            nonzero=0;
1446:   PetscBool      pivotinblocks = b->pivotinblocks;
1447:   PetscReal      shift = info->shiftamount;
1448:   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

1450:   allowzeropivot = PetscNot(A->erroriffailure);
1451:   SSE_SCOPE_BEGIN;

1455:   PetscMalloc1(16*(n+1),&rtmp);
1457: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1458: /*      colscale = 4; */
1459: /*    } */
1460:   if ((unsigned long)bj==(unsigned long)aj) {
1461:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1462:   }

1464:   for (i=0; i<n; i++) {
1465:     nz    = bi[i+1] - bi[i];
1466:     bjtmp = bj + bi[i];
1467:     /* zero out the 4x4 block accumulators */
1468:     /* zero out one register */
1469:     XOR_PS(XMM7,XMM7);
1470:     for  (j=0; j<nz; j++) {
1471:       x = rtmp+16*((unsigned int)bjtmp[j]);
1472: /*        x = rtmp+4*bjtmp[j]; */
1473:       SSE_INLINE_BEGIN_1(x)
1474:       /* Copy zero register to memory locations */
1475:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1476:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1477:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1478:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1479:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1480:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1481:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1482:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1483:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1484:       SSE_INLINE_END_1;
1485:     }
1486:     /* load in initial (unfactored row) */
1487:     nz       = ai[i+1] - ai[i];
1488:     ajtmpold = aj + ai[i];
1489:     v        = aa + 16*ai[i];
1490:     for (j=0; j<nz; j++) {
1491:       x = rtmp+16*ajtmpold[j];
1492: /*        x = rtmp+colscale*ajtmpold[j]; */
1493:       /* Copy v block into x block */
1494:       SSE_INLINE_BEGIN_2(v,x)
1495:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1496:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1497:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1499:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1500:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1502:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1503:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1505:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1506:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1508:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1509:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1511:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1512:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1514:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1515:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1517:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1518:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1519:       SSE_INLINE_END_2;

1521:       v += 16;
1522:     }
1523: /*      row = (*bjtmp++)/4; */
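          /* Sweep the lower-triangular part of block row i: eliminate with every pivot row that precedes the diagonal (row < i) */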
1524:     row = (unsigned int)(*bjtmp++);
1525:     while (row < i) {
1526:       pc = rtmp + 16*row;
1527:       SSE_INLINE_BEGIN_1(pc)
1528:       /* Load block from lower triangle */
1529:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1530:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1531:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1533:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1534:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1536:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1537:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1539:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1540:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1542:       /* Compare block to zero block */

1544:       SSE_COPY_PS(XMM4,XMM7)
1545:       SSE_CMPNEQ_PS(XMM4,XMM0)

1547:       SSE_COPY_PS(XMM5,XMM7)
1548:       SSE_CMPNEQ_PS(XMM5,XMM1)

1550:       SSE_COPY_PS(XMM6,XMM7)
1551:       SSE_CMPNEQ_PS(XMM6,XMM2)

1553:       SSE_CMPNEQ_PS(XMM7,XMM3)

1555:       /* Reduce the comparisons to one SSE register */
1556:       SSE_OR_PS(XMM6,XMM7)
1557:       SSE_OR_PS(XMM5,XMM4)
1558:       SSE_OR_PS(XMM5,XMM6)
1559:       SSE_INLINE_END_1;

1561:       /* Reduce the one SSE register to an integer register for branching */
1562:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1563:       MOVEMASK(nonzero,XMM5);

1565:       /* If block is nonzero ... */
1566:       if (nonzero) {
1567:         pv = ba + 16*diag_offset[row];
1568:         PREFETCH_L1(&pv[16]);
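              /* prefetch the block just past the diagonal of the pivot row; the update loop below starts there */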
1569:         pj = bj + diag_offset[row] + 1;

1571:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1572:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1573:         /* but the diagonal was inverted already */
1574:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
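              /* The multiplier overwrites pc and, via the register copies at the end of this block, remains in XMM0-XMM3 for the update loop */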

1576:         SSE_INLINE_BEGIN_2(pv,pc)
1577:         /* Column 0, product is accumulated in XMM4 */
1578:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1579:         SSE_SHUFFLE(XMM4,XMM4,0x00)
1580:         SSE_MULT_PS(XMM4,XMM0)

1582:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1583:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1584:         SSE_MULT_PS(XMM5,XMM1)
1585:         SSE_ADD_PS(XMM4,XMM5)

1587:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1588:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1589:         SSE_MULT_PS(XMM6,XMM2)
1590:         SSE_ADD_PS(XMM4,XMM6)

1592:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1593:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1594:         SSE_MULT_PS(XMM7,XMM3)
1595:         SSE_ADD_PS(XMM4,XMM7)

1597:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1598:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1600:         /* Column 1, product is accumulated in XMM5 */
1601:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1602:         SSE_SHUFFLE(XMM5,XMM5,0x00)
1603:         SSE_MULT_PS(XMM5,XMM0)

1605:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1606:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1607:         SSE_MULT_PS(XMM6,XMM1)
1608:         SSE_ADD_PS(XMM5,XMM6)

1610:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1611:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1612:         SSE_MULT_PS(XMM7,XMM2)
1613:         SSE_ADD_PS(XMM5,XMM7)

1615:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1616:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1617:         SSE_MULT_PS(XMM6,XMM3)
1618:         SSE_ADD_PS(XMM5,XMM6)

1620:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1621:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1623:         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1625:         /* Column 2, product is accumulated in XMM6 */
1626:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1627:         SSE_SHUFFLE(XMM6,XMM6,0x00)
1628:         SSE_MULT_PS(XMM6,XMM0)

1630:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1631:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1632:         SSE_MULT_PS(XMM7,XMM1)
1633:         SSE_ADD_PS(XMM6,XMM7)

1635:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1636:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1637:         SSE_MULT_PS(XMM7,XMM2)
1638:         SSE_ADD_PS(XMM6,XMM7)

1640:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1641:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1642:         SSE_MULT_PS(XMM7,XMM3)
1643:         SSE_ADD_PS(XMM6,XMM7)

1645:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1646:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1648:         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1649:         /* Column 3, product is accumulated in XMM0 */
1650:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1651:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1652:         SSE_MULT_PS(XMM0,XMM7)

1654:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1655:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1656:         SSE_MULT_PS(XMM1,XMM7)
1657:         SSE_ADD_PS(XMM0,XMM1)

1659:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1660:         SSE_SHUFFLE(XMM1,XMM1,0x00)
1661:         SSE_MULT_PS(XMM1,XMM2)
1662:         SSE_ADD_PS(XMM0,XMM1)

1664:         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1665:         SSE_SHUFFLE(XMM7,XMM7,0x00)
1666:         SSE_MULT_PS(XMM3,XMM7)
1667:         SSE_ADD_PS(XMM0,XMM3)

1669:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1670:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1672:         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1673:         /* This is code to be maintained and read by humans, after all. */
1674:         /* Copy Multiplier Col 3 into XMM3 */
1675:         SSE_COPY_PS(XMM3,XMM0)
1676:         /* Copy Multiplier Col 2 into XMM2 */
1677:         SSE_COPY_PS(XMM2,XMM6)
1678:         /* Copy Multiplier Col 1 into XMM1 */
1679:         SSE_COPY_PS(XMM1,XMM5)
1680:         /* Copy Multiplier Col 0 into XMM0 */
1681:         SSE_COPY_PS(XMM0,XMM4)
1682:         SSE_INLINE_END_2;

1684:         /* Update the row: */
1685:         nz  = bi[row+1] - diag_offset[row] - 1;
1686:         pv += 16;
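              /* pv now points at the first block past the (already inverted) diagonal of the pivot row; nz counts those blocks */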
1687:         for (j=0; j<nz; j++) {
1688:           PREFETCH_L1(&pv[16]);
1689:           x = rtmp + 16*((unsigned int)pj[j]);
1690: /*            x = rtmp + 4*pj[j]; */

1692:           /* X:=X-M*PV, One column at a time */
1693:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1694:           SSE_INLINE_BEGIN_2(x,pv)
1695:           /* Load First Column of X*/
1696:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1697:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1699:           /* Matrix-Vector Product: */
1700:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1701:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1702:           SSE_MULT_PS(XMM5,XMM0)
1703:           SSE_SUB_PS(XMM4,XMM5)

1705:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1706:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1707:           SSE_MULT_PS(XMM6,XMM1)
1708:           SSE_SUB_PS(XMM4,XMM6)

1710:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1711:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1712:           SSE_MULT_PS(XMM7,XMM2)
1713:           SSE_SUB_PS(XMM4,XMM7)

1715:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1716:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1717:           SSE_MULT_PS(XMM5,XMM3)
1718:           SSE_SUB_PS(XMM4,XMM5)

1720:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1721:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1723:           /* Second Column */
1724:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1725:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1727:           /* Matrix-Vector Product: */
1728:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1729:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1730:           SSE_MULT_PS(XMM6,XMM0)
1731:           SSE_SUB_PS(XMM5,XMM6)

1733:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1734:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1735:           SSE_MULT_PS(XMM7,XMM1)
1736:           SSE_SUB_PS(XMM5,XMM7)

1738:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1739:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1740:           SSE_MULT_PS(XMM4,XMM2)
1741:           SSE_SUB_PS(XMM5,XMM4)

1743:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1744:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1745:           SSE_MULT_PS(XMM6,XMM3)
1746:           SSE_SUB_PS(XMM5,XMM6)

1748:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1749:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1751:           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1753:           /* Third Column */
1754:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1755:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1757:           /* Matrix-Vector Product: */
1758:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1759:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1760:           SSE_MULT_PS(XMM7,XMM0)
1761:           SSE_SUB_PS(XMM6,XMM7)

1763:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1764:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1765:           SSE_MULT_PS(XMM4,XMM1)
1766:           SSE_SUB_PS(XMM6,XMM4)

1768:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1769:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1770:           SSE_MULT_PS(XMM5,XMM2)
1771:           SSE_SUB_PS(XMM6,XMM5)

1773:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1774:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1775:           SSE_MULT_PS(XMM7,XMM3)
1776:           SSE_SUB_PS(XMM6,XMM7)

1778:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1779:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1781:           /* Fourth Column */
1782:           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1783:           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1785:           /* Matrix-Vector Product: */
1786:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1787:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1788:           SSE_MULT_PS(XMM5,XMM0)
1789:           SSE_SUB_PS(XMM4,XMM5)

1791:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1792:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1793:           SSE_MULT_PS(XMM6,XMM1)
1794:           SSE_SUB_PS(XMM4,XMM6)

1796:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1797:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1798:           SSE_MULT_PS(XMM7,XMM2)
1799:           SSE_SUB_PS(XMM4,XMM7)

1801:           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1802:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1803:           SSE_MULT_PS(XMM5,XMM3)
1804:           SSE_SUB_PS(XMM4,XMM5)

1806:           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1807:           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1808:           SSE_INLINE_END_2;
1809:           pv += 16;
1810:         }
1811:         PetscLogFlops(128.0*nz+112.0);
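              /* 112 flops to form the 4x4 multiplier (64 multiplies + 48 adds); 128 flops per updated block (64 multiplies + 64 subtractions) */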
1812:       }
1813:       row = (unsigned int)(*bjtmp++);
1814: /*        row = (*bjtmp++)/4; */
1815: /*        bjtmp++; */
1816:     }
1817:     /* finished row so stick it into b->a */
1818:     pv = ba + 16*bi[i];
1819:     pj = bj + bi[i];
1820:     nz = bi[i+1] - bi[i];

1822:     /* Copy x block back into pv block */
1823:     for (j=0; j<nz; j++) {
1824:       x = rtmp+16*((unsigned int)pj[j]);
1825: /*        x  = rtmp+4*pj[j]; */

1827:       SSE_INLINE_BEGIN_2(x,pv)
1828:       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1829:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1830:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1832:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1833:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1835:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1836:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1838:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1839:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1841:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1842:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1844:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1845:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1847:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1848:       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1850:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1851:       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1852:       SSE_INLINE_END_2;
1853:       pv += 16;
1854:     }
1855:     /* invert diagonal block */
1856:     w = ba + 16*diag_offset[i];
1857:     if (pivotinblocks) {
1858:       PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1859:       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1860:     } else {
1861:       PetscKernel_A_gets_inverse_A_4_nopivot(w);
1862:     }
1863: /*      PetscKernel_A_gets_inverse_A_4_SSE(w); */
1864:     /* Note: Cramer's rule is used here, so the flop count below may be unfairly high or low */
1865:   }

1867:   PetscFree(rtmp);

1869:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1870:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1871:   C->assembled           = PETSC_TRUE;

1873:   PetscLogFlops(1.333333333333*4.0*16.0*b->mbs); /* bs = 4, bs2 = 16 for 4x4 blocks */
1874:   /* Flop Count from inverting diagonal blocks */
1875:   SSE_SCOPE_END;
1876:   return 0;
1877: }

1879: #endif