Actual source code: baijfact11.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for BAIJ format. 
  5: */
 6:  #include "../src/mat/impls/baij/seq/baij.h"
 7:  #include "../src/mat/blockinvert.h"

  9: /* ------------------------------------------------------------*/
 10: /*
 11:       Version for when blocks are 4 by 4
 12: */
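/*
   Layout assumed below (a sketch inferred from the code, not a formal spec):
   SeqBAIJ stores one 4x4 block of MatScalars per block nonzero, each block
   column-major, so entry (r,c) of block k lives at a->a[16*k + 4*c + r].
   a->i and a->j are the block-row pointers and block-column indices (block CSR).
   Each routine in this file factors one block row at a time: the row is gathered
   into the dense work array rtmp, eliminated against the previously factored
   rows, written back into b->a, and the diagonal block is stored inverted so the
   triangular solves reduce to block multiplications.
*/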
 15: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
 16: {
 17:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
 18:   IS             isrow = b->row,isicol = b->icol;
 20:   const PetscInt *r,*ic;
 21:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 22:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 23:   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
 24:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 25:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 26:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 27:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 28:   MatScalar      m13,m14,m15,m16;
 29:   MatScalar      *ba = b->a,*aa = a->a;
 30:   PetscTruth     pivotinblocks = b->pivotinblocks;
 31:   PetscReal      shift = info->shiftamount;

 34:   ISGetIndices(isrow,&r);
 35:   ISGetIndices(isicol,&ic);
 36:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

 38:   for (i=0; i<n; i++) {
 39:     nz    = bi[i+1] - bi[i];
 40:     ajtmp = bj + bi[i];
 41:     for  (j=0; j<nz; j++) {
 42:       x = rtmp+16*ajtmp[j];
 43:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 44:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 45:     }
 46:     /* load in initial (unfactored row) */
 47:     idx      = r[i];
 48:     nz       = ai[idx+1] - ai[idx];
 49:     ajtmpold = aj + ai[idx];
 50:     v        = aa + 16*ai[idx];
 51:     for (j=0; j<nz; j++) {
 52:       x    = rtmp+16*ic[ajtmpold[j]];
 53:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 54:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 55:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 56:       x[14] = v[14]; x[15] = v[15];
 57:       v    += 16;
 58:     }
 59:     row = *ajtmp++;
 60:     while (row < i) {
 61:       pc  = rtmp + 16*row;
 62:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 63:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 64:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 65:       p15 = pc[14]; p16 = pc[15];
 66:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 67:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 68:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 69:           || p16 != 0.0) {
 70:         pv = ba + 16*diag_offset[row];
 71:         pj = bj + diag_offset[row] + 1;
 72:         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 73:         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 74:         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 75:         x15 = pv[14]; x16 = pv[15];
 76:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 77:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 78:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 79:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 81:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 82:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 83:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 84:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 86:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 87:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 88:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 89:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 91:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 92:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 93:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 94:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

 96:         nz = bi[row+1] - diag_offset[row] - 1;
 97:         pv += 16;
 98:         for (j=0; j<nz; j++) {
 99:           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
100:           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
101:           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102:           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
103:           x    = rtmp + 16*pj[j];
104:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
105:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
106:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
107:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

109:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
110:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
111:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
112:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

114:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
115:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

119:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
120:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
121:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
122:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

124:           pv   += 16;
125:         }
126:         PetscLogFlops(128.0*nz+112.0);
127:       }
128:       row = *ajtmp++;
129:     }
130:     /* finished row so stick it into b->a */
131:     pv = ba + 16*bi[i];
132:     pj = bj + bi[i];
133:     nz = bi[i+1] - bi[i];
134:     for (j=0; j<nz; j++) {
135:       x      = rtmp+16*pj[j];
136:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
137:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
138:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140:       pv   += 16;
141:     }
142:     /* invert diagonal block */
143:     w    = ba + 16*diag_offset[i];
144:     if (pivotinblocks) {
145:       Kernel_A_gets_inverse_A_4(w,shift);
146:     } else {
147:       Kernel_A_gets_inverse_A_4_nopivot(w);
148:     }
149:   }

151:   PetscFree(rtmp);
152:   ISRestoreIndices(isicol,&ic);
153:   ISRestoreIndices(isrow,&r);
154:   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
155:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156:   C->assembled = PETSC_TRUE;
157:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
158:   return(0);
159: }

161: /* MatLUFactorNumeric_SeqBAIJ_4 - 
162:      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 
163:        Kernel_A_gets_A_times_B()
164:        Kernel_A_gets_A_minus_B_times_C()
165:        Kernel_A_gets_inverse_A()
166: */
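/*
   Illustrative plain-C sketches of what the hand-unrolled 4x4 kernels referenced
   above compute.  The real kernels live in ../src/mat/blockinvert.h; the *_ref
   names below are hypothetical and exist only to document the block operations.
   Blocks are assumed column-major: entry (r,c) of a block is at index r + 4*c.
*/
static void Kernel_A_gets_A_times_B_4_ref(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt r,c,k;
  for (k=0; k<16; k++) W[k] = A[k];            /* save A in the workspace */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += W[r+4*k]*B[k+4*c];
      A[r+4*c] = s;                            /* A := A*B */
    }
  }
}
static void Kernel_A_gets_A_minus_B_times_C_4_ref(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += B[r+4*k]*C[k+4*c];
      A[r+4*c] -= s;                           /* A := A - B*C */
    }
  }
}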

170: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171: {
172:   Mat            C=B;
173:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
174:   IS             isrow = b->row,isicol = b->icol;
176:   const PetscInt *r,*ic,*ics;
177:   PetscInt       i,j,k,nz,nzL,row;
178:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181:   PetscInt       flg;
182:   PetscReal      shift = info->shiftamount;

185:   ISGetIndices(isrow,&r);
186:   ISGetIndices(isicol,&ic);

188:   /* generate work space needed by the factorization */
189:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
190:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
191:   ics  = ic;

193:   for (i=0; i<n; i++){
194:     /* zero rtmp */
195:     /* L part */
196:     nz    = bi[i+1] - bi[i];
197:     bjtmp = bj + bi[i];
198:     for  (j=0; j<nz; j++){
199:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
200:     }

202:     /* U part */
203:     nz = bdiag[i]-bdiag[i+1];
204:     bjtmp = bj + bdiag[i+1]+1;
205:     for  (j=0; j<nz; j++){
206:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
207:     }
208: 
209:     /* load in initial (unfactored row) */
210:     nz    = ai[r[i]+1] - ai[r[i]];
211:     ajtmp = aj + ai[r[i]];
212:     v     = aa + bs2*ai[r[i]];
213:     for (j=0; j<nz; j++) {
214:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
215:     }

217:     /* elimination */
218:     bjtmp = bj + bi[i];
219:     nzL   = bi[i+1] - bi[i];
220:     for(k=0;k < nzL;k++) {
221:       row = bjtmp[k];
222:       pc = rtmp + bs2*row;
223:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
224:       if (flg) {
225:         pv = b->a + bs2*bdiag[row];
226:         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
227:         Kernel_A_gets_A_times_B_4(pc,pv,mwork);
228: 
229:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
230:         pv = b->a + bs2*(bdiag[row+1]+1);
231:         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding diag */
232:         for (j=0; j<nz; j++) {
233:           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
234:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
235:           v    = rtmp + bs2*pj[j];
236:           Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
237:           pv  += bs2;
238:         }
239:         PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
240:       }
241:     }

243:     /* finished row so stick it into b->a */
244:     /* L part */
245:     pv   = b->a + bs2*bi[i] ;
246:     pj   = b->j + bi[i] ;
247:     nz   = bi[i+1] - bi[i];
248:     for (j=0; j<nz; j++) {
249:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
250:     }

252:     /* Mark diagonal and invert diagonal for simpler triangular solves */
253:     pv   = b->a + bs2*bdiag[i];
254:     pj   = b->j + bdiag[i];
255:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
256:     /* Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work); */
257:     Kernel_A_gets_inverse_A_4(pv,shift);
258: 
259:     /* U part */
260:     pv = b->a + bs2*(bdiag[i+1]+1);
261:     pj = b->j + bdiag[i+1]+1;
262:     nz = bdiag[i] - bdiag[i+1] - 1;
263:     for (j=0; j<nz; j++){
264:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
265:     }
266:   }

268:   PetscFree2(rtmp,mwork);
269:   ISRestoreIndices(isicol,&ic);
270:   ISRestoreIndices(isrow,&r);
271:   C->ops->solve          = MatSolve_SeqBAIJ_4;
272:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
273:   C->assembled = PETSC_TRUE;
274:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
275:   return(0);
276: }

280: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
281: {
282: /*
283:     Default version for when blocks are 4 by 4, using natural ordering
284: */
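  /*
     Natural ordering: the row and column permutations are the identity, so the
     ISGetIndices()/r[]/ic[] indirection used in the permuted variants above is
     dropped and block rows are loaded and stored directly by index i.
  */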
285:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
287:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
288:   PetscInt       *ajtmpold,*ajtmp,nz,row;
289:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
290:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
291:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
292:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
293:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
294:   MatScalar      m13,m14,m15,m16;
295:   MatScalar      *ba = b->a,*aa = a->a;
296:   PetscTruth     pivotinblocks = b->pivotinblocks;
297:   PetscReal      shift = info->shiftamount;

300:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

302:   for (i=0; i<n; i++) {
303:     nz    = bi[i+1] - bi[i];
304:     ajtmp = bj + bi[i];
305:     for  (j=0; j<nz; j++) {
306:       x = rtmp+16*ajtmp[j];
307:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
308:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
309:     }
310:     /* load in initial (unfactored row) */
311:     nz       = ai[i+1] - ai[i];
312:     ajtmpold = aj + ai[i];
313:     v        = aa + 16*ai[i];
314:     for (j=0; j<nz; j++) {
315:       x    = rtmp+16*ajtmpold[j];
316:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
317:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
318:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
319:       x[14] = v[14]; x[15] = v[15];
320:       v    += 16;
321:     }
322:     row = *ajtmp++;
323:     while (row < i) {
324:       pc  = rtmp + 16*row;
325:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
326:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
327:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
328:       p15 = pc[14]; p16 = pc[15];
329:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
330:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
331:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
332:           || p16 != 0.0) {
333:         pv = ba + 16*diag_offset[row];
334:         pj = bj + diag_offset[row] + 1;
335:         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
336:         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
337:         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
338:         x15 = pv[14]; x16 = pv[15];
339:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
340:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
341:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
342:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

344:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
345:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
346:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
347:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

349:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
350:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
351:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
352:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

354:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
355:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
356:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
357:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
358:         nz = bi[row+1] - diag_offset[row] - 1;
359:         pv += 16;
360:         for (j=0; j<nz; j++) {
361:           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
362:           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
363:           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
364:           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
365:           x    = rtmp + 16*pj[j];
366:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
367:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
368:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
369:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

371:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
372:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
373:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
374:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

376:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
377:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
378:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
379:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

381:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
382:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
383:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
384:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

386:           pv   += 16;
387:         }
388:         PetscLogFlops(128.0*nz+112.0);
389:       }
390:       row = *ajtmp++;
391:     }
392:     /* finished row so stick it into b->a */
393:     pv = ba + 16*bi[i];
394:     pj = bj + bi[i];
395:     nz = bi[i+1] - bi[i];
396:     for (j=0; j<nz; j++) {
397:       x      = rtmp+16*pj[j];
398:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
399:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
400:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
401:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
402:       pv   += 16;
403:     }
404:     /* invert diagonal block */
405:     w = ba + 16*diag_offset[i];
406:     if (pivotinblocks) {
407:       Kernel_A_gets_inverse_A_4(w,shift);
408:     } else {
409:       Kernel_A_gets_inverse_A_4_nopivot(w);
410:     }
411:   }

413:   PetscFree(rtmp);
414:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
415:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
416:   C->assembled = PETSC_TRUE;
417:   PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
418:   return(0);
419: }

421: /*
422:   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
423:     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
424: */
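/*
   Factor storage convention assumed here (and in MatLUFactorNumeric_SeqBAIJ_4
   above), inferred from the index arithmetic: for block row i the L blocks sit
   in b->a at positions bi[i] .. bi[i+1]-1, the inverted diagonal block at
   bdiag[i], and the strictly upper triangular blocks at bdiag[i+1]+1 .. bdiag[i]-1;
   bdiag[] decreases with i, i.e. U is stored from the end of the arrays.
*/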
427: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
428: {
429:   Mat            C=B;
430:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
432:   PetscInt       i,j,k,nz,nzL,row;
433:   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
434:   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
435:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
436:   PetscInt       flg;
437:   PetscReal      shift = info->shiftamount;

440:   /* generate work space needed by the factorization */
441:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
442:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

444:   for (i=0; i<n; i++){
445:     /* zero rtmp */
446:     /* L part */
447:     nz    = bi[i+1] - bi[i];
448:     bjtmp = bj + bi[i];
449:     for  (j=0; j<nz; j++){
450:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
451:     }

453:     /* U part */
454:     nz = bdiag[i] - bdiag[i+1];
455:     bjtmp = bj + bdiag[i+1]+1;
456:     for  (j=0; j<nz; j++){
457:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
458:     }
459: 
460:     /* load in initial (unfactored row) */
461:     nz    = ai[i+1] - ai[i];
462:     ajtmp = aj + ai[i];
463:     v     = aa + bs2*ai[i];
464:     for (j=0; j<nz; j++) {
465:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
466:     }

468:     /* elimination */
469:     bjtmp = bj + bi[i];
470:     nzL   = bi[i+1] - bi[i];
471:     for(k=0;k < nzL;k++) {
472:       row = bjtmp[k];
473:       pc = rtmp + bs2*row;
474:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
475:       if (flg) {
476:         pv = b->a + bs2*bdiag[row];
477:         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
478:         Kernel_A_gets_A_times_B_4(pc,pv,mwork);
479: 
480:         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
481:         pv = b->a + bs2*(bdiag[row+1]+1);
482:         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding diag */
483:         for (j=0; j<nz; j++) {
484:           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
485:           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
486:           v    = rtmp + bs2*pj[j];
487:           Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
488:           pv  += bs2;
489:         }
490:         PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 for bs=4 */
491:       }
492:     }

494:     /* finished row so stick it into b->a */
495:     /* L part */
496:     pv   = b->a + bs2*bi[i] ;
497:     pj   = b->j + bi[i] ;
498:     nz   = bi[i+1] - bi[i];
499:     for (j=0; j<nz; j++) {
500:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
501:     }

503:     /* Mark diagonal and invert diagonal for simpler triangular solves */
504:     pv   = b->a + bs2*bdiag[i];
505:     pj   = b->j + bdiag[i];
506:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
507:     /* Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work); */
508:     Kernel_A_gets_inverse_A_4(pv,shift);
509: 
510:     /* U part */
511:     pv = b->a + bs2*(bdiag[i+1]+1);
512:     pj = b->j + bdiag[i+1]+1;
513:     nz = bdiag[i] - bdiag[i+1] - 1;
514:     for (j=0; j<nz; j++){
515:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
516:     }
517:   }
518:   PetscFree2(rtmp,mwork);
519:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
520:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
521:   C->assembled = PETSC_TRUE;
522:   PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
523:   return(0);
524: }

526: #if defined(PETSC_HAVE_SSE)

528: #include PETSC_HAVE_SSE

530: /* SSE version for when blocks are 4 by 4, using natural ordering */
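/*
   Sketch of the SSE strategy below (it assumes single-precision MatScalar, since
   one 4x4 block is treated as 16 floats spread over four XMM registers): each
   block is loaded columnwise into XMM0-XMM3, tested against a zeroed register
   with CMPNEQ/MOVEMASK so all-zero blocks are skipped, multiplied by the already
   inverted pivot block one broadcast scalar at a time (SHUFFLE 0x00), and the
   resulting multiplier times U(row,:) is subtracted from the remaining blocks.
*/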
533: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
534: {
535:   Mat         C = B;
536:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
537:   int i,j,n = a->mbs;
538:   int         *bj = b->j,*bjtmp,*pj;
539:   int         row;
540:   int         *ajtmpold,nz,*bi=b->i;
541:   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
542:   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
543:   MatScalar   *ba = b->a,*aa = a->a;
544:   int         nonzero=0;
545: /*    int            nonzero=0,colscale = 16; */
546:   PetscTruth  pivotinblocks = b->pivotinblocks;
547:   PetscReal      shift = info->shiftamount;

550:   SSE_SCOPE_BEGIN;

552:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
553:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
554:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
555:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
556: /*    if ((unsigned long)bj==(unsigned long)aj) { */
557: /*      colscale = 4; */
558: /*    } */
559:   for (i=0; i<n; i++) {
560:     nz    = bi[i+1] - bi[i];
561:     bjtmp = bj + bi[i];
562:     /* zero out the 4x4 block accumulators */
563:     /* zero out one register */
564:     XOR_PS(XMM7,XMM7);
565:     for  (j=0; j<nz; j++) {
566:       x = rtmp+16*bjtmp[j];
567: /*        x = rtmp+4*bjtmp[j]; */
568:       SSE_INLINE_BEGIN_1(x)
569:         /* Copy zero register to memory locations */
570:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
571:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
572:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
573:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
574:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
575:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
576:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
577:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
578:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
579:       SSE_INLINE_END_1;
580:     }
581:     /* load in initial (unfactored row) */
582:     nz       = ai[i+1] - ai[i];
583:     ajtmpold = aj + ai[i];
584:     v        = aa + 16*ai[i];
585:     for (j=0; j<nz; j++) {
586:       x = rtmp+16*ajtmpold[j];
587: /*        x = rtmp+colscale*ajtmpold[j]; */
588:       /* Copy v block into x block */
589:       SSE_INLINE_BEGIN_2(v,x)
590:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
591:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
592:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

594:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
595:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

597:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
598:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

600:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
601:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

603:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
604:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

606:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
607:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

609:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
610:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

612:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
613:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
614:       SSE_INLINE_END_2;

616:       v += 16;
617:     }
618: /*      row = (*bjtmp++)/4; */
619:     row = *bjtmp++;
620:     while (row < i) {
621:       pc  = rtmp + 16*row;
622:       SSE_INLINE_BEGIN_1(pc)
623:         /* Load block from lower triangle */
624:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
625:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
626:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

628:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
629:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

631:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
632:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

634:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
635:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

637:         /* Compare block to zero block */

639:         SSE_COPY_PS(XMM4,XMM7)
640:         SSE_CMPNEQ_PS(XMM4,XMM0)

642:         SSE_COPY_PS(XMM5,XMM7)
643:         SSE_CMPNEQ_PS(XMM5,XMM1)

645:         SSE_COPY_PS(XMM6,XMM7)
646:         SSE_CMPNEQ_PS(XMM6,XMM2)

648:         SSE_CMPNEQ_PS(XMM7,XMM3)

650:         /* Reduce the comparisons to one SSE register */
651:         SSE_OR_PS(XMM6,XMM7)
652:         SSE_OR_PS(XMM5,XMM4)
653:         SSE_OR_PS(XMM5,XMM6)
654:       SSE_INLINE_END_1;

656:       /* Reduce the one SSE register to an integer register for branching */
657:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
658:       MOVEMASK(nonzero,XMM5);

660:       /* If block is nonzero ... */
661:       if (nonzero) {
662:         pv = ba + 16*diag_offset[row];
663:         PREFETCH_L1(&pv[16]);
664:         pj = bj + diag_offset[row] + 1;

666:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
667:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
668:         /* but the diagonal was inverted already */
669:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
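              /* The inline block below is the SSE form of the scalar multiplier formation  */
              /* in the *_inplace variants above: column c of the product pc*pv is built by */
              /* broadcasting pv[k+4*c] and accumulating pv[k+4*c]*XMMk (column k of pc)    */
              /* for k=0..3, then stored back over pc.                                      */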

671:         SSE_INLINE_BEGIN_2(pv,pc)
672:           /* Column 0, product is accumulated in XMM4 */
673:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
674:           SSE_SHUFFLE(XMM4,XMM4,0x00)
675:           SSE_MULT_PS(XMM4,XMM0)

677:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
678:           SSE_SHUFFLE(XMM5,XMM5,0x00)
679:           SSE_MULT_PS(XMM5,XMM1)
680:           SSE_ADD_PS(XMM4,XMM5)

682:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
683:           SSE_SHUFFLE(XMM6,XMM6,0x00)
684:           SSE_MULT_PS(XMM6,XMM2)
685:           SSE_ADD_PS(XMM4,XMM6)

687:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
688:           SSE_SHUFFLE(XMM7,XMM7,0x00)
689:           SSE_MULT_PS(XMM7,XMM3)
690:           SSE_ADD_PS(XMM4,XMM7)

692:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
693:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

695:           /* Column 1, product is accumulated in XMM5 */
696:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
697:           SSE_SHUFFLE(XMM5,XMM5,0x00)
698:           SSE_MULT_PS(XMM5,XMM0)

700:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
701:           SSE_SHUFFLE(XMM6,XMM6,0x00)
702:           SSE_MULT_PS(XMM6,XMM1)
703:           SSE_ADD_PS(XMM5,XMM6)

705:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
706:           SSE_SHUFFLE(XMM7,XMM7,0x00)
707:           SSE_MULT_PS(XMM7,XMM2)
708:           SSE_ADD_PS(XMM5,XMM7)

710:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
711:           SSE_SHUFFLE(XMM6,XMM6,0x00)
712:           SSE_MULT_PS(XMM6,XMM3)
713:           SSE_ADD_PS(XMM5,XMM6)

715:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
716:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

718:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

720:           /* Column 2, product is accumulated in XMM6 */
721:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
722:           SSE_SHUFFLE(XMM6,XMM6,0x00)
723:           SSE_MULT_PS(XMM6,XMM0)

725:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
726:           SSE_SHUFFLE(XMM7,XMM7,0x00)
727:           SSE_MULT_PS(XMM7,XMM1)
728:           SSE_ADD_PS(XMM6,XMM7)

730:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
731:           SSE_SHUFFLE(XMM7,XMM7,0x00)
732:           SSE_MULT_PS(XMM7,XMM2)
733:           SSE_ADD_PS(XMM6,XMM7)

735:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
736:           SSE_SHUFFLE(XMM7,XMM7,0x00)
737:           SSE_MULT_PS(XMM7,XMM3)
738:           SSE_ADD_PS(XMM6,XMM7)
739: 
740:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
741:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

743:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
744:           /* Column 3, product is accumulated in XMM0 */
745:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
746:           SSE_SHUFFLE(XMM7,XMM7,0x00)
747:           SSE_MULT_PS(XMM0,XMM7)

749:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
750:           SSE_SHUFFLE(XMM7,XMM7,0x00)
751:           SSE_MULT_PS(XMM1,XMM7)
752:           SSE_ADD_PS(XMM0,XMM1)

754:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
755:           SSE_SHUFFLE(XMM1,XMM1,0x00)
756:           SSE_MULT_PS(XMM1,XMM2)
757:           SSE_ADD_PS(XMM0,XMM1)

759:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
760:           SSE_SHUFFLE(XMM7,XMM7,0x00)
761:           SSE_MULT_PS(XMM3,XMM7)
762:           SSE_ADD_PS(XMM0,XMM3)

764:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
765:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

767:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
 768:           /* This is code to be maintained and read by humans after all. */
769:           /* Copy Multiplier Col 3 into XMM3 */
770:           SSE_COPY_PS(XMM3,XMM0)
771:           /* Copy Multiplier Col 2 into XMM2 */
772:           SSE_COPY_PS(XMM2,XMM6)
773:           /* Copy Multiplier Col 1 into XMM1 */
774:           SSE_COPY_PS(XMM1,XMM5)
775:           /* Copy Multiplier Col 0 into XMM0 */
776:           SSE_COPY_PS(XMM0,XMM4)
777:         SSE_INLINE_END_2;

779:         /* Update the row: */
780:         nz = bi[row+1] - diag_offset[row] - 1;
781:         pv += 16;
782:         for (j=0; j<nz; j++) {
783:           PREFETCH_L1(&pv[16]);
784:           x = rtmp + 16*pj[j];
785: /*            x = rtmp + 4*pj[j]; */

787:           /* X:=X-M*PV, One column at a time */
788:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
789:           SSE_INLINE_BEGIN_2(x,pv)
790:             /* Load First Column of X*/
791:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
792:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

794:             /* Matrix-Vector Product: */
795:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
796:             SSE_SHUFFLE(XMM5,XMM5,0x00)
797:             SSE_MULT_PS(XMM5,XMM0)
798:             SSE_SUB_PS(XMM4,XMM5)

800:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
801:             SSE_SHUFFLE(XMM6,XMM6,0x00)
802:             SSE_MULT_PS(XMM6,XMM1)
803:             SSE_SUB_PS(XMM4,XMM6)

805:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
806:             SSE_SHUFFLE(XMM7,XMM7,0x00)
807:             SSE_MULT_PS(XMM7,XMM2)
808:             SSE_SUB_PS(XMM4,XMM7)

810:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
811:             SSE_SHUFFLE(XMM5,XMM5,0x00)
812:             SSE_MULT_PS(XMM5,XMM3)
813:             SSE_SUB_PS(XMM4,XMM5)

815:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
816:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

818:             /* Second Column */
819:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
820:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

822:             /* Matrix-Vector Product: */
823:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
824:             SSE_SHUFFLE(XMM6,XMM6,0x00)
825:             SSE_MULT_PS(XMM6,XMM0)
826:             SSE_SUB_PS(XMM5,XMM6)

828:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
829:             SSE_SHUFFLE(XMM7,XMM7,0x00)
830:             SSE_MULT_PS(XMM7,XMM1)
831:             SSE_SUB_PS(XMM5,XMM7)

833:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
834:             SSE_SHUFFLE(XMM4,XMM4,0x00)
835:             SSE_MULT_PS(XMM4,XMM2)
836:             SSE_SUB_PS(XMM5,XMM4)

838:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
839:             SSE_SHUFFLE(XMM6,XMM6,0x00)
840:             SSE_MULT_PS(XMM6,XMM3)
841:             SSE_SUB_PS(XMM5,XMM6)
842: 
843:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
844:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

846:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

848:             /* Third Column */
849:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
850:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

852:             /* Matrix-Vector Product: */
853:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
854:             SSE_SHUFFLE(XMM7,XMM7,0x00)
855:             SSE_MULT_PS(XMM7,XMM0)
856:             SSE_SUB_PS(XMM6,XMM7)

858:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
859:             SSE_SHUFFLE(XMM4,XMM4,0x00)
860:             SSE_MULT_PS(XMM4,XMM1)
861:             SSE_SUB_PS(XMM6,XMM4)

863:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
864:             SSE_SHUFFLE(XMM5,XMM5,0x00)
865:             SSE_MULT_PS(XMM5,XMM2)
866:             SSE_SUB_PS(XMM6,XMM5)

868:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
869:             SSE_SHUFFLE(XMM7,XMM7,0x00)
870:             SSE_MULT_PS(XMM7,XMM3)
871:             SSE_SUB_PS(XMM6,XMM7)
872: 
873:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
874:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
875: 
876:             /* Fourth Column */
877:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
878:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

880:             /* Matrix-Vector Product: */
881:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
882:             SSE_SHUFFLE(XMM5,XMM5,0x00)
883:             SSE_MULT_PS(XMM5,XMM0)
884:             SSE_SUB_PS(XMM4,XMM5)

886:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
887:             SSE_SHUFFLE(XMM6,XMM6,0x00)
888:             SSE_MULT_PS(XMM6,XMM1)
889:             SSE_SUB_PS(XMM4,XMM6)

891:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
892:             SSE_SHUFFLE(XMM7,XMM7,0x00)
893:             SSE_MULT_PS(XMM7,XMM2)
894:             SSE_SUB_PS(XMM4,XMM7)

896:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
897:             SSE_SHUFFLE(XMM5,XMM5,0x00)
898:             SSE_MULT_PS(XMM5,XMM3)
899:             SSE_SUB_PS(XMM4,XMM5)
900: 
901:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
902:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
903:           SSE_INLINE_END_2;
904:           pv   += 16;
905:         }
906:         PetscLogFlops(128.0*nz+112.0);
907:       }
908:       row = *bjtmp++;
909: /*        row = (*bjtmp++)/4; */
910:     }
911:     /* finished row so stick it into b->a */
912:     pv = ba + 16*bi[i];
913:     pj = bj + bi[i];
914:     nz = bi[i+1] - bi[i];

916:     /* Copy x block back into pv block */
917:     for (j=0; j<nz; j++) {
918:       x  = rtmp+16*pj[j];
919: /*        x  = rtmp+4*pj[j]; */

921:       SSE_INLINE_BEGIN_2(x,pv)
922:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
923:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
924:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

926:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
927:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

929:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
930:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

932:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
933:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

935:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
936:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

938:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
939:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

941:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
942:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

944:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
945:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
946:       SSE_INLINE_END_2;
947:       pv += 16;
948:     }
949:     /* invert diagonal block */
950:     w = ba + 16*diag_offset[i];
951:     if (pivotinblocks) {
952:       Kernel_A_gets_inverse_A_4(w,shift);
953:     } else {
954:       Kernel_A_gets_inverse_A_4_nopivot(w);
955:     }
956: /*      Kernel_A_gets_inverse_A_4_SSE(w); */
 957:     /* Note: since Cramer's rule is used to invert the diagonal blocks, the flop count below may be unfairly high or low. */
958:   }

960:   PetscFree(rtmp);
961:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
962:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
963:   C->assembled = PETSC_TRUE;
 964:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
965:   /* Flop Count from inverting diagonal blocks */
966:   SSE_SCOPE_END;
967:   return(0);
968: }

 972: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
973: {
974:   Mat            A=C;
975:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
977:   int i,j,n = a->mbs;
978:   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
979:   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
980:   unsigned int   row;
981:   int            nz,*bi=b->i;
982:   int            *diag_offset = b->diag,*ai=a->i;
983:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
984:   MatScalar      *ba = b->a,*aa = a->a;
985:   int            nonzero=0;
986: /*    int            nonzero=0,colscale = 16; */
987:   PetscTruth     pivotinblocks = b->pivotinblocks;
988:   PetscReal      shift = info->shiftamount;

991:   SSE_SCOPE_BEGIN;

993:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
994:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
995:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
996:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
997: /*    if ((unsigned long)bj==(unsigned long)aj) { */
998: /*      colscale = 4; */
999: /*    } */
1000: 
1001:   for (i=0; i<n; i++) {
1002:     nz    = bi[i+1] - bi[i];
1003:     bjtmp = bj + bi[i];
1004:     /* zero out the 4x4 block accumulators */
1005:     /* zero out one register */
1006:     XOR_PS(XMM7,XMM7);
1007:     for  (j=0; j<nz; j++) {
1008:       x = rtmp+16*((unsigned int)bjtmp[j]);
1009: /*        x = rtmp+4*bjtmp[j]; */
1010:       SSE_INLINE_BEGIN_1(x)
1011:         /* Copy zero register to memory locations */
1012:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1013:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1014:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1015:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1016:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1017:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1018:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1019:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1020:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1021:       SSE_INLINE_END_1;
1022:     }
1023:     /* load in initial (unfactored row) */
1024:     nz    = ai[i+1] - ai[i];
1025:     ajtmp = aj + ai[i];
1026:     v     = aa + 16*ai[i];
1027:     for (j=0; j<nz; j++) {
1028:       x = rtmp+16*((unsigned int)ajtmp[j]);
1029: /*        x = rtmp+colscale*ajtmp[j]; */
1030:       /* Copy v block into x block */
1031:       SSE_INLINE_BEGIN_2(v,x)
1032:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1033:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1034:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1036:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1037:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1039:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1040:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1042:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1043:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1045:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1046:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1048:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1049:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1051:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1052:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1054:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1055:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1056:       SSE_INLINE_END_2;

1058:       v += 16;
1059:     }
1060: /*      row = (*bjtmp++)/4; */
1061:     row = (unsigned int)(*bjtmp++);
1062:     while (row < i) {
1063:       pc  = rtmp + 16*row;
1064:       SSE_INLINE_BEGIN_1(pc)
1065:         /* Load block from lower triangle */
1066:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1067:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1068:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1070:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1071:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1073:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1074:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1076:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1077:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1079:         /* Compare block to zero block */

1081:         SSE_COPY_PS(XMM4,XMM7)
1082:         SSE_CMPNEQ_PS(XMM4,XMM0)

1084:         SSE_COPY_PS(XMM5,XMM7)
1085:         SSE_CMPNEQ_PS(XMM5,XMM1)

1087:         SSE_COPY_PS(XMM6,XMM7)
1088:         SSE_CMPNEQ_PS(XMM6,XMM2)

1090:         SSE_CMPNEQ_PS(XMM7,XMM3)

1092:         /* Reduce the comparisons to one SSE register */
1093:         SSE_OR_PS(XMM6,XMM7)
1094:         SSE_OR_PS(XMM5,XMM4)
1095:         SSE_OR_PS(XMM5,XMM6)
1096:       SSE_INLINE_END_1;

1098:       /* Reduce the one SSE register to an integer register for branching */
1099:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1100:       MOVEMASK(nonzero,XMM5);

1102:       /* If block is nonzero ... */
1103:       if (nonzero) {
1104:         pv = ba + 16*diag_offset[row];
1105:         PREFETCH_L1(&pv[16]);
1106:         pj = bj + diag_offset[row] + 1;

1108:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1109:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1110:         /* but the diagonal was inverted already */
1111:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1113:         SSE_INLINE_BEGIN_2(pv,pc)
1114:           /* Column 0, product is accumulated in XMM4 */
1115:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1116:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1117:           SSE_MULT_PS(XMM4,XMM0)

1119:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1120:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1121:           SSE_MULT_PS(XMM5,XMM1)
1122:           SSE_ADD_PS(XMM4,XMM5)

1124:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1125:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1126:           SSE_MULT_PS(XMM6,XMM2)
1127:           SSE_ADD_PS(XMM4,XMM6)

1129:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1130:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1131:           SSE_MULT_PS(XMM7,XMM3)
1132:           SSE_ADD_PS(XMM4,XMM7)

1134:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1135:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1137:           /* Column 1, product is accumulated in XMM5 */
1138:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1139:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1140:           SSE_MULT_PS(XMM5,XMM0)

1142:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1143:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1144:           SSE_MULT_PS(XMM6,XMM1)
1145:           SSE_ADD_PS(XMM5,XMM6)

1147:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1148:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1149:           SSE_MULT_PS(XMM7,XMM2)
1150:           SSE_ADD_PS(XMM5,XMM7)

1152:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1153:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1154:           SSE_MULT_PS(XMM6,XMM3)
1155:           SSE_ADD_PS(XMM5,XMM6)

1157:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1158:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1160:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1162:           /* Column 2, product is accumulated in XMM6 */
1163:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1164:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1165:           SSE_MULT_PS(XMM6,XMM0)

1167:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1168:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1169:           SSE_MULT_PS(XMM7,XMM1)
1170:           SSE_ADD_PS(XMM6,XMM7)

1172:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1173:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1174:           SSE_MULT_PS(XMM7,XMM2)
1175:           SSE_ADD_PS(XMM6,XMM7)

1177:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1178:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1179:           SSE_MULT_PS(XMM7,XMM3)
1180:           SSE_ADD_PS(XMM6,XMM7)
1181: 
1182:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1183:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1185:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1186:           /* Column 3, product is accumulated in XMM0 */
1187:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1188:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1189:           SSE_MULT_PS(XMM0,XMM7)

1191:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1192:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1193:           SSE_MULT_PS(XMM1,XMM7)
1194:           SSE_ADD_PS(XMM0,XMM1)

1196:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1197:           SSE_SHUFFLE(XMM1,XMM1,0x00)
1198:           SSE_MULT_PS(XMM1,XMM2)
1199:           SSE_ADD_PS(XMM0,XMM1)

1201:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1202:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1203:           SSE_MULT_PS(XMM3,XMM7)
1204:           SSE_ADD_PS(XMM0,XMM3)

1206:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1207:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1209:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1210:           /* This is code to be maintained and read by humans after all. */
1211:           /* Copy Multiplier Col 3 into XMM3 */
1212:           SSE_COPY_PS(XMM3,XMM0)
1213:           /* Copy Multiplier Col 2 into XMM2 */
1214:           SSE_COPY_PS(XMM2,XMM6)
1215:           /* Copy Multiplier Col 1 into XMM1 */
1216:           SSE_COPY_PS(XMM1,XMM5)
1217:           /* Copy Multiplier Col 0 into XMM0 */
1218:           SSE_COPY_PS(XMM0,XMM4)
1219:         SSE_INLINE_END_2;

1221:         /* Update the row: */
1222:         nz = bi[row+1] - diag_offset[row] - 1;
1223:         pv += 16;
1224:         for (j=0; j<nz; j++) {
1225:           PREFETCH_L1(&pv[16]);
1226:           x = rtmp + 16*((unsigned int)pj[j]);
1227: /*            x = rtmp + 4*pj[j]; */

1229:           /* X:=X-M*PV, One column at a time */
1230:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1231:           SSE_INLINE_BEGIN_2(x,pv)
1232:             /* Load First Column of X*/
1233:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1234:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1236:             /* Matrix-Vector Product: */
1237:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1238:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1239:             SSE_MULT_PS(XMM5,XMM0)
1240:             SSE_SUB_PS(XMM4,XMM5)

1242:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1243:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1244:             SSE_MULT_PS(XMM6,XMM1)
1245:             SSE_SUB_PS(XMM4,XMM6)

1247:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1248:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1249:             SSE_MULT_PS(XMM7,XMM2)
1250:             SSE_SUB_PS(XMM4,XMM7)

1252:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1253:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1254:             SSE_MULT_PS(XMM5,XMM3)
1255:             SSE_SUB_PS(XMM4,XMM5)

1257:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1258:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1260:             /* Second Column */
1261:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1262:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1264:             /* Matrix-Vector Product: */
1265:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1266:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1267:             SSE_MULT_PS(XMM6,XMM0)
1268:             SSE_SUB_PS(XMM5,XMM6)

1270:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1271:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1272:             SSE_MULT_PS(XMM7,XMM1)
1273:             SSE_SUB_PS(XMM5,XMM7)

1275:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1276:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1277:             SSE_MULT_PS(XMM4,XMM2)
1278:             SSE_SUB_PS(XMM5,XMM4)

1280:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1281:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1282:             SSE_MULT_PS(XMM6,XMM3)
1283:             SSE_SUB_PS(XMM5,XMM6)
1284: 
1285:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1286:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1288:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1290:             /* Third Column */
1291:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1292:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1294:             /* Matrix-Vector Product: */
1295:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1296:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1297:             SSE_MULT_PS(XMM7,XMM0)
1298:             SSE_SUB_PS(XMM6,XMM7)

1300:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1301:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1302:             SSE_MULT_PS(XMM4,XMM1)
1303:             SSE_SUB_PS(XMM6,XMM4)

1305:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1306:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1307:             SSE_MULT_PS(XMM5,XMM2)
1308:             SSE_SUB_PS(XMM6,XMM5)

1310:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1311:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1312:             SSE_MULT_PS(XMM7,XMM3)
1313:             SSE_SUB_PS(XMM6,XMM7)
1314: 
1315:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1316:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1317: 
1318:             /* Fourth Column */
1319:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1320:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1322:             /* Matrix-Vector Product: */
1323:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1324:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1325:             SSE_MULT_PS(XMM5,XMM0)
1326:             SSE_SUB_PS(XMM4,XMM5)

1328:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1329:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1330:             SSE_MULT_PS(XMM6,XMM1)
1331:             SSE_SUB_PS(XMM4,XMM6)

1333:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1334:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1335:             SSE_MULT_PS(XMM7,XMM2)
1336:             SSE_SUB_PS(XMM4,XMM7)

1338:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1339:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1340:             SSE_MULT_PS(XMM5,XMM3)
1341:             SSE_SUB_PS(XMM4,XMM5)
1342: 
1343:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1344:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1345:           SSE_INLINE_END_2;
1346:           pv   += 16;
1347:         }
1348:         PetscLogFlops(128.0*nz+112.0);
1349:       }
1350:       row = (unsigned int)(*bjtmp++);
1351: /*        row = (*bjtmp++)/4; */
1352: /*        bjtmp++; */
1353:     }
1354:     /* finished row so stick it into b->a */
1355:     pv = ba + 16*bi[i];
1356:     pj = bj + bi[i];
1357:     nz = bi[i+1] - bi[i];

1359:     /* Copy x block back into pv block */
1360:     for (j=0; j<nz; j++) {
1361:       x  = rtmp+16*((unsigned int)pj[j]);
1362: /*        x  = rtmp+4*pj[j]; */

1364:       SSE_INLINE_BEGIN_2(x,pv)
1365:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1366:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1367:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1369:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1370:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1372:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1373:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1375:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1376:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1378:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1379:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1381:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1382:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1384:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1385:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1387:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1388:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1389:       SSE_INLINE_END_2;
1390:       pv += 16;
1391:     }
1392:     /* invert diagonal block */
1393:     w = ba + 16*diag_offset[i];
1394:     if (pivotinblocks) {
1395:       Kernel_A_gets_inverse_A_4(w,shift);
1396:     } else {
1397:       Kernel_A_gets_inverse_A_4_nopivot(w);
1398:     }
1399: /*      Kernel_A_gets_inverse_A_4_SSE(w); */
1400:     /* Note: since Cramer's rule is used to invert the diagonal blocks, the flop count below may be unfairly high or low. */
1401:   }

1403:   PetscFree(rtmp);
1404:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1405:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1406:   C->assembled = PETSC_TRUE;
1407:   PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1408:   /* Flop Count from inverting diagonal blocks */
1409:   SSE_SCOPE_END;
1410:   return(0);
1411: }

1415: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1416: {
1417:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1419:   int  i,j,n = a->mbs;
1420:   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1421:   unsigned int   row;
1422:   int            *ajtmpold,nz,*bi=b->i;
1423:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1424:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1425:   MatScalar      *ba = b->a,*aa = a->a;
1426:   int            nonzero=0;
1427: /*    int            nonzero=0,colscale = 16; */
1428:   PetscTruth     pivotinblocks = b->pivotinblocks;
1429:   PetscReal      shift = info->shiftamount;

1432:   SSE_SCOPE_BEGIN;

1434:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1435:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1436:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1437:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1438: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1439: /*      colscale = 4; */
1440: /*    } */
1441:   if ((unsigned long)bj==(unsigned long)aj) {
1442:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1443:   }
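  /*
     The test above covers the case where the factor shares its block-column index
     array with the original matrix (as in an in-place, ILU(0)-style factorization);
     that case is handed to the _usj_Inplace variant above, which walks the shared
     index array as unsigned shorts throughout (so at most 65536 block columns are
     assumed).
  */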
1444: 
1445:   for (i=0; i<n; i++) {
1446:     nz    = bi[i+1] - bi[i];
1447:     bjtmp = bj + bi[i];
1448:     /* zero out the 4x4 block accumulators */
1449:     /* zero out one register */
1450:     XOR_PS(XMM7,XMM7);
1451:     for  (j=0; j<nz; j++) {
1452:       x = rtmp+16*((unsigned int)bjtmp[j]);
1453: /*        x = rtmp+4*bjtmp[j]; */
1454:       SSE_INLINE_BEGIN_1(x)
1455:         /* Copy zero register to memory locations */
1456:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1457:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1458:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1459:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1460:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1461:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1462:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1463:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1464:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1465:       SSE_INLINE_END_1;
1466:     }
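    /* The loop above is just a vectorized zero-fill of the 4x4 work blocks of
       row i in rtmp; an illustrative (non-SSE) equivalent would be
         for (j=0; j<nz; j++) PetscMemzero(rtmp+16*bjtmp[j],16*sizeof(MatScalar)); */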
1467:     /* load in initial (unfactored row) */
1468:     nz       = ai[i+1] - ai[i];
1469:     ajtmpold = aj + ai[i];
1470:     v        = aa + 16*ai[i];
1471:     for (j=0; j<nz; j++) {
1472:       x = rtmp+16*ajtmpold[j];
1473: /*        x = rtmp+colscale*ajtmpold[j]; */
1474:       /* Copy v block into x block */
1475:       SSE_INLINE_BEGIN_2(v,x)
1476:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1477:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1478:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1480:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1481:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1483:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1484:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1486:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1487:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1489:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1490:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1492:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1493:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1495:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1496:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1498:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1499:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1500:       SSE_INLINE_END_2;

1502:       v += 16;
1503:     }
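    /* The loop above scatters the 4x4 blocks of row i of A into the dense
       work row rtmp, indexed by block column; an illustrative scalar
       equivalent per block would be
         PetscMemcpy(x,v,16*sizeof(MatScalar));
       with x and v advancing exactly as in the SSE loop. */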
1504: /*      row = (*bjtmp++)/4; */
1505:     row = (unsigned int)(*bjtmp++);
1506:     while (row < i) {
1507:       pc  = rtmp + 16*row;
1508:       SSE_INLINE_BEGIN_1(pc)
1509:         /* Load block from lower triangle */
1510:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1511:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1512:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1514:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1515:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1517:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1518:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1520:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1521:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1523:         /* Compare block to zero block */

1525:         SSE_COPY_PS(XMM4,XMM7)
1526:         SSE_CMPNEQ_PS(XMM4,XMM0)

1528:         SSE_COPY_PS(XMM5,XMM7)
1529:         SSE_CMPNEQ_PS(XMM5,XMM1)

1531:         SSE_COPY_PS(XMM6,XMM7)
1532:         SSE_CMPNEQ_PS(XMM6,XMM2)

1534:         SSE_CMPNEQ_PS(XMM7,XMM3)

1536:         /* Reduce the comparisons to one SSE register */
1537:         SSE_OR_PS(XMM6,XMM7)
1538:         SSE_OR_PS(XMM5,XMM4)
1539:         SSE_OR_PS(XMM5,XMM6)
1540:       SSE_INLINE_END_1;

1542:       /* Reduce the one SSE register to an integer register for branching */
1543:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1544:       MOVEMASK(nonzero,XMM5);
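      /* MOVEMASK collapses the sign bits of the OR-ed CMPNEQ results into the
         integer 'nonzero'; it is zero exactly when every entry of the 4x4
         block at pc equals 0.0, which lets the multiplier formation and row
         update below be skipped for blocks that are present in the sparsity
         pattern but numerically zero. */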

1546:       /* If block is nonzero ... */
1547:       if (nonzero) {
1548:         pv = ba + 16*diag_offset[row];
1549:         PREFETCH_L1(&pv[16]);
1550:         pj = bj + diag_offset[row] + 1;

1552:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1553:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1554:         /* but the diagonal was inverted already */
1555:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1557:         SSE_INLINE_BEGIN_2(pv,pc)
1558:           /* Column 0, product is accumulated in XMM4 */
1559:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1560:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1561:           SSE_MULT_PS(XMM4,XMM0)

1563:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1564:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1565:           SSE_MULT_PS(XMM5,XMM1)
1566:           SSE_ADD_PS(XMM4,XMM5)

1568:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1569:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1570:           SSE_MULT_PS(XMM6,XMM2)
1571:           SSE_ADD_PS(XMM4,XMM6)

1573:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1574:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1575:           SSE_MULT_PS(XMM7,XMM3)
1576:           SSE_ADD_PS(XMM4,XMM7)

1578:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1579:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1581:           /* Column 1, product is accumulated in XMM5 */
1582:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1583:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1584:           SSE_MULT_PS(XMM5,XMM0)

1586:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1587:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1588:           SSE_MULT_PS(XMM6,XMM1)
1589:           SSE_ADD_PS(XMM5,XMM6)

1591:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1592:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1593:           SSE_MULT_PS(XMM7,XMM2)
1594:           SSE_ADD_PS(XMM5,XMM7)

1596:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1597:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1598:           SSE_MULT_PS(XMM6,XMM3)
1599:           SSE_ADD_PS(XMM5,XMM6)

1601:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1602:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1604:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1606:           /* Column 2, product is accumulated in XMM6 */
1607:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1608:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1609:           SSE_MULT_PS(XMM6,XMM0)

1611:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1612:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1613:           SSE_MULT_PS(XMM7,XMM1)
1614:           SSE_ADD_PS(XMM6,XMM7)

1616:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1617:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1618:           SSE_MULT_PS(XMM7,XMM2)
1619:           SSE_ADD_PS(XMM6,XMM7)

1621:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1622:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1623:           SSE_MULT_PS(XMM7,XMM3)
1624:           SSE_ADD_PS(XMM6,XMM7)
1625: 
1626:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1627:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1629:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1630:           /* Column 3, product is accumulated in XMM0 */
1631:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1632:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1633:           SSE_MULT_PS(XMM0,XMM7)

1635:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1636:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1637:           SSE_MULT_PS(XMM1,XMM7)
1638:           SSE_ADD_PS(XMM0,XMM1)

1640:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1641:           SSE_SHUFFLE(XMM1,XMM1,0x00)
1642:           SSE_MULT_PS(XMM1,XMM2)
1643:           SSE_ADD_PS(XMM0,XMM1)

1645:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1646:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1647:           SSE_MULT_PS(XMM3,XMM7)
1648:           SSE_ADD_PS(XMM0,XMM3)

1650:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1651:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1653:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1654:           /* This is code to be maintained and read by humans, after all. */
1655:           /* Copy Multiplier Col 3 into XMM3 */
1656:           SSE_COPY_PS(XMM3,XMM0)
1657:           /* Copy Multiplier Col 2 into XMM2 */
1658:           SSE_COPY_PS(XMM2,XMM6)
1659:           /* Copy Multiplier Col 1 into XMM1 */
1660:           SSE_COPY_PS(XMM1,XMM5)
1661:           /* Copy Multiplier Col 0 into XMM0 */
1662:           SSE_COPY_PS(XMM0,XMM4)
1663:         SSE_INLINE_END_2;
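        /* In scalar terms, the inline block above overwrites pc with the
           multiplier P*D, where P is the old contents of pc and D = pv is the
           already-inverted diagonal block of row 'row' (both 4x4, stored
           column-major); the four result columns are also left in XMM0..XMM3
           for the row update that follows.  An illustrative (non-SSE)
           equivalent, with P holding a copy of the original pc block, is
             for (c=0; c<4; c++)
               for (r=0; r<4; r++)
                 pc[4*c+r] = P[r]*D[4*c]   + P[4+r]*D[4*c+1]
                           + P[8+r]*D[4*c+2] + P[12+r]*D[4*c+3];  */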

1665:         /* Update the row: */
1666:         nz = bi[row+1] - diag_offset[row] - 1;
1667:         pv += 16;
1668:         for (j=0; j<nz; j++) {
1669:           PREFETCH_L1(&pv[16]);
1670:           x = rtmp + 16*((unsigned int)pj[j]);
1671: /*            x = rtmp + 4*pj[j]; */

1673:           /* X:=X-M*PV, One column at a time */
1674:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1675:           SSE_INLINE_BEGIN_2(x,pv)
1676:             /* Load First Column of X*/
1677:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1678:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1680:             /* Matrix-Vector Product: */
1681:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1682:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1683:             SSE_MULT_PS(XMM5,XMM0)
1684:             SSE_SUB_PS(XMM4,XMM5)

1686:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1687:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1688:             SSE_MULT_PS(XMM6,XMM1)
1689:             SSE_SUB_PS(XMM4,XMM6)

1691:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1692:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1693:             SSE_MULT_PS(XMM7,XMM2)
1694:             SSE_SUB_PS(XMM4,XMM7)

1696:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1697:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1698:             SSE_MULT_PS(XMM5,XMM3)
1699:             SSE_SUB_PS(XMM4,XMM5)

1701:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1702:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1704:             /* Second Column */
1705:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1706:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1708:             /* Matrix-Vector Product: */
1709:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1710:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1711:             SSE_MULT_PS(XMM6,XMM0)
1712:             SSE_SUB_PS(XMM5,XMM6)

1714:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1715:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1716:             SSE_MULT_PS(XMM7,XMM1)
1717:             SSE_SUB_PS(XMM5,XMM7)

1719:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1720:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1721:             SSE_MULT_PS(XMM4,XMM2)
1722:             SSE_SUB_PS(XMM5,XMM4)

1724:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1725:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1726:             SSE_MULT_PS(XMM6,XMM3)
1727:             SSE_SUB_PS(XMM5,XMM6)
1728: 
1729:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1730:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1732:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1734:             /* Third Column */
1735:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1736:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1738:             /* Matrix-Vector Product: */
1739:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1740:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1741:             SSE_MULT_PS(XMM7,XMM0)
1742:             SSE_SUB_PS(XMM6,XMM7)

1744:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1745:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1746:             SSE_MULT_PS(XMM4,XMM1)
1747:             SSE_SUB_PS(XMM6,XMM4)

1749:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1750:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1751:             SSE_MULT_PS(XMM5,XMM2)
1752:             SSE_SUB_PS(XMM6,XMM5)

1754:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1755:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1756:             SSE_MULT_PS(XMM7,XMM3)
1757:             SSE_SUB_PS(XMM6,XMM7)
1758: 
1759:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1760:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1761: 
1762:             /* Fourth Column */
1763:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1764:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1766:             /* Matrix-Vector Product: */
1767:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1768:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1769:             SSE_MULT_PS(XMM5,XMM0)
1770:             SSE_SUB_PS(XMM4,XMM5)

1772:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1773:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1774:             SSE_MULT_PS(XMM6,XMM1)
1775:             SSE_SUB_PS(XMM4,XMM6)

1777:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1778:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1779:             SSE_MULT_PS(XMM7,XMM2)
1780:             SSE_SUB_PS(XMM4,XMM7)

1782:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1783:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1784:             SSE_MULT_PS(XMM5,XMM3)
1785:             SSE_SUB_PS(XMM4,XMM5)
1786: 
1787:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1788:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1789:           SSE_INLINE_END_2;
1790:           pv   += 16;
1791:         }
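        /* The loop above is the block Schur-complement update: for each block
           U = pv to the right of the diagonal in row 'row' of the factor, the
           corresponding work block X = rtmp+16*pj[j] of row i is replaced by
           X - M*U, with the multiplier M held column-wise in XMM0..XMM3 and
           the product formed one column at a time. */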
1792:         PetscLogFlops(128.0*nz+112.0);
1793:       }
1794:       row = (unsigned int)(*bjtmp++);
1795: /*        row = (*bjtmp++)/4; */
1796: /*        bjtmp++; */
1797:     }
1798:     /* finished row so stick it into b->a */
1799:     pv = ba + 16*bi[i];
1800:     pj = bj + bi[i];
1801:     nz = bi[i+1] - bi[i];

1803:     /* Copy x block back into pv block */
1804:     for (j=0; j<nz; j++) {
1805:       x  = rtmp+16*((unsigned int)pj[j]);
1806: /*        x  = rtmp+4*pj[j]; */

1808:       SSE_INLINE_BEGIN_2(x,pv)
1809:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1810:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1811:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1813:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1814:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1816:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1817:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1819:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1820:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1822:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1823:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1825:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1826:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1828:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1829:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1831:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1832:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1833:       SSE_INLINE_END_2;
1834:       pv += 16;
1835:     }
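    /* At this point the fully updated work row has been copied from rtmp back
       into the factor storage ba, block by block following the column indices
       in pj; only the diagonal block of row i remains to be inverted below. */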
1836:     /* invert diagonal block */
1837:     w = ba + 16*diag_offset[i];
1838:     if (pivotinblocks) {
1839:       Kernel_A_gets_inverse_A_4(w,shift);
1840:     } else {
1841:       Kernel_A_gets_inverse_A_4_nopivot(w);
1842:     }
1843: /*      Kernel_A_gets_inverse_A_4_SSE(w); */
1844:     /* Note: with Cramer's rule used for the inversion, the flop count below may be unfairly high or low */
1845:   }

1847:   PetscFree(rtmp);
1848:   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1849:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1850:   C->assembled = PETSC_TRUE;
1851:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs);
1852:   /* flop count logged above is from inverting the diagonal blocks */
1853:   SSE_SCOPE_END;
1854:   return(0);
1855: }

1857: #endif
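
/* ---------------------------------------------------------------------------
   Illustrative sketch (not part of the PETSc source; intentionally excluded
   from compilation): the two reference routines below spell out, in plain C
   on column-major 4x4 blocks of doubles, the core block operations that the
   SSE inline blocks above implement -- forming the multiplier M = P*D against
   the already-inverted diagonal block D, and applying the Schur update
   X := X - M*U.  The names baij4_form_multiplier and baij4_schur_update are
   hypothetical and exist only for this sketch.
   --------------------------------------------------------------------------- */
#if 0
/* pc := pc * d, where pc(r,c) is stored at pc[4*c+r] */
static void baij4_form_multiplier(double pc[16],const double d[16])
{
  double p[16];
  int    r,c,k;
  for (k=0; k<16; k++) p[k] = pc[k];            /* keep the old pc as P */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      double s = 0.0;
      for (k=0; k<4; k++) s += p[4*k+r]*d[4*c+k];
      pc[4*c+r] = s;                            /* (P*D)(r,c) */
    }
  }
}

/* x := x - m*u, all blocks 4x4 column-major */
static void baij4_schur_update(double x[16],const double m[16],const double u[16])
{
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      double s = 0.0;
      for (k=0; k<4; k++) s += m[4*k+r]*u[4*c+k];
      x[4*c+r] -= s;                            /* subtract (M*U)(r,c) */
    }
  }
}
#endif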