Actual source code: baijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for BAIJ format. 
  5: */

 7:  #include ../src/mat/impls/baij/seq/baij.h
 8:  #include ../src/mat/blockinvert.h
 9:  #include petscbt.h
 10:  #include ../src/mat/utils/freespace.h

 14: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 15: {
 16:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
 17:   PetscErrorCode    ierr;
 18:   PetscInt          i,nz;
 19:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
 20:   const MatScalar   *aa=a->a,*v;
 21:   PetscScalar       s1,*x;
 22:   const PetscScalar *b;

 25:   VecCopy(bb,xx);
 26:   VecGetArray(bb,(PetscScalar**)&b);
 27:   VecGetArray(xx,&x);
 28: 
 29:   /* forward solve the U^T */
 30:   for (i=0; i<n; i++) {

 32:     v     = aa + diag[i];
 33:     /* multiply by the inverse of the block diagonal */
 34:     s1    = (*v++)*x[i];
 35:     vi    = aj + diag[i] + 1;
 36:     nz    = ai[i+1] - diag[i] - 1;
 37:     while (nz--) {
 38:       x[*vi++]  -= (*v++)*s1;
 39:     }
 40:     x[i]   = s1;
 41:   }
 42:   /* backward solve the L^T */
 43:   for (i=n-1; i>=0; i--){
 44:     v    = aa + diag[i] - 1;
 45:     vi   = aj + diag[i] - 1;
 46:     nz   = diag[i] - ai[i];
 47:     s1   = x[i];
 48:     while (nz--) {
 49:       x[*vi--]   -=  (*v--)*s1;
 50:     }
 51:   }
 52:   VecRestoreArray(bb,(PetscScalar**)&b);
 53:   VecRestoreArray(xx,&x);
 54:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
 55:   return(0);
 56: }

 60: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
 61: {
 62:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
 63:   PetscErrorCode    ierr;
 64:   PetscInt          i,nz,idx,idt,oidx;
 65:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
 66:   const MatScalar   *aa=a->a,*v;
 67:   PetscScalar       s1,s2,x1,x2,*x;
 68:   const PetscScalar *b;

 71:   VecCopy(bb,xx);
 72:   VecGetArray(bb,(PetscScalar**)&b);
 73:   VecGetArray(xx,&x);

 75:   /* forward solve the U^T */
 76:   idx = 0;
 77:   for (i=0; i<n; i++) {

 79:     v     = aa + 4*diag[i];
 80:     /* multiply by the inverse of the block diagonal */
 81:     x1 = x[idx];   x2 = x[1+idx];
 82:     s1 = v[0]*x1  +  v[1]*x2;
 83:     s2 = v[2]*x1  +  v[3]*x2;
 84:     v += 4;

 86:     vi    = aj + diag[i] + 1;
 87:     nz    = ai[i+1] - diag[i] - 1;
 88:     while (nz--) {
 89:       oidx = 2*(*vi++);
 90:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
 91:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
 92:       v  += 4;
 93:     }
 94:     x[idx]   = s1;x[1+idx] = s2;
 95:     idx += 2;
 96:   }
 97:   /* backward solve the L^T */
 98:   for (i=n-1; i>=0; i--){
 99:     v    = aa + 4*diag[i] - 4;
100:     vi   = aj + diag[i] - 1;
101:     nz   = diag[i] - ai[i];
102:     idt  = 2*i;
103:     s1   = x[idt];  s2 = x[1+idt];
104:     while (nz--) {
105:       idx   = 2*(*vi--);
106:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108:       v -= 4;
109:     }
110:   }
111:   VecRestoreArray(bb,(PetscScalar**)&b);
112:   VecRestoreArray(xx,&x);
113:   PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
114:   return(0);
115: }

119: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120: {
121:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
122:   PetscErrorCode    ierr;
123:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124:   PetscInt          nz,idx,idt,j,i,oidx;
125:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
126:   const MatScalar   *aa=a->a,*v;
127:   PetscScalar       s1,s2,x1,x2,*x;
128:   const PetscScalar *b;

131:   VecCopy(bb,xx);
132:   VecGetArray(bb,(PetscScalar**)&b);
133:   VecGetArray(xx,&x);

135:   /* forward solve the U^T */
136:   idx = 0;
137:   for (i=0; i<n; i++) {
138:     v     = aa + bs2*diag[i];
139:     /* multiply by the inverse of the block diagonal */
140:     x1 = x[idx];   x2 = x[1+idx];
141:     s1 = v[0]*x1  +  v[1]*x2;
142:     s2 = v[2]*x1  +  v[3]*x2;
143:     v -= bs2;

145:     vi    = aj + diag[i] - 1;
146:     nz    = diag[i] - diag[i+1] - 1;
147:     for(j=0;j>-nz;j--){
148:       oidx = bs*vi[j];
149:       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150:       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151:       v  -= bs2;
152:     }
153:     x[idx]   = s1;x[1+idx] = s2;
154:     idx += bs;
155:   }
156:   /* backward solve the L^T */
157:   for (i=n-1; i>=0; i--){
158:     v    = aa + bs2*ai[i];
159:     vi   = aj + ai[i];
160:     nz   = ai[i+1] - ai[i];
161:     idt  = bs*i;
162:     s1   = x[idt];  s2 = x[1+idt];
163:     for(j=0;j<nz;j++){
164:       idx   = bs*vi[j];
165:       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166:       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167:       v += bs2;
168:     }
169:   }
170:   VecRestoreArray(bb,(PetscScalar**)&b);
171:   VecRestoreArray(xx,&x);
172:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
173:   return(0);
174: }

178: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179: {
180:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
181:   PetscErrorCode    ierr;
182:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183:   PetscInt          i,nz,idx,idt,oidx;
184:   const MatScalar   *aa=a->a,*v;
185:   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
186:   const PetscScalar *b;

189:   VecCopy(bb,xx);
190:   VecGetArray(bb,(PetscScalar**)&b);
191:   VecGetArray(xx,&x);

193:   /* forward solve the U^T */
194:   idx = 0;
195:   for (i=0; i<n; i++) {

197:     v     = aa + 9*diag[i];
198:     /* multiply by the inverse of the block diagonal */
199:     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203:     v += 9;

205:     vi    = aj + diag[i] + 1;
206:     nz    = ai[i+1] - diag[i] - 1;
207:     while (nz--) {
208:       oidx = 3*(*vi++);
209:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210:       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211:       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212:       v  += 9;
213:     }
214:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215:     idx += 3;
216:   }
217:   /* backward solve the L^T */
218:   for (i=n-1; i>=0; i--){
219:     v    = aa + 9*diag[i] - 9;
220:     vi   = aj + diag[i] - 1;
221:     nz   = diag[i] - ai[i];
222:     idt  = 3*i;
223:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224:     while (nz--) {
225:       idx   = 3*(*vi--);
226:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227:       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228:       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229:       v -= 9;
230:     }
231:   }
232:   VecRestoreArray(bb,(PetscScalar**)&b);
233:   VecRestoreArray(xx,&x);
234:   PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
235:   return(0);
236: }

240: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241: {
242:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
243:   PetscErrorCode    ierr;
244:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245:   PetscInt          nz,idx,idt,j,i,oidx;
246:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
247:   const MatScalar   *aa=a->a,*v;
248:   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
249:   const PetscScalar *b;

252:   VecCopy(bb,xx);
253:   VecGetArray(bb,(PetscScalar**)&b);
254:   VecGetArray(xx,&x);

256:   /* forward solve the U^T */
257:   idx = 0;
258:   for (i=0; i<n; i++) {
259:     v     = aa + bs2*diag[i];
260:     /* multiply by the inverse of the block diagonal */
261:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262:     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263:     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264:     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265:     v -= bs2;

267:     vi    = aj + diag[i] - 1;
268:     nz    = diag[i] - diag[i+1] - 1;
269:     for(j=0;j>-nz;j--){
270:       oidx = bs*vi[j];
271:       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272:       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273:       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274:       v  -= bs2;
275:     }
276:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277:     idx += bs;
278:   }
279:   /* backward solve the L^T */
280:   for (i=n-1; i>=0; i--){
281:     v    = aa + bs2*ai[i];
282:     vi   = aj + ai[i];
283:     nz   = ai[i+1] - ai[i];
284:     idt  = bs*i;
285:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286:     for(j=0;j<nz;j++){
287:       idx   = bs*vi[j];
288:       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289:       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290:       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291:       v += bs2;
292:     }
293:   }
294:   VecRestoreArray(bb,(PetscScalar**)&b);
295:   VecRestoreArray(xx,&x);
296:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
297:   return(0);
298: }

302: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303: {
304:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
305:   PetscErrorCode    ierr;
306:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307:   PetscInt          i,nz,idx,idt,oidx;
308:   const MatScalar   *aa=a->a,*v;
309:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
310:   const PetscScalar *b;

313:   VecCopy(bb,xx);
314:   VecGetArray(bb,(PetscScalar**)&b);
315:   VecGetArray(xx,&x);

317:   /* forward solve the U^T */
318:   idx = 0;
319:   for (i=0; i<n; i++) {

321:     v     = aa + 16*diag[i];
322:     /* multiply by the inverse of the block diagonal */
323:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328:     v += 16;

330:     vi    = aj + diag[i] + 1;
331:     nz    = ai[i+1] - diag[i] - 1;
332:     while (nz--) {
333:       oidx = 4*(*vi++);
334:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335:       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336:       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337:       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338:       v  += 16;
339:     }
340:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341:     idx += 4;
342:   }
343:   /* backward solve the L^T */
344:   for (i=n-1; i>=0; i--){
345:     v    = aa + 16*diag[i] - 16;
346:     vi   = aj + diag[i] - 1;
347:     nz   = diag[i] - ai[i];
348:     idt  = 4*i;
349:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350:     while (nz--) {
351:       idx   = 4*(*vi--);
352:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353:       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354:       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355:       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356:       v -= 16;
357:     }
358:   }
359:   VecRestoreArray(bb,(PetscScalar**)&b);
360:   VecRestoreArray(xx,&x);
361:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
362:   return(0);
363: }

367: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368: {
369:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
370:   PetscErrorCode    ierr;
371:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372:   PetscInt          nz,idx,idt,j,i,oidx;
373:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
374:   const MatScalar   *aa=a->a,*v;
375:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
376:   const PetscScalar *b;

379:   VecCopy(bb,xx);
380:   VecGetArray(bb,(PetscScalar**)&b);
381:   VecGetArray(xx,&x);

383:   /* forward solve the U^T */
384:   idx = 0;
385:   for (i=0; i<n; i++) {
386:     v     = aa + bs2*diag[i];
387:     /* multiply by the inverse of the block diagonal */
388:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389:     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390:     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391:     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392:     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393:     v -= bs2;

395:     vi    = aj + diag[i] - 1;
396:     nz    = diag[i] - diag[i+1] - 1;
397:     for(j=0;j>-nz;j--){
398:       oidx = bs*vi[j];
399:       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400:       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401:       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402:       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403:       v  -= bs2;
404:     }
405:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406:     idx += bs;
407:   }
408:   /* backward solve the L^T */
409:   for (i=n-1; i>=0; i--){
410:     v    = aa + bs2*ai[i];
411:     vi   = aj + ai[i];
412:     nz   = ai[i+1] - ai[i];
413:     idt  = bs*i;
414:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415:     for(j=0;j<nz;j++){
416:       idx   = bs*vi[j];
417:       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418:       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419:       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420:       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421:       v += bs2;
422:     }
423:   }
424:   VecRestoreArray(bb,(PetscScalar**)&b);
425:   VecRestoreArray(xx,&x);
426:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
427:   return(0);
428: }

432: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433: {
434:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
435:   PetscErrorCode    ierr;
436:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437:   PetscInt          i,nz,idx,idt,oidx;
438:   const MatScalar   *aa=a->a,*v;
439:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440:   const PetscScalar *b;

443:   VecCopy(bb,xx);
444:   VecGetArray(bb,(PetscScalar**)&b);
445:   VecGetArray(xx,&x);

447:   /* forward solve the U^T */
448:   idx = 0;
449:   for (i=0; i<n; i++) {

451:     v     = aa + 25*diag[i];
452:     /* multiply by the inverse of the block diagonal */
453:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459:     v += 25;

461:     vi    = aj + diag[i] + 1;
462:     nz    = ai[i+1] - diag[i] - 1;
463:     while (nz--) {
464:       oidx = 5*(*vi++);
465:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466:       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467:       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468:       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469:       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470:       v  += 25;
471:     }
472:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473:     idx += 5;
474:   }
475:   /* backward solve the L^T */
476:   for (i=n-1; i>=0; i--){
477:     v    = aa + 25*diag[i] - 25;
478:     vi   = aj + diag[i] - 1;
479:     nz   = diag[i] - ai[i];
480:     idt  = 5*i;
481:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482:     while (nz--) {
483:       idx   = 5*(*vi--);
484:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485:       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486:       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487:       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488:       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489:       v -= 25;
490:     }
491:   }
492:   VecRestoreArray(bb,(PetscScalar**)&b);
493:   VecRestoreArray(xx,&x);
494:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
495:   return(0);
496: }

500: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501: {
502:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
504:   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505:   PetscInt       nz,idx,idt,j,i,oidx;
506:   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507:   const MatScalar      *aa=a->a,*v;
508:   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509:   const PetscScalar    *b;

512:   VecCopy(bb,xx);
513:   VecGetArray(bb,(PetscScalar**)&b);
514:   VecGetArray(xx,&x);

516:   /* forward solve the U^T */
517:   idx = 0;
518:   for (i=0; i<n; i++) {
519:     v     = aa + bs2*diag[i];
520:     /* multiply by the inverse of the block diagonal */
521:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522:     x5 = x[4+idx];
523:     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524:     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525:     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526:     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527:     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528:     v -= bs2;

530:     vi    = aj + diag[i] - 1;
531:     nz    = diag[i] - diag[i+1] - 1;
532:     for(j=0;j>-nz;j--){
533:       oidx = bs*vi[j];
534:       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535:       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536:       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537:       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538:       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539:       v  -= bs2;
540:     }
541:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542:     idx += bs;
543:   }
544:   /* backward solve the L^T */
545:   for (i=n-1; i>=0; i--){
546:     v    = aa + bs2*ai[i];
547:     vi   = aj + ai[i];
548:     nz   = ai[i+1] - ai[i];
549:     idt  = bs*i;
550:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551:     for(j=0;j<nz;j++){
552:       idx   = bs*vi[j];
553:       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554:       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555:       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556:       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557:       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558:       v += bs2;
559:     }
560:   }
561:   VecRestoreArray(bb,(PetscScalar**)&b);
562:   VecRestoreArray(xx,&x);
563:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
564:   return(0);
565: }

569: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570: {
571:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
572:   PetscErrorCode    ierr;
573:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574:   PetscInt          i,nz,idx,idt,oidx;
575:   const MatScalar   *aa=a->a,*v;
576:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577:   const PetscScalar *b;

580:   VecCopy(bb,xx);
581:   VecGetArray(bb,(PetscScalar**)&b);
582:   VecGetArray(xx,&x);

584:   /* forward solve the U^T */
585:   idx = 0;
586:   for (i=0; i<n; i++) {

588:     v     = aa + 36*diag[i];
589:     /* multiply by the inverse of the block diagonal */
590:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591:     x6    = x[5+idx];
592:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598:     v += 36;

600:     vi    = aj + diag[i] + 1;
601:     nz    = ai[i+1] - diag[i] - 1;
602:     while (nz--) {
603:       oidx = 6*(*vi++);
604:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610:       v  += 36;
611:     }
612:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613:     x[5+idx] = s6;
614:     idx += 6;
615:   }
616:   /* backward solve the L^T */
617:   for (i=n-1; i>=0; i--){
618:     v    = aa + 36*diag[i] - 36;
619:     vi   = aj + diag[i] - 1;
620:     nz   = diag[i] - ai[i];
621:     idt  = 6*i;
622:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623:     s6 = x[5+idt];
624:     while (nz--) {
625:       idx   = 6*(*vi--);
626:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627:       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632:       v -= 36;
633:     }
634:   }
635:   VecRestoreArray(bb,(PetscScalar**)&b);
636:   VecRestoreArray(xx,&x);
637:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
638:   return(0);
639: }

643: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644: {
645:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
646:   PetscErrorCode    ierr;
647:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648:   PetscInt          nz,idx,idt,j,i,oidx;
649:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
650:   const MatScalar   *aa=a->a,*v;
651:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652:   const PetscScalar *b;

655:   VecCopy(bb,xx);
656:   VecGetArray(bb,(PetscScalar**)&b);
657:   VecGetArray(xx,&x);

659:   /* forward solve the U^T */
660:   idx = 0;
661:   for (i=0; i<n; i++) {
662:     v     = aa + bs2*diag[i];
663:     /* multiply by the inverse of the block diagonal */
664:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665:     x5 = x[4+idx]; x6 = x[5+idx];
666:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672:     v -= bs2;

674:     vi    = aj + diag[i] - 1;
675:     nz    = diag[i] - diag[i+1] - 1;
676:     for(j=0;j>-nz;j--){
677:       oidx = bs*vi[j];
678:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679:       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680:       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681:       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682:       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683:       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684:       v  -= bs2;
685:     }
686:     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687:     x[5+idx] = s6;
688:     idx += bs;
689:   }
690:   /* backward solve the L^T */
691:   for (i=n-1; i>=0; i--){
692:     v    = aa + bs2*ai[i];
693:     vi   = aj + ai[i];
694:     nz   = ai[i+1] - ai[i];
695:     idt  = bs*i;
696:     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697:     s6   = x[5+idt];
698:     for(j=0;j<nz;j++){
699:       idx   = bs*vi[j];
700:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701:       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702:       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703:       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704:       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705:       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706:       v += bs2;
707:     }
708:   }
709:   VecRestoreArray(bb,(PetscScalar**)&b);
710:   VecRestoreArray(xx,&x);
711:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
712:   return(0);
713: }

717: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718: {
719:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
720:   PetscErrorCode    ierr;
721:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722:   PetscInt          i,nz,idx,idt,oidx;
723:   const MatScalar   *aa=a->a,*v;
724:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725:   const PetscScalar *b;

728:   VecCopy(bb,xx);
729:   VecGetArray(bb,(PetscScalar**)&b);
730:   VecGetArray(xx,&x);

732:   /* forward solve the U^T */
733:   idx = 0;
734:   for (i=0; i<n; i++) {

736:     v     = aa + 49*diag[i];
737:     /* multiply by the inverse of the block diagonal */
738:     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739:     x6    = x[5+idx]; x7 = x[6+idx];
740:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747:     v += 49;

749:     vi    = aj + diag[i] + 1;
750:     nz    = ai[i+1] - diag[i] - 1;
751:     while (nz--) {
752:       oidx = 7*(*vi++);
753:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760:       v  += 49;
761:     }
762:     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763:     x[5+idx] = s6;x[6+idx] = s7;
764:     idx += 7;
765:   }
766:   /* backward solve the L^T */
767:   for (i=n-1; i>=0; i--){
768:     v    = aa + 49*diag[i] - 49;
769:     vi   = aj + diag[i] - 1;
770:     nz   = diag[i] - ai[i];
771:     idt  = 7*i;
772:     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773:     s6 = x[5+idt];s7 = x[6+idt];
774:     while (nz--) {
775:       idx   = 7*(*vi--);
776:       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777:       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783:       v -= 49;
784:     }
785:   }
786:   VecRestoreArray(bb,(PetscScalar**)&b);
787:   VecRestoreArray(xx,&x);
788:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
789:   return(0);
790: }
793: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794: {
795:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
796:   PetscErrorCode    ierr;
797:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798:   PetscInt          nz,idx,idt,j,i,oidx;
799:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
800:   const MatScalar   *aa=a->a,*v;
801:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802:   const PetscScalar *b;

805:   VecCopy(bb,xx);
806:   VecGetArray(bb,(PetscScalar**)&b);
807:   VecGetArray(xx,&x);

809:   /* forward solve the U^T */
810:   idx = 0;
811:   for (i=0; i<n; i++) {
812:     v     = aa + bs2*diag[i];
813:     /* multiply by the inverse of the block diagonal */
814:     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815:     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823:     v -= bs2;
824:     vi    = aj + diag[i] - 1;
825:     nz    = diag[i] - diag[i+1] - 1;
826:     for(j=0;j>-nz;j--){
827:       oidx = bs*vi[j];
828:       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829:       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830:       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831:       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832:       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833:       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834:       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835:       v  -= bs2;
836:     }
837:     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838:     x[5+idx] = s6;  x[6+idx] = s7;
839:     idx += bs;
840:   }
841:   /* backward solve the L^T */
842:   for (i=n-1; i>=0; i--){
843:     v    = aa + bs2*ai[i];
844:     vi   = aj + ai[i];
845:     nz   = ai[i+1] - ai[i];
846:     idt  = bs*i;
847:     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848:     s6   = x[5+idt];  s7 = x[6+idt];
849:     for(j=0;j<nz;j++){
850:       idx   = bs*vi[j];
851:       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852:       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853:       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854:       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855:       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856:       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857:       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858:       v += bs2;
859:     }
860:   }
861:   VecRestoreArray(bb,(PetscScalar**)&b);
862:   VecRestoreArray(xx,&x);
863:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
864:   return(0);
865: }

867: /*---------------------------------------------------------------------------------------------*/
870: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871: {
872:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
873:   IS                iscol=a->col,isrow=a->row;
874:   PetscErrorCode    ierr;
875:   const PetscInt    *r,*c,*rout,*cout;
876:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877:   PetscInt          i,nz;
878:   const MatScalar   *aa=a->a,*v;
879:   PetscScalar       s1,*x,*t;
880:   const PetscScalar *b;

883:   VecGetArray(bb,(PetscScalar**)&b);
884:   VecGetArray(xx,&x);
885:   t  = a->solve_work;

887:   ISGetIndices(isrow,&rout); r = rout;
888:   ISGetIndices(iscol,&cout); c = cout;

890:   /* copy the b into temp work space according to permutation */
891:   for (i=0; i<n; i++) {
892:     t[i] = b[c[i]];
893:   }

895:   /* forward solve the U^T */
896:   for (i=0; i<n; i++) {

898:     v     = aa + diag[i];
899:     /* multiply by the inverse of the block diagonal */
900:     s1    = (*v++)*t[i];
901:     vi    = aj + diag[i] + 1;
902:     nz    = ai[i+1] - diag[i] - 1;
903:     while (nz--) {
904:       t[*vi++]  -= (*v++)*s1;
905:     }
906:     t[i]   = s1;
907:   }
908:   /* backward solve the L^T */
909:   for (i=n-1; i>=0; i--){
910:     v    = aa + diag[i] - 1;
911:     vi   = aj + diag[i] - 1;
912:     nz   = diag[i] - ai[i];
913:     s1   = t[i];
914:     while (nz--) {
915:       t[*vi--]   -=  (*v--)*s1;
916:     }
917:   }

919:   /* copy t into x according to permutation */
920:   for (i=0; i<n; i++) {
921:     x[r[i]]   = t[i];
922:   }

924:   ISRestoreIndices(isrow,&rout);
925:   ISRestoreIndices(iscol,&cout);
926:   VecRestoreArray(bb,(PetscScalar**)&b);
927:   VecRestoreArray(xx,&x);
928:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
929:   return(0);
930: }

934: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935: {
936:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
937:   IS                iscol=a->col,isrow=a->row;
938:   PetscErrorCode    ierr;
939:   const PetscInt    *r,*c,*rout,*cout;
940:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
942:   const MatScalar   *aa=a->a,*v;
943:   PetscScalar       s1,s2,x1,x2,*x,*t;
944:   const PetscScalar *b;

947:   VecGetArray(bb,(PetscScalar**)&b);
948:   VecGetArray(xx,&x);
949:   t  = a->solve_work;

951:   ISGetIndices(isrow,&rout); r = rout;
952:   ISGetIndices(iscol,&cout); c = cout;

954:   /* copy the b into temp work space according to permutation */
955:   ii = 0;
956:   for (i=0; i<n; i++) {
957:     ic      = 2*c[i];
958:     t[ii]   = b[ic];
959:     t[ii+1] = b[ic+1];
960:     ii += 2;
961:   }

963:   /* forward solve the U^T */
964:   idx = 0;
965:   for (i=0; i<n; i++) {

967:     v     = aa + 4*diag[i];
968:     /* multiply by the inverse of the block diagonal */
969:     x1    = t[idx];   x2 = t[1+idx];
970:     s1 = v[0]*x1  +  v[1]*x2;
971:     s2 = v[2]*x1  +  v[3]*x2;
972:     v += 4;

974:     vi    = aj + diag[i] + 1;
975:     nz    = ai[i+1] - diag[i] - 1;
976:     while (nz--) {
977:       oidx = 2*(*vi++);
978:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
979:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
980:       v  += 4;
981:     }
982:     t[idx]   = s1;t[1+idx] = s2;
983:     idx += 2;
984:   }
985:   /* backward solve the L^T */
986:   for (i=n-1; i>=0; i--){
987:     v    = aa + 4*diag[i] - 4;
988:     vi   = aj + diag[i] - 1;
989:     nz   = diag[i] - ai[i];
990:     idt  = 2*i;
991:     s1 = t[idt];  s2 = t[1+idt];
992:     while (nz--) {
993:       idx   = 2*(*vi--);
994:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
995:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
996:       v -= 4;
997:     }
998:   }

1000:   /* copy t into x according to permutation */
1001:   ii = 0;
1002:   for (i=0; i<n; i++) {
1003:     ir      = 2*r[i];
1004:     x[ir]   = t[ii];
1005:     x[ir+1] = t[ii+1];
1006:     ii += 2;
1007:   }

1009:   ISRestoreIndices(isrow,&rout);
1010:   ISRestoreIndices(iscol,&cout);
1011:   VecRestoreArray(bb,(PetscScalar**)&b);
1012:   VecRestoreArray(xx,&x);
1013:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1014:   return(0);
1015: }

1019: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1020: {
1021:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1022:   PetscErrorCode    ierr;
1023:   IS                iscol=a->col,isrow=a->row;
1024:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1025:   const PetscInt    *r,*c,*rout,*cout;
1026:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1027:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1028:   const MatScalar   *aa=a->a,*v;
1029:   PetscScalar       s1,s2,x1,x2,*x,*t;
1030:   const PetscScalar *b;

1033:   VecGetArray(bb,(PetscScalar**)&b);
1034:   VecGetArray(xx,&x);
1035:   t = a->solve_work;

1037:   ISGetIndices(isrow,&rout); r = rout;
1038:   ISGetIndices(iscol,&cout); c = cout;

1040:   /* copy b into temp work space according to permutation */
1041:   for(i=0;i<n;i++){
1042:     ii = bs*i; ic = bs*c[i];
1043:     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1044:   }

1046:   /* forward solve the U^T */
1047:   idx = 0;
1048:   for (i=0; i<n; i++) {
1049:     v     = aa + bs2*diag[i];
1050:     /* multiply by the inverse of the block diagonal */
1051:     x1 = t[idx];   x2 = t[1+idx];
1052:     s1 = v[0]*x1  +  v[1]*x2;
1053:     s2 = v[2]*x1  +  v[3]*x2;
1054:     v -= bs2;

1056:     vi    = aj + diag[i] - 1;
1057:     nz    = diag[i] - diag[i+1] - 1;
1058:     for(j=0;j>-nz;j--){
1059:       oidx = bs*vi[j];
1060:       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1061:       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1062:       v  -= bs2;
1063:     }
1064:     t[idx]   = s1;t[1+idx] = s2;
1065:     idx += bs;
1066:   }
1067:   /* backward solve the L^T */
1068:   for (i=n-1; i>=0; i--){
1069:     v    = aa + bs2*ai[i];
1070:     vi   = aj + ai[i];
1071:     nz   = ai[i+1] - ai[i];
1072:     idt  = bs*i;
1073:     s1   = t[idt];  s2 = t[1+idt];
1074:     for(j=0;j<nz;j++){
1075:       idx   = bs*vi[j];
1076:       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1077:       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1078:       v += bs2;
1079:     }
1080:   }

1082:   /* copy t into x according to permutation */
1083:   for(i=0;i<n;i++){
1084:     ii = bs*i;  ir = bs*r[i];
1085:     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1086:   }

1088:   ISRestoreIndices(isrow,&rout);
1089:   ISRestoreIndices(iscol,&cout);
1090:   VecRestoreArray(bb,(PetscScalar**)&b);
1091:   VecRestoreArray(xx,&x);
1092:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1093:   return(0);
1094: }

1098: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099: {
1100:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1101:   IS                iscol=a->col,isrow=a->row;
1102:   PetscErrorCode    ierr;
1103:   const PetscInt    *r,*c,*rout,*cout;
1104:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1106:   const MatScalar   *aa=a->a,*v;
1107:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1108:   const PetscScalar *b;

1111:   VecGetArray(bb,(PetscScalar**)&b);
1112:   VecGetArray(xx,&x);
1113:   t  = a->solve_work;

1115:   ISGetIndices(isrow,&rout); r = rout;
1116:   ISGetIndices(iscol,&cout); c = cout;

1118:   /* copy the b into temp work space according to permutation */
1119:   ii = 0;
1120:   for (i=0; i<n; i++) {
1121:     ic      = 3*c[i];
1122:     t[ii]   = b[ic];
1123:     t[ii+1] = b[ic+1];
1124:     t[ii+2] = b[ic+2];
1125:     ii += 3;
1126:   }

1128:   /* forward solve the U^T */
1129:   idx = 0;
1130:   for (i=0; i<n; i++) {

1132:     v     = aa + 9*diag[i];
1133:     /* multiply by the inverse of the block diagonal */
1134:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1135:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1136:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1137:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1138:     v += 9;

1140:     vi    = aj + diag[i] + 1;
1141:     nz    = ai[i+1] - diag[i] - 1;
1142:     while (nz--) {
1143:       oidx = 3*(*vi++);
1144:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1145:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1146:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147:       v  += 9;
1148:     }
1149:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1150:     idx += 3;
1151:   }
1152:   /* backward solve the L^T */
1153:   for (i=n-1; i>=0; i--){
1154:     v    = aa + 9*diag[i] - 9;
1155:     vi   = aj + diag[i] - 1;
1156:     nz   = diag[i] - ai[i];
1157:     idt  = 3*i;
1158:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1159:     while (nz--) {
1160:       idx   = 3*(*vi--);
1161:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1162:       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1163:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164:       v -= 9;
1165:     }
1166:   }

1168:   /* copy t into x according to permutation */
1169:   ii = 0;
1170:   for (i=0; i<n; i++) {
1171:     ir      = 3*r[i];
1172:     x[ir]   = t[ii];
1173:     x[ir+1] = t[ii+1];
1174:     x[ir+2] = t[ii+2];
1175:     ii += 3;
1176:   }

1178:   ISRestoreIndices(isrow,&rout);
1179:   ISRestoreIndices(iscol,&cout);
1180:   VecRestoreArray(bb,(PetscScalar**)&b);
1181:   VecRestoreArray(xx,&x);
1182:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1183:   return(0);
1184: }

1188: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1189: {
1190:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1191:   PetscErrorCode    ierr;
1192:   IS                iscol=a->col,isrow=a->row;
1193:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1194:   const PetscInt    *r,*c,*rout,*cout;
1195:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1196:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1197:   const MatScalar   *aa=a->a,*v;
1198:   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1199:   const PetscScalar *b;

1202:   VecGetArray(bb,(PetscScalar**)&b);
1203:   VecGetArray(xx,&x);
1204:   t = a->solve_work;

1206:   ISGetIndices(isrow,&rout); r = rout;
1207:   ISGetIndices(iscol,&cout); c = cout;

1209:   /* copy b into temp work space according to permutation */
1210:   for(i=0;i<n;i++){
1211:     ii = bs*i; ic = bs*c[i];
1212:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1213:   }

1215:   /* forward solve the U^T */
1216:   idx = 0;
1217:   for (i=0; i<n; i++) {
1218:     v     = aa + bs2*diag[i];
1219:     /* multiply by the inverse of the block diagonal */
1220:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1221:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1222:     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1223:     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1224:     v -= bs2;

1226:     vi    = aj + diag[i] - 1;
1227:     nz    = diag[i] - diag[i+1] - 1;
1228:     for(j=0;j>-nz;j--){
1229:       oidx = bs*vi[j];
1230:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1231:       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1232:       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233:       v  -= bs2;
1234:     }
1235:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1236:     idx += bs;
1237:   }
1238:   /* backward solve the L^T */
1239:   for (i=n-1; i>=0; i--){
1240:     v    = aa + bs2*ai[i];
1241:     vi   = aj + ai[i];
1242:     nz   = ai[i+1] - ai[i];
1243:     idt  = bs*i;
1244:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1245:     for(j=0;j<nz;j++){
1246:       idx   = bs*vi[j];
1247:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1248:       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1249:       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1250:       v += bs2;
1251:     }
1252:   }

1254:   /* copy t into x according to permutation */
1255:   for(i=0;i<n;i++){
1256:     ii = bs*i;  ir = bs*r[i];
1257:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1258:   }

1260:   ISRestoreIndices(isrow,&rout);
1261:   ISRestoreIndices(iscol,&cout);
1262:   VecRestoreArray(bb,(PetscScalar**)&b);
1263:   VecRestoreArray(xx,&x);
1264:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1265:   return(0);
1266: }

1270: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271: {
1272:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1273:   IS                iscol=a->col,isrow=a->row;
1274:   PetscErrorCode    ierr;
1275:   const PetscInt    *r,*c,*rout,*cout;
1276:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1278:   const MatScalar   *aa=a->a,*v;
1279:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280:   const PetscScalar *b;

1283:   VecGetArray(bb,(PetscScalar**)&b);
1284:   VecGetArray(xx,&x);
1285:   t  = a->solve_work;

1287:   ISGetIndices(isrow,&rout); r = rout;
1288:   ISGetIndices(iscol,&cout); c = cout;

1290:   /* copy the b into temp work space according to permutation */
1291:   ii = 0;
1292:   for (i=0; i<n; i++) {
1293:     ic      = 4*c[i];
1294:     t[ii]   = b[ic];
1295:     t[ii+1] = b[ic+1];
1296:     t[ii+2] = b[ic+2];
1297:     t[ii+3] = b[ic+3];
1298:     ii += 4;
1299:   }

1301:   /* forward solve the U^T */
1302:   idx = 0;
1303:   for (i=0; i<n; i++) {

1305:     v     = aa + 16*diag[i];
1306:     /* multiply by the inverse of the block diagonal */
1307:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1308:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1309:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1310:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1311:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312:     v += 16;

1314:     vi    = aj + diag[i] + 1;
1315:     nz    = ai[i+1] - diag[i] - 1;
1316:     while (nz--) {
1317:       oidx = 4*(*vi++);
1318:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1319:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1320:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322:       v  += 16;
1323:     }
1324:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325:     idx += 4;
1326:   }
1327:   /* backward solve the L^T */
1328:   for (i=n-1; i>=0; i--){
1329:     v    = aa + 16*diag[i] - 16;
1330:     vi   = aj + diag[i] - 1;
1331:     nz   = diag[i] - ai[i];
1332:     idt  = 4*i;
1333:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334:     while (nz--) {
1335:       idx   = 4*(*vi--);
1336:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1337:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1338:       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339:       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340:       v -= 16;
1341:     }
1342:   }

1344:   /* copy t into x according to permutation */
1345:   ii = 0;
1346:   for (i=0; i<n; i++) {
1347:     ir      = 4*r[i];
1348:     x[ir]   = t[ii];
1349:     x[ir+1] = t[ii+1];
1350:     x[ir+2] = t[ii+2];
1351:     x[ir+3] = t[ii+3];
1352:     ii += 4;
1353:   }

1355:   ISRestoreIndices(isrow,&rout);
1356:   ISRestoreIndices(iscol,&cout);
1357:   VecRestoreArray(bb,(PetscScalar**)&b);
1358:   VecRestoreArray(xx,&x);
1359:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1360:   return(0);
1361: }

1365: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1366: {
1367:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1368:   PetscErrorCode    ierr;
1369:   IS                iscol=a->col,isrow=a->row;
1370:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1371:   const PetscInt    *r,*c,*rout,*cout;
1372:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1373:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1374:   const MatScalar   *aa=a->a,*v;
1375:   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376:   const PetscScalar *b;

1379:   VecGetArray(bb,(PetscScalar**)&b);
1380:   VecGetArray(xx,&x);
1381:   t = a->solve_work;

1383:   ISGetIndices(isrow,&rout); r = rout;
1384:   ISGetIndices(iscol,&cout); c = cout;

1386:   /* copy b into temp work space according to permutation */
1387:   for(i=0;i<n;i++){
1388:     ii = bs*i; ic = bs*c[i];
1389:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1390:   }

1392:   /* forward solve the U^T */
1393:   idx = 0;
1394:   for (i=0; i<n; i++) {
1395:     v     = aa + bs2*diag[i];
1396:     /* multiply by the inverse of the block diagonal */
1397:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1398:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1399:     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1400:     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1401:     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1402:     v -= bs2;

1404:     vi    = aj + diag[i] - 1;
1405:     nz    = diag[i] - diag[i+1] - 1;
1406:     for(j=0;j>-nz;j--){
1407:       oidx = bs*vi[j];
1408:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1409:       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1410:       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1411:       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1412:       v  -= bs2;
1413:     }
1414:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1415:     idx += bs;
1416:   }
1417:   /* backward solve the L^T */
1418:   for (i=n-1; i>=0; i--){
1419:     v    = aa + bs2*ai[i];
1420:     vi   = aj + ai[i];
1421:     nz   = ai[i+1] - ai[i];
1422:     idt  = bs*i;
1423:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1424:     for(j=0;j<nz;j++){
1425:       idx   = bs*vi[j];
1426:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1427:       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1428:       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1429:       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1430:       v += bs2;
1431:     }
1432:   }

1434:   /* copy t into x according to permutation */
1435:   for(i=0;i<n;i++){
1436:     ii = bs*i;  ir = bs*r[i];
1437:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1438:   }

1440:   ISRestoreIndices(isrow,&rout);
1441:   ISRestoreIndices(iscol,&cout);
1442:   VecRestoreArray(bb,(PetscScalar**)&b);
1443:   VecRestoreArray(xx,&x);
1444:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1445:   return(0);
1446: }

1450: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451: {
1452:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1453:   IS                iscol=a->col,isrow=a->row;
1454:   PetscErrorCode    ierr;
1455:   const PetscInt    *r,*c,*rout,*cout;
1456:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1458:   const MatScalar   *aa=a->a,*v;
1459:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460:   const PetscScalar *b;

1463:   VecGetArray(bb,(PetscScalar**)&b);
1464:   VecGetArray(xx,&x);
1465:   t  = a->solve_work;

1467:   ISGetIndices(isrow,&rout); r = rout;
1468:   ISGetIndices(iscol,&cout); c = cout;

1470:   /* copy the b into temp work space according to permutation */
1471:   ii = 0;
1472:   for (i=0; i<n; i++) {
1473:     ic      = 5*c[i];
1474:     t[ii]   = b[ic];
1475:     t[ii+1] = b[ic+1];
1476:     t[ii+2] = b[ic+2];
1477:     t[ii+3] = b[ic+3];
1478:     t[ii+4] = b[ic+4];
1479:     ii += 5;
1480:   }

1482:   /* forward solve the U^T */
1483:   idx = 0;
1484:   for (i=0; i<n; i++) {

1486:     v     = aa + 25*diag[i];
1487:     /* multiply by the inverse of the block diagonal */
1488:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1490:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1491:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494:     v += 25;

1496:     vi    = aj + diag[i] + 1;
1497:     nz    = ai[i+1] - diag[i] - 1;
1498:     while (nz--) {
1499:       oidx = 5*(*vi++);
1500:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1501:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1502:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505:       v  += 25;
1506:     }
1507:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508:     idx += 5;
1509:   }
1510:   /* backward solve the L^T */
1511:   for (i=n-1; i>=0; i--){
1512:     v    = aa + 25*diag[i] - 25;
1513:     vi   = aj + diag[i] - 1;
1514:     nz   = diag[i] - ai[i];
1515:     idt  = 5*i;
1516:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517:     while (nz--) {
1518:       idx   = 5*(*vi--);
1519:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1520:       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1521:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524:       v -= 25;
1525:     }
1526:   }

1528:   /* copy t into x according to permutation */
1529:   ii = 0;
1530:   for (i=0; i<n; i++) {
1531:     ir      = 5*r[i];
1532:     x[ir]   = t[ii];
1533:     x[ir+1] = t[ii+1];
1534:     x[ir+2] = t[ii+2];
1535:     x[ir+3] = t[ii+3];
1536:     x[ir+4] = t[ii+4];
1537:     ii += 5;
1538:   }

1540:   ISRestoreIndices(isrow,&rout);
1541:   ISRestoreIndices(iscol,&cout);
1542:   VecRestoreArray(bb,(PetscScalar**)&b);
1543:   VecRestoreArray(xx,&x);
1544:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1545:   return(0);
1546: }

1550: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551: {
1552:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1553:   PetscErrorCode    ierr;
1554:   IS                iscol=a->col,isrow=a->row;
1555:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1556:   const PetscInt    *r,*c,*rout,*cout;
1557:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1558:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1559:   const MatScalar   *aa=a->a,*v;
1560:   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561:   const PetscScalar *b;

1564:   VecGetArray(bb,(PetscScalar**)&b);
1565:   VecGetArray(xx,&x);
1566:   t = a->solve_work;

1568:   ISGetIndices(isrow,&rout); r = rout;
1569:   ISGetIndices(iscol,&cout); c = cout;

1571:   /* copy b into temp work space according to permutation */
1572:   for(i=0;i<n;i++){
1573:     ii = bs*i; ic = bs*c[i];
1574:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1575:     t[ii+4] = b[ic+4];
1576:   }

1578:   /* forward solve the U^T */
1579:   idx = 0;
1580:   for (i=0; i<n; i++) {
1581:     v     = aa + bs2*diag[i];
1582:     /* multiply by the inverse of the block diagonal */
1583:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1585:     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1586:     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1587:     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1588:     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1589:     v -= bs2;

1591:     vi    = aj + diag[i] - 1;
1592:     nz    = diag[i] - diag[i+1] - 1;
1593:     for(j=0;j>-nz;j--){
1594:       oidx = bs*vi[j];
1595:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1596:       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1597:       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1598:       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1599:       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1600:       v  -= bs2;
1601:     }
1602:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1603:     idx += bs;
1604:   }
1605:   /* backward solve the L^T */
1606:   for (i=n-1; i>=0; i--){
1607:     v    = aa + bs2*ai[i];
1608:     vi   = aj + ai[i];
1609:     nz   = ai[i+1] - ai[i];
1610:     idt  = bs*i;
1611:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1612:     for(j=0;j<nz;j++){
1613:       idx   = bs*vi[j];
1614:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1615:       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1616:       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1617:       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1618:       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1619:       v += bs2;
1620:     }
1621:   }

1623:   /* copy t into x according to permutation */
1624:   for(i=0;i<n;i++){
1625:     ii = bs*i;  ir = bs*r[i];
1626:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1627:     x[ir+4] = t[ii+4];
1628:   }

1630:   ISRestoreIndices(isrow,&rout);
1631:   ISRestoreIndices(iscol,&cout);
1632:   VecRestoreArray(bb,(PetscScalar**)&b);
1633:   VecRestoreArray(xx,&x);
1634:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1635:   return(0);
1636: }

1640: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641: {
1642:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1643:   IS                iscol=a->col,isrow=a->row;
1644:   PetscErrorCode    ierr;
1645:   const PetscInt    *r,*c,*rout,*cout;
1646:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1648:   const MatScalar   *aa=a->a,*v;
1649:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650:   const PetscScalar *b;

1653:   VecGetArray(bb,(PetscScalar**)&b);
1654:   VecGetArray(xx,&x);
1655:   t  = a->solve_work;

1657:   ISGetIndices(isrow,&rout); r = rout;
1658:   ISGetIndices(iscol,&cout); c = cout;

1660:   /* copy the b into temp work space according to permutation */
1661:   ii = 0;
1662:   for (i=0; i<n; i++) {
1663:     ic      = 6*c[i];
1664:     t[ii]   = b[ic];
1665:     t[ii+1] = b[ic+1];
1666:     t[ii+2] = b[ic+2];
1667:     t[ii+3] = b[ic+3];
1668:     t[ii+4] = b[ic+4];
1669:     t[ii+5] = b[ic+5];
1670:     ii += 6;
1671:   }

1673:   /* forward solve the U^T */
1674:   idx = 0;
1675:   for (i=0; i<n; i++) {

1677:     v     = aa + 36*diag[i];
1678:     /* multiply by the inverse of the block diagonal */
1679:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680:     x6    = t[5+idx];
1681:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1682:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1683:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687:     v += 36;

1689:     vi    = aj + diag[i] + 1;
1690:     nz    = ai[i+1] - diag[i] - 1;
1691:     while (nz--) {
1692:       oidx = 6*(*vi++);
1693:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1694:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1695:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699:       v  += 36;
1700:     }
1701:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702:     t[5+idx] = s6;
1703:     idx += 6;
1704:   }
1705:   /* backward solve the L^T */
1706:   for (i=n-1; i>=0; i--){
1707:     v    = aa + 36*diag[i] - 36;
1708:     vi   = aj + diag[i] - 1;
1709:     nz   = diag[i] - ai[i];
1710:     idt  = 6*i;
1711:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712:     s6 = t[5+idt];
1713:     while (nz--) {
1714:       idx   = 6*(*vi--);
1715:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1716:       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1717:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721:       v -= 36;
1722:     }
1723:   }

1725:   /* copy t into x according to permutation */
1726:   ii = 0;
1727:   for (i=0; i<n; i++) {
1728:     ir      = 6*r[i];
1729:     x[ir]   = t[ii];
1730:     x[ir+1] = t[ii+1];
1731:     x[ir+2] = t[ii+2];
1732:     x[ir+3] = t[ii+3];
1733:     x[ir+4] = t[ii+4];
1734:     x[ir+5] = t[ii+5];
1735:     ii += 6;
1736:   }

1738:   ISRestoreIndices(isrow,&rout);
1739:   ISRestoreIndices(iscol,&cout);
1740:   VecRestoreArray(bb,(PetscScalar**)&b);
1741:   VecRestoreArray(xx,&x);
1742:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
1743:   return(0);
1744: }

1748: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1749: {
1750:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751:   PetscErrorCode    ierr;
1752:   IS                iscol=a->col,isrow=a->row;
1753:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1754:   const PetscInt    *r,*c,*rout,*cout;
1755:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1756:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1757:   const MatScalar   *aa=a->a,*v;
1758:   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759:   const PetscScalar *b;

1762:   VecGetArray(bb,(PetscScalar**)&b);
1763:   VecGetArray(xx,&x);
1764:   t = a->solve_work;

1766:   ISGetIndices(isrow,&rout); r = rout;
1767:   ISGetIndices(iscol,&cout); c = cout;

1769:   /* copy b into temp work space according to permutation */
1770:   for(i=0;i<n;i++){
1771:     ii = bs*i; ic = bs*c[i];
1772:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1773:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1774:   }

1776:   /* forward solve the U^T */
1777:   idx = 0;
1778:   for (i=0; i<n; i++) {
1779:     v     = aa + bs2*diag[i];
1780:     /* multiply by the inverse of the block diagonal */
1781:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1782:     x6    = t[5+idx];
1783:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1784:     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1785:     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1786:     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1787:     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1788:     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1789:     v -= bs2;

1791:     vi    = aj + diag[i] - 1;
1792:     nz    = diag[i] - diag[i+1] - 1;
1793:     for(j=0;j>-nz;j--){
1794:       oidx = bs*vi[j];
1795:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1796:       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1797:       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1798:       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1799:       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1800:       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1801:       v  -= bs2;
1802:     }
1803:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1804:     t[5+idx] = s6;
1805:     idx += bs;
1806:   }
1807:   /* backward solve the L^T */
1808:   for (i=n-1; i>=0; i--){
1809:     v    = aa + bs2*ai[i];
1810:     vi   = aj + ai[i];
1811:     nz   = ai[i+1] - ai[i];
1812:     idt  = bs*i;
1813:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1814:     s6   = t[5+idt];
1815:    for(j=0;j<nz;j++){
1816:       idx   = bs*vi[j];
1817:       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1818:       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1819:       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1820:       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1821:       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1822:       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1823:       v += bs2;
1824:     }
1825:   }

1827:   /* copy t into x according to permutation */
1828:   for(i=0;i<n;i++){
1829:     ii = bs*i;  ir = bs*r[i];
1830:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1831:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1832:   }

1834:   ISRestoreIndices(isrow,&rout);
1835:   ISRestoreIndices(iscol,&cout);
1836:   VecRestoreArray(bb,(PetscScalar**)&b);
1837:   VecRestoreArray(xx,&x);
1838:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1839:   return(0);
1840: }

1844: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845: {
1846:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1847:   IS                iscol=a->col,isrow=a->row;
1848:   PetscErrorCode    ierr;
1849:   const PetscInt    *r,*c,*rout,*cout;
1850:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851:   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1852:   const MatScalar   *aa=a->a,*v;
1853:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854:   const PetscScalar *b;

1857:   VecGetArray(bb,(PetscScalar**)&b);
1858:   VecGetArray(xx,&x);
1859:   t  = a->solve_work;

1861:   ISGetIndices(isrow,&rout); r = rout;
1862:   ISGetIndices(iscol,&cout); c = cout;

1864:   /* copy the b into temp work space according to permutation */
1865:   ii = 0;
1866:   for (i=0; i<n; i++) {
1867:     ic      = 7*c[i];
1868:     t[ii]   = b[ic];
1869:     t[ii+1] = b[ic+1];
1870:     t[ii+2] = b[ic+2];
1871:     t[ii+3] = b[ic+3];
1872:     t[ii+4] = b[ic+4];
1873:     t[ii+5] = b[ic+5];
1874:     t[ii+6] = b[ic+6];
1875:     ii += 7;
1876:   }

1878:   /* forward solve the U^T */
1879:   idx = 0;
1880:   for (i=0; i<n; i++) {

1882:     v     = aa + 49*diag[i];
1883:     /* multiply by the inverse of the block diagonal */
1884:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885:     x6    = t[5+idx]; x7 = t[6+idx];
1886:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1887:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893:     v += 49;

1895:     vi    = aj + diag[i] + 1;
1896:     nz    = ai[i+1] - diag[i] - 1;
1897:     while (nz--) {
1898:       oidx = 7*(*vi++);
1899:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1900:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906:       v  += 49;
1907:     }
1908:     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909:     t[5+idx] = s6;t[6+idx] = s7;
1910:     idx += 7;
1911:   }
1912:   /* backward solve the L^T */
1913:   for (i=n-1; i>=0; i--){
1914:     v    = aa + 49*diag[i] - 49;
1915:     vi   = aj + diag[i] - 1;
1916:     nz   = diag[i] - ai[i];
1917:     idt  = 7*i;
1918:     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919:     s6 = t[5+idt];s7 = t[6+idt];
1920:     while (nz--) {
1921:       idx   = 7*(*vi--);
1922:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1923:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929:       v -= 49;
1930:     }
1931:   }

1933:   /* copy t into x according to permutation */
1934:   ii = 0;
1935:   for (i=0; i<n; i++) {
1936:     ir      = 7*r[i];
1937:     x[ir]   = t[ii];
1938:     x[ir+1] = t[ii+1];
1939:     x[ir+2] = t[ii+2];
1940:     x[ir+3] = t[ii+3];
1941:     x[ir+4] = t[ii+4];
1942:     x[ir+5] = t[ii+5];
1943:     x[ir+6] = t[ii+6];
1944:     ii += 7;
1945:   }

1947:   ISRestoreIndices(isrow,&rout);
1948:   ISRestoreIndices(iscol,&cout);
1949:   VecRestoreArray(bb,(PetscScalar**)&b);
1950:   VecRestoreArray(xx,&x);
1951:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
1952:   return(0);
1953: }
1956: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1957: {
1958:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1959:   PetscErrorCode    ierr;
1960:   IS                iscol=a->col,isrow=a->row;
1961:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1962:   const PetscInt    *r,*c,*rout,*cout;
1963:   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1964:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1965:   const MatScalar   *aa=a->a,*v;
1966:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967:   const PetscScalar *b;

1970:   VecGetArray(bb,(PetscScalar**)&b);
1971:   VecGetArray(xx,&x);
1972:   t = a->solve_work;

1974:   ISGetIndices(isrow,&rout); r = rout;
1975:   ISGetIndices(iscol,&cout); c = cout;

1977:   /* copy b into temp work space according to permutation */
1978:   for(i=0;i<n;i++){
1979:     ii = bs*i; ic = bs*c[i];
1980:     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1981:     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1982:   }

1984:   /* forward solve the U^T */
1985:   idx = 0;
1986:   for (i=0; i<n; i++) {
1987:     v     = aa + bs2*diag[i];
1988:     /* multiply by the inverse of the block diagonal */
1989:     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1990:     x6    = t[5+idx]; x7 = t[6+idx];
1991:     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1992:     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1993:     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1994:     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1995:     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1996:     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1997:     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1998:     v -= bs2;

2000:     vi    = aj + diag[i] - 1;
2001:     nz    = diag[i] - diag[i+1] - 1;
2002:     for(j=0;j>-nz;j--){
2003:       oidx = bs*vi[j];
2004:       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2005:       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2006:       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2007:       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2008:       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2009:       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2010:       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2011:       v  -= bs2;
2012:     }
2013:     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2014:     t[5+idx] = s6;  t[6+idx] = s7;
2015:     idx += bs;
2016:   }
2017:   /* backward solve the L^T */
2018:   for (i=n-1; i>=0; i--){
2019:     v    = aa + bs2*ai[i];
2020:     vi   = aj + ai[i];
2021:     nz   = ai[i+1] - ai[i];
2022:     idt  = bs*i;
2023:     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2024:     s6   = t[5+idt];  s7 = t[6+idt];
2025:    for(j=0;j<nz;j++){
2026:       idx   = bs*vi[j];
2027:       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2028:       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2029:       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2030:       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2031:       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2032:       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2033:       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2034:       v += bs2;
2035:     }
2036:   }

2038:   /* copy t into x according to permutation */
2039:   for(i=0;i<n;i++){
2040:     ii = bs*i;  ir = bs*r[i];
2041:     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2042:     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2043:   }

2045:   ISRestoreIndices(isrow,&rout);
2046:   ISRestoreIndices(iscol,&cout);
2047:   VecRestoreArray(bb,(PetscScalar**)&b);
2048:   VecRestoreArray(xx,&x);
2049:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2050:   return(0);
2051: }

2053: /* ----------------------------------------------------------- */
2056: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2057: {
2058:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2059:   IS                iscol=a->col,isrow=a->row;
2060:   PetscErrorCode    ierr;
2061:   const PetscInt    *r,*c,*rout,*cout;
2062:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063:   PetscInt          i,nz;
2064:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2065:   const MatScalar   *aa=a->a,*v;
2066:   PetscScalar       *x,*s,*t,*ls;
2067:   const PetscScalar *b;

2070:   VecGetArray(bb,(PetscScalar**)&b);
2071:   VecGetArray(xx,&x);
2072:   t  = a->solve_work;

2074:   ISGetIndices(isrow,&rout); r = rout;
2075:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2077:   /* forward solve the lower triangular */
2078:   PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
2079:   for (i=1; i<n; i++) {
2080:     v   = aa + bs2*ai[i];
2081:     vi  = aj + ai[i];
2082:     nz  = a->diag[i] - ai[i];
2083:     s = t + bs*i;
2084:     PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
2085:     while (nz--) {
2086:       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2087:       v += bs2;
2088:     }
2089:   }
2090:   /* backward solve the upper triangular */
2091:   ls = a->solve_work + A->cmap->n;
2092:   for (i=n-1; i>=0; i--){
2093:     v   = aa + bs2*(a->diag[i] + 1);
2094:     vi  = aj + a->diag[i] + 1;
2095:     nz  = ai[i+1] - a->diag[i] - 1;
2096:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2097:     while (nz--) {
2098:       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2099:       v += bs2;
2100:     }
2101:     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2102:     PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
2103:   }

2105:   ISRestoreIndices(isrow,&rout);
2106:   ISRestoreIndices(iscol,&cout);
2107:   VecRestoreArray(bb,(PetscScalar**)&b);
2108:   VecRestoreArray(xx,&x);
2109:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2110:   return(0);
2111: }

2113: /* ----------------------------------------------------------- */
2116: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2117: {
2118:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2119:   IS                iscol=a->col,isrow=a->row;
2120:   PetscErrorCode    ierr;
2121:   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122:   PetscInt          i,nz,j;
2123:   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2124:   const MatScalar   *aa=a->a,*v;
2125:   PetscScalar       *x,*t,*ls;
2126:   const PetscScalar *b;
2128:   VecGetArray(bb,(PetscScalar**)&b);
2129:   VecGetArray(xx,&x);
2130:   t    = a->solve_work;

2132:   ISGetIndices(isrow,&rout); r = rout;
2133:   ISGetIndices(iscol,&cout); c = cout;

2135:   /* copy the b into temp work space according to permutation */
2136:   for (i=0; i<n; i++) {
2137:     for (j=0; j<bs; j++) {
2138:       t[i*bs+j] = b[c[i]*bs+j];
2139:     }
2140:   }


2143:   /* forward solve the upper triangular transpose */
2144:   ls = a->solve_work + A->cmap->n;
2145:   for (i=0; i<n; i++){
2146:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2147:     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2148:     v   = aa + bs2*(a->diag[i] + 1);
2149:     vi  = aj + a->diag[i] + 1;
2150:     nz  = ai[i+1] - a->diag[i] - 1;
2151:     while (nz--) {
2152:       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2153:       v += bs2;
2154:     }
2155:   }

2157:   /* backward solve the lower triangular transpose */
2158:   for (i=n-1; i>=0; i--) {
2159:     v   = aa + bs2*ai[i];
2160:     vi  = aj + ai[i];
2161:     nz  = a->diag[i] - ai[i];
2162:     while (nz--) {
2163:       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2164:       v += bs2;
2165:     }
2166:   }

2168:   /* copy t into x according to permutation */
2169:   for (i=0; i<n; i++) {
2170:     for (j=0; j<bs; j++) {
2171:       x[bs*r[i]+j]   = t[bs*i+j];
2172:     }
2173:   }

2175:   ISRestoreIndices(isrow,&rout);
2176:   ISRestoreIndices(iscol,&cout);
2177:   VecRestoreArray(bb,(PetscScalar**)&b);
2178:   VecRestoreArray(xx,&x);
2179:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2180:   return(0);
2181: }

2185: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2186: {
2187:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2188:   IS                iscol=a->col,isrow=a->row;
2189:   PetscErrorCode    ierr;
2190:   const PetscInt    *r,*c,*rout,*cout;
2191:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192:   PetscInt          i,j,nz;
2193:   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2194:   const MatScalar   *aa=a->a,*v;
2195:   PetscScalar       *x,*t,*ls;
2196:   const PetscScalar *b;

2199:   VecGetArray(bb,(PetscScalar**)&b);
2200:   VecGetArray(xx,&x);
2201:   t    = a->solve_work;

2203:   ISGetIndices(isrow,&rout); r = rout;
2204:   ISGetIndices(iscol,&cout); c = cout;

2206:   /* copy the b into temp work space according to permutation */
2207:   for (i=0; i<n; i++) {
2208:     for (j=0; j<bs; j++) {
2209:       t[i*bs+j] = b[c[i]*bs+j];
2210:     }
2211:   }


2214:   /* forward solve the upper triangular transpose */
2215:   ls = a->solve_work + A->cmap->n;
2216:   for (i=0; i<n; i++){
2217:     PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2218:     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2219:     v   = aa + bs2*(diag[i] - 1);
2220:     vi  = aj + diag[i] - 1;
2221:     nz  = diag[i] - diag[i+1] - 1;
2222:     for(j=0;j>-nz;j--){
2223:       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2224:       v -= bs2;
2225:     }
2226:   }

2228:   /* backward solve the lower triangular transpose */
2229:   for (i=n-1; i>=0; i--) {
2230:     v   = aa + bs2*ai[i];
2231:     vi  = aj + ai[i];
2232:     nz  = ai[i+1] - ai[i];
2233:     for(j=0;j<nz;j++){
2234:       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2235:       v += bs2;
2236:     }
2237:   }

2239:   /* copy t into x according to permutation */
2240:   for (i=0; i<n; i++) {
2241:     for (j=0; j<bs; j++) {
2242:       x[bs*r[i]+j]   = t[bs*i+j];
2243:     }
2244:   }

2246:   ISRestoreIndices(isrow,&rout);
2247:   ISRestoreIndices(iscol,&cout);
2248:   VecRestoreArray(bb,(PetscScalar**)&b);
2249:   VecRestoreArray(xx,&x);
2250:   PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2251:   return(0);
2252: }

2254: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */

2258: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2259: {
2260:   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2261:   PetscErrorCode    ierr;
2262:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2263:   PetscInt          i,nz,idx,idt,m;
2264:   const MatScalar   *aa=a->a,*v;
2265:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2266:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2267:   PetscScalar       *x;
2268:   const PetscScalar *b;

2271:   VecGetArray(bb,(PetscScalar**)&b);
2272:   VecGetArray(xx,&x);

2274:   /* forward solve the lower triangular */
2275:   idx    = 0;
2276:   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
2277:   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
2278:   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];

2280:   for (i=1; i<n; i++) {
2281:     v     = aa + bs2*ai[i];
2282:     vi    = aj + ai[i];
2283:     nz    = ai[i+1] - ai[i];
2284:     idt   = bs*i;
2285:     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
2286:     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
2287:     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2288:     for(m=0;m<nz;m++){
2289:       idx   = bs*vi[m];
2290:       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2291:       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2292:       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

2294: 
2295:       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2296:       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2297:       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2298:       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2299:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2300:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2301:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2302:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2303:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2304:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2305:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2306:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2307:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2308:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2309:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2310: 
2311:       v += bs2;
2312:     }
2313:     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
2314:     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
2315:     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2316: 
2317:   }
2318:   /* backward solve the upper triangular */
2319:   for (i=n-1; i>=0; i--){
2320:     v    = aa + bs2*(adiag[i+1]+1);
2321:     vi   = aj + adiag[i+1]+1;
2322:     nz   = adiag[i] - adiag[i+1] - 1;
2323:     idt  = bs*i;
2324:     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
2325:     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
2326:     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2327: 
2328:     for(m=0;m<nz;m++){
2329:       idx   = bs*vi[m];
2330:       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
2331:       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
2332:       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];

2334:       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2335:       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2336:       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2337:       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2338:       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2339:       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2340:       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2341:       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2342:       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2343:       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2344:       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2345:       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2346:       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2347:       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2348:       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;

2350:       v += bs2;
2351:     }

2353:     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2354:     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2355:     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2356:     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2357:     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2358:     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2359:     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2360:     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2361:     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2362:     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2363:     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2364:     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2365:     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2366:     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2367:     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;

2369:   }

2371:   VecRestoreArray(bb,(PetscScalar**)&b);
2372:   VecRestoreArray(xx,&x);
2373:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2374:   return(0);
2375: }

2377: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2378: /* Default MatSolve for block size 15 */

2382: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2383: {
2384:   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
2385:   PetscErrorCode    ierr;
2386:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2387:   PetscInt          i,k,nz,kdx,idx,idt,m;
2388:   const MatScalar   *aa=a->a,*v;
2389:   PetscScalar       s[15];
2390:   PetscScalar       *x;
2391:   const PetscScalar *b;

2394:   VecGetArray(bb,(PetscScalar**)&b);
2395:   VecGetArray(xx,&x);

2397:   /* forward solve the lower triangular */
2398:   for (i=0; i<n; i++) {
2399:     v     = aa + bs2*ai[i];
2400:     vi    = aj + ai[i];
2401:     nz    = ai[i+1] - ai[i];
2402:     idt   = bs*i;
2403:     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2404:     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2405:     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2406:     for(m=0;m<nz;m++){
2407:       idx   = bs*vi[m];
2408:       for(k=0;k<15;k++){
2409:         kdx = k + idx;
2410:         x[idt]    -= v[0]*x[kdx];
2411:         x[1+idt]  -= v[1]*x[kdx];
2412:         x[2+idt]  -= v[2]*x[kdx];
2413:         x[3+idt]  -= v[3]*x[kdx];
2414:         x[4+idt]  -= v[4]*x[kdx];
2415:         x[5+idt]  -= v[5]*x[kdx];
2416:         x[6+idt]  -= v[6]*x[kdx];
2417:         x[7+idt]  -= v[7]*x[kdx];
2418:         x[8+idt]  -= v[8]*x[kdx];
2419:         x[9+idt]  -= v[9]*x[kdx];
2420:         x[10+idt] -= v[10]*x[kdx];
2421:         x[11+idt] -= v[11]*x[kdx];
2422:         x[12+idt] -= v[12]*x[kdx];
2423:         x[13+idt] -= v[13]*x[kdx];
2424:         x[14+idt] -= v[14]*x[kdx];
2425:         v += 15;
2426:       }
2427:     }
2428:   }
2429:   /* backward solve the upper triangular */
2430:   for (i=n-1; i>=0; i--){
2431:     v    = aa + bs2*(adiag[i+1]+1);
2432:     vi   = aj + adiag[i+1]+1;
2433:     nz   = adiag[i] - adiag[i+1] - 1;
2434:     idt  = bs*i;
2435:     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
2436:     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
2437:     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2438: 
2439:     for(m=0;m<nz;m++){
2440:       idx   = bs*vi[m];
2441:       for(k=0;k<15;k++){
2442:         kdx = k + idx;
2443:         s[0]  -= v[0]*x[kdx];
2444:         s[1]  -= v[1]*x[kdx];
2445:         s[2]  -= v[2]*x[kdx];
2446:         s[3]  -= v[3]*x[kdx];
2447:         s[4]  -= v[4]*x[kdx];
2448:         s[5]  -= v[5]*x[kdx];
2449:         s[6]  -= v[6]*x[kdx];
2450:         s[7]  -= v[7]*x[kdx];
2451:         s[8]  -= v[8]*x[kdx];
2452:         s[9]  -= v[9]*x[kdx];
2453:         s[10] -= v[10]*x[kdx];
2454:         s[11] -= v[11]*x[kdx];
2455:         s[12] -= v[12]*x[kdx];
2456:         s[13] -= v[13]*x[kdx];
2457:         s[14] -= v[14]*x[kdx];
2458:         v += 15;
2459:       }
2460:     }
2461:     PetscMemzero(x+idt,bs*sizeof(MatScalar));
2462:     for(k=0;k<15;k++){
2463:       x[idt]    += v[0]*s[k];
2464:       x[1+idt]  += v[1]*s[k];
2465:       x[2+idt]  += v[2]*s[k];
2466:       x[3+idt]  += v[3]*s[k];
2467:       x[4+idt]  += v[4]*s[k];
2468:       x[5+idt]  += v[5]*s[k];
2469:       x[6+idt]  += v[6]*s[k];
2470:       x[7+idt]  += v[7]*s[k];
2471:       x[8+idt]  += v[8]*s[k];
2472:       x[9+idt]  += v[9]*s[k];
2473:       x[10+idt] += v[10]*s[k];
2474:       x[11+idt] += v[11]*s[k];
2475:       x[12+idt] += v[12]*s[k];
2476:       x[13+idt] += v[13]*s[k];
2477:       x[14+idt] += v[14]*s[k];
2478:       v += 15;
2479:     }
2480:   }
2481:   VecRestoreArray(bb,(PetscScalar**)&b);
2482:   VecRestoreArray(xx,&x);
2483:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2484:   return(0);
2485: }


2490: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2491: {
2492:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2493:   IS                iscol=a->col,isrow=a->row;
2494:   PetscErrorCode    ierr;
2495:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2496:   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2497:   PetscInt          i,nz,idx,idt,idc;
2498:   const MatScalar   *aa=a->a,*v;
2499:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2500:   const PetscScalar *b;

2503:   VecGetArray(bb,(PetscScalar**)&b);
2504:   VecGetArray(xx,&x);
2505:   t  = a->solve_work;

2507:   ISGetIndices(isrow,&rout); r = rout;
2508:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2510:   /* forward solve the lower triangular */
2511:   idx    = 7*(*r++);
2512:   t[0] = b[idx];   t[1] = b[1+idx];
2513:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2514:   t[5] = b[5+idx]; t[6] = b[6+idx];

2516:   for (i=1; i<n; i++) {
2517:     v     = aa + 49*ai[i];
2518:     vi    = aj + ai[i];
2519:     nz    = diag[i] - ai[i];
2520:     idx   = 7*(*r++);
2521:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2522:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2523:     while (nz--) {
2524:       idx   = 7*(*vi++);
2525:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2526:       x4    = t[3+idx];x5 = t[4+idx];
2527:       x6    = t[5+idx];x7 = t[6+idx];
2528:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2529:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2530:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2531:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2532:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2533:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2534:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2535:       v += 49;
2536:     }
2537:     idx = 7*i;
2538:     t[idx]   = s1;t[1+idx] = s2;
2539:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2540:     t[5+idx] = s6;t[6+idx] = s7;
2541:   }
2542:   /* backward solve the upper triangular */
2543:   for (i=n-1; i>=0; i--){
2544:     v    = aa + 49*diag[i] + 49;
2545:     vi   = aj + diag[i] + 1;
2546:     nz   = ai[i+1] - diag[i] - 1;
2547:     idt  = 7*i;
2548:     s1 = t[idt];  s2 = t[1+idt];
2549:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2550:     s6 = t[5+idt];s7 = t[6+idt];
2551:     while (nz--) {
2552:       idx   = 7*(*vi++);
2553:       x1    = t[idx];   x2 = t[1+idx];
2554:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2555:       x6    = t[5+idx]; x7 = t[6+idx];
2556:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2557:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2558:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2559:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2560:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2561:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2562:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2563:       v += 49;
2564:     }
2565:     idc = 7*(*c--);
2566:     v   = aa + 49*diag[i];
2567:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2568:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2569:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2570:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2571:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2572:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2573:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2574:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2575:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2576:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2577:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2578:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2579:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2580:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2581:   }

2583:   ISRestoreIndices(isrow,&rout);
2584:   ISRestoreIndices(iscol,&cout);
2585:   VecRestoreArray(bb,(PetscScalar**)&b);
2586:   VecRestoreArray(xx,&x);
2587:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2588:   return(0);
2589: }

2593: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2594: {
2595:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2596:   IS                iscol=a->col,isrow=a->row;
2597:   PetscErrorCode    ierr;
2598:   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2599:   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2600:   PetscInt          i,nz,idx,idt,idc,m;
2601:   const MatScalar   *aa=a->a,*v;
2602:   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2603:   const PetscScalar *b;

2606:   VecGetArray(bb,(PetscScalar**)&b);
2607:   VecGetArray(xx,&x);
2608:   t  = a->solve_work;

2610:   ISGetIndices(isrow,&rout); r = rout;
2611:   ISGetIndices(iscol,&cout); c = cout;

2613:   /* forward solve the lower triangular */
2614:   idx    = 7*r[0];
2615:   t[0] = b[idx];   t[1] = b[1+idx];
2616:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2617:   t[5] = b[5+idx]; t[6] = b[6+idx];

2619:   for (i=1; i<n; i++) {
2620:     v     = aa + 49*ai[i];
2621:     vi    = aj + ai[i];
2622:     nz    = ai[i+1] - ai[i];
2623:     idx   = 7*r[i];
2624:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2625:     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2626:     for(m=0;m<nz;m++){
2627:       idx   = 7*vi[m];
2628:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2629:       x4    = t[3+idx];x5 = t[4+idx];
2630:       x6    = t[5+idx];x7 = t[6+idx];
2631:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2632:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2633:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2634:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2635:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2636:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2637:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2638:       v += 49;
2639:     }
2640:     idx = 7*i;
2641:     t[idx]   = s1;t[1+idx] = s2;
2642:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2643:     t[5+idx] = s6;t[6+idx] = s7;
2644:   }
2645:   /* backward solve the upper triangular */
2646:   for (i=n-1; i>=0; i--){
2647:     v    = aa + 49*(adiag[i+1]+1);
2648:     vi   = aj + adiag[i+1]+1;
2649:     nz   = adiag[i] - adiag[i+1] - 1;
2650:     idt  = 7*i;
2651:     s1 = t[idt];  s2 = t[1+idt];
2652:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2653:     s6 = t[5+idt];s7 = t[6+idt];
2654:     for(m=0;m<nz;m++){
2655:       idx   = 7*vi[m];
2656:       x1    = t[idx];   x2 = t[1+idx];
2657:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2658:       x6    = t[5+idx]; x7 = t[6+idx];
2659:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2660:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2661:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2662:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2663:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2664:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2665:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2666:       v += 49;
2667:     }
2668:     idc = 7*c[i];
2669:     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2670:                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2671:     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2672:                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2673:     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2674:                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2675:     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2676:                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2677:     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2678:                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2679:     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2680:                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2681:     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2682:                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2683:   }

2685:   ISRestoreIndices(isrow,&rout);
2686:   ISRestoreIndices(iscol,&cout);
2687:   VecRestoreArray(bb,(PetscScalar**)&b);
2688:   VecRestoreArray(xx,&x);
2689:   PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2690:   return(0);
2691: }

2695: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2696: {
2697:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2698:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2699:   PetscErrorCode    ierr;
2700:   PetscInt          i,nz,idx,idt,jdx;
2701:   const MatScalar   *aa=a->a,*v;
2702:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2703:   const PetscScalar *b;

2706:   VecGetArray(bb,(PetscScalar**)&b);
2707:   VecGetArray(xx,&x);
2708:   /* forward solve the lower triangular */
2709:   idx    = 0;
2710:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2711:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2712:   x[6] = b[6+idx];
2713:   for (i=1; i<n; i++) {
2714:     v     =  aa + 49*ai[i];
2715:     vi    =  aj + ai[i];
2716:     nz    =  diag[i] - ai[i];
2717:     idx   =  7*i;
2718:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2719:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2720:     s7  =  b[6+idx];
2721:     while (nz--) {
2722:       jdx   = 7*(*vi++);
2723:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2724:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2725:       x7    = x[6+jdx];
2726:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2727:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2728:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2729:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2730:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2731:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2732:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2733:       v += 49;
2734:      }
2735:     x[idx]   = s1;
2736:     x[1+idx] = s2;
2737:     x[2+idx] = s3;
2738:     x[3+idx] = s4;
2739:     x[4+idx] = s5;
2740:     x[5+idx] = s6;
2741:     x[6+idx] = s7;
2742:   }
2743:   /* backward solve the upper triangular */
2744:   for (i=n-1; i>=0; i--){
2745:     v    = aa + 49*diag[i] + 49;
2746:     vi   = aj + diag[i] + 1;
2747:     nz   = ai[i+1] - diag[i] - 1;
2748:     idt  = 7*i;
2749:     s1 = x[idt];   s2 = x[1+idt];
2750:     s3 = x[2+idt]; s4 = x[3+idt];
2751:     s5 = x[4+idt]; s6 = x[5+idt];
2752:     s7 = x[6+idt];
2753:     while (nz--) {
2754:       idx   = 7*(*vi++);
2755:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2756:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2757:       x7    = x[6+idx];
2758:       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2759:       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2760:       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2761:       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2762:       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2763:       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2764:       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2765:       v += 49;
2766:     }
2767:     v        = aa + 49*diag[i];
2768:     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2769:                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2770:     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2771:                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2772:     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2773:                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2774:     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2775:                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2776:     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2777:                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2778:     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2779:                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2780:     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2781:                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2782:   }

2784:   VecRestoreArray(bb,(PetscScalar**)&b);
2785:   VecRestoreArray(xx,&x);
2786:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2787:   return(0);
2788: }

2792: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2793: {
2794:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2795:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2796:     PetscErrorCode    ierr;
2797:     PetscInt          i,k,nz,idx,jdx,idt;
2798:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
2799:     const MatScalar   *aa=a->a,*v;
2800:     PetscScalar       *x;
2801:     const PetscScalar *b;
2802:     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

2805:     VecGetArray(bb,(PetscScalar**)&b);
2806:     VecGetArray(xx,&x);
2807:     /* forward solve the lower triangular */
2808:     idx    = 0;
2809:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2810:     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2811:     for (i=1; i<n; i++) {
2812:        v    = aa + bs2*ai[i];
2813:        vi   = aj + ai[i];
2814:        nz   = ai[i+1] - ai[i];
2815:       idx   = bs*i;
2816:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2817:        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2818:        for(k=0;k<nz;k++) {
2819:           jdx   = bs*vi[k];
2820:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2821:           x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2822:           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2823:           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2824:           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2825:           s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2826:           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2827:           s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2828:           s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2829:           v   +=  bs2;
2830:         }

2832:        x[idx]   = s1;
2833:        x[1+idx] = s2;
2834:        x[2+idx] = s3;
2835:        x[3+idx] = s4;
2836:        x[4+idx] = s5;
2837:        x[5+idx] = s6;
2838:        x[6+idx] = s7;
2839:     }
2840: 
2841:    /* backward solve the upper triangular */
2842:   for (i=n-1; i>=0; i--){
2843:     v   = aa + bs2*(adiag[i+1]+1);
2844:      vi  = aj + adiag[i+1]+1;
2845:      nz  = adiag[i] - adiag[i+1]-1;
2846:      idt = bs*i;
2847:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2848:      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2849:     for(k=0;k<nz;k++) {
2850:       idx   = bs*vi[k];
2851:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2852:        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2853:        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2854:        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2855:        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2856:        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2857:        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2858:        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2859:        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2860:         v   +=  bs2;
2861:     }
2862:     /* x = inv_diagonal*x */
2863:     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2864:     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2865:     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2866:     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2867:     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2868:     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2869:     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2870:   }

2872:   VecRestoreArray(bb,(PetscScalar**)&b);
2873:   VecRestoreArray(xx,&x);
2874:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2875:   return(0);
2876: }

2880: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2881: {
2882:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2883:   IS                iscol=a->col,isrow=a->row;
2884:   PetscErrorCode    ierr;
2885:   const PetscInt    *r,*c,*rout,*cout;
2886:   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2887:   PetscInt          i,nz,idx,idt,idc;
2888:   const MatScalar   *aa=a->a,*v;
2889:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2890:   const PetscScalar *b;

2893:   VecGetArray(bb,(PetscScalar**)&b);
2894:   VecGetArray(xx,&x);
2895:   t  = a->solve_work;

2897:   ISGetIndices(isrow,&rout); r = rout;
2898:   ISGetIndices(iscol,&cout); c = cout + (n-1);

2900:   /* forward solve the lower triangular */
2901:   idx    = 6*(*r++);
2902:   t[0] = b[idx];   t[1] = b[1+idx];
2903:   t[2] = b[2+idx]; t[3] = b[3+idx];
2904:   t[4] = b[4+idx]; t[5] = b[5+idx];
2905:   for (i=1; i<n; i++) {
2906:     v     = aa + 36*ai[i];
2907:     vi    = aj + ai[i];
2908:     nz    = diag[i] - ai[i];
2909:     idx   = 6*(*r++);
2910:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2911:     s5  = b[4+idx]; s6 = b[5+idx];
2912:     while (nz--) {
2913:       idx   = 6*(*vi++);
2914:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2915:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2916:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2917:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2918:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2919:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2920:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2921:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2922:       v += 36;
2923:     }
2924:     idx = 6*i;
2925:     t[idx]   = s1;t[1+idx] = s2;
2926:     t[2+idx] = s3;t[3+idx] = s4;
2927:     t[4+idx] = s5;t[5+idx] = s6;
2928:   }
2929:   /* backward solve the upper triangular */
2930:   for (i=n-1; i>=0; i--){
2931:     v    = aa + 36*diag[i] + 36;
2932:     vi   = aj + diag[i] + 1;
2933:     nz   = ai[i+1] - diag[i] - 1;
2934:     idt  = 6*i;
2935:     s1 = t[idt];  s2 = t[1+idt];
2936:     s3 = t[2+idt];s4 = t[3+idt];
2937:     s5 = t[4+idt];s6 = t[5+idt];
2938:     while (nz--) {
2939:       idx   = 6*(*vi++);
2940:       x1    = t[idx];   x2 = t[1+idx];
2941:       x3    = t[2+idx]; x4 = t[3+idx];
2942:       x5    = t[4+idx]; x6 = t[5+idx];
2943:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2944:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2945:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2946:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2947:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2948:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2949:       v += 36;
2950:     }
2951:     idc = 6*(*c--);
2952:     v   = aa + 36*diag[i];
2953:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2954:                                  v[18]*s4+v[24]*s5+v[30]*s6;
2955:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2956:                                  v[19]*s4+v[25]*s5+v[31]*s6;
2957:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2958:                                  v[20]*s4+v[26]*s5+v[32]*s6;
2959:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2960:                                  v[21]*s4+v[27]*s5+v[33]*s6;
2961:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2962:                                  v[22]*s4+v[28]*s5+v[34]*s6;
2963:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2964:                                  v[23]*s4+v[29]*s5+v[35]*s6;
2965:   }

2967:   ISRestoreIndices(isrow,&rout);
2968:   ISRestoreIndices(iscol,&cout);
2969:   VecRestoreArray(bb,(PetscScalar**)&b);
2970:   VecRestoreArray(xx,&x);
2971:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2972:   return(0);
2973: }

2977: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2978: {
2979:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2980:   IS                iscol=a->col,isrow=a->row;
2981:   PetscErrorCode    ierr;
2982:   const PetscInt    *r,*c,*rout,*cout;
2983:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2984:   PetscInt          i,nz,idx,idt,idc,m;
2985:   const MatScalar   *aa=a->a,*v;
2986:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2987:   const PetscScalar *b;

2990:   VecGetArray(bb,(PetscScalar**)&b);
2991:   VecGetArray(xx,&x);
2992:   t  = a->solve_work;

2994:   ISGetIndices(isrow,&rout); r = rout;
2995:   ISGetIndices(iscol,&cout); c = cout;

2997:   /* forward solve the lower triangular */
2998:   idx    = 6*r[0];
2999:   t[0] = b[idx];   t[1] = b[1+idx];
3000:   t[2] = b[2+idx]; t[3] = b[3+idx];
3001:   t[4] = b[4+idx]; t[5] = b[5+idx];
3002:   for (i=1; i<n; i++) {
3003:     v     = aa + 36*ai[i];
3004:     vi    = aj + ai[i];
3005:     nz    = ai[i+1] - ai[i];
3006:     idx   = 6*r[i];
3007:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3008:     s5  = b[4+idx]; s6 = b[5+idx];
3009:     for(m=0;m<nz;m++){
3010:       idx   = 6*vi[m];
3011:       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3012:       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3013:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3014:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3015:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3016:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3017:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3018:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3019:       v += 36;
3020:     }
3021:     idx = 6*i;
3022:     t[idx]   = s1;t[1+idx] = s2;
3023:     t[2+idx] = s3;t[3+idx] = s4;
3024:     t[4+idx] = s5;t[5+idx] = s6;
3025:   }
3026:   /* backward solve the upper triangular */
3027:   for (i=n-1; i>=0; i--){
3028:     v    = aa + 36*(adiag[i+1]+1);
3029:     vi   = aj + adiag[i+1]+1;
3030:     nz   = adiag[i] - adiag[i+1] - 1;
3031:     idt  = 6*i;
3032:     s1 = t[idt];  s2 = t[1+idt];
3033:     s3 = t[2+idt];s4 = t[3+idt];
3034:     s5 = t[4+idt];s6 = t[5+idt];
3035:     for(m=0;m<nz;m++){
3036:       idx   = 6*vi[m];
3037:       x1    = t[idx];   x2 = t[1+idx];
3038:       x3    = t[2+idx]; x4 = t[3+idx];
3039:       x5    = t[4+idx]; x6 = t[5+idx];
3040:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3041:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3042:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3043:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3044:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3045:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3046:       v += 36;
3047:     }
3048:     idc = 6*c[i];
3049:     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3050:                                  v[18]*s4+v[24]*s5+v[30]*s6;
3051:     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3052:                                  v[19]*s4+v[25]*s5+v[31]*s6;
3053:     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3054:                                  v[20]*s4+v[26]*s5+v[32]*s6;
3055:     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3056:                                  v[21]*s4+v[27]*s5+v[33]*s6;
3057:     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3058:                                  v[22]*s4+v[28]*s5+v[34]*s6;
3059:     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3060:                                  v[23]*s4+v[29]*s5+v[35]*s6;
3061:   }

3063:   ISRestoreIndices(isrow,&rout);
3064:   ISRestoreIndices(iscol,&cout);
3065:   VecRestoreArray(bb,(PetscScalar**)&b);
3066:   VecRestoreArray(xx,&x);
3067:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3068:   return(0);
3069: }

3073: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3074: {
3075:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3076:   PetscInt          i,nz,idx,idt,jdx;
3077:   PetscErrorCode    ierr;
3078:   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3079:   const MatScalar   *aa=a->a,*v;
3080:   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3081:   const PetscScalar *b;

3084:   VecGetArray(bb,(PetscScalar**)&b);
3085:   VecGetArray(xx,&x);
3086:   /* forward solve the lower triangular */
3087:   idx    = 0;
3088:   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
3089:   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3090:   for (i=1; i<n; i++) {
3091:     v     =  aa + 36*ai[i];
3092:     vi    =  aj + ai[i];
3093:     nz    =  diag[i] - ai[i];
3094:     idx   =  6*i;
3095:     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3096:     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3097:     while (nz--) {
3098:       jdx   = 6*(*vi++);
3099:       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
3100:       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3101:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3102:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3103:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3104:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3105:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3106:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3107:       v += 36;
3108:      }
3109:     x[idx]   = s1;
3110:     x[1+idx] = s2;
3111:     x[2+idx] = s3;
3112:     x[3+idx] = s4;
3113:     x[4+idx] = s5;
3114:     x[5+idx] = s6;
3115:   }
3116:   /* backward solve the upper triangular */
3117:   for (i=n-1; i>=0; i--){
3118:     v    = aa + 36*diag[i] + 36;
3119:     vi   = aj + diag[i] + 1;
3120:     nz   = ai[i+1] - diag[i] - 1;
3121:     idt  = 6*i;
3122:     s1 = x[idt];   s2 = x[1+idt];
3123:     s3 = x[2+idt]; s4 = x[3+idt];
3124:     s5 = x[4+idt]; s6 = x[5+idt];
3125:     while (nz--) {
3126:       idx   = 6*(*vi++);
3127:       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3128:       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3129:       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3130:       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3131:       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3132:       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3133:       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3134:       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3135:       v += 36;
3136:     }
3137:     v        = aa + 36*diag[i];
3138:     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3139:     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3140:     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3141:     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3142:     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3143:     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3144:   }

3146:   VecRestoreArray(bb,(PetscScalar**)&b);
3147:   VecRestoreArray(xx,&x);
3148:   PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3149:   return(0);
3150: }

3154: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3155: {
3156:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3157:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3158:     PetscErrorCode    ierr;
3159:     PetscInt          i,k,nz,idx,jdx,idt;
3160:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
3161:     const MatScalar   *aa=a->a,*v;
3162:     PetscScalar       *x;
3163:     const PetscScalar *b;
3164:     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

3167:     VecGetArray(bb,(PetscScalar**)&b);
3168:     VecGetArray(xx,&x);
3169:     /* forward solve the lower triangular */
3170:     idx    = 0;
3171:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3172:     x[4] = b[4+idx];x[5] = b[5+idx];
3173:     for (i=1; i<n; i++) {
3174:        v    = aa + bs2*ai[i];
3175:        vi   = aj + ai[i];
3176:        nz   = ai[i+1] - ai[i];
3177:       idx   = bs*i;
3178:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3179:        s5   = b[4+idx];s6 = b[5+idx];
3180:        for(k=0;k<nz;k++){
3181:           jdx   = bs*vi[k];
3182:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3183:           x5    = x[4+jdx]; x6 = x[5+jdx];
3184:           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3185:           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3186:           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3187:           s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3188:           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3189:           s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3190:           v   +=  bs2;
3191:         }

3193:        x[idx]   = s1;
3194:        x[1+idx] = s2;
3195:        x[2+idx] = s3;
3196:        x[3+idx] = s4;
3197:        x[4+idx] = s5;
3198:        x[5+idx] = s6;
3199:     }
3200: 
3201:    /* backward solve the upper triangular */
3202:   for (i=n-1; i>=0; i--){
3203:     v   = aa + bs2*(adiag[i+1]+1);
3204:      vi  = aj + adiag[i+1]+1;
3205:      nz  = adiag[i] - adiag[i+1]-1;
3206:      idt = bs*i;
3207:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3208:      s5 = x[4+idt];s6 = x[5+idt];
3209:      for(k=0;k<nz;k++){
3210:       idx   = bs*vi[k];
3211:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3212:        x5    = x[4+idx];x6 = x[5+idx];
3213:        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3214:        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3215:        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3216:        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3217:        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3218:        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3219:         v   +=  bs2;
3220:     }
3221:     /* x = inv_diagonal*x */
3222:    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3223:    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3224:    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3225:    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3226:    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3227:    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3228:   }

3230:   VecRestoreArray(bb,(PetscScalar**)&b);
3231:   VecRestoreArray(xx,&x);
3232:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
3233:   return(0);
3234: }

3238: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3239: {
3240:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3241:   IS                iscol=a->col,isrow=a->row;
3242:   PetscErrorCode    ierr;
3243:   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3244:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3245:   PetscInt          i,nz,idx,idt,idc;
3246:   const MatScalar   *aa=a->a,*v;
3247:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3248:   const PetscScalar *b;

3251:   VecGetArray(bb,(PetscScalar**)&b);
3252:   VecGetArray(xx,&x);
3253:   t  = a->solve_work;

3255:   ISGetIndices(isrow,&rout); r = rout;
3256:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3258:   /* forward solve the lower triangular */
3259:   idx    = 5*(*r++);
3260:   t[0] = b[idx];   t[1] = b[1+idx];
3261:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3262:   for (i=1; i<n; i++) {
3263:     v     = aa + 25*ai[i];
3264:     vi    = aj + ai[i];
3265:     nz    = diag[i] - ai[i];
3266:     idx   = 5*(*r++);
3267:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3268:     s5  = b[4+idx];
3269:     while (nz--) {
3270:       idx   = 5*(*vi++);
3271:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3272:       x4    = t[3+idx];x5 = t[4+idx];
3273:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3274:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3275:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3276:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3277:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3278:       v += 25;
3279:     }
3280:     idx = 5*i;
3281:     t[idx]   = s1;t[1+idx] = s2;
3282:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3283:   }
3284:   /* backward solve the upper triangular */
3285:   for (i=n-1; i>=0; i--){
3286:     v    = aa + 25*diag[i] + 25;
3287:     vi   = aj + diag[i] + 1;
3288:     nz   = ai[i+1] - diag[i] - 1;
3289:     idt  = 5*i;
3290:     s1 = t[idt];  s2 = t[1+idt];
3291:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3292:     while (nz--) {
3293:       idx   = 5*(*vi++);
3294:       x1    = t[idx];   x2 = t[1+idx];
3295:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3296:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3297:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3298:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3299:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3300:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3301:       v += 25;
3302:     }
3303:     idc = 5*(*c--);
3304:     v   = aa + 25*diag[i];
3305:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3306:                                  v[15]*s4+v[20]*s5;
3307:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3308:                                  v[16]*s4+v[21]*s5;
3309:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3310:                                  v[17]*s4+v[22]*s5;
3311:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3312:                                  v[18]*s4+v[23]*s5;
3313:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3314:                                  v[19]*s4+v[24]*s5;
3315:   }

3317:   ISRestoreIndices(isrow,&rout);
3318:   ISRestoreIndices(iscol,&cout);
3319:   VecRestoreArray(bb,(PetscScalar**)&b);
3320:   VecRestoreArray(xx,&x);
3321:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3322:   return(0);
3323: }

3327: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3328: {
3329:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3330:   IS                iscol=a->col,isrow=a->row;
3331:   PetscErrorCode    ierr;
3332:   const PetscInt    *r,*c,*rout,*cout;
3333:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3334:   PetscInt          i,nz,idx,idt,idc,m;
3335:   const MatScalar   *aa=a->a,*v;
3336:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3337:   const PetscScalar *b;

3340:   VecGetArray(bb,(PetscScalar**)&b);
3341:   VecGetArray(xx,&x);
3342:   t  = a->solve_work;

3344:   ISGetIndices(isrow,&rout); r = rout;
3345:   ISGetIndices(iscol,&cout); c = cout;

3347:   /* forward solve the lower triangular */
3348:   idx    = 5*r[0];
3349:   t[0] = b[idx];   t[1] = b[1+idx];
3350:   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3351:   for (i=1; i<n; i++) {
3352:     v     = aa + 25*ai[i];
3353:     vi    = aj + ai[i];
3354:     nz    = ai[i+1] - ai[i];
3355:     idx   = 5*r[i];
3356:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3357:     s5  = b[4+idx];
3358:     for(m=0;m<nz;m++){
3359:       idx   = 5*vi[m];
3360:       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3361:       x4    = t[3+idx];x5 = t[4+idx];
3362:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3367:       v += 25;
3368:     }
3369:     idx = 5*i;
3370:     t[idx]   = s1;t[1+idx] = s2;
3371:     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3372:   }
3373:   /* backward solve the upper triangular */
3374:   for (i=n-1; i>=0; i--){
3375:     v    = aa + 25*(adiag[i+1]+1);
3376:     vi   = aj + adiag[i+1]+1;
3377:     nz   = adiag[i] - adiag[i+1] - 1;
3378:     idt  = 5*i;
3379:     s1 = t[idt];  s2 = t[1+idt];
3380:     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3381:     for(m=0;m<nz;m++){
3382:       idx   = 5*vi[m];
3383:       x1    = t[idx];   x2 = t[1+idx];
3384:       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3385:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3386:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3387:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3388:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3389:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3390:       v += 25;
3391:     }
3392:     idc = 5*c[i];
3393:     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3394:                                  v[15]*s4+v[20]*s5;
3395:     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3396:                                  v[16]*s4+v[21]*s5;
3397:     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3398:                                  v[17]*s4+v[22]*s5;
3399:     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3400:                                  v[18]*s4+v[23]*s5;
3401:     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3402:                                  v[19]*s4+v[24]*s5;
3403:   }

3405:   ISRestoreIndices(isrow,&rout);
3406:   ISRestoreIndices(iscol,&cout);
3407:   VecRestoreArray(bb,(PetscScalar**)&b);
3408:   VecRestoreArray(xx,&x);
3409:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3410:   return(0);
3411: }

3415: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3416: {
3417:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3418:   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3419:   PetscInt          i,nz,idx,idt,jdx;
3420:   PetscErrorCode    ierr;
3421:   const MatScalar   *aa=a->a,*v;
3422:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3423:   const PetscScalar *b;

3426:   VecGetArray(bb,(PetscScalar**)&b);
3427:   VecGetArray(xx,&x);
3428:   /* forward solve the lower triangular */
3429:   idx    = 0;
3430:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3431:   for (i=1; i<n; i++) {
3432:     v     =  aa + 25*ai[i];
3433:     vi    =  aj + ai[i];
3434:     nz    =  diag[i] - ai[i];
3435:     idx   =  5*i;
3436:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3437:     while (nz--) {
3438:       jdx   = 5*(*vi++);
3439:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3440:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3441:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3442:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3443:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3444:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3445:       v    += 25;
3446:     }
3447:     x[idx]   = s1;
3448:     x[1+idx] = s2;
3449:     x[2+idx] = s3;
3450:     x[3+idx] = s4;
3451:     x[4+idx] = s5;
3452:   }
3453:   /* backward solve the upper triangular */
3454:   for (i=n-1; i>=0; i--){
3455:     v    = aa + 25*diag[i] + 25;
3456:     vi   = aj + diag[i] + 1;
3457:     nz   = ai[i+1] - diag[i] - 1;
3458:     idt  = 5*i;
3459:     s1 = x[idt];  s2 = x[1+idt];
3460:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3461:     while (nz--) {
3462:       idx   = 5*(*vi++);
3463:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3464:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3465:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3466:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3467:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3468:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3469:       v    += 25;
3470:     }
3471:     v        = aa + 25*diag[i];
3472:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3473:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3474:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3475:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3476:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3477:   }

3479:   VecRestoreArray(bb,(PetscScalar**)&b);
3480:   VecRestoreArray(xx,&x);
3481:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3482:   return(0);
3483: }

3487: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3488: {
3489:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3490:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3491:   PetscInt          i,k,nz,idx,idt,jdx;
3492:   PetscErrorCode    ierr;
3493:   const MatScalar   *aa=a->a,*v;
3494:   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3495:   const PetscScalar *b;

3498:   VecGetArray(bb,(PetscScalar**)&b);
3499:   VecGetArray(xx,&x);
3500:   /* forward solve the lower triangular */
3501:   idx    = 0;
3502:   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3503:   for (i=1; i<n; i++) {
3504:     v   = aa + 25*ai[i];
3505:     vi  = aj + ai[i];
3506:     nz  = ai[i+1] - ai[i];
3507:     idx = 5*i;
3508:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3509:     for(k=0;k<nz;k++) {
3510:       jdx   = 5*vi[k];
3511:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3512:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3513:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3514:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3515:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3516:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3517:       v    += 25;
3518:     }
3519:     x[idx]   = s1;
3520:     x[1+idx] = s2;
3521:     x[2+idx] = s3;
3522:     x[3+idx] = s4;
3523:     x[4+idx] = s5;
3524:   }

3526:   /* backward solve the upper triangular */
3527:   for (i=n-1; i>=0; i--){
3528:     v   = aa + 25*(adiag[i+1]+1);
3529:     vi  = aj + adiag[i+1]+1;
3530:     nz  = adiag[i] - adiag[i+1]-1;
3531:     idt = 5*i;
3532:     s1 = x[idt];  s2 = x[1+idt];
3533:     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3534:     for(k=0;k<nz;k++){
3535:       idx   = 5*vi[k];
3536:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3537:       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3538:       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3539:       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3540:       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3541:       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3542:       v    += 25;
3543:     }
3544:     /* x = inv_diagonal*x */
3545:     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3546:     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3547:     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3548:     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3549:     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3550:   }

3552:   VecRestoreArray(bb,(PetscScalar**)&b);
3553:   VecRestoreArray(xx,&x);
3554:   PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3555:   return(0);
3556: }

3560: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3561: {
3562:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3563:   IS                iscol=a->col,isrow=a->row;
3564:   PetscErrorCode    ierr;
3565:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3566:   PetscInt          i,nz,idx,idt,idc;
3567:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3568:   const MatScalar   *aa=a->a,*v;
3569:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3570:   const PetscScalar *b;

3573:   VecGetArray(bb,(PetscScalar**)&b);
3574:   VecGetArray(xx,&x);
3575:   t  = a->solve_work;

3577:   ISGetIndices(isrow,&rout); r = rout;
3578:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3580:   /* forward solve the lower triangular */
3581:   idx    = 4*(*r++);
3582:   t[0] = b[idx];   t[1] = b[1+idx];
3583:   t[2] = b[2+idx]; t[3] = b[3+idx];
3584:   for (i=1; i<n; i++) {
3585:     v     = aa + 16*ai[i];
3586:     vi    = aj + ai[i];
3587:     nz    = diag[i] - ai[i];
3588:     idx   = 4*(*r++);
3589:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3590:     while (nz--) {
3591:       idx   = 4*(*vi++);
3592:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3593:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3594:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3595:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3596:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3597:       v    += 16;
3598:     }
3599:     idx        = 4*i;
3600:     t[idx]   = s1;t[1+idx] = s2;
3601:     t[2+idx] = s3;t[3+idx] = s4;
3602:   }
3603:   /* backward solve the upper triangular */
3604:   for (i=n-1; i>=0; i--){
3605:     v    = aa + 16*diag[i] + 16;
3606:     vi   = aj + diag[i] + 1;
3607:     nz   = ai[i+1] - diag[i] - 1;
3608:     idt  = 4*i;
3609:     s1 = t[idt];  s2 = t[1+idt];
3610:     s3 = t[2+idt];s4 = t[3+idt];
3611:     while (nz--) {
3612:       idx   = 4*(*vi++);
3613:       x1    = t[idx];   x2 = t[1+idx];
3614:       x3    = t[2+idx]; x4 = t[3+idx];
3615:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3616:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3617:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3618:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3619:       v += 16;
3620:     }
3621:     idc      = 4*(*c--);
3622:     v        = aa + 16*diag[i];
3623:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3624:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3625:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3626:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3627:   }

3629:   ISRestoreIndices(isrow,&rout);
3630:   ISRestoreIndices(iscol,&cout);
3631:   VecRestoreArray(bb,(PetscScalar**)&b);
3632:   VecRestoreArray(xx,&x);
3633:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3634:   return(0);
3635: }

3639: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3640: {
3641:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3642:   IS                iscol=a->col,isrow=a->row;
3643:   PetscErrorCode    ierr;
3644:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3645:   PetscInt          i,nz,idx,idt,idc,m;
3646:   const PetscInt    *r,*c,*rout,*cout;
3647:   const MatScalar   *aa=a->a,*v;
3648:   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3649:   const PetscScalar *b;

3652:   VecGetArray(bb,(PetscScalar**)&b);
3653:   VecGetArray(xx,&x);
3654:   t  = a->solve_work;

3656:   ISGetIndices(isrow,&rout); r = rout;
3657:   ISGetIndices(iscol,&cout); c = cout;

3659:   /* forward solve the lower triangular */
3660:   idx    = 4*r[0];
3661:   t[0] = b[idx];   t[1] = b[1+idx];
3662:   t[2] = b[2+idx]; t[3] = b[3+idx];
3663:   for (i=1; i<n; i++) {
3664:     v     = aa + 16*ai[i];
3665:     vi    = aj + ai[i];
3666:     nz    = ai[i+1] - ai[i];
3667:     idx   = 4*r[i];
3668:     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3669:     for(m=0;m<nz;m++){
3670:       idx   = 4*vi[m];
3671:       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3672:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3673:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3674:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3675:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3676:       v    += 16;
3677:     }
3678:     idx        = 4*i;
3679:     t[idx]   = s1;t[1+idx] = s2;
3680:     t[2+idx] = s3;t[3+idx] = s4;
3681:   }
3682:   /* backward solve the upper triangular */
3683:   for (i=n-1; i>=0; i--){
3684:     v    = aa + 16*(adiag[i+1]+1);
3685:     vi   = aj + adiag[i+1]+1;
3686:     nz   = adiag[i] - adiag[i+1] - 1;
3687:     idt  = 4*i;
3688:     s1 = t[idt];  s2 = t[1+idt];
3689:     s3 = t[2+idt];s4 = t[3+idt];
3690:     for(m=0;m<nz;m++){
3691:       idx   = 4*vi[m];
3692:       x1    = t[idx];   x2 = t[1+idx];
3693:       x3    = t[2+idx]; x4 = t[3+idx];
3694:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3695:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3696:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3697:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3698:       v += 16;
3699:     }
3700:     idc      = 4*c[i];
3701:     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3702:     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3703:     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3704:     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3705:   }

3707:   ISRestoreIndices(isrow,&rout);
3708:   ISRestoreIndices(iscol,&cout);
3709:   VecRestoreArray(bb,(PetscScalar**)&b);
3710:   VecRestoreArray(xx,&x);
3711:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3712:   return(0);
3713: }

3717: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3718: {
3719:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3720:   IS                iscol=a->col,isrow=a->row;
3721:   PetscErrorCode    ierr;
3722:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3723:   PetscInt          i,nz,idx,idt,idc;
3724:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3725:   const MatScalar   *aa=a->a,*v;
3726:   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3727:   PetscScalar       *x;
3728:   const PetscScalar *b;

3731:   VecGetArray(bb,(PetscScalar**)&b);
3732:   VecGetArray(xx,&x);
3733:   t  = (MatScalar *)a->solve_work;

3735:   ISGetIndices(isrow,&rout); r = rout;
3736:   ISGetIndices(iscol,&cout); c = cout + (n-1);

3738:   /* forward solve the lower triangular */
3739:   idx    = 4*(*r++);
3740:   t[0] = (MatScalar)b[idx];
3741:   t[1] = (MatScalar)b[1+idx];
3742:   t[2] = (MatScalar)b[2+idx];
3743:   t[3] = (MatScalar)b[3+idx];
3744:   for (i=1; i<n; i++) {
3745:     v     = aa + 16*ai[i];
3746:     vi    = aj + ai[i];
3747:     nz    = diag[i] - ai[i];
3748:     idx   = 4*(*r++);
3749:     s1 = (MatScalar)b[idx];
3750:     s2 = (MatScalar)b[1+idx];
3751:     s3 = (MatScalar)b[2+idx];
3752:     s4 = (MatScalar)b[3+idx];
3753:     while (nz--) {
3754:       idx   = 4*(*vi++);
3755:       x1  = t[idx];
3756:       x2  = t[1+idx];
3757:       x3  = t[2+idx];
3758:       x4  = t[3+idx];
3759:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3760:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3761:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3762:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3763:       v    += 16;
3764:     }
3765:     idx        = 4*i;
3766:     t[idx]   = s1;
3767:     t[1+idx] = s2;
3768:     t[2+idx] = s3;
3769:     t[3+idx] = s4;
3770:   }
3771:   /* backward solve the upper triangular */
3772:   for (i=n-1; i>=0; i--){
3773:     v    = aa + 16*diag[i] + 16;
3774:     vi   = aj + diag[i] + 1;
3775:     nz   = ai[i+1] - diag[i] - 1;
3776:     idt  = 4*i;
3777:     s1 = t[idt];
3778:     s2 = t[1+idt];
3779:     s3 = t[2+idt];
3780:     s4 = t[3+idt];
3781:     while (nz--) {
3782:       idx   = 4*(*vi++);
3783:       x1  = t[idx];
3784:       x2  = t[1+idx];
3785:       x3  = t[2+idx];
3786:       x4  = t[3+idx];
3787:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3788:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3789:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3790:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3791:       v += 16;
3792:     }
3793:     idc      = 4*(*c--);
3794:     v        = aa + 16*diag[i];
3795:     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3796:     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3797:     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3798:     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3799:     x[idc]   = (PetscScalar)t[idt];
3800:     x[1+idc] = (PetscScalar)t[1+idt];
3801:     x[2+idc] = (PetscScalar)t[2+idt];
3802:     x[3+idc] = (PetscScalar)t[3+idt];
3803:  }

3805:   ISRestoreIndices(isrow,&rout);
3806:   ISRestoreIndices(iscol,&cout);
3807:   VecRestoreArray(bb,(PetscScalar**)&b);
3808:   VecRestoreArray(xx,&x);
3809:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3810:   return(0);
3811: }

3813: #if defined (PETSC_HAVE_SSE)

3815: #include PETSC_HAVE_SSE

3819: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3820: {
3821:   /* 
3822:      Note: This code uses demotion of double
3823:      to float when performing the mixed-mode computation.
3824:      This may not be numerically reasonable for all applications.
3825:   */
3826:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3827:   IS             iscol=a->col,isrow=a->row;
3829:   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3830:   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3831:   MatScalar      *aa=a->a,*v;
3832:   PetscScalar    *x,*b,*t;

3834:   /* Make space in temp stack for 16 Byte Aligned arrays */
3835:   float           ssealignedspace[11],*tmps,*tmpx;
3836:   unsigned long   offset;
3837: 
3839:   SSE_SCOPE_BEGIN;

3841:     offset = (unsigned long)ssealignedspace % 16;
3842:     if (offset) offset = (16 - offset)/4;
3843:     tmps = &ssealignedspace[offset];
3844:     tmpx = &ssealignedspace[offset+4];
3845:     PREFETCH_NTA(aa+16*ai[1]);

3847:     VecGetArray(bb,&b);
3848:     VecGetArray(xx,&x);
3849:     t  = a->solve_work;

3851:     ISGetIndices(isrow,&rout); r = rout;
3852:     ISGetIndices(iscol,&cout); c = cout + (n-1);

3854:     /* forward solve the lower triangular */
3855:     idx  = 4*(*r++);
3856:     t[0] = b[idx];   t[1] = b[1+idx];
3857:     t[2] = b[2+idx]; t[3] = b[3+idx];
3858:     v    =  aa + 16*ai[1];

3860:     for (i=1; i<n;) {
3861:       PREFETCH_NTA(&v[8]);
3862:       vi   =  aj      + ai[i];
3863:       nz   =  diag[i] - ai[i];
3864:       idx  =  4*(*r++);

3866:       /* Demote sum from double to float */
3867:       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3868:       LOAD_PS(tmps,XMM7);

3870:       while (nz--) {
3871:         PREFETCH_NTA(&v[16]);
3872:         idx = 4*(*vi++);
3873: 
3874:         /* Demote solution (so far) from double to float */
3875:         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);

3877:         /* 4x4 Matrix-Vector product with negative accumulation: */
3878:         SSE_INLINE_BEGIN_2(tmpx,v)
3879:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

3881:           /* First Column */
3882:           SSE_COPY_PS(XMM0,XMM6)
3883:           SSE_SHUFFLE(XMM0,XMM0,0x00)
3884:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3885:           SSE_SUB_PS(XMM7,XMM0)
3886: 
3887:           /* Second Column */
3888:           SSE_COPY_PS(XMM1,XMM6)
3889:           SSE_SHUFFLE(XMM1,XMM1,0x55)
3890:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3891:           SSE_SUB_PS(XMM7,XMM1)
3892: 
3893:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3894: 
3895:           /* Third Column */
3896:           SSE_COPY_PS(XMM2,XMM6)
3897:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3898:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3899:           SSE_SUB_PS(XMM7,XMM2)

3901:           /* Fourth Column */
3902:           SSE_COPY_PS(XMM3,XMM6)
3903:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3904:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3905:           SSE_SUB_PS(XMM7,XMM3)
3906:         SSE_INLINE_END_2
3907: 
3908:         v  += 16;
3909:       }
3910:       idx = 4*i;
3911:       v   = aa + 16*ai[++i];
3912:       PREFETCH_NTA(v);
3913:       STORE_PS(tmps,XMM7);

3915:       /* Promote result from float to double */
3916:       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3917:     }
3918:     /* backward solve the upper triangular */
3919:     idt  = 4*(n-1);
3920:     ai16 = 16*diag[n-1];
3921:     v    = aa + ai16 + 16;
3922:     for (i=n-1; i>=0;){
3923:       PREFETCH_NTA(&v[8]);
3924:       vi = aj + diag[i] + 1;
3925:       nz = ai[i+1] - diag[i] - 1;
3926: 
3927:       /* Demote accumulator from double to float */
3928:       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3929:       LOAD_PS(tmps,XMM7);

3931:       while (nz--) {
3932:         PREFETCH_NTA(&v[16]);
3933:         idx = 4*(*vi++);

3935:         /* Demote solution (so far) from double to float */
3936:         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

3938:         /* 4x4 Matrix-Vector Product with negative accumulation: */
3939:         SSE_INLINE_BEGIN_2(tmpx,v)
3940:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

3942:           /* First Column */
3943:           SSE_COPY_PS(XMM0,XMM6)
3944:           SSE_SHUFFLE(XMM0,XMM0,0x00)
3945:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3946:           SSE_SUB_PS(XMM7,XMM0)

3948:           /* Second Column */
3949:           SSE_COPY_PS(XMM1,XMM6)
3950:           SSE_SHUFFLE(XMM1,XMM1,0x55)
3951:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3952:           SSE_SUB_PS(XMM7,XMM1)

3954:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3955: 
3956:           /* Third Column */
3957:           SSE_COPY_PS(XMM2,XMM6)
3958:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3959:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3960:           SSE_SUB_PS(XMM7,XMM2)

3962:           /* Fourth Column */
3963:           SSE_COPY_PS(XMM3,XMM6)
3964:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3965:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3966:           SSE_SUB_PS(XMM7,XMM3)
3967:         SSE_INLINE_END_2
3968:         v  += 16;
3969:       }
3970:       v    = aa + ai16;
3971:       ai16 = 16*diag[--i];
3972:       PREFETCH_NTA(aa+ai16+16);
3973:       /* 
3974:          Scale the result by the diagonal 4x4 block, 
3975:          which was inverted as part of the factorization
3976:       */
3977:       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3978:         /* First Column */
3979:         SSE_COPY_PS(XMM0,XMM7)
3980:         SSE_SHUFFLE(XMM0,XMM0,0x00)
3981:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

3983:         /* Second Column */
3984:         SSE_COPY_PS(XMM1,XMM7)
3985:         SSE_SHUFFLE(XMM1,XMM1,0x55)
3986:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3987:         SSE_ADD_PS(XMM0,XMM1)

3989:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3990: 
3991:         /* Third Column */
3992:         SSE_COPY_PS(XMM2,XMM7)
3993:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3994:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3995:         SSE_ADD_PS(XMM0,XMM2)

3997:         /* Fourth Column */
3998:         SSE_COPY_PS(XMM3,XMM7)
3999:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4000:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4001:         SSE_ADD_PS(XMM0,XMM3)
4002: 
4003:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4004:       SSE_INLINE_END_3

4006:       /* Promote solution from float to double */
4007:       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

4009:       /* Apply reordering to t and stream into x.    */
4010:       /* This way, x doesn't pollute the cache.      */
4011:       /* Be careful with size: 2 doubles = 4 floats! */
4012:       idc  = 4*(*c--);
4013:       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4014:         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
4015:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4016:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4017:         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4018:         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4019:         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4020:       SSE_INLINE_END_2
4021:       v    = aa + ai16 + 16;
4022:       idt -= 4;
4023:     }

4025:     ISRestoreIndices(isrow,&rout);
4026:     ISRestoreIndices(iscol,&cout);
4027:     VecRestoreArray(bb,&b);
4028:     VecRestoreArray(xx,&x);
4029:     PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4030:   SSE_SCOPE_END;
4031:   return(0);
4032: }

4034: #endif


4037: /*
4038:       Special case where the matrix was ILU(0) factored in the natural
4039:    ordering. This eliminates the need for the column and row permutation.
4040: */
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - Computes xx from bb with a forward
   sweep followed by a backward sweep over the block rows stored in A, using 4x4
   blocks and no row/column permutation.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz are read)
           bb - right-hand side vector
   Output: xx - solution vector; its array also holds the intermediate forward-sweep result

   Returns 0.

   Depending on the PETSC_USE_FORTRAN_KERNEL_* macros the work is handed to a
   Fortran routine; otherwise the C loops below are used.

   Layout read by the C loops: every block is 16 consecutive MatScalar values and
   row r of a block-times-vector product uses v[r], v[r+4], v[r+8], v[r+12].
   In block row i the entries ai[i]..diag[i]-1 feed the forward sweep, entries
   diag[i]+1..ai[i+1]-1 feed the backward sweep, and the block at diag[i] is
   multiplied into the result at the end of the backward sweep.

   The first block of b is copied unconditionally, so n >= 1 is assumed.
*/
4043: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4044: {
4045:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4046:   PetscInt          n=a->mbs;
4047:   const PetscInt    *ai=a->i,*aj=a->j;
4048:   PetscErrorCode    ierr;
4049:   const PetscInt    *diag = a->diag;
4050:   const MatScalar   *aa=a->a;
4051:   PetscScalar       *x;
4052:   const PetscScalar *b;

4055:   VecGetArray(bb,(PetscScalar**)&b);
4056:   VecGetArray(xx,&x);

4058: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4059:   {
4060:     static PetscScalar w[2000]; /* FIXME: fixed-size static scratch array, shared by every call; its size is not checked here */
4061:     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4062:   }
4063: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4064:   {
4065:     static PetscScalar w[2000]; /* FIXME: fixed-size static scratch array, shared by every call; its size is not checked here */
4066:     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4067:   }
4068: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4069:   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4070: #else
4071:   {
4072:     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4073:     const MatScalar *v;
4074:     PetscInt        jdx,idt,idx,nz,i,ai16;
4075:     const PetscInt  *vi;

4077:   /* forward solve the lower triangular */
4078:   idx    = 0;
  /* block row 0 has no entries to subtract: x block 0 is b block 0 */
4079:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4080:   for (i=1; i<n; i++) {
4081:     v     =  aa      + 16*ai[i];
4082:     vi    =  aj      + ai[i];
4083:     nz    =  diag[i] - ai[i];
4084:     idx   +=  4;
4085:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4086:     while (nz--) {
      /* subtract (block at v) * (already computed x block with block column *vi) */
4087:       jdx   = 4*(*vi++);
4088:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4089:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4090:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4091:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4092:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4093:       v    += 16;
4094:     }
4095:     x[idx]   = s1;
4096:     x[1+idx] = s2;
4097:     x[2+idx] = s3;
4098:     x[3+idx] = s4;
4099:   }
4100:   /* backward solve the upper triangular */
4101:   idt = 4*(n-1);
4102:   for (i=n-1; i>=0; i--){
4103:     ai16 = 16*diag[i];
    /* start one block past the diagonal entry of row i */
4104:     v    = aa + ai16 + 16;
4105:     vi   = aj + diag[i] + 1;
4106:     nz   = ai[i+1] - diag[i] - 1;
4107:     s1 = x[idt];  s2 = x[1+idt];
4108:     s3 = x[2+idt];s4 = x[3+idt];
4109:     while (nz--) {
4110:       idx   = 4*(*vi++);
4111:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4112:       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4113:       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4114:       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4115:       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4116:       v    += 16;
4117:     }
    /* multiply by the block stored at diag[i] (presumably the inverted diagonal block; confirm against the factorization) */
4118:     v        = aa + ai16;
4119:     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4120:     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4121:     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4122:     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4123:     idt -= 4;
4124:   }
4125:   }
4126: #endif

4128:   VecRestoreArray(bb,(PetscScalar**)&b);
4129:   VecRestoreArray(xx,&x);
4130:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4131:   return(0);
4132: }

/*
   MatSolve_SeqBAIJ_4_NaturalOrdering - Forward/backward block triangular solve of
   bb into xx with no permutation, reading the upper-triangular part through
   a->diag (adiag) rather than through the row pointers.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ
           bb - right-hand side vector
   Output: xx - solution vector, also used to hold the forward-sweep result

   Returns 0.

   Forward sweep: every entry ai[i]..ai[i+1]-1 of block row i is applied.
   Backward sweep: block row i uses entries adiag[i+1]+1..adiag[i]-1 (so
   adiag[i+1] < adiag[i] is expected); after that loop v points at entry
   adiag[i], which is the block multiplied into the result.

   Strides come from bs (A->rmap->bs) and bs2 (a->bs2), but the arithmetic is
   unrolled for four components and reads v[0..15], so bs == 4 and bs2 == 16
   are assumed.  The first block of b is copied unconditionally (n >= 1 assumed).
*/
4136: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4137: {
4138:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4139:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4140:     PetscInt          i,k,nz,idx,jdx,idt;
4141:     PetscErrorCode    ierr;
4142:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4143:     const MatScalar   *aa=a->a,*v;
4144:     PetscScalar       *x;
4145:     const PetscScalar *b;
4146:     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

4149:     VecGetArray(bb,(PetscScalar**)&b);
4150:     VecGetArray(xx,&x);
4151:     /* forward solve the lower triangular */
4152:     idx    = 0;
4153:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4154:     for (i=1; i<n; i++) {
4155:        v    = aa + bs2*ai[i];
4156:        vi   = aj + ai[i];
4157:        nz   = ai[i+1] - ai[i];
4158:       idx   = bs*i;
4159:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4160:       for(k=0;k<nz;k++) {
          /* subtract (block at v) * (x block with block column vi[k]) */
4161:           jdx   = bs*vi[k];
4162:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4163:           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4164:           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4165:           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4166:           s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4167: 
4168:           v   +=  bs2;
4169:         }

4171:        x[idx]   = s1;
4172:        x[1+idx] = s2;
4173:        x[2+idx] = s3;
4174:        x[3+idx] = s4;
4175:     }
4176: 
4177:    /* backward solve the upper triangular */
4178:   for (i=n-1; i>=0; i--){
4179:     v   = aa + bs2*(adiag[i+1]+1);
4180:      vi  = aj + adiag[i+1]+1;
4181:      nz  = adiag[i] - adiag[i+1]-1;
4182:      idt = bs*i;
4183:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4184: 
4185:     for(k=0;k<nz;k++){
4186:       idx   = bs*vi[k];
4187:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4188:        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4189:        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4190:        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4191:        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

4193:         v   +=  bs2;
4194:     }
    /* v has advanced nz blocks from adiag[i+1]+1 and now addresses entry adiag[i] */
4195:     /* x = inv_diagonal*x */
4196:    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4197:    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4198:    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4199:    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;

4201:   }

4203:   VecRestoreArray(bb,(PetscScalar**)&b);
4204:   VecRestoreArray(xx,&x);
4205:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4206:   return(0);
4207: }

/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Forward/backward 4x4 block
   triangular solve of bb into xx with no permutation, where the running sums and
   the forward-sweep result are held as MatScalar and the final values are stored
   as PetscScalar.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz are read)
           bb - right-hand side vector
   Output: xx - solution vector

   Returns 0.

   The forward sweep writes through t, a MatScalar view of the same memory as x.
   The backward sweep reads its starting sums from t, reads already finished
   blocks from x, and stores each finished block into x.
   NOTE(review): this in-place scheme presumably relies on MatScalar being no
   wider than PetscScalar, so that storing x block i never overwrites a t block
   that has not been consumed yet - confirm for the configured types.

   The first block of b is copied unconditionally, so n >= 1 is assumed.
*/
4211: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4212: {
4213:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4214:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4215:   PetscErrorCode    ierr;
4216:   const MatScalar   *aa=a->a;
4217:   const PetscScalar *b;
4218:   PetscScalar       *x;

4221:   VecGetArray(bb,(PetscScalar**)&b);
4222:   VecGetArray(xx,&x);

4224:   {
4225:     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4226:     const MatScalar  *v;
    /* t and x address the same storage; t indexes it in units of MatScalar */
4227:     MatScalar        *t=(MatScalar *)x;
4228:     PetscInt         jdx,idt,idx,nz,i,ai16;
4229:     const PetscInt   *vi;

4231:     /* forward solve the lower triangular */
4232:     idx  = 0;
4233:     t[0] = (MatScalar)b[0];
4234:     t[1] = (MatScalar)b[1];
4235:     t[2] = (MatScalar)b[2];
4236:     t[3] = (MatScalar)b[3];
4237:     for (i=1; i<n; i++) {
4238:       v     =  aa      + 16*ai[i];
4239:       vi    =  aj      + ai[i];
4240:       nz    =  diag[i] - ai[i];
4241:       idx   +=  4;
4242:       s1 = (MatScalar)b[idx];
4243:       s2 = (MatScalar)b[1+idx];
4244:       s3 = (MatScalar)b[2+idx];
4245:       s4 = (MatScalar)b[3+idx];
4246:       while (nz--) {
4247:         jdx = 4*(*vi++);
4248:         x1  = t[jdx];
4249:         x2  = t[1+jdx];
4250:         x3  = t[2+jdx];
4251:         x4  = t[3+jdx];
4252:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4253:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4254:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4255:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4256:         v    += 16;
4257:       }
4258:       t[idx]   = s1;
4259:       t[1+idx] = s2;
4260:       t[2+idx] = s3;
4261:       t[3+idx] = s4;
4262:     }
4263:     /* backward solve the upper triangular */
4264:     idt = 4*(n-1);
4265:     for (i=n-1; i>=0; i--){
4266:       ai16 = 16*diag[i];
4267:       v    = aa + ai16 + 16;
4268:       vi   = aj + diag[i] + 1;
4269:       nz   = ai[i+1] - diag[i] - 1;
      /* the forward-sweep result of this block row is still held in t */
4270:       s1   = t[idt];
4271:       s2   = t[1+idt];
4272:       s3   = t[2+idt];
4273:       s4   = t[3+idt];
4274:       while (nz--) {
        /* blocks referenced here were finished earlier in this sweep and are read from x */
4275:         idx = 4*(*vi++);
4276:         x1  = (MatScalar)x[idx];
4277:         x2  = (MatScalar)x[1+idx];
4278:         x3  = (MatScalar)x[2+idx];
4279:         x4  = (MatScalar)x[3+idx];
4280:         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4281:         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4282:         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4283:         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4284:         v    += 16;
4285:       }
      /* multiply by the block stored at diag[i] and store the finished block into x */
4286:       v        = aa + ai16;
4287:       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4288:       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4289:       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4290:       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4291:       idt -= 4;
4292:     }
4293:   }

4295:   VecRestoreArray(bb,(PetscScalar**)&b);
4296:   VecRestoreArray(xx,&x);
4297:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4298:   return(0);
4299: }

4301: #if defined (PETSC_HAVE_SSE)

4303: #include PETSC_HAVE_SSE
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - Forward/backward 4x4 block
   triangular solve of bb into xx with no permutation, written with the SSE macros
   and reading the block-column indices a->j as unsigned short.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz are read)
           bb - right-hand side vector
   Output: xx - solution vector

   Returns 0.

   Both sweeps work on t, a MatScalar view of the storage of x; a final loop
   converts t to PetscScalar in place, last block first.  The SSE_*, LOAD_PS,
   STORE_PS, PREFETCH_NTA and CONVERT_* names are macros whose expansion is not
   visible in this file section; XMM7 is used as the running accumulator between
   the LOAD_PS and the following STORE_PS / SSE_STORE_PS.

   ai[1] and the first block of b are read unconditionally, so n >= 1 is assumed.
*/
4306: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4307: {
4308:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4309:   unsigned short *aj=(unsigned short *)a->j;
4311:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4312:   MatScalar      *aa=a->a;
4313:   PetscScalar    *x,*b;

4316:   SSE_SCOPE_BEGIN;
4317:   /* 
4318:      Note: This code currently uses demotion of double
4319:      to float when performing the mixed-mode computation.
4320:      This may not be numerically reasonable for all applications.
4321:   */
4322:   PREFETCH_NTA(aa+16*ai[1]);

4324:   VecGetArray(bb,&b);
4325:   VecGetArray(xx,&x);
4326:   {
4327:     /* x will first be computed in single precision then promoted inplace to double */
4328:     MatScalar      *v,*t=(MatScalar *)x;
4329:     int            nz,i,idt,ai16;
4330:     unsigned int   jdx,idx;
4331:     unsigned short *vi;
4332:     /* Forward solve the lower triangular factor. */

4334:     /* First block is the identity. */
4335:     idx  = 0;
4336:     CONVERT_DOUBLE4_FLOAT4(t,b);
4337:     v    =  aa + 16*((unsigned int)ai[1]);

    /* i is advanced inside the body (ai[++i]) so the next row's values can be addressed before the store */
4339:     for (i=1; i<n;) {
4340:       PREFETCH_NTA(&v[8]);
4341:       vi   =  aj      + ai[i];
4342:       nz   =  diag[i] - ai[i];
4343:       idx +=  4;

4345:       /* Demote RHS from double to float. */
4346:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4347:       LOAD_PS(&t[idx],XMM7);

4349:       while (nz--) {
4350:         PREFETCH_NTA(&v[16]);
4351:         jdx = 4*((unsigned int)(*vi++));
4352: 
4353:         /* 4x4 Matrix-Vector product with negative accumulation: */
4354:         SSE_INLINE_BEGIN_2(&t[jdx],v)
4355:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4357:           /* First Column */
4358:           SSE_COPY_PS(XMM0,XMM6)
4359:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4360:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4361:           SSE_SUB_PS(XMM7,XMM0)

4363:           /* Second Column */
4364:           SSE_COPY_PS(XMM1,XMM6)
4365:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4366:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4367:           SSE_SUB_PS(XMM7,XMM1)

4369:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4370: 
4371:           /* Third Column */
4372:           SSE_COPY_PS(XMM2,XMM6)
4373:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4374:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4375:           SSE_SUB_PS(XMM7,XMM2)

4377:           /* Fourth Column */
4378:           SSE_COPY_PS(XMM3,XMM6)
4379:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4380:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4381:           SSE_SUB_PS(XMM7,XMM3)
4382:         SSE_INLINE_END_2
4383: 
4384:         v  += 16;
4385:       }
4386:       v    =  aa + 16*ai[++i];
4387:       PREFETCH_NTA(v);
4388:       STORE_PS(&t[idx],XMM7);
4389:     }

4391:     /* Backward solve the upper triangular factor.*/

4393:     idt  = 4*(n-1);
4394:     ai16 = 16*diag[n-1];
4395:     v    = aa + ai16 + 16;
4396:     for (i=n-1; i>=0;){
4397:       PREFETCH_NTA(&v[8]);
4398:       vi = aj + diag[i] + 1;
4399:       nz = ai[i+1] - diag[i] - 1;
4400: 
4401:       LOAD_PS(&t[idt],XMM7);

4403:       while (nz--) {
4404:         PREFETCH_NTA(&v[16]);
4405:         idx = 4*((unsigned int)(*vi++));

4407:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4408:         SSE_INLINE_BEGIN_2(&t[idx],v)
4409:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4411:           /* First Column */
4412:           SSE_COPY_PS(XMM0,XMM6)
4413:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4414:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4415:           SSE_SUB_PS(XMM7,XMM0)

4417:           /* Second Column */
4418:           SSE_COPY_PS(XMM1,XMM6)
4419:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4420:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4421:           SSE_SUB_PS(XMM7,XMM1)

4423:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4424: 
4425:           /* Third Column */
4426:           SSE_COPY_PS(XMM2,XMM6)
4427:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4428:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4429:           SSE_SUB_PS(XMM7,XMM2)

4431:           /* Fourth Column */
4432:           SSE_COPY_PS(XMM3,XMM6)
4433:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4434:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4435:           SSE_SUB_PS(XMM7,XMM3)
4436:         SSE_INLINE_END_2
4437:         v  += 16;
4438:       }
4439:       v    = aa + ai16;
      /* Step to the previous block row, but look up its diagonal offset only when that row
         exists, so diag[-1] is never read.  On the last pass ai16 keeps its old value, which
         from here on only feeds prefetch addresses and the no-longer-used next v. */
4440:       if (--i >= 0) ai16 = 16*diag[i];
4441:       PREFETCH_NTA(aa+ai16+16);
4442:       /* 
4443:          Scale the result by the diagonal 4x4 block, 
4444:          which was inverted as part of the factorization
4445:       */
4446:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4447:         /* First Column */
4448:         SSE_COPY_PS(XMM0,XMM7)
4449:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4450:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4452:         /* Second Column */
4453:         SSE_COPY_PS(XMM1,XMM7)
4454:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4455:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4456:         SSE_ADD_PS(XMM0,XMM1)

4458:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4459: 
4460:         /* Third Column */
4461:         SSE_COPY_PS(XMM2,XMM7)
4462:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4463:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4464:         SSE_ADD_PS(XMM0,XMM2)

4466:         /* Fourth Column */
4467:         SSE_COPY_PS(XMM3,XMM7)
4468:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4469:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4470:         SSE_ADD_PS(XMM0,XMM3)

4472:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4473:       SSE_INLINE_END_3

4475:       v    = aa + ai16 + 16;
4476:       idt -= 4;
4477:     }

4479:     /* Convert t from single precision back to double precision (inplace)*/
4480:     idt = 4*(n-1);
4481:     for (i=n-1;i>=0;i--) {
4482:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4483:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      /* x and t share storage, so each block is widened from its last component down to its first */
4484:       PetscScalar *xtemp=&x[idt];
4485:       MatScalar   *ttemp=&t[idt];
4486:       xtemp[3] = (PetscScalar)ttemp[3];
4487:       xtemp[2] = (PetscScalar)ttemp[2];
4488:       xtemp[1] = (PetscScalar)ttemp[1];
4489:       xtemp[0] = (PetscScalar)ttemp[0];
4490:       idt -= 4;
4491:     }

4493:   } /* End of artificial scope. */
4494:   VecRestoreArray(bb,&b);
4495:   VecRestoreArray(xx,&x);
4496:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4497:   SSE_SCOPE_END;
4498:   return(0);
4499: }

/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - Forward/backward 4x4 block
   triangular solve of bb into xx with no permutation, written with the SSE macros
   and reading the block-column indices a->j as int.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz are read)
           bb - right-hand side vector
   Output: xx - solution vector

   Returns 0.

   Both sweeps work on t, a MatScalar view of the storage of x; a final loop
   converts t to PetscScalar in place, last block first.  The SSE_*, LOAD_PS,
   STORE_PS, PREFETCH_NTA and CONVERT_* names are macros whose expansion is not
   visible in this file section; XMM7 is used as the running accumulator between
   the LOAD_PS and the following STORE_PS / SSE_STORE_PS.

   ai[1] and the first block of b are read unconditionally, so n >= 1 is assumed.
*/
4503: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4504: {
4505:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4506:   int            *aj=a->j;
4508:   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4509:   MatScalar      *aa=a->a;
4510:   PetscScalar    *x,*b;

4513:   SSE_SCOPE_BEGIN;
4514:   /* 
4515:      Note: This code currently uses demotion of double
4516:      to float when performing the mixed-mode computation.
4517:      This may not be numerically reasonable for all applications.
4518:   */
4519:   PREFETCH_NTA(aa+16*ai[1]);

4521:   VecGetArray(bb,&b);
4522:   VecGetArray(xx,&x);
4523:   {
4524:     /* x will first be computed in single precision then promoted inplace to double */
4525:     MatScalar *v,*t=(MatScalar *)x;
4526:     int       nz,i,idt,ai16;
4527:     int       jdx,idx;
4528:     int       *vi;
4529:     /* Forward solve the lower triangular factor. */

4531:     /* First block is the identity. */
4532:     idx  = 0;
4533:     CONVERT_DOUBLE4_FLOAT4(t,b);
4534:     v    =  aa + 16*ai[1];

    /* i is advanced inside the body (ai[++i]) so the next row's values can be addressed before the store */
4536:     for (i=1; i<n;) {
4537:       PREFETCH_NTA(&v[8]);
4538:       vi   =  aj      + ai[i];
4539:       nz   =  diag[i] - ai[i];
4540:       idx +=  4;

4542:       /* Demote RHS from double to float. */
4543:       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4544:       LOAD_PS(&t[idx],XMM7);

4546:       while (nz--) {
4547:         PREFETCH_NTA(&v[16]);
4548:         jdx = 4*(*vi++);
4549: /*          jdx = *vi++; */
4550: 
4551:         /* 4x4 Matrix-Vector product with negative accumulation: */
4552:         SSE_INLINE_BEGIN_2(&t[jdx],v)
4553:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4555:           /* First Column */
4556:           SSE_COPY_PS(XMM0,XMM6)
4557:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4558:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4559:           SSE_SUB_PS(XMM7,XMM0)

4561:           /* Second Column */
4562:           SSE_COPY_PS(XMM1,XMM6)
4563:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4564:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4565:           SSE_SUB_PS(XMM7,XMM1)

4567:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4568: 
4569:           /* Third Column */
4570:           SSE_COPY_PS(XMM2,XMM6)
4571:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4572:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4573:           SSE_SUB_PS(XMM7,XMM2)

4575:           /* Fourth Column */
4576:           SSE_COPY_PS(XMM3,XMM6)
4577:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4578:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4579:           SSE_SUB_PS(XMM7,XMM3)
4580:         SSE_INLINE_END_2
4581: 
4582:         v  += 16;
4583:       }
4584:       v    =  aa + 16*ai[++i];
4585:       PREFETCH_NTA(v);
4586:       STORE_PS(&t[idx],XMM7);
4587:     }

4589:     /* Backward solve the upper triangular factor.*/

4591:     idt  = 4*(n-1);
4592:     ai16 = 16*diag[n-1];
4593:     v    = aa + ai16 + 16;
4594:     for (i=n-1; i>=0;){
4595:       PREFETCH_NTA(&v[8]);
4596:       vi = aj + diag[i] + 1;
4597:       nz = ai[i+1] - diag[i] - 1;
4598: 
4599:       LOAD_PS(&t[idt],XMM7);

4601:       while (nz--) {
4602:         PREFETCH_NTA(&v[16]);
4603:         idx = 4*(*vi++);
4604: /*          idx = *vi++; */

4606:         /* 4x4 Matrix-Vector Product with negative accumulation: */
4607:         SSE_INLINE_BEGIN_2(&t[idx],v)
4608:           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

4610:           /* First Column */
4611:           SSE_COPY_PS(XMM0,XMM6)
4612:           SSE_SHUFFLE(XMM0,XMM0,0x00)
4613:           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4614:           SSE_SUB_PS(XMM7,XMM0)

4616:           /* Second Column */
4617:           SSE_COPY_PS(XMM1,XMM6)
4618:           SSE_SHUFFLE(XMM1,XMM1,0x55)
4619:           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4620:           SSE_SUB_PS(XMM7,XMM1)

4622:           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4623: 
4624:           /* Third Column */
4625:           SSE_COPY_PS(XMM2,XMM6)
4626:           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4627:           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4628:           SSE_SUB_PS(XMM7,XMM2)

4630:           /* Fourth Column */
4631:           SSE_COPY_PS(XMM3,XMM6)
4632:           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4633:           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4634:           SSE_SUB_PS(XMM7,XMM3)
4635:         SSE_INLINE_END_2
4636:         v  += 16;
4637:       }
4638:       v    = aa + ai16;
      /* Step to the previous block row, but look up its diagonal offset only when that row
         exists, so diag[-1] is never read.  On the last pass ai16 keeps its old value, which
         from here on only feeds prefetch addresses and the no-longer-used next v. */
4639:       if (--i >= 0) ai16 = 16*diag[i];
4640:       PREFETCH_NTA(aa+ai16+16);
4641:       /* 
4642:          Scale the result by the diagonal 4x4 block, 
4643:          which was inverted as part of the factorization
4644:       */
4645:       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4646:         /* First Column */
4647:         SSE_COPY_PS(XMM0,XMM7)
4648:         SSE_SHUFFLE(XMM0,XMM0,0x00)
4649:         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

4651:         /* Second Column */
4652:         SSE_COPY_PS(XMM1,XMM7)
4653:         SSE_SHUFFLE(XMM1,XMM1,0x55)
4654:         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4655:         SSE_ADD_PS(XMM0,XMM1)

4657:         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4658: 
4659:         /* Third Column */
4660:         SSE_COPY_PS(XMM2,XMM7)
4661:         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4662:         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4663:         SSE_ADD_PS(XMM0,XMM2)

4665:         /* Fourth Column */
4666:         SSE_COPY_PS(XMM3,XMM7)
4667:         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4668:         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4669:         SSE_ADD_PS(XMM0,XMM3)

4671:         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4672:       SSE_INLINE_END_3

4674:       v    = aa + ai16 + 16;
4675:       idt -= 4;
4676:     }

4678:     /* Convert t from single precision back to double precision (inplace)*/
4679:     idt = 4*(n-1);
4680:     for (i=n-1;i>=0;i--) {
4681:       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4682:       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      /* x and t share storage, so each block is widened from its last component down to its first */
4683:       PetscScalar *xtemp=&x[idt];
4684:       MatScalar   *ttemp=&t[idt];
4685:       xtemp[3] = (PetscScalar)ttemp[3];
4686:       xtemp[2] = (PetscScalar)ttemp[2];
4687:       xtemp[1] = (PetscScalar)ttemp[1];
4688:       xtemp[0] = (PetscScalar)ttemp[0];
4689:       idt -= 4;
4690:     }

4692:   } /* End of artificial scope. */
4693:   VecRestoreArray(bb,&b);
4694:   VecRestoreArray(xx,&x);
4695:   PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4696:   SSE_SCOPE_END;
4697:   return(0);
4698: }

4700: #endif

/*
   MatSolve_SeqBAIJ_3_inplace - Forward/backward 3x3 block triangular solve of bb
   into xx, applying the row index set a->row to the right-hand side and the
   column index set a->col to the solution.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz,
                row, col and solve_work are read)
           bb - right-hand side vector
   Output: xx - solution vector

   Returns 0.

   The sweeps run on the work array t = a->solve_work.  Block row i of the
   forward sweep starts from the b block selected by the i-th row index; the
   backward sweep stores each finished block both in t and in the x block
   selected by the column indices, which are walked from the last to the first.

   Every block is 9 consecutive MatScalar values and row r of a block-times-vector
   product uses v[r], v[r+3], v[r+6].  In block row i the entries ai[i]..diag[i]-1
   feed the forward sweep, diag[i]+1..ai[i+1]-1 feed the backward sweep, and the
   block at diag[i] is multiplied into the result.

   The first row index and b block are read unconditionally (n >= 1 assumed).
*/
4704: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4705: {
4706:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4707:   IS                iscol=a->col,isrow=a->row;
4708:   PetscErrorCode    ierr;
4709:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4710:   PetscInt          i,nz,idx,idt,idc;
4711:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4712:   const MatScalar   *aa=a->a,*v;
4713:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4714:   const PetscScalar *b;

4717:   VecGetArray(bb,(PetscScalar**)&b);
4718:   VecGetArray(xx,&x);
4719:   t  = a->solve_work;

  /* r walks the row indices forward; c starts at the last column index and walks backward */
4721:   ISGetIndices(isrow,&rout); r = rout;
4722:   ISGetIndices(iscol,&cout); c = cout + (n-1);

4724:   /* forward solve the lower triangular */
4725:   idx    = 3*(*r++);
4726:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4727:   for (i=1; i<n; i++) {
4728:     v     = aa + 9*ai[i];
4729:     vi    = aj + ai[i];
4730:     nz    = diag[i] - ai[i];
4731:     idx   = 3*(*r++);
4732:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4733:     while (nz--) {
4734:       idx   = 3*(*vi++);
4735:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4736:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4737:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4738:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4739:       v += 9;
4740:     }
    /* the result is stored at the unpermuted position of block row i */
4741:     idx = 3*i;
4742:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4743:   }
4744:   /* backward solve the upper triangular */
4745:   for (i=n-1; i>=0; i--){
4746:     v    = aa + 9*diag[i] + 9;
4747:     vi   = aj + diag[i] + 1;
4748:     nz   = ai[i+1] - diag[i] - 1;
4749:     idt  = 3*i;
4750:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4751:     while (nz--) {
4752:       idx   = 3*(*vi++);
4753:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4754:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4755:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4756:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4757:       v += 9;
4758:     }
    /* multiply by the block at diag[i]; keep the result in t for later rows and scatter it to x */
4759:     idc = 3*(*c--);
4760:     v   = aa + 9*diag[i];
4761:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4762:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4763:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4764:   }
4765:   ISRestoreIndices(isrow,&rout);
4766:   ISRestoreIndices(iscol,&cout);
4767:   VecRestoreArray(bb,(PetscScalar**)&b);
4768:   VecRestoreArray(xx,&x);
4769:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4770:   return(0);
4771: }

/*
   MatSolve_SeqBAIJ_3 - Forward/backward 3x3 block triangular solve of bb into xx,
   applying the row index set a->row to the right-hand side and the column index
   set a->col to the solution, and reading the upper-triangular part through
   a->diag (adiag) rather than through the row pointers.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz,
                row, col and solve_work are read)
           bb - right-hand side vector
   Output: xx - solution vector

   Returns 0.

   The sweeps run on the work array t = a->solve_work.
   Forward sweep: block row i starts from the b block at 3*r[i] and applies every
   entry ai[i]..ai[i+1]-1.
   Backward sweep: block row i uses entries adiag[i+1]+1..adiag[i]-1 (so
   adiag[i+1] < adiag[i] is expected); after that loop v addresses entry adiag[i],
   which is the block multiplied into the result.  The finished block is kept in
   t and also stored in the x block at 3*c[i].

   Every block is 9 consecutive MatScalar values and row r of a block-times-vector
   product uses v[r], v[r+3], v[r+6].  r[0] and the matching b block are read
   unconditionally (n >= 1 assumed).
*/
4775: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4776: {
4777:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4778:   IS                iscol=a->col,isrow=a->row;
4779:   PetscErrorCode    ierr;
4780:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4781:   PetscInt          i,nz,idx,idt,idc,m;
4782:   const PetscInt    *r,*c,*rout,*cout;
4783:   const MatScalar   *aa=a->a,*v;
4784:   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4785:   const PetscScalar *b;

4788:   VecGetArray(bb,(PetscScalar**)&b);
4789:   VecGetArray(xx,&x);
4790:   t  = a->solve_work;

4792:   ISGetIndices(isrow,&rout); r = rout;
4793:   ISGetIndices(iscol,&cout); c = cout;

4795:   /* forward solve the lower triangular */
4796:   idx    = 3*r[0];
4797:   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4798:   for (i=1; i<n; i++) {
4799:     v     = aa + 9*ai[i];
4800:     vi    = aj + ai[i];
4801:     nz    = ai[i+1] - ai[i];
4802:     idx   = 3*r[i];
4803:     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4804:     for(m=0;m<nz;m++){
4805:       idx   = 3*vi[m];
4806:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4807:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4808:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4809:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4810:       v += 9;
4811:     }
    /* the result is stored at the unpermuted position of block row i */
4812:     idx = 3*i;
4813:     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4814:   }
4815:   /* backward solve the upper triangular */
4816:   for (i=n-1; i>=0; i--){
4817:     v    = aa + 9*(adiag[i+1]+1);
4818:     vi   = aj + adiag[i+1]+1;
4819:     nz   = adiag[i] - adiag[i+1] - 1;
4820:     idt  = 3*i;
4821:     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4822:     for(m=0;m<nz;m++){
4823:       idx   = 3*vi[m];
4824:       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4825:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4826:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4827:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4828:       v += 9;
4829:     }
    /* v has advanced nz blocks and now addresses entry adiag[i]; multiply by it and scatter to x */
4830:     idc = 3*c[i];
4831:     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4832:     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4833:     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4834:   }
4835:   ISRestoreIndices(isrow,&rout);
4836:   ISRestoreIndices(iscol,&cout);
4837:   VecRestoreArray(bb,(PetscScalar**)&b);
4838:   VecRestoreArray(xx,&x);
4839:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4840:   return(0);
4841: }

4843: /*
4844:       Special case where the matrix was ILU(0) factored in the natural
4845:    ordering. This eliminates the need for the column and row permutation.
4846: */
/*
   MatSolve_SeqBAIJ_3_NaturalOrdering_inplace - Forward/backward 3x3 block
   triangular solve of bb into xx with no row/column permutation; the array of xx
   also holds the intermediate forward-sweep result.

   Input:  A  - matrix whose data pointer is a Mat_SeqBAIJ (i, j, a, diag, mbs, nz are read)
           bb - right-hand side vector
   Output: xx - solution vector

   Returns 0.

   Every block is 9 consecutive MatScalar values and row r of a block-times-vector
   product uses v[r], v[r+3], v[r+6].  In block row i the entries ai[i]..diag[i]-1
   feed the forward sweep, diag[i]+1..ai[i+1]-1 feed the backward sweep, and the
   block at diag[i] is multiplied into the result.

   The first block of b is copied unconditionally, so n >= 1 is assumed.
*/
4849: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4850: {
4851:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4852:   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4853:   PetscErrorCode    ierr;
4854:   const PetscInt    *diag = a->diag,*vi;
4855:   const MatScalar   *aa=a->a,*v;
4856:   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4857:   const PetscScalar *b;
4858:   PetscInt          jdx,idt,idx,nz,i;

4861:   VecGetArray(bb,(PetscScalar**)&b);
4862:   VecGetArray(xx,&x);

4864:   /* forward solve the lower triangular */
4865:   idx    = 0;
  /* block row 0 has no entries to subtract: x block 0 is b block 0 */
4866:   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4867:   for (i=1; i<n; i++) {
4868:     v     =  aa      + 9*ai[i];
4869:     vi    =  aj      + ai[i];
4870:     nz    =  diag[i] - ai[i];
4871:     idx   +=  3;
4872:     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4873:     while (nz--) {
      /* subtract (block at v) * (already computed x block with block column *vi) */
4874:       jdx   = 3*(*vi++);
4875:       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4876:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4879:       v    += 9;
4880:     }
4881:     x[idx]   = s1;
4882:     x[1+idx] = s2;
4883:     x[2+idx] = s3;
4884:   }
4885:   /* backward solve the upper triangular */
4886:   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal entry of row i */
4887:     v    = aa + 9*diag[i] + 9;
4888:     vi   = aj + diag[i] + 1;
4889:     nz   = ai[i+1] - diag[i] - 1;
4890:     idt  = 3*i;
4891:     s1 = x[idt];  s2 = x[1+idt];
4892:     s3 = x[2+idt];
4893:     while (nz--) {
4894:       idx   = 3*(*vi++);
4895:       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4896:       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4897:       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4898:       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4899:       v    += 9;
4900:     }
    /* multiply by the block stored at diag[i] (presumably the inverted diagonal block; confirm against the factorization) */
4901:     v        = aa +  9*diag[i];
4902:     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4903:     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4904:     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4905:   }

4907:   VecRestoreArray(bb,(PetscScalar**)&b);
4908:   VecRestoreArray(xx,&x);
4909:   PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4910:   return(0);
4911: }

4915: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4916: {
4917:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4918:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4919:     PetscErrorCode    ierr;
4920:     PetscInt          i,k,nz,idx,jdx,idt;
4921:     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4922:     const MatScalar   *aa=a->a,*v;
4923:     PetscScalar       *x;
4924:     const PetscScalar *b;
4925:     PetscScalar        s1,s2,s3,x1,x2,x3;

4928:     VecGetArray(bb,(PetscScalar**)&b);
4929:     VecGetArray(xx,&x);
4930:     /* forward solve the lower triangular */
4931:     idx    = 0;
4932:     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4933:     for (i=1; i<n; i++) {
4934:        v    = aa + bs2*ai[i];
4935:        vi   = aj + ai[i];
4936:        nz   = ai[i+1] - ai[i];
4937:       idx   = bs*i;
4938:        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939:       for(k=0;k<nz;k++){
4940:          jdx   = bs*vi[k];
4941:           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4942:           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943:           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944:           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945: 
4946:           v   +=  bs2;
4947:         }

4949:        x[idx]   = s1;
4950:        x[1+idx] = s2;
4951:        x[2+idx] = s3;
4952:     }
4953: 
4954:    /* backward solve the upper triangular */
4955:   for (i=n-1; i>=0; i--){
4956:     v   = aa + bs2*(adiag[i+1]+1);
4957:      vi  = aj + adiag[i+1]+1;
4958:      nz  = adiag[i] - adiag[i+1]-1;
4959:      idt = bs*i;
4960:      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4961: 
4962:      for(k=0;k<nz;k++){
4963:        idx   = bs*vi[k];
4964:        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4965:        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966:        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967:        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;

4969:         v   +=  bs2;
4970:     }
4971:     /* x = inv_diagonal*x */
4972:    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4973:    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4974:    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;

4976:   }

4978:   VecRestoreArray(bb,(PetscScalar**)&b);
4979:   VecRestoreArray(xx,&x);
4980:   PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4981:   return(0);
4982: }

4986: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4987: {
4988:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4989:   IS                iscol=a->col,isrow=a->row;
4990:   PetscErrorCode    ierr;
4991:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4992:   PetscInt          i,nz,idx,idt,idc;
4993:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4994:   const MatScalar   *aa=a->a,*v;
4995:   PetscScalar       *x,s1,s2,x1,x2,*t;
4996:   const PetscScalar *b;

4999:   VecGetArray(bb,(PetscScalar**)&b);
5000:   VecGetArray(xx,&x);
5001:   t  = a->solve_work;

5003:   ISGetIndices(isrow,&rout); r = rout;
5004:   ISGetIndices(iscol,&cout); c = cout + (n-1);

5006:   /* forward solve the lower triangular */
5007:   idx    = 2*(*r++);
5008:   t[0] = b[idx]; t[1] = b[1+idx];
5009:   for (i=1; i<n; i++) {
5010:     v     = aa + 4*ai[i];
5011:     vi    = aj + ai[i];
5012:     nz    = diag[i] - ai[i];
5013:     idx   = 2*(*r++);
5014:     s1  = b[idx]; s2 = b[1+idx];
5015:     while (nz--) {
5016:       idx   = 2*(*vi++);
5017:       x1    = t[idx]; x2 = t[1+idx];
5018:       s1 -= v[0]*x1 + v[2]*x2;
5019:       s2 -= v[1]*x1 + v[3]*x2;
5020:       v += 4;
5021:     }
5022:     idx = 2*i;
5023:     t[idx] = s1; t[1+idx] = s2;
5024:   }
5025:   /* backward solve the upper triangular */
5026:   for (i=n-1; i>=0; i--){
5027:     v    = aa + 4*diag[i] + 4;
5028:     vi   = aj + diag[i] + 1;
5029:     nz   = ai[i+1] - diag[i] - 1;
5030:     idt  = 2*i;
5031:     s1 = t[idt]; s2 = t[1+idt];
5032:     while (nz--) {
5033:       idx   = 2*(*vi++);
5034:       x1    = t[idx]; x2 = t[1+idx];
5035:       s1 -= v[0]*x1 + v[2]*x2;
5036:       s2 -= v[1]*x1 + v[3]*x2;
5037:       v += 4;
5038:     }
5039:     idc = 2*(*c--);
5040:     v   = aa + 4*diag[i];
5041:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5042:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5043:   }
5044:   ISRestoreIndices(isrow,&rout);
5045:   ISRestoreIndices(iscol,&cout);
5046:   VecRestoreArray(bb,(PetscScalar**)&b);
5047:   VecRestoreArray(xx,&x);
5048:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5049:   return(0);
5050: }

5054: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5055: {
5056:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5057:   IS                iscol=a->col,isrow=a->row;
5058:   PetscErrorCode    ierr;
5059:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5060:   PetscInt          i,nz,idx,jdx,idt,idc,m;
5061:   const PetscInt    *r,*c,*rout,*cout;
5062:   const MatScalar   *aa=a->a,*v;
5063:   PetscScalar       *x,s1,s2,x1,x2,*t;
5064:   const PetscScalar *b;

5067:   VecGetArray(bb,(PetscScalar**)&b);
5068:   VecGetArray(xx,&x);
5069:   t  = a->solve_work;

5071:   ISGetIndices(isrow,&rout); r = rout;
5072:   ISGetIndices(iscol,&cout); c = cout;

5074:   /* forward solve the lower triangular */
5075:   idx    = 2*r[0];
5076:   t[0] = b[idx]; t[1] = b[1+idx];
5077:   for (i=1; i<n; i++) {
5078:     v     = aa + 4*ai[i];
5079:     vi    = aj + ai[i];
5080:     nz    = ai[i+1] - ai[i];
5081:     idx   = 2*r[i];
5082:     s1  = b[idx]; s2 = b[1+idx];
5083:     for(m=0;m<nz;m++){
5084:       jdx   = 2*vi[m];
5085:       x1    = t[jdx]; x2 = t[1+jdx];
5086:       s1 -= v[0]*x1 + v[2]*x2;
5087:       s2 -= v[1]*x1 + v[3]*x2;
5088:       v += 4;
5089:     }
5090:     idx = 2*i;
5091:     t[idx] = s1; t[1+idx] = s2;
5092:   }
5093:   /* backward solve the upper triangular */
5094:   for (i=n-1; i>=0; i--){
5095:     v    = aa + 4*(adiag[i+1]+1);
5096:     vi   = aj + adiag[i+1]+1;
5097:     nz   = adiag[i] - adiag[i+1] - 1;
5098:     idt  = 2*i;
5099:     s1 = t[idt]; s2 = t[1+idt];
5100:     for(m=0;m<nz;m++){
5101:       idx   = 2*vi[m];
5102:       x1    = t[idx]; x2 = t[1+idx];
5103:       s1 -= v[0]*x1 + v[2]*x2;
5104:       s2 -= v[1]*x1 + v[3]*x2;
5105:       v += 4;
5106:     }
5107:     idc = 2*c[i];
5108:     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5109:     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5110:   }
5111:   ISRestoreIndices(isrow,&rout);
5112:   ISRestoreIndices(iscol,&cout);
5113:   VecRestoreArray(bb,(PetscScalar**)&b);
5114:   VecRestoreArray(xx,&x);
5115:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5116:   return(0);
5117: }

5119: /*
5120:       Special case where the matrix was ILU(0) factored in the natural
5121:    ordering. This eliminates the need for the column and row permutation.
5122: */
5125: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5126: {
5127:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5128:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5129:   PetscErrorCode    ierr;
5130:   const MatScalar   *aa=a->a,*v;
5131:   PetscScalar       *x,s1,s2,x1,x2;
5132:   const PetscScalar *b;
5133:   PetscInt          jdx,idt,idx,nz,i;

5136:   VecGetArray(bb,(PetscScalar**)&b);
5137:   VecGetArray(xx,&x);

5139:   /* forward solve the lower triangular */
5140:   idx    = 0;
5141:   x[0]   = b[0]; x[1] = b[1];
5142:   for (i=1; i<n; i++) {
5143:     v     =  aa      + 4*ai[i];
5144:     vi    =  aj      + ai[i];
5145:     nz    =  diag[i] - ai[i];
5146:     idx   +=  2;
5147:     s1  =  b[idx];s2 = b[1+idx];
5148:     while (nz--) {
5149:       jdx   = 2*(*vi++);
5150:       x1    = x[jdx];x2 = x[1+jdx];
5151:       s1 -= v[0]*x1 + v[2]*x2;
5152:       s2 -= v[1]*x1 + v[3]*x2;
5153:       v    += 4;
5154:     }
5155:     x[idx]   = s1;
5156:     x[1+idx] = s2;
5157:   }
5158:   /* backward solve the upper triangular */
5159:   for (i=n-1; i>=0; i--){
5160:     v    = aa + 4*diag[i] + 4;
5161:     vi   = aj + diag[i] + 1;
5162:     nz   = ai[i+1] - diag[i] - 1;
5163:     idt  = 2*i;
5164:     s1 = x[idt];  s2 = x[1+idt];
5165:     while (nz--) {
5166:       idx   = 2*(*vi++);
5167:       x1    = x[idx];   x2 = x[1+idx];
5168:       s1 -= v[0]*x1 + v[2]*x2;
5169:       s2 -= v[1]*x1 + v[3]*x2;
5170:       v    += 4;
5171:     }
5172:     v        = aa +  4*diag[i];
5173:     x[idt]   = v[0]*s1 + v[2]*s2;
5174:     x[1+idt] = v[1]*s1 + v[3]*s2;
5175:   }

5177:   VecRestoreArray(bb,(PetscScalar**)&b);
5178:   VecRestoreArray(xx,&x);
5179:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5180:   return(0);
5181: }

5185: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5186: {
5187:     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5188:     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5189:     PetscInt          i,k,nz,idx,idt,jdx;
5190:     PetscErrorCode    ierr;
5191:     const MatScalar   *aa=a->a,*v;
5192:     PetscScalar       *x,s1,s2,x1,x2;
5193:     const PetscScalar *b;
5194: 
5196:     VecGetArray(bb,(PetscScalar**)&b);
5197:     VecGetArray(xx,&x);
5198:     /* forward solve the lower triangular */
5199:     idx    = 0;
5200:     x[0] = b[idx]; x[1] = b[1+idx];
5201:     for (i=1; i<n; i++) {
5202:         v   = aa + 4*ai[i];
5203:        vi   = aj + ai[i];
5204:        nz   = ai[i+1] - ai[i];
5205:        idx  = 2*i;
5206:        s1   = b[idx];s2 = b[1+idx];
5207:       for(k=0;k<nz;k++){
5208:          jdx   = 2*vi[k];
5209:           x1    = x[jdx];x2 = x[1+jdx];
5210:           s1   -= v[0]*x1 + v[2]*x2;
5211:           s2   -= v[1]*x1 + v[3]*x2;
5212:            v   +=  4;
5213:         }
5214:        x[idx]   = s1;
5215:        x[1+idx] = s2;
5216:     }
5217: 
5218:    /* backward solve the upper triangular */
5219:   for (i=n-1; i>=0; i--){
5220:      v   = aa + 4*(adiag[i+1]+1);
5221:      vi  = aj + adiag[i+1]+1;
5222:      nz  = adiag[i] - adiag[i+1]-1;
5223:      idt = 2*i;
5224:      s1 = x[idt];  s2 = x[1+idt];
5225:      for(k=0;k<nz;k++){
5226:       idx   = 2*vi[k];
5227:        x1    = x[idx];   x2 = x[1+idx];
5228:        s1 -= v[0]*x1 + v[2]*x2;
5229:        s2 -= v[1]*x1 + v[3]*x2;
5230:          v    += 4;
5231:     }
5232:     /* x = inv_diagonal*x */
5233:    x[idt]   = v[0]*s1 + v[2]*s2;
5234:    x[1+idt] = v[1]*s1 + v[3]*s2;
5235:   }

5237:   VecRestoreArray(bb,(PetscScalar**)&b);
5238:   VecRestoreArray(xx,&x);
5239:   PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5240:   return(0);
5241: }

5245: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5246: {
5247:   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
5248:   IS                iscol=a->col,isrow=a->row;
5249:   PetscErrorCode    ierr;
5250:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5251:   PetscInt          i,nz;
5252:   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5253:   const MatScalar   *aa=a->a,*v;
5254:   PetscScalar       *x,s1,*t;
5255:   const PetscScalar *b;

5258:   if (!n) return(0);

5260:   VecGetArray(bb,(PetscScalar**)&b);
5261:   VecGetArray(xx,&x);
5262:   t  = a->solve_work;

5264:   ISGetIndices(isrow,&rout); r = rout;
5265:   ISGetIndices(iscol,&cout); c = cout + (n-1);

5267:   /* forward solve the lower triangular */
5268:   t[0] = b[*r++];
5269:   for (i=1; i<n; i++) {
5270:     v     = aa + ai[i];
5271:     vi    = aj + ai[i];
5272:     nz    = diag[i] - ai[i];
5273:     s1  = b[*r++];
5274:     while (nz--) {
5275:       s1 -= (*v++)*t[*vi++];
5276:     }
5277:     t[i] = s1;
5278:   }
5279:   /* backward solve the upper triangular */
5280:   for (i=n-1; i>=0; i--){
5281:     v    = aa + diag[i] + 1;
5282:     vi   = aj + diag[i] + 1;
5283:     nz   = ai[i+1] - diag[i] - 1;
5284:     s1 = t[i];
5285:     while (nz--) {
5286:       s1 -= (*v++)*t[*vi++];
5287:     }
5288:     x[*c--] = t[i] = aa[diag[i]]*s1;
5289:   }

5291:   ISRestoreIndices(isrow,&rout);
5292:   ISRestoreIndices(iscol,&cout);
5293:   VecRestoreArray(bb,(PetscScalar**)&b);
5294:   VecRestoreArray(xx,&x);
5295:   PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
5296:   return(0);
5297: }
5298: /*
5299:       Special case where the matrix was ILU(0) factored in the natural
5300:    ordering. This eliminates the need for the column and row permutation.
5301: */
5304: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5305: {
5306:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5307:   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5308:   PetscErrorCode    ierr;
5309:   const MatScalar   *aa=a->a,*v;
5310:   PetscScalar       *x;
5311:   const PetscScalar *b;
5312:   PetscScalar       s1,x1;
5313:   PetscInt          jdx,idt,idx,nz,i;

5316:   VecGetArray(bb,(PetscScalar**)&b);
5317:   VecGetArray(xx,&x);

5319:   /* forward solve the lower triangular */
5320:   idx    = 0;
5321:   x[0]   = b[0];
5322:   for (i=1; i<n; i++) {
5323:     v     =  aa      + ai[i];
5324:     vi    =  aj      + ai[i];
5325:     nz    =  diag[i] - ai[i];
5326:     idx   +=  1;
5327:     s1  =  b[idx];
5328:     while (nz--) {
5329:       jdx   = *vi++;
5330:       x1    = x[jdx];
5331:       s1 -= v[0]*x1;
5332:       v    += 1;
5333:     }
5334:     x[idx]   = s1;
5335:   }
5336:   /* backward solve the upper triangular */
5337:   for (i=n-1; i>=0; i--){
5338:     v    = aa + diag[i] + 1;
5339:     vi   = aj + diag[i] + 1;
5340:     nz   = ai[i+1] - diag[i] - 1;
5341:     idt  = i;
5342:     s1 = x[idt];
5343:     while (nz--) {
5344:       idx   = *vi++;
5345:       x1    = x[idx];
5346:       s1 -= v[0]*x1;
5347:       v    += 1;
5348:     }
5349:     v        = aa +  diag[i];
5350:     x[idt]   = v[0]*s1;
5351:   }
5352:   VecRestoreArray(bb,(PetscScalar**)&b);
5353:   VecRestoreArray(xx,&x);
5354:   PetscLogFlops(2.0*(a->nz) - A->cmap->n);
5355:   return(0);
5356: }

5358: /* ----------------------------------------------------------------*/
5359: EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);

5363: /*
5364:    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5365: */
5366: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5367: {
5368:   Mat             C=B;
5369:   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5370:   PetscErrorCode  ierr;
5371:   PetscInt        i,j,k,ipvt[15];
5372:   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5373:   PetscInt        nz,nzL,row;
5374:   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5375:   const MatScalar *v,*aa=a->a;
5376:   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
5377:   PetscInt        sol_ver;


5381:   PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);

5383:   /* generate work space needed by the factorization */
5384:   PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
5385:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));

5387:   for (i=0; i<n; i++){
5388:     /* zero rtmp */
5389:     /* L part */
5390:     nz    = bi[i+1] - bi[i];
5391:     bjtmp = bj + bi[i];
5392:     for  (j=0; j<nz; j++){
5393:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5394:     }

5396:     /* U part */
5397:     nz = bdiag[i] - bdiag[i+1];
5398:     bjtmp = bj + bdiag[i+1]+1;
5399:     for  (j=0; j<nz; j++){
5400:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5401:     }
5402: 
5403:     /* load in initial (unfactored row) */
5404:     nz    = ai[i+1] - ai[i];
5405:     ajtmp = aj + ai[i];
5406:     v     = aa + bs2*ai[i];
5407:     for (j=0; j<nz; j++) {
5408:       PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
5409:     }

5411:     /* elimination */
5412:     bjtmp = bj + bi[i];
5413:     nzL   = bi[i+1] - bi[i];
5414:     for(k=0;k < nzL;k++) {
5415:       row = bjtmp[k];
5416:       pc = rtmp + bs2*row;
5417:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5418:       if (flg) {
5419:         pv = b->a + bs2*bdiag[row];
5420:         Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5421:         /*Kernel_A_gets_A_times_B_15(pc,pv,mwork);*/
5422:         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5423:         pv = b->a + bs2*(bdiag[row+1]+1);
5424:         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5425:         for (j=0; j<nz; j++) {
5426:           vv   = rtmp + bs2*pj[j];
5427:           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5428:           /* Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
5429:           pv  += bs2;
5430:         }
5431:         PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5432:       }
5433:     }

5435:     /* finished row so stick it into b->a */
5436:     /* L part */
5437:     pv   = b->a + bs2*bi[i] ;
5438:     pj   = b->j + bi[i] ;
5439:     nz   = bi[i+1] - bi[i];
5440:     for (j=0; j<nz; j++) {
5441:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5442:     }

5444:     /* Mark diagonal and invert diagonal for simplier triangular solves */
5445:     pv   = b->a + bs2*bdiag[i];
5446:     pj   = b->j + bdiag[i];
5447:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5448:     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5449:     Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);
5450: 
5451:     /* U part */
5452:     pv = b->a + bs2*(bdiag[i+1]+1);
5453:     pj = b->j + bdiag[i+1]+1;
5454:     nz = bdiag[i] - bdiag[i+1] - 1;
5455:     for (j=0; j<nz; j++){
5456:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5457:     }
5458:   }

5460:   PetscFree2(rtmp,mwork);
5461:   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5462:   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5463:   C->assembled = PETSC_TRUE;
5464:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5465:   return(0);
5466: }

5470: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5471: {
5472:   Mat            C=B;
5473:   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5474:   IS             isrow = b->row,isicol = b->icol;
5476:   const PetscInt *r,*ic,*ics;
5477:   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5478:   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5479:   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5480:   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5481:   MatScalar      *v_work;
5482:   PetscTruth     col_identity,row_identity,both_identity;

5485:   ISGetIndices(isrow,&r);
5486:   ISGetIndices(isicol,&ic);
5487: 
5488:   PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);
5489:   PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5490:   ics  = ic;

5492:   /* generate work space needed by dense LU factorization */
5493:   PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);

5495:   for (i=0; i<n; i++){
5496:     /* zero rtmp */
5497:     /* L part */
5498:     nz    = bi[i+1] - bi[i];
5499:     bjtmp = bj + bi[i];
5500:     for  (j=0; j<nz; j++){
5501:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5502:     }

5504:     /* U part */
5505:     nz = bdiag[i] - bdiag[i+1];
5506:     bjtmp = bj + bdiag[i+1]+1;
5507:     for  (j=0; j<nz; j++){
5508:       PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5509:     }
5510: 
5511:     /* load in initial (unfactored row) */
5512:     nz    = ai[r[i]+1] - ai[r[i]];
5513:     ajtmp = aj + ai[r[i]];
5514:     v     = aa + bs2*ai[r[i]];
5515:     for (j=0; j<nz; j++) {
5516:       PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
5517:     }

5519:     /* elimination */
5520:     bjtmp = bj + bi[i];
5521:     nzL   = bi[i+1] - bi[i];
5522:     for(k=0;k < nzL;k++) {
5523:       row = bjtmp[k];
5524:       pc = rtmp + bs2*row;
5525:       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5526:       if (flg) {
5527:         pv         = b->a + bs2*bdiag[row];
5528:         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5529:         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5530:         pv         = b->a + bs2*(bdiag[row+1]+1);
5531:         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5532:         for (j=0; j<nz; j++) {
5533:           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5534:         }
5535:         PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5536:       }
5537:     }

5539:     /* finished row so stick it into b->a */
5540:     /* L part */
5541:     pv   = b->a + bs2*bi[i] ;
5542:     pj   = b->j + bi[i] ;
5543:     nz   = bi[i+1] - bi[i];
5544:     for (j=0; j<nz; j++) {
5545:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5546:     }

5548:     /* Mark diagonal and invert diagonal for simplier triangular solves */
5549:     pv  = b->a + bs2*bdiag[i];
5550:     pj  = b->j + bdiag[i];
5551:     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5552:     PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5553:     Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);
5554: 
5555:     /* U part */
5556:     pv = b->a + bs2*(bdiag[i+1]+1);
5557:     pj = b->j + bdiag[i+1]+1;
5558:     nz = bdiag[i] - bdiag[i+1] - 1;
5559:     for (j=0; j<nz; j++){
5560:       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5561:     }
5562:   }

5564:   PetscFree(rtmp);
5565:   PetscFree3(v_work,mwork,v_pivots);
5566:   ISRestoreIndices(isicol,&ic);
5567:   ISRestoreIndices(isrow,&r);

5569:   ISIdentity(isrow,&row_identity);
5570:   ISIdentity(isicol,&col_identity);
5571:   both_identity = (PetscTruth) (row_identity && col_identity);
5572:   if (both_identity){
5573:     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5574:   } else {
5575:     C->ops->solve = MatSolve_SeqBAIJ_N;
5576:   }
5577:   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5578: 
5579:   C->assembled = PETSC_TRUE;
5580:   PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5581:   return(0);
5582: }

5584: /* 
5585:    ilu(0) with natural ordering under new data structure.
5586:    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5587:    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5588: */

5592: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5593: {
5594: 
5595:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5596:   PetscErrorCode     ierr;
5597:   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5598:   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;

5601:   MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
5602:   b    = (Mat_SeqBAIJ*)(fact)->data;
5603: 
5604:   /* allocate matrix arrays for new data structure */
5605:   PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);
5606:   PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
5607:   b->singlemalloc = PETSC_TRUE;
5608:   if (!b->diag){
5609:     PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);
5610:     PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));
5611:   }
5612:   bdiag = b->diag;
5613: 
5614:   if (n > 0) {
5615:     PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
5616:   }
5617: 
5618:   /* set bi and bj with new data structure */
5619:   bi = b->i;
5620:   bj = b->j;

5622:   /* L part */
5623:   bi[0] = 0;
5624:   for (i=0; i<n; i++){
5625:     nz = adiag[i] - ai[i];
5626:     bi[i+1] = bi[i] + nz;
5627:     aj = a->j + ai[i];
5628:     for (j=0; j<nz; j++){
5629:       *bj = aj[j]; bj++;
5630:     }
5631:   }
5632: 
5633:   /* U part */
5634:   bi_temp = bi[n];
5635:   bdiag[n] = bi[n]-1;
5636:   for (i=n-1; i>=0; i--){
5637:     nz = ai[i+1] - adiag[i] - 1;
5638:     bi_temp = bi_temp + nz + 1;
5639:     aj = a->j + adiag[i] + 1;
5640:     for (j=0; j<nz; j++){
5641:       *bj = aj[j]; bj++;
5642:     }
5643:     /* diag[i] */
5644:     *bj = i; bj++;
5645:     bdiag[i] = bi_temp - 1;
5646:   }
5647:   return(0);
5648: }

/*
   MatILUFactorSymbolic_SeqBAIJ - Symbolic ILU(levels) factorization of a
   SeqBAIJ matrix: computes the block nonzero structure of the factor and
   installs it, together with the orderings and work space, into fact.

   Input Parameters:
     fact  - matrix that receives the factor structure
     A     - square matrix to factor; every diagonal block must be present
     isrow - row ordering
     iscol - column ordering (its inverse is computed and kept in b->icol)
     info  - options; fill, levels, diagonal_fill and pivotinblocks are read

   Notes:
   When levels is 0 and both orderings are the identity, the structure is
   produced by MatILUFactorSymbolic_SeqBAIJ_ilu0() and the routine returns
   early.  Otherwise each row is assembled in a sorted linked list, merged
   with the already computed pivot rows, and appended to a free-space list
   that is finally packed into bj.

   No numeric values are computed here; b->a is only allocated.
*/
5652: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5653: {
5654:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5655:   IS                 isicol;
5656:   PetscErrorCode     ierr;
5657:   const PetscInt     *r,*ic;
5658:   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5659:   PetscInt           *bi,*cols,nnz,*cols_lvl;
5660:   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5661:   PetscInt           i,levels,diagonal_fill;
5662:   PetscTruth         col_identity,row_identity,both_identity;
5663:   PetscReal          f;
5664:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5665:   PetscBT            lnkbt;
5666:   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5667:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5668:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5669:   PetscTruth         missing;
5670:   PetscInt           bs=A->rmap->bs,bs2=a->bs2;

/* reject non-square matrices and matrices with a missing diagonal entry */
5673:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5674:   MatMissingDiagonal(A,&missing,&d);
5675:   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

5677:   f             = info->fill;
5678:   levels        = (PetscInt)info->levels;
5679:   diagonal_fill = (PetscInt)info->diagonal_fill;
5680:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);

5682:   ISIdentity(isrow,&row_identity);
5683:   ISIdentity(iscol,&col_identity);
5684:   both_identity = (PetscTruth) (row_identity && col_identity);
5685: 
5686:   if (!levels && both_identity) {
5687:     /* special case: ilu(0) with natural ordering */
5688:     MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
5689:     MatSeqBAIJSetNumericFactorization(fact,both_identity);

5691:     fact->factor = MAT_FACTOR_ILU;
5692:     (fact)->info.factor_mallocs    = 0;
5693:     (fact)->info.fill_ratio_given  = info->fill;
5694:     (fact)->info.fill_ratio_needed = 1.0;
5695:     b                = (Mat_SeqBAIJ*)(fact)->data;
5696:     b->row           = isrow;
5697:     b->col           = iscol;
5698:     b->icol          = isicol;
/* the factor keeps isrow and iscol, so take a reference on each */
5699:     PetscObjectReference((PetscObject)isrow);
5700:     PetscObjectReference((PetscObject)iscol);
5701:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5702:     PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5703:     return(0);
5704:   }
5705: 
5706:   ISGetIndices(isrow,&r);
5707:   ISGetIndices(isicol,&ic);
5708: 
5709:   /* get new row pointers */
5710:   PetscMalloc((n+1)*sizeof(PetscInt),&bi);
5711:   bi[0] = 0;
5712:   /* bdiag is location of diagonal in factor */
5713:   PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);
5714:   bdiag[0]  = 0;

/* per-row pointers into the free-space arrays holding column indices and fill levels */
5716:   PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);

5718:   /* create a linked list for storing column indices of the active row */
5719:   nlnk = n + 1;
5720:   PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
5721: 
5722:   /* initial FreeSpace size is f*(ai[n]+1) */
5723:   PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
5724:   current_space = free_space;
5725:   PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
5726:   current_space_lvl = free_space_lvl;
5727: 
5728:   for (i=0; i<n; i++) {
5729:     nzi = 0;
5730:     /* copy current row into linked list */
5731:     nnz  = ai[r[i]+1] - ai[r[i]];
5732:     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5733:     cols = aj + ai[r[i]];
5734:     lnk[i] = -1; /* marker to indicate if diagonal exists */
5735:     PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
5736:     nzi += nlnk;

5738:     /* make sure diagonal entry is included */
5739:     if (diagonal_fill && lnk[i] == -1) {
/* walk the list from its head (index n) to the last entry smaller than i */
5740:       fm = n;
5741:       while (lnk[fm] < i) fm = lnk[fm];
5742:       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5743:       lnk[fm]    = i;
5744:       lnk_lvl[i] = 0;
5745:       nzi++; dcount++;
5746:     }

5748:     /* add pivot rows into the active row */
/* nzbd counts the list entries with column index below i that are visited */
5749:     nzbd = 0;
5750:     prow = lnk[n];
5751:     while (prow < i) {
5752:       nnz      = bdiag[prow];
5753:       cols     = bj_ptr[prow] + nnz + 1;
5754:       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5755:       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5756:       PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
5757:       nzi += nlnk;
5758:       prow = lnk[prow];
5759:       nzbd++;
5760:     }
5761:     bdiag[i] = nzbd;
5762:     bi[i+1]  = bi[i] + nzi;

5764:     /* if free space is not available, make more free space */
5765:     if (current_space->local_remaining<nzi) {
5766:       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5767:       PetscFreeSpaceGet(nnz,&current_space);
5768:       PetscFreeSpaceGet(nnz,&current_space_lvl);
5769:       reallocs++;
5770:     }

5772:     /* copy data into free_space and free_space_lvl, then initialize lnk */
5773:     PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
5774:     bj_ptr[i]    = current_space->array;
5775:     bjlvl_ptr[i] = current_space_lvl->array;

5777:     /* make sure the active row i has diagonal entry */
5778:     if (*(bj_ptr[i]+bdiag[i]) != i) {
5779:       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5780:     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5781:     }

/* advance both free-space cursors past the row just stored */
5783:     current_space->array           += nzi;
5784:     current_space->local_used      += nzi;
5785:     current_space->local_remaining -= nzi;
5786:     current_space_lvl->array           += nzi;
5787:     current_space_lvl->local_used      += nzi;
5788:     current_space_lvl->local_remaining -= nzi;
5789:   }
5790: 
5791:   ISRestoreIndices(isrow,&r);
5792:   ISRestoreIndices(isicol,&ic);

5794:   /* destroy list of free space and other temporary arrays */
5795:   PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);
5796: 
5797:   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5798:   PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
5799: 
5800:   PetscIncompleteLLDestroy(lnk,lnkbt);
5801:   PetscFreeSpaceDestroy(free_space_lvl);
5802:   PetscFree2(bj_ptr,bjlvl_ptr);

5804: #if defined(PETSC_USE_INFO)
5805:   {
5806:     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5807:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
5808:     PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);
5809:     PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);
5810:     PetscInfo(A,"for best performance.\n");
5811:     if (diagonal_fill) {
5812:       PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);
5813:     }
5814:   }
5815: #endif

5817:   /* put together the new matrix */
5818:   MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
5819:   PetscLogObjectParent(fact,isicol);
5820:   b = (Mat_SeqBAIJ*)(fact)->data;
5821:   b->free_a       = PETSC_TRUE;
5822:   b->free_ij      = PETSC_TRUE;
5823:   b->singlemalloc = PETSC_FALSE;
/* bdiag[0]+1 is used below as the number of blocks in the factor (b->nz) */
5824:   PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);
5825:   b->j          = bj;
5826:   b->i          = bi;
5827:   b->diag       = bdiag;
5828:   b->free_diag  = PETSC_TRUE;
5829:   b->ilen       = 0;
5830:   b->imax       = 0;
5831:   b->row        = isrow;
5832:   b->col        = iscol;
5833:   PetscObjectReference((PetscObject)isrow);
5834:   PetscObjectReference((PetscObject)iscol);
5835:   b->icol       = isicol;
5836:   PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
5837:   /* In b structure:  Free imax, ilen, old a, old j.  
5838:      Allocate bdiag, solve_work, new a, new j */
5839:   PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
5840:   b->maxnz = b->nz = bdiag[0]+1;
5841:   fact->info.factor_mallocs    = reallocs;
5842:   fact->info.fill_ratio_given  = f;
5843:   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5844:   MatSeqBAIJSetNumericFactorization(fact,both_identity);
5845:   return(0);
5846: }


5849: /*
5850:      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5851:    except that the data structure of Mat_SeqAIJ is slightly different.
5852:    Not a good example of code reuse.
5853: */
/*
   MatILUFactorSymbolic_SeqBAIJ_inplace - symbolic level-of-fill ILU factorization carried
   out on the block nonzero structure of a SeqBAIJ matrix.

   Parameters:
     fact  - matrix whose Mat_SeqBAIJ data receives the factor structure (i, j, diag),
             a freshly allocated value array and a solve work array
     A     - matrix to factor; only its block structure (a->i, a->j, a->mbs block rows) is read
     isrow - row permutation; permuted row prow is taken from row r[prow] of A
     iscol - column permutation; its inverse (isicol) is applied to the column indices
     info  - supplies fill, levels, diagonal_fill and pivotinblocks

   Returns 0 on the paths visible here.  A missing diagonal in A, an empty row, or a
   factored row that lacks its diagonal is reported through SETERRQ1/SETERRQ2.

   Two paths:
     - levels == 0 with identity row and column orderings: the structure of A is handed
       to MatDuplicateNoCreate_SeqBAIJ and the routine returns early.
     - otherwise: every permuted row is built as a sorted linked list (fill[]), merged with
       the strictly-upper parts of the already factored rows it references, and then copied
       into ajnew (column indices) and ajfill (fill levels).
*/
5856: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5857: {
5858:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5859:   IS             isicol;
5861:   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5862:   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5863:   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5864:   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5865:   PetscTruth     col_identity,row_identity,both_identity,flg;
5866:   PetscReal      f;

        /* refuse to factor when A lacks a diagonal entry; dd is the row reported in the message */
5869:   MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
5870:   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5871: 
5872:   f             = info->fill;
5873:   levels        = (PetscInt)info->levels;
5874:   diagonal_fill = (PetscInt)info->diagonal_fill;
        /* isicol is the inverse of iscol; it ends up stored in b->icol on both paths */
5875:   ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);

5877:   ISIdentity(isrow,&row_identity);
5878:   ISIdentity(iscol,&col_identity);
5879:   both_identity = (PetscTruth) (row_identity && col_identity);

5881:   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5882:     MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
5883:     MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);

5885:     fact->factor = MAT_FACTOR_ILU;
5886:     b            = (Mat_SeqBAIJ*)fact->data;
5887:     b->row       = isrow;
5888:     b->col       = iscol;
          /* b keeps isrow and iscol, so a reference is taken on each */
5889:     PetscObjectReference((PetscObject)isrow);
5890:     PetscObjectReference((PetscObject)iscol);
5891:     b->icol      = isicol;
5892:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5893:     PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5894:     return(0);
5895:   }

5897:   /* general case perform the symbolic factorization */
5898:     ISGetIndices(isrow,&r);
5899:     ISGetIndices(isicol,&ic);

5901:     /* get new row pointers */
5902:     PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
5903:     ainew[0] = 0;
5904:     /* don't know how many column pointers are needed so estimate */
5905:     jmax = (PetscInt)(f*ai[n] + 1);
5906:     PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
5907:     /* ajfill is level of fill for each fill entry */
5908:     PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
5909:     /* fill is a linked list of nonzeros in active row */
5910:     PetscMalloc((n+1)*sizeof(PetscInt),&fill);
5911:     /* im is level for each filled value */
5912:     PetscMalloc((n+1)*sizeof(PetscInt),&im);
5913:     /* dloc is location of diagonal in factor */
5914:     PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
5915:     dloc[0]  = 0;
5916:     for (prow=0; prow<n; prow++) {

5918:       /* copy prow into linked list */
5919:       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5920:       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5921:       xi         = aj + ai[r[prow]];
            /* fill[n] is the list head; the value n also terminates the list */
5922:       fill[n]    = n;
5923:       fill[prow] = -1; /* marker for diagonal entry */
5924:       while (nz--) {
5925:         fm  = n;
5926:         idx = ic[*xi++];
              /* walk from the head to the first entry >= idx, then link idx in before it */
5927:         do {
5928:           m  = fm;
5929:           fm = fill[m];
5930:         } while (fm < idx);
5931:         fill[m]   = idx;
5932:         fill[idx] = fm;
5933:         im[idx]   = 0;
5934:       }

5936:       /* make sure diagonal entry is included */
5937:       if (diagonal_fill && fill[prow] == -1) {
5938:         fm = n;
5939:         while (fill[fm] < prow) fm = fill[fm];
5940:         fill[prow] = fill[fm];  /* insert diagonal into linked list */
5941:         fill[fm]   = prow;
5942:         im[prow]   = 0;
5943:         nzf++;
5944:         dcount++;
5945:       }

            /* for every list entry row < prow, merge in the part of factored row `row` that
               lies after its diagonal; nzi counts the entries visited, i.e. those left of
               the diagonal of prow */
5947:       nzi = 0;
5948:       row = fill[n];
5949:       while (row < prow) {
5950:         incrlev = im[row] + 1;
5951:         nz      = dloc[row];
5952:         xi      = ajnew  + ainew[row] + nz + 1;
5953:         flev    = ajfill + ainew[row] + nz + 1;
5954:         nnz     = ainew[row+1] - ainew[row] - nz - 1;
5955:         fm      = row;
5956:         while (nnz-- > 0) {
5957:           idx = *xi++;
                /* skip candidates whose resulting level would exceed the requested levels */
5958:           if (*flev + incrlev > levels) {
5959:             flev++;
5960:             continue;
5961:           }
5962:           do {
5963:             m  = fm;
5964:             fm = fill[m];
5965:           } while (fm < idx);
5966:           if (fm != idx) {
                  /* new fill entry: record its level, link it in, and resume later searches from it */
5967:             im[idx]   = *flev + incrlev;
5968:             fill[m]   = idx;
5969:             fill[idx] = fm;
5970:             fm        = idx;
5971:             nzf++;
5972:           } else {
                  /* already present: keep the smaller of the two levels */
5973:             if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5974:           }
5975:           flev++;
5976:         }
5977:         row = fill[row];
5978:         nzi++;
5979:       }
5980:       /* copy new filled row into permanent storage */
5981:       ainew[prow+1] = ainew[prow] + nzf;
5982:       if (ainew[prow+1] > jmax) {

5984:         /* estimate how much additional space we will need */
5985:         /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5986:         /* just double the memory each time */
5987:         PetscInt maxadd = jmax;
5988:         /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5989:         if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5990:         jmax += maxadd;

5992:         /* allocate a longer ajnew and ajfill */
              /* only the ainew[prow] entries of the rows already stored are carried over */
5993:         PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
5994:         PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
5995:         PetscFree(ajnew);
5996:         ajnew = xitmp;
5997:         PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
5998:         PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
5999:         PetscFree(ajfill);
6000:         ajfill = xitmp;
6001:         reallocate++; /* count how many reallocations are needed */
6002:       }
6003:       xitmp       = ajnew + ainew[prow];
6004:       flev        = ajfill + ainew[prow];
            /* for now dloc[prow] is the offset of the diagonal within row prow */
6005:       dloc[prow]  = nzi;
6006:       fm          = fill[n];
            /* write the list out in order: column index into ajnew, level into ajfill */
6007:       while (nzf--) {
6008:         *xitmp++ = fm;
6009:         *flev++ = im[fm];
6010:         fm      = fill[fm];
6011:       }
6012:       /* make sure row has diagonal entry */
6013:       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6014:         SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6015:     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6016:       }
6017:     }
6018:     PetscFree(ajfill);
6019:     ISRestoreIndices(isrow,&r);
6020:     ISRestoreIndices(isicol,&ic);
6021:     PetscFree(fill);
6022:     PetscFree(im);

6024: #if defined(PETSC_USE_INFO)
6025:     {
6026:       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6027:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
6028:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
6029:       PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
6030:       PetscInfo(A,"for best performance.\n");
6031:       if (diagonal_fill) {
6032:         PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
6033:       }
6034:     }
6035: #endif

6037:     /* put together the new matrix */
6038:     MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
6039:     PetscLogObjectParent(fact,isicol);
6040:     b    = (Mat_SeqBAIJ*)fact->data;
6041:     b->free_a       = PETSC_TRUE;
6042:     b->free_ij      = PETSC_TRUE;
6043:     b->singlemalloc = PETSC_FALSE;
          /* room for one bs2-sized block of values per structural entry of the factor */
6044:     PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
6045:     b->j          = ajnew;
6046:     b->i          = ainew;
          /* turn the per-row diagonal offsets into absolute positions within b->j */
6047:     for (i=0; i<n; i++) dloc[i] += ainew[i];
6048:     b->diag       = dloc;
6049:     b->free_diag  = PETSC_TRUE;
6050:     b->ilen       = 0;
6051:     b->imax       = 0;
6052:     b->row        = isrow;
6053:     b->col        = iscol;
6054:     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6055:     PetscObjectReference((PetscObject)isrow);
6056:     PetscObjectReference((PetscObject)iscol);
6057:     b->icol       = isicol;
6058:     PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6059:     /* In b structure:  Free imax, ilen, old a, old j.  
6060:        Allocate dloc, solve_work, new a, new j */
6061:     PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
6062:     b->maxnz          = b->nz = ainew[n];

6064:     fact->info.factor_mallocs    = reallocate;
6065:     fact->info.fill_ratio_given  = f;
          /* NOTE(review): the row loop contains no break, so prow == n at this point and
             ai[prow] is ai[n]; writing ai[n] directly would not depend on the loop variable */
6066:     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);

6068:   MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6069:   return(0);
6070: }

6074: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6075: {
6076:   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6077:   /* int i,*AJ=a->j,nz=a->nz; */
6079:   /* Undo Column scaling */
6080: /*    while (nz--) { */
6081: /*      AJ[i] = AJ[i]/4; */
6082: /*    } */
6083:   /* This should really invoke a push/pop logic, but we don't have that yet. */
6084:   A->ops->setunfactored = PETSC_NULL;
6085:   return(0);
6086: }

6090: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6091: {
6092:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6093:   PetscInt       *AJ=a->j,nz=a->nz;
6094:   unsigned short *aj=(unsigned short *)AJ;
6096:   /* Is this really necessary? */
6097:   while (nz--) {
6098:     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6099:   }
6100:   A->ops->setunfactored = PETSC_NULL;
6101:   return(0);
6102: }