Actual source code: inode.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   This file provides high performance routines for the Inode format (compressed sparse row)
  5:   by taking advantage of rows with identical nonzero structure (I-nodes).
  6: */
 7:  #include ../src/mat/impls/aij/seq/aij.h

 11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 12: {
 13:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 15:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 18:   n      = A->cmap->n;
 19:   m      = A->rmap->n;
 20:   ns_row = a->inode.size;
 21: 
 22:   min_mn = (m < n) ? m : n;
 23:   if (!ns) {
 24:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 25:     for(; count+1 < n; count++,i++);
 26:     if (count < n)  {
 27:       i++;
 28:     }
 29:     *size = i;
 30:     return(0);
 31:   }
 32:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 33: 
 34:   /* Use the same row structure wherever feasible. */
 35:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 36:     ns_col[i] = ns_row[i];
 37:   }

 39:   /* if m < n; pad up the remainder with inode_limit */
 40:   for(; count+1 < n; count++,i++) {
 41:     ns_col[i] = 1;
 42:   }
 43:   /* The last node is the odd ball. padd it up with the remaining rows; */
 44:   if (count < n)  {
 45:     ns_col[i] = n - count;
 46:     i++;
 47:   } else if (count > n) {
 48:     /* Adjust for the over estimation */
 49:     ns_col[i-1] += n - count;
 50:   }
 51:   *size = i;
 52:   *ns   = ns_col;
 53:   return(0);
 54: }


/*
      This builds the symmetric version of the nonzero structure.
*/
 62: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 63: {
 64:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
 66:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 67:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap->n;
 72:   n         = A->cmap->n;
 73:   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
 74: 
 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for reformated inode structure */
 80:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 81:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1){
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col)
 87:       tvc[col] = i1;
 88:   }
 89:   /* allocate space for row pointers */
 90:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 91:   *iia = ia;
 92:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 93:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 95:   /* determine the number of columns in each row */
 96:   ia[0] = oshift;
 97:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

 99:     j    = aj + ai[row] + ishift;
100:     jmax = aj + ai[row+1] + ishift;
101:     i2   = 0;
102:     col  = *j++ + ishift;
103:     i2   = tvc[col];
104:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105:       ia[i1+1]++;
106:       ia[i2+1]++;
107:       i2++;                     /* Start col of next node */
108:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109:       i2 = tvc[col];
110:     }
111:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112:   }

114:   /* shift ia[i] to point to next row */
115:   for (i1=1; i1<nslim_row+1; i1++) {
116:     row        = ia[i1-1];
117:     ia[i1]    += row;
118:     work[i1-1] = row - oshift;
119:   }

121:   /* allocate space for column pointers */
122:   nz   = ia[nslim_row] + (!ishift);
123:   PetscMalloc(nz*sizeof(PetscInt),&ja);
124:   *jja = ja;

126:  /* loop over lower triangular part putting into ja */
127:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128:     j    = aj + ai[row] + ishift;
129:     jmax = aj + ai[row+1] + ishift;
130:     i2   = 0;                     /* Col inode index */
131:     col  = *j++ + ishift;
132:     i2   = tvc[col];
133:     while (i2<i1 && j<jmax) {
134:       ja[work[i2]++] = i1 + oshift;
135:       ja[work[i1]++] = i2 + oshift;
136:       ++i2;
137:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138:       i2 = tvc[col];
139:     }
140:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

142:   }
143:   PetscFree(work);
144:   PetscFree(tns);
145:   PetscFree(tvc);
146:   return(0);
147: }

/*
      This builds the nonsymmetric version of the nonzero structure.
*/
154: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
158:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

162:   nslim_row = a->inode.node_count;
163:   n         = A->cmap->n;

165:   /* Create The column_inode for this matrix */
166:   Mat_CreateColInode(A,&nslim_col,&ns_col);
167: 
168:   /* allocate space for reformated column_inode structure */
169:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

173:   for (i1=0,col=0; i1<nslim_col; ++i1){
174:     nsz = ns_col[i1];
175:     for (i2=0; i2<nsz; ++i2,++col)
176:       tvc[col] = i1;
177:   }
178:   /* allocate space for row pointers */
179:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180:   *iia = ia;
181:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

184:   /* determine the number of columns in each row */
185:   ia[0] = oshift;
186:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187:     j   = aj + ai[row] + ishift;
188:     col = *j++ + ishift;
189:     i2  = tvc[col];
190:     nz  = ai[row+1] - ai[row];
191:     while (nz-- > 0) {           /* off-diagonal elemets */
192:       ia[i1+1]++;
193:       i2++;                     /* Start col of next node */
194:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195:       if (nz > 0) i2 = tvc[col];
196:     }
197:   }

199:   /* shift ia[i] to point to next row */
200:   for (i1=1; i1<nslim_row+1; i1++) {
201:     row        = ia[i1-1];
202:     ia[i1]    += row;
203:     work[i1-1] = row - oshift;
204:   }

206:   /* allocate space for column pointers */
207:   nz   = ia[nslim_row] + (!ishift);
208:   PetscMalloc(nz*sizeof(PetscInt),&ja);
209:   *jja = ja;

211:  /* loop over matrix putting into ja */
212:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213:     j   = aj + ai[row] + ishift;
214:     i2  = 0;                     /* Col inode index */
215:     col = *j++ + ishift;
216:     i2  = tvc[col];
217:     nz  = ai[row+1] - ai[row];
218:     while (nz-- > 0) {
219:       ja[work[i1]++] = i2 + oshift;
220:       ++i2;
221:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222:       if (nz > 0) i2 = tvc[col];
223:     }
224:   }
225:   PetscFree(ns_col);
226:   PetscFree(work);
227:   PetscFree(tns);
228:   PetscFree(tvc);
229:   return(0);
230: }

234: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;

240:   *n     = a->inode.node_count;
241:   if (!ia) return(0);
242:   if (!blockcompressed) {
243:     MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244:   } else if (symmetric) {
245:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246:   } else {
247:     MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248:   }
249:   return(0);
250: }

254: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {

259:   if (!ia) return(0);

261:   if (!blockcompressed) {
262:     MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263:   } else {
264:     PetscFree(*ia);
265:     PetscFree(*ja);
266:   }

268:   return(0);
269: }

271: /* ----------------------------------------------------------- */

275: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
279:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

283:   nslim_row = a->inode.node_count;
284:   n         = A->cmap->n;

286:   /* Create The column_inode for this matrix */
287:   Mat_CreateColInode(A,&nslim_col,&ns_col);
288: 
289:   /* allocate space for reformated column_inode structure */
290:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

294:   for (i1=0,col=0; i1<nslim_col; ++i1){
295:     nsz = ns_col[i1];
296:     for (i2=0; i2<nsz; ++i2,++col)
297:       tvc[col] = i1;
298:   }
299:   /* allocate space for column pointers */
300:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301:   *iia = ia;
302:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

305:   /* determine the number of columns in each row */
306:   ia[0] = oshift;
307:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308:     j   = aj + ai[row] + ishift;
309:     col = *j++ + ishift;
310:     i2  = tvc[col];
311:     nz  = ai[row+1] - ai[row];
312:     while (nz-- > 0) {           /* off-diagonal elemets */
313:       /* ia[i1+1]++; */
314:       ia[i2+1]++;
315:       i2++;
316:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317:       if (nz > 0) i2 = tvc[col];
318:     }
319:   }

321:   /* shift ia[i] to point to next col */
322:   for (i1=1; i1<nslim_col+1; i1++) {
323:     col        = ia[i1-1];
324:     ia[i1]    += col;
325:     work[i1-1] = col - oshift;
326:   }

328:   /* allocate space for column pointers */
329:   nz   = ia[nslim_col] + (!ishift);
330:   PetscMalloc(nz*sizeof(PetscInt),&ja);
331:   *jja = ja;

333:  /* loop over matrix putting into ja */
334:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335:     j   = aj + ai[row] + ishift;
336:     i2  = 0;                     /* Col inode index */
337:     col = *j++ + ishift;
338:     i2  = tvc[col];
339:     nz  = ai[row+1] - ai[row];
340:     while (nz-- > 0) {
341:       /* ja[work[i1]++] = i2 + oshift; */
342:       ja[work[i2]++] = i1 + oshift;
343:       i2++;
344:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345:       if (nz > 0) i2 = tvc[col];
346:     }
347:   }
348:   PetscFree(ns_col);
349:   PetscFree(work);
350:   PetscFree(tns);
351:   PetscFree(tvc);
352:   return(0);
353: }

357: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {

362:   Mat_CreateColInode(A,n,PETSC_NULL);
363:   if (!ia) return(0);

365:   if (!blockcompressed) {
366:     MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367:   } else if (symmetric) {
368:     /* Since the indices are symmetric it does'nt matter */
369:     MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370:   } else {
371:     MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372:   }
373:   return(0);
374: }

378: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {

383:   if (!ia) return(0);
384:   if (!blockcompressed) {
385:     MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386:   } else {
387:     PetscFree(*ia);
388:     PetscFree(*ja);
389:   }
390:   return(0);
391: }

393: /* ----------------------------------------------------------- */

397: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398: {
399:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400:   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401:   PetscScalar       *y;
402:   const PetscScalar *x;
403:   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404:   PetscErrorCode    ierr;
405:   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406: 
407: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409: #endif

412:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413:   node_max = a->inode.node_count;
414:   ns       = a->inode.size;     /* Node Size array */
415:   VecGetArray(xx,(PetscScalar**)&x);
416:   VecGetArray(yy,&y);
417:   idx  = a->j;
418:   v1   = a->a;
419:   ii   = a->i;

421:   for (i = 0,row = 0; i< node_max; ++i){
422:     nsz  = ns[i];
423:     n    = ii[1] - ii[0];
424:     nonzerorow += (n>0)*nsz;
425:     ii  += nsz;
426:     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427:     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428:     sz   = n;                   /* No of non zeros in this row */
429:                                 /* Switch on the size of Node */
430:     switch (nsz){               /* Each loop in 'case' is unrolled */
431:     case 1 :
432:       sum1  = 0.;
433: 
434:       for(n = 0; n< sz-1; n+=2) {
435:         i1   = idx[0];          /* The instructions are ordered to */
436:         i2   = idx[1];          /* make the compiler's job easy */
437:         idx += 2;
438:         tmp0 = x[i1];
439:         tmp1 = x[i2];
440:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441:        }
442: 
443:       if (n == sz-1){          /* Take care of the last nonzero  */
444:         tmp0  = x[*idx++];
445:         sum1 += *v1++ * tmp0;
446:       }
447:       y[row++]=sum1;
448:       break;
449:     case 2:
450:       sum1  = 0.;
451:       sum2  = 0.;
452:       v2    = v1 + n;
453: 
454:       for (n = 0; n< sz-1; n+=2) {
455:         i1   = idx[0];
456:         i2   = idx[1];
457:         idx += 2;
458:         tmp0 = x[i1];
459:         tmp1 = x[i2];
460:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462:       }
463:       if (n == sz-1){
464:         tmp0  = x[*idx++];
465:         sum1 += *v1++ * tmp0;
466:         sum2 += *v2++ * tmp0;
467:       }
468:       y[row++]=sum1;
469:       y[row++]=sum2;
470:       v1      =v2;              /* Since the next block to be processed starts there*/
471:       idx    +=sz;
472:       break;
473:     case 3:
474:       sum1  = 0.;
475:       sum2  = 0.;
476:       sum3  = 0.;
477:       v2    = v1 + n;
478:       v3    = v2 + n;
479: 
480:       for (n = 0; n< sz-1; n+=2) {
481:         i1   = idx[0];
482:         i2   = idx[1];
483:         idx += 2;
484:         tmp0 = x[i1];
485:         tmp1 = x[i2];
486:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489:       }
490:       if (n == sz-1){
491:         tmp0  = x[*idx++];
492:         sum1 += *v1++ * tmp0;
493:         sum2 += *v2++ * tmp0;
494:         sum3 += *v3++ * tmp0;
495:       }
496:       y[row++]=sum1;
497:       y[row++]=sum2;
498:       y[row++]=sum3;
499:       v1       =v3;             /* Since the next block to be processed starts there*/
500:       idx     +=2*sz;
501:       break;
502:     case 4:
503:       sum1  = 0.;
504:       sum2  = 0.;
505:       sum3  = 0.;
506:       sum4  = 0.;
507:       v2    = v1 + n;
508:       v3    = v2 + n;
509:       v4    = v3 + n;
510: 
511:       for (n = 0; n< sz-1; n+=2) {
512:         i1   = idx[0];
513:         i2   = idx[1];
514:         idx += 2;
515:         tmp0 = x[i1];
516:         tmp1 = x[i2];
517:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521:       }
522:       if (n == sz-1){
523:         tmp0  = x[*idx++];
524:         sum1 += *v1++ * tmp0;
525:         sum2 += *v2++ * tmp0;
526:         sum3 += *v3++ * tmp0;
527:         sum4 += *v4++ * tmp0;
528:       }
529:       y[row++]=sum1;
530:       y[row++]=sum2;
531:       y[row++]=sum3;
532:       y[row++]=sum4;
533:       v1      =v4;              /* Since the next block to be processed starts there*/
534:       idx    +=3*sz;
535:       break;
536:     case 5:
537:       sum1  = 0.;
538:       sum2  = 0.;
539:       sum3  = 0.;
540:       sum4  = 0.;
541:       sum5  = 0.;
542:       v2    = v1 + n;
543:       v3    = v2 + n;
544:       v4    = v3 + n;
545:       v5    = v4 + n;
546: 
547:       for (n = 0; n<sz-1; n+=2) {
548:         i1   = idx[0];
549:         i2   = idx[1];
550:         idx += 2;
551:         tmp0 = x[i1];
552:         tmp1 = x[i2];
553:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558:       }
559:       if (n == sz-1){
560:         tmp0  = x[*idx++];
561:         sum1 += *v1++ * tmp0;
562:         sum2 += *v2++ * tmp0;
563:         sum3 += *v3++ * tmp0;
564:         sum4 += *v4++ * tmp0;
565:         sum5 += *v5++ * tmp0;
566:       }
567:       y[row++]=sum1;
568:       y[row++]=sum2;
569:       y[row++]=sum3;
570:       y[row++]=sum4;
571:       y[row++]=sum5;
572:       v1      =v5;       /* Since the next block to be processed starts there */
573:       idx    +=4*sz;
574:       break;
575:     default :
576:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577:     }
578:   }
579:   VecRestoreArray(xx,(PetscScalar**)&x);
580:   VecRestoreArray(yy,&y);
581:   PetscLogFlops(2.0*a->nz - nonzerorow);
582:   return(0);
583: }
584: /* ----------------------------------------------------------- */
585: /* Almost same code as the MatMult_SeqAIJ_Inode() */
588: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589: {
590:   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592:   MatScalar      *v1,*v2,*v3,*v4,*v5;
593:   PetscScalar    *x,*y,*z,*zt;
595:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596: 
598:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599:   node_max = a->inode.node_count;
600:   ns       = a->inode.size;     /* Node Size array */
601:   VecGetArray(xx,&x);
602:   VecGetArray(yy,&y);
603:   if (zz != yy) {
604:     VecGetArray(zz,&z);
605:   } else {
606:     z = y;
607:   }
608:   zt = z;

610:   idx  = a->j;
611:   v1   = a->a;
612:   ii   = a->i;

614:   for (i = 0,row = 0; i< node_max; ++i){
615:     nsz  = ns[i];
616:     n    = ii[1] - ii[0];
617:     ii  += nsz;
618:     sz   = n;                   /* No of non zeros in this row */
619:                                 /* Switch on the size of Node */
620:     switch (nsz){               /* Each loop in 'case' is unrolled */
621:     case 1 :
622:       sum1  = *zt++;
623: 
624:       for(n = 0; n< sz-1; n+=2) {
625:         i1   = idx[0];          /* The instructions are ordered to */
626:         i2   = idx[1];          /* make the compiler's job easy */
627:         idx += 2;
628:         tmp0 = x[i1];
629:         tmp1 = x[i2];
630:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631:        }
632: 
633:       if(n   == sz-1){          /* Take care of the last nonzero  */
634:         tmp0  = x[*idx++];
635:         sum1 += *v1++ * tmp0;
636:       }
637:       y[row++]=sum1;
638:       break;
639:     case 2:
640:       sum1  = *zt++;
641:       sum2  = *zt++;
642:       v2    = v1 + n;
643: 
644:       for(n = 0; n< sz-1; n+=2) {
645:         i1   = idx[0];
646:         i2   = idx[1];
647:         idx += 2;
648:         tmp0 = x[i1];
649:         tmp1 = x[i2];
650:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652:       }
653:       if(n   == sz-1){
654:         tmp0  = x[*idx++];
655:         sum1 += *v1++ * tmp0;
656:         sum2 += *v2++ * tmp0;
657:       }
658:       y[row++]=sum1;
659:       y[row++]=sum2;
660:       v1      =v2;              /* Since the next block to be processed starts there*/
661:       idx    +=sz;
662:       break;
663:     case 3:
664:       sum1  = *zt++;
665:       sum2  = *zt++;
666:       sum3  = *zt++;
667:       v2    = v1 + n;
668:       v3    = v2 + n;
669: 
670:       for (n = 0; n< sz-1; n+=2) {
671:         i1   = idx[0];
672:         i2   = idx[1];
673:         idx += 2;
674:         tmp0 = x[i1];
675:         tmp1 = x[i2];
676:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679:       }
680:       if (n == sz-1){
681:         tmp0  = x[*idx++];
682:         sum1 += *v1++ * tmp0;
683:         sum2 += *v2++ * tmp0;
684:         sum3 += *v3++ * tmp0;
685:       }
686:       y[row++]=sum1;
687:       y[row++]=sum2;
688:       y[row++]=sum3;
689:       v1       =v3;             /* Since the next block to be processed starts there*/
690:       idx     +=2*sz;
691:       break;
692:     case 4:
693:       sum1  = *zt++;
694:       sum2  = *zt++;
695:       sum3  = *zt++;
696:       sum4  = *zt++;
697:       v2    = v1 + n;
698:       v3    = v2 + n;
699:       v4    = v3 + n;
700: 
701:       for (n = 0; n< sz-1; n+=2) {
702:         i1   = idx[0];
703:         i2   = idx[1];
704:         idx += 2;
705:         tmp0 = x[i1];
706:         tmp1 = x[i2];
707:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711:       }
712:       if (n == sz-1){
713:         tmp0  = x[*idx++];
714:         sum1 += *v1++ * tmp0;
715:         sum2 += *v2++ * tmp0;
716:         sum3 += *v3++ * tmp0;
717:         sum4 += *v4++ * tmp0;
718:       }
719:       y[row++]=sum1;
720:       y[row++]=sum2;
721:       y[row++]=sum3;
722:       y[row++]=sum4;
723:       v1      =v4;              /* Since the next block to be processed starts there*/
724:       idx    +=3*sz;
725:       break;
726:     case 5:
727:       sum1  = *zt++;
728:       sum2  = *zt++;
729:       sum3  = *zt++;
730:       sum4  = *zt++;
731:       sum5  = *zt++;
732:       v2    = v1 + n;
733:       v3    = v2 + n;
734:       v4    = v3 + n;
735:       v5    = v4 + n;
736: 
737:       for (n = 0; n<sz-1; n+=2) {
738:         i1   = idx[0];
739:         i2   = idx[1];
740:         idx += 2;
741:         tmp0 = x[i1];
742:         tmp1 = x[i2];
743:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748:       }
749:       if(n   == sz-1){
750:         tmp0  = x[*idx++];
751:         sum1 += *v1++ * tmp0;
752:         sum2 += *v2++ * tmp0;
753:         sum3 += *v3++ * tmp0;
754:         sum4 += *v4++ * tmp0;
755:         sum5 += *v5++ * tmp0;
756:       }
757:       y[row++]=sum1;
758:       y[row++]=sum2;
759:       y[row++]=sum3;
760:       y[row++]=sum4;
761:       y[row++]=sum5;
762:       v1      =v5;       /* Since the next block to be processed starts there */
763:       idx    +=4*sz;
764:       break;
765:     default :
766:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767:     }
768:   }
769:   VecRestoreArray(xx,&x);
770:   VecRestoreArray(yy,&y);
771:   if (zz != yy) {
772:     VecRestoreArray(zz,&z);
773:   }
774:   PetscLogFlops(2.0*a->nz);
775:   return(0);
776: }

778: /* ----------------------------------------------------------- */
781: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782: {
783:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784:   IS                iscol = a->col,isrow = a->row;
785:   PetscErrorCode    ierr;
786:   const PetscInt    *r,*c,*rout,*cout;
787:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792:   const PetscScalar *b;

795:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796:   node_max = a->inode.node_count;
797:   ns       = a->inode.size;     /* Node Size array */

799:   VecGetArray(bb,(PetscScalar**)&b);
800:   VecGetArray(xx,&x);
801:   tmp  = a->solve_work;
802: 
803:   ISGetIndices(isrow,&rout); r = rout;
804:   ISGetIndices(iscol,&cout); c = cout + (n-1);
805: 
806:   /* forward solve the lower triangular */
807:   tmps = tmp ;
808:   aa   = a_a ;
809:   aj   = a_j ;
810:   ad   = a->diag;

812:   for (i = 0,row = 0; i< node_max; ++i){
813:     nsz = ns[i];
814:     aii = ai[row];
815:     v1  = aa + aii;
816:     vi  = aj + aii;
817:     nz  = ad[row]- aii;
818:     if (i < node_max-1) {
819:       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820:       * but our indexing to determine it's size could. */
821:       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822:       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823:       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824:       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825:     }

827:     switch (nsz){               /* Each loop in 'case' is unrolled */
828:     case 1 :
829:       sum1 = b[*r++];
830:       for(j=0; j<nz-1; j+=2){
831:         i0   = vi[0];
832:         i1   = vi[1];
833:         vi  +=2;
834:         tmp0 = tmps[i0];
835:         tmp1 = tmps[i1];
836:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837:       }
838:       if(j == nz-1){
839:         tmp0 = tmps[*vi++];
840:         sum1 -= *v1++ *tmp0;
841:       }
842:       tmp[row ++]=sum1;
843:       break;
844:     case 2:
845:       sum1 = b[*r++];
846:       sum2 = b[*r++];
847:       v2   = aa + ai[row+1];

849:       for(j=0; j<nz-1; j+=2){
850:         i0   = vi[0];
851:         i1   = vi[1];
852:         vi  +=2;
853:         tmp0 = tmps[i0];
854:         tmp1 = tmps[i1];
855:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857:       }
858:       if(j == nz-1){
859:         tmp0 = tmps[*vi++];
860:         sum1 -= *v1++ *tmp0;
861:         sum2 -= *v2++ *tmp0;
862:       }
863:       sum2 -= *v2++ * sum1;
864:       tmp[row ++]=sum1;
865:       tmp[row ++]=sum2;
866:       break;
867:     case 3:
868:       sum1 = b[*r++];
869:       sum2 = b[*r++];
870:       sum3 = b[*r++];
871:       v2   = aa + ai[row+1];
872:       v3   = aa + ai[row+2];
873: 
874:       for (j=0; j<nz-1; j+=2){
875:         i0   = vi[0];
876:         i1   = vi[1];
877:         vi  +=2;
878:         tmp0 = tmps[i0];
879:         tmp1 = tmps[i1];
880:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883:       }
884:       if (j == nz-1){
885:         tmp0 = tmps[*vi++];
886:         sum1 -= *v1++ *tmp0;
887:         sum2 -= *v2++ *tmp0;
888:         sum3 -= *v3++ *tmp0;
889:       }
890:       sum2 -= *v2++ * sum1;
891:       sum3 -= *v3++ * sum1;
892:       sum3 -= *v3++ * sum2;
893:       tmp[row ++]=sum1;
894:       tmp[row ++]=sum2;
895:       tmp[row ++]=sum3;
896:       break;
897: 
898:     case 4:
899:       sum1 = b[*r++];
900:       sum2 = b[*r++];
901:       sum3 = b[*r++];
902:       sum4 = b[*r++];
903:       v2   = aa + ai[row+1];
904:       v3   = aa + ai[row+2];
905:       v4   = aa + ai[row+3];
906: 
907:       for (j=0; j<nz-1; j+=2){
908:         i0   = vi[0];
909:         i1   = vi[1];
910:         vi  +=2;
911:         tmp0 = tmps[i0];
912:         tmp1 = tmps[i1];
913:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917:       }
918:       if (j == nz-1){
919:         tmp0 = tmps[*vi++];
920:         sum1 -= *v1++ *tmp0;
921:         sum2 -= *v2++ *tmp0;
922:         sum3 -= *v3++ *tmp0;
923:         sum4 -= *v4++ *tmp0;
924:       }
925:       sum2 -= *v2++ * sum1;
926:       sum3 -= *v3++ * sum1;
927:       sum4 -= *v4++ * sum1;
928:       sum3 -= *v3++ * sum2;
929:       sum4 -= *v4++ * sum2;
930:       sum4 -= *v4++ * sum3;
931: 
932:       tmp[row ++]=sum1;
933:       tmp[row ++]=sum2;
934:       tmp[row ++]=sum3;
935:       tmp[row ++]=sum4;
936:       break;
937:     case 5:
938:       sum1 = b[*r++];
939:       sum2 = b[*r++];
940:       sum3 = b[*r++];
941:       sum4 = b[*r++];
942:       sum5 = b[*r++];
943:       v2   = aa + ai[row+1];
944:       v3   = aa + ai[row+2];
945:       v4   = aa + ai[row+3];
946:       v5   = aa + ai[row+4];
947: 
948:       for (j=0; j<nz-1; j+=2){
949:         i0   = vi[0];
950:         i1   = vi[1];
951:         vi  +=2;
952:         tmp0 = tmps[i0];
953:         tmp1 = tmps[i1];
954:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959:       }
960:       if (j == nz-1){
961:         tmp0 = tmps[*vi++];
962:         sum1 -= *v1++ *tmp0;
963:         sum2 -= *v2++ *tmp0;
964:         sum3 -= *v3++ *tmp0;
965:         sum4 -= *v4++ *tmp0;
966:         sum5 -= *v5++ *tmp0;
967:       }

969:       sum2 -= *v2++ * sum1;
970:       sum3 -= *v3++ * sum1;
971:       sum4 -= *v4++ * sum1;
972:       sum5 -= *v5++ * sum1;
973:       sum3 -= *v3++ * sum2;
974:       sum4 -= *v4++ * sum2;
975:       sum5 -= *v5++ * sum2;
976:       sum4 -= *v4++ * sum3;
977:       sum5 -= *v5++ * sum3;
978:       sum5 -= *v5++ * sum4;
979: 
980:       tmp[row ++]=sum1;
981:       tmp[row ++]=sum2;
982:       tmp[row ++]=sum3;
983:       tmp[row ++]=sum4;
984:       tmp[row ++]=sum5;
985:       break;
986:     default:
987:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988:     }
989:   }
990:   /* backward solve the upper triangular */
991:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992:     nsz = ns[i];
993:     aii = ai[row+1] -1;
994:     v1  = aa + aii;
995:     vi  = aj + aii;
996:     nz  = aii- ad[row];
997:     switch (nsz){               /* Each loop in 'case' is unrolled */
998:     case 1 :
999:       sum1 = tmp[row];

1001:       for(j=nz ; j>1; j-=2){
1002:         vi  -=2;
1003:         i0   = vi[2];
1004:         i1   = vi[1];
1005:         tmp0 = tmps[i0];
1006:         tmp1 = tmps[i1];
1007:         v1   -= 2;
1008:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009:       }
1010:       if (j==1){
1011:         tmp0  = tmps[*vi--];
1012:         sum1 -= *v1-- * tmp0;
1013:       }
1014:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015:       break;
1016:     case 2 :
1017:       sum1 = tmp[row];
1018:       sum2 = tmp[row -1];
1019:       v2   = aa + ai[row]-1;
1020:       for (j=nz ; j>1; j-=2){
1021:         vi  -=2;
1022:         i0   = vi[2];
1023:         i1   = vi[1];
1024:         tmp0 = tmps[i0];
1025:         tmp1 = tmps[i1];
1026:         v1   -= 2;
1027:         v2   -= 2;
1028:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030:       }
1031:       if (j==1){
1032:         tmp0  = tmps[*vi--];
1033:         sum1 -= *v1-- * tmp0;
1034:         sum2 -= *v2-- * tmp0;
1035:       }
1036: 
1037:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038:       sum2   -= *v2-- * tmp0;
1039:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040:       break;
1041:     case 3 :
1042:       sum1 = tmp[row];
1043:       sum2 = tmp[row -1];
1044:       sum3 = tmp[row -2];
1045:       v2   = aa + ai[row]-1;
1046:       v3   = aa + ai[row -1]-1;
1047:       for (j=nz ; j>1; j-=2){
1048:         vi  -=2;
1049:         i0   = vi[2];
1050:         i1   = vi[1];
1051:         tmp0 = tmps[i0];
1052:         tmp1 = tmps[i1];
1053:         v1   -= 2;
1054:         v2   -= 2;
1055:         v3   -= 2;
1056:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059:       }
1060:       if (j==1){
1061:         tmp0  = tmps[*vi--];
1062:         sum1 -= *v1-- * tmp0;
1063:         sum2 -= *v2-- * tmp0;
1064:         sum3 -= *v3-- * tmp0;
1065:       }
1066:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067:       sum2   -= *v2-- * tmp0;
1068:       sum3   -= *v3-- * tmp0;
1069:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070:       sum3   -= *v3-- * tmp0;
1071:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072: 
1073:       break;
1074:     case 4 :
1075:       sum1 = tmp[row];
1076:       sum2 = tmp[row -1];
1077:       sum3 = tmp[row -2];
1078:       sum4 = tmp[row -3];
1079:       v2   = aa + ai[row]-1;
1080:       v3   = aa + ai[row -1]-1;
1081:       v4   = aa + ai[row -2]-1;

1083:       for (j=nz ; j>1; j-=2){
1084:         vi  -=2;
1085:         i0   = vi[2];
1086:         i1   = vi[1];
1087:         tmp0 = tmps[i0];
1088:         tmp1 = tmps[i1];
1089:         v1  -= 2;
1090:         v2  -= 2;
1091:         v3  -= 2;
1092:         v4  -= 2;
1093:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097:       }
1098:       if (j==1){
1099:         tmp0  = tmps[*vi--];
1100:         sum1 -= *v1-- * tmp0;
1101:         sum2 -= *v2-- * tmp0;
1102:         sum3 -= *v3-- * tmp0;
1103:         sum4 -= *v4-- * tmp0;
1104:       }

1106:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107:       sum2   -= *v2-- * tmp0;
1108:       sum3   -= *v3-- * tmp0;
1109:       sum4   -= *v4-- * tmp0;
1110:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111:       sum3   -= *v3-- * tmp0;
1112:       sum4   -= *v4-- * tmp0;
1113:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114:       sum4   -= *v4-- * tmp0;
1115:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116:       break;
1117:     case 5 :
1118:       sum1 = tmp[row];
1119:       sum2 = tmp[row -1];
1120:       sum3 = tmp[row -2];
1121:       sum4 = tmp[row -3];
1122:       sum5 = tmp[row -4];
1123:       v2   = aa + ai[row]-1;
1124:       v3   = aa + ai[row -1]-1;
1125:       v4   = aa + ai[row -2]-1;
1126:       v5   = aa + ai[row -3]-1;
1127:       for (j=nz ; j>1; j-=2){
1128:         vi  -= 2;
1129:         i0   = vi[2];
1130:         i1   = vi[1];
1131:         tmp0 = tmps[i0];
1132:         tmp1 = tmps[i1];
1133:         v1   -= 2;
1134:         v2   -= 2;
1135:         v3   -= 2;
1136:         v4   -= 2;
1137:         v5   -= 2;
1138:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143:       }
1144:       if (j==1){
1145:         tmp0  = tmps[*vi--];
1146:         sum1 -= *v1-- * tmp0;
1147:         sum2 -= *v2-- * tmp0;
1148:         sum3 -= *v3-- * tmp0;
1149:         sum4 -= *v4-- * tmp0;
1150:         sum5 -= *v5-- * tmp0;
1151:       }

1153:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154:       sum2   -= *v2-- * tmp0;
1155:       sum3   -= *v3-- * tmp0;
1156:       sum4   -= *v4-- * tmp0;
1157:       sum5   -= *v5-- * tmp0;
1158:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159:       sum3   -= *v3-- * tmp0;
1160:       sum4   -= *v4-- * tmp0;
1161:       sum5   -= *v5-- * tmp0;
1162:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163:       sum4   -= *v4-- * tmp0;
1164:       sum5   -= *v5-- * tmp0;
1165:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166:       sum5   -= *v5-- * tmp0;
1167:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168:       break;
1169:     default:
1170:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171:     }
1172:   }
1173:   ISRestoreIndices(isrow,&rout);
1174:   ISRestoreIndices(iscol,&cout);
1175:   VecRestoreArray(bb,(PetscScalar**)&b);
1176:   VecRestoreArray(xx,&x);
1177:   PetscLogFlops(2.0*a->nz - A->cmap->n);
1178:   return(0);
1179: }

/*
   MatLUFactorNumeric_SeqAIJ_Inode - Numeric LU factorization of a SeqAIJ matrix
   that eliminates 1, 2 or 3 rows (one "inode") at a time.

   Input Parameters:
     B    - the factor matrix (aliased as C below). Its index arrays b->i, b->j,
            b->diag and its index sets b->row, b->icol are read; its value
            array b->a is filled in.
     A    - the matrix to factor. Its values a->a, index arrays a->i, a->j,
            a->diag and its inode description a->inode are read.
     info - factorization options. This routine reads info->shifttype,
            info->zeropivot and info->shiftamount directly and passes info on to
            MatPivotCheck().

   Returns:
     0 via return(0) on the normal path. SETERRQ() is invoked when A carries no
     inode sizes or when a node size other than 1, 2 or 3 is encountered
     (presumably returning the error code to the caller - the macro is defined
     elsewhere).

   Layout of the factor as used below:
     - L(i,:)              is stored at positions bi[i] .. bi[i+1]-1
     - U(i,:) without diag is stored at positions bdiag[i+1]+1 .. bdiag[i]-1
     - the diagonal of row i is stored at position bdiag[i], and it is stored
       inverted (1.0/pivot).

   On exit the solve function pointers of C are set to the plain SeqAIJ
   routines, C is flagged assembled/preallocated, and
   Mat_CheckInode_FactorLU(C,PETSC_FALSE) is called.
*/
1183: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184: {
1185:   Mat              C=B;
1186:   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187:   IS               isrow = b->row,isicol = b->icol;
1188:   PetscErrorCode   ierr;
1189:   const PetscInt   *r,*ic,*ics;
1190:   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191:   PetscInt         i,j,k,nz,nzL,row,*pj;
1192:   const PetscInt   *ajtmp,*bjtmp;
1193:   MatScalar        *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3;
1194:   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195:   FactorShiftCtx   sctx;
1196:   const PetscInt   *ddiag;
1197:   PetscReal        rs;
1198:   MatScalar        d;
1199:   PetscInt         inod,nodesz,node_max,col;
1200:   const PetscInt   *ns;
1201:   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202: 
1204:   /* MatPivotSetUp(): initialize shift context sctx */
1205:   PetscMemzero(&sctx,sizeof(FactorShiftCtx));

        /*
           For positive-definite shifting, compute an upper bound for the shift:
           shift_top starts at info->zeropivot and is raised to the largest
           per-row value rs = sum_j |a_ij| - |a_ii| - Re(a_ii), where the sum runs
           over every stored entry of the row (diagonal included) and the
           diagonal is located through a->diag. The bound is then enlarged by 10%.
        */
1207:   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208:     ddiag          = a->diag;
1209:     sctx.shift_top = info->zeropivot;
1210:     for (i=0; i<n; i++) {
1211:       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212:       d  = (aa)[ddiag[i]];
1213:       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214:       v  = aa+ai[i];
1215:       nz = ai[i+1] - ai[i];
1216:       for (j=0; j<nz; j++)
1217:         rs += PetscAbsScalar(v[j]);
1218:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219:     }
1220:     sctx.shift_top   *= 1.1;
1221:     sctx.nshift_max   = 5;
1222:     sctx.shift_lo     = 0.;
1223:     sctx.shift_hi     = 1.;
1224:   }

        /* r: row permutation indices (from b->row); ic: indices of b->icol, used to map columns of A */
1226:   ISGetIndices(isrow,&r);
1227:   ISGetIndices(isicol,&ic);
1228: 
        /*
           Work space: three dense rows of length n (rtmp1, rtmp2, rtmp3) carved
           out of a single zeroed allocation of 3*n+1 entries.
           NOTE(review): the buffer is sized with sizeof(PetscScalar) although the
           pointers are MatScalar* - harmless if PetscScalar is at least as large
           as MatScalar; confirm.
        */
1229:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1230:   PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1231:   rtmp2 = rtmp1 + n;
1232:   rtmp3 = rtmp2 + n;
1233:   ics   = ic;

1235:   node_max = a->inode.node_count;
1236:   ns       = a->inode.size;
        /* NOTE(review): this error path is taken after rtmp1 was allocated and r/ic were obtained; nothing is released here */
1237:   if (!ns){
1238:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239:   }

1241:   /* If max inode size > 3, split it into two inodes.*/
1242:   /* also map the inode sizes according to the ordering */
        /* tmp_vec1 receives the split sizes; j counts the entries written, so a split inode contributes two entries */
1243:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1244:   for (i=0,j=0; i<node_max; ++i,++j){
1245:     if (ns[i]>3) {
1246:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] <= 5 */
1247:       ++j;
1248:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249:     } else {
1250:       tmp_vec1[j] = ns[i];
1251:     }
1252:   }
1253:   /* Use the correct node_max */
1254:   node_max = j;
1255: 
1256:   /* Now reorder the inode info based on mat re-ordering info */
1257:   /* First create a row -> inode_size_array_index map */
1258:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1259:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1260:   for (i=0,row=0; i<node_max; i++) {
1261:     nodesz = tmp_vec1[i];
1262:     for (j=0; j<nodesz; j++,row++) {
1263:       nsmap[row] = i;
1264:     }
1265:   }
1266:   /* Using nsmap, create a reordered ns structure */
        /*
           j is the first (permuted) row of node i; the size of the node that
           contains row r[j] becomes the size of node i, and j then skips over it.
           NOTE(review): this presumes the ordering keeps the rows of a node
           together - confirm against the ordering code.
        */
1267:   for (i=0,j=0; i< node_max; i++) {
1268:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1269:     tmp_vec2[i]  = nodesz;
1270:     j           += nodesz;
1271:   }
1272:   PetscFree(nsmap);
1273:   PetscFree(tmp_vec1);

1275:   /* Now use the correct ns */
1276:   ns = tmp_vec2;

        /*
           The whole factorization is repeated for as long as sctx.useshift is set
           at the end of a pass. Inside this function the flag is cleared at the
           top of each pass and set by the refinement step at the bottom;
           MatPivotCheck() (defined elsewhere) presumably sets it as well when a
           new shift is required - TODO confirm.
        */
1278:   do {
1279:     sctx.useshift = PETSC_FALSE;
1280:     /* Now loop over each block-row, and do the factorization */
1281:     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1282:       nodesz = ns[inod];
1283: 
1284:       switch (nodesz){
1285:       case 1:
1286:       /*----------*/
1287:         /* zero rtmp1 */
1288:         /* L part */
1289:         nz    = bi[i+1] - bi[i];
1290:         bjtmp = bj + bi[i];
1291:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;

1293:         /* U part */
                /* nz counts the diagonal as well: positions bdiag[i+1]+1 .. bdiag[i] */
1294:         nz = bdiag[i]-bdiag[i+1];
1295:         bjtmp = bj + bdiag[i+1]+1;
1296:         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1297: 
1298:         /* load in initial (unfactored row) */
                /* row r[i] of A is scattered into rtmp1, with column indices mapped through ics */
1299:         nz    = ai[r[i]+1] - ai[r[i]];
1300:         ajtmp = aj + ai[r[i]];
1301:         v     = aa + ai[r[i]];
1302:         for (j=0; j<nz; j++) {
1303:           rtmp1[ics[ajtmp[j]]] = v[j];
1304:         }
1305:         /* ZeropivotApply() */
1306:         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1307: 
1308:         /* elimination */
                /*
                   Visit the nzL column indices of L(i,:); each one names an already
                   factored pivot row. The multiplier is the current entry times the
                   stored (inverted) diagonal of that pivot row, and it overwrites the
                   entry. The trailing "row = *bjtmp++" of the last iteration reads
                   one index past L(i,:); that value is not used.
                */
1309:         bjtmp = bj + bi[i];
1310:         row   = *bjtmp++;
1311:         nzL   = bi[i+1] - bi[i];
1312:         for(k=0; k < nzL;k++) {
1313:           pc = rtmp1 + row;
1314:           if (*pc != 0.0) {
1315:             pv   = b->a + bdiag[row];
1316:             mul1 = *pc * (*pv);
1317:             *pc  = mul1;
1318:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1319:             pv = b->a + bdiag[row+1]+1;
1320:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1321:             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1322:             PetscLogFlops(2.0*nz);
1323:           }
1324:           row = *bjtmp++;
1325:         }

1327:         /* finished row so stick it into b->a */
                /* rs accumulates the absolute values of the off-diagonal entries copied out, for the pivot check */
1328:         rs = 0.0;
1329:         /* L part */
1330:         pv = b->a + bi[i] ;
1331:         pj = b->j + bi[i] ;
1332:         nz = bi[i+1] - bi[i];
1333:         for (j=0; j<nz; j++) {
1334:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335:         }

1337:         /* U part */
1338:         pv = b->a + bdiag[i+1]+1;
1339:         pj = b->j + bdiag[i+1]+1;
1340:         nz = bdiag[i] - bdiag[i+1]-1;
1341:         for (j=0; j<nz; j++) {
1342:           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1343:         }

1345:         /* Check zero pivot */
1346:         sctx.rs = rs;
1347:         sctx.pv = rtmp1[i];
1348:         MatPivotCheck(info,sctx,i);
1349: 
1350:         /* Mark diagonal and invert diagonal for simpler triangular solves */
1351:         pv  = b->a + bdiag[i];
1352:         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353:         break;
1354: 
1355:       case 2:
1356:       /*----------*/
                /*
                   Two rows at once: rtmp1 holds row i, rtmp2 holds row i+1. Both are
                   zeroed over the pattern of factor row i and loaded with the column
                   indices and count of row r[i] of A; the values of the second row are
                   read from row r[i]+1 of A.
                */
1357:         /* zero rtmp1 and rtmp2 */
1358:         /* L part */
1359:         nz    = bi[i+1] - bi[i];
1360:         bjtmp = bj + bi[i];
1361:         for  (j=0; j<nz; j++) {
1362:           col = bjtmp[j];
1363:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364:         }

1366:         /* U part */
1367:         nz = bdiag[i]-bdiag[i+1];
1368:         bjtmp = bj + bdiag[i+1]+1;
1369:         for  (j=0; j<nz; j++) {
1370:           col = bjtmp[j];
1371:           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372:         }
1373: 
1374:         /* load in initial (unfactored row) */
1375:         nz    = ai[r[i]+1] - ai[r[i]];
1376:         ajtmp = aj + ai[r[i]];
1377:         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378:         for (j=0; j<nz; j++) {
1379:           col = ics[ajtmp[j]];
1380:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381:         }
1382:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384: 
1385:         /* elimination */
                /* same sweep as in case 1, applied to both work rows with the pivot rows listed in L(i,:) */
1386:         bjtmp = bj + bi[i];
1387:         row   = *bjtmp++; /* pivot row */
1388:         nzL   = bi[i+1] - bi[i];
1389:         for(k=0; k < nzL;k++) {
1390:           pc1 = rtmp1 + row;
1391:           pc2 = rtmp2 + row;
1392:           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393:             pv   = b->a + bdiag[row];
1394:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395:             *pc1 = mul1;       *pc2 = mul2;

1397:             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398:             pv = b->a + bdiag[row+1]+1;
1399:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400:             for (j=0; j<nz; j++){
1401:               col = pj[j];
1402:               rtmp1[col] -= mul1 * pv[j];
1403:               rtmp2[col] -= mul2 * pv[j];
1404:             }
1405:             PetscLogFlops(4*nz);
1406:           }
1407:           row = *bjtmp++;
1408:         }

1410:         /* finished row i; check zero pivot, then stick row i into b->a */
1411:         rs  = 0.0;
1412:         /* L part */
1413:         pc1 = b->a + bi[i];
1414:         pj  = b->j + bi[i] ;
1415:         nz  = bi[i+1] - bi[i];
1416:         for (j=0; j<nz; j++) {
1417:           col = pj[j];
1418:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419:         }
1420:         /* U part */
1421:         pc1 = b->a + bdiag[i+1]+1;
1422:         pj  = b->j + bdiag[i+1]+1;
1423:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424:         for (j=0; j<nz; j++) {
1425:           col = pj[j];
1426:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427:         }
1428: 
1429:         sctx.rs  = rs;
1430:         sctx.pv  = rtmp1[i];
1431:         MatPivotCheck(info,sctx,i);
1432:         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433:         *pc1 = 1.0/sctx.pv;

1435:         /* Now take care of diagonal 2x2 block. */
                /* eliminate entry (i+1,i) using the just-finished row i, whose U values are still in rtmp1 */
1436:         pc2 = rtmp2 + i;
1437:         if (*pc2 != 0.0){
1438:           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439:           *pc2 = mul1;          /* insert L entry */
1440:           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441:           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442:           for (j=0; j<nz; j++) {
1443:             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444:           }
1445:           PetscLogFlops(2*nz);
1446:         }

1448:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449:         rs = 0.0;
1450:         /* L part */
1451:         pc2 = b->a + bi[i+1];
1452:         pj  = b->j + bi[i+1] ;
1453:         nz  = bi[i+2] - bi[i+1];
1454:         for (j=0; j<nz; j++) {
1455:           col = pj[j];
1456:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457:         }
1458:         /* U part */
1459:         pc2 = b->a + bdiag[i+2]+1;
1460:         pj  = b->j + bdiag[i+2]+1;
1461:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462:         for (j=0; j<nz; j++) {
1463:           col = pj[j];
1464:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465:         }

1467:         sctx.rs  = rs;
1468:         sctx.pv  = rtmp2[i+1];
1469:         MatPivotCheck(info,sctx,i+1);
1470:         pc2  = b->a + bdiag[i+1];
1471:         *pc2 = 1.0/sctx.pv;
1472:         break;

1474:       case 3:
1475:       /*----------*/
                /*
                   Three rows at once: rtmp1, rtmp2, rtmp3 hold rows i, i+1, i+2. As in
                   case 2 they are zeroed over the pattern of factor row i and loaded
                   with the column indices of row r[i] of A; values come from rows
                   r[i], r[i]+1 and r[i]+2 of A.
                */
1476:         /* zero rtmp */
1477:         /* L part */
1478:         nz    = bi[i+1] - bi[i];
1479:         bjtmp = bj + bi[i];
1480:         for  (j=0; j<nz; j++) {
1481:           col = bjtmp[j];
1482:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1483:         }

1485:         /* U part */
1486:         nz = bdiag[i]-bdiag[i+1];
1487:         bjtmp = bj + bdiag[i+1]+1;
1488:         for  (j=0; j<nz; j++) {
1489:           col = bjtmp[j];
1490:           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1491:         }
1492: 
1493:         /* load in initial (unfactored row) */
1494:         nz    = ai[r[i]+1] - ai[r[i]];
1495:         ajtmp = aj + ai[r[i]];
1496:         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1497:         for (j=0; j<nz; j++) {
1498:           col = ics[ajtmp[j]];
1499:           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1500:         }
1501:         /* ZeropivotApply(): shift the diagonal of the matrix  */
1502:         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1503: 
1504:         /* elimination */
1505:         bjtmp = bj + bi[i];
1506:         row   = *bjtmp++; /* pivot row */
1507:         nzL   = bi[i+1] - bi[i];
1508:         for(k=0; k < nzL;k++) {
1509:           pc1 = rtmp1 + row;
1510:           pc2 = rtmp2 + row;
1511:           pc3 = rtmp3 + row;
1512:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1513:             pv  = b->a + bdiag[row];
1514:             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1515:             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;

1517:             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1518:             pv = b->a + bdiag[row+1]+1;
1519:             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1520:             for (j=0; j<nz; j++){
1521:               col = pj[j];
1522:               rtmp1[col] -= mul1 * pv[j];
1523:               rtmp2[col] -= mul2 * pv[j];
1524:               rtmp3[col] -= mul3 * pv[j];
1525:             }
1526:             PetscLogFlops(6*nz);
1527:           }
1528:           row = *bjtmp++;
1529:         }

1531:         /* finished row i; check zero pivot, then stick row i into b->a */
1532:         rs  = 0.0;
1533:         /* L part */
1534:         pc1 = b->a + bi[i];
1535:         pj  = b->j + bi[i] ;
1536:         nz  = bi[i+1] - bi[i];
1537:         for (j=0; j<nz; j++) {
1538:           col = pj[j];
1539:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1540:         }
1541:         /* U part */
1542:         pc1 = b->a + bdiag[i+1]+1;
1543:         pj  = b->j + bdiag[i+1]+1;
1544:         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1545:         for (j=0; j<nz; j++) {
1546:           col = pj[j];
1547:           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1548:         }
1549: 
1550:         sctx.rs  = rs;
1551:         sctx.pv  = rtmp1[i];
1552:         MatPivotCheck(info,sctx,i);
1553:         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1554:         *pc1 = 1.0/sctx.pv;

1556:         /* Now take care of 1st column of diagonal 3x3 block. */
                /* eliminate entries (i+1,i) and (i+2,i) using row i, whose U values are still in rtmp1 */
1557:         pc2 = rtmp2 + i;
1558:         pc3 = rtmp3 + i;
1559:         if (*pc2 != 0.0 || *pc3 != 0.0){
1560:           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561:           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562:           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1563:           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564:           for (j=0; j<nz; j++) {
1565:             col = pj[j];
1566:             rtmp2[col] -= mul2 * rtmp1[col];
1567:             rtmp3[col] -= mul3 * rtmp1[col];
1568:           }
1569:           PetscLogFlops(4*nz);
1570:         }
1571: 
1572:         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573:         rs = 0.0;
1574:         /* L part */
1575:         pc2 = b->a + bi[i+1];
1576:         pj  = b->j + bi[i+1] ;
1577:         nz  = bi[i+2] - bi[i+1];
1578:         for (j=0; j<nz; j++) {
1579:           col = pj[j];
1580:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581:         }
1582:         /* U part */
1583:         pc2 = b->a + bdiag[i+2]+1;
1584:         pj  = b->j + bdiag[i+2]+1;
1585:         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586:         for (j=0; j<nz; j++) {
1587:           col = pj[j];
1588:           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589:         }

1591:         sctx.rs  = rs;
1592:         sctx.pv  = rtmp2[i+1];
1593:         MatPivotCheck(info,sctx,i+1);
1594:         pc2  = b->a + bdiag[i+1];
1595:         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */

1597:         /* Now take care of 2nd column of diagonal 3x3 block. */
                /* eliminate entry (i+2,i+1) using row i+1, whose U values are still in rtmp2 */
1598:         pc3 = rtmp3 + i+1;
1599:         if (*pc3 != 0.0){
1600:           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1601:           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1602:           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1603:           for (j=0; j<nz; j++) {
1604:             col = pj[j];
1605:             rtmp3[col] -= mul3 * rtmp2[col];
1606:           }
1607:           PetscLogFlops(2*nz);
1608:         }

1610:         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1611:         rs = 0.0;
1612:         /* L part */
1613:         pc3 = b->a + bi[i+2];
1614:         pj  = b->j + bi[i+2] ;
1615:         nz  = bi[i+3] - bi[i+2];
1616:         for (j=0; j<nz; j++) {
1617:           col = pj[j];
1618:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1619:         }
1620:         /* U part */
1621:         pc3 = b->a + bdiag[i+3]+1;
1622:         pj  = b->j + bdiag[i+3]+1;
1623:         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1624:         for (j=0; j<nz; j++) {
1625:           col = pj[j];
1626:           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1627:         }

1629:         sctx.rs  = rs;
1630:         sctx.pv  = rtmp3[i+2];
1631:         MatPivotCheck(info,sctx,i+2);
1632:         pc3  = b->a + bdiag[i+2];
1633:         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1634:         break;

              /* NOTE(review): reached only if a (split and reordered) node size is not 1, 2 or 3; work arrays are not released on this path */
1636:       default:
1637:         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1638:       }
1639:       i += nodesz;                 /* Update the row */
1640:     }

1642:     /* MatPivotRefine() */
            /* bisection on the shift fraction: the last fraction becomes the new upper end and the midpoint of [shift_lo, shift_hi] is tried next */
1643:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1644:       /* 
1645:        * if no shift in this attempt & shifting & started shifting & can refine,
1646:        * then try lower shift
1647:        */
1648:       sctx.shift_hi       = sctx.shift_fraction;
1649:       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1650:       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1651:       sctx.useshift       = PETSC_TRUE;
1652:       sctx.nshift++;
1653:     }
1654:   } while (sctx.useshift);

        /* release the work rows and the reordered inode sizes, and give back the index arrays */
1656:   PetscFree(rtmp1);
1657:   PetscFree(tmp_vec2);
1658:   ISRestoreIndices(isicol,&ic);
1659:   ISRestoreIndices(isrow,&r);

        /* install the plain SeqAIJ solve routines on the factor and flag it as ready */
1661:   C->ops->solve              = MatSolve_SeqAIJ;
1662:   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1663:   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1664:   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1665:   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1666:   C->assembled    = PETSC_TRUE;
1667:   C->preallocated = PETSC_TRUE;
1668:   PetscLogFlops(C->cmap->n);

1670:   /* MatShiftView(A,info,&sctx) */
        /* report the shifting that took place, with a message chosen by shift type */
1671:   if (sctx.nshift){
1672:     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1673:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1674:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1675:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1676:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1677:       PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);
1678:     }
1679:   }
1680:   Mat_CheckInode_FactorLU(C,PETSC_FALSE);
1681:   return(0);
1682: }

1686: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1687: {
1688:   Mat               C = B;
1689:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1690:   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1691:   PetscErrorCode    ierr;
1692:   const PetscInt    *r,*ic,*c,*ics;
1693:   PetscInt          n = A->rmap->n,*bi = b->i;
1694:   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1695:   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1696:   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1697:   PetscScalar       mul1,mul2,mul3,tmp;
1698:   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1699:   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1700:   PetscReal         rs=0.0;
1701:   FactorShiftCtx    sctx;
1702:   PetscInt          newshift;

1705:   sctx.shift_top      = 0;
1706:   sctx.nshift_max     = 0;
1707:   sctx.shift_lo       = 0;
1708:   sctx.shift_hi       = 0;
1709:   sctx.shift_fraction = 0;

1711:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1712:   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1713:     sctx.shift_top = 0;
1714:     for (i=0; i<n; i++) {
1715:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1716:       rs    = 0.0;
1717:       ajtmp = aj + ai[i];
1718:       rtmp1 = aa + ai[i];
1719:       nz = ai[i+1] - ai[i];
1720:       for (j=0; j<nz; j++){
1721:         if (*ajtmp != i){
1722:           rs += PetscAbsScalar(*rtmp1++);
1723:         } else {
1724:           rs -= PetscRealPart(*rtmp1++);
1725:         }
1726:         ajtmp++;
1727:       }
1728:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1729:     }
1730:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1731:     sctx.shift_top *= 1.1;
1732:     sctx.nshift_max = 5;
1733:     sctx.shift_lo   = 0.;
1734:     sctx.shift_hi   = 1.;
1735:   }
1736:   sctx.shift_amount = 0;
1737:   sctx.nshift       = 0;

1739:   ISGetIndices(isrow,&r);
1740:   ISGetIndices(iscol,&c);
1741:   ISGetIndices(isicol,&ic);
1742:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1743:   PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1744:   ics   = ic ;
1745:   rtmp22 = rtmp11 + n;
1746:   rtmp33 = rtmp22 + n;
1747: 
1748:   node_max = a->inode.node_count;
1749:   ns       = a->inode.size;
1750:   if (!ns){
1751:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1752:   }

1754:   /* If max inode size > 3, split it into two inodes.*/
1755:   /* also map the inode sizes according to the ordering */
1756:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1757:   for (i=0,j=0; i<node_max; ++i,++j){
1758:     if (ns[i]>3) {
1759:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1760:       ++j;
1761:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1762:     } else {
1763:       tmp_vec1[j] = ns[i];
1764:     }
1765:   }
1766:   /* Use the correct node_max */
1767:   node_max = j;

1769:   /* Now reorder the inode info based on mat re-ordering info */
1770:   /* First create a row -> inode_size_array_index map */
1771:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1772:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1773:   for (i=0,row=0; i<node_max; i++) {
1774:     nodesz = tmp_vec1[i];
1775:     for (j=0; j<nodesz; j++,row++) {
1776:       nsmap[row] = i;
1777:     }
1778:   }
1779:   /* Using nsmap, create a reordered ns structure */
1780:   for (i=0,j=0; i< node_max; i++) {
1781:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1782:     tmp_vec2[i]  = nodesz;
1783:     j           += nodesz;
1784:   }
1785:   PetscFree(nsmap);
1786:   PetscFree(tmp_vec1);
1787:   /* Now use the correct ns */
1788:   ns = tmp_vec2;

1790:   do {
1791:     sctx.useshift = PETSC_FALSE;
1792:     /* Now loop over each block-row, and do the factorization */
1793:     for (i=0,row=0; i<node_max; i++) {
1794:       nodesz = ns[i];
1795:       nz     = bi[row+1] - bi[row];
1796:       bjtmp  = bj + bi[row];

1798:       switch (nodesz){
1799:       case 1:
1800:         for  (j=0; j<nz; j++){
1801:           idx        = bjtmp[j];
1802:           rtmp11[idx] = 0.0;
1803:         }
1804: 
1805:         /* load in initial (unfactored row) */
1806:         idx    = r[row];
1807:         nz_tmp = ai[idx+1] - ai[idx];
1808:         ajtmp  = aj + ai[idx];
1809:         v1     = aa + ai[idx];

1811:         for (j=0; j<nz_tmp; j++) {
1812:           idx        = ics[ajtmp[j]];
1813:           rtmp11[idx] = v1[j];
1814:         }
1815:         rtmp11[ics[r[row]]] += sctx.shift_amount;

1817:         prow = *bjtmp++ ;
1818:         while (prow < row) {
1819:           pc1 = rtmp11 + prow;
1820:           if (*pc1 != 0.0){
1821:             pv   = ba + bd[prow];
1822:             pj   = nbj + bd[prow];
1823:             mul1 = *pc1 * *pv++;
1824:             *pc1 = mul1;
1825:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1826:             PetscLogFlops(2*nz_tmp);
1827:             for (j=0; j<nz_tmp; j++) {
1828:               tmp = pv[j];
1829:               idx = pj[j];
1830:               rtmp11[idx] -= mul1 * tmp;
1831:             }
1832:           }
1833:           prow = *bjtmp++ ;
1834:         }
1835:         pj  = bj + bi[row];
1836:         pc1 = ba + bi[row];

1838:         sctx.pv    = rtmp11[row];
1839:         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1840:         rs         = 0.0;
1841:         for (j=0; j<nz; j++) {
1842:           idx    = pj[j];
1843:           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1844:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1845:         }
1846:         sctx.rs  = rs;
1847:         MatLUCheckShift_inline(info,sctx,row,newshift);
1848:         if (newshift == 1) goto endofwhile;
1849:         break;
1850: 
1851:       case 2:
1852:         for (j=0; j<nz; j++) {
1853:           idx        = bjtmp[j];
1854:           rtmp11[idx] = 0.0;
1855:           rtmp22[idx] = 0.0;
1856:         }
1857: 
1858:         /* load in initial (unfactored row) */
1859:         idx    = r[row];
1860:         nz_tmp = ai[idx+1] - ai[idx];
1861:         ajtmp  = aj + ai[idx];
1862:         v1     = aa + ai[idx];
1863:         v2     = aa + ai[idx+1];
1864:         for (j=0; j<nz_tmp; j++) {
1865:           idx        = ics[ajtmp[j]];
1866:           rtmp11[idx] = v1[j];
1867:           rtmp22[idx] = v2[j];
1868:         }
1869:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1870:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;

1872:         prow = *bjtmp++ ;
1873:         while (prow < row) {
1874:           pc1 = rtmp11 + prow;
1875:           pc2 = rtmp22 + prow;
1876:           if (*pc1 != 0.0 || *pc2 != 0.0){
1877:             pv   = ba + bd[prow];
1878:             pj   = nbj + bd[prow];
1879:             mul1 = *pc1 * *pv;
1880:             mul2 = *pc2 * *pv;
1881:             ++pv;
1882:             *pc1 = mul1;
1883:             *pc2 = mul2;
1884: 
1885:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1886:             for (j=0; j<nz_tmp; j++) {
1887:               tmp = pv[j];
1888:               idx = pj[j];
1889:               rtmp11[idx] -= mul1 * tmp;
1890:               rtmp22[idx] -= mul2 * tmp;
1891:             }
1892:             PetscLogFlops(4*nz_tmp);
1893:           }
1894:           prow = *bjtmp++ ;
1895:         }

1897:         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1898:         pc1 = rtmp11 + prow;
1899:         pc2 = rtmp22 + prow;

1901:         sctx.pv = *pc1;
1902:         pj      = bj + bi[prow];
1903:         rs      = 0.0;
1904:         for (j=0; j<nz; j++){
1905:           idx = pj[j];
1906:           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1907:         }
1908:         sctx.rs = rs;
1909:         MatLUCheckShift_inline(info,sctx,row,newshift);
1910:         if (newshift == 1) goto endofwhile;

1912:         if (*pc2 != 0.0){
1913:           pj     = nbj + bd[prow];
1914:           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1915:           *pc2   = mul2;
1916:           nz_tmp = bi[prow+1] - bd[prow] - 1;
1917:           for (j=0; j<nz_tmp; j++) {
1918:             idx = pj[j] ;
1919:             tmp = rtmp11[idx];
1920:             rtmp22[idx] -= mul2 * tmp;
1921:           }
1922:           PetscLogFlops(2*nz_tmp);
1923:         }
1924: 
1925:         pj  = bj + bi[row];
1926:         pc1 = ba + bi[row];
1927:         pc2 = ba + bi[row+1];

1929:         sctx.pv = rtmp22[row+1];
1930:         rs = 0.0;
1931:         rtmp11[row]   = 1.0/rtmp11[row];
1932:         rtmp22[row+1] = 1.0/rtmp22[row+1];
1933:         /* copy row entries from dense representation to sparse */
1934:         for (j=0; j<nz; j++) {
1935:           idx    = pj[j];
1936:           pc1[j] = rtmp11[idx];
1937:           pc2[j] = rtmp22[idx];
1938:           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1939:         }
1940:         sctx.rs = rs;
1941:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
1942:         if (newshift == 1) goto endofwhile;
1943:         break;

1945:       case 3:
1946:         for  (j=0; j<nz; j++) {
1947:           idx        = bjtmp[j];
1948:           rtmp11[idx] = 0.0;
1949:           rtmp22[idx] = 0.0;
1950:           rtmp33[idx] = 0.0;
1951:         }
1952:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1953:         idx    = r[row];
1954:         nz_tmp = ai[idx+1] - ai[idx];
1955:         ajtmp = aj + ai[idx];
1956:         v1    = aa + ai[idx];
1957:         v2    = aa + ai[idx+1];
1958:         v3    = aa + ai[idx+2];
1959:         for (j=0; j<nz_tmp; j++) {
1960:           idx        = ics[ajtmp[j]];
1961:           rtmp11[idx] = v1[j];
1962:           rtmp22[idx] = v2[j];
1963:           rtmp33[idx] = v3[j];
1964:         }
1965:         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1966:         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1967:         rtmp33[ics[r[row+2]]] += sctx.shift_amount;

1969:         /* loop over all pivot row blocks above this row block */
1970:         prow = *bjtmp++ ;
1971:         while (prow < row) {
1972:           pc1 = rtmp11 + prow;
1973:           pc2 = rtmp22 + prow;
1974:           pc3 = rtmp33 + prow;
1975:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1976:             pv   = ba  + bd[prow];
1977:             pj   = nbj + bd[prow];
1978:             mul1 = *pc1 * *pv;
1979:             mul2 = *pc2 * *pv;
1980:             mul3 = *pc3 * *pv;
1981:             ++pv;
1982:             *pc1 = mul1;
1983:             *pc2 = mul2;
1984:             *pc3 = mul3;
1985: 
1986:             nz_tmp = bi[prow+1] - bd[prow] - 1;
1987:             /* update this row based on pivot row */
1988:             for (j=0; j<nz_tmp; j++) {
1989:               tmp = pv[j];
1990:               idx = pj[j];
1991:               rtmp11[idx] -= mul1 * tmp;
1992:               rtmp22[idx] -= mul2 * tmp;
1993:               rtmp33[idx] -= mul3 * tmp;
1994:             }
1995:             PetscLogFlops(6*nz_tmp);
1996:           }
1997:           prow = *bjtmp++ ;
1998:         }

2000:         /* Now take care of diagonal 3x3 block in this set of rows */
2001:         /* note: prow = row here */
2002:         pc1 = rtmp11 + prow;
2003:         pc2 = rtmp22 + prow;
2004:         pc3 = rtmp33 + prow;

2006:         sctx.pv = *pc1;
2007:         pj      = bj + bi[prow];
2008:         rs      = 0.0;
2009:         for (j=0; j<nz; j++){
2010:           idx = pj[j];
2011:           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2012:         }
2013:         sctx.rs = rs;
2014:         MatLUCheckShift_inline(info,sctx,row,newshift);
2015:         if (newshift == 1) goto endofwhile;

2017:         if (*pc2 != 0.0 || *pc3 != 0.0){
2018:           mul2 = (*pc2)/(*pc1);
2019:           mul3 = (*pc3)/(*pc1);
2020:           *pc2 = mul2;
2021:           *pc3 = mul3;
2022:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2023:           pj     = nbj + bd[prow];
2024:           for (j=0; j<nz_tmp; j++) {
2025:             idx = pj[j] ;
2026:             tmp = rtmp11[idx];
2027:             rtmp22[idx] -= mul2 * tmp;
2028:             rtmp33[idx] -= mul3 * tmp;
2029:           }
2030:           PetscLogFlops(4*nz_tmp);
2031:         }
2032:         ++prow;

2034:         pc2 = rtmp22 + prow;
2035:         pc3 = rtmp33 + prow;
2036:         sctx.pv = *pc2;
2037:         pj      = bj + bi[prow];
2038:         rs      = 0.0;
2039:         for (j=0; j<nz; j++){
2040:           idx = pj[j];
2041:           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2042:         }
2043:         sctx.rs = rs;
2044:         MatLUCheckShift_inline(info,sctx,row+1,newshift);
2045:         if (newshift == 1) goto endofwhile;

2047:         if (*pc3 != 0.0){
2048:           mul3   = (*pc3)/(*pc2);
2049:           *pc3   = mul3;
2050:           pj     = nbj + bd[prow];
2051:           nz_tmp = bi[prow+1] - bd[prow] - 1;
2052:           for (j=0; j<nz_tmp; j++) {
2053:             idx = pj[j] ;
2054:             tmp = rtmp22[idx];
2055:             rtmp33[idx] -= mul3 * tmp;
2056:           }
2057:           PetscLogFlops(2*nz_tmp);
2058:         }

2060:         pj  = bj + bi[row];
2061:         pc1 = ba + bi[row];
2062:         pc2 = ba + bi[row+1];
2063:         pc3 = ba + bi[row+2];

2065:         sctx.pv = rtmp33[row+2];
2066:         rs = 0.0;
2067:         rtmp11[row]   = 1.0/rtmp11[row];
2068:         rtmp22[row+1] = 1.0/rtmp22[row+1];
2069:         rtmp33[row+2] = 1.0/rtmp33[row+2];
2070:         /* copy row entries from dense representation to sparse */
2071:         for (j=0; j<nz; j++) {
2072:           idx    = pj[j];
2073:           pc1[j] = rtmp11[idx];
2074:           pc2[j] = rtmp22[idx];
2075:           pc3[j] = rtmp33[idx];
2076:           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2077:         }

2079:         sctx.rs = rs;
2080:         MatLUCheckShift_inline(info,sctx,row+2,newshift);
2081:         if (newshift == 1) goto endofwhile;
2082:         break;

2084:       default:
2085:         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2086:       }
2087:       row += nodesz;                 /* Update the row */
2088:     }
2089:     endofwhile:;
2090:   } while (sctx.useshift);
2091:   PetscFree(rtmp11);
2092:   PetscFree(tmp_vec2);
2093:   ISRestoreIndices(isicol,&ic);
2094:   ISRestoreIndices(isrow,&r);
2095:   ISRestoreIndices(iscol,&c);
2096:   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2097:   /* do not set solve add, since MatSolve_Inode + Add is faster */
2098:   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2099:   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2100:   C->assembled   = PETSC_TRUE;
2101:   C->preallocated = PETSC_TRUE;
2102:   if (sctx.nshift) {
2103:     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2104:       PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
2105:     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2106:       PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
2107:     }
2108:   }
2109:   PetscLogFlops(C->cmap->n);
2110:   Mat_CheckInode(C,PETSC_FALSE);
2111:   return(0);
2112: }


2115: /* ----------------------------------------------------------- */
/*
   MatSolve_SeqAIJ_Inode - Triangular solve with a factored SeqAIJ matrix that
   processes the rows one inode (group of rows handled together) at a time.

   Input Parameters:
     A  - the matrix; its data is read as a Mat_SeqAIJ (a->i, a->j, a->a, a->diag,
          a->row, a->col, a->solve_work and a->inode.{size,node_count} are used)
     bb - right-hand-side vector (only read)

   Output Parameter:
     xx - solution vector; entry c[row] is written for every row

   Notes:
   The routine raises PETSC_ERR_COR if a->inode.size is not set, or if an inode
   has a size other than 1..5.

   The forward sweep stores its result in a->solve_work (tmp); the backward
   sweep overwrites tmp in place and scatters each value to x through the
   column permutation c.

   NOTE(review): in this listing `ierr` is declared but never assigned, i.e. the
   return values of VecGetArray/ISGetIndices/etc. are not checked.  The two
   `default:` branches also leave through SETERRQ (presumably a return) without
   restoring the index sets and arrays - confirm whether that matters to callers.
*/
2118: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2119: {
2120:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2121:   IS                iscol = a->col,isrow = a->row;
2122:   PetscErrorCode    ierr;
2123:   const PetscInt    *r,*c,*rout,*cout;
2124:   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2125:   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2126:   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2127:   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2128:   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2129:   const PetscScalar *b;

2132:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2133:   node_max = a->inode.node_count;
2134:   ns       = a->inode.size;     /* Node Size array */

  /* b is declared const; the cast only satisfies the VecGetArray() signature */
2136:   VecGetArray(bb,(PetscScalar**)&b);
2137:   VecGetArray(xx,&x);
2138:   tmp  = a->solve_work;
2139: 
2140:   ISGetIndices(isrow,&rout); r = rout;
2141:   ISGetIndices(iscol,&cout); c = cout;
2142: 
2143:   /* forward solve the lower triangular */
  /* tmps aliases tmp: values written for earlier rows are read back through tmps */
2144:   tmps = tmp ;
2145:   aa   = a_a ;
2146:   aj   = a_j ;
2147:   ad   = a->diag;

  /*
     One pass per inode.  `row` is the first row of inode i and is advanced by
     the tmp[row++] stores inside each case.  v1/vi address the entries of that
     first row starting at ai[row], and nz = ai[row+1]-ai[row] is their count.
     The remaining rows of the inode reuse the same column indices vi for
     their first nz entries; their entries at offsets nz, nz+1, ... multiply
     the sums of the preceding rows of the same inode.
  */
2149:   for (i = 0,row = 0; i< node_max; ++i){
2150:     nsz = ns[i];
2151:     aii = ai[row];
2152:     v1  = aa + aii;
2153:     vi  = aj + aii;
2154:     nz  = ai[row+1]- ai[row];
2155: 
2156:     if (i < node_max-1) {
2157:       /* Prefetch the indices for the next block */
2158:       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2159:       /* Prefetch the data for the next block */
2160:       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2161:     }

2163:     switch (nsz){               /* Each loop in 'case' is unrolled */
2164:     case 1 :
2165:       sum1 = b[r[row]];
      /* two entries per iteration; the odd leftover entry is handled after the loop */
2166:       for(j=0; j<nz-1; j+=2){
2167:         i0   = vi[j];
2168:         i1   = vi[j+1];
2169:         tmp0 = tmps[i0];
2170:         tmp1 = tmps[i1];
2171:         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2172:       }
2173:       if(j == nz-1){
2174:         tmp0 = tmps[vi[j]];
2175:         sum1 -= v1[j]*tmp0;
2176:       }
2177:       tmp[row++]=sum1;
2178:       break;
2179:     case 2:
2180:       sum1 = b[r[row]];
2181:       sum2 = b[r[row+1]];
2182:       v2   = aa + ai[row+1];

2184:       for(j=0; j<nz-1; j+=2){
2185:         i0   = vi[j];
2186:         i1   = vi[j+1];
2187:         tmp0 = tmps[i0];
2188:         tmp1 = tmps[i1];
2189:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2190:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2191:       }
2192:       if(j == nz-1){
2193:         tmp0 = tmps[vi[j]];
2194:         sum1 -= v1[j] *tmp0;
2195:         sum2 -= v2[j] *tmp0;
2196:       }
      /* v2[nz] couples the second row to the first row of this inode */
2197:       sum2 -= v2[nz] * sum1;
2198:       tmp[row ++]=sum1;
2199:       tmp[row ++]=sum2;
2200:       break;
2201:     case 3:
2202:       sum1 = b[r[row]];
2203:       sum2 = b[r[row+1]];
2204:       sum3 = b[r[row+2]];
2205:       v2   = aa + ai[row+1];
2206:       v3   = aa + ai[row+2];
2207: 
2208:       for (j=0; j<nz-1; j+=2){
2209:         i0   = vi[j];
2210:         i1   = vi[j+1];
2211:         tmp0 = tmps[i0];
2212:         tmp1 = tmps[i1];
2213:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2214:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2215:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2216:       }
2217:       if (j == nz-1){
2218:         tmp0 = tmps[vi[j]];
2219:         sum1 -= v1[j] *tmp0;
2220:         sum2 -= v2[j] *tmp0;
2221:         sum3 -= v3[j] *tmp0;
2222:       }
      /* in-inode couplings: entries nz, nz+1 multiply the sums of the rows above */
2223:       sum2 -= v2[nz] * sum1;
2224:       sum3 -= v3[nz] * sum1;
2225:       sum3 -= v3[nz+1] * sum2;
2226:       tmp[row ++]=sum1;
2227:       tmp[row ++]=sum2;
2228:       tmp[row ++]=sum3;
2229:       break;
2230: 
2231:     case 4:
2232:       sum1 = b[r[row]];
2233:       sum2 = b[r[row+1]];
2234:       sum3 = b[r[row+2]];
2235:       sum4 = b[r[row+3]];
2236:       v2   = aa + ai[row+1];
2237:       v3   = aa + ai[row+2];
2238:       v4   = aa + ai[row+3];
2239: 
2240:       for (j=0; j<nz-1; j+=2){
2241:         i0   = vi[j];
2242:         i1   = vi[j+1];
2243:         tmp0 = tmps[i0];
2244:         tmp1 = tmps[i1];
2245:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2246:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2247:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2248:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2249:       }
2250:       if (j == nz-1){
2251:         tmp0 = tmps[vi[j]];
2252:         sum1 -= v1[j] *tmp0;
2253:         sum2 -= v2[j] *tmp0;
2254:         sum3 -= v3[j] *tmp0;
2255:         sum4 -= v4[j] *tmp0;
2256:       }
      /* in-inode couplings, applied in row order so each sum is final before it is used */
2257:       sum2 -= v2[nz] * sum1;
2258:       sum3 -= v3[nz] * sum1;
2259:       sum4 -= v4[nz] * sum1;
2260:       sum3 -= v3[nz+1] * sum2;
2261:       sum4 -= v4[nz+1] * sum2;
2262:       sum4 -= v4[nz+2] * sum3;
2263: 
2264:       tmp[row ++]=sum1;
2265:       tmp[row ++]=sum2;
2266:       tmp[row ++]=sum3;
2267:       tmp[row ++]=sum4;
2268:       break;
2269:     case 5:
2270:       sum1 = b[r[row]];
2271:       sum2 = b[r[row+1]];
2272:       sum3 = b[r[row+2]];
2273:       sum4 = b[r[row+3]];
2274:       sum5 = b[r[row+4]];
2275:       v2   = aa + ai[row+1];
2276:       v3   = aa + ai[row+2];
2277:       v4   = aa + ai[row+3];
2278:       v5   = aa + ai[row+4];
2279: 
2280:       for (j=0; j<nz-1; j+=2){
2281:         i0   = vi[j];
2282:         i1   = vi[j+1];
2283:         tmp0 = tmps[i0];
2284:         tmp1 = tmps[i1];
2285:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2286:         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2287:         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2288:         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2289:         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2290:       }
2291:       if (j == nz-1){
2292:         tmp0 = tmps[vi[j]];
2293:         sum1 -= v1[j] *tmp0;
2294:         sum2 -= v2[j] *tmp0;
2295:         sum3 -= v3[j] *tmp0;
2296:         sum4 -= v4[j] *tmp0;
2297:         sum5 -= v5[j] *tmp0;
2298:       }

      /* in-inode couplings, applied in row order so each sum is final before it is used */
2300:       sum2 -= v2[nz] * sum1;
2301:       sum3 -= v3[nz] * sum1;
2302:       sum4 -= v4[nz] * sum1;
2303:       sum5 -= v5[nz] * sum1;
2304:       sum3 -= v3[nz+1] * sum2;
2305:       sum4 -= v4[nz+1] * sum2;
2306:       sum5 -= v5[nz+1] * sum2;
2307:       sum4 -= v4[nz+2] * sum3;
2308:       sum5 -= v5[nz+2] * sum3;
2309:       sum5 -= v5[nz+3] * sum4;
2310: 
2311:       tmp[row ++]=sum1;
2312:       tmp[row ++]=sum2;
2313:       tmp[row ++]=sum3;
2314:       tmp[row ++]=sum4;
2315:       tmp[row ++]=sum5;
2316:       break;
2317:     default:
2318:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2319:     }
2320:   }
2321:   /* backward solve the upper triangular */
  /*
     Inodes are visited last to first and `row` starts at the LAST row of the
     inode, decreasing through the `row--` statements in each case.  For that
     row the entries used start at ad[row+1]+1 and nz = ad[row]-ad[row+1]-1, so
     v1[nz] is the entry at ad[row]; the sum is MULTIPLIED by it, i.e. the
     stored diagonal is presumably already inverted by the factorization.
     The first pass reads ad[n], so a->diag is assumed to hold n+1 entries -
     TODO confirm.  Each row above the last one inside the inode has extra
     leading entries (offset 0, 1, ...) that couple it to the rows below it in
     the same inode, which is why v2 is indexed at j+1, v3 at j+2, and so on.
  */
2322:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2323:     nsz = ns[i];
2324:     aii = ad[row+1] + 1;
2325:     v1  = aa + aii;
2326:     vi  = aj + aii;
2327:     nz  = ad[row]- ad[row+1] - 1;
2328: 
2329:     if (i > 0) {
2330:       /* Prefetch the indices for the next block */
2331:       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2332:       /* Prefetch the data for the next block */
2333:       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2334:     }

2336:     switch (nsz){               /* Each loop in 'case' is unrolled */
2337:     case 1 :
2338:       sum1 = tmp[row];

2340:       for(j=0 ; j<nz-1; j+=2){
2341:         i0   = vi[j];
2342:         i1   = vi[j+1];
2343:         tmp0 = tmps[i0];
2344:         tmp1 = tmps[i1];
2345:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2346:       }
2347:       if (j == nz-1){
2348:         tmp0  = tmps[vi[j]];
2349:         sum1 -= v1[j]*tmp0;
2350:       }
      /* scale by the entry at ad[row] and store both in tmp and, permuted by c, in x */
2351:       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2352:       break;
2353:     case 2 :
2354:       sum1 = tmp[row];
2355:       sum2 = tmp[row-1];
      /* v2 addresses the entries of row-1; v2[0] multiplies the value solved for `row` */
2356:       v2   = aa + ad[row] + 1;
2357:       for (j=0 ; j<nz-1; j+=2){
2358:         i0   = vi[j];
2359:         i1   = vi[j+1];
2360:         tmp0 = tmps[i0];
2361:         tmp1 = tmps[i1];
2362:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2363:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2364:       }
2365:       if (j == nz-1){
2366:         tmp0  = tmps[vi[j]];
2367:         sum1 -= v1[j]* tmp0;
2368:         sum2 -= v2[j+1]* tmp0;
2369:       }
2370: 
2371:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2372:       sum2   -= v2[0] * tmp0;
2373:       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2374:       break;
2375:     case 3 :
2376:       sum1 = tmp[row];
2377:       sum2 = tmp[row -1];
2378:       sum3 = tmp[row -2];
2379:       v2   = aa + ad[row] + 1;
2380:       v3   = aa + ad[row -1] + 1;
2381:       for (j=0 ; j<nz-1; j+=2){
2382:         i0   = vi[j];
2383:         i1   = vi[j+1];
2384:         tmp0 = tmps[i0];
2385:         tmp1 = tmps[i1];
2386:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2387:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2388:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2389:       }
2390:       if (j== nz-1){
2391:         tmp0  = tmps[vi[j]];
2392:         sum1 -= v1[j] * tmp0;
2393:         sum2 -= v2[j+1] * tmp0;
2394:         sum3 -= v3[j+2] * tmp0;
2395:       }
      /* finish the rows bottom-up; each finished value is eliminated from the rows above it */
2396:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2397:       sum2   -= v2[0]* tmp0;
2398:       sum3   -= v3[1] * tmp0;
2399:       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2400:       sum3   -= v3[0]* tmp0;
2401:       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2402: 
2403:       break;
2404:     case 4 :
2405:       sum1 = tmp[row];
2406:       sum2 = tmp[row -1];
2407:       sum3 = tmp[row -2];
2408:       sum4 = tmp[row -3];
2409:       v2   = aa + ad[row]+1;
2410:       v3   = aa + ad[row -1]+1;
2411:       v4   = aa + ad[row -2]+1;

2413:       for (j=0 ; j<nz-1; j+=2){
2414:         i0   = vi[j];
2415:         i1   = vi[j+1];
2416:         tmp0 = tmps[i0];
2417:         tmp1 = tmps[i1];
2418:         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2419:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2420:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2421:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2422:       }
2423:       if (j== nz-1){
2424:         tmp0  = tmps[vi[j]];
2425:         sum1 -= v1[j] * tmp0;
2426:         sum2 -= v2[j+1] * tmp0;
2427:         sum3 -= v3[j+2] * tmp0;
2428:         sum4 -= v4[j+3] * tmp0;
2429:       }

      /* finish the rows bottom-up; each finished value is eliminated from the rows above it */
2431:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2432:       sum2   -= v2[0] * tmp0;
2433:       sum3   -= v3[1] * tmp0;
2434:       sum4   -= v4[2] * tmp0;
2435:       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2436:       sum3   -= v3[0] * tmp0;
2437:       sum4   -= v4[1] * tmp0;
2438:       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2439:       sum4   -= v4[0] * tmp0;
2440:       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2441:       break;
2442:     case 5 :
2443:       sum1 = tmp[row];
2444:       sum2 = tmp[row -1];
2445:       sum3 = tmp[row -2];
2446:       sum4 = tmp[row -3];
2447:       sum5 = tmp[row -4];
2448:       v2   = aa + ad[row]+1;
2449:       v3   = aa + ad[row -1]+1;
2450:       v4   = aa + ad[row -2]+1;
2451:       v5   = aa + ad[row -3]+1;
2452:       for (j=0 ; j<nz-1; j+=2){
2453:         i0   = vi[j];
2454:         i1   = vi[j+1];
2455:         tmp0 = tmps[i0];
2456:         tmp1 = tmps[i1];
2457:         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458:         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2459:         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2460:         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2461:         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2462:       }
2463:       if (j==nz-1){
2464:         tmp0  = tmps[vi[j]];
2465:         sum1 -= v1[j] * tmp0;
2466:         sum2 -= v2[j+1] * tmp0;
2467:         sum3 -= v3[j+2] * tmp0;
2468:         sum4 -= v4[j+3] * tmp0;
2469:         sum5 -= v5[j+4] * tmp0;
2470:       }

      /* finish the rows bottom-up; each finished value is eliminated from the rows above it */
2472:       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2473:       sum2   -= v2[0] * tmp0;
2474:       sum3   -= v3[1] * tmp0;
2475:       sum4   -= v4[2] * tmp0;
2476:       sum5   -= v5[3] * tmp0;
2477:       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2478:       sum3   -= v3[0] * tmp0;
2479:       sum4   -= v4[1] * tmp0;
2480:       sum5   -= v5[2] * tmp0;
2481:       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2482:       sum4   -= v4[0] * tmp0;
2483:       sum5   -= v5[1] * tmp0;
2484:       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2485:       sum5   -= v5[0] * tmp0;
2486:       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2487:       break;
2488:     default:
2489:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2490:     }
2491:   }
  /* release the permutation indices and vector arrays acquired at the top */
2492:   ISRestoreIndices(isrow,&rout);
2493:   ISRestoreIndices(iscol,&cout);
2494:   VecRestoreArray(bb,(PetscScalar**)&b);
2495:   VecRestoreArray(xx,&x);
  /* flop estimate logged for both sweeps together: 2*nz - (number of columns) */
2496:   PetscLogFlops(2.0*a->nz - A->cmap->n);
2497:   return(0);
2498: }


2501: /*
2502:      Makes a longer coloring[] array and calls the usual code with that
2503: */
2506: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2507: {
2508:   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2509:   PetscErrorCode  ierr;
2510:   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2511:   PetscInt        *colorused,i;
2512:   ISColoringValue *newcolor;

2515:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
2516:   /* loop over inodes, marking a color for each column*/
2517:   row = 0;
2518:   for (i=0; i<m; i++){
2519:     for (j=0; j<ns[i]; j++) {
2520:       newcolor[row++] = coloring[i] + j*ncolors;
2521:     }
2522:   }

2524:   /* eliminate unneeded colors */
2525:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
2526:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
2527:   for (i=0; i<n; i++) {
2528:     colorused[newcolor[i]] = 1;
2529:   }

2531:   for (i=1; i<5*ncolors; i++) {
2532:     colorused[i] += colorused[i-1];
2533:   }
2534:   ncolors = colorused[5*ncolors-1];
2535:   for (i=0; i<n; i++) {
2536:     newcolor[i] = colorused[newcolor[i]]-1;
2537:   }
2538:   PetscFree(colorused);
2539:   ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
2540:   PetscFree(coloring);
2541:   return(0);
2542: }

2544:  #include ../src/mat/blockinvert.h

2548: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2549: {
2550:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2551:   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2552:   MatScalar          *ibdiag,*bdiag,work[25];
2553:   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2554:   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2555:   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2556:   PetscErrorCode     ierr;
2557:   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2558:   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];

2561:   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2562:   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2563:   if (its > 1) {
2564:     /* switch to non-inode version */
2565:     MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);
2566:     return(0);
2567:   }

2569:   if (!a->inode.ibdiagvalid) {
2570:     if (!a->inode.ibdiag) {
2571:       /* calculate space needed for diagonal blocks */
2572:       for (i=0; i<m; i++) {
2573:         cnt += sizes[i]*sizes[i];
2574:       }
2575:       a->inode.bdiagsize = cnt;
2576:       PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);
2577:     }

2579:     /* copy over the diagonal blocks and invert them */
2580:     ibdiag = a->inode.ibdiag;
2581:     bdiag  = a->inode.bdiag;
2582:     cnt = 0;
2583:     for (i=0, row = 0; i<m; i++) {
2584:       for (j=0; j<sizes[i]; j++) {
2585:         for (k=0; k<sizes[i]; k++) {
2586:           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2587:         }
2588:       }
2589:       PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2590: 
2591:       switch(sizes[i]) {
2592:         case 1:
2593:           /* Create matrix data structure */
2594:           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2595:           ibdiag[cnt] = 1.0/ibdiag[cnt];
2596:           break;
2597:         case 2:
2598:           Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2599:           break;
2600:         case 3:
2601:           Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2602:           break;
2603:         case 4:
2604:           Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2605:           break;
2606:         case 5:
2607:           Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2608:           break;
2609:        default:
2610:          SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2611:       }
2612:       cnt += sizes[i]*sizes[i];
2613:       row += sizes[i];
2614:     }
2615:     a->inode.ibdiagvalid = PETSC_TRUE;
2616:   }
2617:   ibdiag = a->inode.ibdiag;
2618:   bdiag  = a->inode.bdiag;


2621:   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2622:   if (flag & SOR_ZERO_INITIAL_GUESS) {
2623:     PetscScalar *b;
2624:     VecGetArray(xx,&x);
2625:     if (xx != bb) {
2626:       VecGetArray(bb,(PetscScalar**)&b);
2627:     } else {
2628:       b = x;
2629:     }
2630:     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){

2632:       for (i=0, row=0; i<m; i++) {
2633:         sz  = diag[row] - ii[row];
2634:         v1  = a->a + ii[row];
2635:         idx = a->j + ii[row];

2637:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2638:         switch (sizes[i]){
2639:           case 1:
2640: 
2641:             sum1  = b[row];
2642:             for(n = 0; n<sz-1; n+=2) {
2643:               i1   = idx[0];
2644:               i2   = idx[1];
2645:               idx += 2;
2646:               tmp0 = x[i1];
2647:               tmp1 = x[i2];
2648:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2649:             }
2650: 
2651:             if (n == sz-1){
2652:               tmp0  = x[*idx];
2653:               sum1 -= *v1 * tmp0;
2654:             }
2655:             x[row++] = sum1*(*ibdiag++);
2656:             break;
2657:           case 2:
2658:             v2    = a->a + ii[row+1];
2659:             sum1  = b[row];
2660:             sum2  = b[row+1];
2661:             for(n = 0; n<sz-1; n+=2) {
2662:               i1   = idx[0];
2663:               i2   = idx[1];
2664:               idx += 2;
2665:               tmp0 = x[i1];
2666:               tmp1 = x[i2];
2667:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2668:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2669:             }
2670: 
2671:             if (n == sz-1){
2672:               tmp0  = x[*idx];
2673:               sum1 -= v1[0] * tmp0;
2674:               sum2 -= v2[0] * tmp0;
2675:             }
2676:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2677:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2678:             ibdiag  += 4;
2679:             break;
2680:           case 3:
2681:             v2    = a->a + ii[row+1];
2682:             v3    = a->a + ii[row+2];
2683:             sum1  = b[row];
2684:             sum2  = b[row+1];
2685:             sum3  = b[row+2];
2686:             for(n = 0; n<sz-1; n+=2) {
2687:               i1   = idx[0];
2688:               i2   = idx[1];
2689:               idx += 2;
2690:               tmp0 = x[i1];
2691:               tmp1 = x[i2];
2692:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2693:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2694:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2695:             }
2696: 
2697:             if (n == sz-1){
2698:               tmp0  = x[*idx];
2699:               sum1 -= v1[0] * tmp0;
2700:               sum2 -= v2[0] * tmp0;
2701:               sum3 -= v3[0] * tmp0;
2702:             }
2703:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2704:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2705:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2706:             ibdiag  += 9;
2707:             break;
2708:           case 4:
2709:             v2    = a->a + ii[row+1];
2710:             v3    = a->a + ii[row+2];
2711:             v4    = a->a + ii[row+3];
2712:             sum1  = b[row];
2713:             sum2  = b[row+1];
2714:             sum3  = b[row+2];
2715:             sum4  = b[row+3];
2716:             for(n = 0; n<sz-1; n+=2) {
2717:               i1   = idx[0];
2718:               i2   = idx[1];
2719:               idx += 2;
2720:               tmp0 = x[i1];
2721:               tmp1 = x[i2];
2722:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2723:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2724:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2725:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2726:             }
2727: 
2728:             if (n == sz-1){
2729:               tmp0  = x[*idx];
2730:               sum1 -= v1[0] * tmp0;
2731:               sum2 -= v2[0] * tmp0;
2732:               sum3 -= v3[0] * tmp0;
2733:               sum4 -= v4[0] * tmp0;
2734:             }
2735:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2736:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2737:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2738:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2739:             ibdiag  += 16;
2740:             break;
2741:           case 5:
2742:             v2    = a->a + ii[row+1];
2743:             v3    = a->a + ii[row+2];
2744:             v4    = a->a + ii[row+3];
2745:             v5    = a->a + ii[row+4];
2746:             sum1  = b[row];
2747:             sum2  = b[row+1];
2748:             sum3  = b[row+2];
2749:             sum4  = b[row+3];
2750:             sum5  = b[row+4];
2751:             for(n = 0; n<sz-1; n+=2) {
2752:               i1   = idx[0];
2753:               i2   = idx[1];
2754:               idx += 2;
2755:               tmp0 = x[i1];
2756:               tmp1 = x[i2];
2757:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2758:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2759:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2760:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2761:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2762:             }
2763: 
2764:             if (n == sz-1){
2765:               tmp0  = x[*idx];
2766:               sum1 -= v1[0] * tmp0;
2767:               sum2 -= v2[0] * tmp0;
2768:               sum3 -= v3[0] * tmp0;
2769:               sum4 -= v4[0] * tmp0;
2770:               sum5 -= v5[0] * tmp0;
2771:             }
2772:             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2773:             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2774:             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2775:             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2776:             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2777:             ibdiag  += 25;
2778:             break;
2779:           default:
2780:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2781:         }
2782:       }

2784:       xb = x;
2785:       PetscLogFlops(a->nz);
2786:     } else xb = b;
2787:     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2788:         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2789:       cnt = 0;
2790:       for (i=0, row=0; i<m; i++) {

2792:         switch (sizes[i]){
2793:           case 1:
2794:             x[row++] *= bdiag[cnt++];
2795:             break;
2796:           case 2:
2797:             x1   = x[row]; x2 = x[row+1];
2798:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2799:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2800:             x[row++] = tmp1;
2801:             x[row++] = tmp2;
2802:             cnt += 4;
2803:             break;
2804:           case 3:
2805:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2806:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2807:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2808:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2809:             x[row++] = tmp1;
2810:             x[row++] = tmp2;
2811:             x[row++] = tmp3;
2812:             cnt += 9;
2813:             break;
2814:           case 4:
2815:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2816:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2817:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2818:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2819:             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2820:             x[row++] = tmp1;
2821:             x[row++] = tmp2;
2822:             x[row++] = tmp3;
2823:             x[row++] = tmp4;
2824:             cnt += 16;
2825:             break;
2826:           case 5:
2827:             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2828:             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2829:             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2830:             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2831:             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2832:             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2833:             x[row++] = tmp1;
2834:             x[row++] = tmp2;
2835:             x[row++] = tmp3;
2836:             x[row++] = tmp4;
2837:             x[row++] = tmp5;
2838:             cnt += 25;
2839:             break;
2840:           default:
2841:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2842:         }
2843:       }
2844:       PetscLogFlops(m);
2845:     }
2846:     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){

2848:       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2849:       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2850:         ibdiag -= sizes[i]*sizes[i];
2851:         sz      = ii[row+1] - diag[row] - 1;
2852:         v1      = a->a + diag[row] + 1;
2853:         idx     = a->j + diag[row] + 1;

2855:         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2856:         switch (sizes[i]){
2857:           case 1:
2858: 
2859:             sum1  = xb[row];
2860:             for(n = 0; n<sz-1; n+=2) {
2861:               i1   = idx[0];
2862:               i2   = idx[1];
2863:               idx += 2;
2864:               tmp0 = x[i1];
2865:               tmp1 = x[i2];
2866:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2867:             }
2868: 
2869:             if (n == sz-1){
2870:               tmp0  = x[*idx];
2871:               sum1 -= *v1*tmp0;
2872:             }
2873:             x[row--] = sum1*(*ibdiag);
2874:             break;

2876:           case 2:
2877: 
2878:             sum1  = xb[row];
2879:             sum2  = xb[row-1];
2880:             /* note that sum1 is associated with the second of the two rows */
2881:             v2    = a->a + diag[row-1] + 2;
2882:             for(n = 0; n<sz-1; n+=2) {
2883:               i1   = idx[0];
2884:               i2   = idx[1];
2885:               idx += 2;
2886:               tmp0 = x[i1];
2887:               tmp1 = x[i2];
2888:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2889:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2890:             }
2891: 
2892:             if (n == sz-1){
2893:               tmp0  = x[*idx];
2894:               sum1 -= *v1*tmp0;
2895:               sum2 -= *v2*tmp0;
2896:             }
2897:             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2898:             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2899:             break;
2900:           case 3:
2901: 
2902:             sum1  = xb[row];
2903:             sum2  = xb[row-1];
2904:             sum3  = xb[row-2];
2905:             v2    = a->a + diag[row-1] + 2;
2906:             v3    = a->a + diag[row-2] + 3;
2907:             for(n = 0; n<sz-1; n+=2) {
2908:               i1   = idx[0];
2909:               i2   = idx[1];
2910:               idx += 2;
2911:               tmp0 = x[i1];
2912:               tmp1 = x[i2];
2913:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2914:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2915:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2916:             }
2917: 
2918:             if (n == sz-1){
2919:               tmp0  = x[*idx];
2920:               sum1 -= *v1*tmp0;
2921:               sum2 -= *v2*tmp0;
2922:               sum3 -= *v3*tmp0;
2923:             }
2924:             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2925:             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2926:             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2927:             break;
2928:           case 4:
2929: 
2930:             sum1  = xb[row];
2931:             sum2  = xb[row-1];
2932:             sum3  = xb[row-2];
2933:             sum4  = xb[row-3];
2934:             v2    = a->a + diag[row-1] + 2;
2935:             v3    = a->a + diag[row-2] + 3;
2936:             v4    = a->a + diag[row-3] + 4;
2937:             for(n = 0; n<sz-1; n+=2) {
2938:               i1   = idx[0];
2939:               i2   = idx[1];
2940:               idx += 2;
2941:               tmp0 = x[i1];
2942:               tmp1 = x[i2];
2943:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2944:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2945:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2946:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2947:             }
2948: 
2949:             if (n == sz-1){
2950:               tmp0  = x[*idx];
2951:               sum1 -= *v1*tmp0;
2952:               sum2 -= *v2*tmp0;
2953:               sum3 -= *v3*tmp0;
2954:               sum4 -= *v4*tmp0;
2955:             }
2956:             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2957:             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2958:             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2959:             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2960:             break;
2961:           case 5:
2962: 
2963:             sum1  = xb[row];
2964:             sum2  = xb[row-1];
2965:             sum3  = xb[row-2];
2966:             sum4  = xb[row-3];
2967:             sum5  = xb[row-4];
2968:             v2    = a->a + diag[row-1] + 2;
2969:             v3    = a->a + diag[row-2] + 3;
2970:             v4    = a->a + diag[row-3] + 4;
2971:             v5    = a->a + diag[row-4] + 5;
2972:             for(n = 0; n<sz-1; n+=2) {
2973:               i1   = idx[0];
2974:               i2   = idx[1];
2975:               idx += 2;
2976:               tmp0 = x[i1];
2977:               tmp1 = x[i2];
2978:               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979:               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980:               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981:               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982:               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983:             }
2984: 
2985:             if (n == sz-1){
2986:               tmp0  = x[*idx];
2987:               sum1 -= *v1*tmp0;
2988:               sum2 -= *v2*tmp0;
2989:               sum3 -= *v3*tmp0;
2990:               sum4 -= *v4*tmp0;
2991:               sum5 -= *v5*tmp0;
2992:             }
2993:             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2994:             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2995:             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2996:             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2997:             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2998:             break;
2999:           default:
3000:                SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3001:         }
3002:       }

3004:       PetscLogFlops(a->nz);
3005:     }
3006:     its--;
3007:     VecRestoreArray(xx,&x);
3008:     if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
3009:   }
3010:   if (flag & SOR_EISENSTAT) {
3011:     const PetscScalar *b;
3012:     MatScalar         *t = a->inode.ssor_work;

3014:     VecGetArray(xx,&x);
3015:     VecGetArray(bb,(PetscScalar**)&b);
3016:     /*
3017:           Apply  (U + D)^-1  where D is now the block diagonal 
3018:     */
3019:     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3020:     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3021:       ibdiag -= sizes[i]*sizes[i];
3022:       sz      = ii[row+1] - diag[row] - 1;
3023:       v1      = a->a + diag[row] + 1;
3024:       idx     = a->j + diag[row] + 1;
3025:       CHKMEMQ;
3026:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3027:       switch (sizes[i]){
3028:         case 1:
3029: 
3030:           sum1  = b[row];
3031:           for(n = 0; n<sz-1; n+=2) {
3032:             i1   = idx[0];
3033:             i2   = idx[1];
3034:             idx += 2;
3035:             tmp0 = x[i1];
3036:             tmp1 = x[i2];
3037:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3038:           }
3039: 
3040:           if (n == sz-1){
3041:             tmp0  = x[*idx];
3042:             sum1 -= *v1*tmp0;
3043:           }
3044:           x[row] = sum1*(*ibdiag);row--;
3045:           break;

3047:         case 2:
3048: 
3049:           sum1  = b[row];
3050:           sum2  = b[row-1];
3051:           /* note that sum1 is associated with the second of the two rows */
3052:           v2    = a->a + diag[row-1] + 2;
3053:           for(n = 0; n<sz-1; n+=2) {
3054:             i1   = idx[0];
3055:             i2   = idx[1];
3056:             idx += 2;
3057:             tmp0 = x[i1];
3058:             tmp1 = x[i2];
3059:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3060:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3061:           }
3062: 
3063:           if (n == sz-1){
3064:             tmp0  = x[*idx];
3065:             sum1 -= *v1*tmp0;
3066:             sum2 -= *v2*tmp0;
3067:           }
3068:           x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3069:           x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3070:           row -= 2;
3071:           break;
3072:         case 3:
3073: 
3074:           sum1  = b[row];
3075:           sum2  = b[row-1];
3076:           sum3  = b[row-2];
3077:           v2    = a->a + diag[row-1] + 2;
3078:           v3    = a->a + diag[row-2] + 3;
3079:           for(n = 0; n<sz-1; n+=2) {
3080:             i1   = idx[0];
3081:             i2   = idx[1];
3082:             idx += 2;
3083:             tmp0 = x[i1];
3084:             tmp1 = x[i2];
3085:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3086:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3087:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3088:           }
3089: 
3090:           if (n == sz-1){
3091:             tmp0  = x[*idx];
3092:             sum1 -= *v1*tmp0;
3093:             sum2 -= *v2*tmp0;
3094:             sum3 -= *v3*tmp0;
3095:           }
3096:           x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3097:           x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3098:           x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3099:           row -= 3;
3100:           break;
3101:         case 4:
3102: 
3103:           sum1  = b[row];
3104:           sum2  = b[row-1];
3105:           sum3  = b[row-2];
3106:           sum4  = b[row-3];
3107:           v2    = a->a + diag[row-1] + 2;
3108:           v3    = a->a + diag[row-2] + 3;
3109:           v4    = a->a + diag[row-3] + 4;
3110:           for(n = 0; n<sz-1; n+=2) {
3111:             i1   = idx[0];
3112:             i2   = idx[1];
3113:             idx += 2;
3114:             tmp0 = x[i1];
3115:             tmp1 = x[i2];
3116:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3117:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3118:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3119:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3120:           }
3121: 
3122:           if (n == sz-1){
3123:             tmp0  = x[*idx];
3124:             sum1 -= *v1*tmp0;
3125:             sum2 -= *v2*tmp0;
3126:             sum3 -= *v3*tmp0;
3127:             sum4 -= *v4*tmp0;
3128:           }
3129:           x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3130:           x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3131:           x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3132:           x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3133:           row -= 4;
3134:           break;
3135:         case 5:
3136: 
3137:           sum1  = b[row];
3138:           sum2  = b[row-1];
3139:           sum3  = b[row-2];
3140:           sum4  = b[row-3];
3141:           sum5  = b[row-4];
3142:           v2    = a->a + diag[row-1] + 2;
3143:           v3    = a->a + diag[row-2] + 3;
3144:           v4    = a->a + diag[row-3] + 4;
3145:           v5    = a->a + diag[row-4] + 5;
3146:           for(n = 0; n<sz-1; n+=2) {
3147:             i1   = idx[0];
3148:             i2   = idx[1];
3149:             idx += 2;
3150:             tmp0 = x[i1];
3151:             tmp1 = x[i2];
3152:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3153:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3154:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3155:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3156:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3157:           }
3158: 
3159:           if (n == sz-1){
3160:             tmp0  = x[*idx];
3161:             sum1 -= *v1*tmp0;
3162:             sum2 -= *v2*tmp0;
3163:             sum3 -= *v3*tmp0;
3164:             sum4 -= *v4*tmp0;
3165:             sum5 -= *v5*tmp0;
3166:           }
3167:           x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3168:           x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3169:           x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3170:           x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3171:           x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3172:           row -= 5;
3173:           break;
3174:         default:
3175:           SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3176:       }
3177:       CHKMEMQ;
3178:     }
3179:     PetscLogFlops(a->nz);

3181:     /*
3182:            t = b - D x    where D is the block diagonal
3183:     */
3184:     cnt = 0;
3185:     for (i=0, row=0; i<m; i++) {
3186:       CHKMEMQ;
3187:       switch (sizes[i]){
3188:         case 1:
3189:           t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3190:           break;
3191:         case 2:
3192:           x1   = x[row]; x2 = x[row+1];
3193:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3194:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3195:           t[row]   = b[row] - tmp1;
3196:           t[row+1] = b[row+1] - tmp2; row += 2;
3197:           cnt += 4;
3198:           break;
3199:         case 3:
3200:           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3201:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3202:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3203:           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3204:           t[row] = b[row] - tmp1;
3205:           t[row+1] = b[row+1] - tmp2;
3206:           t[row+2] = b[row+2] - tmp3; row += 3;
3207:           cnt += 9;
3208:           break;
3209:         case 4:
3210:           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3211:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3212:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3213:           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3214:           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3215:           t[row] = b[row] - tmp1;
3216:           t[row+1] = b[row+1] - tmp2;
3217:           t[row+2] = b[row+2] - tmp3;
3218:           t[row+3] = b[row+3] - tmp4; row += 4;
3219:           cnt += 16;
3220:           break;
3221:         case 5:
3222:           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3223:           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3224:           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3225:           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3226:           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3227:           tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3228:           t[row] = b[row] - tmp1;
3229:           t[row+1] = b[row+1] - tmp2;
3230:           t[row+2] = b[row+2] - tmp3;
3231:           t[row+3] = b[row+3] - tmp4;
3232:           t[row+4] = b[row+4] - tmp5;row += 5;
3233:           cnt += 25;
3234:           break;
3235:         default:
3236:           SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3237:       }
3238:       CHKMEMQ;
3239:     }
3240:     PetscLogFlops(m);



3244:     /*
3245:           Apply (L + D)^-1 where D is the block diagonal
3246:     */
3247:     for (i=0, row=0; i<m; i++) {
3248:       sz  = diag[row] - ii[row];
3249:       v1  = a->a + ii[row];
3250:       idx = a->j + ii[row];
3251:       CHKMEMQ;
3252:       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3253:       switch (sizes[i]){
3254:         case 1:
3255: 
3256:           sum1  = t[row];
3257:           for(n = 0; n<sz-1; n+=2) {
3258:             i1   = idx[0];
3259:             i2   = idx[1];
3260:             idx += 2;
3261:             tmp0 = t[i1];
3262:             tmp1 = t[i2];
3263:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3264:           }
3265: 
3266:           if (n == sz-1){
3267:             tmp0  = t[*idx];
3268:             sum1 -= *v1 * tmp0;
3269:           }
3270:           x[row] += t[row] = sum1*(*ibdiag++); row++;
3271:           break;
3272:         case 2:
3273:           v2    = a->a + ii[row+1];
3274:           sum1  = t[row];
3275:           sum2  = t[row+1];
3276:           for(n = 0; n<sz-1; n+=2) {
3277:             i1   = idx[0];
3278:             i2   = idx[1];
3279:             idx += 2;
3280:             tmp0 = t[i1];
3281:             tmp1 = t[i2];
3282:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3283:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3284:           }
3285: 
3286:           if (n == sz-1){
3287:             tmp0  = t[*idx];
3288:             sum1 -= v1[0] * tmp0;
3289:             sum2 -= v2[0] * tmp0;
3290:           }
3291:           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3292:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3293:           ibdiag  += 4; row += 2;
3294:           break;
3295:         case 3:
3296:           v2    = a->a + ii[row+1];
3297:           v3    = a->a + ii[row+2];
3298:           sum1  = t[row];
3299:           sum2  = t[row+1];
3300:           sum3  = t[row+2];
3301:           for(n = 0; n<sz-1; n+=2) {
3302:             i1   = idx[0];
3303:             i2   = idx[1];
3304:             idx += 2;
3305:             tmp0 = t[i1];
3306:             tmp1 = t[i2];
3307:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3308:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3309:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3310:           }
3311: 
3312:           if (n == sz-1){
3313:             tmp0  = t[*idx];
3314:             sum1 -= v1[0] * tmp0;
3315:             sum2 -= v2[0] * tmp0;
3316:             sum3 -= v3[0] * tmp0;
3317:           }
3318:           x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320:           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321:           ibdiag  += 9; row += 3;
3322:           break;
3323:         case 4:
3324:           v2    = a->a + ii[row+1];
3325:           v3    = a->a + ii[row+2];
3326:           v4    = a->a + ii[row+3];
3327:           sum1  = t[row];
3328:           sum2  = t[row+1];
3329:           sum3  = t[row+2];
3330:           sum4  = t[row+3];
3331:           for(n = 0; n<sz-1; n+=2) {
3332:             i1   = idx[0];
3333:             i2   = idx[1];
3334:             idx += 2;
3335:             tmp0 = t[i1];
3336:             tmp1 = t[i2];
3337:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3338:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3339:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3340:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3341:           }
3342: 
3343:           if (n == sz-1){
3344:             tmp0  = t[*idx];
3345:             sum1 -= v1[0] * tmp0;
3346:             sum2 -= v2[0] * tmp0;
3347:             sum3 -= v3[0] * tmp0;
3348:             sum4 -= v4[0] * tmp0;
3349:           }
3350:           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3351:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3352:           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3353:           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3354:           ibdiag  += 16; row += 4;
3355:           break;
3356:         case 5:
3357:           v2    = a->a + ii[row+1];
3358:           v3    = a->a + ii[row+2];
3359:           v4    = a->a + ii[row+3];
3360:           v5    = a->a + ii[row+4];
3361:           sum1  = t[row];
3362:           sum2  = t[row+1];
3363:           sum3  = t[row+2];
3364:           sum4  = t[row+3];
3365:           sum5  = t[row+4];
3366:           for(n = 0; n<sz-1; n+=2) {
3367:             i1   = idx[0];
3368:             i2   = idx[1];
3369:             idx += 2;
3370:             tmp0 = t[i1];
3371:             tmp1 = t[i2];
3372:             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3373:             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3374:             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3375:             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3376:             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3377:           }
3378: 
3379:           if (n == sz-1){
3380:             tmp0  = t[*idx];
3381:             sum1 -= v1[0] * tmp0;
3382:             sum2 -= v2[0] * tmp0;
3383:             sum3 -= v3[0] * tmp0;
3384:             sum4 -= v4[0] * tmp0;
3385:             sum5 -= v5[0] * tmp0;
3386:           }
3387:           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3388:           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3389:           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3390:           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3391:           x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3392:           ibdiag  += 25; row += 5;
3393:           break;
3394:         default:
3395:           SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3396:       }
3397:       CHKMEMQ;
3398:     }
3399:     PetscLogFlops(a->nz);
3400:     VecRestoreArray(xx,&x);
3401:     VecRestoreArray(bb,(PetscScalar**)&b);
3402:   }
3403:   return(0);
3404: }

3408: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3409: {
3410:   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3411:   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3412:   const MatScalar    *bdiag = a->inode.bdiag;
3413:   const PetscScalar  *b;
3414:   PetscErrorCode      ierr;
3415:   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3416:   const PetscInt      *sizes = a->inode.size;

3419:   VecGetArray(xx,&x);
3420:   VecGetArray(bb,(PetscScalar**)&b);
3421:   cnt = 0;
3422:   for (i=0, row=0; i<m; i++) {
3423:     switch (sizes[i]){
3424:       case 1:
3425:         x[row] = b[row]*bdiag[cnt++];row++;
3426:         break;
3427:       case 2:
3428:         x1   = b[row]; x2 = b[row+1];
3429:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3430:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3431:         x[row++] = tmp1;
3432:         x[row++] = tmp2;
3433:         cnt += 4;
3434:         break;
3435:       case 3:
3436:         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3437:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3438:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3439:         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3440:         x[row++] = tmp1;
3441:         x[row++] = tmp2;
3442:         x[row++] = tmp3;
3443:         cnt += 9;
3444:         break;
3445:       case 4:
3446:         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3447:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3448:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3449:         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3450:         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3451:         x[row++] = tmp1;
3452:         x[row++] = tmp2;
3453:         x[row++] = tmp3;
3454:         x[row++] = tmp4;
3455:         cnt += 16;
3456:         break;
3457:       case 5:
3458:         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3459:         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3460:         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3461:         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3462:         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3463:         tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3464:         x[row++] = tmp1;
3465:         x[row++] = tmp2;
3466:         x[row++] = tmp3;
3467:         x[row++] = tmp4;
3468:         x[row++] = tmp5;
3469:         cnt += 25;
3470:         break;
3471:       default:
3472:         SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3473:     }
3474:   }
3475:   PetscLogFlops(2*cnt);
3476:   VecRestoreArray(xx,&x);
3477:   VecRestoreArray(bb,(PetscScalar**)&b);
3478:   return(0);
3479: }

3481: /*
3482:     samestructure indicates that the matrix has not changed its nonzero structure so we 
3483:     do not need to recompute the inodes 
3484: */
3487: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3488: {
3489:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3491:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3492:   PetscTruth     flag;

3495:   if (!a->inode.use)                     return(0);
3496:   if (a->inode.checked && samestructure) return(0);


3499:   m = A->rmap->n;
3500:   if (a->inode.size) {ns = a->inode.size;}
3501:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

3503:   i          = 0;
3504:   node_count = 0;
3505:   idx        = a->j;
3506:   ii         = a->i;
3507:   while (i < m){                /* For each row */
3508:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3509:     /* Limits the number of elements in a node to 'a->inode.limit' */
3510:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3511:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3512:       if (nzy != nzx) break;
3513:       idy  += nzx;             /* Same nonzero pattern */
3514:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
3515:       if (!flag) break;
3516:     }
3517:     ns[node_count++] = blk_size;
3518:     idx += blk_size*nzx;
3519:     i    = j;
3520:   }
3521:   /* If not enough inodes found,, do not use inode version of the routines */
3522:   if (!a->inode.size && m && node_count > .9*m) {
3523:     PetscFree(ns);
3524:     a->inode.node_count     = 0;
3525:     a->inode.size           = PETSC_NULL;
3526:     a->inode.use            = PETSC_FALSE;
3527:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
3528:   } else {
3529:     if (!A->factor) {
3530:       A->ops->mult              = MatMult_SeqAIJ_Inode;
3531:       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3532:       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3533:       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3534:       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3535:       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3536:       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3537:       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3538:       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3539:     } else {
3540:       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3541:     }
3542:     a->inode.node_count       = node_count;
3543:     a->inode.size             = ns;
3544:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
3545:   }
3546:   return(0);
3547: }

/*
   MatGetRow_FactoredLU - Gathers the column indices of one row of a factored matrix
   into cols[]:
     - the nzl entries starting at aj[ai[row]],
     - then the single entry at aj[adiag[row]],
     - then the nzu entries starting at aj[adiag[row+1]+1].

   cols must have room for nzl+1+nzu entries. The nz argument is accepted but not used.
   Every argument is parenthesized in the expansion so expressions may be passed, but
   arguments are evaluated more than once and therefore must not have side effects.
*/
#define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {        \
PetscInt getrow_k_, *getrow_vi_; \
getrow_vi_ = (aj) + (ai)[(row)];                                \
for(getrow_k_=0;getrow_k_<(nzl);getrow_k_++) (cols)[getrow_k_] = getrow_vi_[getrow_k_]; \
getrow_vi_ = (aj) + (adiag)[(row)];                                \
(cols)[(nzl)] = getrow_vi_[0];\
getrow_vi_ = (aj) + (adiag)[(row)+1]+1;\
for(getrow_k_=0;getrow_k_<(nzu);getrow_k_++) (cols)[(nzl)+1+getrow_k_] = getrow_vi_[getrow_k_];}


3559: /*
3560:    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3561:    Modified from Mat_CheckInode().

3563:    Input Parameters:
3564: +  Mat A - ILU or LU matrix factor
3565: -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 
3566:     do not need to recompute the inodes 
3567: */
3570: PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3571: {
3572:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3574:   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3575:   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3576:   PetscTruth     flag;

3579:   if (!a->inode.use)                     return(0);
3580:   if (a->inode.checked && samestructure) return(0);

3582:   m = A->rmap->n;
3583:   if (a->inode.size) {ns = a->inode.size;}
3584:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

3586:   i          = 0;
3587:   node_count = 0;
3588:   while (i < m){                /* For each row */
3589:     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3590:     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3591:     nzx  = nzl1 + nzu1 + 1;
3592:     PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);
3593:     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3594: 
3595:     /* Limits the number of elements in a node to 'a->inode.limit' */
3596:     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3597:       nzl2    = ai[j+1] - ai[j];
3598:       nzu2    = adiag[j] - adiag[j+1] - 1;
3599:       nzy     = nzl2 + nzu2 + 1;
3600:       if( nzy != nzx) break;
3601:       PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);
3602:       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3603:       PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
3604:       if (!flag) {PetscFree(cols2);break;}
3605:       PetscFree(cols2);
3606:     }
3607:     ns[node_count++] = blk_size;
3608:     PetscFree(cols1);
3609:     i    = j;
3610:   }
3611:   /* If not enough inodes found,, do not use inode version of the routines */
3612:   if (!a->inode.size && m && node_count > .9*m) {
3613:     PetscFree(ns);
3614:     a->inode.node_count     = 0;
3615:     a->inode.size           = PETSC_NULL;
3616:     a->inode.use            = PETSC_FALSE;
3617:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
3618:   } else {
3619:     A->ops->mult              = 0;
3620:     A->ops->sor               = 0;
3621:     A->ops->multadd           = 0;
3622:     A->ops->getrowij          = 0;
3623:     A->ops->restorerowij      = 0;
3624:     A->ops->getcolumnij       = 0;
3625:     A->ops->restorecolumnij   = 0;
3626:     A->ops->coloringpatch     = 0;
3627:     A->ops->multdiagonalblock = 0;
3628:     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3629:     a->inode.node_count       = node_count;
3630:     a->inode.size             = ns;
3631:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
3632:   }
3633:   return(0);
3634: }

3636: /*
3637:      This is really ugly. if inodes are used this replaces the 
3638:   permutations with ones that correspond to rows/cols of the matrix
3639:   rather then inode blocks
3640: */
3643: PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3644: {
3645:   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);

3648:   PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
3649:   if (f) {
3650:     (*f)(A,rperm,cperm);
3651:   }
3652:   return(0);
3653: }

3658: PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3659: {
3660:   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3662:   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3663:   const PetscInt *ridx,*cidx;
3664:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3665:   PetscInt       nslim_col,*ns_col;
3666:   IS             ris = *rperm,cis = *cperm;

3669:   if (!a->inode.size) return(0); /* no inodes so return */
3670:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

3672:   Mat_CreateColInode(A,&nslim_col,&ns_col);
3673:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
3674:   PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);

3676:   ISGetIndices(ris,&ridx);
3677:   ISGetIndices(cis,&cidx);

3679:   /* Form the inode structure for the rows of permuted matric using inv perm*/
3680:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

3682:   /* Construct the permutations for rows*/
3683:   for (i=0,row = 0; i<nslim_row; ++i){
3684:     indx      = ridx[i];
3685:     start_val = tns[indx];
3686:     end_val   = tns[indx + 1];
3687:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3688:   }

3690:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3691:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

3693:  /* Construct permutations for columns */
3694:   for (i=0,col=0; i<nslim_col; ++i){
3695:     indx      = cidx[i];
3696:     start_val = tns[indx];
3697:     end_val   = tns[indx + 1];
3698:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3699:   }

3701:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
3702:   ISSetPermutation(*rperm);
3703:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
3704:   ISSetPermutation(*cperm);
3705: 
3706:   ISRestoreIndices(ris,&ridx);
3707:   ISRestoreIndices(cis,&cidx);

3709:   PetscFree(ns_col);
3710:   PetscFree2(permr,permc);
3711:   ISDestroy(cis);
3712:   ISDestroy(ris);
3713:   PetscFree(tns);
3714:   return(0);
3715: }

3720: /*@C
3721:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

3723:    Collective on Mat

3725:    Input Parameter:
3726: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

3728:    Output Parameter:
3729: +  node_count - no of inodes present in the matrix.
3730: .  sizes      - an array of size node_count,with sizes of each inode.
3731: -  limit      - the max size used to generate the inodes.

3733:    Level: advanced

3735:    Notes: This routine returns some internal storage information
3736:    of the matrix, it is intended to be used by advanced users.
3737:    It should be called after the matrix is assembled.
3738:    The contents of the sizes[] array should not be changed.
3739:    PETSC_NULL may be passed for information not requested.

3741: .keywords: matrix, seqaij, get, inode

3743: .seealso: MatGetInfo()
3744: @*/
3745: PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3746: {
3747:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

3750:   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3751:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
3752:   if (f) {
3753:     (*f)(A,node_count,sizes,limit);
3754:   }
3755:   return(0);
3756: }

3761: PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3762: {
3763:   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

3766:   if (node_count) *node_count = a->inode.node_count;
3767:   if (sizes)      *sizes      = a->inode.size;
3768:   if (limit)      *limit      = a->inode.limit;
3769:   return(0);
3770: }