Actual source code: baij2.c

  1: #define PETSCMAT_DLL

 3:  #include ../src/mat/impls/baij/seq/baij.h
 4:  #include ../src/mat/blockinvert.h
 5:  #include petscbt.h
 6:  #include petscblaslapack.h

 10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 11: {
 12:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 14:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 15:   const PetscInt *idx;
 16:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 17:   PetscBT        table;

 20:   m     = a->mbs;
 21:   ai    = a->i;
 22:   aj    = a->j;
 23:   bs    = A->rmap->bs;

 25:   if (ov < 0)  SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 27:   PetscBTCreate(m,table);
 28:   PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
 29:   PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);

 31:   for (i=0; i<is_max; i++) {
 32:     /* Initialise the two local arrays */
 33:     isz  = 0;
 34:     PetscBTMemzero(m,table);
 35: 
 36:     /* Extract the indices, assume there can be duplicate entries */
 37:     ISGetIndices(is[i],&idx);
 38:     ISGetLocalSize(is[i],&n);

 40:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 41:     for (j=0; j<n ; ++j){
 42:       ival = idx[j]/bs; /* convert the indices into block indices */
 43:       if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 44:       if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
 45:     }
 46:     ISRestoreIndices(is[i],&idx);
 47:     ISDestroy(is[i]);
 48: 
 49:     k = 0;
 50:     for (j=0; j<ov; j++){ /* for each overlap*/
 51:       n = isz;
 52:       for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
 53:         row   = nidx[k];
 54:         start = ai[row];
 55:         end   = ai[row+1];
 56:         for (l = start; l<end ; l++){
 57:           val = aj[l];
 58:           if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
 59:         }
 60:       }
 61:     }
 62:     /* expand the Index Set */
 63:     for (j=0; j<isz; j++) {
 64:       for (k=0; k<bs; k++)
 65:         nidx2[j*bs+k] = nidx[j]*bs+k;
 66:     }
 67:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
 68:   }
 69:   PetscBTDestroy(table);
 70:   PetscFree(nidx);
 71:   PetscFree(nidx2);
 72:   return(0);
 73: }

 77: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 78: {
 79:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 81:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 82:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 83:   const PetscInt *irow,*icol;
 84:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 85:   PetscInt       *aj = a->j,*ai = a->i;
 86:   MatScalar      *mat_a;
 87:   Mat            C;
 88:   PetscTruth     flag,sorted;

 91:   ISSorted(iscol,&sorted);
 92:   if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");

 94:   ISGetIndices(isrow,&irow);
 95:   ISGetIndices(iscol,&icol);
 96:   ISGetLocalSize(isrow,&nrows);
 97:   ISGetLocalSize(iscol,&ncols);

 99:   PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
100:   ssmap = smap;
101:   PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
102:   PetscMemzero(smap,oldcols*sizeof(PetscInt));
103:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104:   /* determine lens of each row */
105:   for (i=0; i<nrows; i++) {
106:     kstart  = ai[irow[i]];
107:     kend    = kstart + a->ilen[irow[i]];
108:     lens[i] = 0;
109:       for (k=kstart; k<kend; k++) {
110:         if (ssmap[aj[k]]) {
111:           lens[i]++;
112:         }
113:       }
114:     }
115:   /* Create and fill new matrix */
116:   if (scall == MAT_REUSE_MATRIX) {
117:     c = (Mat_SeqBAIJ *)((*B)->data);

119:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120:     PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
121:     if (!flag) {
122:       SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
123:     }
124:     PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
125:     C = *B;
126:   } else {
127:     MatCreate(((PetscObject)A)->comm,&C);
128:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
129:     MatSetType(C,((PetscObject)A)->type_name);
130:     MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
131:   }
132:   c = (Mat_SeqBAIJ *)(C->data);
133:   for (i=0; i<nrows; i++) {
134:     row    = irow[i];
135:     kstart = ai[row];
136:     kend   = kstart + a->ilen[row];
137:     mat_i  = c->i[i];
138:     mat_j  = c->j + mat_i;
139:     mat_a  = c->a + mat_i*bs2;
140:     mat_ilen = c->ilen + i;
141:     for (k=kstart; k<kend; k++) {
142:       if ((tcol=ssmap[a->j[k]])) {
143:         *mat_j++ = tcol - 1;
144:         PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
145:         mat_a   += bs2;
146:         (*mat_ilen)++;
147:       }
148:     }
149:   }
150: 
151:   /* Free work space */
152:   ISRestoreIndices(iscol,&icol);
153:   PetscFree(smap);
154:   PetscFree(lens);
155:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
157: 
158:   ISRestoreIndices(isrow,&irow);
159:   *B = C;
160:   return(0);
161: }

165: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
166: {
167:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
168:   IS             is1,is2;
170:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
171:   const PetscInt *irow,*icol;

174:   ISGetIndices(isrow,&irow);
175:   ISGetIndices(iscol,&icol);
176:   ISGetLocalSize(isrow,&nrows);
177:   ISGetLocalSize(iscol,&ncols);
178: 
179:   /* Verify if the indices corespond to each element in a block 
180:    and form the IS with compressed IS */
181:   PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
182:   PetscMemzero(vary,a->mbs*sizeof(PetscInt));
183:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
184:   count = 0;
185:   for (i=0; i<a->mbs; i++) {
186:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
187:     if (vary[i]==bs) iary[count++] = i;
188:   }
189:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
190: 
191:   PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
192:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
193:   count = 0;
194:   for (i=0; i<a->mbs; i++) {
195:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
196:     if (vary[i]==bs) iary[count++] = i;
197:   }
198:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
199:   ISRestoreIndices(isrow,&irow);
200:   ISRestoreIndices(iscol,&icol);
201:   PetscFree2(vary,iary);

203:   MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
204:   ISDestroy(is1);
205:   ISDestroy(is2);
206:   return(0);
207: }

211: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
212: {
214:   PetscInt       i;

217:   if (scall == MAT_INITIAL_MATRIX) {
218:     PetscMalloc((n+1)*sizeof(Mat),B);
219:   }

221:   for (i=0; i<n; i++) {
222:     MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
223:   }
224:   return(0);
225: }


228: /* -------------------------------------------------------*/
229: /* Should check that shapes of vectors and matrices match */
230: /* -------------------------------------------------------*/

234: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
235: {
236:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
237:   PetscScalar       *z,sum;
238:   const PetscScalar *x;
239:   const MatScalar   *v;
240:   PetscErrorCode    ierr;
241:   PetscInt          mbs,i,n,nonzerorow=0;
242:   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
243:   PetscTruth        usecprow=a->compressedrow.use;

246:   VecGetArray(xx,(PetscScalar**)&x);
247:   VecGetArray(zz,&z);

249:   if (usecprow){
250:     mbs  = a->compressedrow.nrows;
251:     ii   = a->compressedrow.i;
252:     ridx = a->compressedrow.rindex;
253:     PetscMemzero(z,mbs*sizeof(PetscScalar));
254:   } else {
255:     mbs = a->mbs;
256:     ii  = a->i;
257:   }

259:   for (i=0; i<mbs; i++) {
260:     n    = ii[1] - ii[0];
261:     v    = a->a + ii[0];
262:     idx  = a->j + ii[0];
263:     ii++;
264:     sum  = 0.0;
265:     PetscSparseDensePlusDot(sum,x,v,idx,n);
266:     if (usecprow){
267:       z[ridx[i]] = sum;
268:     } else {
269:       nonzerorow += (n>0);
270:       z[i] = sum;
271:     }
272:   }
273:   VecRestoreArray(xx,(PetscScalar**)&x);
274:   VecRestoreArray(zz,&z);
275:   PetscLogFlops(2.0*a->nz - nonzerorow);
276:   return(0);
277: }

281: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282: {
283:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
284:   PetscScalar       *z = 0,sum1,sum2,*zarray;
285:   const PetscScalar *x,*xb;
286:   PetscScalar       x1,x2;
287:   const MatScalar   *v;
288:   PetscErrorCode    ierr;
289:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
290:   PetscTruth        usecprow=a->compressedrow.use;

293:   VecGetArray(xx,(PetscScalar**)&x);
294:   VecGetArray(zz,&zarray);

296:   idx = a->j;
297:   v   = a->a;
298:   if (usecprow){
299:     mbs  = a->compressedrow.nrows;
300:     ii   = a->compressedrow.i;
301:     ridx = a->compressedrow.rindex;
302:   } else {
303:     mbs = a->mbs;
304:     ii  = a->i;
305:     z   = zarray;
306:   }

308:   for (i=0; i<mbs; i++) {
309:     n  = ii[1] - ii[0]; ii++;
310:     sum1 = 0.0; sum2 = 0.0;
311:     nonzerorow += (n>0);
312:     for (j=0; j<n; j++) {
313:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
314:       sum1 += v[0]*x1 + v[2]*x2;
315:       sum2 += v[1]*x1 + v[3]*x2;
316:       v += 4;
317:     }
318:     if (usecprow) z = zarray + 2*ridx[i];
319:     z[0] = sum1; z[1] = sum2;
320:     if (!usecprow) z += 2;
321:   }
322:   VecRestoreArray(xx,(PetscScalar**)&x);
323:   VecRestoreArray(zz,&zarray);
324:   PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);
325:   return(0);
326: }

330: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331: {
332:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
333:   PetscScalar       *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334:   const PetscScalar *x,*xb;
335:   const MatScalar   *v;
336:   PetscErrorCode    ierr;
337:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
338:   PetscTruth        usecprow=a->compressedrow.use;
339: 

341: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342: #pragma disjoint(*v,*z,*xb)
343: #endif

346:   VecGetArray(xx,(PetscScalar**)&x);
347:   VecGetArray(zz,&zarray);

349:   idx = a->j;
350:   v   = a->a;
351:   if (usecprow){
352:     mbs  = a->compressedrow.nrows;
353:     ii   = a->compressedrow.i;
354:     ridx = a->compressedrow.rindex;
355:   } else {
356:     mbs = a->mbs;
357:     ii  = a->i;
358:     z   = zarray;
359:   }

361:   for (i=0; i<mbs; i++) {
362:     n  = ii[1] - ii[0]; ii++;
363:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
364:     nonzerorow += (n>0);
365:     for (j=0; j<n; j++) {
366:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
367:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
368:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
369:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
370:       v += 9;
371:     }
372:     if (usecprow) z = zarray + 3*ridx[i];
373:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
374:     if (!usecprow) z += 3;
375:   }
376:   VecRestoreArray(xx,(PetscScalar**)&x);
377:   VecRestoreArray(zz,&zarray);
378:   PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);
379:   return(0);
380: }

384: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
385: {
386:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
387:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
388:   const PetscScalar *x,*xb;
389:   const MatScalar   *v;
390:   PetscErrorCode    ierr;
391:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
392:   PetscTruth        usecprow=a->compressedrow.use;

395:   VecGetArray(xx,(PetscScalar**)&x);
396:   VecGetArray(zz,&zarray);

398:   idx = a->j;
399:   v   = a->a;
400:   if (usecprow){
401:     mbs  = a->compressedrow.nrows;
402:     ii   = a->compressedrow.i;
403:     ridx = a->compressedrow.rindex;
404:   } else {
405:     mbs = a->mbs;
406:     ii  = a->i;
407:     z   = zarray;
408:   }

410:   for (i=0; i<mbs; i++) {
411:     n  = ii[1] - ii[0]; ii++;
412:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
413:     nonzerorow += (n>0);
414:     for (j=0; j<n; j++) {
415:       xb = x + 4*(*idx++);
416:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
418:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
419:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
420:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
421:       v += 16;
422:     }
423:     if (usecprow) z = zarray + 4*ridx[i];
424:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
425:     if (!usecprow) z += 4;
426:   }
427:   VecRestoreArray(xx,(PetscScalar**)&x);
428:   VecRestoreArray(zz,&zarray);
429:   PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);
430:   return(0);
431: }

435: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
436: {
437:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
438:   PetscScalar       sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
439:   const PetscScalar *xb,*x;
440:   const MatScalar   *v;
441:   PetscErrorCode    ierr;
442:   const PetscInt    *idx,*ii,*ridx=PETSC_NULL;
443:   PetscInt          mbs,i,j,n,nonzerorow=0;
444:   PetscTruth        usecprow=a->compressedrow.use;

447:   VecGetArray(xx,(PetscScalar**)&x);
448:   VecGetArray(zz,&zarray);

450:   idx = a->j;
451:   v   = a->a;
452:   if (usecprow){
453:     mbs  = a->compressedrow.nrows;
454:     ii   = a->compressedrow.i;
455:     ridx = a->compressedrow.rindex;
456:   } else {
457:     mbs = a->mbs;
458:     ii  = a->i;
459:     z   = zarray;
460:   }

462:   for (i=0; i<mbs; i++) {
463:     n  = ii[1] - ii[0]; ii++;
464:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
465:     nonzerorow += (n>0);
466:     for (j=0; j<n; j++) {
467:       xb = x + 5*(*idx++);
468:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
469:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
470:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
471:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
472:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
473:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
474:       v += 25;
475:     }
476:     if (usecprow) z = zarray + 5*ridx[i];
477:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
478:     if (!usecprow) z += 5;
479:   }
480:   VecRestoreArray(xx,(PetscScalar**)&x);
481:   VecRestoreArray(zz,&zarray);
482:   PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
483:   return(0);
484: }


489: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
490: {
491:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
492:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
493:   const PetscScalar *x,*xb;
494:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
495:   const MatScalar   *v;
496:   PetscErrorCode    ierr;
497:   PetscInt          mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
498:   PetscTruth        usecprow=a->compressedrow.use;

501:   VecGetArray(xx,(PetscScalar**)&x);
502:   VecGetArray(zz,&zarray);

504:   idx = a->j;
505:   v   = a->a;
506:   if (usecprow){
507:     mbs  = a->compressedrow.nrows;
508:     ii   = a->compressedrow.i;
509:     ridx = a->compressedrow.rindex;
510:   } else {
511:     mbs = a->mbs;
512:     ii  = a->i;
513:     z   = zarray;
514:   }

516:   for (i=0; i<mbs; i++) {
517:     n  = ii[1] - ii[0]; ii++;
518:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
519:     nonzerorow += (n>0);
520:     for (j=0; j<n; j++) {
521:       xb = x + 6*(*idx++);
522:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
523:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
524:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
525:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
526:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
527:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
528:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
529:       v += 36;
530:     }
531:     if (usecprow) z = zarray + 6*ridx[i];
532:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
533:     if (!usecprow) z += 6;
534:   }

536:   VecRestoreArray(xx,(PetscScalar**)&x);
537:   VecRestoreArray(zz,&zarray);
538:   PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);
539:   return(0);
540: }

544: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
545: {
546:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
547:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
548:   const PetscScalar *x,*xb;
549:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
550:   const MatScalar   *v;
551:   PetscErrorCode    ierr;
552:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
553:   PetscTruth        usecprow=a->compressedrow.use;

556:   VecGetArray(xx,(PetscScalar**)&x);
557:   VecGetArray(zz,&zarray);

559:   idx = a->j;
560:   v   = a->a;
561:   if (usecprow){
562:     mbs    = a->compressedrow.nrows;
563:     ii     = a->compressedrow.i;
564:     ridx = a->compressedrow.rindex;
565:   } else {
566:     mbs = a->mbs;
567:     ii  = a->i;
568:     z   = zarray;
569:   }

571:   for (i=0; i<mbs; i++) {
572:     n  = ii[1] - ii[0]; ii++;
573:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
574:     nonzerorow += (n>0);
575:     for (j=0; j<n; j++) {
576:       xb = x + 7*(*idx++);
577:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
578:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
579:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
580:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
581:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
582:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
583:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
584:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
585:       v += 49;
586:     }
587:     if (usecprow) z = zarray + 7*ridx[i];
588:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
589:     if (!usecprow) z += 7;
590:   }

592:   VecRestoreArray(xx,(PetscScalar**)&x);
593:   VecRestoreArray(zz,&zarray);
594:   PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);
595:   return(0);
596: }

598: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
599: /* Default MatMult for block size 15 */

603: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
604: {
605:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
606:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
607:   const PetscScalar *x,*xb;
608:   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
609:   const MatScalar   *v;
610:   PetscErrorCode    ierr;
611:   const PetscInt    *ii,*ij=a->j,*idx;
612:   PetscInt          mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
613:   PetscTruth        usecprow=a->compressedrow.use;

616:   VecGetArray(xx,(PetscScalar**)&x);
617:   VecGetArray(zz,&zarray);

619:   v   = a->a;
620:   if (usecprow){
621:     mbs    = a->compressedrow.nrows;
622:     ii     = a->compressedrow.i;
623:     ridx = a->compressedrow.rindex;
624:   } else {
625:     mbs = a->mbs;
626:     ii  = a->i;
627:     z   = zarray;
628:   }

630:   for (i=0; i<mbs; i++) {
631:     n  = ii[i+1] - ii[i];
632:     idx = ij + ii[i];
633:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
634:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

636:     nonzerorow += (n>0);
637:     for (j=0; j<n; j++) {
638:       xb = x + 15*(idx[j]);
639:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
640:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

642:       for(k=0;k<15;k++){
643:         sum1  += v[0]*xb[k];
644:         sum2  += v[1]*xb[k];
645:         sum3  += v[2]*xb[k];
646:         sum4  += v[3]*xb[k];
647:         sum5  += v[4]*xb[k];
648:         sum6  += v[5]*xb[k];
649:         sum7  += v[6]*xb[k];
650:         sum8  += v[7]*xb[k];
651:         sum9  += v[8]*xb[k];
652:         sum10 += v[9]*xb[k];
653:         sum11 += v[10]*xb[k];
654:         sum12 += v[11]*xb[k];
655:         sum13 += v[12]*xb[k];
656:         sum14 += v[13]*xb[k];
657:         sum15 += v[14]*xb[k];
658:         v += 15;
659:       }
660:     }
661:     if (usecprow) z = zarray + 15*ridx[i];
662:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
663:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

665:     if (!usecprow) z += 15;
666:   }

668:   VecRestoreArray(xx,(PetscScalar**)&x);
669:   VecRestoreArray(zz,&zarray);
670:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
671:   return(0);
672: }

674: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
677: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
678: {
679:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
680:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
681:   const PetscScalar *x,*xb;
682:   PetscScalar        x1,x2,x3,x4,*zarray;
683:   const MatScalar   *v;
684:   PetscErrorCode    ierr;
685:   const PetscInt    *ii,*ij=a->j,*idx;
686:   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
687:   PetscTruth        usecprow=a->compressedrow.use;

690:   VecGetArray(xx,(PetscScalar**)&x);
691:   VecGetArray(zz,&zarray);

693:   v   = a->a;
694:   if (usecprow){
695:     mbs    = a->compressedrow.nrows;
696:     ii     = a->compressedrow.i;
697:     ridx = a->compressedrow.rindex;
698:   } else {
699:     mbs = a->mbs;
700:     ii  = a->i;
701:     z   = zarray;
702:   }

704:   for (i=0; i<mbs; i++) {
705:     n  = ii[i+1] - ii[i];
706:     idx = ij + ii[i];
707:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
708:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

710:     nonzerorow += (n>0);
711:     for (j=0; j<n; j++) {
712:       xb = x + 15*(idx[j]);
713:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

715:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
716:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
717:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
718:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
719:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
720:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
721:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
722:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
723:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
724:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
725:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
726:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
727:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
728:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
729:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

731:       v += 60;

733:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

735:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
736:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
737:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
738:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
739:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
740:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
741:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
742:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
743:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
744:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
745:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
746:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
747:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
748:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
749:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
750:       v += 60;

752:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
753:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
754:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
755:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
756:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
757:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
758:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
759:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
760:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
761:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
762:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
763:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
764:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
765:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
766:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
767:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
768:       v  += 60;

770:       x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
771:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
772:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
773:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
774:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
775:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
776:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
777:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
778:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
779:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
780:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
781:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
782:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
783:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
784:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
785:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
786:       v += 45;
787:     }
788:     if (usecprow) z = zarray + 15*ridx[i];
789:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
790:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

792:     if (!usecprow) z += 15;
793:   }

795:   VecRestoreArray(xx,(PetscScalar**)&x);
796:   VecRestoreArray(zz,&zarray);
797:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
798:   return(0);
799: }

801: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
804: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
805: {
806:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
807:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
808:   const PetscScalar *x,*xb;
809:   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
810:   const MatScalar   *v;
811:   PetscErrorCode    ierr;
812:   const PetscInt    *ii,*ij=a->j,*idx;
813:   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
814:   PetscTruth        usecprow=a->compressedrow.use;

817:   VecGetArray(xx,(PetscScalar**)&x);
818:   VecGetArray(zz,&zarray);

820:   v   = a->a;
821:   if (usecprow){
822:     mbs    = a->compressedrow.nrows;
823:     ii     = a->compressedrow.i;
824:     ridx = a->compressedrow.rindex;
825:   } else {
826:     mbs = a->mbs;
827:     ii  = a->i;
828:     z   = zarray;
829:   }

831:   for (i=0; i<mbs; i++) {
832:     n  = ii[i+1] - ii[i];
833:     idx = ij + ii[i];
834:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
835:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

837:     nonzerorow += (n>0);
838:     for (j=0; j<n; j++) {
839:       xb = x + 15*(idx[j]);
840:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
841:       x8 = xb[7];

843:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
844:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
845:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
846:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
847:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
848:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
849:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
850:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
851:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
852:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
853:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
854:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
855:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
856:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
857:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
858:       v += 120;

860:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

862:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
863:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
864:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
865:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
866:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
867:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
868:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
869:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
870:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
871:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
872:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
873:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
874:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
875:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
876:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
877:       v += 105;
878:     }
879:     if (usecprow) z = zarray + 15*ridx[i];
880:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
881:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

883:     if (!usecprow) z += 15;
884:   }

886:   VecRestoreArray(xx,(PetscScalar**)&x);
887:   VecRestoreArray(zz,&zarray);
888:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
889:   return(0);
890: }

892: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */

896: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
897: {
898:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
899:   PetscScalar       *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
900:   const PetscScalar *x,*xb;
901:   PetscScalar        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
902:   const MatScalar   *v;
903:   PetscErrorCode    ierr;
904:   const PetscInt    *ii,*ij=a->j,*idx;
905:   PetscInt          mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
906:   PetscTruth        usecprow=a->compressedrow.use;

909:   VecGetArray(xx,(PetscScalar**)&x);
910:   VecGetArray(zz,&zarray);

912:   v   = a->a;
913:   if (usecprow){
914:     mbs    = a->compressedrow.nrows;
915:     ii     = a->compressedrow.i;
916:     ridx = a->compressedrow.rindex;
917:   } else {
918:     mbs = a->mbs;
919:     ii  = a->i;
920:     z   = zarray;
921:   }

923:   for (i=0; i<mbs; i++) {
924:     n  = ii[i+1] - ii[i];
925:     idx = ij + ii[i];
926:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
927:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

929:     nonzerorow += (n>0);
930:     for (j=0; j<n; j++) {
931:       xb = x + 15*(idx[j]);
932:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
933:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

935:       sum1 +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
936:       sum2 +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
937:       sum3 +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
938:       sum4 +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
939:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
940:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
941:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
942:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
943:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
944:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
945:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
946:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
947:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
948:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
949:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
950:       v += 225;
951:     }
952:     if (usecprow) z = zarray + 15*ridx[i];
953:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
954:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

956:     if (!usecprow) z += 15;
957:   }

959:   VecRestoreArray(xx,(PetscScalar**)&x);
960:   VecRestoreArray(zz,&zarray);
961:   PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
962:   return(0);
963: }


966: /*
967:     This will not work with MatScalar == float because it calls the BLAS
968: */
971: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
972: {
973:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
974:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
975:   MatScalar      *v;
977:   PetscInt       mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
978:   PetscInt       ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
979:   PetscTruth     usecprow=a->compressedrow.use;

982:   VecGetArray(xx,&x);
983:   VecGetArray(zz,&zarray);

985:   idx = a->j;
986:   v   = a->a;
987:   if (usecprow){
988:     mbs  = a->compressedrow.nrows;
989:     ii   = a->compressedrow.i;
990:     ridx = a->compressedrow.rindex;
991:   } else {
992:     mbs = a->mbs;
993:     ii  = a->i;
994:     z   = zarray;
995:   }

997:   if (!a->mult_work) {
998:     k    = PetscMax(A->rmap->n,A->cmap->n);
999:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1000:   }
1001:   work = a->mult_work;
1002:   for (i=0; i<mbs; i++) {
1003:     n     = ii[1] - ii[0]; ii++;
1004:     ncols = n*bs;
1005:     workt = work;
1006:     nonzerorow += (n>0);
1007:     for (j=0; j<n; j++) {
1008:       xb = x + bs*(*idx++);
1009:       for (k=0; k<bs; k++) workt[k] = xb[k];
1010:       workt += bs;
1011:     }
1012:     if (usecprow) z = zarray + bs*ridx[i];
1013:     Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1014:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1015:     v += n*bs2;
1016:     if (!usecprow) z += bs;
1017:   }
1018:   VecRestoreArray(xx,&x);
1019:   VecRestoreArray(zz,&zarray);
1020:   PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);
1021:   return(0);
1022: }

1027: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1028: {
1029:   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1030:   const PetscScalar  *x;
1031:   PetscScalar        *y,*z,sum;
1032:   const MatScalar    *v;
1033:   PetscErrorCode     ierr;
1034:   PetscInt           mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1035:   const PetscInt     *idx,*ii;
1036:   PetscTruth         usecprow=a->compressedrow.use;

1039:   VecGetArray(xx,(PetscScalar**)&x);
1040:   VecGetArray(yy,&y);
1041:   if (zz != yy) {
1042:     VecGetArray(zz,&z);
1043:   } else {
1044:     z = y;
1045:   }

1047:   idx = a->j;
1048:   v   = a->a;
1049:   if (usecprow){
1050:     if (zz != yy){
1051:       PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1052:     }
1053:     mbs  = a->compressedrow.nrows;
1054:     ii   = a->compressedrow.i;
1055:     ridx = a->compressedrow.rindex;
1056:   } else {
1057:     ii  = a->i;
1058:   }

1060:   for (i=0; i<mbs; i++) {
1061:     n    = ii[1] - ii[0];
1062:     ii++;
1063:     if (!usecprow){
1064:       nonzerorow += (n>0);
1065:       sum = y[i];
1066:     } else {
1067:       sum = y[ridx[i]];
1068:     }
1069:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1070:     v += n;
1071:     idx += n;
1072:     if (usecprow){
1073:       z[ridx[i]] = sum;
1074:     } else {
1075:       z[i] = sum;
1076:     }
1077:   }
1078:   VecRestoreArray(xx,(PetscScalar**)&x);
1079:   VecRestoreArray(yy,&y);
1080:   if (zz != yy) {
1081:     VecRestoreArray(zz,&z);
1082:   }
1083:   PetscLogFlops(2.0*a->nz - nonzerorow);
1084:   return(0);
1085: }

1089: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1090: {
1091:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1092:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2;
1093:   PetscScalar    x1,x2,*yarray,*zarray;
1094:   MatScalar      *v;
1096:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1097:   PetscTruth     usecprow=a->compressedrow.use;

1100:   VecGetArray(xx,&x);
1101:   VecGetArray(yy,&yarray);
1102:   if (zz != yy) {
1103:     VecGetArray(zz,&zarray);
1104:   } else {
1105:     zarray = yarray;
1106:   }

1108:   idx = a->j;
1109:   v   = a->a;
1110:   if (usecprow){
1111:     if (zz != yy){
1112:       PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1113:     }
1114:     mbs  = a->compressedrow.nrows;
1115:     ii   = a->compressedrow.i;
1116:     ridx = a->compressedrow.rindex;
1117:     if (zz != yy){
1118:       PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1119:     }
1120:   } else {
1121:     ii  = a->i;
1122:     y   = yarray;
1123:     z   = zarray;
1124:   }

1126:   for (i=0; i<mbs; i++) {
1127:     n  = ii[1] - ii[0]; ii++;
1128:     if (usecprow){
1129:       z = zarray + 2*ridx[i];
1130:       y = yarray + 2*ridx[i];
1131:     }
1132:     sum1 = y[0]; sum2 = y[1];
1133:     for (j=0; j<n; j++) {
1134:       xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1135:       sum1 += v[0]*x1 + v[2]*x2;
1136:       sum2 += v[1]*x1 + v[3]*x2;
1137:       v += 4;
1138:     }
1139:     z[0] = sum1; z[1] = sum2;
1140:     if (!usecprow){
1141:       z += 2; y += 2;
1142:     }
1143:   }
1144:   VecRestoreArray(xx,&x);
1145:   VecRestoreArray(yy,&yarray);
1146:   if (zz != yy) {
1147:     VecRestoreArray(zz,&zarray);
1148:   }
1149:   PetscLogFlops(4.0*a->nz);
1150:   return(0);
1151: }

1155: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1156: {
1157:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1158:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1159:   MatScalar      *v;
1161:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1162:   PetscTruth     usecprow=a->compressedrow.use;

1165:   VecGetArray(xx,&x);
1166:   VecGetArray(yy,&yarray);
1167:   if (zz != yy) {
1168:     VecGetArray(zz,&zarray);
1169:   } else {
1170:     zarray = yarray;
1171:   }

1173:   idx = a->j;
1174:   v   = a->a;
1175:   if (usecprow){
1176:     if (zz != yy){
1177:       PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1178:     }
1179:     mbs  = a->compressedrow.nrows;
1180:     ii   = a->compressedrow.i;
1181:     ridx = a->compressedrow.rindex;
1182:   } else {
1183:     ii  = a->i;
1184:     y   = yarray;
1185:     z   = zarray;
1186:   }

1188:   for (i=0; i<mbs; i++) {
1189:     n  = ii[1] - ii[0]; ii++;
1190:     if (usecprow){
1191:       z = zarray + 3*ridx[i];
1192:       y = yarray + 3*ridx[i];
1193:     }
1194:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1195:     for (j=0; j<n; j++) {
1196:       xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1197:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1198:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1199:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1200:       v += 9;
1201:     }
1202:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1203:     if (!usecprow){
1204:       z += 3; y += 3;
1205:     }
1206:   }
1207:   VecRestoreArray(xx,&x);
1208:   VecRestoreArray(yy,&yarray);
1209:   if (zz != yy) {
1210:     VecRestoreArray(zz,&zarray);
1211:   }
1212:   PetscLogFlops(18.0*a->nz);
1213:   return(0);
1214: }

1218: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1219: {
1220:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1221:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1222:   MatScalar      *v;
1224:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1225:   PetscTruth     usecprow=a->compressedrow.use;

1228:   VecGetArray(xx,&x);
1229:   VecGetArray(yy,&yarray);
1230:   if (zz != yy) {
1231:     VecGetArray(zz,&zarray);
1232:   } else {
1233:     zarray = yarray;
1234:   }

1236:   idx   = a->j;
1237:   v     = a->a;
1238:   if (usecprow){
1239:     if (zz != yy){
1240:       PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1241:     }
1242:     mbs  = a->compressedrow.nrows;
1243:     ii   = a->compressedrow.i;
1244:     ridx = a->compressedrow.rindex;
1245:   } else {
1246:     ii  = a->i;
1247:     y   = yarray;
1248:     z   = zarray;
1249:   }

1251:   for (i=0; i<mbs; i++) {
1252:     n  = ii[1] - ii[0]; ii++;
1253:     if (usecprow){
1254:       z = zarray + 4*ridx[i];
1255:       y = yarray + 4*ridx[i];
1256:     }
1257:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1258:     for (j=0; j<n; j++) {
1259:       xb = x + 4*(*idx++);
1260:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1261:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1262:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1263:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1264:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1265:       v += 16;
1266:     }
1267:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1268:     if (!usecprow){
1269:       z += 4; y += 4;
1270:     }
1271:   }
1272:   VecRestoreArray(xx,&x);
1273:   VecRestoreArray(yy,&yarray);
1274:   if (zz != yy) {
1275:     VecRestoreArray(zz,&zarray);
1276:   }
1277:   PetscLogFlops(32.0*a->nz);
1278:   return(0);
1279: }

1283: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1284: {
1285:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1286:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1287:   PetscScalar    *yarray,*zarray;
1288:   MatScalar      *v;
1290:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1291:   PetscTruth     usecprow=a->compressedrow.use;

1294:   VecGetArray(xx,&x);
1295:   VecGetArray(yy,&yarray);
1296:   if (zz != yy) {
1297:     VecGetArray(zz,&zarray);
1298:   } else {
1299:     zarray = yarray;
1300:   }

1302:   idx = a->j;
1303:   v   = a->a;
1304:   if (usecprow){
1305:     if (zz != yy){
1306:       PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1307:     }
1308:     mbs  = a->compressedrow.nrows;
1309:     ii   = a->compressedrow.i;
1310:     ridx = a->compressedrow.rindex;
1311:   } else {
1312:     ii  = a->i;
1313:     y   = yarray;
1314:     z   = zarray;
1315:   }

1317:   for (i=0; i<mbs; i++) {
1318:     n  = ii[1] - ii[0]; ii++;
1319:     if (usecprow){
1320:       z = zarray + 5*ridx[i];
1321:       y = yarray + 5*ridx[i];
1322:     }
1323:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1324:     for (j=0; j<n; j++) {
1325:       xb = x + 5*(*idx++);
1326:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1327:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1328:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1329:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1330:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1331:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1332:       v += 25;
1333:     }
1334:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1335:     if (!usecprow){
1336:       z += 5; y += 5;
1337:     }
1338:   }
1339:   VecRestoreArray(xx,&x);
1340:   VecRestoreArray(yy,&yarray);
1341:   if (zz != yy) {
1342:     VecRestoreArray(zz,&zarray);
1343:   }
1344:   PetscLogFlops(50.0*a->nz);
1345:   return(0);
1346: }
1349: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1350: {
1351:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1352:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1353:   PetscScalar    x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1354:   MatScalar      *v;
1356:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1357:   PetscTruth     usecprow=a->compressedrow.use;

1360:   VecGetArray(xx,&x);
1361:   VecGetArray(yy,&yarray);
1362:   if (zz != yy) {
1363:     VecGetArray(zz,&zarray);
1364:   } else {
1365:     zarray = yarray;
1366:   }

1368:   idx = a->j;
1369:   v   = a->a;
1370:   if (usecprow){
1371:     if (zz != yy){
1372:       PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1373:     }
1374:     mbs  = a->compressedrow.nrows;
1375:     ii   = a->compressedrow.i;
1376:     ridx = a->compressedrow.rindex;
1377:   } else {
1378:     ii  = a->i;
1379:     y   = yarray;
1380:     z   = zarray;
1381:   }

1383:   for (i=0; i<mbs; i++) {
1384:     n  = ii[1] - ii[0]; ii++;
1385:     if (usecprow){
1386:       z = zarray + 6*ridx[i];
1387:       y = yarray + 6*ridx[i];
1388:     }
1389:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1390:     for (j=0; j<n; j++) {
1391:       xb = x + 6*(*idx++);
1392:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1393:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1394:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1395:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1396:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1397:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1398:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1399:       v += 36;
1400:     }
1401:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1402:     if (!usecprow){
1403:       z += 6; y += 6;
1404:     }
1405:   }
1406:   VecRestoreArray(xx,&x);
1407:   VecRestoreArray(yy,&yarray);
1408:   if (zz != yy) {
1409:     VecRestoreArray(zz,&zarray);
1410:   }
1411:   PetscLogFlops(72.0*a->nz);
1412:   return(0);
1413: }

1417: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1418: {
1419:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1420:   PetscScalar    *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1421:   PetscScalar    x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1422:   MatScalar      *v;
1424:   PetscInt       mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1425:   PetscTruth     usecprow=a->compressedrow.use;

1428:   VecGetArray(xx,&x);
1429:   VecGetArray(yy,&yarray);
1430:   if (zz != yy) {
1431:     VecGetArray(zz,&zarray);
1432:   } else {
1433:     zarray = yarray;
1434:   }

1436:   idx = a->j;
1437:   v   = a->a;
1438:   if (usecprow){
1439:     if (zz != yy){
1440:       PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1441:     }
1442:     mbs  = a->compressedrow.nrows;
1443:     ii   = a->compressedrow.i;
1444:     ridx = a->compressedrow.rindex;
1445:   } else {
1446:     ii  = a->i;
1447:     y   = yarray;
1448:     z   = zarray;
1449:   }

1451:   for (i=0; i<mbs; i++) {
1452:     n  = ii[1] - ii[0]; ii++;
1453:     if (usecprow){
1454:       z = zarray + 7*ridx[i];
1455:       y = yarray + 7*ridx[i];
1456:     }
1457:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1458:     for (j=0; j<n; j++) {
1459:       xb = x + 7*(*idx++);
1460:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1461:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1462:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1463:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1464:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1465:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1466:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1467:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1468:       v += 49;
1469:     }
1470:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1471:     if (!usecprow){
1472:       z += 7; y += 7;
1473:     }
1474:   }
1475:   VecRestoreArray(xx,&x);
1476:   VecRestoreArray(yy,&yarray);
1477:   if (zz != yy) {
1478:     VecRestoreArray(zz,&zarray);
1479:   }
1480:   PetscLogFlops(98.0*a->nz);
1481:   return(0);
1482: }

1486: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1487: {
1488:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1489:   PetscScalar    *x,*z = 0,*xb,*work,*workt,*zarray;
1490:   MatScalar      *v;
1492:   PetscInt       mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1493:   PetscInt       ncols,k,*ridx=PETSC_NULL;
1494:   PetscTruth     usecprow=a->compressedrow.use;

1497:   VecCopy_Seq(yy,zz);
1498:   VecGetArray(xx,&x);
1499:   VecGetArray(zz,&zarray);

1501:   idx = a->j;
1502:   v   = a->a;
1503:   if (usecprow){
1504:     mbs    = a->compressedrow.nrows;
1505:     ii     = a->compressedrow.i;
1506:     ridx = a->compressedrow.rindex;
1507:   } else {
1508:     mbs = a->mbs;
1509:     ii  = a->i;
1510:     z   = zarray;
1511:   }

1513:   if (!a->mult_work) {
1514:     k    = PetscMax(A->rmap->n,A->cmap->n);
1515:     PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1516:   }
1517:   work = a->mult_work;
1518:   for (i=0; i<mbs; i++) {
1519:     n     = ii[1] - ii[0]; ii++;
1520:     ncols = n*bs;
1521:     workt = work;
1522:     for (j=0; j<n; j++) {
1523:       xb = x + bs*(*idx++);
1524:       for (k=0; k<bs; k++) workt[k] = xb[k];
1525:       workt += bs;
1526:     }
1527:     if (usecprow) z = zarray + bs*ridx[i];
1528:     Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1529:     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1530:     v += n*bs2;
1531:     if (!usecprow){
1532:       z += bs;
1533:     }
1534:   }
1535:   VecRestoreArray(xx,&x);
1536:   VecRestoreArray(zz,&zarray);
1537:   PetscLogFlops(2.0*a->nz*bs2);
1538:   return(0);
1539: }

1543: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1544: {
1545:   PetscScalar    zero = 0.0;

1549:   VecSet(zz,zero);
1550:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1551:   return(0);
1552: }

1556: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1557: {
1558:   PetscScalar    zero = 0.0;

1562:   VecSet(zz,zero);
1563:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1564:   return(0);
1565: }

1569: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)

1571: {
1572:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1573:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1574:   MatScalar         *v;
1575:   PetscErrorCode    ierr;
1576:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1577:   Mat_CompressedRow cprow = a->compressedrow;
1578:   PetscTruth        usecprow=cprow.use;

1581:   if (yy != zz) { VecCopy_Seq(yy,zz); }
1582:   VecGetArray(xx,&x);
1583:   VecGetArray(zz,&z);

1585:   idx = a->j;
1586:   v   = a->a;
1587:   if (usecprow){
1588:     mbs  = cprow.nrows;
1589:     ii   = cprow.i;
1590:     ridx = cprow.rindex;
1591:   } else {
1592:     mbs=a->mbs;
1593:     ii = a->i;
1594:     xb = x;
1595:   }

1597:   switch (bs) {
1598:   case 1:
1599:     for (i=0; i<mbs; i++) {
1600:       if (usecprow) xb = x + ridx[i];
1601:       x1 = xb[0];
1602:       ib = idx + ii[0];
1603:       n  = ii[1] - ii[0]; ii++;
1604:       for (j=0; j<n; j++) {
1605:         rval    = ib[j];
1606:         z[rval] += PetscConj(*v) * x1;
1607:         v++;
1608:       }
1609:       if (!usecprow) xb++;
1610:     }
1611:     break;
1612:   case 2:
1613:     for (i=0; i<mbs; i++) {
1614:       if (usecprow) xb = x + 2*ridx[i];
1615:       x1 = xb[0]; x2 = xb[1];
1616:       ib = idx + ii[0];
1617:       n  = ii[1] - ii[0]; ii++;
1618:       for (j=0; j<n; j++) {
1619:         rval      = ib[j]*2;
1620:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1621:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1622:         v  += 4;
1623:       }
1624:       if (!usecprow) xb += 2;
1625:     }
1626:     break;
1627:   case 3:
1628:     for (i=0; i<mbs; i++) {
1629:       if (usecprow) xb = x + 3*ridx[i];
1630:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1631:       ib = idx + ii[0];
1632:       n  = ii[1] - ii[0]; ii++;
1633:       for (j=0; j<n; j++) {
1634:         rval      = ib[j]*3;
1635:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1636:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1637:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1638:         v  += 9;
1639:       }
1640:       if (!usecprow) xb += 3;
1641:     }
1642:     break;
1643:   case 4:
1644:     for (i=0; i<mbs; i++) {
1645:       if (usecprow) xb = x + 4*ridx[i];
1646:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1647:       ib = idx + ii[0];
1648:       n  = ii[1] - ii[0]; ii++;
1649:       for (j=0; j<n; j++) {
1650:         rval      = ib[j]*4;
1651:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
1652:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
1653:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1654:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1655:         v  += 16;
1656:       }
1657:       if (!usecprow) xb += 4;
1658:     }
1659:     break;
1660:   case 5:
1661:     for (i=0; i<mbs; i++) {
1662:       if (usecprow) xb = x + 5*ridx[i];
1663:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1664:       x4 = xb[3]; x5 = xb[4];
1665:       ib = idx + ii[0];
1666:       n  = ii[1] - ii[0]; ii++;
1667:       for (j=0; j<n; j++) {
1668:         rval      = ib[j]*5;
1669:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
1670:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
1671:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1672:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1673:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1674:         v  += 25;
1675:       }
1676:       if (!usecprow) xb += 5;
1677:     }
1678:     break;
1679:   default: {      /* block sizes larger than 5 by 5 are handled by BLAS */
1680:       PetscInt     ncols,k;
1681:       PetscScalar  *work,*workt,*xtmp;

1683:       SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1684:       if (!a->mult_work) {
1685:         k = PetscMax(A->rmap->n,A->cmap->n);
1686:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1687:       }
1688:       work = a->mult_work;
1689:       xtmp = x;
1690:       for (i=0; i<mbs; i++) {
1691:         n     = ii[1] - ii[0]; ii++;
1692:         ncols = n*bs;
1693:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1694:         if (usecprow) {
1695:           xtmp = x + bs*ridx[i];
1696:         }
1697:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1698:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1699:         v += n*bs2;
1700:         if (!usecprow) xtmp += bs;
1701:         workt = work;
1702:         for (j=0; j<n; j++) {
1703:           zb = z + bs*(*idx++);
1704:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1705:           workt += bs;
1706:         }
1707:       }
1708:     }
1709:   }
1710:   VecRestoreArray(xx,&x);
1711:   VecRestoreArray(zz,&z);
1712:   PetscLogFlops(2.0*a->nz*a->bs2);
1713:   return(0);
1714: }

1718: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1719: {
1720:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1721:   PetscScalar       *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1722:   MatScalar         *v;
1723:   PetscErrorCode    ierr;
1724:   PetscInt          mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1725:   Mat_CompressedRow cprow = a->compressedrow;
1726:   PetscTruth        usecprow=cprow.use;

1729:   if (yy != zz) { VecCopy_Seq(yy,zz); }
1730:   VecGetArray(xx,&x);
1731:   VecGetArray(zz,&z);

1733:   idx = a->j;
1734:   v   = a->a;
1735:   if (usecprow){
1736:     mbs  = cprow.nrows;
1737:     ii   = cprow.i;
1738:     ridx = cprow.rindex;
1739:   } else {
1740:     mbs=a->mbs;
1741:     ii = a->i;
1742:     xb = x;
1743:   }

1745:   switch (bs) {
1746:   case 1:
1747:     for (i=0; i<mbs; i++) {
1748:       if (usecprow) xb = x + ridx[i];
1749:       x1 = xb[0];
1750:       ib = idx + ii[0];
1751:       n  = ii[1] - ii[0]; ii++;
1752:       for (j=0; j<n; j++) {
1753:         rval    = ib[j];
1754:         z[rval] += *v * x1;
1755:         v++;
1756:       }
1757:       if (!usecprow) xb++;
1758:     }
1759:     break;
1760:   case 2:
1761:     for (i=0; i<mbs; i++) {
1762:       if (usecprow) xb = x + 2*ridx[i];
1763:       x1 = xb[0]; x2 = xb[1];
1764:       ib = idx + ii[0];
1765:       n  = ii[1] - ii[0]; ii++;
1766:       for (j=0; j<n; j++) {
1767:         rval      = ib[j]*2;
1768:         z[rval++] += v[0]*x1 + v[1]*x2;
1769:         z[rval++] += v[2]*x1 + v[3]*x2;
1770:         v  += 4;
1771:       }
1772:       if (!usecprow) xb += 2;
1773:     }
1774:     break;
1775:   case 3:
1776:     for (i=0; i<mbs; i++) {
1777:       if (usecprow) xb = x + 3*ridx[i];
1778:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1779:       ib = idx + ii[0];
1780:       n  = ii[1] - ii[0]; ii++;
1781:       for (j=0; j<n; j++) {
1782:         rval      = ib[j]*3;
1783:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1784:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1785:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1786:         v  += 9;
1787:       }
1788:       if (!usecprow) xb += 3;
1789:     }
1790:     break;
1791:   case 4:
1792:     for (i=0; i<mbs; i++) {
1793:       if (usecprow) xb = x + 4*ridx[i];
1794:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1795:       ib = idx + ii[0];
1796:       n  = ii[1] - ii[0]; ii++;
1797:       for (j=0; j<n; j++) {
1798:         rval      = ib[j]*4;
1799:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1800:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1801:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1802:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1803:         v  += 16;
1804:       }
1805:       if (!usecprow) xb += 4;
1806:     }
1807:     break;
1808:   case 5:
1809:     for (i=0; i<mbs; i++) {
1810:       if (usecprow) xb = x + 5*ridx[i];
1811:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1812:       x4 = xb[3]; x5 = xb[4];
1813:       ib = idx + ii[0];
1814:       n  = ii[1] - ii[0]; ii++;
1815:       for (j=0; j<n; j++) {
1816:         rval      = ib[j]*5;
1817:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1818:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1819:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1820:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1821:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1822:         v  += 25;
1823:       }
1824:       if (!usecprow) xb += 5;
1825:     }
1826:     break;
1827:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
1828:       PetscInt     ncols,k;
1829:       PetscScalar  *work,*workt,*xtmp;

1831:       if (!a->mult_work) {
1832:         k = PetscMax(A->rmap->n,A->cmap->n);
1833:         PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1834:       }
1835:       work = a->mult_work;
1836:       xtmp = x;
1837:       for (i=0; i<mbs; i++) {
1838:         n     = ii[1] - ii[0]; ii++;
1839:         ncols = n*bs;
1840:         PetscMemzero(work,ncols*sizeof(PetscScalar));
1841:         if (usecprow) {
1842:           xtmp = x + bs*ridx[i];
1843:         }
1844:         Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1845:         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1846:         v += n*bs2;
1847:         if (!usecprow) xtmp += bs;
1848:         workt = work;
1849:         for (j=0; j<n; j++) {
1850:           zb = z + bs*(*idx++);
1851:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
1852:           workt += bs;
1853:         }
1854:       }
1855:     }
1856:   }
1857:   VecRestoreArray(xx,&x);
1858:   VecRestoreArray(zz,&z);
1859:   PetscLogFlops(2.0*a->nz*a->bs2);
1860:   return(0);
1861: }

1865: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1866: {
1867:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)inA->data;
1868:   PetscInt       totalnz = a->bs2*a->nz;
1869:   PetscScalar    oalpha = alpha;
1871:   PetscBLASInt   one = 1,tnz = PetscBLASIntCast(totalnz);

1874:   BLASscal_(&tnz,&oalpha,a->a,&one);
1875:   PetscLogFlops(totalnz);
1876:   return(0);
1877: }

1881: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1882: {
1884:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1885:   MatScalar      *v = a->a;
1886:   PetscReal      sum = 0.0;
1887:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

1890:   if (type == NORM_FROBENIUS) {
1891:     for (i=0; i< bs2*nz; i++) {
1892: #if defined(PETSC_USE_COMPLEX)
1893:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1894: #else
1895:       sum += (*v)*(*v); v++;
1896: #endif
1897:     }
1898:     *norm = sqrt(sum);
1899:   } else if (type == NORM_1) { /* maximum column sum */
1900:     PetscReal *tmp;
1901:     PetscInt  *bcol = a->j;
1902:     PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1903:     PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1904:     for (i=0; i<nz; i++){
1905:       for (j=0; j<bs; j++){
1906:         k1 = bs*(*bcol) + j; /* column index */
1907:         for (k=0; k<bs; k++){
1908:           tmp[k1] += PetscAbsScalar(*v); v++;
1909:         }
1910:       }
1911:       bcol++;
1912:     }
1913:     *norm = 0.0;
1914:     for (j=0; j<A->cmap->n; j++) {
1915:       if (tmp[j] > *norm) *norm = tmp[j];
1916:     }
1917:     PetscFree(tmp);
1918:   } else if (type == NORM_INFINITY) { /* maximum row sum */
1919:     *norm = 0.0;
1920:     for (k=0; k<bs; k++) {
1921:       for (j=0; j<a->mbs; j++) {
1922:         v = a->a + bs2*a->i[j] + k;
1923:         sum = 0.0;
1924:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1925:           for (k1=0; k1<bs; k1++){
1926:             sum += PetscAbsScalar(*v);
1927:             v   += bs;
1928:           }
1929:         }
1930:         if (sum > *norm) *norm = sum;
1931:       }
1932:     }
1933:   } else {
1934:     SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1935:   }
1936:   return(0);
1937: }


1942: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1943: {
1944:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;

1948:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1949:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1950:     *flg = PETSC_FALSE;
1951:     return(0);
1952:   }
1953: 
1954:   /* if the a->i are the same */
1955:   PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1956:   if (!*flg) {
1957:     return(0);
1958:   }
1959: 
1960:   /* if a->j are the same */
1961:   PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1962:   if (!*flg) {
1963:     return(0);
1964:   }
1965:   /* if a->a are the same */
1966:   PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1967:   return(0);
1968: 
1969: }

1973: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1974: {
1975:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1977:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1978:   PetscScalar    *x,zero = 0.0;
1979:   MatScalar      *aa,*aa_j;

1982:   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1983:   bs   = A->rmap->bs;
1984:   aa   = a->a;
1985:   ai   = a->i;
1986:   aj   = a->j;
1987:   ambs = a->mbs;
1988:   bs2  = a->bs2;

1990:   VecSet(v,zero);
1991:   VecGetArray(v,&x);
1992:   VecGetLocalSize(v,&n);
1993:   if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1994:   for (i=0; i<ambs; i++) {
1995:     for (j=ai[i]; j<ai[i+1]; j++) {
1996:       if (aj[j] == i) {
1997:         row  = i*bs;
1998:         aa_j = aa+j*bs2;
1999:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2000:         break;
2001:       }
2002:     }
2003:   }
2004:   VecRestoreArray(v,&x);
2005:   return(0);
2006: }

2010: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2011: {
2012:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2013:   PetscScalar    *l,*r,x,*li,*ri;
2014:   MatScalar      *aa,*v;
2016:   PetscInt       i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;

2019:   ai  = a->i;
2020:   aj  = a->j;
2021:   aa  = a->a;
2022:   m   = A->rmap->n;
2023:   n   = A->cmap->n;
2024:   bs  = A->rmap->bs;
2025:   mbs = a->mbs;
2026:   bs2 = a->bs2;
2027:   if (ll) {
2028:     VecGetArray(ll,&l);
2029:     VecGetLocalSize(ll,&lm);
2030:     if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2031:     for (i=0; i<mbs; i++) { /* for each block row */
2032:       M  = ai[i+1] - ai[i];
2033:       li = l + i*bs;
2034:       v  = aa + bs2*ai[i];
2035:       for (j=0; j<M; j++) { /* for each block */
2036:         for (k=0; k<bs2; k++) {
2037:           (*v++) *= li[k%bs];
2038:         }
2039:       }
2040:     }
2041:     VecRestoreArray(ll,&l);
2042:     PetscLogFlops(a->nz);
2043:   }
2044: 
2045:   if (rr) {
2046:     VecGetArray(rr,&r);
2047:     VecGetLocalSize(rr,&rn);
2048:     if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2049:     for (i=0; i<mbs; i++) { /* for each block row */
2050:       M  = ai[i+1] - ai[i];
2051:       v  = aa + bs2*ai[i];
2052:       for (j=0; j<M; j++) { /* for each block */
2053:         ri = r + bs*aj[ai[i]+j];
2054:         for (k=0; k<bs; k++) {
2055:           x = ri[k];
2056:           for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
2057:         }
2058:       }
2059:     }
2060:     VecRestoreArray(rr,&r);
2061:     PetscLogFlops(a->nz);
2062:   }
2063:   return(0);
2064: }


2069: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2070: {
2071:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2074:   info->block_size     = a->bs2;
2075:   info->nz_allocated   = a->maxnz;
2076:   info->nz_used        = a->bs2*a->nz;
2077:   info->nz_unneeded    = (double)(info->nz_allocated - info->nz_used);
2078:   info->assemblies   = A->num_ass;
2079:   info->mallocs      = a->reallocs;
2080:   info->memory       = ((PetscObject)A)->mem;
2081:   if (A->factor) {
2082:     info->fill_ratio_given  = A->info.fill_ratio_given;
2083:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2084:     info->factor_mallocs    = A->info.factor_mallocs;
2085:   } else {
2086:     info->fill_ratio_given  = 0;
2087:     info->fill_ratio_needed = 0;
2088:     info->factor_mallocs    = 0;
2089:   }
2090:   return(0);
2091: }


2096: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2097: {
2098:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2102:   PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2103:   return(0);
2104: }