Actual source code: baij2.c
1: #define PETSCMAT_DLL
3: #include ../src/mat/impls/baij/seq/baij.h
4: #include ../src/mat/blockinvert.h
5: #include petscbt.h
6: #include petscblaslapack.h
10: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
14: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
15: const PetscInt *idx;
16: PetscInt start,end,*ai,*aj,bs,*nidx2;
17: PetscBT table;
20: m = a->mbs;
21: ai = a->i;
22: aj = a->j;
23: bs = A->rmap->bs;
25: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
27: PetscBTCreate(m,table);
28: PetscMalloc((m+1)*sizeof(PetscInt),&nidx);
29: PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
31: for (i=0; i<is_max; i++) {
32: /* Initialise the two local arrays */
33: isz = 0;
34: PetscBTMemzero(m,table);
35:
36: /* Extract the indices, assume there can be duplicate entries */
37: ISGetIndices(is[i],&idx);
38: ISGetLocalSize(is[i],&n);
40: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
41: for (j=0; j<n ; ++j){
42: ival = idx[j]/bs; /* convert the indices into block indices */
43: if (ival>=m) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
44: if(!PetscBTLookupSet(table,ival)) { nidx[isz++] = ival;}
45: }
46: ISRestoreIndices(is[i],&idx);
47: ISDestroy(is[i]);
48:
49: k = 0;
50: for (j=0; j<ov; j++){ /* for each overlap*/
51: n = isz;
52: for (; k<n ; k++){ /* do only those rows in nidx[k], which are not done yet */
53: row = nidx[k];
54: start = ai[row];
55: end = ai[row+1];
56: for (l = start; l<end ; l++){
57: val = aj[l];
58: if (!PetscBTLookupSet(table,val)) {nidx[isz++] = val;}
59: }
60: }
61: }
62: /* expand the Index Set */
63: for (j=0; j<isz; j++) {
64: for (k=0; k<bs; k++)
65: nidx2[j*bs+k] = nidx[j]*bs+k;
66: }
67: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
68: }
69: PetscBTDestroy(table);
70: PetscFree(nidx);
71: PetscFree(nidx2);
72: return(0);
73: }
77: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
78: {
79: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
81: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
82: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
83: const PetscInt *irow,*icol;
84: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
85: PetscInt *aj = a->j,*ai = a->i;
86: MatScalar *mat_a;
87: Mat C;
88: PetscTruth flag,sorted;
91: ISSorted(iscol,&sorted);
92: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
94: ISGetIndices(isrow,&irow);
95: ISGetIndices(iscol,&icol);
96: ISGetLocalSize(isrow,&nrows);
97: ISGetLocalSize(iscol,&ncols);
99: PetscMalloc((1+oldcols)*sizeof(PetscInt),&smap);
100: ssmap = smap;
101: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
102: PetscMemzero(smap,oldcols*sizeof(PetscInt));
103: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
104: /* determine lens of each row */
105: for (i=0; i<nrows; i++) {
106: kstart = ai[irow[i]];
107: kend = kstart + a->ilen[irow[i]];
108: lens[i] = 0;
109: for (k=kstart; k<kend; k++) {
110: if (ssmap[aj[k]]) {
111: lens[i]++;
112: }
113: }
114: }
115: /* Create and fill new matrix */
116: if (scall == MAT_REUSE_MATRIX) {
117: c = (Mat_SeqBAIJ *)((*B)->data);
119: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
120: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
121: if (!flag) {
122: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
123: }
124: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
125: C = *B;
126: } else {
127: MatCreate(((PetscObject)A)->comm,&C);
128: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
129: MatSetType(C,((PetscObject)A)->type_name);
130: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
131: }
132: c = (Mat_SeqBAIJ *)(C->data);
133: for (i=0; i<nrows; i++) {
134: row = irow[i];
135: kstart = ai[row];
136: kend = kstart + a->ilen[row];
137: mat_i = c->i[i];
138: mat_j = c->j + mat_i;
139: mat_a = c->a + mat_i*bs2;
140: mat_ilen = c->ilen + i;
141: for (k=kstart; k<kend; k++) {
142: if ((tcol=ssmap[a->j[k]])) {
143: *mat_j++ = tcol - 1;
144: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
145: mat_a += bs2;
146: (*mat_ilen)++;
147: }
148: }
149: }
150:
151: /* Free work space */
152: ISRestoreIndices(iscol,&icol);
153: PetscFree(smap);
154: PetscFree(lens);
155: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
156: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
157:
158: ISRestoreIndices(isrow,&irow);
159: *B = C;
160: return(0);
161: }
165: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
166: {
167: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
168: IS is1,is2;
170: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count;
171: const PetscInt *irow,*icol;
174: ISGetIndices(isrow,&irow);
175: ISGetIndices(iscol,&icol);
176: ISGetLocalSize(isrow,&nrows);
177: ISGetLocalSize(iscol,&ncols);
178:
179: /* Verify if the indices corespond to each element in a block
180: and form the IS with compressed IS */
181: PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
182: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
183: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
184: count = 0;
185: for (i=0; i<a->mbs; i++) {
186: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
187: if (vary[i]==bs) iary[count++] = i;
188: }
189: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
190:
191: PetscMemzero(vary,(a->mbs)*sizeof(PetscInt));
192: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
193: count = 0;
194: for (i=0; i<a->mbs; i++) {
195: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_PLIB,"Internal error in PETSc");
196: if (vary[i]==bs) iary[count++] = i;
197: }
198: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is2);
199: ISRestoreIndices(isrow,&irow);
200: ISRestoreIndices(iscol,&icol);
201: PetscFree2(vary,iary);
203: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
204: ISDestroy(is1);
205: ISDestroy(is2);
206: return(0);
207: }
211: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
212: {
214: PetscInt i;
217: if (scall == MAT_INITIAL_MATRIX) {
218: PetscMalloc((n+1)*sizeof(Mat),B);
219: }
221: for (i=0; i<n; i++) {
222: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
223: }
224: return(0);
225: }
228: /* -------------------------------------------------------*/
229: /* Should check that shapes of vectors and matrices match */
230: /* -------------------------------------------------------*/
234: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
235: {
236: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
237: PetscScalar *z,sum;
238: const PetscScalar *x;
239: const MatScalar *v;
240: PetscErrorCode ierr;
241: PetscInt mbs,i,n,nonzerorow=0;
242: const PetscInt *idx,*ii,*ridx=PETSC_NULL;
243: PetscTruth usecprow=a->compressedrow.use;
246: VecGetArray(xx,(PetscScalar**)&x);
247: VecGetArray(zz,&z);
249: if (usecprow){
250: mbs = a->compressedrow.nrows;
251: ii = a->compressedrow.i;
252: ridx = a->compressedrow.rindex;
253: PetscMemzero(z,mbs*sizeof(PetscScalar));
254: } else {
255: mbs = a->mbs;
256: ii = a->i;
257: }
259: for (i=0; i<mbs; i++) {
260: n = ii[1] - ii[0];
261: v = a->a + ii[0];
262: idx = a->j + ii[0];
263: ii++;
264: sum = 0.0;
265: PetscSparseDensePlusDot(sum,x,v,idx,n);
266: if (usecprow){
267: z[ridx[i]] = sum;
268: } else {
269: nonzerorow += (n>0);
270: z[i] = sum;
271: }
272: }
273: VecRestoreArray(xx,(PetscScalar**)&x);
274: VecRestoreArray(zz,&z);
275: PetscLogFlops(2.0*a->nz - nonzerorow);
276: return(0);
277: }
281: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
282: {
283: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
284: PetscScalar *z = 0,sum1,sum2,*zarray;
285: const PetscScalar *x,*xb;
286: PetscScalar x1,x2;
287: const MatScalar *v;
288: PetscErrorCode ierr;
289: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
290: PetscTruth usecprow=a->compressedrow.use;
293: VecGetArray(xx,(PetscScalar**)&x);
294: VecGetArray(zz,&zarray);
296: idx = a->j;
297: v = a->a;
298: if (usecprow){
299: mbs = a->compressedrow.nrows;
300: ii = a->compressedrow.i;
301: ridx = a->compressedrow.rindex;
302: } else {
303: mbs = a->mbs;
304: ii = a->i;
305: z = zarray;
306: }
308: for (i=0; i<mbs; i++) {
309: n = ii[1] - ii[0]; ii++;
310: sum1 = 0.0; sum2 = 0.0;
311: nonzerorow += (n>0);
312: for (j=0; j<n; j++) {
313: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
314: sum1 += v[0]*x1 + v[2]*x2;
315: sum2 += v[1]*x1 + v[3]*x2;
316: v += 4;
317: }
318: if (usecprow) z = zarray + 2*ridx[i];
319: z[0] = sum1; z[1] = sum2;
320: if (!usecprow) z += 2;
321: }
322: VecRestoreArray(xx,(PetscScalar**)&x);
323: VecRestoreArray(zz,&zarray);
324: PetscLogFlops(8.0*a->nz - 2.0*nonzerorow);
325: return(0);
326: }
330: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
331: {
332: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
333: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
334: const PetscScalar *x,*xb;
335: const MatScalar *v;
336: PetscErrorCode ierr;
337: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
338: PetscTruth usecprow=a->compressedrow.use;
339:
341: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
342: #pragma disjoint(*v,*z,*xb)
343: #endif
346: VecGetArray(xx,(PetscScalar**)&x);
347: VecGetArray(zz,&zarray);
349: idx = a->j;
350: v = a->a;
351: if (usecprow){
352: mbs = a->compressedrow.nrows;
353: ii = a->compressedrow.i;
354: ridx = a->compressedrow.rindex;
355: } else {
356: mbs = a->mbs;
357: ii = a->i;
358: z = zarray;
359: }
361: for (i=0; i<mbs; i++) {
362: n = ii[1] - ii[0]; ii++;
363: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
364: nonzerorow += (n>0);
365: for (j=0; j<n; j++) {
366: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
367: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
368: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
369: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
370: v += 9;
371: }
372: if (usecprow) z = zarray + 3*ridx[i];
373: z[0] = sum1; z[1] = sum2; z[2] = sum3;
374: if (!usecprow) z += 3;
375: }
376: VecRestoreArray(xx,(PetscScalar**)&x);
377: VecRestoreArray(zz,&zarray);
378: PetscLogFlops(18.0*a->nz - 3.0*nonzerorow);
379: return(0);
380: }
384: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
385: {
386: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
387: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
388: const PetscScalar *x,*xb;
389: const MatScalar *v;
390: PetscErrorCode ierr;
391: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
392: PetscTruth usecprow=a->compressedrow.use;
395: VecGetArray(xx,(PetscScalar**)&x);
396: VecGetArray(zz,&zarray);
398: idx = a->j;
399: v = a->a;
400: if (usecprow){
401: mbs = a->compressedrow.nrows;
402: ii = a->compressedrow.i;
403: ridx = a->compressedrow.rindex;
404: } else {
405: mbs = a->mbs;
406: ii = a->i;
407: z = zarray;
408: }
410: for (i=0; i<mbs; i++) {
411: n = ii[1] - ii[0]; ii++;
412: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
413: nonzerorow += (n>0);
414: for (j=0; j<n; j++) {
415: xb = x + 4*(*idx++);
416: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
417: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
418: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
419: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
420: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
421: v += 16;
422: }
423: if (usecprow) z = zarray + 4*ridx[i];
424: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
425: if (!usecprow) z += 4;
426: }
427: VecRestoreArray(xx,(PetscScalar**)&x);
428: VecRestoreArray(zz,&zarray);
429: PetscLogFlops(32.0*a->nz - 4.0*nonzerorow);
430: return(0);
431: }
435: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
436: {
437: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
438: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
439: const PetscScalar *xb,*x;
440: const MatScalar *v;
441: PetscErrorCode ierr;
442: const PetscInt *idx,*ii,*ridx=PETSC_NULL;
443: PetscInt mbs,i,j,n,nonzerorow=0;
444: PetscTruth usecprow=a->compressedrow.use;
447: VecGetArray(xx,(PetscScalar**)&x);
448: VecGetArray(zz,&zarray);
450: idx = a->j;
451: v = a->a;
452: if (usecprow){
453: mbs = a->compressedrow.nrows;
454: ii = a->compressedrow.i;
455: ridx = a->compressedrow.rindex;
456: } else {
457: mbs = a->mbs;
458: ii = a->i;
459: z = zarray;
460: }
462: for (i=0; i<mbs; i++) {
463: n = ii[1] - ii[0]; ii++;
464: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
465: nonzerorow += (n>0);
466: for (j=0; j<n; j++) {
467: xb = x + 5*(*idx++);
468: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
469: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
470: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
471: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
472: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
473: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
474: v += 25;
475: }
476: if (usecprow) z = zarray + 5*ridx[i];
477: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
478: if (!usecprow) z += 5;
479: }
480: VecRestoreArray(xx,(PetscScalar**)&x);
481: VecRestoreArray(zz,&zarray);
482: PetscLogFlops(50.0*a->nz - 5.0*nonzerorow);
483: return(0);
484: }
489: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
490: {
491: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
492: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
493: const PetscScalar *x,*xb;
494: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
495: const MatScalar *v;
496: PetscErrorCode ierr;
497: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
498: PetscTruth usecprow=a->compressedrow.use;
501: VecGetArray(xx,(PetscScalar**)&x);
502: VecGetArray(zz,&zarray);
504: idx = a->j;
505: v = a->a;
506: if (usecprow){
507: mbs = a->compressedrow.nrows;
508: ii = a->compressedrow.i;
509: ridx = a->compressedrow.rindex;
510: } else {
511: mbs = a->mbs;
512: ii = a->i;
513: z = zarray;
514: }
516: for (i=0; i<mbs; i++) {
517: n = ii[1] - ii[0]; ii++;
518: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0;
519: nonzerorow += (n>0);
520: for (j=0; j<n; j++) {
521: xb = x + 6*(*idx++);
522: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
523: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
524: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
525: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
526: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
527: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
528: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
529: v += 36;
530: }
531: if (usecprow) z = zarray + 6*ridx[i];
532: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
533: if (!usecprow) z += 6;
534: }
536: VecRestoreArray(xx,(PetscScalar**)&x);
537: VecRestoreArray(zz,&zarray);
538: PetscLogFlops(72.0*a->nz - 6.0*nonzerorow);
539: return(0);
540: }
544: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
545: {
546: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
547: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
548: const PetscScalar *x,*xb;
549: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
550: const MatScalar *v;
551: PetscErrorCode ierr;
552: PetscInt mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL,nonzerorow=0;
553: PetscTruth usecprow=a->compressedrow.use;
556: VecGetArray(xx,(PetscScalar**)&x);
557: VecGetArray(zz,&zarray);
559: idx = a->j;
560: v = a->a;
561: if (usecprow){
562: mbs = a->compressedrow.nrows;
563: ii = a->compressedrow.i;
564: ridx = a->compressedrow.rindex;
565: } else {
566: mbs = a->mbs;
567: ii = a->i;
568: z = zarray;
569: }
571: for (i=0; i<mbs; i++) {
572: n = ii[1] - ii[0]; ii++;
573: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
574: nonzerorow += (n>0);
575: for (j=0; j<n; j++) {
576: xb = x + 7*(*idx++);
577: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
578: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
579: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
580: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
581: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
582: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
583: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
584: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
585: v += 49;
586: }
587: if (usecprow) z = zarray + 7*ridx[i];
588: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
589: if (!usecprow) z += 7;
590: }
592: VecRestoreArray(xx,(PetscScalar**)&x);
593: VecRestoreArray(zz,&zarray);
594: PetscLogFlops(98.0*a->nz - 7.0*nonzerorow);
595: return(0);
596: }
598: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
599: /* Default MatMult for block size 15 */
603: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
604: {
605: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
606: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
607: const PetscScalar *x,*xb;
608: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
609: const MatScalar *v;
610: PetscErrorCode ierr;
611: const PetscInt *ii,*ij=a->j,*idx;
612: PetscInt mbs,i,j,k,n,*ridx=PETSC_NULL,nonzerorow=0;
613: PetscTruth usecprow=a->compressedrow.use;
616: VecGetArray(xx,(PetscScalar**)&x);
617: VecGetArray(zz,&zarray);
619: v = a->a;
620: if (usecprow){
621: mbs = a->compressedrow.nrows;
622: ii = a->compressedrow.i;
623: ridx = a->compressedrow.rindex;
624: } else {
625: mbs = a->mbs;
626: ii = a->i;
627: z = zarray;
628: }
630: for (i=0; i<mbs; i++) {
631: n = ii[i+1] - ii[i];
632: idx = ij + ii[i];
633: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
634: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
636: nonzerorow += (n>0);
637: for (j=0; j<n; j++) {
638: xb = x + 15*(idx[j]);
639: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
640: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
642: for(k=0;k<15;k++){
643: sum1 += v[0]*xb[k];
644: sum2 += v[1]*xb[k];
645: sum3 += v[2]*xb[k];
646: sum4 += v[3]*xb[k];
647: sum5 += v[4]*xb[k];
648: sum6 += v[5]*xb[k];
649: sum7 += v[6]*xb[k];
650: sum8 += v[7]*xb[k];
651: sum9 += v[8]*xb[k];
652: sum10 += v[9]*xb[k];
653: sum11 += v[10]*xb[k];
654: sum12 += v[11]*xb[k];
655: sum13 += v[12]*xb[k];
656: sum14 += v[13]*xb[k];
657: sum15 += v[14]*xb[k];
658: v += 15;
659: }
660: }
661: if (usecprow) z = zarray + 15*ridx[i];
662: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
663: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
665: if (!usecprow) z += 15;
666: }
668: VecRestoreArray(xx,(PetscScalar**)&x);
669: VecRestoreArray(zz,&zarray);
670: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
671: return(0);
672: }
674: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
677: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
678: {
679: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
680: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
681: const PetscScalar *x,*xb;
682: PetscScalar x1,x2,x3,x4,*zarray;
683: const MatScalar *v;
684: PetscErrorCode ierr;
685: const PetscInt *ii,*ij=a->j,*idx;
686: PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
687: PetscTruth usecprow=a->compressedrow.use;
690: VecGetArray(xx,(PetscScalar**)&x);
691: VecGetArray(zz,&zarray);
693: v = a->a;
694: if (usecprow){
695: mbs = a->compressedrow.nrows;
696: ii = a->compressedrow.i;
697: ridx = a->compressedrow.rindex;
698: } else {
699: mbs = a->mbs;
700: ii = a->i;
701: z = zarray;
702: }
704: for (i=0; i<mbs; i++) {
705: n = ii[i+1] - ii[i];
706: idx = ij + ii[i];
707: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
708: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
710: nonzerorow += (n>0);
711: for (j=0; j<n; j++) {
712: xb = x + 15*(idx[j]);
713: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
715: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
716: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
717: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
718: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
719: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
720: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
721: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
722: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
723: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
724: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
725: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
726: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
727: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
728: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
729: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
731: v += 60;
733: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
735: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
736: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
737: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
738: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
739: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
740: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
741: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
742: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
743: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
744: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
745: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
746: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
747: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
748: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
749: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
750: v += 60;
752: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
753: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
754: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
755: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
756: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
757: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
758: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
759: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
760: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
761: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
762: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
763: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
764: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
765: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
766: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
767: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
768: v += 60;
770: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
771: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
772: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
773: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
774: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
775: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
776: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
777: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
778: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
779: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
780: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
781: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
782: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
783: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
784: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
785: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
786: v += 45;
787: }
788: if (usecprow) z = zarray + 15*ridx[i];
789: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
790: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
792: if (!usecprow) z += 15;
793: }
795: VecRestoreArray(xx,(PetscScalar**)&x);
796: VecRestoreArray(zz,&zarray);
797: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
798: return(0);
799: }
801: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
804: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
805: {
806: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
807: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
808: const PetscScalar *x,*xb;
809: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
810: const MatScalar *v;
811: PetscErrorCode ierr;
812: const PetscInt *ii,*ij=a->j,*idx;
813: PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
814: PetscTruth usecprow=a->compressedrow.use;
817: VecGetArray(xx,(PetscScalar**)&x);
818: VecGetArray(zz,&zarray);
820: v = a->a;
821: if (usecprow){
822: mbs = a->compressedrow.nrows;
823: ii = a->compressedrow.i;
824: ridx = a->compressedrow.rindex;
825: } else {
826: mbs = a->mbs;
827: ii = a->i;
828: z = zarray;
829: }
831: for (i=0; i<mbs; i++) {
832: n = ii[i+1] - ii[i];
833: idx = ij + ii[i];
834: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
835: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
837: nonzerorow += (n>0);
838: for (j=0; j<n; j++) {
839: xb = x + 15*(idx[j]);
840: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
841: x8 = xb[7];
843: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
844: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
845: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
846: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
847: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
848: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
849: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
850: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
851: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
852: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
853: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
854: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
855: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
856: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
857: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
858: v += 120;
860: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
862: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
863: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
864: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
865: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
866: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
867: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
868: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
869: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
870: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
871: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
872: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
873: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
874: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
875: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
876: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
877: v += 105;
878: }
879: if (usecprow) z = zarray + 15*ridx[i];
880: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
881: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
883: if (!usecprow) z += 15;
884: }
886: VecRestoreArray(xx,(PetscScalar**)&x);
887: VecRestoreArray(zz,&zarray);
888: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
889: return(0);
890: }
892: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
896: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
897: {
898: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
899: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
900: const PetscScalar *x,*xb;
901: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
902: const MatScalar *v;
903: PetscErrorCode ierr;
904: const PetscInt *ii,*ij=a->j,*idx;
905: PetscInt mbs,i,j,n,*ridx=PETSC_NULL,nonzerorow=0;
906: PetscTruth usecprow=a->compressedrow.use;
909: VecGetArray(xx,(PetscScalar**)&x);
910: VecGetArray(zz,&zarray);
912: v = a->a;
913: if (usecprow){
914: mbs = a->compressedrow.nrows;
915: ii = a->compressedrow.i;
916: ridx = a->compressedrow.rindex;
917: } else {
918: mbs = a->mbs;
919: ii = a->i;
920: z = zarray;
921: }
923: for (i=0; i<mbs; i++) {
924: n = ii[i+1] - ii[i];
925: idx = ij + ii[i];
926: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
927: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
929: nonzerorow += (n>0);
930: for (j=0; j<n; j++) {
931: xb = x + 15*(idx[j]);
932: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
933: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
935: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
936: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
937: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
938: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
939: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
940: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
941: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
942: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
943: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
944: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
945: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
946: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
947: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
948: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
949: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
950: v += 225;
951: }
952: if (usecprow) z = zarray + 15*ridx[i];
953: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
954: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
956: if (!usecprow) z += 15;
957: }
959: VecRestoreArray(xx,(PetscScalar**)&x);
960: VecRestoreArray(zz,&zarray);
961: PetscLogFlops(450.0*a->nz - 15.0*nonzerorow);
962: return(0);
963: }
966: /*
967: This will not work with MatScalar == float because it calls the BLAS
968: */
971: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
972: {
973: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
974: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
975: MatScalar *v;
977: PetscInt mbs=a->mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
978: PetscInt ncols,k,*ridx=PETSC_NULL,nonzerorow=0;
979: PetscTruth usecprow=a->compressedrow.use;
982: VecGetArray(xx,&x);
983: VecGetArray(zz,&zarray);
985: idx = a->j;
986: v = a->a;
987: if (usecprow){
988: mbs = a->compressedrow.nrows;
989: ii = a->compressedrow.i;
990: ridx = a->compressedrow.rindex;
991: } else {
992: mbs = a->mbs;
993: ii = a->i;
994: z = zarray;
995: }
997: if (!a->mult_work) {
998: k = PetscMax(A->rmap->n,A->cmap->n);
999: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1000: }
1001: work = a->mult_work;
1002: for (i=0; i<mbs; i++) {
1003: n = ii[1] - ii[0]; ii++;
1004: ncols = n*bs;
1005: workt = work;
1006: nonzerorow += (n>0);
1007: for (j=0; j<n; j++) {
1008: xb = x + bs*(*idx++);
1009: for (k=0; k<bs; k++) workt[k] = xb[k];
1010: workt += bs;
1011: }
1012: if (usecprow) z = zarray + bs*ridx[i];
1013: Kernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1014: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1015: v += n*bs2;
1016: if (!usecprow) z += bs;
1017: }
1018: VecRestoreArray(xx,&x);
1019: VecRestoreArray(zz,&zarray);
1020: PetscLogFlops(2.0*a->nz*bs2 - bs*nonzerorow);
1021: return(0);
1022: }
1027: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1028: {
1029: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1030: const PetscScalar *x;
1031: PetscScalar *y,*z,sum;
1032: const MatScalar *v;
1033: PetscErrorCode ierr;
1034: PetscInt mbs=a->mbs,i,n,*ridx=PETSC_NULL,nonzerorow=0;
1035: const PetscInt *idx,*ii;
1036: PetscTruth usecprow=a->compressedrow.use;
1039: VecGetArray(xx,(PetscScalar**)&x);
1040: VecGetArray(yy,&y);
1041: if (zz != yy) {
1042: VecGetArray(zz,&z);
1043: } else {
1044: z = y;
1045: }
1047: idx = a->j;
1048: v = a->a;
1049: if (usecprow){
1050: if (zz != yy){
1051: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1052: }
1053: mbs = a->compressedrow.nrows;
1054: ii = a->compressedrow.i;
1055: ridx = a->compressedrow.rindex;
1056: } else {
1057: ii = a->i;
1058: }
1060: for (i=0; i<mbs; i++) {
1061: n = ii[1] - ii[0];
1062: ii++;
1063: if (!usecprow){
1064: nonzerorow += (n>0);
1065: sum = y[i];
1066: } else {
1067: sum = y[ridx[i]];
1068: }
1069: PetscSparseDensePlusDot(sum,x,v,idx,n);
1070: v += n;
1071: idx += n;
1072: if (usecprow){
1073: z[ridx[i]] = sum;
1074: } else {
1075: z[i] = sum;
1076: }
1077: }
1078: VecRestoreArray(xx,(PetscScalar**)&x);
1079: VecRestoreArray(yy,&y);
1080: if (zz != yy) {
1081: VecRestoreArray(zz,&z);
1082: }
1083: PetscLogFlops(2.0*a->nz - nonzerorow);
1084: return(0);
1085: }
1089: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1090: {
1091: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1092: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2;
1093: PetscScalar x1,x2,*yarray,*zarray;
1094: MatScalar *v;
1096: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1097: PetscTruth usecprow=a->compressedrow.use;
1100: VecGetArray(xx,&x);
1101: VecGetArray(yy,&yarray);
1102: if (zz != yy) {
1103: VecGetArray(zz,&zarray);
1104: } else {
1105: zarray = yarray;
1106: }
1108: idx = a->j;
1109: v = a->a;
1110: if (usecprow){
1111: if (zz != yy){
1112: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1113: }
1114: mbs = a->compressedrow.nrows;
1115: ii = a->compressedrow.i;
1116: ridx = a->compressedrow.rindex;
1117: if (zz != yy){
1118: PetscMemcpy(zarray,yarray,a->mbs*sizeof(PetscScalar));
1119: }
1120: } else {
1121: ii = a->i;
1122: y = yarray;
1123: z = zarray;
1124: }
1126: for (i=0; i<mbs; i++) {
1127: n = ii[1] - ii[0]; ii++;
1128: if (usecprow){
1129: z = zarray + 2*ridx[i];
1130: y = yarray + 2*ridx[i];
1131: }
1132: sum1 = y[0]; sum2 = y[1];
1133: for (j=0; j<n; j++) {
1134: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
1135: sum1 += v[0]*x1 + v[2]*x2;
1136: sum2 += v[1]*x1 + v[3]*x2;
1137: v += 4;
1138: }
1139: z[0] = sum1; z[1] = sum2;
1140: if (!usecprow){
1141: z += 2; y += 2;
1142: }
1143: }
1144: VecRestoreArray(xx,&x);
1145: VecRestoreArray(yy,&yarray);
1146: if (zz != yy) {
1147: VecRestoreArray(zz,&zarray);
1148: }
1149: PetscLogFlops(4.0*a->nz);
1150: return(0);
1151: }
1155: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1156: {
1157: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1158: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1159: MatScalar *v;
1161: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1162: PetscTruth usecprow=a->compressedrow.use;
1165: VecGetArray(xx,&x);
1166: VecGetArray(yy,&yarray);
1167: if (zz != yy) {
1168: VecGetArray(zz,&zarray);
1169: } else {
1170: zarray = yarray;
1171: }
1173: idx = a->j;
1174: v = a->a;
1175: if (usecprow){
1176: if (zz != yy){
1177: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1178: }
1179: mbs = a->compressedrow.nrows;
1180: ii = a->compressedrow.i;
1181: ridx = a->compressedrow.rindex;
1182: } else {
1183: ii = a->i;
1184: y = yarray;
1185: z = zarray;
1186: }
1188: for (i=0; i<mbs; i++) {
1189: n = ii[1] - ii[0]; ii++;
1190: if (usecprow){
1191: z = zarray + 3*ridx[i];
1192: y = yarray + 3*ridx[i];
1193: }
1194: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1195: for (j=0; j<n; j++) {
1196: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1197: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1198: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1199: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1200: v += 9;
1201: }
1202: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1203: if (!usecprow){
1204: z += 3; y += 3;
1205: }
1206: }
1207: VecRestoreArray(xx,&x);
1208: VecRestoreArray(yy,&yarray);
1209: if (zz != yy) {
1210: VecRestoreArray(zz,&zarray);
1211: }
1212: PetscLogFlops(18.0*a->nz);
1213: return(0);
1214: }
1218: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1219: {
1220: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1221: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1222: MatScalar *v;
1224: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1225: PetscTruth usecprow=a->compressedrow.use;
1228: VecGetArray(xx,&x);
1229: VecGetArray(yy,&yarray);
1230: if (zz != yy) {
1231: VecGetArray(zz,&zarray);
1232: } else {
1233: zarray = yarray;
1234: }
1236: idx = a->j;
1237: v = a->a;
1238: if (usecprow){
1239: if (zz != yy){
1240: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1241: }
1242: mbs = a->compressedrow.nrows;
1243: ii = a->compressedrow.i;
1244: ridx = a->compressedrow.rindex;
1245: } else {
1246: ii = a->i;
1247: y = yarray;
1248: z = zarray;
1249: }
1251: for (i=0; i<mbs; i++) {
1252: n = ii[1] - ii[0]; ii++;
1253: if (usecprow){
1254: z = zarray + 4*ridx[i];
1255: y = yarray + 4*ridx[i];
1256: }
1257: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1258: for (j=0; j<n; j++) {
1259: xb = x + 4*(*idx++);
1260: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1261: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1262: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1263: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1264: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1265: v += 16;
1266: }
1267: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1268: if (!usecprow){
1269: z += 4; y += 4;
1270: }
1271: }
1272: VecRestoreArray(xx,&x);
1273: VecRestoreArray(yy,&yarray);
1274: if (zz != yy) {
1275: VecRestoreArray(zz,&zarray);
1276: }
1277: PetscLogFlops(32.0*a->nz);
1278: return(0);
1279: }
1283: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1284: {
1285: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1286: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1287: PetscScalar *yarray,*zarray;
1288: MatScalar *v;
1290: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1291: PetscTruth usecprow=a->compressedrow.use;
1294: VecGetArray(xx,&x);
1295: VecGetArray(yy,&yarray);
1296: if (zz != yy) {
1297: VecGetArray(zz,&zarray);
1298: } else {
1299: zarray = yarray;
1300: }
1302: idx = a->j;
1303: v = a->a;
1304: if (usecprow){
1305: if (zz != yy){
1306: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1307: }
1308: mbs = a->compressedrow.nrows;
1309: ii = a->compressedrow.i;
1310: ridx = a->compressedrow.rindex;
1311: } else {
1312: ii = a->i;
1313: y = yarray;
1314: z = zarray;
1315: }
1317: for (i=0; i<mbs; i++) {
1318: n = ii[1] - ii[0]; ii++;
1319: if (usecprow){
1320: z = zarray + 5*ridx[i];
1321: y = yarray + 5*ridx[i];
1322: }
1323: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1324: for (j=0; j<n; j++) {
1325: xb = x + 5*(*idx++);
1326: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1327: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1328: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1329: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1330: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1331: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1332: v += 25;
1333: }
1334: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1335: if (!usecprow){
1336: z += 5; y += 5;
1337: }
1338: }
1339: VecRestoreArray(xx,&x);
1340: VecRestoreArray(yy,&yarray);
1341: if (zz != yy) {
1342: VecRestoreArray(zz,&zarray);
1343: }
1344: PetscLogFlops(50.0*a->nz);
1345: return(0);
1346: }
1349: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1350: {
1351: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1352: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6;
1353: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1354: MatScalar *v;
1356: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1357: PetscTruth usecprow=a->compressedrow.use;
1360: VecGetArray(xx,&x);
1361: VecGetArray(yy,&yarray);
1362: if (zz != yy) {
1363: VecGetArray(zz,&zarray);
1364: } else {
1365: zarray = yarray;
1366: }
1368: idx = a->j;
1369: v = a->a;
1370: if (usecprow){
1371: if (zz != yy){
1372: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1373: }
1374: mbs = a->compressedrow.nrows;
1375: ii = a->compressedrow.i;
1376: ridx = a->compressedrow.rindex;
1377: } else {
1378: ii = a->i;
1379: y = yarray;
1380: z = zarray;
1381: }
1383: for (i=0; i<mbs; i++) {
1384: n = ii[1] - ii[0]; ii++;
1385: if (usecprow){
1386: z = zarray + 6*ridx[i];
1387: y = yarray + 6*ridx[i];
1388: }
1389: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1390: for (j=0; j<n; j++) {
1391: xb = x + 6*(*idx++);
1392: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1393: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1394: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1395: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1396: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1397: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1398: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1399: v += 36;
1400: }
1401: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1402: if (!usecprow){
1403: z += 6; y += 6;
1404: }
1405: }
1406: VecRestoreArray(xx,&x);
1407: VecRestoreArray(yy,&yarray);
1408: if (zz != yy) {
1409: VecRestoreArray(zz,&zarray);
1410: }
1411: PetscLogFlops(72.0*a->nz);
1412: return(0);
1413: }
1417: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1418: {
1419: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1420: PetscScalar *x,*y = 0,*z = 0,*xb,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1421: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1422: MatScalar *v;
1424: PetscInt mbs=a->mbs,i,*idx,*ii,j,n,*ridx=PETSC_NULL;
1425: PetscTruth usecprow=a->compressedrow.use;
1428: VecGetArray(xx,&x);
1429: VecGetArray(yy,&yarray);
1430: if (zz != yy) {
1431: VecGetArray(zz,&zarray);
1432: } else {
1433: zarray = yarray;
1434: }
1436: idx = a->j;
1437: v = a->a;
1438: if (usecprow){
1439: if (zz != yy){
1440: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1441: }
1442: mbs = a->compressedrow.nrows;
1443: ii = a->compressedrow.i;
1444: ridx = a->compressedrow.rindex;
1445: } else {
1446: ii = a->i;
1447: y = yarray;
1448: z = zarray;
1449: }
1451: for (i=0; i<mbs; i++) {
1452: n = ii[1] - ii[0]; ii++;
1453: if (usecprow){
1454: z = zarray + 7*ridx[i];
1455: y = yarray + 7*ridx[i];
1456: }
1457: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1458: for (j=0; j<n; j++) {
1459: xb = x + 7*(*idx++);
1460: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1461: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1462: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1463: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1464: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1465: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1466: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1467: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1468: v += 49;
1469: }
1470: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1471: if (!usecprow){
1472: z += 7; y += 7;
1473: }
1474: }
1475: VecRestoreArray(xx,&x);
1476: VecRestoreArray(yy,&yarray);
1477: if (zz != yy) {
1478: VecRestoreArray(zz,&zarray);
1479: }
1480: PetscLogFlops(98.0*a->nz);
1481: return(0);
1482: }
1486: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1487: {
1488: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1489: PetscScalar *x,*z = 0,*xb,*work,*workt,*zarray;
1490: MatScalar *v;
1492: PetscInt mbs,i,*idx,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2;
1493: PetscInt ncols,k,*ridx=PETSC_NULL;
1494: PetscTruth usecprow=a->compressedrow.use;
1497: VecCopy_Seq(yy,zz);
1498: VecGetArray(xx,&x);
1499: VecGetArray(zz,&zarray);
1501: idx = a->j;
1502: v = a->a;
1503: if (usecprow){
1504: mbs = a->compressedrow.nrows;
1505: ii = a->compressedrow.i;
1506: ridx = a->compressedrow.rindex;
1507: } else {
1508: mbs = a->mbs;
1509: ii = a->i;
1510: z = zarray;
1511: }
1513: if (!a->mult_work) {
1514: k = PetscMax(A->rmap->n,A->cmap->n);
1515: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1516: }
1517: work = a->mult_work;
1518: for (i=0; i<mbs; i++) {
1519: n = ii[1] - ii[0]; ii++;
1520: ncols = n*bs;
1521: workt = work;
1522: for (j=0; j<n; j++) {
1523: xb = x + bs*(*idx++);
1524: for (k=0; k<bs; k++) workt[k] = xb[k];
1525: workt += bs;
1526: }
1527: if (usecprow) z = zarray + bs*ridx[i];
1528: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1529: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1530: v += n*bs2;
1531: if (!usecprow){
1532: z += bs;
1533: }
1534: }
1535: VecRestoreArray(xx,&x);
1536: VecRestoreArray(zz,&zarray);
1537: PetscLogFlops(2.0*a->nz*bs2);
1538: return(0);
1539: }
1543: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1544: {
1545: PetscScalar zero = 0.0;
1549: VecSet(zz,zero);
1550: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1551: return(0);
1552: }
1556: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1557: {
1558: PetscScalar zero = 0.0;
1562: VecSet(zz,zero);
1563: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1564: return(0);
1565: }
1569: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1571: {
1572: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1573: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1574: MatScalar *v;
1575: PetscErrorCode ierr;
1576: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1577: Mat_CompressedRow cprow = a->compressedrow;
1578: PetscTruth usecprow=cprow.use;
1581: if (yy != zz) { VecCopy_Seq(yy,zz); }
1582: VecGetArray(xx,&x);
1583: VecGetArray(zz,&z);
1585: idx = a->j;
1586: v = a->a;
1587: if (usecprow){
1588: mbs = cprow.nrows;
1589: ii = cprow.i;
1590: ridx = cprow.rindex;
1591: } else {
1592: mbs=a->mbs;
1593: ii = a->i;
1594: xb = x;
1595: }
1597: switch (bs) {
1598: case 1:
1599: for (i=0; i<mbs; i++) {
1600: if (usecprow) xb = x + ridx[i];
1601: x1 = xb[0];
1602: ib = idx + ii[0];
1603: n = ii[1] - ii[0]; ii++;
1604: for (j=0; j<n; j++) {
1605: rval = ib[j];
1606: z[rval] += PetscConj(*v) * x1;
1607: v++;
1608: }
1609: if (!usecprow) xb++;
1610: }
1611: break;
1612: case 2:
1613: for (i=0; i<mbs; i++) {
1614: if (usecprow) xb = x + 2*ridx[i];
1615: x1 = xb[0]; x2 = xb[1];
1616: ib = idx + ii[0];
1617: n = ii[1] - ii[0]; ii++;
1618: for (j=0; j<n; j++) {
1619: rval = ib[j]*2;
1620: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1621: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1622: v += 4;
1623: }
1624: if (!usecprow) xb += 2;
1625: }
1626: break;
1627: case 3:
1628: for (i=0; i<mbs; i++) {
1629: if (usecprow) xb = x + 3*ridx[i];
1630: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1631: ib = idx + ii[0];
1632: n = ii[1] - ii[0]; ii++;
1633: for (j=0; j<n; j++) {
1634: rval = ib[j]*3;
1635: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1636: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1637: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1638: v += 9;
1639: }
1640: if (!usecprow) xb += 3;
1641: }
1642: break;
1643: case 4:
1644: for (i=0; i<mbs; i++) {
1645: if (usecprow) xb = x + 4*ridx[i];
1646: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1647: ib = idx + ii[0];
1648: n = ii[1] - ii[0]; ii++;
1649: for (j=0; j<n; j++) {
1650: rval = ib[j]*4;
1651: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1652: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1653: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1654: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1655: v += 16;
1656: }
1657: if (!usecprow) xb += 4;
1658: }
1659: break;
1660: case 5:
1661: for (i=0; i<mbs; i++) {
1662: if (usecprow) xb = x + 5*ridx[i];
1663: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1664: x4 = xb[3]; x5 = xb[4];
1665: ib = idx + ii[0];
1666: n = ii[1] - ii[0]; ii++;
1667: for (j=0; j<n; j++) {
1668: rval = ib[j]*5;
1669: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1670: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1671: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1672: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1673: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1674: v += 25;
1675: }
1676: if (!usecprow) xb += 5;
1677: }
1678: break;
1679: default: { /* block sizes larger than 5 by 5 are handled by BLAS */
1680: PetscInt ncols,k;
1681: PetscScalar *work,*workt,*xtmp;
1683: SETERRQ(PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1684: if (!a->mult_work) {
1685: k = PetscMax(A->rmap->n,A->cmap->n);
1686: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1687: }
1688: work = a->mult_work;
1689: xtmp = x;
1690: for (i=0; i<mbs; i++) {
1691: n = ii[1] - ii[0]; ii++;
1692: ncols = n*bs;
1693: PetscMemzero(work,ncols*sizeof(PetscScalar));
1694: if (usecprow) {
1695: xtmp = x + bs*ridx[i];
1696: }
1697: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1698: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1699: v += n*bs2;
1700: if (!usecprow) xtmp += bs;
1701: workt = work;
1702: for (j=0; j<n; j++) {
1703: zb = z + bs*(*idx++);
1704: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1705: workt += bs;
1706: }
1707: }
1708: }
1709: }
1710: VecRestoreArray(xx,&x);
1711: VecRestoreArray(zz,&z);
1712: PetscLogFlops(2.0*a->nz*a->bs2);
1713: return(0);
1714: }
1718: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1719: {
1720: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1721: PetscScalar *zb,*x,*z,*xb = 0,x1,x2,x3,x4,x5;
1722: MatScalar *v;
1723: PetscErrorCode ierr;
1724: PetscInt mbs,i,*idx,*ii,rval,bs=A->rmap->bs,j,n,bs2=a->bs2,*ib,*ridx=PETSC_NULL;
1725: Mat_CompressedRow cprow = a->compressedrow;
1726: PetscTruth usecprow=cprow.use;
1729: if (yy != zz) { VecCopy_Seq(yy,zz); }
1730: VecGetArray(xx,&x);
1731: VecGetArray(zz,&z);
1733: idx = a->j;
1734: v = a->a;
1735: if (usecprow){
1736: mbs = cprow.nrows;
1737: ii = cprow.i;
1738: ridx = cprow.rindex;
1739: } else {
1740: mbs=a->mbs;
1741: ii = a->i;
1742: xb = x;
1743: }
1745: switch (bs) {
1746: case 1:
1747: for (i=0; i<mbs; i++) {
1748: if (usecprow) xb = x + ridx[i];
1749: x1 = xb[0];
1750: ib = idx + ii[0];
1751: n = ii[1] - ii[0]; ii++;
1752: for (j=0; j<n; j++) {
1753: rval = ib[j];
1754: z[rval] += *v * x1;
1755: v++;
1756: }
1757: if (!usecprow) xb++;
1758: }
1759: break;
1760: case 2:
1761: for (i=0; i<mbs; i++) {
1762: if (usecprow) xb = x + 2*ridx[i];
1763: x1 = xb[0]; x2 = xb[1];
1764: ib = idx + ii[0];
1765: n = ii[1] - ii[0]; ii++;
1766: for (j=0; j<n; j++) {
1767: rval = ib[j]*2;
1768: z[rval++] += v[0]*x1 + v[1]*x2;
1769: z[rval++] += v[2]*x1 + v[3]*x2;
1770: v += 4;
1771: }
1772: if (!usecprow) xb += 2;
1773: }
1774: break;
1775: case 3:
1776: for (i=0; i<mbs; i++) {
1777: if (usecprow) xb = x + 3*ridx[i];
1778: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1779: ib = idx + ii[0];
1780: n = ii[1] - ii[0]; ii++;
1781: for (j=0; j<n; j++) {
1782: rval = ib[j]*3;
1783: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1784: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1785: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1786: v += 9;
1787: }
1788: if (!usecprow) xb += 3;
1789: }
1790: break;
1791: case 4:
1792: for (i=0; i<mbs; i++) {
1793: if (usecprow) xb = x + 4*ridx[i];
1794: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1795: ib = idx + ii[0];
1796: n = ii[1] - ii[0]; ii++;
1797: for (j=0; j<n; j++) {
1798: rval = ib[j]*4;
1799: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1800: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1801: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1802: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1803: v += 16;
1804: }
1805: if (!usecprow) xb += 4;
1806: }
1807: break;
1808: case 5:
1809: for (i=0; i<mbs; i++) {
1810: if (usecprow) xb = x + 5*ridx[i];
1811: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1812: x4 = xb[3]; x5 = xb[4];
1813: ib = idx + ii[0];
1814: n = ii[1] - ii[0]; ii++;
1815: for (j=0; j<n; j++) {
1816: rval = ib[j]*5;
1817: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1818: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1819: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1820: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1821: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1822: v += 25;
1823: }
1824: if (!usecprow) xb += 5;
1825: }
1826: break;
1827: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1828: PetscInt ncols,k;
1829: PetscScalar *work,*workt,*xtmp;
1831: if (!a->mult_work) {
1832: k = PetscMax(A->rmap->n,A->cmap->n);
1833: PetscMalloc((k+1)*sizeof(PetscScalar),&a->mult_work);
1834: }
1835: work = a->mult_work;
1836: xtmp = x;
1837: for (i=0; i<mbs; i++) {
1838: n = ii[1] - ii[0]; ii++;
1839: ncols = n*bs;
1840: PetscMemzero(work,ncols*sizeof(PetscScalar));
1841: if (usecprow) {
1842: xtmp = x + bs*ridx[i];
1843: }
1844: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1845: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1846: v += n*bs2;
1847: if (!usecprow) xtmp += bs;
1848: workt = work;
1849: for (j=0; j<n; j++) {
1850: zb = z + bs*(*idx++);
1851: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1852: workt += bs;
1853: }
1854: }
1855: }
1856: }
1857: VecRestoreArray(xx,&x);
1858: VecRestoreArray(zz,&z);
1859: PetscLogFlops(2.0*a->nz*a->bs2);
1860: return(0);
1861: }
1865: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1866: {
1867: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1868: PetscInt totalnz = a->bs2*a->nz;
1869: PetscScalar oalpha = alpha;
1871: PetscBLASInt one = 1,tnz = PetscBLASIntCast(totalnz);
1874: BLASscal_(&tnz,&oalpha,a->a,&one);
1875: PetscLogFlops(totalnz);
1876: return(0);
1877: }
1881: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1882: {
1884: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1885: MatScalar *v = a->a;
1886: PetscReal sum = 0.0;
1887: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1890: if (type == NORM_FROBENIUS) {
1891: for (i=0; i< bs2*nz; i++) {
1892: #if defined(PETSC_USE_COMPLEX)
1893: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1894: #else
1895: sum += (*v)*(*v); v++;
1896: #endif
1897: }
1898: *norm = sqrt(sum);
1899: } else if (type == NORM_1) { /* maximum column sum */
1900: PetscReal *tmp;
1901: PetscInt *bcol = a->j;
1902: PetscMalloc((A->cmap->n+1)*sizeof(PetscReal),&tmp);
1903: PetscMemzero(tmp,A->cmap->n*sizeof(PetscReal));
1904: for (i=0; i<nz; i++){
1905: for (j=0; j<bs; j++){
1906: k1 = bs*(*bcol) + j; /* column index */
1907: for (k=0; k<bs; k++){
1908: tmp[k1] += PetscAbsScalar(*v); v++;
1909: }
1910: }
1911: bcol++;
1912: }
1913: *norm = 0.0;
1914: for (j=0; j<A->cmap->n; j++) {
1915: if (tmp[j] > *norm) *norm = tmp[j];
1916: }
1917: PetscFree(tmp);
1918: } else if (type == NORM_INFINITY) { /* maximum row sum */
1919: *norm = 0.0;
1920: for (k=0; k<bs; k++) {
1921: for (j=0; j<a->mbs; j++) {
1922: v = a->a + bs2*a->i[j] + k;
1923: sum = 0.0;
1924: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1925: for (k1=0; k1<bs; k1++){
1926: sum += PetscAbsScalar(*v);
1927: v += bs;
1928: }
1929: }
1930: if (sum > *norm) *norm = sum;
1931: }
1932: }
1933: } else {
1934: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1935: }
1936: return(0);
1937: }
1942: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscTruth* flg)
1943: {
1944: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data,*b = (Mat_SeqBAIJ *)B->data;
1948: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1949: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1950: *flg = PETSC_FALSE;
1951: return(0);
1952: }
1953:
1954: /* if the a->i are the same */
1955: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1956: if (!*flg) {
1957: return(0);
1958: }
1959:
1960: /* if a->j are the same */
1961: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1962: if (!*flg) {
1963: return(0);
1964: }
1965: /* if a->a are the same */
1966: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1967: return(0);
1968:
1969: }
1973: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1974: {
1975: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1977: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1978: PetscScalar *x,zero = 0.0;
1979: MatScalar *aa,*aa_j;
1982: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1983: bs = A->rmap->bs;
1984: aa = a->a;
1985: ai = a->i;
1986: aj = a->j;
1987: ambs = a->mbs;
1988: bs2 = a->bs2;
1990: VecSet(v,zero);
1991: VecGetArray(v,&x);
1992: VecGetLocalSize(v,&n);
1993: if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1994: for (i=0; i<ambs; i++) {
1995: for (j=ai[i]; j<ai[i+1]; j++) {
1996: if (aj[j] == i) {
1997: row = i*bs;
1998: aa_j = aa+j*bs2;
1999: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2000: break;
2001: }
2002: }
2003: }
2004: VecRestoreArray(v,&x);
2005: return(0);
2006: }
2010: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2011: {
2012: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2013: PetscScalar *l,*r,x,*li,*ri;
2014: MatScalar *aa,*v;
2016: PetscInt i,j,k,lm,rn,M,m,n,*ai,*aj,mbs,tmp,bs,bs2;
2019: ai = a->i;
2020: aj = a->j;
2021: aa = a->a;
2022: m = A->rmap->n;
2023: n = A->cmap->n;
2024: bs = A->rmap->bs;
2025: mbs = a->mbs;
2026: bs2 = a->bs2;
2027: if (ll) {
2028: VecGetArray(ll,&l);
2029: VecGetLocalSize(ll,&lm);
2030: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2031: for (i=0; i<mbs; i++) { /* for each block row */
2032: M = ai[i+1] - ai[i];
2033: li = l + i*bs;
2034: v = aa + bs2*ai[i];
2035: for (j=0; j<M; j++) { /* for each block */
2036: for (k=0; k<bs2; k++) {
2037: (*v++) *= li[k%bs];
2038: }
2039: }
2040: }
2041: VecRestoreArray(ll,&l);
2042: PetscLogFlops(a->nz);
2043: }
2044:
2045: if (rr) {
2046: VecGetArray(rr,&r);
2047: VecGetLocalSize(rr,&rn);
2048: if (rn != n) SETERRQ(PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2049: for (i=0; i<mbs; i++) { /* for each block row */
2050: M = ai[i+1] - ai[i];
2051: v = aa + bs2*ai[i];
2052: for (j=0; j<M; j++) { /* for each block */
2053: ri = r + bs*aj[ai[i]+j];
2054: for (k=0; k<bs; k++) {
2055: x = ri[k];
2056: for (tmp=0; tmp<bs; tmp++) (*v++) *= x;
2057: }
2058: }
2059: }
2060: VecRestoreArray(rr,&r);
2061: PetscLogFlops(a->nz);
2062: }
2063: return(0);
2064: }
2069: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2070: {
2071: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2074: info->block_size = a->bs2;
2075: info->nz_allocated = a->maxnz;
2076: info->nz_used = a->bs2*a->nz;
2077: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2078: info->assemblies = A->num_ass;
2079: info->mallocs = a->reallocs;
2080: info->memory = ((PetscObject)A)->mem;
2081: if (A->factor) {
2082: info->fill_ratio_given = A->info.fill_ratio_given;
2083: info->fill_ratio_needed = A->info.fill_ratio_needed;
2084: info->factor_mallocs = A->info.factor_mallocs;
2085: } else {
2086: info->fill_ratio_given = 0;
2087: info->fill_ratio_needed = 0;
2088: info->factor_mallocs = 0;
2089: }
2090: return(0);
2091: }
2096: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2097: {
2098: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2102: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2103: return(0);
2104: }