Actual source code: sbaij2.c
1: #define PETSCMAT_DLL
3: #include ../src/mat/impls/baij/seq/baij.h
4: #include ../src/mat/blockinvert.h
5: #include petscbt.h
6: #include ../src/mat/impls/sbaij/seq/sbaij.h
7: #include petscblaslapack.h
11: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
12: {
13: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
15: PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
16: const PetscInt *idx;
17: PetscBT table,table0;
20: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
21: mbs = a->mbs;
22: ai = a->i;
23: aj = a->j;
24: bs = A->rmap->bs;
25: PetscBTCreate(mbs,table);
26: PetscMalloc((mbs+1)*sizeof(PetscInt),&nidx);
27: PetscMalloc((A->rmap->N+1)*sizeof(PetscInt),&nidx2);
28: PetscBTCreate(mbs,table0);
30: for (i=0; i<is_max; i++) { /* for each is */
31: isz = 0;
32: PetscBTMemzero(mbs,table);
33:
34: /* Extract the indices, assume there can be duplicate entries */
35: ISGetIndices(is[i],&idx);
36: ISGetLocalSize(is[i],&n);
38: /* Enter these into the temp arrays i.e mark table[brow], enter brow into new index */
39: bcol_max = 0;
40: for (j=0; j<n ; ++j){
41: brow = idx[j]/bs; /* convert the indices into block indices */
42: if (brow >= mbs) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if(!PetscBTLookupSet(table,brow)) {
44: nidx[isz++] = brow;
45: if (bcol_max < brow) bcol_max = brow;
46: }
47: }
48: ISRestoreIndices(is[i],&idx);
49: ISDestroy(is[i]);
50:
51: k = 0;
52: for (j=0; j<ov; j++){ /* for each overlap */
53: /* set table0 for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
54: PetscBTMemzero(mbs,table0);
55: for (l=k; l<isz; l++) { PetscBTSet(table0,nidx[l]); }
57: n = isz; /* length of the updated is[i] */
58: for (brow=0; brow<mbs; brow++){
59: start = ai[brow]; end = ai[brow+1];
60: if (PetscBTLookup(table0,brow)){ /* brow is on nidx - row search: collect all bcol in this brow */
61: for (l = start; l<end ; l++){
62: bcol = aj[l];
63: if (!PetscBTLookupSet(table,bcol)) {nidx[isz++] = bcol;}
64: }
65: k++;
66: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68: for (l = start; l<end ; l++){
69: bcol = aj[l];
70: if (bcol > bcol_max) break;
71: if (PetscBTLookup(table0,bcol)){
72: if (!PetscBTLookupSet(table,brow)) {nidx[isz++] = brow;}
73: break; /* for l = start; l<end ; l++) */
74: }
75: }
76: }
77: }
78: } /* for each overlap */
80: /* expand the Index Set */
81: for (j=0; j<isz; j++) {
82: for (k=0; k<bs; k++)
83: nidx2[j*bs+k] = nidx[j]*bs+k;
84: }
85: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,is+i);
86: }
87: PetscBTDestroy(table);
88: PetscFree(nidx);
89: PetscFree(nidx2);
90: PetscBTDestroy(table0);
91: return(0);
92: }
96: PetscErrorCode MatGetSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
97: {
98: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
100: PetscInt *smap,i,k,kstart,kend,oldcols = a->mbs,*lens;
101: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
102: PetscInt nrows,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
103: const PetscInt *irow,*aj = a->j,*ai = a->i;
104: MatScalar *mat_a;
105: Mat C;
106: PetscTruth flag,sorted;
109: if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
110: ISSorted(iscol,&sorted);
111: if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
113: ISGetIndices(isrow,&irow);
114: ISGetSize(isrow,&nrows);
115:
116: PetscMalloc(oldcols*sizeof(PetscInt),&smap);
117: PetscMemzero(smap,oldcols*sizeof(PetscInt));
118: ssmap = smap;
119: PetscMalloc((1+nrows)*sizeof(PetscInt),&lens);
120: for (i=0; i<nrows; i++) smap[irow[i]] = i+1; /* nrows = ncols */
121: /* determine lens of each row */
122: for (i=0; i<nrows; i++) {
123: kstart = ai[irow[i]];
124: kend = kstart + a->ilen[irow[i]];
125: lens[i] = 0;
126: for (k=kstart; k<kend; k++) {
127: if (ssmap[aj[k]]) {
128: lens[i]++;
129: }
130: }
131: }
132: /* Create and fill new matrix */
133: if (scall == MAT_REUSE_MATRIX) {
134: c = (Mat_SeqSBAIJ *)((*B)->data);
136: if (c->mbs!=nrows || (*B)->rmap->bs!=bs) SETERRQ(PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
137: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
138: if (!flag) {
139: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
140: }
141: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
142: C = *B;
143: } else {
144: MatCreate(((PetscObject)A)->comm,&C);
145: MatSetSizes(C,nrows*bs,nrows*bs,PETSC_DETERMINE,PETSC_DETERMINE);
146: MatSetType(C,((PetscObject)A)->type_name);
147: MatSeqSBAIJSetPreallocation_SeqSBAIJ(C,bs,0,lens);
148: }
149: c = (Mat_SeqSBAIJ *)(C->data);
150: for (i=0; i<nrows; i++) {
151: row = irow[i];
152: kstart = ai[row];
153: kend = kstart + a->ilen[row];
154: mat_i = c->i[i];
155: mat_j = c->j + mat_i;
156: mat_a = c->a + mat_i*bs2;
157: mat_ilen = c->ilen + i;
158: for (k=kstart; k<kend; k++) {
159: if ((tcol=ssmap[a->j[k]])) {
160: *mat_j++ = tcol - 1;
161: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
162: mat_a += bs2;
163: (*mat_ilen)++;
164: }
165: }
166: }
167:
168: /* Free work space */
169: PetscFree(smap);
170: PetscFree(lens);
171: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
172: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
173:
174: ISRestoreIndices(isrow,&irow);
175: *B = C;
176: return(0);
177: }
181: PetscErrorCode MatGetSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
182: {
183: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
184: IS is1;
186: PetscInt *vary,*iary,nrows,i,bs=A->rmap->bs,count;
187: const PetscInt *irow;
190: if (isrow != iscol) SETERRQ(PETSC_ERR_ARG_INCOMP,"For symmetric format, iscol must equal isro");
191:
192: ISGetIndices(isrow,&irow);
193: ISGetSize(isrow,&nrows);
194:
195: /* Verify if the indices corespond to each element in a block
196: and form the IS with compressed IS */
197: PetscMalloc2(a->mbs,PetscInt,&vary,a->mbs,PetscInt,&iary);
198: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
199: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
200:
201: count = 0;
202: for (i=0; i<a->mbs; i++) {
203: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_ERR_ARG_INCOMP,"Index set does not match blocks");
204: if (vary[i]==bs) iary[count++] = i;
205: }
206: ISRestoreIndices(isrow,&irow);
207: ISCreateGeneral(PETSC_COMM_SELF,count,iary,&is1);
208: PetscFree2(vary,iary);
210: MatGetSubMatrix_SeqSBAIJ_Private(A,is1,is1,scall,B);
211: ISDestroy(is1);
212: return(0);
213: }
217: PetscErrorCode MatGetSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
218: {
220: PetscInt i;
223: if (scall == MAT_INITIAL_MATRIX) {
224: PetscMalloc((n+1)*sizeof(Mat),B);
225: }
227: for (i=0; i<n; i++) {
228: MatGetSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
229: }
230: return(0);
231: }
233: /* -------------------------------------------------------*/
234: /* Should check that shapes of vectors and matrices match */
235: /* -------------------------------------------------------*/
239: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
240: {
241: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
242: PetscScalar *x,*z,*xb,x1,x2,zero=0.0;
243: MatScalar *v;
245: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
246: PetscInt nonzerorow=0;
249: VecSet(zz,zero);
250: VecGetArray(xx,&x);
251: VecGetArray(zz,&z);
252:
253: v = a->a;
254: xb = x;
256: for (i=0; i<mbs; i++) {
257: n = ai[1] - ai[0]; /* length of i_th block row of A */
258: x1 = xb[0]; x2 = xb[1];
259: ib = aj + *ai;
260: jmin = 0;
261: nonzerorow += (n>0);
262: if (*ib == i){ /* (diag of A)*x */
263: z[2*i] += v[0]*x1 + v[2]*x2;
264: z[2*i+1] += v[2]*x1 + v[3]*x2;
265: v += 4; jmin++;
266: }
267: for (j=jmin; j<n; j++) {
268: /* (strict lower triangular part of A)*x */
269: cval = ib[j]*2;
270: z[cval] += v[0]*x1 + v[1]*x2;
271: z[cval+1] += v[2]*x1 + v[3]*x2;
272: /* (strict upper triangular part of A)*x */
273: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
274: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
275: v += 4;
276: }
277: xb +=2; ai++;
278: }
280: VecRestoreArray(xx,&x);
281: VecRestoreArray(zz,&z);
282: PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
283: return(0);
284: }
288: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
289: {
290: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
291: PetscScalar *x,*z,*xb,x1,x2,x3,zero=0.0;
292: MatScalar *v;
294: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
295: PetscInt nonzerorow=0;
298: VecSet(zz,zero);
299: VecGetArray(xx,&x);
300: VecGetArray(zz,&z);
301:
302: v = a->a;
303: xb = x;
305: for (i=0; i<mbs; i++) {
306: n = ai[1] - ai[0]; /* length of i_th block row of A */
307: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
308: ib = aj + *ai;
309: jmin = 0;
310: nonzerorow += (n>0);
311: if (*ib == i){ /* (diag of A)*x */
312: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
313: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
314: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
315: v += 9; jmin++;
316: }
317: for (j=jmin; j<n; j++) {
318: /* (strict lower triangular part of A)*x */
319: cval = ib[j]*3;
320: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
321: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
322: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
323: /* (strict upper triangular part of A)*x */
324: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
325: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
326: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
327: v += 9;
328: }
329: xb +=3; ai++;
330: }
332: VecRestoreArray(xx,&x);
333: VecRestoreArray(zz,&z);
334: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
335: return(0);
336: }
340: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
341: {
342: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
343: PetscScalar *x,*z,*xb,x1,x2,x3,x4,zero=0.0;
344: MatScalar *v;
346: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
347: PetscInt nonzerorow=0;
350: VecSet(zz,zero);
351: VecGetArray(xx,&x);
352: VecGetArray(zz,&z);
353:
354: v = a->a;
355: xb = x;
357: for (i=0; i<mbs; i++) {
358: n = ai[1] - ai[0]; /* length of i_th block row of A */
359: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
360: ib = aj + *ai;
361: jmin = 0;
362: nonzerorow += (n>0);
363: if (*ib == i){ /* (diag of A)*x */
364: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
365: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
366: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
367: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
368: v += 16; jmin++;
369: }
370: for (j=jmin; j<n; j++) {
371: /* (strict lower triangular part of A)*x */
372: cval = ib[j]*4;
373: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
374: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
375: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
376: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
377: /* (strict upper triangular part of A)*x */
378: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
379: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
380: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
381: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
382: v += 16;
383: }
384: xb +=4; ai++;
385: }
387: VecRestoreArray(xx,&x);
388: VecRestoreArray(zz,&z);
389: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
390: return(0);
391: }
395: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
396: {
397: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
398: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,zero=0.0;
399: MatScalar *v;
401: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
402: PetscInt nonzerorow=0;
405: VecSet(zz,zero);
406: VecGetArray(xx,&x);
407: VecGetArray(zz,&z);
408:
409: v = a->a;
410: xb = x;
412: for (i=0; i<mbs; i++) {
413: n = ai[1] - ai[0]; /* length of i_th block row of A */
414: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
415: ib = aj + *ai;
416: jmin = 0;
417: nonzerorow += (n>0);
418: if (*ib == i){ /* (diag of A)*x */
419: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
420: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
421: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
422: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
423: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
424: v += 25; jmin++;
425: }
426: for (j=jmin; j<n; j++) {
427: /* (strict lower triangular part of A)*x */
428: cval = ib[j]*5;
429: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
430: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
431: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
432: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
433: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
434: /* (strict upper triangular part of A)*x */
435: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
436: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
437: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
438: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
439: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
440: v += 25;
441: }
442: xb +=5; ai++;
443: }
445: VecRestoreArray(xx,&x);
446: VecRestoreArray(zz,&z);
447: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
448: return(0);
449: }
454: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
455: {
456: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
457: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,zero=0.0;
458: MatScalar *v;
460: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
461: PetscInt nonzerorow=0;
464: VecSet(zz,zero);
465: VecGetArray(xx,&x);
466: VecGetArray(zz,&z);
467:
468: v = a->a;
469: xb = x;
471: for (i=0; i<mbs; i++) {
472: n = ai[1] - ai[0]; /* length of i_th block row of A */
473: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
474: ib = aj + *ai;
475: jmin = 0;
476: nonzerorow += (n>0);
477: if (*ib == i){ /* (diag of A)*x */
478: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
479: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
480: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
481: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
482: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
483: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
484: v += 36; jmin++;
485: }
486: for (j=jmin; j<n; j++) {
487: /* (strict lower triangular part of A)*x */
488: cval = ib[j]*6;
489: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
490: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
491: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
492: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
493: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
494: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
495: /* (strict upper triangular part of A)*x */
496: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
497: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
498: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
499: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
500: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
501: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
502: v += 36;
503: }
504: xb +=6; ai++;
505: }
507: VecRestoreArray(xx,&x);
508: VecRestoreArray(zz,&z);
509: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
510: return(0);
511: }
514: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
515: {
516: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
517: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
518: MatScalar *v;
520: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
521: PetscInt nonzerorow=0;
524: VecSet(zz,zero);
525: VecGetArray(xx,&x);
526: VecGetArray(zz,&z);
527:
528: v = a->a;
529: xb = x;
531: for (i=0; i<mbs; i++) {
532: n = ai[1] - ai[0]; /* length of i_th block row of A */
533: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
534: ib = aj + *ai;
535: jmin = 0;
536: nonzerorow += (n>0);
537: if (*ib == i){ /* (diag of A)*x */
538: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
539: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
540: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
541: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
542: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
543: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
544: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
545: v += 49; jmin++;
546: }
547: for (j=jmin; j<n; j++) {
548: /* (strict lower triangular part of A)*x */
549: cval = ib[j]*7;
550: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
551: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
552: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
553: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
554: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
555: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
556: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
557: /* (strict upper triangular part of A)*x */
558: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
559: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
560: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
561: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
562: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
563: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
564: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
565: v += 49;
566: }
567: xb +=7; ai++;
568: }
569: VecRestoreArray(xx,&x);
570: VecRestoreArray(zz,&z);
571: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);
572: return(0);
573: }
575: /*
576: This will not work with MatScalar == float because it calls the BLAS
577: */
580: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
581: {
582: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
583: PetscScalar *x,*x_ptr,*z,*z_ptr,*xb,*zb,*work,*workt,zero=0.0;
584: MatScalar *v;
586: PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
587: PetscInt nonzerorow=0;
590: VecSet(zz,zero);
591: VecGetArray(xx,&x); x_ptr=x;
592: VecGetArray(zz,&z); z_ptr=z;
594: aj = a->j;
595: v = a->a;
596: ii = a->i;
598: if (!a->mult_work) {
599: PetscMalloc((A->rmap->N+1)*sizeof(PetscScalar),&a->mult_work);
600: }
601: work = a->mult_work;
602:
603: for (i=0; i<mbs; i++) {
604: n = ii[1] - ii[0]; ncols = n*bs;
605: workt = work; idx=aj+ii[0];
606: nonzerorow += (n>0);
608: /* upper triangular part */
609: for (j=0; j<n; j++) {
610: xb = x_ptr + bs*(*idx++);
611: for (k=0; k<bs; k++) workt[k] = xb[k];
612: workt += bs;
613: }
614: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
615: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
616:
617: /* strict lower triangular part */
618: idx = aj+ii[0];
619: if (*idx == i){
620: ncols -= bs; v += bs2; idx++; n--;
621: }
622:
623: if (ncols > 0){
624: workt = work;
625: PetscMemzero(workt,ncols*sizeof(PetscScalar));
626: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
627: for (j=0; j<n; j++) {
628: zb = z_ptr + bs*(*idx++);
629: for (k=0; k<bs; k++) zb[k] += workt[k] ;
630: workt += bs;
631: }
632: }
633: x += bs; v += n*bs2; z += bs; ii++;
634: }
635:
636: VecRestoreArray(xx,&x);
637: VecRestoreArray(zz,&z);
638: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);
639: return(0);
640: }
645: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
646: {
647: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
648: PetscScalar *x,*z,*xb,x1;
649: MatScalar *v;
651: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
652: PetscInt nonzerorow=0;
655: VecCopy_Seq(yy,zz);
656: VecGetArray(xx,&x);
657: VecGetArray(zz,&z);
658: v = a->a;
659: xb = x;
661: for (i=0; i<mbs; i++) {
662: n = ai[1] - ai[0]; /* length of i_th row of A */
663: x1 = xb[0];
664: ib = aj + *ai;
665: jmin = 0;
666: nonzerorow += (n>0);
667: if (*ib == i) { /* (diag of A)*x */
668: z[i] += *v++ * x[*ib++]; jmin++;
669: }
670: for (j=jmin; j<n; j++) {
671: cval = *ib;
672: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
673: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
674: }
675: xb++; ai++;
676: }
678: VecRestoreArray(xx,&x);
679: VecRestoreArray(zz,&z);
680:
681: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
682: return(0);
683: }
687: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
688: {
689: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
690: PetscScalar *x,*z,*xb,x1,x2;
691: MatScalar *v;
693: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
694: PetscInt nonzerorow=0;
697: VecCopy_Seq(yy,zz);
698: VecGetArray(xx,&x);
699: VecGetArray(zz,&z);
701: v = a->a;
702: xb = x;
704: for (i=0; i<mbs; i++) {
705: n = ai[1] - ai[0]; /* length of i_th block row of A */
706: x1 = xb[0]; x2 = xb[1];
707: ib = aj + *ai;
708: jmin = 0;
709: nonzerorow += (n>0);
710: if (*ib == i){ /* (diag of A)*x */
711: z[2*i] += v[0]*x1 + v[2]*x2;
712: z[2*i+1] += v[2]*x1 + v[3]*x2;
713: v += 4; jmin++;
714: }
715: for (j=jmin; j<n; j++) {
716: /* (strict lower triangular part of A)*x */
717: cval = ib[j]*2;
718: z[cval] += v[0]*x1 + v[1]*x2;
719: z[cval+1] += v[2]*x1 + v[3]*x2;
720: /* (strict upper triangular part of A)*x */
721: z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
722: z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
723: v += 4;
724: }
725: xb +=2; ai++;
726: }
727: VecRestoreArray(xx,&x);
728: VecRestoreArray(zz,&z);
730: PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));
731: return(0);
732: }
736: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
737: {
738: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
739: PetscScalar *x,*z,*xb,x1,x2,x3;
740: MatScalar *v;
742: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
743: PetscInt nonzerorow=0;
746: VecCopy_Seq(yy,zz);
747: VecGetArray(xx,&x);
748: VecGetArray(zz,&z);
750: v = a->a;
751: xb = x;
753: for (i=0; i<mbs; i++) {
754: n = ai[1] - ai[0]; /* length of i_th block row of A */
755: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
756: ib = aj + *ai;
757: jmin = 0;
758: nonzerorow += (n>0);
759: if (*ib == i){ /* (diag of A)*x */
760: z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
761: z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
762: z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
763: v += 9; jmin++;
764: }
765: for (j=jmin; j<n; j++) {
766: /* (strict lower triangular part of A)*x */
767: cval = ib[j]*3;
768: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
769: z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
770: z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
771: /* (strict upper triangular part of A)*x */
772: z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
773: z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
774: z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
775: v += 9;
776: }
777: xb +=3; ai++;
778: }
780: VecRestoreArray(xx,&x);
781: VecRestoreArray(zz,&z);
783: PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));
784: return(0);
785: }
789: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
790: {
791: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
792: PetscScalar *x,*z,*xb,x1,x2,x3,x4;
793: MatScalar *v;
795: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
796: PetscInt nonzerorow=0;
799: VecCopy_Seq(yy,zz);
800: VecGetArray(xx,&x);
801: VecGetArray(zz,&z);
803: v = a->a;
804: xb = x;
806: for (i=0; i<mbs; i++) {
807: n = ai[1] - ai[0]; /* length of i_th block row of A */
808: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
809: ib = aj + *ai;
810: jmin = 0;
811: nonzerorow += (n>0);
812: if (*ib == i){ /* (diag of A)*x */
813: z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
814: z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
815: z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
816: z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
817: v += 16; jmin++;
818: }
819: for (j=jmin; j<n; j++) {
820: /* (strict lower triangular part of A)*x */
821: cval = ib[j]*4;
822: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
823: z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
824: z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
825: z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
826: /* (strict upper triangular part of A)*x */
827: z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
828: z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
829: z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
830: z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
831: v += 16;
832: }
833: xb +=4; ai++;
834: }
836: VecRestoreArray(xx,&x);
837: VecRestoreArray(zz,&z);
839: PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));
840: return(0);
841: }
845: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
846: {
847: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
848: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5;
849: MatScalar *v;
851: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
852: PetscInt nonzerorow=0;
855: VecCopy_Seq(yy,zz);
856: VecGetArray(xx,&x);
857: VecGetArray(zz,&z);
859: v = a->a;
860: xb = x;
862: for (i=0; i<mbs; i++) {
863: n = ai[1] - ai[0]; /* length of i_th block row of A */
864: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
865: ib = aj + *ai;
866: jmin = 0;
867: nonzerorow += (n>0);
868: if (*ib == i){ /* (diag of A)*x */
869: z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
870: z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
871: z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
872: z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
873: z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
874: v += 25; jmin++;
875: }
876: for (j=jmin; j<n; j++) {
877: /* (strict lower triangular part of A)*x */
878: cval = ib[j]*5;
879: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
880: z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
881: z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
882: z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
883: z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
884: /* (strict upper triangular part of A)*x */
885: z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
886: z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
887: z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
888: z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
889: z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
890: v += 25;
891: }
892: xb +=5; ai++;
893: }
895: VecRestoreArray(xx,&x);
896: VecRestoreArray(zz,&z);
898: PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));
899: return(0);
900: }
903: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
904: {
905: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
906: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6;
907: MatScalar *v;
909: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
910: PetscInt nonzerorow=0;
913: VecCopy_Seq(yy,zz);
914: VecGetArray(xx,&x);
915: VecGetArray(zz,&z);
917: v = a->a;
918: xb = x;
920: for (i=0; i<mbs; i++) {
921: n = ai[1] - ai[0]; /* length of i_th block row of A */
922: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
923: ib = aj + *ai;
924: jmin = 0;
925: nonzerorow += (n>0);
926: if (*ib == i){ /* (diag of A)*x */
927: z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
928: z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
929: z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
930: z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
931: z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
932: z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
933: v += 36; jmin++;
934: }
935: for (j=jmin; j<n; j++) {
936: /* (strict lower triangular part of A)*x */
937: cval = ib[j]*6;
938: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
939: z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
940: z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
941: z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
942: z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
943: z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
944: /* (strict upper triangular part of A)*x */
945: z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
946: z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
947: z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
948: z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
949: z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
950: z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
951: v += 36;
952: }
953: xb +=6; ai++;
954: }
956: VecRestoreArray(xx,&x);
957: VecRestoreArray(zz,&z);
959: PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));
960: return(0);
961: }
965: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
966: {
967: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
968: PetscScalar *x,*z,*xb,x1,x2,x3,x4,x5,x6,x7;
969: MatScalar *v;
971: PetscInt mbs=a->mbs,i,*aj=a->j,*ai=a->i,n,*ib,cval,j,jmin;
972: PetscInt nonzerorow=0;
975: VecCopy_Seq(yy,zz);
976: VecGetArray(xx,&x);
977: VecGetArray(zz,&z);
979: v = a->a;
980: xb = x;
982: for (i=0; i<mbs; i++) {
983: n = ai[1] - ai[0]; /* length of i_th block row of A */
984: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
985: ib = aj + *ai;
986: jmin = 0;
987: nonzerorow += (n>0);
988: if (*ib == i){ /* (diag of A)*x */
989: z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
990: z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
991: z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
992: z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
993: z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
994: z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
995: z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
996: v += 49; jmin++;
997: }
998: for (j=jmin; j<n; j++) {
999: /* (strict lower triangular part of A)*x */
1000: cval = ib[j]*7;
1001: z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1002: z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1003: z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1004: z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1005: z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1006: z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1007: z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1008: /* (strict upper triangular part of A)*x */
1009: z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1010: z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1011: z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1012: z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1013: z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1014: z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1015: z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1016: v += 49;
1017: }
1018: xb +=7; ai++;
1019: }
1021: VecRestoreArray(xx,&x);
1022: VecRestoreArray(zz,&z);
1024: PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));
1025: return(0);
1026: }
1030: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1031: {
1032: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1033: PetscScalar *x,*x_ptr,*z,*z_ptr=0,*xb,*zb,*work,*workt;
1034: MatScalar *v;
1036: PetscInt mbs=a->mbs,i,*idx,*aj,*ii,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1037: PetscInt nonzerorow=0;
1040: VecCopy_Seq(yy,zz);
1041: VecGetArray(xx,&x); x_ptr=x;
1042: VecGetArray(zz,&z); z_ptr=z;
1044: aj = a->j;
1045: v = a->a;
1046: ii = a->i;
1048: if (!a->mult_work) {
1049: PetscMalloc((A->rmap->n+1)*sizeof(PetscScalar),&a->mult_work);
1050: }
1051: work = a->mult_work;
1052:
1053:
1054: for (i=0; i<mbs; i++) {
1055: n = ii[1] - ii[0]; ncols = n*bs;
1056: workt = work; idx=aj+ii[0];
1057: nonzerorow += (n>0);
1059: /* upper triangular part */
1060: for (j=0; j<n; j++) {
1061: xb = x_ptr + bs*(*idx++);
1062: for (k=0; k<bs; k++) workt[k] = xb[k];
1063: workt += bs;
1064: }
1065: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1066: Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1068: /* strict lower triangular part */
1069: idx = aj+ii[0];
1070: if (*idx == i){
1071: ncols -= bs; v += bs2; idx++; n--;
1072: }
1073: if (ncols > 0){
1074: workt = work;
1075: PetscMemzero(workt,ncols*sizeof(PetscScalar));
1076: Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1077: for (j=0; j<n; j++) {
1078: zb = z_ptr + bs*(*idx++);
1079: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1080: workt += bs;
1081: }
1082: }
1084: x += bs; v += n*bs2; z += bs; ii++;
1085: }
1087: VecRestoreArray(xx,&x);
1088: VecRestoreArray(zz,&z);
1090: PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));
1091: return(0);
1092: }
1096: PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1097: {
1098: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1099: PetscScalar oalpha = alpha;
1101: PetscBLASInt one = 1,totalnz = PetscBLASIntCast(a->bs2*a->nz);
1104: BLASscal_(&totalnz,&oalpha,a->a,&one);
1105: PetscLogFlops(totalnz);
1106: return(0);
1107: }
1111: PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1112: {
1113: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1114: MatScalar *v = a->a;
1115: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1116: PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,*aj=a->j;
1118: PetscInt *jl,*il,jmin,jmax,nexti,ik,*col;
1119:
1121: if (type == NORM_FROBENIUS) {
1122: for (k=0; k<mbs; k++){
1123: jmin = a->i[k]; jmax = a->i[k+1];
1124: col = aj + jmin;
1125: if (*col == k){ /* diagonal block */
1126: for (i=0; i<bs2; i++){
1127: #if defined(PETSC_USE_COMPLEX)
1128: sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1129: #else
1130: sum_diag += (*v)*(*v); v++;
1131: #endif
1132: }
1133: jmin++;
1134: }
1135: for (j=jmin; j<jmax; j++){ /* off-diagonal blocks */
1136: for (i=0; i<bs2; i++){
1137: #if defined(PETSC_USE_COMPLEX)
1138: sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1139: #else
1140: sum_off += (*v)*(*v); v++;
1141: #endif
1142: }
1143: }
1144: }
1145: *norm = sqrt(sum_diag + 2*sum_off);
1146: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1147: PetscMalloc3(bs,PetscReal,&sum,mbs,PetscInt,&il,mbs,PetscInt,&jl);
1148: for (i=0; i<mbs; i++) jl[i] = mbs;
1149: il[0] = 0;
1151: *norm = 0.0;
1152: for (k=0; k<mbs; k++) { /* k_th block row */
1153: for (j=0; j<bs; j++) sum[j]=0.0;
1154: /*-- col sum --*/
1155: i = jl[k]; /* first |A(i,k)| to be added */
1156: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1157: at step k */
1158: while (i<mbs){
1159: nexti = jl[i]; /* next block row to be added */
1160: ik = il[i]; /* block index of A(i,k) in the array a */
1161: for (j=0; j<bs; j++){
1162: v = a->a + ik*bs2 + j*bs;
1163: for (k1=0; k1<bs; k1++) {
1164: sum[j] += PetscAbsScalar(*v); v++;
1165: }
1166: }
1167: /* update il, jl */
1168: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1169: jmax = a->i[i+1];
1170: if (jmin < jmax){
1171: il[i] = jmin;
1172: j = a->j[jmin];
1173: jl[i] = jl[j]; jl[j]=i;
1174: }
1175: i = nexti;
1176: }
1177: /*-- row sum --*/
1178: jmin = a->i[k]; jmax = a->i[k+1];
1179: for (i=jmin; i<jmax; i++) {
1180: for (j=0; j<bs; j++){
1181: v = a->a + i*bs2 + j;
1182: for (k1=0; k1<bs; k1++){
1183: sum[j] += PetscAbsScalar(*v); v += bs;
1184: }
1185: }
1186: }
1187: /* add k_th block row to il, jl */
1188: col = aj+jmin;
1189: if (*col == k) jmin++;
1190: if (jmin < jmax){
1191: il[k] = jmin;
1192: j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1193: }
1194: for (j=0; j<bs; j++){
1195: if (sum[j] > *norm) *norm = sum[j];
1196: }
1197: }
1198: PetscFree3(sum,il,jl);
1199: } else {
1200: SETERRQ(PETSC_ERR_SUP,"No support for this norm yet");
1201: }
1202: return(0);
1203: }
1207: PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscTruth* flg)
1208: {
1209: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data,*b = (Mat_SeqSBAIJ *)B->data;
1214: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1215: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1216: *flg = PETSC_FALSE;
1217: return(0);
1218: }
1219:
1220: /* if the a->i are the same */
1221: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1222: if (!*flg) {
1223: return(0);
1224: }
1225:
1226: /* if a->j are the same */
1227: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1228: if (!*flg) {
1229: return(0);
1230: }
1231: /* if a->a are the same */
1232: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs)*sizeof(PetscScalar),flg);
1233: return(0);
1234: }
1238: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1239: {
1240: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1242: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1243: PetscScalar *x,zero = 0.0;
1244: MatScalar *aa,*aa_j;
1247: bs = A->rmap->bs;
1248: if (A->factor && bs>1) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1249:
1250: aa = a->a;
1251: ai = a->i;
1252: aj = a->j;
1253: ambs = a->mbs;
1254: bs2 = a->bs2;
1256: VecSet(v,zero);
1257: VecGetArray(v,&x);
1258: VecGetLocalSize(v,&n);
1259: if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1260: for (i=0; i<ambs; i++) {
1261: j=ai[i];
1262: if (aj[j] == i) { /* if this is a diagonal element */
1263: row = i*bs;
1264: aa_j = aa + j*bs2;
1265: if (A->factor && bs==1){
1266: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = 1.0/aa_j[k];
1267: } else {
1268: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1269: }
1270: }
1271: }
1272:
1273: VecRestoreArray(v,&x);
1274: return(0);
1275: }
1279: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1280: {
1281: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1282: PetscScalar *l,x,*li,*ri;
1283: MatScalar *aa,*v;
1285: PetscInt i,j,k,lm,M,m,*ai,*aj,mbs,tmp,bs,bs2;
1286: PetscTruth flg;
1289: if (ll != rr){
1290: VecEqual(ll,rr,&flg);
1291: if (!flg)
1292: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1293: }
1294: if (!ll) return(0);
1295: ai = a->i;
1296: aj = a->j;
1297: aa = a->a;
1298: m = A->rmap->N;
1299: bs = A->rmap->bs;
1300: mbs = a->mbs;
1301: bs2 = a->bs2;
1303: VecGetArray(ll,&l);
1304: VecGetLocalSize(ll,&lm);
1305: if (lm != m) SETERRQ(PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1306: for (i=0; i<mbs; i++) { /* for each block row */
1307: M = ai[i+1] - ai[i];
1308: li = l + i*bs;
1309: v = aa + bs2*ai[i];
1310: for (j=0; j<M; j++) { /* for each block */
1311: ri = l + bs*aj[ai[i]+j];
1312: for (k=0; k<bs; k++) {
1313: x = ri[k];
1314: for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1315: }
1316: }
1317: }
1318: VecRestoreArray(ll,&l);
1319: PetscLogFlops(2.0*a->nz);
1320: return(0);
1321: }
1325: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1326: {
1327: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1330: info->block_size = a->bs2;
1331: info->nz_allocated = a->maxnz; /*num. of nonzeros in upper triangular part */
1332: info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1333: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
1334: info->assemblies = A->num_ass;
1335: info->mallocs = a->reallocs;
1336: info->memory = ((PetscObject)A)->mem;
1337: if (A->factor) {
1338: info->fill_ratio_given = A->info.fill_ratio_given;
1339: info->fill_ratio_needed = A->info.fill_ratio_needed;
1340: info->factor_mallocs = A->info.factor_mallocs;
1341: } else {
1342: info->fill_ratio_given = 0;
1343: info->fill_ratio_needed = 0;
1344: info->factor_mallocs = 0;
1345: }
1346: return(0);
1347: }
1352: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1353: {
1354: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1358: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
1359: return(0);
1360: }
1364: /*
1365: This code does not work since it only checks the upper triangular part of
1366: the matrix. Hence it is not listed in the function table.
1367: */
1368: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1369: {
1370: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1372: PetscInt i,j,n,row,col,bs,*ai,*aj,mbs;
1373: PetscReal atmp;
1374: MatScalar *aa;
1375: PetscScalar *x;
1376: PetscInt ncols,brow,bcol,krow,kcol;
1379: if (idx) SETERRQ(PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1380: if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1381: bs = A->rmap->bs;
1382: aa = a->a;
1383: ai = a->i;
1384: aj = a->j;
1385: mbs = a->mbs;
1387: VecSet(v,0.0);
1388: VecGetArray(v,&x);
1389: VecGetLocalSize(v,&n);
1390: if (n != A->rmap->N) SETERRQ(PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1391: for (i=0; i<mbs; i++) {
1392: ncols = ai[1] - ai[0]; ai++;
1393: brow = bs*i;
1394: for (j=0; j<ncols; j++){
1395: bcol = bs*(*aj);
1396: for (kcol=0; kcol<bs; kcol++){
1397: col = bcol + kcol; /* col index */
1398: for (krow=0; krow<bs; krow++){
1399: atmp = PetscAbsScalar(*aa); aa++;
1400: row = brow + krow; /* row index */
1401: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1402: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1403: }
1404: }
1405: aj++;
1406: }
1407: }
1408: VecRestoreArray(v,&x);
1409: return(0);
1410: }