Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include ../src/mat/impls/aij/seq/aij.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap->n;
19: m = A->rmap->n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds the symmetric version of the nonzero structure.
59: */
62: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap->n;
72: n = A->cmap->n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds the nonsymmetric version of the nonzero structure.
151: */
154: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap->n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
242: if (!blockcompressed) {
243: MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
244: } else if (symmetric) {
245: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
246: } else {
247: MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
248: }
249: return(0);
250: }
254: static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255: {
259: if (!ia) return(0);
261: if (!blockcompressed) {
262: MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
263: } else {
264: PetscFree(*ia);
265: PetscFree(*ja);
266: }
268: return(0);
269: }
271: /* ----------------------------------------------------------- */
275: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
279: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
283: nslim_row = a->inode.node_count;
284: n = A->cmap->n;
286: /* Create The column_inode for this matrix */
287: Mat_CreateColInode(A,&nslim_col,&ns_col);
288:
289: /* allocate space for reformated column_inode structure */
290: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
291: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
292: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
294: for (i1=0,col=0; i1<nslim_col; ++i1){
295: nsz = ns_col[i1];
296: for (i2=0; i2<nsz; ++i2,++col)
297: tvc[col] = i1;
298: }
299: /* allocate space for column pointers */
300: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
301: *iia = ia;
302: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
303: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
305: /* determine the number of columns in each row */
306: ia[0] = oshift;
307: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308: j = aj + ai[row] + ishift;
309: col = *j++ + ishift;
310: i2 = tvc[col];
311: nz = ai[row+1] - ai[row];
312: while (nz-- > 0) { /* off-diagonal elemets */
313: /* ia[i1+1]++; */
314: ia[i2+1]++;
315: i2++;
316: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317: if (nz > 0) i2 = tvc[col];
318: }
319: }
321: /* shift ia[i] to point to next col */
322: for (i1=1; i1<nslim_col+1; i1++) {
323: col = ia[i1-1];
324: ia[i1] += col;
325: work[i1-1] = col - oshift;
326: }
328: /* allocate space for column pointers */
329: nz = ia[nslim_col] + (!ishift);
330: PetscMalloc(nz*sizeof(PetscInt),&ja);
331: *jja = ja;
333: /* loop over matrix putting into ja */
334: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335: j = aj + ai[row] + ishift;
336: i2 = 0; /* Col inode index */
337: col = *j++ + ishift;
338: i2 = tvc[col];
339: nz = ai[row+1] - ai[row];
340: while (nz-- > 0) {
341: /* ja[work[i1]++] = i2 + oshift; */
342: ja[work[i2]++] = i1 + oshift;
343: i2++;
344: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345: if (nz > 0) i2 = tvc[col];
346: }
347: }
348: PetscFree(ns_col);
349: PetscFree(work);
350: PetscFree(tns);
351: PetscFree(tvc);
352: return(0);
353: }
357: static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358: {
362: Mat_CreateColInode(A,n,PETSC_NULL);
363: if (!ia) return(0);
365: if (!blockcompressed) {
366: MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
367: } else if (symmetric) {
368: /* Since the indices are symmetric it does'nt matter */
369: MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);
370: } else {
371: MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
372: }
373: return(0);
374: }
378: static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379: {
383: if (!ia) return(0);
384: if (!blockcompressed) {
385: MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);;
386: } else {
387: PetscFree(*ia);
388: PetscFree(*ja);
389: }
390: return(0);
391: }
393: /* ----------------------------------------------------------- */
397: static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398: {
399: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
400: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401: PetscScalar *y;
402: const PetscScalar *x;
403: const MatScalar *v1,*v2,*v3,*v4,*v5;
404: PetscErrorCode ierr;
405: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406:
407: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409: #endif
412: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413: node_max = a->inode.node_count;
414: ns = a->inode.size; /* Node Size array */
415: VecGetArray(xx,(PetscScalar**)&x);
416: VecGetArray(yy,&y);
417: idx = a->j;
418: v1 = a->a;
419: ii = a->i;
421: for (i = 0,row = 0; i< node_max; ++i){
422: nsz = ns[i];
423: n = ii[1] - ii[0];
424: nonzerorow += (n>0)*nsz;
425: ii += nsz;
426: PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */
427: PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */
428: sz = n; /* No of non zeros in this row */
429: /* Switch on the size of Node */
430: switch (nsz){ /* Each loop in 'case' is unrolled */
431: case 1 :
432: sum1 = 0.;
433:
434: for(n = 0; n< sz-1; n+=2) {
435: i1 = idx[0]; /* The instructions are ordered to */
436: i2 = idx[1]; /* make the compiler's job easy */
437: idx += 2;
438: tmp0 = x[i1];
439: tmp1 = x[i2];
440: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441: }
442:
443: if (n == sz-1){ /* Take care of the last nonzero */
444: tmp0 = x[*idx++];
445: sum1 += *v1++ * tmp0;
446: }
447: y[row++]=sum1;
448: break;
449: case 2:
450: sum1 = 0.;
451: sum2 = 0.;
452: v2 = v1 + n;
453:
454: for (n = 0; n< sz-1; n+=2) {
455: i1 = idx[0];
456: i2 = idx[1];
457: idx += 2;
458: tmp0 = x[i1];
459: tmp1 = x[i2];
460: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462: }
463: if (n == sz-1){
464: tmp0 = x[*idx++];
465: sum1 += *v1++ * tmp0;
466: sum2 += *v2++ * tmp0;
467: }
468: y[row++]=sum1;
469: y[row++]=sum2;
470: v1 =v2; /* Since the next block to be processed starts there*/
471: idx +=sz;
472: break;
473: case 3:
474: sum1 = 0.;
475: sum2 = 0.;
476: sum3 = 0.;
477: v2 = v1 + n;
478: v3 = v2 + n;
479:
480: for (n = 0; n< sz-1; n+=2) {
481: i1 = idx[0];
482: i2 = idx[1];
483: idx += 2;
484: tmp0 = x[i1];
485: tmp1 = x[i2];
486: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489: }
490: if (n == sz-1){
491: tmp0 = x[*idx++];
492: sum1 += *v1++ * tmp0;
493: sum2 += *v2++ * tmp0;
494: sum3 += *v3++ * tmp0;
495: }
496: y[row++]=sum1;
497: y[row++]=sum2;
498: y[row++]=sum3;
499: v1 =v3; /* Since the next block to be processed starts there*/
500: idx +=2*sz;
501: break;
502: case 4:
503: sum1 = 0.;
504: sum2 = 0.;
505: sum3 = 0.;
506: sum4 = 0.;
507: v2 = v1 + n;
508: v3 = v2 + n;
509: v4 = v3 + n;
510:
511: for (n = 0; n< sz-1; n+=2) {
512: i1 = idx[0];
513: i2 = idx[1];
514: idx += 2;
515: tmp0 = x[i1];
516: tmp1 = x[i2];
517: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521: }
522: if (n == sz-1){
523: tmp0 = x[*idx++];
524: sum1 += *v1++ * tmp0;
525: sum2 += *v2++ * tmp0;
526: sum3 += *v3++ * tmp0;
527: sum4 += *v4++ * tmp0;
528: }
529: y[row++]=sum1;
530: y[row++]=sum2;
531: y[row++]=sum3;
532: y[row++]=sum4;
533: v1 =v4; /* Since the next block to be processed starts there*/
534: idx +=3*sz;
535: break;
536: case 5:
537: sum1 = 0.;
538: sum2 = 0.;
539: sum3 = 0.;
540: sum4 = 0.;
541: sum5 = 0.;
542: v2 = v1 + n;
543: v3 = v2 + n;
544: v4 = v3 + n;
545: v5 = v4 + n;
546:
547: for (n = 0; n<sz-1; n+=2) {
548: i1 = idx[0];
549: i2 = idx[1];
550: idx += 2;
551: tmp0 = x[i1];
552: tmp1 = x[i2];
553: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558: }
559: if (n == sz-1){
560: tmp0 = x[*idx++];
561: sum1 += *v1++ * tmp0;
562: sum2 += *v2++ * tmp0;
563: sum3 += *v3++ * tmp0;
564: sum4 += *v4++ * tmp0;
565: sum5 += *v5++ * tmp0;
566: }
567: y[row++]=sum1;
568: y[row++]=sum2;
569: y[row++]=sum3;
570: y[row++]=sum4;
571: y[row++]=sum5;
572: v1 =v5; /* Since the next block to be processed starts there */
573: idx +=4*sz;
574: break;
575: default :
576: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577: }
578: }
579: VecRestoreArray(xx,(PetscScalar**)&x);
580: VecRestoreArray(yy,&y);
581: PetscLogFlops(2.0*a->nz - nonzerorow);
582: return(0);
583: }
584: /* ----------------------------------------------------------- */
585: /* Almost the same code as MatMult_SeqAIJ_Inode() */
588: static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589: {
590: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
591: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592: MatScalar *v1,*v2,*v3,*v4,*v5;
593: PetscScalar *x,*y,*z,*zt;
595: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596:
598: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599: node_max = a->inode.node_count;
600: ns = a->inode.size; /* Node Size array */
601: VecGetArray(xx,&x);
602: VecGetArray(yy,&y);
603: if (zz != yy) {
604: VecGetArray(zz,&z);
605: } else {
606: z = y;
607: }
608: zt = z;
610: idx = a->j;
611: v1 = a->a;
612: ii = a->i;
614: for (i = 0,row = 0; i< node_max; ++i){
615: nsz = ns[i];
616: n = ii[1] - ii[0];
617: ii += nsz;
618: sz = n; /* No of non zeros in this row */
619: /* Switch on the size of Node */
620: switch (nsz){ /* Each loop in 'case' is unrolled */
621: case 1 :
622: sum1 = *zt++;
623:
624: for(n = 0; n< sz-1; n+=2) {
625: i1 = idx[0]; /* The instructions are ordered to */
626: i2 = idx[1]; /* make the compiler's job easy */
627: idx += 2;
628: tmp0 = x[i1];
629: tmp1 = x[i2];
630: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631: }
632:
633: if(n == sz-1){ /* Take care of the last nonzero */
634: tmp0 = x[*idx++];
635: sum1 += *v1++ * tmp0;
636: }
637: y[row++]=sum1;
638: break;
639: case 2:
640: sum1 = *zt++;
641: sum2 = *zt++;
642: v2 = v1 + n;
643:
644: for(n = 0; n< sz-1; n+=2) {
645: i1 = idx[0];
646: i2 = idx[1];
647: idx += 2;
648: tmp0 = x[i1];
649: tmp1 = x[i2];
650: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652: }
653: if(n == sz-1){
654: tmp0 = x[*idx++];
655: sum1 += *v1++ * tmp0;
656: sum2 += *v2++ * tmp0;
657: }
658: y[row++]=sum1;
659: y[row++]=sum2;
660: v1 =v2; /* Since the next block to be processed starts there*/
661: idx +=sz;
662: break;
663: case 3:
664: sum1 = *zt++;
665: sum2 = *zt++;
666: sum3 = *zt++;
667: v2 = v1 + n;
668: v3 = v2 + n;
669:
670: for (n = 0; n< sz-1; n+=2) {
671: i1 = idx[0];
672: i2 = idx[1];
673: idx += 2;
674: tmp0 = x[i1];
675: tmp1 = x[i2];
676: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679: }
680: if (n == sz-1){
681: tmp0 = x[*idx++];
682: sum1 += *v1++ * tmp0;
683: sum2 += *v2++ * tmp0;
684: sum3 += *v3++ * tmp0;
685: }
686: y[row++]=sum1;
687: y[row++]=sum2;
688: y[row++]=sum3;
689: v1 =v3; /* Since the next block to be processed starts there*/
690: idx +=2*sz;
691: break;
692: case 4:
693: sum1 = *zt++;
694: sum2 = *zt++;
695: sum3 = *zt++;
696: sum4 = *zt++;
697: v2 = v1 + n;
698: v3 = v2 + n;
699: v4 = v3 + n;
700:
701: for (n = 0; n< sz-1; n+=2) {
702: i1 = idx[0];
703: i2 = idx[1];
704: idx += 2;
705: tmp0 = x[i1];
706: tmp1 = x[i2];
707: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711: }
712: if (n == sz-1){
713: tmp0 = x[*idx++];
714: sum1 += *v1++ * tmp0;
715: sum2 += *v2++ * tmp0;
716: sum3 += *v3++ * tmp0;
717: sum4 += *v4++ * tmp0;
718: }
719: y[row++]=sum1;
720: y[row++]=sum2;
721: y[row++]=sum3;
722: y[row++]=sum4;
723: v1 =v4; /* Since the next block to be processed starts there*/
724: idx +=3*sz;
725: break;
726: case 5:
727: sum1 = *zt++;
728: sum2 = *zt++;
729: sum3 = *zt++;
730: sum4 = *zt++;
731: sum5 = *zt++;
732: v2 = v1 + n;
733: v3 = v2 + n;
734: v4 = v3 + n;
735: v5 = v4 + n;
736:
737: for (n = 0; n<sz-1; n+=2) {
738: i1 = idx[0];
739: i2 = idx[1];
740: idx += 2;
741: tmp0 = x[i1];
742: tmp1 = x[i2];
743: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748: }
749: if(n == sz-1){
750: tmp0 = x[*idx++];
751: sum1 += *v1++ * tmp0;
752: sum2 += *v2++ * tmp0;
753: sum3 += *v3++ * tmp0;
754: sum4 += *v4++ * tmp0;
755: sum5 += *v5++ * tmp0;
756: }
757: y[row++]=sum1;
758: y[row++]=sum2;
759: y[row++]=sum3;
760: y[row++]=sum4;
761: y[row++]=sum5;
762: v1 =v5; /* Since the next block to be processed starts there */
763: idx +=4*sz;
764: break;
765: default :
766: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767: }
768: }
769: VecRestoreArray(xx,&x);
770: VecRestoreArray(yy,&y);
771: if (zz != yy) {
772: VecRestoreArray(zz,&z);
773: }
774: PetscLogFlops(2.0*a->nz);
775: return(0);
776: }
778: /* ----------------------------------------------------------- */
781: PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782: {
783: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
784: IS iscol = a->col,isrow = a->row;
785: PetscErrorCode ierr;
786: const PetscInt *r,*c,*rout,*cout;
787: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
790: PetscScalar sum1,sum2,sum3,sum4,sum5;
791: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792: const PetscScalar *b;
795: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796: node_max = a->inode.node_count;
797: ns = a->inode.size; /* Node Size array */
799: VecGetArray(bb,(PetscScalar**)&b);
800: VecGetArray(xx,&x);
801: tmp = a->solve_work;
802:
803: ISGetIndices(isrow,&rout); r = rout;
804: ISGetIndices(iscol,&cout); c = cout + (n-1);
805:
806: /* forward solve the lower triangular */
807: tmps = tmp ;
808: aa = a_a ;
809: aj = a_j ;
810: ad = a->diag;
812: for (i = 0,row = 0; i< node_max; ++i){
813: nsz = ns[i];
814: aii = ai[row];
815: v1 = aa + aii;
816: vi = aj + aii;
817: nz = ad[row]- aii;
818: if (i < node_max-1) {
819: /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820: * but our indexing to determine it's size could. */
821: PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822: /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823: PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824: /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825: }
827: switch (nsz){ /* Each loop in 'case' is unrolled */
828: case 1 :
829: sum1 = b[*r++];
830: for(j=0; j<nz-1; j+=2){
831: i0 = vi[0];
832: i1 = vi[1];
833: vi +=2;
834: tmp0 = tmps[i0];
835: tmp1 = tmps[i1];
836: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837: }
838: if(j == nz-1){
839: tmp0 = tmps[*vi++];
840: sum1 -= *v1++ *tmp0;
841: }
842: tmp[row ++]=sum1;
843: break;
844: case 2:
845: sum1 = b[*r++];
846: sum2 = b[*r++];
847: v2 = aa + ai[row+1];
849: for(j=0; j<nz-1; j+=2){
850: i0 = vi[0];
851: i1 = vi[1];
852: vi +=2;
853: tmp0 = tmps[i0];
854: tmp1 = tmps[i1];
855: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857: }
858: if(j == nz-1){
859: tmp0 = tmps[*vi++];
860: sum1 -= *v1++ *tmp0;
861: sum2 -= *v2++ *tmp0;
862: }
863: sum2 -= *v2++ * sum1;
864: tmp[row ++]=sum1;
865: tmp[row ++]=sum2;
866: break;
867: case 3:
868: sum1 = b[*r++];
869: sum2 = b[*r++];
870: sum3 = b[*r++];
871: v2 = aa + ai[row+1];
872: v3 = aa + ai[row+2];
873:
874: for (j=0; j<nz-1; j+=2){
875: i0 = vi[0];
876: i1 = vi[1];
877: vi +=2;
878: tmp0 = tmps[i0];
879: tmp1 = tmps[i1];
880: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883: }
884: if (j == nz-1){
885: tmp0 = tmps[*vi++];
886: sum1 -= *v1++ *tmp0;
887: sum2 -= *v2++ *tmp0;
888: sum3 -= *v3++ *tmp0;
889: }
890: sum2 -= *v2++ * sum1;
891: sum3 -= *v3++ * sum1;
892: sum3 -= *v3++ * sum2;
893: tmp[row ++]=sum1;
894: tmp[row ++]=sum2;
895: tmp[row ++]=sum3;
896: break;
897:
898: case 4:
899: sum1 = b[*r++];
900: sum2 = b[*r++];
901: sum3 = b[*r++];
902: sum4 = b[*r++];
903: v2 = aa + ai[row+1];
904: v3 = aa + ai[row+2];
905: v4 = aa + ai[row+3];
906:
907: for (j=0; j<nz-1; j+=2){
908: i0 = vi[0];
909: i1 = vi[1];
910: vi +=2;
911: tmp0 = tmps[i0];
912: tmp1 = tmps[i1];
913: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917: }
918: if (j == nz-1){
919: tmp0 = tmps[*vi++];
920: sum1 -= *v1++ *tmp0;
921: sum2 -= *v2++ *tmp0;
922: sum3 -= *v3++ *tmp0;
923: sum4 -= *v4++ *tmp0;
924: }
925: sum2 -= *v2++ * sum1;
926: sum3 -= *v3++ * sum1;
927: sum4 -= *v4++ * sum1;
928: sum3 -= *v3++ * sum2;
929: sum4 -= *v4++ * sum2;
930: sum4 -= *v4++ * sum3;
931:
932: tmp[row ++]=sum1;
933: tmp[row ++]=sum2;
934: tmp[row ++]=sum3;
935: tmp[row ++]=sum4;
936: break;
937: case 5:
938: sum1 = b[*r++];
939: sum2 = b[*r++];
940: sum3 = b[*r++];
941: sum4 = b[*r++];
942: sum5 = b[*r++];
943: v2 = aa + ai[row+1];
944: v3 = aa + ai[row+2];
945: v4 = aa + ai[row+3];
946: v5 = aa + ai[row+4];
947:
948: for (j=0; j<nz-1; j+=2){
949: i0 = vi[0];
950: i1 = vi[1];
951: vi +=2;
952: tmp0 = tmps[i0];
953: tmp1 = tmps[i1];
954: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959: }
960: if (j == nz-1){
961: tmp0 = tmps[*vi++];
962: sum1 -= *v1++ *tmp0;
963: sum2 -= *v2++ *tmp0;
964: sum3 -= *v3++ *tmp0;
965: sum4 -= *v4++ *tmp0;
966: sum5 -= *v5++ *tmp0;
967: }
969: sum2 -= *v2++ * sum1;
970: sum3 -= *v3++ * sum1;
971: sum4 -= *v4++ * sum1;
972: sum5 -= *v5++ * sum1;
973: sum3 -= *v3++ * sum2;
974: sum4 -= *v4++ * sum2;
975: sum5 -= *v5++ * sum2;
976: sum4 -= *v4++ * sum3;
977: sum5 -= *v5++ * sum3;
978: sum5 -= *v5++ * sum4;
979:
980: tmp[row ++]=sum1;
981: tmp[row ++]=sum2;
982: tmp[row ++]=sum3;
983: tmp[row ++]=sum4;
984: tmp[row ++]=sum5;
985: break;
986: default:
987: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988: }
989: }
990: /* backward solve the upper triangular */
991: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992: nsz = ns[i];
993: aii = ai[row+1] -1;
994: v1 = aa + aii;
995: vi = aj + aii;
996: nz = aii- ad[row];
997: switch (nsz){ /* Each loop in 'case' is unrolled */
998: case 1 :
999: sum1 = tmp[row];
1001: for(j=nz ; j>1; j-=2){
1002: vi -=2;
1003: i0 = vi[2];
1004: i1 = vi[1];
1005: tmp0 = tmps[i0];
1006: tmp1 = tmps[i1];
1007: v1 -= 2;
1008: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009: }
1010: if (j==1){
1011: tmp0 = tmps[*vi--];
1012: sum1 -= *v1-- * tmp0;
1013: }
1014: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015: break;
1016: case 2 :
1017: sum1 = tmp[row];
1018: sum2 = tmp[row -1];
1019: v2 = aa + ai[row]-1;
1020: for (j=nz ; j>1; j-=2){
1021: vi -=2;
1022: i0 = vi[2];
1023: i1 = vi[1];
1024: tmp0 = tmps[i0];
1025: tmp1 = tmps[i1];
1026: v1 -= 2;
1027: v2 -= 2;
1028: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030: }
1031: if (j==1){
1032: tmp0 = tmps[*vi--];
1033: sum1 -= *v1-- * tmp0;
1034: sum2 -= *v2-- * tmp0;
1035: }
1036:
1037: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038: sum2 -= *v2-- * tmp0;
1039: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040: break;
1041: case 3 :
1042: sum1 = tmp[row];
1043: sum2 = tmp[row -1];
1044: sum3 = tmp[row -2];
1045: v2 = aa + ai[row]-1;
1046: v3 = aa + ai[row -1]-1;
1047: for (j=nz ; j>1; j-=2){
1048: vi -=2;
1049: i0 = vi[2];
1050: i1 = vi[1];
1051: tmp0 = tmps[i0];
1052: tmp1 = tmps[i1];
1053: v1 -= 2;
1054: v2 -= 2;
1055: v3 -= 2;
1056: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059: }
1060: if (j==1){
1061: tmp0 = tmps[*vi--];
1062: sum1 -= *v1-- * tmp0;
1063: sum2 -= *v2-- * tmp0;
1064: sum3 -= *v3-- * tmp0;
1065: }
1066: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067: sum2 -= *v2-- * tmp0;
1068: sum3 -= *v3-- * tmp0;
1069: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070: sum3 -= *v3-- * tmp0;
1071: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072:
1073: break;
1074: case 4 :
1075: sum1 = tmp[row];
1076: sum2 = tmp[row -1];
1077: sum3 = tmp[row -2];
1078: sum4 = tmp[row -3];
1079: v2 = aa + ai[row]-1;
1080: v3 = aa + ai[row -1]-1;
1081: v4 = aa + ai[row -2]-1;
1083: for (j=nz ; j>1; j-=2){
1084: vi -=2;
1085: i0 = vi[2];
1086: i1 = vi[1];
1087: tmp0 = tmps[i0];
1088: tmp1 = tmps[i1];
1089: v1 -= 2;
1090: v2 -= 2;
1091: v3 -= 2;
1092: v4 -= 2;
1093: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097: }
1098: if (j==1){
1099: tmp0 = tmps[*vi--];
1100: sum1 -= *v1-- * tmp0;
1101: sum2 -= *v2-- * tmp0;
1102: sum3 -= *v3-- * tmp0;
1103: sum4 -= *v4-- * tmp0;
1104: }
1106: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107: sum2 -= *v2-- * tmp0;
1108: sum3 -= *v3-- * tmp0;
1109: sum4 -= *v4-- * tmp0;
1110: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111: sum3 -= *v3-- * tmp0;
1112: sum4 -= *v4-- * tmp0;
1113: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114: sum4 -= *v4-- * tmp0;
1115: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116: break;
1117: case 5 :
1118: sum1 = tmp[row];
1119: sum2 = tmp[row -1];
1120: sum3 = tmp[row -2];
1121: sum4 = tmp[row -3];
1122: sum5 = tmp[row -4];
1123: v2 = aa + ai[row]-1;
1124: v3 = aa + ai[row -1]-1;
1125: v4 = aa + ai[row -2]-1;
1126: v5 = aa + ai[row -3]-1;
1127: for (j=nz ; j>1; j-=2){
1128: vi -= 2;
1129: i0 = vi[2];
1130: i1 = vi[1];
1131: tmp0 = tmps[i0];
1132: tmp1 = tmps[i1];
1133: v1 -= 2;
1134: v2 -= 2;
1135: v3 -= 2;
1136: v4 -= 2;
1137: v5 -= 2;
1138: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143: }
1144: if (j==1){
1145: tmp0 = tmps[*vi--];
1146: sum1 -= *v1-- * tmp0;
1147: sum2 -= *v2-- * tmp0;
1148: sum3 -= *v3-- * tmp0;
1149: sum4 -= *v4-- * tmp0;
1150: sum5 -= *v5-- * tmp0;
1151: }
1153: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154: sum2 -= *v2-- * tmp0;
1155: sum3 -= *v3-- * tmp0;
1156: sum4 -= *v4-- * tmp0;
1157: sum5 -= *v5-- * tmp0;
1158: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159: sum3 -= *v3-- * tmp0;
1160: sum4 -= *v4-- * tmp0;
1161: sum5 -= *v5-- * tmp0;
1162: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163: sum4 -= *v4-- * tmp0;
1164: sum5 -= *v5-- * tmp0;
1165: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166: sum5 -= *v5-- * tmp0;
1167: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168: break;
1169: default:
1170: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171: }
1172: }
1173: ISRestoreIndices(isrow,&rout);
1174: ISRestoreIndices(iscol,&cout);
1175: VecRestoreArray(bb,(PetscScalar**)&b);
1176: VecRestoreArray(xx,&x);
1177: PetscLogFlops(2.0*a->nz - A->cmap->n);
1178: return(0);
1179: }
/*
   MatLUFactorNumeric_SeqAIJ_Inode - numeric LU factorization of a SeqAIJ matrix that
   works on groups of 1, 2 or 3 rows ("inodes") at a time.

   Parameters:
     B    - the factor matrix (aliased as C below). Its Mat_SeqAIJ data supplies the
            index arrays b->i, b->j, b->diag and the index sets b->row, b->icol; the
            value array b->a is filled in by this routine.
     A    - the matrix being factored; a->inode.size must be non-NULL.
     info - factorization options; shifttype, zeropivot and shiftamount are read here.

   Outline:
     1. Zero the shift context; for MAT_SHIFT_POSITIVE_DEFINITE compute sctx.shift_top.
     2. Copy a->inode.size, splitting every entry > 3 into two pieces, then rebuild the
        list in the order in which rows are visited through the permutation r[].
     3. For each node: zero and load the dense work rows rtmp1/rtmp2/rtmp3 from A,
        eliminate against the pivot rows listed in the L part of the node's first row,
        finish the small diagonal block, and copy the results into b->a.
     4. Repeat the whole sweep while sctx.useshift is set.
     5. Install the SeqAIJ solve routines on C and call Mat_CheckInode_FactorLU().

   Layout of the factor as it is addressed in this routine:
     L part of row i              : positions bi[i] .. bi[i+1]-1
     U part of row i (no diagonal): positions bdiag[i+1]+1 .. bdiag[i]-1
     inverted diagonal of row i   : position bdiag[i]

   Returns 0 on the normal path. SETERRQ is invoked when A carries no inode
   information or when a node size other than 1, 2 or 3 is encountered.
*/
1183: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184: {
1185: Mat C=B;
1186: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187: IS isrow = b->row,isicol = b->icol;
1188: PetscErrorCode ierr;
1189: const PetscInt *r,*ic,*ics;
1190: const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191: PetscInt i,j,k,nz,nzL,row,*pj;
1192: const PetscInt *ajtmp,*bjtmp;
1193: MatScalar *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3;
1194: const MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195: FactorShiftCtx sctx;
1196: const PetscInt *ddiag;
1197: PetscReal rs;
1198: MatScalar d;
1199: PetscInt inod,nodesz,node_max,col;
1200: const PetscInt *ns;
1201: PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1202:
1204: /* MatPivotSetUp(): initialize shift context sctx */
1205: PetscMemzero(&sctx,sizeof(FactorShiftCtx));
1207: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208: ddiag = a->diag;
1209: sctx.shift_top = info->zeropivot;
1210: for (i=0; i<n; i++) {
1211: /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
/* rs starts at -|d| - Re(d) so that the full-row sum below, which includes |d|, nets out to sum_{j!=i}|aij| - Re(aii) */
1212: d = (aa)[ddiag[i]];
1213: rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214: v = aa+ai[i];
1215: nz = ai[i+1] - ai[i];
1216: for (j=0; j<nz; j++)
1217: rs += PetscAbsScalar(v[j]);
1218: if (rs>sctx.shift_top) sctx.shift_top = rs;
1219: }
1220: sctx.shift_top *= 1.1;
1221: sctx.nshift_max = 5;
1222: sctx.shift_lo = 0.;
1223: sctx.shift_hi = 1.;
1224: }
1226: ISGetIndices(isrow,&r);
1227: ISGetIndices(isicol,&ic);
1228:
/* a single zeroed allocation of 3*n+1 entries holds the three dense work rows: rtmp1, rtmp2 = rtmp1+n, rtmp3 = rtmp2+n */
1229: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1230: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1231: rtmp2 = rtmp1 + n;
1232: rtmp3 = rtmp2 + n;
1233: ics = ic;
1235: node_max = a->inode.node_count;
1236: ns = a->inode.size;
1237: if (!ns){
1238: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239: }
1241: /* If max inode size > 3, split it into two inodes.*/
1242: /* also map the inode sizes according to the ordering */
/* i walks the original node list, j the split list; a node of size 4 becomes 2+2 and a node of size 5 becomes 2+3 */
1243: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1244: for (i=0,j=0; i<node_max; ++i,++j){
1245: if (ns[i]>3) {
1246: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] <= 5; a larger node would leave a piece > 3 and hit the default case below */
1247: ++j;
1248: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249: } else {
1250: tmp_vec1[j] = ns[i];
1251: }
1252: }
1253: /* Use the correct node_max */
1254: node_max = j;
1255:
1256: /* Now reorder the inode info based on mat re-ordering info */
1257: /* First create a row -> inode_size_array_index map */
1258: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1259: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1260: for (i=0,row=0; i<node_max; i++) {
1261: nodesz = tmp_vec1[i];
1262: for (j=0; j<nodesz; j++,row++) {
1263: nsmap[row] = i;
1264: }
1265: }
1266: /* Using nsmap, create a reordered ns structure */
/* j is the row position in the factor; r[j] is the row of A visited there, and the size of the node containing that row is recorded */
1267: for (i=0,j=0; i< node_max; i++) {
1268: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1269: tmp_vec2[i] = nodesz;
1270: j += nodesz;
1271: }
1272: PetscFree(nsmap);
1273: PetscFree(tmp_vec1);
1275: /* Now use the correct ns */
1276: ns = tmp_vec2;
/* The whole factorization sweep is repeated for as long as sctx.useshift ends up set. */
/* NOTE(review): MatPivotCheck is a macro defined outside this file; presumably it can set sctx.useshift and adjust sctx.pv/shift_amount -- confirm against its definition */
1278: do {
1279: sctx.useshift = PETSC_FALSE;
1280: /* Now loop over each block-row, and do the factorization */
1281: for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1282: nodesz = ns[inod];
1283:
1284: switch (nodesz){
1285: case 1:
1286: /*----------*/
/* single row i: work row rtmp1 only */
1287: /* zero rtmp1 */
1288: /* L part */
1289: nz = bi[i+1] - bi[i];
1290: bjtmp = bj + bi[i];
1291: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1293: /* U part */
/* nz here counts the diagonal position as well, so rtmp1 at the diagonal column is cleared too */
1294: nz = bdiag[i]-bdiag[i+1];
1295: bjtmp = bj + bdiag[i+1]+1;
1296: for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1297:
1298: /* load in initial (unfactored row) */
/* row r[i] of A is scattered into rtmp1 with its column indices mapped through ics[] */
1299: nz = ai[r[i]+1] - ai[r[i]];
1300: ajtmp = aj + ai[r[i]];
1301: v = aa + ai[r[i]];
1302: for (j=0; j<nz; j++) {
1303: rtmp1[ics[ajtmp[j]]] = v[j];
1304: }
1305: /* ZeropivotApply() */
1306: rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1307:
1308: /* elimination */
/* row walks the column indices of L(i,:); the index fetched after the last pass is never used */
1309: bjtmp = bj + bi[i];
1310: row = *bjtmp++;
1311: nzL = bi[i+1] - bi[i];
1312: for(k=0; k < nzL;k++) {
1313: pc = rtmp1 + row;
1314: if (*pc != 0.0) {
/* b->a[bdiag[row]] holds the inverted pivot of the earlier row, so the multiplier is formed by a multiplication */
1315: pv = b->a + bdiag[row];
1316: mul1 = *pc * (*pv);
1317: *pc = mul1;
1318: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1319: pv = b->a + bdiag[row+1]+1;
1320: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1321: for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1322: PetscLogFlops(2.0*nz);
1323: }
1324: row = *bjtmp++;
1325: }
1327: /* finished row so stick it into b->a */
/* rs accumulates the absolute values of the stored off-diagonal entries of the finished row */
1328: rs = 0.0;
1329: /* L part */
1330: pv = b->a + bi[i] ;
1331: pj = b->j + bi[i] ;
1332: nz = bi[i+1] - bi[i];
1333: for (j=0; j<nz; j++) {
1334: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335: }
1337: /* U part */
1338: pv = b->a + bdiag[i+1]+1;
1339: pj = b->j + bdiag[i+1]+1;
1340: nz = bdiag[i] - bdiag[i+1]-1;
1341: for (j=0; j<nz; j++) {
1342: pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1343: }
1345: /* Check zero pivot */
1346: sctx.rs = rs;
1347: sctx.pv = rtmp1[i];
1348: MatPivotCheck(info,sctx,i);
1349:
1350: /* Mark diagonal and invert diagonal for simpler triangular solves */
1351: pv = b->a + bdiag[i];
1352: *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353: break;
1354:
1355: case 2:
1356: /*----------*/
/* rows i and i+1 are processed together; the index pattern of factor row i drives the zeroing and the elimination of both */
1357: /* zero rtmp1 and rtmp2 */
1358: /* L part */
1359: nz = bi[i+1] - bi[i];
1360: bjtmp = bj + bi[i];
1361: for (j=0; j<nz; j++) {
1362: col = bjtmp[j];
1363: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364: }
1366: /* U part */
1367: nz = bdiag[i]-bdiag[i+1];
1368: bjtmp = bj + bdiag[i+1]+1;
1369: for (j=0; j<nz; j++) {
1370: col = bjtmp[j];
1371: rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372: }
1373:
1374: /* load in initial (unfactored row) */
/* rows r[i] and r[i]+1 of A are both read using the column indices of row r[i] */
1375: nz = ai[r[i]+1] - ai[r[i]];
1376: ajtmp = aj + ai[r[i]];
1377: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378: for (j=0; j<nz; j++) {
1379: col = ics[ajtmp[j]];
1380: rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381: }
1382: /* ZeropivotApply(): shift the diagonal of the matrix */
1383: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384:
1385: /* elimination */
1386: bjtmp = bj + bi[i];
1387: row = *bjtmp++; /* pivot row */
1388: nzL = bi[i+1] - bi[i];
1389: for(k=0; k < nzL;k++) {
1390: pc1 = rtmp1 + row;
1391: pc2 = rtmp2 + row;
1392: if (*pc1 != 0.0 || *pc2 != 0.0) {
1393: pv = b->a + bdiag[row];
1394: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395: *pc1 = mul1; *pc2 = mul2;
1397: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1398: pv = b->a + bdiag[row+1]+1;
1399: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400: for (j=0; j<nz; j++){
1401: col = pj[j];
1402: rtmp1[col] -= mul1 * pv[j];
1403: rtmp2[col] -= mul2 * pv[j];
1404: }
1405: PetscLogFlops(4*nz);
1406: }
1407: row = *bjtmp++;
1408: }
1410: /* finished row i; check zero pivot, then stick row i into b->a */
1411: rs = 0.0;
1412: /* L part */
1413: pc1 = b->a + bi[i];
1414: pj = b->j + bi[i] ;
1415: nz = bi[i+1] - bi[i];
1416: for (j=0; j<nz; j++) {
1417: col = pj[j];
1418: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419: }
1420: /* U part */
1421: pc1 = b->a + bdiag[i+1]+1;
1422: pj = b->j + bdiag[i+1]+1;
1423: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424: for (j=0; j<nz; j++) {
1425: col = pj[j];
1426: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427: }
1428:
1429: sctx.rs = rs;
1430: sctx.pv = rtmp1[i];
1431: MatPivotCheck(info,sctx,i);
1432: pc1 = b->a + bdiag[i]; /* Mark diagonal */
1433: *pc1 = 1.0/sctx.pv;
1435: /* Now take care of diagonal 2x2 block. */
/* eliminate entry (i+1,i) using the just-finished row i, which is still available densely in rtmp1 */
1436: pc2 = rtmp2 + i;
1437: if (*pc2 != 0.0){
1438: mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439: *pc2 = mul1; /* insert L entry */
1440: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1441: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442: for (j=0; j<nz; j++) {
1443: col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444: }
1445: PetscLogFlops(2*nz);
1446: }
1448: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449: rs = 0.0;
1450: /* L part */
1451: pc2 = b->a + bi[i+1];
1452: pj = b->j + bi[i+1] ;
1453: nz = bi[i+2] - bi[i+1];
1454: for (j=0; j<nz; j++) {
1455: col = pj[j];
1456: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457: }
1458: /* U part */
1459: pc2 = b->a + bdiag[i+2]+1;
1460: pj = b->j + bdiag[i+2]+1;
1461: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462: for (j=0; j<nz; j++) {
1463: col = pj[j];
1464: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465: }
1467: sctx.rs = rs;
1468: sctx.pv = rtmp2[i+1];
1469: MatPivotCheck(info,sctx,i+1);
1470: pc2 = b->a + bdiag[i+1];
1471: *pc2 = 1.0/sctx.pv;
1472: break;
1474: case 3:
1475: /*----------*/
/* rows i, i+1 and i+2 are processed together, again driven by the index pattern of factor row i */
1476: /* zero rtmp */
1477: /* L part */
1478: nz = bi[i+1] - bi[i];
1479: bjtmp = bj + bi[i];
1480: for (j=0; j<nz; j++) {
1481: col = bjtmp[j];
1482: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1483: }
1485: /* U part */
1486: nz = bdiag[i]-bdiag[i+1];
1487: bjtmp = bj + bdiag[i+1]+1;
1488: for (j=0; j<nz; j++) {
1489: col = bjtmp[j];
1490: rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1491: }
1492:
1493: /* load in initial (unfactored row) */
/* rows r[i], r[i]+1 and r[i]+2 of A are all read using the column indices of row r[i] */
1494: nz = ai[r[i]+1] - ai[r[i]];
1495: ajtmp = aj + ai[r[i]];
1496: v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1497: for (j=0; j<nz; j++) {
1498: col = ics[ajtmp[j]];
1499: rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1500: }
1501: /* ZeropivotApply(): shift the diagonal of the matrix */
1502: rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1503:
1504: /* elimination */
1505: bjtmp = bj + bi[i];
1506: row = *bjtmp++; /* pivot row */
1507: nzL = bi[i+1] - bi[i];
1508: for(k=0; k < nzL;k++) {
1509: pc1 = rtmp1 + row;
1510: pc2 = rtmp2 + row;
1511: pc3 = rtmp3 + row;
1512: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1513: pv = b->a + bdiag[row];
1514: mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1515: *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1517: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1518: pv = b->a + bdiag[row+1]+1;
1519: nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1520: for (j=0; j<nz; j++){
1521: col = pj[j];
1522: rtmp1[col] -= mul1 * pv[j];
1523: rtmp2[col] -= mul2 * pv[j];
1524: rtmp3[col] -= mul3 * pv[j];
1525: }
1526: PetscLogFlops(6*nz);
1527: }
1528: row = *bjtmp++;
1529: }
1531: /* finished row i; check zero pivot, then stick row i into b->a */
1532: rs = 0.0;
1533: /* L part */
1534: pc1 = b->a + bi[i];
1535: pj = b->j + bi[i] ;
1536: nz = bi[i+1] - bi[i];
1537: for (j=0; j<nz; j++) {
1538: col = pj[j];
1539: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1540: }
1541: /* U part */
1542: pc1 = b->a + bdiag[i+1]+1;
1543: pj = b->j + bdiag[i+1]+1;
1544: nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1545: for (j=0; j<nz; j++) {
1546: col = pj[j];
1547: pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1548: }
1549:
1550: sctx.rs = rs;
1551: sctx.pv = rtmp1[i];
1552: MatPivotCheck(info,sctx,i);
1553: pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1554: *pc1 = 1.0/sctx.pv;
1556: /* Now take care of 1st column of diagonal 3x3 block. */
/* eliminate entries (i+1,i) and (i+2,i) using row i, which is still available densely in rtmp1 */
1557: pc2 = rtmp2 + i;
1558: pc3 = rtmp3 + i;
1559: if (*pc2 != 0.0 || *pc3 != 0.0){
1560: mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561: mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562: pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1563: nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564: for (j=0; j<nz; j++) {
1565: col = pj[j];
1566: rtmp2[col] -= mul2 * rtmp1[col];
1567: rtmp3[col] -= mul3 * rtmp1[col];
1568: }
1569: PetscLogFlops(4*nz);
1570: }
1571:
1572: /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573: rs = 0.0;
1574: /* L part */
1575: pc2 = b->a + bi[i+1];
1576: pj = b->j + bi[i+1] ;
1577: nz = bi[i+2] - bi[i+1];
1578: for (j=0; j<nz; j++) {
1579: col = pj[j];
1580: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581: }
1582: /* U part */
1583: pc2 = b->a + bdiag[i+2]+1;
1584: pj = b->j + bdiag[i+2]+1;
1585: nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586: for (j=0; j<nz; j++) {
1587: col = pj[j];
1588: pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589: }
1591: sctx.rs = rs;
1592: sctx.pv = rtmp2[i+1];
1593: MatPivotCheck(info,sctx,i+1);
1594: pc2 = b->a + bdiag[i+1];
1595: *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1597: /* Now take care of 2nd column of diagonal 3x3 block. */
/* eliminate entry (i+2,i+1) using row i+1, which is still available densely in rtmp2 */
1598: pc3 = rtmp3 + i+1;
1599: if (*pc3 != 0.0){
1600: mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1601: pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1602: nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1603: for (j=0; j<nz; j++) {
1604: col = pj[j];
1605: rtmp3[col] -= mul3 * rtmp2[col];
1606: }
1607: PetscLogFlops(2*nz);
1608: }
1610: /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1611: rs = 0.0;
1612: /* L part */
1613: pc3 = b->a + bi[i+2];
1614: pj = b->j + bi[i+2] ;
1615: nz = bi[i+3] - bi[i+2];
1616: for (j=0; j<nz; j++) {
1617: col = pj[j];
1618: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1619: }
1620: /* U part */
1621: pc3 = b->a + bdiag[i+3]+1;
1622: pj = b->j + bdiag[i+3]+1;
1623: nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1624: for (j=0; j<nz; j++) {
1625: col = pj[j];
1626: pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1627: }
1629: sctx.rs = rs;
1630: sctx.pv = rtmp3[i+2];
1631: MatPivotCheck(info,sctx,i+2);
1632: pc3 = b->a + bdiag[i+2];
1633: *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1634: break;
1636: default:
/* reached for any node size other than 1, 2 or 3 */
1637: SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1638: }
1639: i += nodesz; /* Update the row */
1640: }
1642: /* MatPivotRefine() */
1643: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1644: /*
1645: * if no shift in this attempt & shifting & started shifting & can refine,
1646: * then try lower shift
1647: */
/* bisection step: the current fraction becomes the upper bound and the midpoint of [shift_lo, shift_hi] is tried next */
1648: sctx.shift_hi = sctx.shift_fraction;
1649: sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1650: sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1651: sctx.useshift = PETSC_TRUE;
1652: sctx.nshift++;
1653: }
1654: } while (sctx.useshift);
1656: PetscFree(rtmp1);
1657: PetscFree(tmp_vec2);
1658: ISRestoreIndices(isicol,&ic);
1659: ISRestoreIndices(isrow,&r);
/* install the solve routines on the factor and mark it assembled and preallocated */
1661: C->ops->solve = MatSolve_SeqAIJ;
1662: C->ops->solveadd = MatSolveAdd_SeqAIJ;
1663: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1664: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1665: C->ops->matsolve = MatMatSolve_SeqAIJ;
1666: C->assembled = PETSC_TRUE;
1667: C->preallocated = PETSC_TRUE;
1668: PetscLogFlops(C->cmap->n);
1670: /* MatShiftView(A,info,&sctx) */
/* report shift statistics only when at least one shift was recorded */
1671: if (sctx.nshift){
1672: if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1673: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
1674: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1675: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1676: } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1677: PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);
1678: }
1679: }
/* NOTE(review): Mat_CheckInode_FactorLU is defined outside this view; presumably it detects the inode structure of the factor C -- confirm */
1680: Mat_CheckInode_FactorLU(C,PETSC_FALSE);
1681: return(0);
1682: }
/*
   MatLUFactorNumeric_SeqAIJ_Inode_inplace - inode-based numeric LU factorization for
   the factor layout in which each row is stored contiguously.

   Parameters:
     B    - the factor matrix (aliased as C below). Its Mat_SeqAIJ data supplies
            b->i, b->j, b->diag and the index sets b->row, b->col, b->icol; the value
            array b->a is filled in by this routine.
     A    - the matrix being factored; a->inode.size must be non-NULL.
     info - factorization options; only info->shifttype is read directly here, the
            rest is handed to MatLUCheckShift_inline.

   Layout of the factor as it is addressed in this routine:
     row `row` occupies positions bi[row] .. bi[row+1]-1 of b->j / b->a;
     bd[row] (b->diag) is used as the position of the pivot, whose stored value is
     the inverted diagonal; the entries used as U(row,:) follow it, from bd[row]+1
     up to bi[row+1]-1.

   Outline:
     1. Initialize the shift context; for MAT_SHIFT_POSITIVE_DEFINITE compute
        sctx.shift_top.
     2. Copy a->inode.size, splitting every entry > 3 into two pieces, then rebuild
        the list in the order in which rows are visited through the permutation r[].
     3. For each node of 1, 2 or 3 rows: zero and load the dense work rows
        rtmp11/rtmp22/rtmp33 from A, eliminate against earlier pivot rows, finish the
        small diagonal block, invert the diagonals and copy the rows into b->a.
     4. Whenever MatLUCheckShift_inline reports newshift == 1 the sweep is abandoned
        (goto endofwhile); it is redone while sctx.useshift is set.

   Returns 0 on the normal path. SETERRQ is invoked when A carries no inode
   information or when a node size other than 1, 2 or 3 is encountered.
*/
1686: PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1687: {
1688: Mat C = B;
1689: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1690: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1691: PetscErrorCode ierr;
1692: const PetscInt *r,*ic,*c,*ics;
1693: PetscInt n = A->rmap->n,*bi = b->i;
/* nbj = b->j + 1, so nbj + bd[prow] points at the first column index after the pivot of row prow */
1694: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1695: PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1696: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1697: PetscScalar mul1,mul2,mul3,tmp;
1698: MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1699: const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1700: PetscReal rs=0.0;
1701: FactorShiftCtx sctx;
1702: PetscInt newshift;
/* the shift context is initialized field by field; shift_amount and nshift are set after the shift_top computation below */
1705: sctx.shift_top = 0;
1706: sctx.nshift_max = 0;
1707: sctx.shift_lo = 0;
1708: sctx.shift_hi = 0;
1709: sctx.shift_fraction = 0;
1711: /* shift_top is only needed, and only computed, for the positive-definite shift type */
1712: if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1713: sctx.shift_top = 0;
1714: for (i=0; i<n; i++) {
1715: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
/* here rtmp1 is just a read cursor over the values of row i of A; the entry whose column index equals i is subtracted instead of added */
1716: rs = 0.0;
1717: ajtmp = aj + ai[i];
1718: rtmp1 = aa + ai[i];
1719: nz = ai[i+1] - ai[i];
1720: for (j=0; j<nz; j++){
1721: if (*ajtmp != i){
1722: rs += PetscAbsScalar(*rtmp1++);
1723: } else {
1724: rs -= PetscRealPart(*rtmp1++);
1725: }
1726: ajtmp++;
1727: }
1728: if (rs>sctx.shift_top) sctx.shift_top = rs;
1729: }
/* keep shift_top strictly nonzero before it is scaled */
1730: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1731: sctx.shift_top *= 1.1;
1732: sctx.nshift_max = 5;
1733: sctx.shift_lo = 0.;
1734: sctx.shift_hi = 1.;
1735: }
1736: sctx.shift_amount = 0;
1737: sctx.nshift = 0;
/* c (from iscol) is acquired here and restored at the end but is not otherwise referenced in this routine */
1739: ISGetIndices(isrow,&r);
1740: ISGetIndices(iscol,&c);
1741: ISGetIndices(isicol,&ic);
/* a single zeroed allocation of 3*n+1 entries holds the three dense work rows: rtmp11, rtmp22 = rtmp11+n, rtmp33 = rtmp22+n */
1742: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);
1743: PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));
1744: ics = ic ;
1745: rtmp22 = rtmp11 + n;
1746: rtmp33 = rtmp22 + n;
1747:
1748: node_max = a->inode.node_count;
1749: ns = a->inode.size;
1750: if (!ns){
1751: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1752: }
1754: /* If max inode size > 3, split it into two inodes.*/
1755: /* also map the inode sizes according to the ordering */
/* i walks the original node list, j the split list; a node of size 4 becomes 2+2 and a node of size 5 becomes 2+3 */
1756: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1757: for (i=0,j=0; i<node_max; ++i,++j){
1758: if (ns[i]>3) {
1759: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] <= 5; a larger node would leave a piece > 3 and hit the default case below */
1760: ++j;
1761: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1762: } else {
1763: tmp_vec1[j] = ns[i];
1764: }
1765: }
1766: /* Use the correct node_max */
1767: node_max = j;
1769: /* Now reorder the inode info based on mat re-ordering info */
1770: /* First create a row -> inode_size_array_index map */
1771: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1772: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1773: for (i=0,row=0; i<node_max; i++) {
1774: nodesz = tmp_vec1[i];
1775: for (j=0; j<nodesz; j++,row++) {
1776: nsmap[row] = i;
1777: }
1778: }
1779: /* Using nsmap, create a reordered ns structure */
/* j is the row position in the factor; r[j] is the row of A visited there, and the size of the node containing that row is recorded */
1780: for (i=0,j=0; i< node_max; i++) {
1781: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1782: tmp_vec2[i] = nodesz;
1783: j += nodesz;
1784: }
1785: PetscFree(nsmap);
1786: PetscFree(tmp_vec1);
1787: /* Now use the correct ns */
1788: ns = tmp_vec2;
/* The sweep below is redone for as long as sctx.useshift ends up set. */
/* NOTE(review): MatLUCheckShift_inline is a macro defined outside this file; presumably it inspects sctx.pv/sctx.rs, may update sctx.shift_amount/useshift/nshift and sets newshift -- confirm against its definition */
1790: do {
1791: sctx.useshift = PETSC_FALSE;
1792: /* Now loop over each block-row, and do the factorization */
/* i counts nodes, row is the first factor row of the current node; nz and bjtmp describe the full stored pattern of that row */
1793: for (i=0,row=0; i<node_max; i++) {
1794: nodesz = ns[i];
1795: nz = bi[row+1] - bi[row];
1796: bjtmp = bj + bi[row];
1798: switch (nodesz){
1799: case 1:
/* single row: clear rtmp11 at every stored column of factor row `row` */
1800: for (j=0; j<nz; j++){
1801: idx = bjtmp[j];
1802: rtmp11[idx] = 0.0;
1803: }
1804:
1805: /* load in initial (unfactored row) */
1806: idx = r[row];
1807: nz_tmp = ai[idx+1] - ai[idx];
1808: ajtmp = aj + ai[idx];
1809: v1 = aa + ai[idx];
1811: for (j=0; j<nz_tmp; j++) {
1812: idx = ics[ajtmp[j]];
1813: rtmp11[idx] = v1[j];
1814: }
/* apply the current shift at the permuted position of A's diagonal entry (r[row],r[row]) */
1815: rtmp11[ics[r[row]]] += sctx.shift_amount;
/* walk the stored column indices of this row; the loop stops at the first index that is not below row */
1817: prow = *bjtmp++ ;
1818: while (prow < row) {
1819: pc1 = rtmp11 + prow;
1820: if (*pc1 != 0.0){
/* ba[bd[prow]] is used as a multiplier (the pivot is stored inverted); pv is then advanced to the entries after it */
1821: pv = ba + bd[prow];
1822: pj = nbj + bd[prow];
1823: mul1 = *pc1 * *pv++;
1824: *pc1 = mul1;
1825: nz_tmp = bi[prow+1] - bd[prow] - 1;
1826: PetscLogFlops(2*nz_tmp);
1827: for (j=0; j<nz_tmp; j++) {
1828: tmp = pv[j];
1829: idx = pj[j];
1830: rtmp11[idx] -= mul1 * tmp;
1831: }
1832: }
1833: prow = *bjtmp++ ;
1834: }
1835: pj = bj + bi[row];
1836: pc1 = ba + bi[row];
/* the pivot is saved in sctx.pv before being inverted in place; the check happens after the row has been copied out */
1838: sctx.pv = rtmp11[row];
1839: rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1840: rs = 0.0;
1841: for (j=0; j<nz; j++) {
1842: idx = pj[j];
1843: pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1844: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1845: }
1846: sctx.rs = rs;
1847: MatLUCheckShift_inline(info,sctx,row,newshift);
1848: if (newshift == 1) goto endofwhile;
1849: break;
1850:
1851: case 2:
/* rows row and row+1 are processed together using the stored pattern of factor row `row` */
1852: for (j=0; j<nz; j++) {
1853: idx = bjtmp[j];
1854: rtmp11[idx] = 0.0;
1855: rtmp22[idx] = 0.0;
1856: }
1857:
1858: /* load in initial (unfactored row) */
/* rows r[row] and r[row]+1 of A are both read using the column indices of row r[row] */
1859: idx = r[row];
1860: nz_tmp = ai[idx+1] - ai[idx];
1861: ajtmp = aj + ai[idx];
1862: v1 = aa + ai[idx];
1863: v2 = aa + ai[idx+1];
1864: for (j=0; j<nz_tmp; j++) {
1865: idx = ics[ajtmp[j]];
1866: rtmp11[idx] = v1[j];
1867: rtmp22[idx] = v2[j];
1868: }
1869: rtmp11[ics[r[row]]] += sctx.shift_amount;
1870: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1872: prow = *bjtmp++ ;
1873: while (prow < row) {
1874: pc1 = rtmp11 + prow;
1875: pc2 = rtmp22 + prow;
1876: if (*pc1 != 0.0 || *pc2 != 0.0){
1877: pv = ba + bd[prow];
1878: pj = nbj + bd[prow];
1879: mul1 = *pc1 * *pv;
1880: mul2 = *pc2 * *pv;
1881: ++pv;
1882: *pc1 = mul1;
1883: *pc2 = mul2;
1884:
1885: nz_tmp = bi[prow+1] - bd[prow] - 1;
1886: for (j=0; j<nz_tmp; j++) {
1887: tmp = pv[j];
1888: idx = pj[j];
1889: rtmp11[idx] -= mul1 * tmp;
1890: rtmp22[idx] -= mul2 * tmp;
1891: }
1892: PetscLogFlops(4*nz_tmp);
1893: }
1894: prow = *bjtmp++ ;
1895: }
1897: /* Now take care of diagonal 2x2 block. Note: prow = row here */
1898: pc1 = rtmp11 + prow;
1899: pc2 = rtmp22 + prow;
/* check the first pivot of the block before it is used as a divisor */
1901: sctx.pv = *pc1;
1902: pj = bj + bi[prow];
1903: rs = 0.0;
1904: for (j=0; j<nz; j++){
1905: idx = pj[j];
1906: if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1907: }
1908: sctx.rs = rs;
1909: MatLUCheckShift_inline(info,sctx,row,newshift);
1910: if (newshift == 1) goto endofwhile;
1912: if (*pc2 != 0.0){
1913: pj = nbj + bd[prow];
1914: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1915: *pc2 = mul2;
1916: nz_tmp = bi[prow+1] - bd[prow] - 1;
1917: for (j=0; j<nz_tmp; j++) {
1918: idx = pj[j] ;
1919: tmp = rtmp11[idx];
1920: rtmp22[idx] -= mul2 * tmp;
1921: }
1922: PetscLogFlops(2*nz_tmp);
1923: }
1924:
1925: pj = bj + bi[row];
1926: pc1 = ba + bi[row];
1927: pc2 = ba + bi[row+1];
/* save the second pivot for the check, then invert both diagonals in the work rows before copying them out */
1929: sctx.pv = rtmp22[row+1];
1930: rs = 0.0;
1931: rtmp11[row] = 1.0/rtmp11[row];
1932: rtmp22[row+1] = 1.0/rtmp22[row+1];
1933: /* copy row entries from dense representation to sparse */
/* both rows are written using the same column index list pj and the same count nz */
1934: for (j=0; j<nz; j++) {
1935: idx = pj[j];
1936: pc1[j] = rtmp11[idx];
1937: pc2[j] = rtmp22[idx];
1938: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1939: }
1940: sctx.rs = rs;
1941: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1942: if (newshift == 1) goto endofwhile;
1943: break;
1945: case 3:
/* rows row, row+1 and row+2 are processed together using the stored pattern of factor row `row` */
1946: for (j=0; j<nz; j++) {
1947: idx = bjtmp[j];
1948: rtmp11[idx] = 0.0;
1949: rtmp22[idx] = 0.0;
1950: rtmp33[idx] = 0.0;
1951: }
1952: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1953: idx = r[row];
1954: nz_tmp = ai[idx+1] - ai[idx];
1955: ajtmp = aj + ai[idx];
1956: v1 = aa + ai[idx];
1957: v2 = aa + ai[idx+1];
1958: v3 = aa + ai[idx+2];
1959: for (j=0; j<nz_tmp; j++) {
1960: idx = ics[ajtmp[j]];
1961: rtmp11[idx] = v1[j];
1962: rtmp22[idx] = v2[j];
1963: rtmp33[idx] = v3[j];
1964: }
1965: rtmp11[ics[r[row]]] += sctx.shift_amount;
1966: rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1967: rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1969: /* loop over all pivot row blocks above this row block */
1970: prow = *bjtmp++ ;
1971: while (prow < row) {
1972: pc1 = rtmp11 + prow;
1973: pc2 = rtmp22 + prow;
1974: pc3 = rtmp33 + prow;
1975: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1976: pv = ba + bd[prow];
1977: pj = nbj + bd[prow];
1978: mul1 = *pc1 * *pv;
1979: mul2 = *pc2 * *pv;
1980: mul3 = *pc3 * *pv;
1981: ++pv;
1982: *pc1 = mul1;
1983: *pc2 = mul2;
1984: *pc3 = mul3;
1985:
1986: nz_tmp = bi[prow+1] - bd[prow] - 1;
1987: /* update this row based on pivot row */
1988: for (j=0; j<nz_tmp; j++) {
1989: tmp = pv[j];
1990: idx = pj[j];
1991: rtmp11[idx] -= mul1 * tmp;
1992: rtmp22[idx] -= mul2 * tmp;
1993: rtmp33[idx] -= mul3 * tmp;
1994: }
1995: PetscLogFlops(6*nz_tmp);
1996: }
1997: prow = *bjtmp++ ;
1998: }
2000: /* Now take care of diagonal 3x3 block in this set of rows */
2001: /* note: prow = row here */
2002: pc1 = rtmp11 + prow;
2003: pc2 = rtmp22 + prow;
2004: pc3 = rtmp33 + prow;
/* check the first pivot of the block before it is used as a divisor */
2006: sctx.pv = *pc1;
2007: pj = bj + bi[prow];
2008: rs = 0.0;
2009: for (j=0; j<nz; j++){
2010: idx = pj[j];
2011: if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2012: }
2013: sctx.rs = rs;
2014: MatLUCheckShift_inline(info,sctx,row,newshift);
2015: if (newshift == 1) goto endofwhile;
/* eliminate column row from rows row+1 and row+2; division is used because the pivot is not yet inverted */
2017: if (*pc2 != 0.0 || *pc3 != 0.0){
2018: mul2 = (*pc2)/(*pc1);
2019: mul3 = (*pc3)/(*pc1);
2020: *pc2 = mul2;
2021: *pc3 = mul3;
2022: nz_tmp = bi[prow+1] - bd[prow] - 1;
2023: pj = nbj + bd[prow];
2024: for (j=0; j<nz_tmp; j++) {
2025: idx = pj[j] ;
2026: tmp = rtmp11[idx];
2027: rtmp22[idx] -= mul2 * tmp;
2028: rtmp33[idx] -= mul3 * tmp;
2029: }
2030: PetscLogFlops(4*nz_tmp);
2031: }
/* advance to the second row of the block: prow = row+1 from here on */
2032: ++prow;
2034: pc2 = rtmp22 + prow;
2035: pc3 = rtmp33 + prow;
2036: sctx.pv = *pc2;
2037: pj = bj + bi[prow];
2038: rs = 0.0;
2039: for (j=0; j<nz; j++){
2040: idx = pj[j];
2041: if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2042: }
2043: sctx.rs = rs;
2044: MatLUCheckShift_inline(info,sctx,row+1,newshift);
2045: if (newshift == 1) goto endofwhile;
/* eliminate column row+1 from row row+2 */
2047: if (*pc3 != 0.0){
2048: mul3 = (*pc3)/(*pc2);
2049: *pc3 = mul3;
2050: pj = nbj + bd[prow];
2051: nz_tmp = bi[prow+1] - bd[prow] - 1;
2052: for (j=0; j<nz_tmp; j++) {
2053: idx = pj[j] ;
2054: tmp = rtmp22[idx];
2055: rtmp33[idx] -= mul3 * tmp;
2056: }
2057: PetscLogFlops(2*nz_tmp);
2058: }
2060: pj = bj + bi[row];
2061: pc1 = ba + bi[row];
2062: pc2 = ba + bi[row+1];
2063: pc3 = ba + bi[row+2];
/* save the third pivot for the check, then invert all three diagonals in the work rows before copying them out */
2065: sctx.pv = rtmp33[row+2];
2066: rs = 0.0;
2067: rtmp11[row] = 1.0/rtmp11[row];
2068: rtmp22[row+1] = 1.0/rtmp22[row+1];
2069: rtmp33[row+2] = 1.0/rtmp33[row+2];
2070: /* copy row entries from dense representation to sparse */
/* all three rows are written using the same column index list pj and the same count nz */
2071: for (j=0; j<nz; j++) {
2072: idx = pj[j];
2073: pc1[j] = rtmp11[idx];
2074: pc2[j] = rtmp22[idx];
2075: pc3[j] = rtmp33[idx];
2076: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2077: }
2079: sctx.rs = rs;
2080: MatLUCheckShift_inline(info,sctx,row+2,newshift);
2081: if (newshift == 1) goto endofwhile;
2082: break;
2084: default:
/* reached for any node size other than 1, 2 or 3 */
2085: SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2086: }
2087: row += nodesz; /* Update the row */
2088: }
/* target of the early exits above: the current sweep is abandoned and the loop condition decides whether to redo it */
2089: endofwhile:;
2090: } while (sctx.useshift);
2091: PetscFree(rtmp11);
2092: PetscFree(tmp_vec2);
2093: ISRestoreIndices(isicol,&ic);
2094: ISRestoreIndices(isrow,&r);
2095: ISRestoreIndices(iscol,&c);
/* install the in-place solve routines on the factor and mark it assembled and preallocated */
2096: (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2097: /* do not set solve add, since MatSolve_Inode + Add is faster */
2098: C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2099: C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2100: C->assembled = PETSC_TRUE;
2101: C->preallocated = PETSC_TRUE;
/* report shift statistics only when at least one shift was recorded */
2102: if (sctx.nshift) {
2103: if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2104: PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);
2105: } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2106: PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
2107: }
2108: }
2109: PetscLogFlops(C->cmap->n);
/* NOTE(review): Mat_CheckInode is defined outside this view; presumably it detects the inode structure of the factor C -- confirm */
2110: Mat_CheckInode(C,PETSC_FALSE);
2111: return(0);
2112: }
2115: /* ----------------------------------------------------------- */
2118: PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2119: {
2120: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2121: IS iscol = a->col,isrow = a->row;
2122: PetscErrorCode ierr;
2123: const PetscInt *r,*c,*rout,*cout;
2124: PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2125: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2126: PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2127: PetscScalar sum1,sum2,sum3,sum4,sum5;
2128: const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2129: const PetscScalar *b;
2132: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2133: node_max = a->inode.node_count;
2134: ns = a->inode.size; /* Node Size array */
2136: VecGetArray(bb,(PetscScalar**)&b);
2137: VecGetArray(xx,&x);
2138: tmp = a->solve_work;
2139:
2140: ISGetIndices(isrow,&rout); r = rout;
2141: ISGetIndices(iscol,&cout); c = cout;
2142:
2143: /* forward solve the lower triangular */
2144: tmps = tmp ;
2145: aa = a_a ;
2146: aj = a_j ;
2147: ad = a->diag;
2149: for (i = 0,row = 0; i< node_max; ++i){
2150: nsz = ns[i];
2151: aii = ai[row];
2152: v1 = aa + aii;
2153: vi = aj + aii;
2154: nz = ai[row+1]- ai[row];
2155:
2156: if (i < node_max-1) {
2157: /* Prefetch the indices for the next block */
2158: PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2159: /* Prefetch the data for the next block */
2160: PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2161: }
2163: switch (nsz){ /* Each loop in 'case' is unrolled */
2164: case 1 :
2165: sum1 = b[r[row]];
2166: for(j=0; j<nz-1; j+=2){
2167: i0 = vi[j];
2168: i1 = vi[j+1];
2169: tmp0 = tmps[i0];
2170: tmp1 = tmps[i1];
2171: sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2172: }
2173: if(j == nz-1){
2174: tmp0 = tmps[vi[j]];
2175: sum1 -= v1[j]*tmp0;
2176: }
2177: tmp[row++]=sum1;
2178: break;
2179: case 2:
2180: sum1 = b[r[row]];
2181: sum2 = b[r[row+1]];
2182: v2 = aa + ai[row+1];
2184: for(j=0; j<nz-1; j+=2){
2185: i0 = vi[j];
2186: i1 = vi[j+1];
2187: tmp0 = tmps[i0];
2188: tmp1 = tmps[i1];
2189: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2190: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2191: }
2192: if(j == nz-1){
2193: tmp0 = tmps[vi[j]];
2194: sum1 -= v1[j] *tmp0;
2195: sum2 -= v2[j] *tmp0;
2196: }
2197: sum2 -= v2[nz] * sum1;
2198: tmp[row ++]=sum1;
2199: tmp[row ++]=sum2;
2200: break;
2201: case 3:
2202: sum1 = b[r[row]];
2203: sum2 = b[r[row+1]];
2204: sum3 = b[r[row+2]];
2205: v2 = aa + ai[row+1];
2206: v3 = aa + ai[row+2];
2207:
2208: for (j=0; j<nz-1; j+=2){
2209: i0 = vi[j];
2210: i1 = vi[j+1];
2211: tmp0 = tmps[i0];
2212: tmp1 = tmps[i1];
2213: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2214: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2215: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2216: }
2217: if (j == nz-1){
2218: tmp0 = tmps[vi[j]];
2219: sum1 -= v1[j] *tmp0;
2220: sum2 -= v2[j] *tmp0;
2221: sum3 -= v3[j] *tmp0;
2222: }
2223: sum2 -= v2[nz] * sum1;
2224: sum3 -= v3[nz] * sum1;
2225: sum3 -= v3[nz+1] * sum2;
2226: tmp[row ++]=sum1;
2227: tmp[row ++]=sum2;
2228: tmp[row ++]=sum3;
2229: break;
2230:
2231: case 4:
2232: sum1 = b[r[row]];
2233: sum2 = b[r[row+1]];
2234: sum3 = b[r[row+2]];
2235: sum4 = b[r[row+3]];
2236: v2 = aa + ai[row+1];
2237: v3 = aa + ai[row+2];
2238: v4 = aa + ai[row+3];
2239:
2240: for (j=0; j<nz-1; j+=2){
2241: i0 = vi[j];
2242: i1 = vi[j+1];
2243: tmp0 = tmps[i0];
2244: tmp1 = tmps[i1];
2245: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2246: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2247: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2248: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2249: }
2250: if (j == nz-1){
2251: tmp0 = tmps[vi[j]];
2252: sum1 -= v1[j] *tmp0;
2253: sum2 -= v2[j] *tmp0;
2254: sum3 -= v3[j] *tmp0;
2255: sum4 -= v4[j] *tmp0;
2256: }
2257: sum2 -= v2[nz] * sum1;
2258: sum3 -= v3[nz] * sum1;
2259: sum4 -= v4[nz] * sum1;
2260: sum3 -= v3[nz+1] * sum2;
2261: sum4 -= v4[nz+1] * sum2;
2262: sum4 -= v4[nz+2] * sum3;
2263:
2264: tmp[row ++]=sum1;
2265: tmp[row ++]=sum2;
2266: tmp[row ++]=sum3;
2267: tmp[row ++]=sum4;
2268: break;
2269: case 5:
2270: sum1 = b[r[row]];
2271: sum2 = b[r[row+1]];
2272: sum3 = b[r[row+2]];
2273: sum4 = b[r[row+3]];
2274: sum5 = b[r[row+4]];
2275: v2 = aa + ai[row+1];
2276: v3 = aa + ai[row+2];
2277: v4 = aa + ai[row+3];
2278: v5 = aa + ai[row+4];
2279:
2280: for (j=0; j<nz-1; j+=2){
2281: i0 = vi[j];
2282: i1 = vi[j+1];
2283: tmp0 = tmps[i0];
2284: tmp1 = tmps[i1];
2285: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2286: sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2287: sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2288: sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2289: sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2290: }
2291: if (j == nz-1){
2292: tmp0 = tmps[vi[j]];
2293: sum1 -= v1[j] *tmp0;
2294: sum2 -= v2[j] *tmp0;
2295: sum3 -= v3[j] *tmp0;
2296: sum4 -= v4[j] *tmp0;
2297: sum5 -= v5[j] *tmp0;
2298: }
2300: sum2 -= v2[nz] * sum1;
2301: sum3 -= v3[nz] * sum1;
2302: sum4 -= v4[nz] * sum1;
2303: sum5 -= v5[nz] * sum1;
2304: sum3 -= v3[nz+1] * sum2;
2305: sum4 -= v4[nz+1] * sum2;
2306: sum5 -= v5[nz+1] * sum2;
2307: sum4 -= v4[nz+2] * sum3;
2308: sum5 -= v5[nz+2] * sum3;
2309: sum5 -= v5[nz+3] * sum4;
2310:
2311: tmp[row ++]=sum1;
2312: tmp[row ++]=sum2;
2313: tmp[row ++]=sum3;
2314: tmp[row ++]=sum4;
2315: tmp[row ++]=sum5;
2316: break;
2317: default:
2318: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2319: }
2320: }
2321: /* backward solve the upper triangular */
2322: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2323: nsz = ns[i];
2324: aii = ad[row+1] + 1;
2325: v1 = aa + aii;
2326: vi = aj + aii;
2327: nz = ad[row]- ad[row+1] - 1;
2328:
2329: if (i > 0) {
2330: /* Prefetch the indices for the next block */
2331: PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2332: /* Prefetch the data for the next block */
2333: PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2334: }
2336: switch (nsz){ /* Each loop in 'case' is unrolled */
2337: case 1 :
2338: sum1 = tmp[row];
2340: for(j=0 ; j<nz-1; j+=2){
2341: i0 = vi[j];
2342: i1 = vi[j+1];
2343: tmp0 = tmps[i0];
2344: tmp1 = tmps[i1];
2345: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2346: }
2347: if (j == nz-1){
2348: tmp0 = tmps[vi[j]];
2349: sum1 -= v1[j]*tmp0;
2350: }
2351: x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2352: break;
2353: case 2 :
2354: sum1 = tmp[row];
2355: sum2 = tmp[row-1];
2356: v2 = aa + ad[row] + 1;
2357: for (j=0 ; j<nz-1; j+=2){
2358: i0 = vi[j];
2359: i1 = vi[j+1];
2360: tmp0 = tmps[i0];
2361: tmp1 = tmps[i1];
2362: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2363: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2364: }
2365: if (j == nz-1){
2366: tmp0 = tmps[vi[j]];
2367: sum1 -= v1[j]* tmp0;
2368: sum2 -= v2[j+1]* tmp0;
2369: }
2370:
2371: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2372: sum2 -= v2[0] * tmp0;
2373: x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2374: break;
2375: case 3 :
2376: sum1 = tmp[row];
2377: sum2 = tmp[row -1];
2378: sum3 = tmp[row -2];
2379: v2 = aa + ad[row] + 1;
2380: v3 = aa + ad[row -1] + 1;
2381: for (j=0 ; j<nz-1; j+=2){
2382: i0 = vi[j];
2383: i1 = vi[j+1];
2384: tmp0 = tmps[i0];
2385: tmp1 = tmps[i1];
2386: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2387: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2388: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2389: }
2390: if (j== nz-1){
2391: tmp0 = tmps[vi[j]];
2392: sum1 -= v1[j] * tmp0;
2393: sum2 -= v2[j+1] * tmp0;
2394: sum3 -= v3[j+2] * tmp0;
2395: }
2396: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2397: sum2 -= v2[0]* tmp0;
2398: sum3 -= v3[1] * tmp0;
2399: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2400: sum3 -= v3[0]* tmp0;
2401: x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2402:
2403: break;
2404: case 4 :
2405: sum1 = tmp[row];
2406: sum2 = tmp[row -1];
2407: sum3 = tmp[row -2];
2408: sum4 = tmp[row -3];
2409: v2 = aa + ad[row]+1;
2410: v3 = aa + ad[row -1]+1;
2411: v4 = aa + ad[row -2]+1;
2413: for (j=0 ; j<nz-1; j+=2){
2414: i0 = vi[j];
2415: i1 = vi[j+1];
2416: tmp0 = tmps[i0];
2417: tmp1 = tmps[i1];
2418: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2419: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2420: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2421: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2422: }
2423: if (j== nz-1){
2424: tmp0 = tmps[vi[j]];
2425: sum1 -= v1[j] * tmp0;
2426: sum2 -= v2[j+1] * tmp0;
2427: sum3 -= v3[j+2] * tmp0;
2428: sum4 -= v4[j+3] * tmp0;
2429: }
2431: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2432: sum2 -= v2[0] * tmp0;
2433: sum3 -= v3[1] * tmp0;
2434: sum4 -= v4[2] * tmp0;
2435: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2436: sum3 -= v3[0] * tmp0;
2437: sum4 -= v4[1] * tmp0;
2438: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2439: sum4 -= v4[0] * tmp0;
2440: x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2441: break;
2442: case 5 :
2443: sum1 = tmp[row];
2444: sum2 = tmp[row -1];
2445: sum3 = tmp[row -2];
2446: sum4 = tmp[row -3];
2447: sum5 = tmp[row -4];
2448: v2 = aa + ad[row]+1;
2449: v3 = aa + ad[row -1]+1;
2450: v4 = aa + ad[row -2]+1;
2451: v5 = aa + ad[row -3]+1;
2452: for (j=0 ; j<nz-1; j+=2){
2453: i0 = vi[j];
2454: i1 = vi[j+1];
2455: tmp0 = tmps[i0];
2456: tmp1 = tmps[i1];
2457: sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458: sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2459: sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2460: sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2461: sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2462: }
2463: if (j==nz-1){
2464: tmp0 = tmps[vi[j]];
2465: sum1 -= v1[j] * tmp0;
2466: sum2 -= v2[j+1] * tmp0;
2467: sum3 -= v3[j+2] * tmp0;
2468: sum4 -= v4[j+3] * tmp0;
2469: sum5 -= v5[j+4] * tmp0;
2470: }
2472: tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2473: sum2 -= v2[0] * tmp0;
2474: sum3 -= v3[1] * tmp0;
2475: sum4 -= v4[2] * tmp0;
2476: sum5 -= v5[3] * tmp0;
2477: tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2478: sum3 -= v3[0] * tmp0;
2479: sum4 -= v4[1] * tmp0;
2480: sum5 -= v5[2] * tmp0;
2481: tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2482: sum4 -= v4[0] * tmp0;
2483: sum5 -= v5[1] * tmp0;
2484: tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2485: sum5 -= v5[0] * tmp0;
2486: x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2487: break;
2488: default:
2489: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2490: }
2491: }
2492: ISRestoreIndices(isrow,&rout);
2493: ISRestoreIndices(iscol,&cout);
2494: VecRestoreArray(bb,(PetscScalar**)&b);
2495: VecRestoreArray(xx,&x);
2496: PetscLogFlops(2.0*a->nz - A->cmap->n);
2497: return(0);
2498: }
2501: /*
2502: Makes a longer coloring[] array and calls the usual code with that
2503: */
2506: PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2507: {
2508: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2509: PetscErrorCode ierr;
2510: PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2511: PetscInt *colorused,i;
2512: ISColoringValue *newcolor;
2515: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
2516: /* loop over inodes, marking a color for each column*/
2517: row = 0;
2518: for (i=0; i<m; i++){
2519: for (j=0; j<ns[i]; j++) {
2520: newcolor[row++] = coloring[i] + j*ncolors;
2521: }
2522: }
2524: /* eliminate unneeded colors */
2525: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
2526: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
2527: for (i=0; i<n; i++) {
2528: colorused[newcolor[i]] = 1;
2529: }
2531: for (i=1; i<5*ncolors; i++) {
2532: colorused[i] += colorused[i-1];
2533: }
2534: ncolors = colorused[5*ncolors-1];
2535: for (i=0; i<n; i++) {
2536: newcolor[i] = colorused[newcolor[i]]-1;
2537: }
2538: PetscFree(colorused);
2539: ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);
2540: PetscFree(coloring);
2541: return(0);
2542: }
2544: #include ../src/mat/blockinvert.h
2548: PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2549: {
2550: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2551: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2552: MatScalar *ibdiag,*bdiag,work[25];
2553: PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2554: const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5;
2555: PetscReal zeropivot = 1.0e-15, shift = 0.0;
2556: PetscErrorCode ierr;
2557: PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2558: PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2561: if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2562: if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2563: if (its > 1) {
2564: /* switch to non-inode version */
2565: MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);
2566: return(0);
2567: }
2569: if (!a->inode.ibdiagvalid) {
2570: if (!a->inode.ibdiag) {
2571: /* calculate space needed for diagonal blocks */
2572: for (i=0; i<m; i++) {
2573: cnt += sizes[i]*sizes[i];
2574: }
2575: a->inode.bdiagsize = cnt;
2576: PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);
2577: }
2579: /* copy over the diagonal blocks and invert them */
2580: ibdiag = a->inode.ibdiag;
2581: bdiag = a->inode.bdiag;
2582: cnt = 0;
2583: for (i=0, row = 0; i<m; i++) {
2584: for (j=0; j<sizes[i]; j++) {
2585: for (k=0; k<sizes[i]; k++) {
2586: bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2587: }
2588: }
2589: PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));
2590:
2591: switch(sizes[i]) {
2592: case 1:
2593: /* Create matrix data structure */
2594: if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2595: ibdiag[cnt] = 1.0/ibdiag[cnt];
2596: break;
2597: case 2:
2598: Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);
2599: break;
2600: case 3:
2601: Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);
2602: break;
2603: case 4:
2604: Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);
2605: break;
2606: case 5:
2607: Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);
2608: break;
2609: default:
2610: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2611: }
2612: cnt += sizes[i]*sizes[i];
2613: row += sizes[i];
2614: }
2615: a->inode.ibdiagvalid = PETSC_TRUE;
2616: }
2617: ibdiag = a->inode.ibdiag;
2618: bdiag = a->inode.bdiag;
2621: /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2622: if (flag & SOR_ZERO_INITIAL_GUESS) {
2623: PetscScalar *b;
2624: VecGetArray(xx,&x);
2625: if (xx != bb) {
2626: VecGetArray(bb,(PetscScalar**)&b);
2627: } else {
2628: b = x;
2629: }
2630: if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2632: for (i=0, row=0; i<m; i++) {
2633: sz = diag[row] - ii[row];
2634: v1 = a->a + ii[row];
2635: idx = a->j + ii[row];
2637: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2638: switch (sizes[i]){
2639: case 1:
2640:
2641: sum1 = b[row];
2642: for(n = 0; n<sz-1; n+=2) {
2643: i1 = idx[0];
2644: i2 = idx[1];
2645: idx += 2;
2646: tmp0 = x[i1];
2647: tmp1 = x[i2];
2648: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2649: }
2650:
2651: if (n == sz-1){
2652: tmp0 = x[*idx];
2653: sum1 -= *v1 * tmp0;
2654: }
2655: x[row++] = sum1*(*ibdiag++);
2656: break;
2657: case 2:
2658: v2 = a->a + ii[row+1];
2659: sum1 = b[row];
2660: sum2 = b[row+1];
2661: for(n = 0; n<sz-1; n+=2) {
2662: i1 = idx[0];
2663: i2 = idx[1];
2664: idx += 2;
2665: tmp0 = x[i1];
2666: tmp1 = x[i2];
2667: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2668: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2669: }
2670:
2671: if (n == sz-1){
2672: tmp0 = x[*idx];
2673: sum1 -= v1[0] * tmp0;
2674: sum2 -= v2[0] * tmp0;
2675: }
2676: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2677: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2678: ibdiag += 4;
2679: break;
2680: case 3:
2681: v2 = a->a + ii[row+1];
2682: v3 = a->a + ii[row+2];
2683: sum1 = b[row];
2684: sum2 = b[row+1];
2685: sum3 = b[row+2];
2686: for(n = 0; n<sz-1; n+=2) {
2687: i1 = idx[0];
2688: i2 = idx[1];
2689: idx += 2;
2690: tmp0 = x[i1];
2691: tmp1 = x[i2];
2692: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2693: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2694: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2695: }
2696:
2697: if (n == sz-1){
2698: tmp0 = x[*idx];
2699: sum1 -= v1[0] * tmp0;
2700: sum2 -= v2[0] * tmp0;
2701: sum3 -= v3[0] * tmp0;
2702: }
2703: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2704: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2705: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2706: ibdiag += 9;
2707: break;
2708: case 4:
2709: v2 = a->a + ii[row+1];
2710: v3 = a->a + ii[row+2];
2711: v4 = a->a + ii[row+3];
2712: sum1 = b[row];
2713: sum2 = b[row+1];
2714: sum3 = b[row+2];
2715: sum4 = b[row+3];
2716: for(n = 0; n<sz-1; n+=2) {
2717: i1 = idx[0];
2718: i2 = idx[1];
2719: idx += 2;
2720: tmp0 = x[i1];
2721: tmp1 = x[i2];
2722: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2723: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2724: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2725: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2726: }
2727:
2728: if (n == sz-1){
2729: tmp0 = x[*idx];
2730: sum1 -= v1[0] * tmp0;
2731: sum2 -= v2[0] * tmp0;
2732: sum3 -= v3[0] * tmp0;
2733: sum4 -= v4[0] * tmp0;
2734: }
2735: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2736: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2737: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2738: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2739: ibdiag += 16;
2740: break;
2741: case 5:
2742: v2 = a->a + ii[row+1];
2743: v3 = a->a + ii[row+2];
2744: v4 = a->a + ii[row+3];
2745: v5 = a->a + ii[row+4];
2746: sum1 = b[row];
2747: sum2 = b[row+1];
2748: sum3 = b[row+2];
2749: sum4 = b[row+3];
2750: sum5 = b[row+4];
2751: for(n = 0; n<sz-1; n+=2) {
2752: i1 = idx[0];
2753: i2 = idx[1];
2754: idx += 2;
2755: tmp0 = x[i1];
2756: tmp1 = x[i2];
2757: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2758: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2759: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2760: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2761: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2762: }
2763:
2764: if (n == sz-1){
2765: tmp0 = x[*idx];
2766: sum1 -= v1[0] * tmp0;
2767: sum2 -= v2[0] * tmp0;
2768: sum3 -= v3[0] * tmp0;
2769: sum4 -= v4[0] * tmp0;
2770: sum5 -= v5[0] * tmp0;
2771: }
2772: x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2773: x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2774: x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2775: x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2776: x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2777: ibdiag += 25;
2778: break;
2779: default:
2780: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2781: }
2782: }
2784: xb = x;
2785: PetscLogFlops(a->nz);
2786: } else xb = b;
2787: if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2788: (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2789: cnt = 0;
2790: for (i=0, row=0; i<m; i++) {
2792: switch (sizes[i]){
2793: case 1:
2794: x[row++] *= bdiag[cnt++];
2795: break;
2796: case 2:
2797: x1 = x[row]; x2 = x[row+1];
2798: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2799: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2800: x[row++] = tmp1;
2801: x[row++] = tmp2;
2802: cnt += 4;
2803: break;
2804: case 3:
2805: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
2806: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2807: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2808: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2809: x[row++] = tmp1;
2810: x[row++] = tmp2;
2811: x[row++] = tmp3;
2812: cnt += 9;
2813: break;
2814: case 4:
2815: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2816: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2817: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2818: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2819: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2820: x[row++] = tmp1;
2821: x[row++] = tmp2;
2822: x[row++] = tmp3;
2823: x[row++] = tmp4;
2824: cnt += 16;
2825: break;
2826: case 5:
2827: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2828: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2829: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2830: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2831: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2832: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2833: x[row++] = tmp1;
2834: x[row++] = tmp2;
2835: x[row++] = tmp3;
2836: x[row++] = tmp4;
2837: x[row++] = tmp5;
2838: cnt += 25;
2839: break;
2840: default:
2841: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2842: }
2843: }
2844: PetscLogFlops(m);
2845: }
2846: if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2848: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2849: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2850: ibdiag -= sizes[i]*sizes[i];
2851: sz = ii[row+1] - diag[row] - 1;
2852: v1 = a->a + diag[row] + 1;
2853: idx = a->j + diag[row] + 1;
2855: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2856: switch (sizes[i]){
2857: case 1:
2858:
2859: sum1 = xb[row];
2860: for(n = 0; n<sz-1; n+=2) {
2861: i1 = idx[0];
2862: i2 = idx[1];
2863: idx += 2;
2864: tmp0 = x[i1];
2865: tmp1 = x[i2];
2866: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2867: }
2868:
2869: if (n == sz-1){
2870: tmp0 = x[*idx];
2871: sum1 -= *v1*tmp0;
2872: }
2873: x[row--] = sum1*(*ibdiag);
2874: break;
2876: case 2:
2877:
2878: sum1 = xb[row];
2879: sum2 = xb[row-1];
2880: /* note that sum1 is associated with the second of the two rows */
2881: v2 = a->a + diag[row-1] + 2;
2882: for(n = 0; n<sz-1; n+=2) {
2883: i1 = idx[0];
2884: i2 = idx[1];
2885: idx += 2;
2886: tmp0 = x[i1];
2887: tmp1 = x[i2];
2888: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2889: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2890: }
2891:
2892: if (n == sz-1){
2893: tmp0 = x[*idx];
2894: sum1 -= *v1*tmp0;
2895: sum2 -= *v2*tmp0;
2896: }
2897: x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2898: x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2899: break;
2900: case 3:
2901:
2902: sum1 = xb[row];
2903: sum2 = xb[row-1];
2904: sum3 = xb[row-2];
2905: v2 = a->a + diag[row-1] + 2;
2906: v3 = a->a + diag[row-2] + 3;
2907: for(n = 0; n<sz-1; n+=2) {
2908: i1 = idx[0];
2909: i2 = idx[1];
2910: idx += 2;
2911: tmp0 = x[i1];
2912: tmp1 = x[i2];
2913: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2914: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2915: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2916: }
2917:
2918: if (n == sz-1){
2919: tmp0 = x[*idx];
2920: sum1 -= *v1*tmp0;
2921: sum2 -= *v2*tmp0;
2922: sum3 -= *v3*tmp0;
2923: }
2924: x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2925: x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2926: x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2927: break;
2928: case 4:
2929:
2930: sum1 = xb[row];
2931: sum2 = xb[row-1];
2932: sum3 = xb[row-2];
2933: sum4 = xb[row-3];
2934: v2 = a->a + diag[row-1] + 2;
2935: v3 = a->a + diag[row-2] + 3;
2936: v4 = a->a + diag[row-3] + 4;
2937: for(n = 0; n<sz-1; n+=2) {
2938: i1 = idx[0];
2939: i2 = idx[1];
2940: idx += 2;
2941: tmp0 = x[i1];
2942: tmp1 = x[i2];
2943: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2944: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2945: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2946: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2947: }
2948:
2949: if (n == sz-1){
2950: tmp0 = x[*idx];
2951: sum1 -= *v1*tmp0;
2952: sum2 -= *v2*tmp0;
2953: sum3 -= *v3*tmp0;
2954: sum4 -= *v4*tmp0;
2955: }
2956: x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2957: x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2958: x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2959: x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2960: break;
2961: case 5:
2962:
2963: sum1 = xb[row];
2964: sum2 = xb[row-1];
2965: sum3 = xb[row-2];
2966: sum4 = xb[row-3];
2967: sum5 = xb[row-4];
2968: v2 = a->a + diag[row-1] + 2;
2969: v3 = a->a + diag[row-2] + 3;
2970: v4 = a->a + diag[row-3] + 4;
2971: v5 = a->a + diag[row-4] + 5;
2972: for(n = 0; n<sz-1; n+=2) {
2973: i1 = idx[0];
2974: i2 = idx[1];
2975: idx += 2;
2976: tmp0 = x[i1];
2977: tmp1 = x[i2];
2978: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983: }
2984:
2985: if (n == sz-1){
2986: tmp0 = x[*idx];
2987: sum1 -= *v1*tmp0;
2988: sum2 -= *v2*tmp0;
2989: sum3 -= *v3*tmp0;
2990: sum4 -= *v4*tmp0;
2991: sum5 -= *v5*tmp0;
2992: }
2993: x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2994: x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2995: x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2996: x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2997: x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2998: break;
2999: default:
3000: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3001: }
3002: }
3004: PetscLogFlops(a->nz);
3005: }
3006: its--;
3007: VecRestoreArray(xx,&x);
3008: if (bb != xx) {VecRestoreArray(bb,(PetscScalar**)&b);}
3009: }
3010: if (flag & SOR_EISENSTAT) {
3011: const PetscScalar *b;
3012: MatScalar *t = a->inode.ssor_work;
3014: VecGetArray(xx,&x);
3015: VecGetArray(bb,(PetscScalar**)&b);
3016: /*
3017: Apply (U + D)^-1 where D is now the block diagonal
3018: */
3019: ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3020: for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3021: ibdiag -= sizes[i]*sizes[i];
3022: sz = ii[row+1] - diag[row] - 1;
3023: v1 = a->a + diag[row] + 1;
3024: idx = a->j + diag[row] + 1;
3025: CHKMEMQ;
3026: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3027: switch (sizes[i]){
3028: case 1:
3029:
3030: sum1 = b[row];
3031: for(n = 0; n<sz-1; n+=2) {
3032: i1 = idx[0];
3033: i2 = idx[1];
3034: idx += 2;
3035: tmp0 = x[i1];
3036: tmp1 = x[i2];
3037: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3038: }
3039:
3040: if (n == sz-1){
3041: tmp0 = x[*idx];
3042: sum1 -= *v1*tmp0;
3043: }
3044: x[row] = sum1*(*ibdiag);row--;
3045: break;
3047: case 2:
3048:
3049: sum1 = b[row];
3050: sum2 = b[row-1];
3051: /* note that sum1 is associated with the second of the two rows */
3052: v2 = a->a + diag[row-1] + 2;
3053: for(n = 0; n<sz-1; n+=2) {
3054: i1 = idx[0];
3055: i2 = idx[1];
3056: idx += 2;
3057: tmp0 = x[i1];
3058: tmp1 = x[i2];
3059: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3060: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3061: }
3062:
3063: if (n == sz-1){
3064: tmp0 = x[*idx];
3065: sum1 -= *v1*tmp0;
3066: sum2 -= *v2*tmp0;
3067: }
3068: x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3069: x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3070: row -= 2;
3071: break;
3072: case 3:
3073:
3074: sum1 = b[row];
3075: sum2 = b[row-1];
3076: sum3 = b[row-2];
3077: v2 = a->a + diag[row-1] + 2;
3078: v3 = a->a + diag[row-2] + 3;
3079: for(n = 0; n<sz-1; n+=2) {
3080: i1 = idx[0];
3081: i2 = idx[1];
3082: idx += 2;
3083: tmp0 = x[i1];
3084: tmp1 = x[i2];
3085: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3086: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3087: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3088: }
3089:
3090: if (n == sz-1){
3091: tmp0 = x[*idx];
3092: sum1 -= *v1*tmp0;
3093: sum2 -= *v2*tmp0;
3094: sum3 -= *v3*tmp0;
3095: }
3096: x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3097: x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3098: x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3099: row -= 3;
3100: break;
3101: case 4:
3102:
3103: sum1 = b[row];
3104: sum2 = b[row-1];
3105: sum3 = b[row-2];
3106: sum4 = b[row-3];
3107: v2 = a->a + diag[row-1] + 2;
3108: v3 = a->a + diag[row-2] + 3;
3109: v4 = a->a + diag[row-3] + 4;
3110: for(n = 0; n<sz-1; n+=2) {
3111: i1 = idx[0];
3112: i2 = idx[1];
3113: idx += 2;
3114: tmp0 = x[i1];
3115: tmp1 = x[i2];
3116: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3117: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3118: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3119: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3120: }
3121:
3122: if (n == sz-1){
3123: tmp0 = x[*idx];
3124: sum1 -= *v1*tmp0;
3125: sum2 -= *v2*tmp0;
3126: sum3 -= *v3*tmp0;
3127: sum4 -= *v4*tmp0;
3128: }
3129: x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3130: x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3131: x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3132: x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3133: row -= 4;
3134: break;
3135: case 5:
3136:
3137: sum1 = b[row];
3138: sum2 = b[row-1];
3139: sum3 = b[row-2];
3140: sum4 = b[row-3];
3141: sum5 = b[row-4];
3142: v2 = a->a + diag[row-1] + 2;
3143: v3 = a->a + diag[row-2] + 3;
3144: v4 = a->a + diag[row-3] + 4;
3145: v5 = a->a + diag[row-4] + 5;
3146: for(n = 0; n<sz-1; n+=2) {
3147: i1 = idx[0];
3148: i2 = idx[1];
3149: idx += 2;
3150: tmp0 = x[i1];
3151: tmp1 = x[i2];
3152: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3153: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3154: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3155: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3156: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3157: }
3158:
3159: if (n == sz-1){
3160: tmp0 = x[*idx];
3161: sum1 -= *v1*tmp0;
3162: sum2 -= *v2*tmp0;
3163: sum3 -= *v3*tmp0;
3164: sum4 -= *v4*tmp0;
3165: sum5 -= *v5*tmp0;
3166: }
3167: x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3168: x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3169: x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3170: x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3171: x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3172: row -= 5;
3173: break;
3174: default:
3175: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3176: }
3177: CHKMEMQ;
3178: }
3179: PetscLogFlops(a->nz);
3181: /*
3182: t = b - D x where D is the block diagonal
3183: */
3184: cnt = 0;
3185: for (i=0, row=0; i<m; i++) {
3186: CHKMEMQ;
3187: switch (sizes[i]){
3188: case 1:
3189: t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3190: break;
3191: case 2:
3192: x1 = x[row]; x2 = x[row+1];
3193: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3194: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3195: t[row] = b[row] - tmp1;
3196: t[row+1] = b[row+1] - tmp2; row += 2;
3197: cnt += 4;
3198: break;
3199: case 3:
3200: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3201: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3202: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3203: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3204: t[row] = b[row] - tmp1;
3205: t[row+1] = b[row+1] - tmp2;
3206: t[row+2] = b[row+2] - tmp3; row += 3;
3207: cnt += 9;
3208: break;
3209: case 4:
3210: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3211: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3212: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3213: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3214: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3215: t[row] = b[row] - tmp1;
3216: t[row+1] = b[row+1] - tmp2;
3217: t[row+2] = b[row+2] - tmp3;
3218: t[row+3] = b[row+3] - tmp4; row += 4;
3219: cnt += 16;
3220: break;
3221: case 5:
3222: x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3223: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3224: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3225: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3226: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3227: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3228: t[row] = b[row] - tmp1;
3229: t[row+1] = b[row+1] - tmp2;
3230: t[row+2] = b[row+2] - tmp3;
3231: t[row+3] = b[row+3] - tmp4;
3232: t[row+4] = b[row+4] - tmp5;row += 5;
3233: cnt += 25;
3234: break;
3235: default:
3236: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3237: }
3238: CHKMEMQ;
3239: }
3240: PetscLogFlops(m);
3244: /*
3245: Apply (L + D)^-1 where D is the block diagonal
3246: */
3247: for (i=0, row=0; i<m; i++) {
3248: sz = diag[row] - ii[row];
3249: v1 = a->a + ii[row];
3250: idx = a->j + ii[row];
3251: CHKMEMQ;
3252: /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3253: switch (sizes[i]){
3254: case 1:
3255:
3256: sum1 = t[row];
3257: for(n = 0; n<sz-1; n+=2) {
3258: i1 = idx[0];
3259: i2 = idx[1];
3260: idx += 2;
3261: tmp0 = t[i1];
3262: tmp1 = t[i2];
3263: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3264: }
3265:
3266: if (n == sz-1){
3267: tmp0 = t[*idx];
3268: sum1 -= *v1 * tmp0;
3269: }
3270: x[row] += t[row] = sum1*(*ibdiag++); row++;
3271: break;
3272: case 2:
3273: v2 = a->a + ii[row+1];
3274: sum1 = t[row];
3275: sum2 = t[row+1];
3276: for(n = 0; n<sz-1; n+=2) {
3277: i1 = idx[0];
3278: i2 = idx[1];
3279: idx += 2;
3280: tmp0 = t[i1];
3281: tmp1 = t[i2];
3282: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3283: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3284: }
3285:
3286: if (n == sz-1){
3287: tmp0 = t[*idx];
3288: sum1 -= v1[0] * tmp0;
3289: sum2 -= v2[0] * tmp0;
3290: }
3291: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3292: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3293: ibdiag += 4; row += 2;
3294: break;
3295: case 3:
3296: v2 = a->a + ii[row+1];
3297: v3 = a->a + ii[row+2];
3298: sum1 = t[row];
3299: sum2 = t[row+1];
3300: sum3 = t[row+2];
3301: for(n = 0; n<sz-1; n+=2) {
3302: i1 = idx[0];
3303: i2 = idx[1];
3304: idx += 2;
3305: tmp0 = t[i1];
3306: tmp1 = t[i2];
3307: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3308: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3309: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3310: }
3311:
3312: if (n == sz-1){
3313: tmp0 = t[*idx];
3314: sum1 -= v1[0] * tmp0;
3315: sum2 -= v2[0] * tmp0;
3316: sum3 -= v3[0] * tmp0;
3317: }
3318: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321: ibdiag += 9; row += 3;
3322: break;
3323: case 4:
3324: v2 = a->a + ii[row+1];
3325: v3 = a->a + ii[row+2];
3326: v4 = a->a + ii[row+3];
3327: sum1 = t[row];
3328: sum2 = t[row+1];
3329: sum3 = t[row+2];
3330: sum4 = t[row+3];
3331: for(n = 0; n<sz-1; n+=2) {
3332: i1 = idx[0];
3333: i2 = idx[1];
3334: idx += 2;
3335: tmp0 = t[i1];
3336: tmp1 = t[i2];
3337: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3338: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3339: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3340: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3341: }
3342:
3343: if (n == sz-1){
3344: tmp0 = t[*idx];
3345: sum1 -= v1[0] * tmp0;
3346: sum2 -= v2[0] * tmp0;
3347: sum3 -= v3[0] * tmp0;
3348: sum4 -= v4[0] * tmp0;
3349: }
3350: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3351: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3352: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3353: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3354: ibdiag += 16; row += 4;
3355: break;
3356: case 5:
3357: v2 = a->a + ii[row+1];
3358: v3 = a->a + ii[row+2];
3359: v4 = a->a + ii[row+3];
3360: v5 = a->a + ii[row+4];
3361: sum1 = t[row];
3362: sum2 = t[row+1];
3363: sum3 = t[row+2];
3364: sum4 = t[row+3];
3365: sum5 = t[row+4];
3366: for(n = 0; n<sz-1; n+=2) {
3367: i1 = idx[0];
3368: i2 = idx[1];
3369: idx += 2;
3370: tmp0 = t[i1];
3371: tmp1 = t[i2];
3372: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3373: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3374: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3375: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3376: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3377: }
3378:
3379: if (n == sz-1){
3380: tmp0 = t[*idx];
3381: sum1 -= v1[0] * tmp0;
3382: sum2 -= v2[0] * tmp0;
3383: sum3 -= v3[0] * tmp0;
3384: sum4 -= v4[0] * tmp0;
3385: sum5 -= v5[0] * tmp0;
3386: }
3387: x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3388: x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3389: x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3390: x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3391: x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3392: ibdiag += 25; row += 5;
3393: break;
3394: default:
3395: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3396: }
3397: CHKMEMQ;
3398: }
3399: PetscLogFlops(a->nz);
3400: VecRestoreArray(xx,&x);
3401: VecRestoreArray(bb,(PetscScalar**)&b);
3402: }
3403: return(0);
3404: }
/*
   MatMultDiagonalBlock_SeqAIJ_Inode - Applies the dense blocks stored in a->inode.bdiag to bb,
   one inode at a time, and stores the result in xx.

   Input Parameters:
+  A  - SeqAIJ matrix; a->inode.node_count, a->inode.size and a->inode.bdiag are read
-  bb - vector that is multiplied

   Output Parameter:
.  xx - result; the entries belonging to each processed inode are overwritten

   Notes:
   Inode i of size s owns s*s consecutive entries of bdiag.  Entry (r,c) of that block is read
   from bdiag[cnt + c*s + r], i.e. the block is walked column by column.
   Only inode sizes 1 to 5 are handled; any other size raises PETSC_ERR_SUP.
   NOTE(review): nothing here shows where bdiag is filled or whether it holds the diagonal
   blocks of A themselves or something derived from them - confirm where it is built.
   NOTE(review): ierr is declared but not referenced in the lines shown.
*/
3408: PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3409: {
3410: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3411: PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3412: const MatScalar *bdiag = a->inode.bdiag;
3413: const PetscScalar *b;
3414: PetscErrorCode ierr;
3415: PetscInt m = a->inode.node_count,cnt = 0,i,row;
3416: const PetscInt *sizes = a->inode.size;
3419: VecGetArray(xx,&x);
3420: VecGetArray(bb,(PetscScalar**)&b);
3421: cnt = 0; /* cnt is already 0 from its initializer; it indexes the current block in bdiag */
/* i counts inodes, row is the first matrix row of inode i */
3422: for (i=0, row=0; i<m; i++) {
3423: switch (sizes[i]){
3424: case 1:
3425: x[row] = b[row]*bdiag[cnt++];row++;
3426: break;
3427: case 2:
/* the inputs are loaded and all results formed before any entry of x is written */
3428: x1 = b[row]; x2 = b[row+1];
3429: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3430: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3431: x[row++] = tmp1;
3432: x[row++] = tmp2;
3433: cnt += 4;
3434: break;
3435: case 3:
3436: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3437: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3438: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3439: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3440: x[row++] = tmp1;
3441: x[row++] = tmp2;
3442: x[row++] = tmp3;
3443: cnt += 9;
3444: break;
3445: case 4:
3446: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3447: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3448: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3449: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3450: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3451: x[row++] = tmp1;
3452: x[row++] = tmp2;
3453: x[row++] = tmp3;
3454: x[row++] = tmp4;
3455: cnt += 16;
3456: break;
3457: case 5:
3458: x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3459: tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3460: tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3461: tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3462: tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3463: tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3464: x[row++] = tmp1;
3465: x[row++] = tmp2;
3466: x[row++] = tmp3;
3467: x[row++] = tmp4;
3468: x[row++] = tmp5;
3469: cnt += 25;
3470: break;
3471: default:
/* NOTE(review): this error path is taken before the VecRestoreArray() calls below */
3472: SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3473: }
3474: }
/* cnt now equals the total number of block entries used; two flops are logged per entry */
3475: PetscLogFlops(2*cnt);
3476: VecRestoreArray(xx,&x);
3477: VecRestoreArray(bb,(PetscScalar**)&b);
3478: return(0);
3479: }
3481: /*
3482: samestructure indicates that the matrix has not changed its nonzero structure so we
3483: do not need to recompute the inodes
3484: */
/*
   Mat_CheckInode - Groups consecutive rows with identical column indices into inodes and,
   if that pays off, installs the inode versions of the matrix operations.

   Input Parameters:
+  A             - SeqAIJ matrix; a->i and a->j are scanned
-  samestructure - when true and a->inode.checked is already set, nothing is recomputed

   Notes:
   Returns immediately when a->inode.use is false.
   An inode never holds more than a->inode.limit rows.
   An existing a->inode.size array is reused; otherwise m+1 entries are allocated.
   Only when no previous size array existed: if more than 90% of the rows ended up as their
   own inode, the array is freed and a->inode.use is switched off.
   Otherwise the ops table is updated (mult/sor/... for an unfactored matrix, solve for a
   factored one) and node_count and the size array are stored in a->inode.
   NOTE(review): a->inode.checked is tested but not assigned in the lines shown - confirm
   where it is set.
*/
3487: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3488: {
3489: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3491: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3492: PetscTruth flag;
3495: if (!a->inode.use) return(0);
3496: if (a->inode.checked && samestructure) return(0);
3499: m = A->rmap->n;
3500: if (a->inode.size) {ns = a->inode.size;}
3501: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
3503: i = 0;
3504: node_count = 0;
/* idx walks through a->j; it always points at the column indices of row i */
3505: idx = a->j;
3506: ii = a->i;
3507: while (i < m){ /* For each row */
3508: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
3509: /* Limits the number of elements in a node to 'a->inode.limit' */
3510: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3511: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
3512: if (nzy != nzx) break;
/* every row accepted so far has nzx entries, so one more step of nzx reaches row j */
3513: idy += nzx; /* Same nonzero pattern */
3514: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
3515: if (!flag) break;
3516: }
3517: ns[node_count++] = blk_size;
3518: idx += blk_size*nzx; /* skip the column indices of all rows of this inode */
3519: i = j; /* j is the first row that did not join the inode (or m) */
3520: }
3521: /* If not enough inodes found,, do not use inode version of the routines */
3522: if (!a->inode.size && m && node_count > .9*m) {
3523: PetscFree(ns);
3524: a->inode.node_count = 0;
3525: a->inode.size = PETSC_NULL;
3526: a->inode.use = PETSC_FALSE;
3527: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
3528: } else {
3529: if (!A->factor) {
3530: A->ops->mult = MatMult_SeqAIJ_Inode;
3531: A->ops->sor = MatSOR_SeqAIJ_Inode;
3532: A->ops->multadd = MatMultAdd_SeqAIJ_Inode;
3533: A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
3534: A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
3535: A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
3536: A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
3537: A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
3538: A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3539: } else {
3540: A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
3541: }
3542: a->inode.node_count = node_count;
3543: a->inode.size = ns;
3544: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
3545: }
3546: return(0);
3547: }
/*
   MatGetRow_FactoredLU - Gathers the column indices of one row of a factored matrix into cols.

   cols[0 .. nzl-1]         are copied from aj + ai[row]
   cols[nzl]                is aj[adiag[row]]
   cols[nzl+1 .. nzl+nzu]   are copied from aj + adiag[row+1] + 1

   cols must therefore provide room for nzl+1+nzu entries.  The nz argument is not used.

   The macro expands to a bare { ... } block, its arguments are not parenthesized and several
   of them are expanded more than once, so pass plain variables without side effects.
   NOTE(review): the callers in this file label the first part "L" and the last part "U";
   the storage layout behind ai/adiag is not visible here - confirm against the factorization.
*/
3549: #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \
3550: PetscInt __k, *__vi; \
3551: __vi = aj + ai[row]; \
3552: for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3553: __vi = aj + adiag[row]; \
3554: cols[nzl] = __vi[0];\
3555: __vi = aj + adiag[row+1]+1;\
3556: for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3559: /*
3560: Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3561: Modified from Mat_CheckInode().
3563: Input Parameters:
3564: + Mat A - ILU or LU matrix factor
3565: - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
3566: do not need to recompute the inodes
3567: */
/*
   Notes:
   Returns immediately when a->inode.use is false, or when a->inode.checked is set and
   samestructure is true.
   For every row the full list of column indices is rebuilt with MatGetRow_FactoredLU() from
   a->i, a->j and a->diag, because the row is not stored as one contiguous run.
   The number of U entries of row i is computed as adiag[i] - adiag[i+1] - 1.
   NOTE(review): that formula suggests the U rows are stored back to front - confirm against
   the factorization that produced A.
   A scratch buffer is allocated for each leading row (cols1) and for each candidate row
   (cols2), and both are freed again before the scan moves on.
   Only when no previous size array existed: if more than 90% of the rows ended up as their
   own inode, the array is freed and a->inode.use is switched off.  Otherwise every listed
   op except solve is cleared, solve is set to MatSolve_SeqAIJ_Inode, and node_count and
   the size array are stored in a->inode.
*/
3570: PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3571: {
3572: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3574: PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3575: PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3576: PetscTruth flag;
3579: if (!a->inode.use) return(0);
3580: if (a->inode.checked && samestructure) return(0);
3582: m = A->rmap->n;
3583: if (a->inode.size) {ns = a->inode.size;}
3584: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
3586: i = 0;
3587: node_count = 0;
3588: while (i < m){ /* For each row */
3589: nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
3590: nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3591: nzx = nzl1 + nzu1 + 1; /* L part + diagonal + U part */
3592: PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);
3593: MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3594:
3595: /* Limits the number of elements in a node to 'a->inode.limit' */
3596: for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3597: nzl2 = ai[j+1] - ai[j];
3598: nzu2 = adiag[j] - adiag[j+1] - 1;
3599: nzy = nzl2 + nzu2 + 1;
3600: if( nzy != nzx) break;
3601: PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);
3602: MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
/* row j joins the inode only if its complete column list equals that of row i */
3603: PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);
3604: if (!flag) {PetscFree(cols2);break;}
3605: PetscFree(cols2);
3606: }
3607: ns[node_count++] = blk_size;
3608: PetscFree(cols1);
3609: i = j; /* j is the first row that did not join the inode (or m) */
3610: }
3611: /* If not enough inodes found,, do not use inode version of the routines */
3612: if (!a->inode.size && m && node_count > .9*m) {
3613: PetscFree(ns);
3614: a->inode.node_count = 0;
3615: a->inode.size = PETSC_NULL;
3616: a->inode.use = PETSC_FALSE;
3617: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
3618: } else {
3619: A->ops->mult = 0;
3620: A->ops->sor = 0;
3621: A->ops->multadd = 0;
3622: A->ops->getrowij = 0;
3623: A->ops->restorerowij = 0;
3624: A->ops->getcolumnij = 0;
3625: A->ops->restorecolumnij = 0;
3626: A->ops->coloringpatch = 0;
3627: A->ops->multdiagonalblock = 0;
3628: A->ops->solve = MatSolve_SeqAIJ_Inode;
3629: a->inode.node_count = node_count;
3630: a->inode.size = ns;
3631: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
3632: }
3633: return(0);
3634: }
3636: /*
3637: This is really ugly. if inodes are used this replaces the
3638: permutations with ones that correspond to rows/cols of the matrix
3639: rather than inode blocks
3640: */
/*
   MatInodeAdjustForInodes - Type-independent entry point.  Looks up the function composed
   on A under the name "MatInodeAdjustForInodes_C" and, if one is found, calls it with the
   same arguments.  Does nothing when A provides no such function.

   NOTE(review): ierr is declared but not referenced in the lines shown.
*/
3643: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3644: {
3645: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3648: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
3649: if (f) {
3650: (*f)(A,rperm,cperm);
3651: }
3652: return(0);
3653: }
/*
   MatInodeAdjustForInodes_SeqAIJ_Inode - Replaces permutations of inode blocks by the
   corresponding permutations of individual rows and columns.

   Input Parameter:
.  A - SeqAIJ matrix; a->inode.size and a->inode.node_count are read

   Input/Output Parameters:
+  rperm - on entry an index set from which one entry per row inode is read; on exit a new
           index set of length m listing individual rows.  The incoming index set is destroyed.
-  cperm - the same for the column inodes built by Mat_CreateColInode(); length n on exit

   Notes:
   Nothing is changed when the matrix has no inode information or when the number of inodes
   equals the number of rows (every inode has size 1).
   The expanded row permutation lives in permr, which is allocated with m entries, so the row
   index set is created with length m.  It was previously created with length n, which is
   only equivalent for square matrices.
   NOTE(review): permr/permc are freed right after ISCreateGeneral(), so this relies on
   ISCreateGeneral() copying the index array - confirm for the PETSc version in use.
*/
3658: PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3659: {
3660: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
3662: PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3663: const PetscInt *ridx,*cidx;
3664: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
3665: PetscInt nslim_col,*ns_col;
3666: IS ris = *rperm,cis = *cperm;
3669: if (!a->inode.size) return(0); /* no inodes so return */
3670: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
3672: Mat_CreateColInode(A,&nslim_col,&ns_col);
/* tns is sized for the larger of the two inode counts because it is used for rows, then columns */
3673: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
3674: PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);
3676: ISGetIndices(ris,&ridx);
3677: ISGetIndices(cis,&cidx);
3679: /* Form the inode structure for the rows of permuted matrix using inv perm*/
/* tns[k] = first row of inode k (running sum of the inode sizes) */
3680: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3682: /* Construct the permutations for rows*/
3683: for (i=0,row = 0; i<nslim_row; ++i){
3684: indx = ridx[i];
3685: start_val = tns[indx];
3686: end_val = tns[indx + 1];
/* emit every row of inode indx, in order */
3687: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3688: }
3690: /* Form the inode structure for the columns of permuted matrix using inv perm*/
3691: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3693: /* Construct permutations for columns */
3694: for (i=0,col=0; i<nslim_col; ++i){
3695: indx = cidx[i];
3696: start_val = tns[indx];
3697: end_val = tns[indx + 1];
3698: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3699: }
/* the row permutation has m entries (size of permr), the column permutation n */
3701: ISCreateGeneral(PETSC_COMM_SELF,m,permr,rperm);
3702: ISSetPermutation(*rperm);
3703: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
3704: ISSetPermutation(*cperm);
3705:
3706: ISRestoreIndices(ris,&ridx);
3707: ISRestoreIndices(cis,&cidx);
3709: PetscFree(ns_col);
3710: PetscFree2(permr,permc);
/* the caller's original inode-level index sets are released here */
3711: ISDestroy(cis);
3712: ISDestroy(ris);
3713: PetscFree(tns);
3714: return(0);
3715: }
3720: /*@C
3721: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3723: Collective on Mat
3725: Input Parameter:
3726: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3728: Output Parameter:
3729: + node_count - no of inodes present in the matrix.
3730: . sizes - an array of size node_count,with sizes of each inode.
3731: - limit - the max size used to generate the inodes.
3733: Level: advanced
3735: Notes: This routine returns some internal storage information
3736: of the matrix, it is intended to be used by advanced users.
3737: It should be called after the matrix is assembled.
3738: The contents of the sizes[] array should not be changed.
3739: PETSC_NULL may be passed for information not requested.
      An unassembled matrix is rejected with PETSC_ERR_ARG_WRONGSTATE.
      The work is done by the function composed on A under the name
      "MatInodeGetInodeSizes_C"; if A provides no such function, none of the
      output arguments is written.
3741: .keywords: matrix, seqaij, get, inode
3743: .seealso: MatGetInfo()
3744: @*/
3745: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3746: {
/* NOTE(review): ierr is declared but not referenced in the lines shown */
3747: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3750: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3751: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
3752: if (f) {
3753: (*f)(A,node_count,sizes,limit);
3754: }
3755: return(0);
3756: }
/*
   MatInodeGetInodeSizes_SeqAIJ_Inode - SeqAIJ implementation of MatInodeGetInodeSizes().

   Copies a->inode.node_count, a->inode.size and a->inode.limit into the output arguments.
   Any output pointer that is NULL is skipped.  *sizes receives the pointer to the matrix's
   own size array, not a copy.
*/
3761: PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3762: {
3763: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3766: if (node_count) *node_count = a->inode.node_count;
3767: if (sizes) *sizes = a->inode.size;
3768: if (limit) *limit = a->inode.limit;
3769: return(0);
3770: }