Actual source code: baijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for BAIJ format.
5: */
7: #include ../src/mat/impls/baij/seq/baij.h
8: #include ../src/mat/blockinvert.h
9: #include petscbt.h
10: #include ../src/mat/utils/freespace.h
14: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15: {
16: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
17: PetscErrorCode ierr;
18: PetscInt i,nz;
19: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
20: const MatScalar *aa=a->a,*v;
21: PetscScalar s1,*x;
22: const PetscScalar *b;
25: VecCopy(bb,xx);
26: VecGetArray(bb,(PetscScalar**)&b);
27: VecGetArray(xx,&x);
28:
29: /* forward solve the U^T */
30: for (i=0; i<n; i++) {
32: v = aa + diag[i];
33: /* multiply by the inverse of the block diagonal */
34: s1 = (*v++)*x[i];
35: vi = aj + diag[i] + 1;
36: nz = ai[i+1] - diag[i] - 1;
37: while (nz--) {
38: x[*vi++] -= (*v++)*s1;
39: }
40: x[i] = s1;
41: }
42: /* backward solve the L^T */
43: for (i=n-1; i>=0; i--){
44: v = aa + diag[i] - 1;
45: vi = aj + diag[i] - 1;
46: nz = diag[i] - ai[i];
47: s1 = x[i];
48: while (nz--) {
49: x[*vi--] -= (*v--)*s1;
50: }
51: }
52: VecRestoreArray(bb,(PetscScalar**)&b);
53: VecRestoreArray(xx,&x);
54: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
55: return(0);
56: }
60: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61: {
62: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
63: PetscErrorCode ierr;
64: PetscInt i,nz,idx,idt,oidx;
65: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
66: const MatScalar *aa=a->a,*v;
67: PetscScalar s1,s2,x1,x2,*x;
68: const PetscScalar *b;
71: VecCopy(bb,xx);
72: VecGetArray(bb,(PetscScalar**)&b);
73: VecGetArray(xx,&x);
75: /* forward solve the U^T */
76: idx = 0;
77: for (i=0; i<n; i++) {
79: v = aa + 4*diag[i];
80: /* multiply by the inverse of the block diagonal */
81: x1 = x[idx]; x2 = x[1+idx];
82: s1 = v[0]*x1 + v[1]*x2;
83: s2 = v[2]*x1 + v[3]*x2;
84: v += 4;
86: vi = aj + diag[i] + 1;
87: nz = ai[i+1] - diag[i] - 1;
88: while (nz--) {
89: oidx = 2*(*vi++);
90: x[oidx] -= v[0]*s1 + v[1]*s2;
91: x[oidx+1] -= v[2]*s1 + v[3]*s2;
92: v += 4;
93: }
94: x[idx] = s1;x[1+idx] = s2;
95: idx += 2;
96: }
97: /* backward solve the L^T */
98: for (i=n-1; i>=0; i--){
99: v = aa + 4*diag[i] - 4;
100: vi = aj + diag[i] - 1;
101: nz = diag[i] - ai[i];
102: idt = 2*i;
103: s1 = x[idt]; s2 = x[1+idt];
104: while (nz--) {
105: idx = 2*(*vi--);
106: x[idx] -= v[0]*s1 + v[1]*s2;
107: x[idx+1] -= v[2]*s1 + v[3]*s2;
108: v -= 4;
109: }
110: }
111: VecRestoreArray(bb,(PetscScalar**)&b);
112: VecRestoreArray(xx,&x);
113: PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);
114: return(0);
115: }
119: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120: {
121: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
122: PetscErrorCode ierr;
123: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124: PetscInt nz,idx,idt,j,i,oidx;
125: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
126: const MatScalar *aa=a->a,*v;
127: PetscScalar s1,s2,x1,x2,*x;
128: const PetscScalar *b;
131: VecCopy(bb,xx);
132: VecGetArray(bb,(PetscScalar**)&b);
133: VecGetArray(xx,&x);
135: /* forward solve the U^T */
136: idx = 0;
137: for (i=0; i<n; i++) {
138: v = aa + bs2*diag[i];
139: /* multiply by the inverse of the block diagonal */
140: x1 = x[idx]; x2 = x[1+idx];
141: s1 = v[0]*x1 + v[1]*x2;
142: s2 = v[2]*x1 + v[3]*x2;
143: v -= bs2;
145: vi = aj + diag[i] - 1;
146: nz = diag[i] - diag[i+1] - 1;
147: for(j=0;j>-nz;j--){
148: oidx = bs*vi[j];
149: x[oidx] -= v[0]*s1 + v[1]*s2;
150: x[oidx+1] -= v[2]*s1 + v[3]*s2;
151: v -= bs2;
152: }
153: x[idx] = s1;x[1+idx] = s2;
154: idx += bs;
155: }
156: /* backward solve the L^T */
157: for (i=n-1; i>=0; i--){
158: v = aa + bs2*ai[i];
159: vi = aj + ai[i];
160: nz = ai[i+1] - ai[i];
161: idt = bs*i;
162: s1 = x[idt]; s2 = x[1+idt];
163: for(j=0;j<nz;j++){
164: idx = bs*vi[j];
165: x[idx] -= v[0]*s1 + v[1]*s2;
166: x[idx+1] -= v[2]*s1 + v[3]*s2;
167: v += bs2;
168: }
169: }
170: VecRestoreArray(bb,(PetscScalar**)&b);
171: VecRestoreArray(xx,&x);
172: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
173: return(0);
174: }
178: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179: {
180: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
181: PetscErrorCode ierr;
182: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183: PetscInt i,nz,idx,idt,oidx;
184: const MatScalar *aa=a->a,*v;
185: PetscScalar s1,s2,s3,x1,x2,x3,*x;
186: const PetscScalar *b;
189: VecCopy(bb,xx);
190: VecGetArray(bb,(PetscScalar**)&b);
191: VecGetArray(xx,&x);
193: /* forward solve the U^T */
194: idx = 0;
195: for (i=0; i<n; i++) {
197: v = aa + 9*diag[i];
198: /* multiply by the inverse of the block diagonal */
199: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
200: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
201: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
202: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
203: v += 9;
205: vi = aj + diag[i] + 1;
206: nz = ai[i+1] - diag[i] - 1;
207: while (nz--) {
208: oidx = 3*(*vi++);
209: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
210: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
211: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212: v += 9;
213: }
214: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
215: idx += 3;
216: }
217: /* backward solve the L^T */
218: for (i=n-1; i>=0; i--){
219: v = aa + 9*diag[i] - 9;
220: vi = aj + diag[i] - 1;
221: nz = diag[i] - ai[i];
222: idt = 3*i;
223: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
224: while (nz--) {
225: idx = 3*(*vi--);
226: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
227: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
228: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229: v -= 9;
230: }
231: }
232: VecRestoreArray(bb,(PetscScalar**)&b);
233: VecRestoreArray(xx,&x);
234: PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);
235: return(0);
236: }
240: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241: {
242: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
243: PetscErrorCode ierr;
244: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245: PetscInt nz,idx,idt,j,i,oidx;
246: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
247: const MatScalar *aa=a->a,*v;
248: PetscScalar s1,s2,s3,x1,x2,x3,*x;
249: const PetscScalar *b;
252: VecCopy(bb,xx);
253: VecGetArray(bb,(PetscScalar**)&b);
254: VecGetArray(xx,&x);
256: /* forward solve the U^T */
257: idx = 0;
258: for (i=0; i<n; i++) {
259: v = aa + bs2*diag[i];
260: /* multiply by the inverse of the block diagonal */
261: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
262: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
263: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
264: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
265: v -= bs2;
267: vi = aj + diag[i] - 1;
268: nz = diag[i] - diag[i+1] - 1;
269: for(j=0;j>-nz;j--){
270: oidx = bs*vi[j];
271: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
272: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
273: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
274: v -= bs2;
275: }
276: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
277: idx += bs;
278: }
279: /* backward solve the L^T */
280: for (i=n-1; i>=0; i--){
281: v = aa + bs2*ai[i];
282: vi = aj + ai[i];
283: nz = ai[i+1] - ai[i];
284: idt = bs*i;
285: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
286: for(j=0;j<nz;j++){
287: idx = bs*vi[j];
288: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
289: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
290: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
291: v += bs2;
292: }
293: }
294: VecRestoreArray(bb,(PetscScalar**)&b);
295: VecRestoreArray(xx,&x);
296: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
297: return(0);
298: }
302: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303: {
304: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
305: PetscErrorCode ierr;
306: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307: PetscInt i,nz,idx,idt,oidx;
308: const MatScalar *aa=a->a,*v;
309: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x;
310: const PetscScalar *b;
313: VecCopy(bb,xx);
314: VecGetArray(bb,(PetscScalar**)&b);
315: VecGetArray(xx,&x);
317: /* forward solve the U^T */
318: idx = 0;
319: for (i=0; i<n; i++) {
321: v = aa + 16*diag[i];
322: /* multiply by the inverse of the block diagonal */
323: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
324: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
325: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
326: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
327: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328: v += 16;
330: vi = aj + diag[i] + 1;
331: nz = ai[i+1] - diag[i] - 1;
332: while (nz--) {
333: oidx = 4*(*vi++);
334: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
335: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
336: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338: v += 16;
339: }
340: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341: idx += 4;
342: }
343: /* backward solve the L^T */
344: for (i=n-1; i>=0; i--){
345: v = aa + 16*diag[i] - 16;
346: vi = aj + diag[i] - 1;
347: nz = diag[i] - ai[i];
348: idt = 4*i;
349: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350: while (nz--) {
351: idx = 4*(*vi--);
352: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
353: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
354: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356: v -= 16;
357: }
358: }
359: VecRestoreArray(bb,(PetscScalar**)&b);
360: VecRestoreArray(xx,&x);
361: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
362: return(0);
363: }
367: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368: {
369: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
370: PetscErrorCode ierr;
371: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372: PetscInt nz,idx,idt,j,i,oidx;
373: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
374: const MatScalar *aa=a->a,*v;
375: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x;
376: const PetscScalar *b;
379: VecCopy(bb,xx);
380: VecGetArray(bb,(PetscScalar**)&b);
381: VecGetArray(xx,&x);
383: /* forward solve the U^T */
384: idx = 0;
385: for (i=0; i<n; i++) {
386: v = aa + bs2*diag[i];
387: /* multiply by the inverse of the block diagonal */
388: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
389: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
390: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
391: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
392: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
393: v -= bs2;
395: vi = aj + diag[i] - 1;
396: nz = diag[i] - diag[i+1] - 1;
397: for(j=0;j>-nz;j--){
398: oidx = bs*vi[j];
399: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
400: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
401: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
402: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
403: v -= bs2;
404: }
405: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4;
406: idx += bs;
407: }
408: /* backward solve the L^T */
409: for (i=n-1; i>=0; i--){
410: v = aa + bs2*ai[i];
411: vi = aj + ai[i];
412: nz = ai[i+1] - ai[i];
413: idt = bs*i;
414: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];
415: for(j=0;j<nz;j++){
416: idx = bs*vi[j];
417: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
418: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
419: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
420: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
421: v += bs2;
422: }
423: }
424: VecRestoreArray(bb,(PetscScalar**)&b);
425: VecRestoreArray(xx,&x);
426: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
427: return(0);
428: }
432: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433: {
434: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
435: PetscErrorCode ierr;
436: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437: PetscInt i,nz,idx,idt,oidx;
438: const MatScalar *aa=a->a,*v;
439: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440: const PetscScalar *b;
443: VecCopy(bb,xx);
444: VecGetArray(bb,(PetscScalar**)&b);
445: VecGetArray(xx,&x);
447: /* forward solve the U^T */
448: idx = 0;
449: for (i=0; i<n; i++) {
451: v = aa + 25*diag[i];
452: /* multiply by the inverse of the block diagonal */
453: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
455: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
456: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459: v += 25;
461: vi = aj + diag[i] + 1;
462: nz = ai[i+1] - diag[i] - 1;
463: while (nz--) {
464: oidx = 5*(*vi++);
465: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
466: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
467: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470: v += 25;
471: }
472: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473: idx += 5;
474: }
475: /* backward solve the L^T */
476: for (i=n-1; i>=0; i--){
477: v = aa + 25*diag[i] - 25;
478: vi = aj + diag[i] - 1;
479: nz = diag[i] - ai[i];
480: idt = 5*i;
481: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482: while (nz--) {
483: idx = 5*(*vi--);
484: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
485: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
486: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489: v -= 25;
490: }
491: }
492: VecRestoreArray(bb,(PetscScalar**)&b);
493: VecRestoreArray(xx,&x);
494: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
495: return(0);
496: }
500: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501: {
502: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
504: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505: PetscInt nz,idx,idt,j,i,oidx;
506: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
507: const MatScalar *aa=a->a,*v;
508: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509: const PetscScalar *b;
512: VecCopy(bb,xx);
513: VecGetArray(bb,(PetscScalar**)&b);
514: VecGetArray(xx,&x);
516: /* forward solve the U^T */
517: idx = 0;
518: for (i=0; i<n; i++) {
519: v = aa + bs2*diag[i];
520: /* multiply by the inverse of the block diagonal */
521: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
522: x5 = x[4+idx];
523: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
524: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
525: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
526: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
527: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
528: v -= bs2;
530: vi = aj + diag[i] - 1;
531: nz = diag[i] - diag[i+1] - 1;
532: for(j=0;j>-nz;j--){
533: oidx = bs*vi[j];
534: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
535: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
536: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
537: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
538: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
539: v -= bs2;
540: }
541: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
542: idx += bs;
543: }
544: /* backward solve the L^T */
545: for (i=n-1; i>=0; i--){
546: v = aa + bs2*ai[i];
547: vi = aj + ai[i];
548: nz = ai[i+1] - ai[i];
549: idt = bs*i;
550: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
551: for(j=0;j<nz;j++){
552: idx = bs*vi[j];
553: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
554: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
555: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
556: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
557: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
558: v += bs2;
559: }
560: }
561: VecRestoreArray(bb,(PetscScalar**)&b);
562: VecRestoreArray(xx,&x);
563: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
564: return(0);
565: }
569: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570: {
571: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
572: PetscErrorCode ierr;
573: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574: PetscInt i,nz,idx,idt,oidx;
575: const MatScalar *aa=a->a,*v;
576: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577: const PetscScalar *b;
580: VecCopy(bb,xx);
581: VecGetArray(bb,(PetscScalar**)&b);
582: VecGetArray(xx,&x);
584: /* forward solve the U^T */
585: idx = 0;
586: for (i=0; i<n; i++) {
588: v = aa + 36*diag[i];
589: /* multiply by the inverse of the block diagonal */
590: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591: x6 = x[5+idx];
592: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
593: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
594: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598: v += 36;
600: vi = aj + diag[i] + 1;
601: nz = ai[i+1] - diag[i] - 1;
602: while (nz--) {
603: oidx = 6*(*vi++);
604: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
605: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
606: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610: v += 36;
611: }
612: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613: x[5+idx] = s6;
614: idx += 6;
615: }
616: /* backward solve the L^T */
617: for (i=n-1; i>=0; i--){
618: v = aa + 36*diag[i] - 36;
619: vi = aj + diag[i] - 1;
620: nz = diag[i] - ai[i];
621: idt = 6*i;
622: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623: s6 = x[5+idt];
624: while (nz--) {
625: idx = 6*(*vi--);
626: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
627: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
628: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632: v -= 36;
633: }
634: }
635: VecRestoreArray(bb,(PetscScalar**)&b);
636: VecRestoreArray(xx,&x);
637: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
638: return(0);
639: }
643: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644: {
645: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
646: PetscErrorCode ierr;
647: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648: PetscInt nz,idx,idt,j,i,oidx;
649: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
650: const MatScalar *aa=a->a,*v;
651: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652: const PetscScalar *b;
655: VecCopy(bb,xx);
656: VecGetArray(bb,(PetscScalar**)&b);
657: VecGetArray(xx,&x);
659: /* forward solve the U^T */
660: idx = 0;
661: for (i=0; i<n; i++) {
662: v = aa + bs2*diag[i];
663: /* multiply by the inverse of the block diagonal */
664: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
665: x5 = x[4+idx]; x6 = x[5+idx];
666: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
667: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
668: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672: v -= bs2;
674: vi = aj + diag[i] - 1;
675: nz = diag[i] - diag[i+1] - 1;
676: for(j=0;j>-nz;j--){
677: oidx = bs*vi[j];
678: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
679: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
680: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684: v -= bs2;
685: }
686: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
687: x[5+idx] = s6;
688: idx += bs;
689: }
690: /* backward solve the L^T */
691: for (i=n-1; i>=0; i--){
692: v = aa + bs2*ai[i];
693: vi = aj + ai[i];
694: nz = ai[i+1] - ai[i];
695: idt = bs*i;
696: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
697: s6 = x[5+idt];
698: for(j=0;j<nz;j++){
699: idx = bs*vi[j];
700: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
701: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
702: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706: v += bs2;
707: }
708: }
709: VecRestoreArray(bb,(PetscScalar**)&b);
710: VecRestoreArray(xx,&x);
711: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
712: return(0);
713: }
717: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718: {
719: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
720: PetscErrorCode ierr;
721: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722: PetscInt i,nz,idx,idt,oidx;
723: const MatScalar *aa=a->a,*v;
724: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725: const PetscScalar *b;
728: VecCopy(bb,xx);
729: VecGetArray(bb,(PetscScalar**)&b);
730: VecGetArray(xx,&x);
732: /* forward solve the U^T */
733: idx = 0;
734: for (i=0; i<n; i++) {
736: v = aa + 49*diag[i];
737: /* multiply by the inverse of the block diagonal */
738: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739: x6 = x[5+idx]; x7 = x[6+idx];
740: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
741: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747: v += 49;
749: vi = aj + diag[i] + 1;
750: nz = ai[i+1] - diag[i] - 1;
751: while (nz--) {
752: oidx = 7*(*vi++);
753: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
754: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760: v += 49;
761: }
762: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763: x[5+idx] = s6;x[6+idx] = s7;
764: idx += 7;
765: }
766: /* backward solve the L^T */
767: for (i=n-1; i>=0; i--){
768: v = aa + 49*diag[i] - 49;
769: vi = aj + diag[i] - 1;
770: nz = diag[i] - ai[i];
771: idt = 7*i;
772: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773: s6 = x[5+idt];s7 = x[6+idt];
774: while (nz--) {
775: idx = 7*(*vi--);
776: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
777: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783: v -= 49;
784: }
785: }
786: VecRestoreArray(bb,(PetscScalar**)&b);
787: VecRestoreArray(xx,&x);
788: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
789: return(0);
790: }
793: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794: {
795: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
796: PetscErrorCode ierr;
797: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798: PetscInt nz,idx,idt,j,i,oidx;
799: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
800: const MatScalar *aa=a->a,*v;
801: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802: const PetscScalar *b;
805: VecCopy(bb,xx);
806: VecGetArray(bb,(PetscScalar**)&b);
807: VecGetArray(xx,&x);
809: /* forward solve the U^T */
810: idx = 0;
811: for (i=0; i<n; i++) {
812: v = aa + bs2*diag[i];
813: /* multiply by the inverse of the block diagonal */
814: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
815: x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx];
816: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
817: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823: v -= bs2;
824: vi = aj + diag[i] - 1;
825: nz = diag[i] - diag[i+1] - 1;
826: for(j=0;j>-nz;j--){
827: oidx = bs*vi[j];
828: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
829: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835: v -= bs2;
836: }
837: x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5;
838: x[5+idx] = s6; x[6+idx] = s7;
839: idx += bs;
840: }
841: /* backward solve the L^T */
842: for (i=n-1; i>=0; i--){
843: v = aa + bs2*ai[i];
844: vi = aj + ai[i];
845: nz = ai[i+1] - ai[i];
846: idt = bs*i;
847: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
848: s6 = x[5+idt]; s7 = x[6+idt];
849: for(j=0;j<nz;j++){
850: idx = bs*vi[j];
851: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
852: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858: v += bs2;
859: }
860: }
861: VecRestoreArray(bb,(PetscScalar**)&b);
862: VecRestoreArray(xx,&x);
863: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
864: return(0);
865: }
867: /*---------------------------------------------------------------------------------------------*/
870: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871: {
872: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
873: IS iscol=a->col,isrow=a->row;
874: PetscErrorCode ierr;
875: const PetscInt *r,*c,*rout,*cout;
876: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877: PetscInt i,nz;
878: const MatScalar *aa=a->a,*v;
879: PetscScalar s1,*x,*t;
880: const PetscScalar *b;
883: VecGetArray(bb,(PetscScalar**)&b);
884: VecGetArray(xx,&x);
885: t = a->solve_work;
887: ISGetIndices(isrow,&rout); r = rout;
888: ISGetIndices(iscol,&cout); c = cout;
890: /* copy the b into temp work space according to permutation */
891: for (i=0; i<n; i++) {
892: t[i] = b[c[i]];
893: }
895: /* forward solve the U^T */
896: for (i=0; i<n; i++) {
898: v = aa + diag[i];
899: /* multiply by the inverse of the block diagonal */
900: s1 = (*v++)*t[i];
901: vi = aj + diag[i] + 1;
902: nz = ai[i+1] - diag[i] - 1;
903: while (nz--) {
904: t[*vi++] -= (*v++)*s1;
905: }
906: t[i] = s1;
907: }
908: /* backward solve the L^T */
909: for (i=n-1; i>=0; i--){
910: v = aa + diag[i] - 1;
911: vi = aj + diag[i] - 1;
912: nz = diag[i] - ai[i];
913: s1 = t[i];
914: while (nz--) {
915: t[*vi--] -= (*v--)*s1;
916: }
917: }
919: /* copy t into x according to permutation */
920: for (i=0; i<n; i++) {
921: x[r[i]] = t[i];
922: }
924: ISRestoreIndices(isrow,&rout);
925: ISRestoreIndices(iscol,&cout);
926: VecRestoreArray(bb,(PetscScalar**)&b);
927: VecRestoreArray(xx,&x);
928: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
929: return(0);
930: }
934: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935: {
936: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
937: IS iscol=a->col,isrow=a->row;
938: PetscErrorCode ierr;
939: const PetscInt *r,*c,*rout,*cout;
940: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
942: const MatScalar *aa=a->a,*v;
943: PetscScalar s1,s2,x1,x2,*x,*t;
944: const PetscScalar *b;
947: VecGetArray(bb,(PetscScalar**)&b);
948: VecGetArray(xx,&x);
949: t = a->solve_work;
951: ISGetIndices(isrow,&rout); r = rout;
952: ISGetIndices(iscol,&cout); c = cout;
954: /* copy the b into temp work space according to permutation */
955: ii = 0;
956: for (i=0; i<n; i++) {
957: ic = 2*c[i];
958: t[ii] = b[ic];
959: t[ii+1] = b[ic+1];
960: ii += 2;
961: }
963: /* forward solve the U^T */
964: idx = 0;
965: for (i=0; i<n; i++) {
967: v = aa + 4*diag[i];
968: /* multiply by the inverse of the block diagonal */
969: x1 = t[idx]; x2 = t[1+idx];
970: s1 = v[0]*x1 + v[1]*x2;
971: s2 = v[2]*x1 + v[3]*x2;
972: v += 4;
974: vi = aj + diag[i] + 1;
975: nz = ai[i+1] - diag[i] - 1;
976: while (nz--) {
977: oidx = 2*(*vi++);
978: t[oidx] -= v[0]*s1 + v[1]*s2;
979: t[oidx+1] -= v[2]*s1 + v[3]*s2;
980: v += 4;
981: }
982: t[idx] = s1;t[1+idx] = s2;
983: idx += 2;
984: }
985: /* backward solve the L^T */
986: for (i=n-1; i>=0; i--){
987: v = aa + 4*diag[i] - 4;
988: vi = aj + diag[i] - 1;
989: nz = diag[i] - ai[i];
990: idt = 2*i;
991: s1 = t[idt]; s2 = t[1+idt];
992: while (nz--) {
993: idx = 2*(*vi--);
994: t[idx] -= v[0]*s1 + v[1]*s2;
995: t[idx+1] -= v[2]*s1 + v[3]*s2;
996: v -= 4;
997: }
998: }
1000: /* copy t into x according to permutation */
1001: ii = 0;
1002: for (i=0; i<n; i++) {
1003: ir = 2*r[i];
1004: x[ir] = t[ii];
1005: x[ir+1] = t[ii+1];
1006: ii += 2;
1007: }
1009: ISRestoreIndices(isrow,&rout);
1010: ISRestoreIndices(iscol,&cout);
1011: VecRestoreArray(bb,(PetscScalar**)&b);
1012: VecRestoreArray(xx,&x);
1013: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
1014: return(0);
1015: }
1019: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1020: {
1021: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1022: PetscErrorCode ierr;
1023: IS iscol=a->col,isrow=a->row;
1024: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1025: const PetscInt *r,*c,*rout,*cout;
1026: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1027: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1028: const MatScalar *aa=a->a,*v;
1029: PetscScalar s1,s2,x1,x2,*x,*t;
1030: const PetscScalar *b;
1033: VecGetArray(bb,(PetscScalar**)&b);
1034: VecGetArray(xx,&x);
1035: t = a->solve_work;
1037: ISGetIndices(isrow,&rout); r = rout;
1038: ISGetIndices(iscol,&cout); c = cout;
1040: /* copy b into temp work space according to permutation */
1041: for(i=0;i<n;i++){
1042: ii = bs*i; ic = bs*c[i];
1043: t[ii] = b[ic]; t[ii+1] = b[ic+1];
1044: }
1046: /* forward solve the U^T */
1047: idx = 0;
1048: for (i=0; i<n; i++) {
1049: v = aa + bs2*diag[i];
1050: /* multiply by the inverse of the block diagonal */
1051: x1 = t[idx]; x2 = t[1+idx];
1052: s1 = v[0]*x1 + v[1]*x2;
1053: s2 = v[2]*x1 + v[3]*x2;
1054: v -= bs2;
1056: vi = aj + diag[i] - 1;
1057: nz = diag[i] - diag[i+1] - 1;
1058: for(j=0;j>-nz;j--){
1059: oidx = bs*vi[j];
1060: t[oidx] -= v[0]*s1 + v[1]*s2;
1061: t[oidx+1] -= v[2]*s1 + v[3]*s2;
1062: v -= bs2;
1063: }
1064: t[idx] = s1;t[1+idx] = s2;
1065: idx += bs;
1066: }
1067: /* backward solve the L^T */
1068: for (i=n-1; i>=0; i--){
1069: v = aa + bs2*ai[i];
1070: vi = aj + ai[i];
1071: nz = ai[i+1] - ai[i];
1072: idt = bs*i;
1073: s1 = t[idt]; s2 = t[1+idt];
1074: for(j=0;j<nz;j++){
1075: idx = bs*vi[j];
1076: t[idx] -= v[0]*s1 + v[1]*s2;
1077: t[idx+1] -= v[2]*s1 + v[3]*s2;
1078: v += bs2;
1079: }
1080: }
1082: /* copy t into x according to permutation */
1083: for(i=0;i<n;i++){
1084: ii = bs*i; ir = bs*r[i];
1085: x[ir] = t[ii]; x[ir+1] = t[ii+1];
1086: }
1088: ISRestoreIndices(isrow,&rout);
1089: ISRestoreIndices(iscol,&cout);
1090: VecRestoreArray(bb,(PetscScalar**)&b);
1091: VecRestoreArray(xx,&x);
1092: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1093: return(0);
1094: }
1098: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099: {
1100: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1101: IS iscol=a->col,isrow=a->row;
1102: PetscErrorCode ierr;
1103: const PetscInt *r,*c,*rout,*cout;
1104: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1106: const MatScalar *aa=a->a,*v;
1107: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1108: const PetscScalar *b;
1111: VecGetArray(bb,(PetscScalar**)&b);
1112: VecGetArray(xx,&x);
1113: t = a->solve_work;
1115: ISGetIndices(isrow,&rout); r = rout;
1116: ISGetIndices(iscol,&cout); c = cout;
1118: /* copy the b into temp work space according to permutation */
1119: ii = 0;
1120: for (i=0; i<n; i++) {
1121: ic = 3*c[i];
1122: t[ii] = b[ic];
1123: t[ii+1] = b[ic+1];
1124: t[ii+2] = b[ic+2];
1125: ii += 3;
1126: }
1128: /* forward solve the U^T */
1129: idx = 0;
1130: for (i=0; i<n; i++) {
1132: v = aa + 9*diag[i];
1133: /* multiply by the inverse of the block diagonal */
1134: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1135: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1136: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1137: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1138: v += 9;
1140: vi = aj + diag[i] + 1;
1141: nz = ai[i+1] - diag[i] - 1;
1142: while (nz--) {
1143: oidx = 3*(*vi++);
1144: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1145: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1146: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147: v += 9;
1148: }
1149: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1150: idx += 3;
1151: }
1152: /* backward solve the L^T */
1153: for (i=n-1; i>=0; i--){
1154: v = aa + 9*diag[i] - 9;
1155: vi = aj + diag[i] - 1;
1156: nz = diag[i] - ai[i];
1157: idt = 3*i;
1158: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1159: while (nz--) {
1160: idx = 3*(*vi--);
1161: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1162: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1163: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164: v -= 9;
1165: }
1166: }
1168: /* copy t into x according to permutation */
1169: ii = 0;
1170: for (i=0; i<n; i++) {
1171: ir = 3*r[i];
1172: x[ir] = t[ii];
1173: x[ir+1] = t[ii+1];
1174: x[ir+2] = t[ii+2];
1175: ii += 3;
1176: }
1178: ISRestoreIndices(isrow,&rout);
1179: ISRestoreIndices(iscol,&cout);
1180: VecRestoreArray(bb,(PetscScalar**)&b);
1181: VecRestoreArray(xx,&x);
1182: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
1183: return(0);
1184: }
1188: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1189: {
1190: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1191: PetscErrorCode ierr;
1192: IS iscol=a->col,isrow=a->row;
1193: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1194: const PetscInt *r,*c,*rout,*cout;
1195: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1196: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1197: const MatScalar *aa=a->a,*v;
1198: PetscScalar s1,s2,s3,x1,x2,x3,*x,*t;
1199: const PetscScalar *b;
1202: VecGetArray(bb,(PetscScalar**)&b);
1203: VecGetArray(xx,&x);
1204: t = a->solve_work;
1206: ISGetIndices(isrow,&rout); r = rout;
1207: ISGetIndices(iscol,&cout); c = cout;
1209: /* copy b into temp work space according to permutation */
1210: for(i=0;i<n;i++){
1211: ii = bs*i; ic = bs*c[i];
1212: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1213: }
1215: /* forward solve the U^T */
1216: idx = 0;
1217: for (i=0; i<n; i++) {
1218: v = aa + bs2*diag[i];
1219: /* multiply by the inverse of the block diagonal */
1220: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1221: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
1222: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
1223: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
1224: v -= bs2;
1226: vi = aj + diag[i] - 1;
1227: nz = diag[i] - diag[i+1] - 1;
1228: for(j=0;j>-nz;j--){
1229: oidx = bs*vi[j];
1230: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1231: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1232: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1233: v -= bs2;
1234: }
1235: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
1236: idx += bs;
1237: }
1238: /* backward solve the L^T */
1239: for (i=n-1; i>=0; i--){
1240: v = aa + bs2*ai[i];
1241: vi = aj + ai[i];
1242: nz = ai[i+1] - ai[i];
1243: idt = bs*i;
1244: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
1245: for(j=0;j<nz;j++){
1246: idx = bs*vi[j];
1247: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
1248: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
1249: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1250: v += bs2;
1251: }
1252: }
1254: /* copy t into x according to permutation */
1255: for(i=0;i<n;i++){
1256: ii = bs*i; ir = bs*r[i];
1257: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1258: }
1260: ISRestoreIndices(isrow,&rout);
1261: ISRestoreIndices(iscol,&cout);
1262: VecRestoreArray(bb,(PetscScalar**)&b);
1263: VecRestoreArray(xx,&x);
1264: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1265: return(0);
1266: }
1270: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271: {
1272: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1273: IS iscol=a->col,isrow=a->row;
1274: PetscErrorCode ierr;
1275: const PetscInt *r,*c,*rout,*cout;
1276: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1278: const MatScalar *aa=a->a,*v;
1279: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280: const PetscScalar *b;
1283: VecGetArray(bb,(PetscScalar**)&b);
1284: VecGetArray(xx,&x);
1285: t = a->solve_work;
1287: ISGetIndices(isrow,&rout); r = rout;
1288: ISGetIndices(iscol,&cout); c = cout;
1290: /* copy the b into temp work space according to permutation */
1291: ii = 0;
1292: for (i=0; i<n; i++) {
1293: ic = 4*c[i];
1294: t[ii] = b[ic];
1295: t[ii+1] = b[ic+1];
1296: t[ii+2] = b[ic+2];
1297: t[ii+3] = b[ic+3];
1298: ii += 4;
1299: }
1301: /* forward solve the U^T */
1302: idx = 0;
1303: for (i=0; i<n; i++) {
1305: v = aa + 16*diag[i];
1306: /* multiply by the inverse of the block diagonal */
1307: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1308: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1309: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1310: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1311: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312: v += 16;
1314: vi = aj + diag[i] + 1;
1315: nz = ai[i+1] - diag[i] - 1;
1316: while (nz--) {
1317: oidx = 4*(*vi++);
1318: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1319: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1320: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322: v += 16;
1323: }
1324: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325: idx += 4;
1326: }
1327: /* backward solve the L^T */
1328: for (i=n-1; i>=0; i--){
1329: v = aa + 16*diag[i] - 16;
1330: vi = aj + diag[i] - 1;
1331: nz = diag[i] - ai[i];
1332: idt = 4*i;
1333: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334: while (nz--) {
1335: idx = 4*(*vi--);
1336: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1337: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1338: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340: v -= 16;
1341: }
1342: }
1344: /* copy t into x according to permutation */
1345: ii = 0;
1346: for (i=0; i<n; i++) {
1347: ir = 4*r[i];
1348: x[ir] = t[ii];
1349: x[ir+1] = t[ii+1];
1350: x[ir+2] = t[ii+2];
1351: x[ir+3] = t[ii+3];
1352: ii += 4;
1353: }
1355: ISRestoreIndices(isrow,&rout);
1356: ISRestoreIndices(iscol,&cout);
1357: VecRestoreArray(bb,(PetscScalar**)&b);
1358: VecRestoreArray(xx,&x);
1359: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
1360: return(0);
1361: }
1365: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1366: {
1367: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1368: PetscErrorCode ierr;
1369: IS iscol=a->col,isrow=a->row;
1370: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1371: const PetscInt *r,*c,*rout,*cout;
1372: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1373: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1374: const MatScalar *aa=a->a,*v;
1375: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376: const PetscScalar *b;
1379: VecGetArray(bb,(PetscScalar**)&b);
1380: VecGetArray(xx,&x);
1381: t = a->solve_work;
1383: ISGetIndices(isrow,&rout); r = rout;
1384: ISGetIndices(iscol,&cout); c = cout;
1386: /* copy b into temp work space according to permutation */
1387: for(i=0;i<n;i++){
1388: ii = bs*i; ic = bs*c[i];
1389: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1390: }
1392: /* forward solve the U^T */
1393: idx = 0;
1394: for (i=0; i<n; i++) {
1395: v = aa + bs2*diag[i];
1396: /* multiply by the inverse of the block diagonal */
1397: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
1398: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1399: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1400: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1401: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1402: v -= bs2;
1404: vi = aj + diag[i] - 1;
1405: nz = diag[i] - diag[i+1] - 1;
1406: for(j=0;j>-nz;j--){
1407: oidx = bs*vi[j];
1408: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1409: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1410: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1411: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1412: v -= bs2;
1413: }
1414: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
1415: idx += bs;
1416: }
1417: /* backward solve the L^T */
1418: for (i=n-1; i>=0; i--){
1419: v = aa + bs2*ai[i];
1420: vi = aj + ai[i];
1421: nz = ai[i+1] - ai[i];
1422: idt = bs*i;
1423: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
1424: for(j=0;j<nz;j++){
1425: idx = bs*vi[j];
1426: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
1427: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
1428: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1429: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1430: v += bs2;
1431: }
1432: }
1434: /* copy t into x according to permutation */
1435: for(i=0;i<n;i++){
1436: ii = bs*i; ir = bs*r[i];
1437: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1438: }
1440: ISRestoreIndices(isrow,&rout);
1441: ISRestoreIndices(iscol,&cout);
1442: VecRestoreArray(bb,(PetscScalar**)&b);
1443: VecRestoreArray(xx,&x);
1444: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1445: return(0);
1446: }
1450: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451: {
1452: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1453: IS iscol=a->col,isrow=a->row;
1454: PetscErrorCode ierr;
1455: const PetscInt *r,*c,*rout,*cout;
1456: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1458: const MatScalar *aa=a->a,*v;
1459: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460: const PetscScalar *b;
1463: VecGetArray(bb,(PetscScalar**)&b);
1464: VecGetArray(xx,&x);
1465: t = a->solve_work;
1467: ISGetIndices(isrow,&rout); r = rout;
1468: ISGetIndices(iscol,&cout); c = cout;
1470: /* copy the b into temp work space according to permutation */
1471: ii = 0;
1472: for (i=0; i<n; i++) {
1473: ic = 5*c[i];
1474: t[ii] = b[ic];
1475: t[ii+1] = b[ic+1];
1476: t[ii+2] = b[ic+2];
1477: t[ii+3] = b[ic+3];
1478: t[ii+4] = b[ic+4];
1479: ii += 5;
1480: }
1482: /* forward solve the U^T */
1483: idx = 0;
1484: for (i=0; i<n; i++) {
1486: v = aa + 25*diag[i];
1487: /* multiply by the inverse of the block diagonal */
1488: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1490: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1491: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494: v += 25;
1496: vi = aj + diag[i] + 1;
1497: nz = ai[i+1] - diag[i] - 1;
1498: while (nz--) {
1499: oidx = 5*(*vi++);
1500: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1501: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1502: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505: v += 25;
1506: }
1507: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508: idx += 5;
1509: }
1510: /* backward solve the L^T */
1511: for (i=n-1; i>=0; i--){
1512: v = aa + 25*diag[i] - 25;
1513: vi = aj + diag[i] - 1;
1514: nz = diag[i] - ai[i];
1515: idt = 5*i;
1516: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517: while (nz--) {
1518: idx = 5*(*vi--);
1519: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1520: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1521: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524: v -= 25;
1525: }
1526: }
1528: /* copy t into x according to permutation */
1529: ii = 0;
1530: for (i=0; i<n; i++) {
1531: ir = 5*r[i];
1532: x[ir] = t[ii];
1533: x[ir+1] = t[ii+1];
1534: x[ir+2] = t[ii+2];
1535: x[ir+3] = t[ii+3];
1536: x[ir+4] = t[ii+4];
1537: ii += 5;
1538: }
1540: ISRestoreIndices(isrow,&rout);
1541: ISRestoreIndices(iscol,&cout);
1542: VecRestoreArray(bb,(PetscScalar**)&b);
1543: VecRestoreArray(xx,&x);
1544: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
1545: return(0);
1546: }
1550: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1551: {
1552: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1553: PetscErrorCode ierr;
1554: IS iscol=a->col,isrow=a->row;
1555: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1556: const PetscInt *r,*c,*rout,*cout;
1557: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1558: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1559: const MatScalar *aa=a->a,*v;
1560: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561: const PetscScalar *b;
1564: VecGetArray(bb,(PetscScalar**)&b);
1565: VecGetArray(xx,&x);
1566: t = a->solve_work;
1568: ISGetIndices(isrow,&rout); r = rout;
1569: ISGetIndices(iscol,&cout); c = cout;
1571: /* copy b into temp work space according to permutation */
1572: for(i=0;i<n;i++){
1573: ii = bs*i; ic = bs*c[i];
1574: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1575: t[ii+4] = b[ic+4];
1576: }
1578: /* forward solve the U^T */
1579: idx = 0;
1580: for (i=0; i<n; i++) {
1581: v = aa + bs2*diag[i];
1582: /* multiply by the inverse of the block diagonal */
1583: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1584: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1585: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1586: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1587: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1588: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1589: v -= bs2;
1591: vi = aj + diag[i] - 1;
1592: nz = diag[i] - diag[i+1] - 1;
1593: for(j=0;j>-nz;j--){
1594: oidx = bs*vi[j];
1595: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1596: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1597: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1598: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1599: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1600: v -= bs2;
1601: }
1602: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1603: idx += bs;
1604: }
1605: /* backward solve the L^T */
1606: for (i=n-1; i>=0; i--){
1607: v = aa + bs2*ai[i];
1608: vi = aj + ai[i];
1609: nz = ai[i+1] - ai[i];
1610: idt = bs*i;
1611: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1612: for(j=0;j<nz;j++){
1613: idx = bs*vi[j];
1614: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
1615: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
1616: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1617: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1618: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1619: v += bs2;
1620: }
1621: }
1623: /* copy t into x according to permutation */
1624: for(i=0;i<n;i++){
1625: ii = bs*i; ir = bs*r[i];
1626: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1627: x[ir+4] = t[ii+4];
1628: }
1630: ISRestoreIndices(isrow,&rout);
1631: ISRestoreIndices(iscol,&cout);
1632: VecRestoreArray(bb,(PetscScalar**)&b);
1633: VecRestoreArray(xx,&x);
1634: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1635: return(0);
1636: }
1640: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641: {
1642: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1643: IS iscol=a->col,isrow=a->row;
1644: PetscErrorCode ierr;
1645: const PetscInt *r,*c,*rout,*cout;
1646: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1648: const MatScalar *aa=a->a,*v;
1649: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650: const PetscScalar *b;
1653: VecGetArray(bb,(PetscScalar**)&b);
1654: VecGetArray(xx,&x);
1655: t = a->solve_work;
1657: ISGetIndices(isrow,&rout); r = rout;
1658: ISGetIndices(iscol,&cout); c = cout;
1660: /* copy the b into temp work space according to permutation */
1661: ii = 0;
1662: for (i=0; i<n; i++) {
1663: ic = 6*c[i];
1664: t[ii] = b[ic];
1665: t[ii+1] = b[ic+1];
1666: t[ii+2] = b[ic+2];
1667: t[ii+3] = b[ic+3];
1668: t[ii+4] = b[ic+4];
1669: t[ii+5] = b[ic+5];
1670: ii += 6;
1671: }
1673: /* forward solve the U^T */
1674: idx = 0;
1675: for (i=0; i<n; i++) {
1677: v = aa + 36*diag[i];
1678: /* multiply by the inverse of the block diagonal */
1679: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680: x6 = t[5+idx];
1681: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1682: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1683: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687: v += 36;
1689: vi = aj + diag[i] + 1;
1690: nz = ai[i+1] - diag[i] - 1;
1691: while (nz--) {
1692: oidx = 6*(*vi++);
1693: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1694: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1695: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699: v += 36;
1700: }
1701: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702: t[5+idx] = s6;
1703: idx += 6;
1704: }
1705: /* backward solve the L^T */
1706: for (i=n-1; i>=0; i--){
1707: v = aa + 36*diag[i] - 36;
1708: vi = aj + diag[i] - 1;
1709: nz = diag[i] - ai[i];
1710: idt = 6*i;
1711: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712: s6 = t[5+idt];
1713: while (nz--) {
1714: idx = 6*(*vi--);
1715: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1716: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1717: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721: v -= 36;
1722: }
1723: }
1725: /* copy t into x according to permutation */
1726: ii = 0;
1727: for (i=0; i<n; i++) {
1728: ir = 6*r[i];
1729: x[ir] = t[ii];
1730: x[ir+1] = t[ii+1];
1731: x[ir+2] = t[ii+2];
1732: x[ir+3] = t[ii+3];
1733: x[ir+4] = t[ii+4];
1734: x[ir+5] = t[ii+5];
1735: ii += 6;
1736: }
1738: ISRestoreIndices(isrow,&rout);
1739: ISRestoreIndices(iscol,&cout);
1740: VecRestoreArray(bb,(PetscScalar**)&b);
1741: VecRestoreArray(xx,&x);
1742: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
1743: return(0);
1744: }
1748: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1749: {
1750: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1751: PetscErrorCode ierr;
1752: IS iscol=a->col,isrow=a->row;
1753: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1754: const PetscInt *r,*c,*rout,*cout;
1755: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1756: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1757: const MatScalar *aa=a->a,*v;
1758: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759: const PetscScalar *b;
1762: VecGetArray(bb,(PetscScalar**)&b);
1763: VecGetArray(xx,&x);
1764: t = a->solve_work;
1766: ISGetIndices(isrow,&rout); r = rout;
1767: ISGetIndices(iscol,&cout); c = cout;
1769: /* copy b into temp work space according to permutation */
1770: for(i=0;i<n;i++){
1771: ii = bs*i; ic = bs*c[i];
1772: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1773: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5];
1774: }
1776: /* forward solve the U^T */
1777: idx = 0;
1778: for (i=0; i<n; i++) {
1779: v = aa + bs2*diag[i];
1780: /* multiply by the inverse of the block diagonal */
1781: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1782: x6 = t[5+idx];
1783: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
1784: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
1785: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1786: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1787: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1788: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1789: v -= bs2;
1791: vi = aj + diag[i] - 1;
1792: nz = diag[i] - diag[i+1] - 1;
1793: for(j=0;j>-nz;j--){
1794: oidx = bs*vi[j];
1795: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1796: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1797: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1798: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1799: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1800: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1801: v -= bs2;
1802: }
1803: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
1804: t[5+idx] = s6;
1805: idx += bs;
1806: }
1807: /* backward solve the L^T */
1808: for (i=n-1; i>=0; i--){
1809: v = aa + bs2*ai[i];
1810: vi = aj + ai[i];
1811: nz = ai[i+1] - ai[i];
1812: idt = bs*i;
1813: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
1814: s6 = t[5+idt];
1815: for(j=0;j<nz;j++){
1816: idx = bs*vi[j];
1817: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
1818: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
1819: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1820: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1821: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1822: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1823: v += bs2;
1824: }
1825: }
1827: /* copy t into x according to permutation */
1828: for(i=0;i<n;i++){
1829: ii = bs*i; ir = bs*r[i];
1830: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
1831: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5];
1832: }
1834: ISRestoreIndices(isrow,&rout);
1835: ISRestoreIndices(iscol,&cout);
1836: VecRestoreArray(bb,(PetscScalar**)&b);
1837: VecRestoreArray(xx,&x);
1838: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
1839: return(0);
1840: }
1844: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845: {
1846: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1847: IS iscol=a->col,isrow=a->row;
1848: PetscErrorCode ierr;
1849: const PetscInt *r,*c,*rout,*cout;
1850: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851: PetscInt i,nz,idx,idt,ii,ic,ir,oidx;
1852: const MatScalar *aa=a->a,*v;
1853: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854: const PetscScalar *b;
1857: VecGetArray(bb,(PetscScalar**)&b);
1858: VecGetArray(xx,&x);
1859: t = a->solve_work;
1861: ISGetIndices(isrow,&rout); r = rout;
1862: ISGetIndices(iscol,&cout); c = cout;
1864: /* copy the b into temp work space according to permutation */
1865: ii = 0;
1866: for (i=0; i<n; i++) {
1867: ic = 7*c[i];
1868: t[ii] = b[ic];
1869: t[ii+1] = b[ic+1];
1870: t[ii+2] = b[ic+2];
1871: t[ii+3] = b[ic+3];
1872: t[ii+4] = b[ic+4];
1873: t[ii+5] = b[ic+5];
1874: t[ii+6] = b[ic+6];
1875: ii += 7;
1876: }
1878: /* forward solve the U^T */
1879: idx = 0;
1880: for (i=0; i<n; i++) {
1882: v = aa + 49*diag[i];
1883: /* multiply by the inverse of the block diagonal */
1884: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885: x6 = t[5+idx]; x7 = t[6+idx];
1886: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1887: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893: v += 49;
1895: vi = aj + diag[i] + 1;
1896: nz = ai[i+1] - diag[i] - 1;
1897: while (nz--) {
1898: oidx = 7*(*vi++);
1899: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1900: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906: v += 49;
1907: }
1908: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909: t[5+idx] = s6;t[6+idx] = s7;
1910: idx += 7;
1911: }
1912: /* backward solve the L^T */
1913: for (i=n-1; i>=0; i--){
1914: v = aa + 49*diag[i] - 49;
1915: vi = aj + diag[i] - 1;
1916: nz = diag[i] - ai[i];
1917: idt = 7*i;
1918: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919: s6 = t[5+idt];s7 = t[6+idt];
1920: while (nz--) {
1921: idx = 7*(*vi--);
1922: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1923: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929: v -= 49;
1930: }
1931: }
1933: /* copy t into x according to permutation */
1934: ii = 0;
1935: for (i=0; i<n; i++) {
1936: ir = 7*r[i];
1937: x[ir] = t[ii];
1938: x[ir+1] = t[ii+1];
1939: x[ir+2] = t[ii+2];
1940: x[ir+3] = t[ii+3];
1941: x[ir+4] = t[ii+4];
1942: x[ir+5] = t[ii+5];
1943: x[ir+6] = t[ii+6];
1944: ii += 7;
1945: }
1947: ISRestoreIndices(isrow,&rout);
1948: ISRestoreIndices(iscol,&cout);
1949: VecRestoreArray(bb,(PetscScalar**)&b);
1950: VecRestoreArray(xx,&x);
1951: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
1952: return(0);
1953: }
/*
   MatSolveTranspose_SeqBAIJ_7 - Transposed triangular solve for blocks of size 7,
   with row and column permutations.

   A  - matrix; A->data is read as a Mat_SeqBAIJ providing the block arrays
        (a->a, a->i, a->j, a->diag), the index sets a->row / a->col and the
        scratch array a->solve_work
   bb - right-hand side (only read)
   xx - receives the result

   The work is done in the scratch array t: b is gathered into t through the
   column index set, t is swept forward with the U^T blocks and backward with
   the L^T blocks, and t is finally scattered into x through the row index set.
   The arithmetic is written out for 7 components per block row and 49 entries
   per block, although the strides are taken from bs and bs2.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   Vec/IS get/restore calls and of PetscLogFlops are not checked here.
*/
1956: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1957: {
1958: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1959: PetscErrorCode ierr;
1960: IS iscol=a->col,isrow=a->row;
1961: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1962: const PetscInt *r,*c,*rout,*cout;
1963: PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir;
1964: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
1965: const MatScalar *aa=a->a,*v;
1966: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967: const PetscScalar *b;
1970: VecGetArray(bb,(PetscScalar**)&b);
1971: VecGetArray(xx,&x);
/* t: scratch array in which the whole solve is carried out */
1972: t = a->solve_work;
1974: ISGetIndices(isrow,&rout); r = rout;
1975: ISGetIndices(iscol,&cout); c = cout;
1977: /* copy b into temp work space according to permutation */
/* block c[i] of b becomes block i of t */
1978: for(i=0;i<n;i++){
1979: ii = bs*i; ic = bs*c[i];
1980: t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1981: t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6];
1982: }
1984: /* forward solve the U^T */
1985: idx = 0;
1986: for (i=0; i<n; i++) {
/* v addresses the 49-entry block stored at block position diag[i] */
1987: v = aa + bs2*diag[i];
1988: /* multiply by the inverse of the block diagonal */
/* entries are consumed as v[7*k+m]; MatSolve_SeqBAIJ_7 in this file uses v[k+7*m] for the
   same product shape, i.e. the block is applied here with its two indices swapped */
1989: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1990: x6 = t[5+idx]; x7 = t[6+idx];
1991: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1992: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1993: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1994: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1995: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1996: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1997: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
/* step to the block stored immediately before position diag[i] */
1998: v -= bs2;
2000: vi = aj + diag[i] - 1;
/* the block count is the gap between consecutive diag[] entries, so diag[] is expected to
   decrease with i here (layout comes from the factorization, not visible in this function) */
2001: nz = diag[i] - diag[i+1] - 1;
/* j = 0,-1,...,-(nz-1): column indices and blocks are walked toward lower addresses */
2002: for(j=0;j>-nz;j--){
2003: oidx = bs*vi[j];
2004: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
2005: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2006: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2007: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2008: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2009: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2010: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2011: v -= bs2;
2012: }
/* block row i of t is overwritten with the scaled values s1..s7 */
2013: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
2014: t[5+idx] = s6; t[6+idx] = s7;
2015: idx += bs;
2016: }
2017: /* backward solve the L^T */
2018: for (i=n-1; i>=0; i--){
/* all blocks ai[i] .. ai[i+1]-1 of row i take part, visited in increasing order */
2019: v = aa + bs2*ai[i];
2020: vi = aj + ai[i];
2021: nz = ai[i+1] - ai[i];
2022: idt = bs*i;
2023: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
2024: s6 = t[5+idt]; s7 = t[6+idt];
2025: for(j=0;j<nz;j++){
/* block row i (s1..s7) updates the block of t selected by column index vi[j] */
2026: idx = bs*vi[j];
2027: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
2028: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2029: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2030: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2031: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2032: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2033: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2034: v += bs2;
2035: }
2036: }
2038: /* copy t into x according to permutation */
/* block i of t becomes block r[i] of x */
2039: for(i=0;i<n;i++){
2040: ii = bs*i; ir = bs*r[i];
2041: x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
2042: x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6];
2043: }
2045: ISRestoreIndices(isrow,&rout);
2046: ISRestoreIndices(iscol,&cout);
2047: VecRestoreArray(bb,(PetscScalar**)&b);
2048: VecRestoreArray(xx,&x);
/* report the operation count for this solve to the PETSc flop log */
2049: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2050: return(0);
2051: }
2053: /* ----------------------------------------------------------- */
/*
   MatSolve_SeqBAIJ_N_inplace - Triangular solve for an arbitrary block size bs,
   built on the Kernel_* block macros instead of unrolled arithmetic.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j,
        a->diag, index sets a->row / a->col, scratch a->solve_work)
   bb - right-hand side (only read)
   xx - receives the result

   Forward sweep: block row i of the scratch array t is loaded from b through
   the row index set and updated with the a->diag[i]-ai[i] blocks that precede
   position a->diag[i] in that row.  Backward sweep: block row i is copied to
   the extra scratch block ls, updated with the blocks that follow position
   a->diag[i], combined with the block at a->diag[i], and copied to x through
   the column index set, which is walked from its last entry to its first.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   Vec/IS/PetscMemcpy/PetscLogFlops calls are not checked here.
*/
2056: PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2057: {
2058: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2059: IS iscol=a->col,isrow=a->row;
2060: PetscErrorCode ierr;
2061: const PetscInt *r,*c,*rout,*cout;
2062: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063: PetscInt i,nz;
2064: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
2065: const MatScalar *aa=a->a,*v;
2066: PetscScalar *x,*s,*t,*ls;
2067: const PetscScalar *b;
2070: VecGetArray(bb,(PetscScalar**)&b);
2071: VecGetArray(xx,&x);
2072: t = a->solve_work;
2074: ISGetIndices(isrow,&rout); r = rout;
/* c starts at the last column index because the backward sweep consumes it with *c-- */
2075: ISGetIndices(iscol,&cout); c = cout + (n-1);
2077: /* forward solve the lower triangular */
/* block row 0 is only copied: the update loop below starts at i=1 */
2078: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
2079: for (i=1; i<n; i++) {
2080: v = aa + bs2*ai[i];
2081: vi = aj + ai[i];
/* number of blocks stored before position a->diag[i] in row i */
2082: nz = a->diag[i] - ai[i];
2083: s = t + bs*i;
2084: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
2085: while (nz--) {
/* going by the macro name: s -= (block at v) * (block *vi of t); macro is defined outside this chunk */
2086: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2087: v += bs2;
2088: }
2089: }
2090: /* backward solve the upper triangular */
/* ls: scratch block placed A->cmap->n scalars past the start of solve_work
   (assumes solve_work was allocated with room for it -- allocation is not visible here) */
2091: ls = a->solve_work + A->cmap->n;
2092: for (i=n-1; i>=0; i--){
2093: v = aa + bs2*(a->diag[i] + 1);
2094: vi = aj + a->diag[i] + 1;
/* number of blocks stored after position a->diag[i] in row i */
2095: nz = ai[i+1] - a->diag[i] - 1;
2096: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2097: while (nz--) {
2098: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2099: v += bs2;
2100: }
/* presumably t_i = (block at a->diag[i]) * ls, going by the macro name -- confirm against its definition */
2101: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
/* finished block row i goes to the block of x named by the current column index */
2102: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
2103: }
2105: ISRestoreIndices(isrow,&rout);
2106: ISRestoreIndices(iscol,&cout);
2107: VecRestoreArray(bb,(PetscScalar**)&b);
2108: VecRestoreArray(xx,&x);
2109: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2110: return(0);
2111: }
2113: /* ----------------------------------------------------------- */
/*
   MatSolveTranspose_SeqBAIJ_N_inplace - Transposed triangular solve for an
   arbitrary block size bs, built on the Kernel_*transA* block macros.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j,
        a->diag, index sets a->row / a->col, scratch a->solve_work)
   bb - right-hand side (only read)
   xx - receives the result

   b is gathered into the scratch array t through the column index set.  The
   forward sweep handles, for each block row i, the block at a->diag[i] and
   then the blocks stored after it; the backward sweep handles the blocks
   stored before a->diag[i].  In both sweeps block row i of t is the source and
   the block of t named by the column index is the one updated.  t is finally
   scattered into x through the row index set.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   Vec/IS/PetscMemcpy/PetscLogFlops calls are not checked here.
*/
2116: PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2117: {
2118: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2119: IS iscol=a->col,isrow=a->row;
2120: PetscErrorCode ierr;
2121: const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122: PetscInt i,nz,j;
2123: const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
2124: const MatScalar *aa=a->a,*v;
2125: PetscScalar *x,*t,*ls;
2126: const PetscScalar *b;
2128: VecGetArray(bb,(PetscScalar**)&b);
2129: VecGetArray(xx,&x);
2130: t = a->solve_work;
2132: ISGetIndices(isrow,&rout); r = rout;
2133: ISGetIndices(iscol,&cout); c = cout;
2135: /* copy the b into temp work space according to permutation */
2136: for (i=0; i<n; i++) {
2137: for (j=0; j<bs; j++) {
2138: t[i*bs+j] = b[c[i]*bs+j];
2139: }
2140: }
2143: /* forward solve the upper triangular transpose */
/* ls: scratch block placed A->cmap->n scalars past the start of solve_work
   (assumes solve_work was allocated with room for it -- allocation is not visible here) */
2144: ls = a->solve_work + A->cmap->n;
2145: for (i=0; i<n; i++){
/* keep a copy of block row i in ls; the kernel on the next line takes ls as input and t+i*bs as
   the other vector argument (presumably the output, going by the macro name -- confirm) */
2146: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2147: Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2148: v = aa + bs2*(a->diag[i] + 1);
2149: vi = aj + a->diag[i] + 1;
/* number of blocks stored after position a->diag[i] in row i */
2150: nz = ai[i+1] - a->diag[i] - 1;
2151: while (nz--) {
/* the block of t named by *vi is the first vector argument, block row i the last */
2152: Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2153: v += bs2;
2154: }
2155: }
2157: /* backward solve the lower triangular transpose */
2158: for (i=n-1; i>=0; i--) {
2159: v = aa + bs2*ai[i];
2160: vi = aj + ai[i];
/* number of blocks stored before position a->diag[i] in row i */
2161: nz = a->diag[i] - ai[i];
2162: while (nz--) {
2163: Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2164: v += bs2;
2165: }
2166: }
2168: /* copy t into x according to permutation */
2169: for (i=0; i<n; i++) {
2170: for (j=0; j<bs; j++) {
2171: x[bs*r[i]+j] = t[bs*i+j];
2172: }
2173: }
2175: ISRestoreIndices(isrow,&rout);
2176: ISRestoreIndices(iscol,&cout);
2177: VecRestoreArray(bb,(PetscScalar**)&b);
2178: VecRestoreArray(xx,&x);
2179: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2180: return(0);
2181: }
/*
   MatSolveTranspose_SeqBAIJ_N - Transposed triangular solve for an arbitrary
   block size bs, built on the Kernel_*transA* block macros.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j,
        a->diag, index sets a->row / a->col, scratch a->solve_work)
   bb - right-hand side (only read)
   xx - receives the result

   Differs from MatSolveTranspose_SeqBAIJ_N_inplace in how the blocks are
   located: the forward sweep walks nz = diag[i]-diag[i+1]-1 blocks toward
   lower addresses starting just before position diag[i], and the backward
   sweep uses every block ai[i] .. ai[i+1]-1 of row i.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   Vec/IS/PetscMemcpy/PetscLogFlops calls are not checked here.
*/
2185: PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2186: {
2187: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2188: IS iscol=a->col,isrow=a->row;
2189: PetscErrorCode ierr;
2190: const PetscInt *r,*c,*rout,*cout;
2191: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192: PetscInt i,j,nz;
2193: const PetscInt bs=A->rmap->bs,bs2=a->bs2;
2194: const MatScalar *aa=a->a,*v;
2195: PetscScalar *x,*t,*ls;
2196: const PetscScalar *b;
2199: VecGetArray(bb,(PetscScalar**)&b);
2200: VecGetArray(xx,&x);
2201: t = a->solve_work;
2203: ISGetIndices(isrow,&rout); r = rout;
2204: ISGetIndices(iscol,&cout); c = cout;
2206: /* copy the b into temp work space according to permutation */
2207: for (i=0; i<n; i++) {
2208: for (j=0; j<bs; j++) {
2209: t[i*bs+j] = b[c[i]*bs+j];
2210: }
2211: }
2214: /* forward solve the upper triangular transpose */
/* ls: scratch block placed A->cmap->n scalars past the start of solve_work
   (assumes solve_work was allocated with room for it -- allocation is not visible here) */
2215: ls = a->solve_work + A->cmap->n;
2216: for (i=0; i<n; i++){
/* copy block row i to ls, then apply the block at diag[i] with ls as input and t+i*bs as the
   other vector argument (presumably the output, going by the macro name -- confirm) */
2217: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
2218: Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
/* start at the block stored just before position diag[i] */
2219: v = aa + bs2*(diag[i] - 1);
2220: vi = aj + diag[i] - 1;
/* block count taken from consecutive diag[] entries, so diag[] is expected to decrease with i
   (layout comes from the factorization, not visible in this function) */
2221: nz = diag[i] - diag[i+1] - 1;
/* j = 0,-1,...,-(nz-1): column indices and blocks are walked toward lower addresses */
2222: for(j=0;j>-nz;j--){
2223: Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2224: v -= bs2;
2225: }
2226: }
2228: /* backward solve the lower triangular transpose */
2229: for (i=n-1; i>=0; i--) {
2230: v = aa + bs2*ai[i];
2231: vi = aj + ai[i];
2232: nz = ai[i+1] - ai[i];
2233: for(j=0;j<nz;j++){
2234: Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2235: v += bs2;
2236: }
2237: }
2239: /* copy t into x according to permutation */
2240: for (i=0; i<n; i++) {
2241: for (j=0; j<bs; j++) {
2242: x[bs*r[i]+j] = t[bs*i+j];
2243: }
2244: }
2246: ISRestoreIndices(isrow,&rout);
2247: ISRestoreIndices(iscol,&cout);
2248: VecRestoreArray(bb,(PetscScalar**)&b);
2249: VecRestoreArray(xx,&x);
2250: PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);
2251: return(0);
2252: }
2254: /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */
/*
   MatSolve_SeqBAIJ_15_NaturalOrdering_ver2 - Triangular solve written out by
   hand for 15x15 blocks (225 entries per block); no index sets are used, so
   block i of b corresponds directly to block i of x.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j, a->diag)
   bb - right-hand side (only read)
   xx - receives the result; also serves as the working array during the solve

   In every block product the entry v[r+15*c] multiplies component c of the
   input block and contributes to component r of the output block.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   VecGetArray/VecRestoreArray/PetscLogFlops calls are not checked here.
   NOTE(review): x[0..14] and b[0..14] are accessed before the loop regardless of n.
*/
2258: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
2259: {
2260: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2261: PetscErrorCode ierr;
2262: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2263: PetscInt i,nz,idx,idt,m;
2264: const MatScalar *aa=a->a,*v;
2265: PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2266: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2267: PetscScalar *x;
2268: const PetscScalar *b;
2271: VecGetArray(bb,(PetscScalar**)&b);
2272: VecGetArray(xx,&x);
2274: /* forward solve the lower triangular */
/* block row 0 is only copied: the update loop below starts at i=1 */
2275: idx = 0;
2276: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx];
2277: x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx];
2278: x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
2280: for (i=1; i<n; i++) {
/* all blocks ai[i] .. ai[i+1]-1 of row i take part in the forward sweep */
2281: v = aa + bs2*ai[i];
2282: vi = aj + ai[i];
2283: nz = ai[i+1] - ai[i];
2284: idt = bs*i;
2285: s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt];
2286: s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt];
2287: s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
2288: for(m=0;m<nz;m++){
/* load the already-computed block of x named by column index vi[m] */
2289: idx = bs*vi[m];
2290: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2291: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
2292: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2294:
2295: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2296: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2297: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2298: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2299: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2300: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2301: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2302: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2303: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2304: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2305: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2306: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2307: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2308: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2309: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2310:
2311: v += bs2;
2312: }
2313: x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5;
2314: x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10;
2315: x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
2316:
2317: }
2318: /* backward solve the upper triangular */
2319: for (i=n-1; i>=0; i--){
/* blocks adiag[i+1]+1 .. adiag[i]-1 are the off-diagonal blocks used for row i */
2320: v = aa + bs2*(adiag[i+1]+1);
2321: vi = aj + adiag[i+1]+1;
2322: nz = adiag[i] - adiag[i+1] - 1;
2323: idt = bs*i;
2324: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt];
2325: s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt];
2326: s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
2327:
2328: for(m=0;m<nz;m++){
2329: idx = bs*vi[m];
2330: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2331: x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx];
2332: x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
2334: s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2335: s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2336: s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2337: s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2338: s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2339: s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2340: s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2341: s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2342: s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2343: s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2344: s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2345: s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2346: s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2347: s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2348: s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2350: v += bs2;
2351: }
/* after nz steps of bs2, v addresses the block at position adiag[i]; it is applied to s1..s15
   and the product replaces block row i of x */
2353: x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2354: x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2355: x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2356: x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2357: x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2358: x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2359: x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2360: x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2361: x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2362: x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2363: x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2364: x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2365: x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2366: x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2367: x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2369: }
2371: VecRestoreArray(bb,(PetscScalar**)&b);
2372: VecRestoreArray(xx,&x);
2373: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2374: return(0);
2375: }
2377: /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2378: /* Default MatSolve for block size 15 */
/*
   MatSolve_SeqBAIJ_15_NaturalOrdering_ver1 - Triangular solve for 15x15 blocks
   in which each block product is accumulated one block column at a time; no
   index sets are used, so block i of b corresponds directly to block i of x.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j, a->diag)
   bb - right-hand side (only read)
   xx - receives the result; also serves as the working array during the solve

   Inside every k loop v addresses 15 consecutive entries (one column of the
   current block) that are scaled by component k of the input block; v then
   advances by 15, so a complete block consumes 225 = bs2 entries.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   VecGetArray/VecRestoreArray/PetscMemzero/PetscLogFlops calls are not checked here.
*/
2382: PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
2383: {
2384: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2385: PetscErrorCode ierr;
2386: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
2387: PetscInt i,k,nz,kdx,idx,idt,m;
2388: const MatScalar *aa=a->a,*v;
2389: PetscScalar s[15];
2390: PetscScalar *x;
2391: const PetscScalar *b;
2394: VecGetArray(bb,(PetscScalar**)&b);
2395: VecGetArray(xx,&x);
2397: /* forward solve the lower triangular */
2398: for (i=0; i<n; i++) {
/* all blocks ai[i] .. ai[i+1]-1 of row i take part in the forward sweep */
2399: v = aa + bs2*ai[i];
2400: vi = aj + ai[i];
2401: nz = ai[i+1] - ai[i];
2402: idt = bs*i;
/* block row i of x starts as a copy of block row i of b and is updated in place */
2403: x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
2404: x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
2405: x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
2406: for(m=0;m<nz;m++){
2407: idx = bs*vi[m];
2408: for(k=0;k<15;k++){
/* component k of the block of x named by column index vi[m] scales block column k */
2409: kdx = k + idx;
2410: x[idt] -= v[0]*x[kdx];
2411: x[1+idt] -= v[1]*x[kdx];
2412: x[2+idt] -= v[2]*x[kdx];
2413: x[3+idt] -= v[3]*x[kdx];
2414: x[4+idt] -= v[4]*x[kdx];
2415: x[5+idt] -= v[5]*x[kdx];
2416: x[6+idt] -= v[6]*x[kdx];
2417: x[7+idt] -= v[7]*x[kdx];
2418: x[8+idt] -= v[8]*x[kdx];
2419: x[9+idt] -= v[9]*x[kdx];
2420: x[10+idt] -= v[10]*x[kdx];
2421: x[11+idt] -= v[11]*x[kdx];
2422: x[12+idt] -= v[12]*x[kdx];
2423: x[13+idt] -= v[13]*x[kdx];
2424: x[14+idt] -= v[14]*x[kdx];
2425: v += 15;
2426: }
2427: }
2428: }
2429: /* backward solve the upper triangular */
2430: for (i=n-1; i>=0; i--){
/* blocks adiag[i+1]+1 .. adiag[i]-1 are the off-diagonal blocks used for row i */
2431: v = aa + bs2*(adiag[i+1]+1);
2432: vi = aj + adiag[i+1]+1;
2433: nz = adiag[i] - adiag[i+1] - 1;
2434: idt = bs*i;
/* s holds block row i while x[idt..] is rebuilt below */
2435: s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
2436: s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
2437: s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
2438:
2439: for(m=0;m<nz;m++){
2440: idx = bs*vi[m];
2441: for(k=0;k<15;k++){
2442: kdx = k + idx;
2443: s[0] -= v[0]*x[kdx];
2444: s[1] -= v[1]*x[kdx];
2445: s[2] -= v[2]*x[kdx];
2446: s[3] -= v[3]*x[kdx];
2447: s[4] -= v[4]*x[kdx];
2448: s[5] -= v[5]*x[kdx];
2449: s[6] -= v[6]*x[kdx];
2450: s[7] -= v[7]*x[kdx];
2451: s[8] -= v[8]*x[kdx];
2452: s[9] -= v[9]*x[kdx];
2453: s[10] -= v[10]*x[kdx];
2454: s[11] -= v[11]*x[kdx];
2455: s[12] -= v[12]*x[kdx];
2456: s[13] -= v[13]*x[kdx];
2457: s[14] -= v[14]*x[kdx];
2458: v += 15;
2459: }
2460: }
/* x is an array of PetscScalar, so the byte count must use sizeof(PetscScalar); sizing the
   clear by MatScalar would leave part of the block unzeroed whenever the two types differ
   in size, and the accumulation below would then start from stale values */
2461: PetscMemzero(x+idt,bs*sizeof(PetscScalar));
/* v now addresses the block at position adiag[i]; accumulate (that block) * s into x, column by column */
2462: for(k=0;k<15;k++){
2463: x[idt] += v[0]*s[k];
2464: x[1+idt] += v[1]*s[k];
2465: x[2+idt] += v[2]*s[k];
2466: x[3+idt] += v[3]*s[k];
2467: x[4+idt] += v[4]*s[k];
2468: x[5+idt] += v[5]*s[k];
2469: x[6+idt] += v[6]*s[k];
2470: x[7+idt] += v[7]*s[k];
2471: x[8+idt] += v[8]*s[k];
2472: x[9+idt] += v[9]*s[k];
2473: x[10+idt] += v[10]*s[k];
2474: x[11+idt] += v[11]*s[k];
2475: x[12+idt] += v[12]*s[k];
2476: x[13+idt] += v[13]*s[k];
2477: x[14+idt] += v[14]*s[k];
2478: v += 15;
2479: }
2480: }
2481: VecRestoreArray(bb,(PetscScalar**)&b);
2482: VecRestoreArray(xx,&x);
2483: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2484: return(0);
2485: }
/*
   MatSolve_SeqBAIJ_7_inplace - Triangular solve written out by hand for 7x7
   blocks (49 entries per block), with row and column permutations.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j,
        a->diag, index sets a->row / a->col, scratch a->solve_work)
   bb - right-hand side (only read)
   xx - receives the result

   Forward sweep: block row i of the scratch array t is loaded from b through
   the row index set and updated with the diag[i]-ai[i] blocks stored before
   position diag[i].  Backward sweep: block row i is updated with the blocks
   stored after position diag[i], multiplied by the block at diag[i], and the
   result is stored both in t and, through the column index set (walked from
   its last entry to its first), in x.  In every block product the entry
   v[r+7*c] multiplies component c of the input block.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   Vec/IS get/restore calls and of PetscLogFlops are not checked here.
   NOTE(review): t[0..6] is filled from *r before the loop regardless of n.
*/
2490: PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2491: {
2492: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2493: IS iscol=a->col,isrow=a->row;
2494: PetscErrorCode ierr;
2495: const PetscInt *r,*c,*ai=a->i,*aj=a->j;
2496: const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2497: PetscInt i,nz,idx,idt,idc;
2498: const MatScalar *aa=a->a,*v;
2499: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2500: const PetscScalar *b;
2503: VecGetArray(bb,(PetscScalar**)&b);
2504: VecGetArray(xx,&x);
2505: t = a->solve_work;
2507: ISGetIndices(isrow,&rout); r = rout;
/* c starts at the last column index because the backward sweep consumes it with *c-- */
2508: ISGetIndices(iscol,&cout); c = cout + (n-1);
2510: /* forward solve the lower triangular */
/* block row 0 is only copied: the update loop below starts at i=1 */
2511: idx = 7*(*r++);
2512: t[0] = b[idx]; t[1] = b[1+idx];
2513: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2514: t[5] = b[5+idx]; t[6] = b[6+idx];
2516: for (i=1; i<n; i++) {
2517: v = aa + 49*ai[i];
2518: vi = aj + ai[i];
/* number of blocks stored before position diag[i] in row i */
2519: nz = diag[i] - ai[i];
2520: idx = 7*(*r++);
2521: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2522: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2523: while (nz--) {
/* load the already-computed block of t named by the current column index */
2524: idx = 7*(*vi++);
2525: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
2526: x4 = t[3+idx];x5 = t[4+idx];
2527: x6 = t[5+idx];x7 = t[6+idx];
2528: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2529: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2530: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2531: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2532: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2533: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2534: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2535: v += 49;
2536: }
2537: idx = 7*i;
2538: t[idx] = s1;t[1+idx] = s2;
2539: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2540: t[5+idx] = s6;t[6+idx] = s7;
2541: }
2542: /* backward solve the upper triangular */
2543: for (i=n-1; i>=0; i--){
/* start at the block stored just after position diag[i] */
2544: v = aa + 49*diag[i] + 49;
2545: vi = aj + diag[i] + 1;
2546: nz = ai[i+1] - diag[i] - 1;
2547: idt = 7*i;
2548: s1 = t[idt]; s2 = t[1+idt];
2549: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2550: s6 = t[5+idt];s7 = t[6+idt];
2551: while (nz--) {
2552: idx = 7*(*vi++);
2553: x1 = t[idx]; x2 = t[1+idx];
2554: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2555: x6 = t[5+idx]; x7 = t[6+idx];
2556: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2557: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2558: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2559: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2560: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2561: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2562: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2563: v += 49;
2564: }
/* apply the block at position diag[i]; the product is kept in t for later rows and also
   written to the block of x named by the current column index */
2565: idc = 7*(*c--);
2566: v = aa + 49*diag[i];
2567: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
2568: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2569: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2570: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2571: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2572: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2573: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2574: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2575: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2576: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2577: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2578: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2579: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2580: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2581: }
2583: ISRestoreIndices(isrow,&rout);
2584: ISRestoreIndices(iscol,&cout);
2585: VecRestoreArray(bb,(PetscScalar**)&b);
2586: VecRestoreArray(xx,&x);
2587: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2588: return(0);
2589: }
/*
   MatSolve_SeqBAIJ_7 - Triangular solve written out by hand for 7x7 blocks
   (49 entries per block), with row and column permutations.

   A  - matrix; A->data is read as a Mat_SeqBAIJ (block arrays a->a, a->i, a->j,
        a->diag, index sets a->row / a->col, scratch a->solve_work)
   bb - right-hand side (only read)
   xx - receives the result

   Differs from MatSolve_SeqBAIJ_7_inplace in how the blocks are located: the
   forward sweep uses every block ai[i] .. ai[i+1]-1 of row i, and the backward
   sweep uses blocks adiag[i+1]+1 .. adiag[i]-1 followed by the block at
   adiag[i].  The index sets are read by position (r[i], c[i]) rather than
   through moving pointers.

   Always returns 0.
   NOTE(review): ierr is declared but never assigned; the results of the
   Vec/IS get/restore calls and of PetscLogFlops are not checked here.
   NOTE(review): t[0..6] is filled from r[0] before the loop regardless of n.
*/
2593: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2594: {
2595: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2596: IS iscol=a->col,isrow=a->row;
2597: PetscErrorCode ierr;
2598: const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2599: const PetscInt n=a->mbs,*rout,*cout,*vi;
2600: PetscInt i,nz,idx,idt,idc,m;
2601: const MatScalar *aa=a->a,*v;
2602: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2603: const PetscScalar *b;
2606: VecGetArray(bb,(PetscScalar**)&b);
2607: VecGetArray(xx,&x);
2608: t = a->solve_work;
2610: ISGetIndices(isrow,&rout); r = rout;
2611: ISGetIndices(iscol,&cout); c = cout;
2613: /* forward solve the lower triangular */
/* block row 0 is only copied: the update loop below starts at i=1 */
2614: idx = 7*r[0];
2615: t[0] = b[idx]; t[1] = b[1+idx];
2616: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2617: t[5] = b[5+idx]; t[6] = b[6+idx];
2619: for (i=1; i<n; i++) {
2620: v = aa + 49*ai[i];
2621: vi = aj + ai[i];
2622: nz = ai[i+1] - ai[i];
/* block r[i] of b is the starting value for block row i */
2623: idx = 7*r[i];
2624: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2625: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2626: for(m=0;m<nz;m++){
/* load the already-computed block of t named by column index vi[m] */
2627: idx = 7*vi[m];
2628: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
2629: x4 = t[3+idx];x5 = t[4+idx];
2630: x6 = t[5+idx];x7 = t[6+idx];
2631: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2632: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2633: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2634: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2635: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2636: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2637: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2638: v += 49;
2639: }
2640: idx = 7*i;
2641: t[idx] = s1;t[1+idx] = s2;
2642: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2643: t[5+idx] = s6;t[6+idx] = s7;
2644: }
2645: /* backward solve the upper triangular */
2646: for (i=n-1; i>=0; i--){
/* blocks adiag[i+1]+1 .. adiag[i]-1 are the off-diagonal blocks used for row i */
2647: v = aa + 49*(adiag[i+1]+1);
2648: vi = aj + adiag[i+1]+1;
2649: nz = adiag[i] - adiag[i+1] - 1;
2650: idt = 7*i;
2651: s1 = t[idt]; s2 = t[1+idt];
2652: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2653: s6 = t[5+idt];s7 = t[6+idt];
2654: for(m=0;m<nz;m++){
2655: idx = 7*vi[m];
2656: x1 = t[idx]; x2 = t[1+idx];
2657: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2658: x6 = t[5+idx]; x7 = t[6+idx];
2659: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2660: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2661: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2662: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2663: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2664: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2665: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2666: v += 49;
2667: }
/* after nz steps of 49, v addresses the block at position adiag[i]; the product is kept in t
   for later rows and also written to block c[i] of x */
2668: idc = 7*c[i];
2669: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
2670: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2671: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2672: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2673: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2674: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2675: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2676: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2677: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2678: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2679: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2680: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2681: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2682: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2683: }
2685: ISRestoreIndices(isrow,&rout);
2686: ISRestoreIndices(iscol,&cout);
2687: VecRestoreArray(bb,(PetscScalar**)&b);
2688: VecRestoreArray(xx,&x);
2689: PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);
2690: return(0);
2691: }
2695: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2696: {
2697: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2698: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2699: PetscErrorCode ierr;
2700: PetscInt i,nz,idx,idt,jdx;
2701: const MatScalar *aa=a->a,*v;
2702: PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2703: const PetscScalar *b;
2706: VecGetArray(bb,(PetscScalar**)&b);
2707: VecGetArray(xx,&x);
2708: /* forward solve the lower triangular */
2709: idx = 0;
2710: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
2711: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2712: x[6] = b[6+idx];
2713: for (i=1; i<n; i++) {
2714: v = aa + 49*ai[i];
2715: vi = aj + ai[i];
2716: nz = diag[i] - ai[i];
2717: idx = 7*i;
2718: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2719: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2720: s7 = b[6+idx];
2721: while (nz--) {
2722: jdx = 7*(*vi++);
2723: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
2724: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2725: x7 = x[6+jdx];
2726: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2727: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2728: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2729: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2730: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2731: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2732: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2733: v += 49;
2734: }
2735: x[idx] = s1;
2736: x[1+idx] = s2;
2737: x[2+idx] = s3;
2738: x[3+idx] = s4;
2739: x[4+idx] = s5;
2740: x[5+idx] = s6;
2741: x[6+idx] = s7;
2742: }
2743: /* backward solve the upper triangular */
2744: for (i=n-1; i>=0; i--){
2745: v = aa + 49*diag[i] + 49;
2746: vi = aj + diag[i] + 1;
2747: nz = ai[i+1] - diag[i] - 1;
2748: idt = 7*i;
2749: s1 = x[idt]; s2 = x[1+idt];
2750: s3 = x[2+idt]; s4 = x[3+idt];
2751: s5 = x[4+idt]; s6 = x[5+idt];
2752: s7 = x[6+idt];
2753: while (nz--) {
2754: idx = 7*(*vi++);
2755: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
2756: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2757: x7 = x[6+idx];
2758: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2759: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2760: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2761: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2762: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2763: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2764: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2765: v += 49;
2766: }
2767: v = aa + 49*diag[i];
2768: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
2769: + v[28]*s5 + v[35]*s6 + v[42]*s7;
2770: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
2771: + v[29]*s5 + v[36]*s6 + v[43]*s7;
2772: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
2773: + v[30]*s5 + v[37]*s6 + v[44]*s7;
2774: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
2775: + v[31]*s5 + v[38]*s6 + v[45]*s7;
2776: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
2777: + v[32]*s5 + v[39]*s6 + v[46]*s7;
2778: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
2779: + v[33]*s5 + v[40]*s6 + v[47]*s7;
2780: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
2781: + v[34]*s5 + v[41]*s6 + v[48]*s7;
2782: }
2784: VecRestoreArray(bb,(PetscScalar**)&b);
2785: VecRestoreArray(xx,&x);
2786: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2787: return(0);
2788: }
2792: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2793: {
2794: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2795: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2796: PetscErrorCode ierr;
2797: PetscInt i,k,nz,idx,jdx,idt;
2798: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
2799: const MatScalar *aa=a->a,*v;
2800: PetscScalar *x;
2801: const PetscScalar *b;
2802: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2805: VecGetArray(bb,(PetscScalar**)&b);
2806: VecGetArray(xx,&x);
2807: /* forward solve the lower triangular */
2808: idx = 0;
2809: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2810: x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2811: for (i=1; i<n; i++) {
2812: v = aa + bs2*ai[i];
2813: vi = aj + ai[i];
2814: nz = ai[i+1] - ai[i];
2815: idx = bs*i;
2816: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2817: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2818: for(k=0;k<nz;k++) {
2819: jdx = bs*vi[k];
2820: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2821: x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2822: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2823: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2824: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2825: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2826: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2827: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2828: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2829: v += bs2;
2830: }
2832: x[idx] = s1;
2833: x[1+idx] = s2;
2834: x[2+idx] = s3;
2835: x[3+idx] = s4;
2836: x[4+idx] = s5;
2837: x[5+idx] = s6;
2838: x[6+idx] = s7;
2839: }
2840:
2841: /* backward solve the upper triangular */
2842: for (i=n-1; i>=0; i--){
2843: v = aa + bs2*(adiag[i+1]+1);
2844: vi = aj + adiag[i+1]+1;
2845: nz = adiag[i] - adiag[i+1]-1;
2846: idt = bs*i;
2847: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2848: s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2849: for(k=0;k<nz;k++) {
2850: idx = bs*vi[k];
2851: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2852: x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2853: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2854: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2855: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2856: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2857: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2858: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2859: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2860: v += bs2;
2861: }
2862: /* x = inv_diagonal*x */
2863: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
2864: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
2865: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
2866: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
2867: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
2868: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
2869: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
2870: }
2872: VecRestoreArray(bb,(PetscScalar**)&b);
2873: VecRestoreArray(xx,&x);
2874: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
2875: return(0);
2876: }
2880: PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2881: {
2882: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2883: IS iscol=a->col,isrow=a->row;
2884: PetscErrorCode ierr;
2885: const PetscInt *r,*c,*rout,*cout;
2886: const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2887: PetscInt i,nz,idx,idt,idc;
2888: const MatScalar *aa=a->a,*v;
2889: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2890: const PetscScalar *b;
2893: VecGetArray(bb,(PetscScalar**)&b);
2894: VecGetArray(xx,&x);
2895: t = a->solve_work;
2897: ISGetIndices(isrow,&rout); r = rout;
2898: ISGetIndices(iscol,&cout); c = cout + (n-1);
2900: /* forward solve the lower triangular */
2901: idx = 6*(*r++);
2902: t[0] = b[idx]; t[1] = b[1+idx];
2903: t[2] = b[2+idx]; t[3] = b[3+idx];
2904: t[4] = b[4+idx]; t[5] = b[5+idx];
2905: for (i=1; i<n; i++) {
2906: v = aa + 36*ai[i];
2907: vi = aj + ai[i];
2908: nz = diag[i] - ai[i];
2909: idx = 6*(*r++);
2910: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2911: s5 = b[4+idx]; s6 = b[5+idx];
2912: while (nz--) {
2913: idx = 6*(*vi++);
2914: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2915: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2916: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2917: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2918: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2919: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2920: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2921: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2922: v += 36;
2923: }
2924: idx = 6*i;
2925: t[idx] = s1;t[1+idx] = s2;
2926: t[2+idx] = s3;t[3+idx] = s4;
2927: t[4+idx] = s5;t[5+idx] = s6;
2928: }
2929: /* backward solve the upper triangular */
2930: for (i=n-1; i>=0; i--){
2931: v = aa + 36*diag[i] + 36;
2932: vi = aj + diag[i] + 1;
2933: nz = ai[i+1] - diag[i] - 1;
2934: idt = 6*i;
2935: s1 = t[idt]; s2 = t[1+idt];
2936: s3 = t[2+idt];s4 = t[3+idt];
2937: s5 = t[4+idt];s6 = t[5+idt];
2938: while (nz--) {
2939: idx = 6*(*vi++);
2940: x1 = t[idx]; x2 = t[1+idx];
2941: x3 = t[2+idx]; x4 = t[3+idx];
2942: x5 = t[4+idx]; x6 = t[5+idx];
2943: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2944: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2945: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2946: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2947: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2948: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2949: v += 36;
2950: }
2951: idc = 6*(*c--);
2952: v = aa + 36*diag[i];
2953: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
2954: v[18]*s4+v[24]*s5+v[30]*s6;
2955: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2956: v[19]*s4+v[25]*s5+v[31]*s6;
2957: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2958: v[20]*s4+v[26]*s5+v[32]*s6;
2959: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2960: v[21]*s4+v[27]*s5+v[33]*s6;
2961: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2962: v[22]*s4+v[28]*s5+v[34]*s6;
2963: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2964: v[23]*s4+v[29]*s5+v[35]*s6;
2965: }
2967: ISRestoreIndices(isrow,&rout);
2968: ISRestoreIndices(iscol,&cout);
2969: VecRestoreArray(bb,(PetscScalar**)&b);
2970: VecRestoreArray(xx,&x);
2971: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
2972: return(0);
2973: }
2977: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2978: {
2979: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2980: IS iscol=a->col,isrow=a->row;
2981: PetscErrorCode ierr;
2982: const PetscInt *r,*c,*rout,*cout;
2983: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2984: PetscInt i,nz,idx,idt,idc,m;
2985: const MatScalar *aa=a->a,*v;
2986: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2987: const PetscScalar *b;
2990: VecGetArray(bb,(PetscScalar**)&b);
2991: VecGetArray(xx,&x);
2992: t = a->solve_work;
2994: ISGetIndices(isrow,&rout); r = rout;
2995: ISGetIndices(iscol,&cout); c = cout;
2997: /* forward solve the lower triangular */
2998: idx = 6*r[0];
2999: t[0] = b[idx]; t[1] = b[1+idx];
3000: t[2] = b[2+idx]; t[3] = b[3+idx];
3001: t[4] = b[4+idx]; t[5] = b[5+idx];
3002: for (i=1; i<n; i++) {
3003: v = aa + 36*ai[i];
3004: vi = aj + ai[i];
3005: nz = ai[i+1] - ai[i];
3006: idx = 6*r[i];
3007: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3008: s5 = b[4+idx]; s6 = b[5+idx];
3009: for(m=0;m<nz;m++){
3010: idx = 6*vi[m];
3011: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3012: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3013: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3014: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3015: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3016: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3017: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3018: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3019: v += 36;
3020: }
3021: idx = 6*i;
3022: t[idx] = s1;t[1+idx] = s2;
3023: t[2+idx] = s3;t[3+idx] = s4;
3024: t[4+idx] = s5;t[5+idx] = s6;
3025: }
3026: /* backward solve the upper triangular */
3027: for (i=n-1; i>=0; i--){
3028: v = aa + 36*(adiag[i+1]+1);
3029: vi = aj + adiag[i+1]+1;
3030: nz = adiag[i] - adiag[i+1] - 1;
3031: idt = 6*i;
3032: s1 = t[idt]; s2 = t[1+idt];
3033: s3 = t[2+idt];s4 = t[3+idt];
3034: s5 = t[4+idt];s6 = t[5+idt];
3035: for(m=0;m<nz;m++){
3036: idx = 6*vi[m];
3037: x1 = t[idx]; x2 = t[1+idx];
3038: x3 = t[2+idx]; x4 = t[3+idx];
3039: x5 = t[4+idx]; x6 = t[5+idx];
3040: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3041: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3042: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3043: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3044: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3045: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3046: v += 36;
3047: }
3048: idc = 6*c[i];
3049: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
3050: v[18]*s4+v[24]*s5+v[30]*s6;
3051: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3052: v[19]*s4+v[25]*s5+v[31]*s6;
3053: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3054: v[20]*s4+v[26]*s5+v[32]*s6;
3055: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3056: v[21]*s4+v[27]*s5+v[33]*s6;
3057: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3058: v[22]*s4+v[28]*s5+v[34]*s6;
3059: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3060: v[23]*s4+v[29]*s5+v[35]*s6;
3061: }
3063: ISRestoreIndices(isrow,&rout);
3064: ISRestoreIndices(iscol,&cout);
3065: VecRestoreArray(bb,(PetscScalar**)&b);
3066: VecRestoreArray(xx,&x);
3067: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3068: return(0);
3069: }
3073: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3074: {
3075: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3076: PetscInt i,nz,idx,idt,jdx;
3077: PetscErrorCode ierr;
3078: const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3079: const MatScalar *aa=a->a,*v;
3080: PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3081: const PetscScalar *b;
3084: VecGetArray(bb,(PetscScalar**)&b);
3085: VecGetArray(xx,&x);
3086: /* forward solve the lower triangular */
3087: idx = 0;
3088: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
3089: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
3090: for (i=1; i<n; i++) {
3091: v = aa + 36*ai[i];
3092: vi = aj + ai[i];
3093: nz = diag[i] - ai[i];
3094: idx = 6*i;
3095: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3096: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
3097: while (nz--) {
3098: jdx = 6*(*vi++);
3099: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
3100: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3101: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3102: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3103: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3104: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3105: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3106: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3107: v += 36;
3108: }
3109: x[idx] = s1;
3110: x[1+idx] = s2;
3111: x[2+idx] = s3;
3112: x[3+idx] = s4;
3113: x[4+idx] = s5;
3114: x[5+idx] = s6;
3115: }
3116: /* backward solve the upper triangular */
3117: for (i=n-1; i>=0; i--){
3118: v = aa + 36*diag[i] + 36;
3119: vi = aj + diag[i] + 1;
3120: nz = ai[i+1] - diag[i] - 1;
3121: idt = 6*i;
3122: s1 = x[idt]; s2 = x[1+idt];
3123: s3 = x[2+idt]; s4 = x[3+idt];
3124: s5 = x[4+idt]; s6 = x[5+idt];
3125: while (nz--) {
3126: idx = 6*(*vi++);
3127: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
3128: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3129: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3130: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3131: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3132: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3133: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3134: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3135: v += 36;
3136: }
3137: v = aa + 36*diag[i];
3138: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3139: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3140: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3141: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3142: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3143: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3144: }
3146: VecRestoreArray(bb,(PetscScalar**)&b);
3147: VecRestoreArray(xx,&x);
3148: PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);
3149: return(0);
3150: }
3154: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3155: {
3156: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3157: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3158: PetscErrorCode ierr;
3159: PetscInt i,k,nz,idx,jdx,idt;
3160: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
3161: const MatScalar *aa=a->a,*v;
3162: PetscScalar *x;
3163: const PetscScalar *b;
3164: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3167: VecGetArray(bb,(PetscScalar**)&b);
3168: VecGetArray(xx,&x);
3169: /* forward solve the lower triangular */
3170: idx = 0;
3171: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3172: x[4] = b[4+idx];x[5] = b[5+idx];
3173: for (i=1; i<n; i++) {
3174: v = aa + bs2*ai[i];
3175: vi = aj + ai[i];
3176: nz = ai[i+1] - ai[i];
3177: idx = bs*i;
3178: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3179: s5 = b[4+idx];s6 = b[5+idx];
3180: for(k=0;k<nz;k++){
3181: jdx = bs*vi[k];
3182: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3183: x5 = x[4+jdx]; x6 = x[5+jdx];
3184: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3185: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
3186: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3187: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3188: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3189: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3190: v += bs2;
3191: }
3193: x[idx] = s1;
3194: x[1+idx] = s2;
3195: x[2+idx] = s3;
3196: x[3+idx] = s4;
3197: x[4+idx] = s5;
3198: x[5+idx] = s6;
3199: }
3200:
3201: /* backward solve the upper triangular */
3202: for (i=n-1; i>=0; i--){
3203: v = aa + bs2*(adiag[i+1]+1);
3204: vi = aj + adiag[i+1]+1;
3205: nz = adiag[i] - adiag[i+1]-1;
3206: idt = bs*i;
3207: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3208: s5 = x[4+idt];s6 = x[5+idt];
3209: for(k=0;k<nz;k++){
3210: idx = bs*vi[k];
3211: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3212: x5 = x[4+idx];x6 = x[5+idx];
3213: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3214: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;;
3215: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3216: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3217: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3218: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3219: v += bs2;
3220: }
3221: /* x = inv_diagonal*x */
3222: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3223: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3224: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3225: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3226: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3227: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3228: }
3230: VecRestoreArray(bb,(PetscScalar**)&b);
3231: VecRestoreArray(xx,&x);
3232: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
3233: return(0);
3234: }
3238: PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3239: {
3240: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
3241: IS iscol=a->col,isrow=a->row;
3242: PetscErrorCode ierr;
3243: const PetscInt *r,*c,*rout,*cout,*diag = a->diag;
3244: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3245: PetscInt i,nz,idx,idt,idc;
3246: const MatScalar *aa=a->a,*v;
3247: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3248: const PetscScalar *b;
3251: VecGetArray(bb,(PetscScalar**)&b);
3252: VecGetArray(xx,&x);
3253: t = a->solve_work;
3255: ISGetIndices(isrow,&rout); r = rout;
3256: ISGetIndices(iscol,&cout); c = cout + (n-1);
3258: /* forward solve the lower triangular */
3259: idx = 5*(*r++);
3260: t[0] = b[idx]; t[1] = b[1+idx];
3261: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3262: for (i=1; i<n; i++) {
3263: v = aa + 25*ai[i];
3264: vi = aj + ai[i];
3265: nz = diag[i] - ai[i];
3266: idx = 5*(*r++);
3267: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3268: s5 = b[4+idx];
3269: while (nz--) {
3270: idx = 5*(*vi++);
3271: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
3272: x4 = t[3+idx];x5 = t[4+idx];
3273: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3274: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3275: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3276: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3277: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3278: v += 25;
3279: }
3280: idx = 5*i;
3281: t[idx] = s1;t[1+idx] = s2;
3282: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3283: }
3284: /* backward solve the upper triangular */
3285: for (i=n-1; i>=0; i--){
3286: v = aa + 25*diag[i] + 25;
3287: vi = aj + diag[i] + 1;
3288: nz = ai[i+1] - diag[i] - 1;
3289: idt = 5*i;
3290: s1 = t[idt]; s2 = t[1+idt];
3291: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3292: while (nz--) {
3293: idx = 5*(*vi++);
3294: x1 = t[idx]; x2 = t[1+idx];
3295: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3296: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3297: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3298: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3299: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3300: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3301: v += 25;
3302: }
3303: idc = 5*(*c--);
3304: v = aa + 25*diag[i];
3305: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
3306: v[15]*s4+v[20]*s5;
3307: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3308: v[16]*s4+v[21]*s5;
3309: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3310: v[17]*s4+v[22]*s5;
3311: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3312: v[18]*s4+v[23]*s5;
3313: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3314: v[19]*s4+v[24]*s5;
3315: }
3317: ISRestoreIndices(isrow,&rout);
3318: ISRestoreIndices(iscol,&cout);
3319: VecRestoreArray(bb,(PetscScalar**)&b);
3320: VecRestoreArray(xx,&x);
3321: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3322: return(0);
3323: }
3327: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3328: {
3329: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
3330: IS iscol=a->col,isrow=a->row;
3331: PetscErrorCode ierr;
3332: const PetscInt *r,*c,*rout,*cout;
3333: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3334: PetscInt i,nz,idx,idt,idc,m;
3335: const MatScalar *aa=a->a,*v;
3336: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3337: const PetscScalar *b;
3340: VecGetArray(bb,(PetscScalar**)&b);
3341: VecGetArray(xx,&x);
3342: t = a->solve_work;
3344: ISGetIndices(isrow,&rout); r = rout;
3345: ISGetIndices(iscol,&cout); c = cout;
3347: /* forward solve the lower triangular */
3348: idx = 5*r[0];
3349: t[0] = b[idx]; t[1] = b[1+idx];
3350: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3351: for (i=1; i<n; i++) {
3352: v = aa + 25*ai[i];
3353: vi = aj + ai[i];
3354: nz = ai[i+1] - ai[i];
3355: idx = 5*r[i];
3356: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3357: s5 = b[4+idx];
3358: for(m=0;m<nz;m++){
3359: idx = 5*vi[m];
3360: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
3361: x4 = t[3+idx];x5 = t[4+idx];
3362: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3363: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3364: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3365: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3366: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3367: v += 25;
3368: }
3369: idx = 5*i;
3370: t[idx] = s1;t[1+idx] = s2;
3371: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3372: }
3373: /* backward solve the upper triangular */
3374: for (i=n-1; i>=0; i--){
3375: v = aa + 25*(adiag[i+1]+1);
3376: vi = aj + adiag[i+1]+1;
3377: nz = adiag[i] - adiag[i+1] - 1;
3378: idt = 5*i;
3379: s1 = t[idt]; s2 = t[1+idt];
3380: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3381: for(m=0;m<nz;m++){
3382: idx = 5*vi[m];
3383: x1 = t[idx]; x2 = t[1+idx];
3384: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3385: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3386: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3387: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3388: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3389: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3390: v += 25;
3391: }
3392: idc = 5*c[i];
3393: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
3394: v[15]*s4+v[20]*s5;
3395: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3396: v[16]*s4+v[21]*s5;
3397: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3398: v[17]*s4+v[22]*s5;
3399: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3400: v[18]*s4+v[23]*s5;
3401: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3402: v[19]*s4+v[24]*s5;
3403: }
3405: ISRestoreIndices(isrow,&rout);
3406: ISRestoreIndices(iscol,&cout);
3407: VecRestoreArray(bb,(PetscScalar**)&b);
3408: VecRestoreArray(xx,&x);
3409: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3410: return(0);
3411: }
3415: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3416: {
3417: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3418: const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3419: PetscInt i,nz,idx,idt,jdx;
3420: PetscErrorCode ierr;
3421: const MatScalar *aa=a->a,*v;
3422: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3423: const PetscScalar *b;
3426: VecGetArray(bb,(PetscScalar**)&b);
3427: VecGetArray(xx,&x);
3428: /* forward solve the lower triangular */
3429: idx = 0;
3430: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3431: for (i=1; i<n; i++) {
3432: v = aa + 25*ai[i];
3433: vi = aj + ai[i];
3434: nz = diag[i] - ai[i];
3435: idx = 5*i;
3436: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3437: while (nz--) {
3438: jdx = 5*(*vi++);
3439: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3440: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3441: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3442: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3443: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3444: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3445: v += 25;
3446: }
3447: x[idx] = s1;
3448: x[1+idx] = s2;
3449: x[2+idx] = s3;
3450: x[3+idx] = s4;
3451: x[4+idx] = s5;
3452: }
3453: /* backward solve the upper triangular */
3454: for (i=n-1; i>=0; i--){
3455: v = aa + 25*diag[i] + 25;
3456: vi = aj + diag[i] + 1;
3457: nz = ai[i+1] - diag[i] - 1;
3458: idt = 5*i;
3459: s1 = x[idt]; s2 = x[1+idt];
3460: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3461: while (nz--) {
3462: idx = 5*(*vi++);
3463: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3464: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3465: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3466: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3467: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3468: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3469: v += 25;
3470: }
3471: v = aa + 25*diag[i];
3472: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
3473: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
3474: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
3475: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
3476: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
3477: }
3479: VecRestoreArray(bb,(PetscScalar**)&b);
3480: VecRestoreArray(xx,&x);
3481: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3482: return(0);
3483: }
3487: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3488: {
3489: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3490: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3491: PetscInt i,k,nz,idx,idt,jdx;
3492: PetscErrorCode ierr;
3493: const MatScalar *aa=a->a,*v;
3494: PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3495: const PetscScalar *b;
3498: VecGetArray(bb,(PetscScalar**)&b);
3499: VecGetArray(xx,&x);
3500: /* forward solve the lower triangular */
3501: idx = 0;
3502: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3503: for (i=1; i<n; i++) {
3504: v = aa + 25*ai[i];
3505: vi = aj + ai[i];
3506: nz = ai[i+1] - ai[i];
3507: idx = 5*i;
3508: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3509: for(k=0;k<nz;k++) {
3510: jdx = 5*vi[k];
3511: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3512: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3513: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3514: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3515: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3516: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3517: v += 25;
3518: }
3519: x[idx] = s1;
3520: x[1+idx] = s2;
3521: x[2+idx] = s3;
3522: x[3+idx] = s4;
3523: x[4+idx] = s5;
3524: }
3526: /* backward solve the upper triangular */
3527: for (i=n-1; i>=0; i--){
3528: v = aa + 25*(adiag[i+1]+1);
3529: vi = aj + adiag[i+1]+1;
3530: nz = adiag[i] - adiag[i+1]-1;
3531: idt = 5*i;
3532: s1 = x[idt]; s2 = x[1+idt];
3533: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3534: for(k=0;k<nz;k++){
3535: idx = 5*vi[k];
3536: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3537: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3538: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3539: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3540: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3541: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3542: v += 25;
3543: }
3544: /* x = inv_diagonal*x */
3545: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
3546: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
3547: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
3548: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
3549: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
3550: }
3552: VecRestoreArray(bb,(PetscScalar**)&b);
3553: VecRestoreArray(xx,&x);
3554: PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);
3555: return(0);
3556: }
3560: PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3561: {
3562: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3563: IS iscol=a->col,isrow=a->row;
3564: PetscErrorCode ierr;
3565: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3566: PetscInt i,nz,idx,idt,idc;
3567: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3568: const MatScalar *aa=a->a,*v;
3569: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3570: const PetscScalar *b;
3573: VecGetArray(bb,(PetscScalar**)&b);
3574: VecGetArray(xx,&x);
3575: t = a->solve_work;
3577: ISGetIndices(isrow,&rout); r = rout;
3578: ISGetIndices(iscol,&cout); c = cout + (n-1);
3580: /* forward solve the lower triangular */
3581: idx = 4*(*r++);
3582: t[0] = b[idx]; t[1] = b[1+idx];
3583: t[2] = b[2+idx]; t[3] = b[3+idx];
3584: for (i=1; i<n; i++) {
3585: v = aa + 16*ai[i];
3586: vi = aj + ai[i];
3587: nz = diag[i] - ai[i];
3588: idx = 4*(*r++);
3589: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3590: while (nz--) {
3591: idx = 4*(*vi++);
3592: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3593: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3594: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3595: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3596: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3597: v += 16;
3598: }
3599: idx = 4*i;
3600: t[idx] = s1;t[1+idx] = s2;
3601: t[2+idx] = s3;t[3+idx] = s4;
3602: }
3603: /* backward solve the upper triangular */
3604: for (i=n-1; i>=0; i--){
3605: v = aa + 16*diag[i] + 16;
3606: vi = aj + diag[i] + 1;
3607: nz = ai[i+1] - diag[i] - 1;
3608: idt = 4*i;
3609: s1 = t[idt]; s2 = t[1+idt];
3610: s3 = t[2+idt];s4 = t[3+idt];
3611: while (nz--) {
3612: idx = 4*(*vi++);
3613: x1 = t[idx]; x2 = t[1+idx];
3614: x3 = t[2+idx]; x4 = t[3+idx];
3615: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3616: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3617: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3618: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3619: v += 16;
3620: }
3621: idc = 4*(*c--);
3622: v = aa + 16*diag[i];
3623: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3624: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3625: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3626: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3627: }
3629: ISRestoreIndices(isrow,&rout);
3630: ISRestoreIndices(iscol,&cout);
3631: VecRestoreArray(bb,(PetscScalar**)&b);
3632: VecRestoreArray(xx,&x);
3633: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3634: return(0);
3635: }
3639: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3640: {
3641: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3642: IS iscol=a->col,isrow=a->row;
3643: PetscErrorCode ierr;
3644: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3645: PetscInt i,nz,idx,idt,idc,m;
3646: const PetscInt *r,*c,*rout,*cout;
3647: const MatScalar *aa=a->a,*v;
3648: PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3649: const PetscScalar *b;
3652: VecGetArray(bb,(PetscScalar**)&b);
3653: VecGetArray(xx,&x);
3654: t = a->solve_work;
3656: ISGetIndices(isrow,&rout); r = rout;
3657: ISGetIndices(iscol,&cout); c = cout;
3659: /* forward solve the lower triangular */
3660: idx = 4*r[0];
3661: t[0] = b[idx]; t[1] = b[1+idx];
3662: t[2] = b[2+idx]; t[3] = b[3+idx];
3663: for (i=1; i<n; i++) {
3664: v = aa + 16*ai[i];
3665: vi = aj + ai[i];
3666: nz = ai[i+1] - ai[i];
3667: idx = 4*r[i];
3668: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3669: for(m=0;m<nz;m++){
3670: idx = 4*vi[m];
3671: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3672: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3673: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3674: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3675: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3676: v += 16;
3677: }
3678: idx = 4*i;
3679: t[idx] = s1;t[1+idx] = s2;
3680: t[2+idx] = s3;t[3+idx] = s4;
3681: }
3682: /* backward solve the upper triangular */
3683: for (i=n-1; i>=0; i--){
3684: v = aa + 16*(adiag[i+1]+1);
3685: vi = aj + adiag[i+1]+1;
3686: nz = adiag[i] - adiag[i+1] - 1;
3687: idt = 4*i;
3688: s1 = t[idt]; s2 = t[1+idt];
3689: s3 = t[2+idt];s4 = t[3+idt];
3690: for(m=0;m<nz;m++){
3691: idx = 4*vi[m];
3692: x1 = t[idx]; x2 = t[1+idx];
3693: x3 = t[2+idx]; x4 = t[3+idx];
3694: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3695: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3696: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3697: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3698: v += 16;
3699: }
3700: idc = 4*c[i];
3701: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3702: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3703: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3704: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3705: }
3707: ISRestoreIndices(isrow,&rout);
3708: ISRestoreIndices(iscol,&cout);
3709: VecRestoreArray(bb,(PetscScalar**)&b);
3710: VecRestoreArray(xx,&x);
3711: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3712: return(0);
3713: }
3717: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3718: {
3719: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3720: IS iscol=a->col,isrow=a->row;
3721: PetscErrorCode ierr;
3722: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3723: PetscInt i,nz,idx,idt,idc;
3724: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3725: const MatScalar *aa=a->a,*v;
3726: MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t;
3727: PetscScalar *x;
3728: const PetscScalar *b;
3731: VecGetArray(bb,(PetscScalar**)&b);
3732: VecGetArray(xx,&x);
3733: t = (MatScalar *)a->solve_work;
3735: ISGetIndices(isrow,&rout); r = rout;
3736: ISGetIndices(iscol,&cout); c = cout + (n-1);
3738: /* forward solve the lower triangular */
3739: idx = 4*(*r++);
3740: t[0] = (MatScalar)b[idx];
3741: t[1] = (MatScalar)b[1+idx];
3742: t[2] = (MatScalar)b[2+idx];
3743: t[3] = (MatScalar)b[3+idx];
3744: for (i=1; i<n; i++) {
3745: v = aa + 16*ai[i];
3746: vi = aj + ai[i];
3747: nz = diag[i] - ai[i];
3748: idx = 4*(*r++);
3749: s1 = (MatScalar)b[idx];
3750: s2 = (MatScalar)b[1+idx];
3751: s3 = (MatScalar)b[2+idx];
3752: s4 = (MatScalar)b[3+idx];
3753: while (nz--) {
3754: idx = 4*(*vi++);
3755: x1 = t[idx];
3756: x2 = t[1+idx];
3757: x3 = t[2+idx];
3758: x4 = t[3+idx];
3759: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3760: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3761: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3762: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3763: v += 16;
3764: }
3765: idx = 4*i;
3766: t[idx] = s1;
3767: t[1+idx] = s2;
3768: t[2+idx] = s3;
3769: t[3+idx] = s4;
3770: }
3771: /* backward solve the upper triangular */
3772: for (i=n-1; i>=0; i--){
3773: v = aa + 16*diag[i] + 16;
3774: vi = aj + diag[i] + 1;
3775: nz = ai[i+1] - diag[i] - 1;
3776: idt = 4*i;
3777: s1 = t[idt];
3778: s2 = t[1+idt];
3779: s3 = t[2+idt];
3780: s4 = t[3+idt];
3781: while (nz--) {
3782: idx = 4*(*vi++);
3783: x1 = t[idx];
3784: x2 = t[1+idx];
3785: x3 = t[2+idx];
3786: x4 = t[3+idx];
3787: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3788: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3789: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3790: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3791: v += 16;
3792: }
3793: idc = 4*(*c--);
3794: v = aa + 16*diag[i];
3795: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3796: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3797: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3798: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3799: x[idc] = (PetscScalar)t[idt];
3800: x[1+idc] = (PetscScalar)t[1+idt];
3801: x[2+idc] = (PetscScalar)t[2+idt];
3802: x[3+idc] = (PetscScalar)t[3+idt];
3803: }
3805: ISRestoreIndices(isrow,&rout);
3806: ISRestoreIndices(iscol,&cout);
3807: VecRestoreArray(bb,(PetscScalar**)&b);
3808: VecRestoreArray(xx,&x);
3809: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
3810: return(0);
3811: }
3813: #if defined (PETSC_HAVE_SSE)
3815: #include PETSC_HAVE_SSE
3819: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3820: {
3821: /*
3822: Note: This code uses demotion of double
3823: to float when performing the mixed-mode computation.
3824: This may not be numerically reasonable for all applications.
3825: */
3826: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3827: IS iscol=a->col,isrow=a->row;
3829: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3830: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3831: MatScalar *aa=a->a,*v;
3832: PetscScalar *x,*b,*t;
3834: /* Make space in temp stack for 16 Byte Aligned arrays */
3835: float ssealignedspace[11],*tmps,*tmpx;
3836: unsigned long offset;
3837:
3839: SSE_SCOPE_BEGIN;
3841: offset = (unsigned long)ssealignedspace % 16;
3842: if (offset) offset = (16 - offset)/4;
3843: tmps = &ssealignedspace[offset];
3844: tmpx = &ssealignedspace[offset+4];
3845: PREFETCH_NTA(aa+16*ai[1]);
3847: VecGetArray(bb,&b);
3848: VecGetArray(xx,&x);
3849: t = a->solve_work;
3851: ISGetIndices(isrow,&rout); r = rout;
3852: ISGetIndices(iscol,&cout); c = cout + (n-1);
3854: /* forward solve the lower triangular */
3855: idx = 4*(*r++);
3856: t[0] = b[idx]; t[1] = b[1+idx];
3857: t[2] = b[2+idx]; t[3] = b[3+idx];
3858: v = aa + 16*ai[1];
3860: for (i=1; i<n;) {
3861: PREFETCH_NTA(&v[8]);
3862: vi = aj + ai[i];
3863: nz = diag[i] - ai[i];
3864: idx = 4*(*r++);
3866: /* Demote sum from double to float */
3867: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3868: LOAD_PS(tmps,XMM7);
3870: while (nz--) {
3871: PREFETCH_NTA(&v[16]);
3872: idx = 4*(*vi++);
3873:
3874: /* Demote solution (so far) from double to float */
3875: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3877: /* 4x4 Matrix-Vector product with negative accumulation: */
3878: SSE_INLINE_BEGIN_2(tmpx,v)
3879: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3881: /* First Column */
3882: SSE_COPY_PS(XMM0,XMM6)
3883: SSE_SHUFFLE(XMM0,XMM0,0x00)
3884: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3885: SSE_SUB_PS(XMM7,XMM0)
3886:
3887: /* Second Column */
3888: SSE_COPY_PS(XMM1,XMM6)
3889: SSE_SHUFFLE(XMM1,XMM1,0x55)
3890: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3891: SSE_SUB_PS(XMM7,XMM1)
3892:
3893: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3894:
3895: /* Third Column */
3896: SSE_COPY_PS(XMM2,XMM6)
3897: SSE_SHUFFLE(XMM2,XMM2,0xAA)
3898: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3899: SSE_SUB_PS(XMM7,XMM2)
3901: /* Fourth Column */
3902: SSE_COPY_PS(XMM3,XMM6)
3903: SSE_SHUFFLE(XMM3,XMM3,0xFF)
3904: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3905: SSE_SUB_PS(XMM7,XMM3)
3906: SSE_INLINE_END_2
3907:
3908: v += 16;
3909: }
3910: idx = 4*i;
3911: v = aa + 16*ai[++i];
3912: PREFETCH_NTA(v);
3913: STORE_PS(tmps,XMM7);
3915: /* Promote result from float to double */
3916: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3917: }
3918: /* backward solve the upper triangular */
3919: idt = 4*(n-1);
3920: ai16 = 16*diag[n-1];
3921: v = aa + ai16 + 16;
3922: for (i=n-1; i>=0;){
3923: PREFETCH_NTA(&v[8]);
3924: vi = aj + diag[i] + 1;
3925: nz = ai[i+1] - diag[i] - 1;
3926:
3927: /* Demote accumulator from double to float */
3928: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3929: LOAD_PS(tmps,XMM7);
3931: while (nz--) {
3932: PREFETCH_NTA(&v[16]);
3933: idx = 4*(*vi++);
3935: /* Demote solution (so far) from double to float */
3936: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3938: /* 4x4 Matrix-Vector Product with negative accumulation: */
3939: SSE_INLINE_BEGIN_2(tmpx,v)
3940: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3942: /* First Column */
3943: SSE_COPY_PS(XMM0,XMM6)
3944: SSE_SHUFFLE(XMM0,XMM0,0x00)
3945: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3946: SSE_SUB_PS(XMM7,XMM0)
3948: /* Second Column */
3949: SSE_COPY_PS(XMM1,XMM6)
3950: SSE_SHUFFLE(XMM1,XMM1,0x55)
3951: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3952: SSE_SUB_PS(XMM7,XMM1)
3954: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3955:
3956: /* Third Column */
3957: SSE_COPY_PS(XMM2,XMM6)
3958: SSE_SHUFFLE(XMM2,XMM2,0xAA)
3959: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3960: SSE_SUB_PS(XMM7,XMM2)
3962: /* Fourth Column */
3963: SSE_COPY_PS(XMM3,XMM6)
3964: SSE_SHUFFLE(XMM3,XMM3,0xFF)
3965: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3966: SSE_SUB_PS(XMM7,XMM3)
3967: SSE_INLINE_END_2
3968: v += 16;
3969: }
3970: v = aa + ai16;
3971: ai16 = 16*diag[--i];
3972: PREFETCH_NTA(aa+ai16+16);
3973: /*
3974: Scale the result by the diagonal 4x4 block,
3975: which was inverted as part of the factorization
3976: */
3977: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3978: /* First Column */
3979: SSE_COPY_PS(XMM0,XMM7)
3980: SSE_SHUFFLE(XMM0,XMM0,0x00)
3981: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3983: /* Second Column */
3984: SSE_COPY_PS(XMM1,XMM7)
3985: SSE_SHUFFLE(XMM1,XMM1,0x55)
3986: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3987: SSE_ADD_PS(XMM0,XMM1)
3989: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3990:
3991: /* Third Column */
3992: SSE_COPY_PS(XMM2,XMM7)
3993: SSE_SHUFFLE(XMM2,XMM2,0xAA)
3994: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3995: SSE_ADD_PS(XMM0,XMM2)
3997: /* Fourth Column */
3998: SSE_COPY_PS(XMM3,XMM7)
3999: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4000: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4001: SSE_ADD_PS(XMM0,XMM3)
4002:
4003: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4004: SSE_INLINE_END_3
4006: /* Promote solution from float to double */
4007: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
4009: /* Apply reordering to t and stream into x. */
4010: /* This way, x doesn't pollute the cache. */
4011: /* Be careful with size: 2 doubles = 4 floats! */
4012: idc = 4*(*c--);
4013: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
4014: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
4015: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
4016: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
4017: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
4018: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
4019: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
4020: SSE_INLINE_END_2
4021: v = aa + ai16 + 16;
4022: idt -= 4;
4023: }
4025: ISRestoreIndices(isrow,&rout);
4026: ISRestoreIndices(iscol,&cout);
4027: VecRestoreArray(bb,&b);
4028: VecRestoreArray(xx,&x);
4029: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4030: SSE_SCOPE_END;
4031: return(0);
4032: }
4034: #endif
4037: /*
4038: Special case where the matrix was ILU(0) factored in the natural
4039: ordering. This eliminates the need for the column and row permutation.
4040: */
4043: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4044: {
4045: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4046: PetscInt n=a->mbs;
4047: const PetscInt *ai=a->i,*aj=a->j;
4048: PetscErrorCode ierr;
4049: const PetscInt *diag = a->diag;
4050: const MatScalar *aa=a->a;
4051: PetscScalar *x;
4052: const PetscScalar *b;
4055: VecGetArray(bb,(PetscScalar**)&b);
4056: VecGetArray(xx,&x);
4058: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
4059: {
4060: static PetscScalar w[2000]; /* very BAD need to fix */
4061: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
4062: }
4063: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
4064: {
4065: static PetscScalar w[2000]; /* very BAD need to fix */
4066: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
4067: }
4068: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
4069: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4070: #else
4071: {
4072: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
4073: const MatScalar *v;
4074: PetscInt jdx,idt,idx,nz,i,ai16;
4075: const PetscInt *vi;
4077: /* forward solve the lower triangular */
4078: idx = 0;
4079: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
4080: for (i=1; i<n; i++) {
4081: v = aa + 16*ai[i];
4082: vi = aj + ai[i];
4083: nz = diag[i] - ai[i];
4084: idx += 4;
4085: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4086: while (nz--) {
4087: jdx = 4*(*vi++);
4088: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4089: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4090: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4091: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4092: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4093: v += 16;
4094: }
4095: x[idx] = s1;
4096: x[1+idx] = s2;
4097: x[2+idx] = s3;
4098: x[3+idx] = s4;
4099: }
4100: /* backward solve the upper triangular */
4101: idt = 4*(n-1);
4102: for (i=n-1; i>=0; i--){
4103: ai16 = 16*diag[i];
4104: v = aa + ai16 + 16;
4105: vi = aj + diag[i] + 1;
4106: nz = ai[i+1] - diag[i] - 1;
4107: s1 = x[idt]; s2 = x[1+idt];
4108: s3 = x[2+idt];s4 = x[3+idt];
4109: while (nz--) {
4110: idx = 4*(*vi++);
4111: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
4112: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4113: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4114: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4115: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4116: v += 16;
4117: }
4118: v = aa + ai16;
4119: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4120: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
4121: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4122: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4123: idt -= 4;
4124: }
4125: }
4126: #endif
4128: VecRestoreArray(bb,(PetscScalar**)&b);
4129: VecRestoreArray(xx,&x);
4130: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4131: return(0);
4132: }
4136: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4137: {
4138: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4139: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4140: PetscInt i,k,nz,idx,jdx,idt;
4141: PetscErrorCode ierr;
4142: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
4143: const MatScalar *aa=a->a,*v;
4144: PetscScalar *x;
4145: const PetscScalar *b;
4146: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
4149: VecGetArray(bb,(PetscScalar**)&b);
4150: VecGetArray(xx,&x);
4151: /* forward solve the lower triangular */
4152: idx = 0;
4153: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4154: for (i=1; i<n; i++) {
4155: v = aa + bs2*ai[i];
4156: vi = aj + ai[i];
4157: nz = ai[i+1] - ai[i];
4158: idx = bs*i;
4159: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4160: for(k=0;k<nz;k++) {
4161: jdx = bs*vi[k];
4162: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4163: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4164: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4165: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4166: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4167:
4168: v += bs2;
4169: }
4171: x[idx] = s1;
4172: x[1+idx] = s2;
4173: x[2+idx] = s3;
4174: x[3+idx] = s4;
4175: }
4176:
4177: /* backward solve the upper triangular */
4178: for (i=n-1; i>=0; i--){
4179: v = aa + bs2*(adiag[i+1]+1);
4180: vi = aj + adiag[i+1]+1;
4181: nz = adiag[i] - adiag[i+1]-1;
4182: idt = bs*i;
4183: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4184:
4185: for(k=0;k<nz;k++){
4186: idx = bs*vi[k];
4187: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4188: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4189: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4190: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4191: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4193: v += bs2;
4194: }
4195: /* x = inv_diagonal*x */
4196: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4197: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4198: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4199: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4201: }
4203: VecRestoreArray(bb,(PetscScalar**)&b);
4204: VecRestoreArray(xx,&x);
4205: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4206: return(0);
4207: }
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Forward and backward
   substitution for 4x4 blocks with no row/column permutation, where all
   accumulation is done in MatScalar and the storage of xx doubles as the
   MatScalar work vector.

   Input:
     A  - factored matrix
     bb - right-hand side
   Output:
     xx - solution

   Always returns 0.

   Storage as read below: every block is 16 consecutive MatScalars with entry
   (r,c) at v[r+4*c].  In block row i the L part is ai[i]..diag[i]-1, the
   diagonal block is at diag[i] and the U part is diag[i]+1..ai[i+1]-1.

   NOTE(review): the values returned by VecGetArray, VecRestoreArray and
   PetscLogFlops are discarded and ierr is never assigned.
   NOTE(review): b[0..3] and t[0..3] are accessed without checking that n > 0.
*/
4211: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4212: {
4213: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4214: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4215: PetscErrorCode ierr;
4216: const MatScalar *aa=a->a;
4217: const PetscScalar *b;
4218: PetscScalar *x;
4221: VecGetArray(bb,(PetscScalar**)&b);
4222: VecGetArray(xx,&x);
4224: {
4225: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
4226: const MatScalar *v;
/* t views the memory of x as an array of MatScalar; the forward sweep stores
   its results through t.  Presumably MatScalar is no wider than PetscScalar,
   otherwise t[4*n-1] would lie past the end of x - TODO confirm. */
4227: MatScalar *t=(MatScalar *)x;
4228: PetscInt jdx,idt,idx,nz,i,ai16;
4229: const PetscInt *vi;
4231: /* forward solve the lower triangular */
4232: idx = 0;
/* block row 0 has no L entries: copy the right-hand side, demoted to MatScalar */
4233: t[0] = (MatScalar)b[0];
4234: t[1] = (MatScalar)b[1];
4235: t[2] = (MatScalar)b[2];
4236: t[3] = (MatScalar)b[3];
4237: for (i=1; i<n; i++) {
4238: v = aa + 16*ai[i];
4239: vi = aj + ai[i];
4240: nz = diag[i] - ai[i];
4241: idx += 4;
4242: s1 = (MatScalar)b[idx];
4243: s2 = (MatScalar)b[1+idx];
4244: s3 = (MatScalar)b[2+idx];
4245: s4 = (MatScalar)b[3+idx];
4246: while (nz--) {
4247: jdx = 4*(*vi++);
4248: x1 = t[jdx];
4249: x2 = t[1+jdx];
4250: x3 = t[2+jdx];
4251: x4 = t[3+jdx];
4252: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4253: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4254: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4255: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4256: v += 16;
4257: }
4258: t[idx] = s1;
4259: t[1+idx] = s2;
4260: t[2+idx] = s3;
4261: t[3+idx] = s4;
4262: }
4263: /* backward solve the upper triangular */
/* Rows are finished from the last to the first.  The partial sums of the
   current row are still in MatScalar form and are read through t; rows that
   are already finished were stored through x as PetscScalar (see the stores
   at the end of this loop body) and are therefore read back through x.
   Because t and x share memory, the order of these reads and writes must be
   kept exactly as it is. */
4264: idt = 4*(n-1);
4265: for (i=n-1; i>=0; i--){
4266: ai16 = 16*diag[i];
4267: v = aa + ai16 + 16;
4268: vi = aj + diag[i] + 1;
4269: nz = ai[i+1] - diag[i] - 1;
4270: s1 = t[idt];
4271: s2 = t[1+idt];
4272: s3 = t[2+idt];
4273: s4 = t[3+idt];
4274: while (nz--) {
4275: idx = 4*(*vi++);
4276: x1 = (MatScalar)x[idx];
4277: x2 = (MatScalar)x[1+idx];
4278: x3 = (MatScalar)x[2+idx];
4279: x4 = (MatScalar)x[3+idx];
4280: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4281: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4282: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4283: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4284: v += 16;
4285: }
/* multiply by the diagonal block of this row and store the promoted result in x */
4286: v = aa + ai16;
4287: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
4288: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
4289: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4290: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4291: idt -= 4;
4292: }
4293: }
4295: VecRestoreArray(bb,(PetscScalar**)&b);
4296: VecRestoreArray(xx,&x);
4297: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4298: return(0);
4299: }
4301: #if defined (PETSC_HAVE_SSE)
4303: #include PETSC_HAVE_SSE
4306: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4307: {
4308: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4309: unsigned short *aj=(unsigned short *)a->j;
4311: int *ai=a->i,n=a->mbs,*diag = a->diag;
4312: MatScalar *aa=a->a;
4313: PetscScalar *x,*b;
4316: SSE_SCOPE_BEGIN;
4317: /*
4318: Note: This code currently uses demotion of double
4319: to float when performing the mixed-mode computation.
4320: This may not be numerically reasonable for all applications.
4321: */
4322: PREFETCH_NTA(aa+16*ai[1]);
4324: VecGetArray(bb,&b);
4325: VecGetArray(xx,&x);
4326: {
4327: /* x will first be computed in single precision then promoted inplace to double */
4328: MatScalar *v,*t=(MatScalar *)x;
4329: int nz,i,idt,ai16;
4330: unsigned int jdx,idx;
4331: unsigned short *vi;
4332: /* Forward solve the lower triangular factor. */
4334: /* First block is the identity. */
4335: idx = 0;
4336: CONVERT_DOUBLE4_FLOAT4(t,b);
4337: v = aa + 16*((unsigned int)ai[1]);
4339: for (i=1; i<n;) {
4340: PREFETCH_NTA(&v[8]);
4341: vi = aj + ai[i];
4342: nz = diag[i] - ai[i];
4343: idx += 4;
4345: /* Demote RHS from double to float. */
4346: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4347: LOAD_PS(&t[idx],XMM7);
4349: while (nz--) {
4350: PREFETCH_NTA(&v[16]);
4351: jdx = 4*((unsigned int)(*vi++));
4352:
4353: /* 4x4 Matrix-Vector product with negative accumulation: */
4354: SSE_INLINE_BEGIN_2(&t[jdx],v)
4355: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4357: /* First Column */
4358: SSE_COPY_PS(XMM0,XMM6)
4359: SSE_SHUFFLE(XMM0,XMM0,0x00)
4360: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4361: SSE_SUB_PS(XMM7,XMM0)
4363: /* Second Column */
4364: SSE_COPY_PS(XMM1,XMM6)
4365: SSE_SHUFFLE(XMM1,XMM1,0x55)
4366: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4367: SSE_SUB_PS(XMM7,XMM1)
4369: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4370:
4371: /* Third Column */
4372: SSE_COPY_PS(XMM2,XMM6)
4373: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4374: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4375: SSE_SUB_PS(XMM7,XMM2)
4377: /* Fourth Column */
4378: SSE_COPY_PS(XMM3,XMM6)
4379: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4380: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4381: SSE_SUB_PS(XMM7,XMM3)
4382: SSE_INLINE_END_2
4383:
4384: v += 16;
4385: }
4386: v = aa + 16*ai[++i];
4387: PREFETCH_NTA(v);
4388: STORE_PS(&t[idx],XMM7);
4389: }
4391: /* Backward solve the upper triangular factor.*/
4393: idt = 4*(n-1);
4394: ai16 = 16*diag[n-1];
4395: v = aa + ai16 + 16;
4396: for (i=n-1; i>=0;){
4397: PREFETCH_NTA(&v[8]);
4398: vi = aj + diag[i] + 1;
4399: nz = ai[i+1] - diag[i] - 1;
4400:
4401: LOAD_PS(&t[idt],XMM7);
4403: while (nz--) {
4404: PREFETCH_NTA(&v[16]);
4405: idx = 4*((unsigned int)(*vi++));
4407: /* 4x4 Matrix-Vector Product with negative accumulation: */
4408: SSE_INLINE_BEGIN_2(&t[idx],v)
4409: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4411: /* First Column */
4412: SSE_COPY_PS(XMM0,XMM6)
4413: SSE_SHUFFLE(XMM0,XMM0,0x00)
4414: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4415: SSE_SUB_PS(XMM7,XMM0)
4417: /* Second Column */
4418: SSE_COPY_PS(XMM1,XMM6)
4419: SSE_SHUFFLE(XMM1,XMM1,0x55)
4420: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4421: SSE_SUB_PS(XMM7,XMM1)
4423: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4424:
4425: /* Third Column */
4426: SSE_COPY_PS(XMM2,XMM6)
4427: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4428: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4429: SSE_SUB_PS(XMM7,XMM2)
4431: /* Fourth Column */
4432: SSE_COPY_PS(XMM3,XMM6)
4433: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4434: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4435: SSE_SUB_PS(XMM7,XMM3)
4436: SSE_INLINE_END_2
4437: v += 16;
4438: }
4439: v = aa + ai16;
4440: ai16 = 16*diag[--i];
4441: PREFETCH_NTA(aa+ai16+16);
4442: /*
4443: Scale the result by the diagonal 4x4 block,
4444: which was inverted as part of the factorization
4445: */
4446: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4447: /* First Column */
4448: SSE_COPY_PS(XMM0,XMM7)
4449: SSE_SHUFFLE(XMM0,XMM0,0x00)
4450: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4452: /* Second Column */
4453: SSE_COPY_PS(XMM1,XMM7)
4454: SSE_SHUFFLE(XMM1,XMM1,0x55)
4455: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4456: SSE_ADD_PS(XMM0,XMM1)
4458: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4459:
4460: /* Third Column */
4461: SSE_COPY_PS(XMM2,XMM7)
4462: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4463: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4464: SSE_ADD_PS(XMM0,XMM2)
4466: /* Fourth Column */
4467: SSE_COPY_PS(XMM3,XMM7)
4468: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4469: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4470: SSE_ADD_PS(XMM0,XMM3)
4472: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4473: SSE_INLINE_END_3
4475: v = aa + ai16 + 16;
4476: idt -= 4;
4477: }
4479: /* Convert t from single precision back to double precision (inplace)*/
4480: idt = 4*(n-1);
4481: for (i=n-1;i>=0;i--) {
4482: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4483: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4484: PetscScalar *xtemp=&x[idt];
4485: MatScalar *ttemp=&t[idt];
4486: xtemp[3] = (PetscScalar)ttemp[3];
4487: xtemp[2] = (PetscScalar)ttemp[2];
4488: xtemp[1] = (PetscScalar)ttemp[1];
4489: xtemp[0] = (PetscScalar)ttemp[0];
4490: idt -= 4;
4491: }
4493: } /* End of artificial scope. */
4494: VecRestoreArray(bb,&b);
4495: VecRestoreArray(xx,&x);
4496: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4497: SSE_SCOPE_END;
4498: return(0);
4499: }
4503: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4504: {
4505: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4506: int *aj=a->j;
4508: int *ai=a->i,n=a->mbs,*diag = a->diag;
4509: MatScalar *aa=a->a;
4510: PetscScalar *x,*b;
4513: SSE_SCOPE_BEGIN;
4514: /*
4515: Note: This code currently uses demotion of double
4516: to float when performing the mixed-mode computation.
4517: This may not be numerically reasonable for all applications.
4518: */
4519: PREFETCH_NTA(aa+16*ai[1]);
4521: VecGetArray(bb,&b);
4522: VecGetArray(xx,&x);
4523: {
4524: /* x will first be computed in single precision then promoted inplace to double */
4525: MatScalar *v,*t=(MatScalar *)x;
4526: int nz,i,idt,ai16;
4527: int jdx,idx;
4528: int *vi;
4529: /* Forward solve the lower triangular factor. */
4531: /* First block is the identity. */
4532: idx = 0;
4533: CONVERT_DOUBLE4_FLOAT4(t,b);
4534: v = aa + 16*ai[1];
4536: for (i=1; i<n;) {
4537: PREFETCH_NTA(&v[8]);
4538: vi = aj + ai[i];
4539: nz = diag[i] - ai[i];
4540: idx += 4;
4542: /* Demote RHS from double to float. */
4543: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4544: LOAD_PS(&t[idx],XMM7);
4546: while (nz--) {
4547: PREFETCH_NTA(&v[16]);
4548: jdx = 4*(*vi++);
4549: /* jdx = *vi++; */
4550:
4551: /* 4x4 Matrix-Vector product with negative accumulation: */
4552: SSE_INLINE_BEGIN_2(&t[jdx],v)
4553: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4555: /* First Column */
4556: SSE_COPY_PS(XMM0,XMM6)
4557: SSE_SHUFFLE(XMM0,XMM0,0x00)
4558: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4559: SSE_SUB_PS(XMM7,XMM0)
4561: /* Second Column */
4562: SSE_COPY_PS(XMM1,XMM6)
4563: SSE_SHUFFLE(XMM1,XMM1,0x55)
4564: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4565: SSE_SUB_PS(XMM7,XMM1)
4567: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4568:
4569: /* Third Column */
4570: SSE_COPY_PS(XMM2,XMM6)
4571: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4572: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4573: SSE_SUB_PS(XMM7,XMM2)
4575: /* Fourth Column */
4576: SSE_COPY_PS(XMM3,XMM6)
4577: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4578: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4579: SSE_SUB_PS(XMM7,XMM3)
4580: SSE_INLINE_END_2
4581:
4582: v += 16;
4583: }
4584: v = aa + 16*ai[++i];
4585: PREFETCH_NTA(v);
4586: STORE_PS(&t[idx],XMM7);
4587: }
4589: /* Backward solve the upper triangular factor.*/
4591: idt = 4*(n-1);
4592: ai16 = 16*diag[n-1];
4593: v = aa + ai16 + 16;
4594: for (i=n-1; i>=0;){
4595: PREFETCH_NTA(&v[8]);
4596: vi = aj + diag[i] + 1;
4597: nz = ai[i+1] - diag[i] - 1;
4598:
4599: LOAD_PS(&t[idt],XMM7);
4601: while (nz--) {
4602: PREFETCH_NTA(&v[16]);
4603: idx = 4*(*vi++);
4604: /* idx = *vi++; */
4606: /* 4x4 Matrix-Vector Product with negative accumulation: */
4607: SSE_INLINE_BEGIN_2(&t[idx],v)
4608: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4610: /* First Column */
4611: SSE_COPY_PS(XMM0,XMM6)
4612: SSE_SHUFFLE(XMM0,XMM0,0x00)
4613: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4614: SSE_SUB_PS(XMM7,XMM0)
4616: /* Second Column */
4617: SSE_COPY_PS(XMM1,XMM6)
4618: SSE_SHUFFLE(XMM1,XMM1,0x55)
4619: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4620: SSE_SUB_PS(XMM7,XMM1)
4622: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4623:
4624: /* Third Column */
4625: SSE_COPY_PS(XMM2,XMM6)
4626: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4627: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4628: SSE_SUB_PS(XMM7,XMM2)
4630: /* Fourth Column */
4631: SSE_COPY_PS(XMM3,XMM6)
4632: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4633: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4634: SSE_SUB_PS(XMM7,XMM3)
4635: SSE_INLINE_END_2
4636: v += 16;
4637: }
4638: v = aa + ai16;
4639: ai16 = 16*diag[--i];
4640: PREFETCH_NTA(aa+ai16+16);
4641: /*
4642: Scale the result by the diagonal 4x4 block,
4643: which was inverted as part of the factorization
4644: */
4645: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4646: /* First Column */
4647: SSE_COPY_PS(XMM0,XMM7)
4648: SSE_SHUFFLE(XMM0,XMM0,0x00)
4649: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4651: /* Second Column */
4652: SSE_COPY_PS(XMM1,XMM7)
4653: SSE_SHUFFLE(XMM1,XMM1,0x55)
4654: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4655: SSE_ADD_PS(XMM0,XMM1)
4657: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4658:
4659: /* Third Column */
4660: SSE_COPY_PS(XMM2,XMM7)
4661: SSE_SHUFFLE(XMM2,XMM2,0xAA)
4662: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4663: SSE_ADD_PS(XMM0,XMM2)
4665: /* Fourth Column */
4666: SSE_COPY_PS(XMM3,XMM7)
4667: SSE_SHUFFLE(XMM3,XMM3,0xFF)
4668: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4669: SSE_ADD_PS(XMM0,XMM3)
4671: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4672: SSE_INLINE_END_3
4674: v = aa + ai16 + 16;
4675: idt -= 4;
4676: }
4678: /* Convert t from single precision back to double precision (inplace)*/
4679: idt = 4*(n-1);
4680: for (i=n-1;i>=0;i--) {
4681: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4682: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4683: PetscScalar *xtemp=&x[idt];
4684: MatScalar *ttemp=&t[idt];
4685: xtemp[3] = (PetscScalar)ttemp[3];
4686: xtemp[2] = (PetscScalar)ttemp[2];
4687: xtemp[1] = (PetscScalar)ttemp[1];
4688: xtemp[0] = (PetscScalar)ttemp[0];
4689: idt -= 4;
4690: }
4692: } /* End of artificial scope. */
4693: VecRestoreArray(bb,&b);
4694: VecRestoreArray(xx,&x);
4695: PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);
4696: SSE_SCOPE_END;
4697: return(0);
4698: }
4700: #endif
/*
   MatSolve_SeqBAIJ_3_inplace - Forward and backward triangular solves for
   block size 3, using the factor stored in A together with the diagonal
   positions in a->diag.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   The right-hand side is read through the row index set a->row and the
   result is written through the column index set a->col; a->solve_work is
   used as the intermediate vector t.  Each 3x3 block is addressed column by
   column: v[0],v[3],v[6] multiply x1,x2,x3 for the first component.

   NOTE(review): ierr is declared but the return values of the Vec/IS calls
   are not checked in the statements shown here.
   NOTE(review): there is no guard for n == 0 (MatSolve_SeqBAIJ_1_inplace
   has one); the first block of b is read unconditionally.
*/
4704: PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4705: {
4706: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
4707: IS iscol=a->col,isrow=a->row;
4708: PetscErrorCode ierr;
4709: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4710: PetscInt i,nz,idx,idt,idc;
4711: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4712: const MatScalar *aa=a->a,*v;
4713: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
4714: const PetscScalar *b;
4717: VecGetArray(bb,(PetscScalar**)&b);
4718: VecGetArray(xx,&x);
4719: t = a->solve_work;
/* r walks the row indices forwards; c starts at the last column index and is walked backwards */
4721: ISGetIndices(isrow,&rout); r = rout;
4722: ISGetIndices(iscol,&cout); c = cout + (n-1);
4724: /* forward solve the lower triangular */
4725: idx = 3*(*r++);
4726: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4727: for (i=1; i<n; i++) {
4728: v = aa + 9*ai[i];
4729: vi = aj + ai[i];
/* number of blocks of row i stored before the diagonal position */
4730: nz = diag[i] - ai[i];
4731: idx = 3*(*r++);
4732: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4733: while (nz--) {
4734: idx = 3*(*vi++);
4735: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4736: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4737: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4738: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4739: v += 9;
4740: }
4741: idx = 3*i;
4742: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4743: }
4744: /* backward solve the upper triangular */
4745: for (i=n-1; i>=0; i--){
4746: v = aa + 9*diag[i] + 9;
4747: vi = aj + diag[i] + 1;
/* number of blocks of row i stored after the diagonal position */
4748: nz = ai[i+1] - diag[i] - 1;
4749: idt = 3*i;
4750: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4751: while (nz--) {
4752: idx = 3*(*vi++);
4753: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4754: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4755: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4756: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4757: v += 9;
4758: }
4759: idc = 3*(*c--);
/* multiply by the block stored at diag[i]; presumably the inverted diagonal block -- confirm against the factorization routine */
4760: v = aa + 9*diag[i];
4761: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4762: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4763: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4764: }
4765: ISRestoreIndices(isrow,&rout);
4766: ISRestoreIndices(iscol,&cout);
4767: VecRestoreArray(bb,(PetscScalar**)&b);
4768: VecRestoreArray(xx,&x);
4769: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4770: return(0);
4771: }
/*
   MatSolve_SeqBAIJ_3 - Forward and backward triangular solves for block
   size 3, reading the upper-triangular part through a->diag (adiag) in
   decreasing-index order.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   Row i of the lower part occupies block indices ai[i] .. ai[i+1]-1.
   Row i of the upper part occupies adiag[i+1]+1 .. adiag[i]-1, and the
   block at index adiag[i] is applied last (v points at it once the inner
   loop of the backward solve has finished).

   The right-hand side is read through a->row, the result is written through
   a->col, and a->solve_work is the intermediate vector t.

   NOTE(review): ierr is declared but the return values of the Vec/IS calls
   are not checked in the statements shown here.
*/
4775: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4776: {
4777: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
4778: IS iscol=a->col,isrow=a->row;
4779: PetscErrorCode ierr;
4780: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4781: PetscInt i,nz,idx,idt,idc,m;
4782: const PetscInt *r,*c,*rout,*cout;
4783: const MatScalar *aa=a->a,*v;
4784: PetscScalar *x,s1,s2,s3,x1,x2,x3,*t;
4785: const PetscScalar *b;
4788: VecGetArray(bb,(PetscScalar**)&b);
4789: VecGetArray(xx,&x);
4790: t = a->solve_work;
4792: ISGetIndices(isrow,&rout); r = rout;
4793: ISGetIndices(iscol,&cout); c = cout;
4795: /* forward solve the lower triangular */
4796: idx = 3*r[0];
4797: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4798: for (i=1; i<n; i++) {
4799: v = aa + 9*ai[i];
4800: vi = aj + ai[i];
/* every block between ai[i] and ai[i+1] is used in the forward solve */
4801: nz = ai[i+1] - ai[i];
4802: idx = 3*r[i];
4803: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4804: for(m=0;m<nz;m++){
4805: idx = 3*vi[m];
4806: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4807: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4808: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4809: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4810: v += 9;
4811: }
4812: idx = 3*i;
4813: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4814: }
4815: /* backward solve the upper triangular */
4816: for (i=n-1; i>=0; i--){
4817: v = aa + 9*(adiag[i+1]+1);
4818: vi = aj + adiag[i+1]+1;
/* blocks strictly between adiag[i+1] and adiag[i] */
4819: nz = adiag[i] - adiag[i+1] - 1;
4820: idt = 3*i;
4821: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4822: for(m=0;m<nz;m++){
4823: idx = 3*vi[m];
4824: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4825: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4826: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4827: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4828: v += 9;
4829: }
4830: idc = 3*c[i];
/* v now points at block index adiag[i]; presumably the inverted diagonal block -- confirm against the factorization routine */
4831: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4832: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4833: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4834: }
4835: ISRestoreIndices(isrow,&rout);
4836: ISRestoreIndices(iscol,&cout);
4837: VecRestoreArray(bb,(PetscScalar**)&b);
4838: VecRestoreArray(xx,&x);
4839: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4840: return(0);
4841: }
4843: /*
4844: Special case where the matrix was ILU(0) factored in the natural
4845: ordering. This eliminates the need for the column and row permutation.
4846: */
/*
   MatSolve_SeqBAIJ_3_NaturalOrdering_inplace - Block-size-3 forward and
   backward solves without row/column index sets: block i of bb is read
   directly and block i of xx is written directly, with xx doubling as the
   intermediate vector.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   Each 3x3 block is addressed column by column (v[0],v[3],v[6] multiply
   x1,x2,x3 for the first component).

   NOTE(review): ierr is declared but the return values of the Vec calls
   are not checked in the statements shown here.
*/
4849: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4850: {
4851: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4852: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
4853: PetscErrorCode ierr;
4854: const PetscInt *diag = a->diag,*vi;
4855: const MatScalar *aa=a->a,*v;
4856: PetscScalar *x,s1,s2,s3,x1,x2,x3;
4857: const PetscScalar *b;
4858: PetscInt jdx,idt,idx,nz,i;
4861: VecGetArray(bb,(PetscScalar**)&b);
4862: VecGetArray(xx,&x);
4864: /* forward solve the lower triangular */
4865: idx = 0;
4866: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
4867: for (i=1; i<n; i++) {
4868: v = aa + 9*ai[i];
4869: vi = aj + ai[i];
/* number of blocks of row i stored before the diagonal position */
4870: nz = diag[i] - ai[i];
/* idx advances by one block per row, so idx == 3*i here */
4871: idx += 3;
4872: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4873: while (nz--) {
4874: jdx = 3*(*vi++);
4875: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4876: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4879: v += 9;
4880: }
4881: x[idx] = s1;
4882: x[1+idx] = s2;
4883: x[2+idx] = s3;
4884: }
4885: /* backward solve the upper triangular */
4886: for (i=n-1; i>=0; i--){
4887: v = aa + 9*diag[i] + 9;
4888: vi = aj + diag[i] + 1;
/* number of blocks of row i stored after the diagonal position */
4889: nz = ai[i+1] - diag[i] - 1;
4890: idt = 3*i;
4891: s1 = x[idt]; s2 = x[1+idt];
4892: s3 = x[2+idt];
4893: while (nz--) {
4894: idx = 3*(*vi++);
4895: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
4896: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4897: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4898: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4899: v += 9;
4900: }
/* multiply by the block stored at diag[i]; presumably the inverted diagonal block -- confirm against the factorization routine */
4901: v = aa + 9*diag[i];
4902: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4903: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4904: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4905: }
4907: VecRestoreArray(bb,(PetscScalar**)&b);
4908: VecRestoreArray(xx,&x);
4909: PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);
4910: return(0);
4911: }
/*
   MatSolve_SeqBAIJ_3_NaturalOrdering - Forward and backward solves without
   row/column index sets, reading the upper-triangular part through a->diag
   (adiag) in decreasing-index order.  xx doubles as the intermediate vector.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   Offsets are computed from bs (A->rmap->bs) and bs2 (a->bs2), but the
   arithmetic inside the loops is written out for 3 components and 9 block
   entries, so the routine is only meaningful for bs == 3 and bs2 == 9.

   NOTE(review): ierr is declared but the return values of the Vec calls
   are not checked in the statements shown here.
*/
4915: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4916: {
4917: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
4918: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4919: PetscErrorCode ierr;
4920: PetscInt i,k,nz,idx,jdx,idt;
4921: const PetscInt bs = A->rmap->bs,bs2 = a->bs2;
4922: const MatScalar *aa=a->a,*v;
4923: PetscScalar *x;
4924: const PetscScalar *b;
4925: PetscScalar s1,s2,s3,x1,x2,x3;
4928: VecGetArray(bb,(PetscScalar**)&b);
4929: VecGetArray(xx,&x);
4930: /* forward solve the lower triangular */
4931: idx = 0;
4932: x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4933: for (i=1; i<n; i++) {
4934: v = aa + bs2*ai[i];
4935: vi = aj + ai[i];
/* every block between ai[i] and ai[i+1] is used in the forward solve */
4936: nz = ai[i+1] - ai[i];
4937: idx = bs*i;
4938: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939: for(k=0;k<nz;k++){
4940: jdx = bs*vi[k];
4941: x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4942: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945:
4946: v += bs2;
4947: }
4949: x[idx] = s1;
4950: x[1+idx] = s2;
4951: x[2+idx] = s3;
4952: }
4953:
4954: /* backward solve the upper triangular */
4955: for (i=n-1; i>=0; i--){
4956: v = aa + bs2*(adiag[i+1]+1);
4957: vi = aj + adiag[i+1]+1;
/* blocks strictly between adiag[i+1] and adiag[i] */
4958: nz = adiag[i] - adiag[i+1]-1;
4959: idt = bs*i;
4960: s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];
4961:
4962: for(k=0;k<nz;k++){
4963: idx = bs*vi[k];
4964: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
4965: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4969: v += bs2;
4970: }
4971: /* x = inv_diagonal*x; v now points at block index adiag[i] */
4972: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
4973: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4974: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4976: }
4978: VecRestoreArray(bb,(PetscScalar**)&b);
4979: VecRestoreArray(xx,&x);
4980: PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);
4981: return(0);
4982: }
/*
   MatSolve_SeqBAIJ_2_inplace - Forward and backward triangular solves for
   block size 2, using the factor stored in A together with the diagonal
   positions in a->diag.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   The right-hand side is read through the row index set a->row and the
   result is written through the column index set a->col; a->solve_work is
   used as the intermediate vector t.  Each 2x2 block is addressed column by
   column: v[0],v[2] multiply x1,x2 for the first component.

   NOTE(review): ierr is declared but the return values of the Vec/IS calls
   are not checked in the statements shown here.
   NOTE(review): there is no guard for n == 0 (MatSolve_SeqBAIJ_1_inplace
   has one); the first block of b is read unconditionally.
*/
4986: PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4987: {
4988: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
4989: IS iscol=a->col,isrow=a->row;
4990: PetscErrorCode ierr;
4991: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4992: PetscInt i,nz,idx,idt,idc;
4993: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4994: const MatScalar *aa=a->a,*v;
4995: PetscScalar *x,s1,s2,x1,x2,*t;
4996: const PetscScalar *b;
4999: VecGetArray(bb,(PetscScalar**)&b);
5000: VecGetArray(xx,&x);
5001: t = a->solve_work;
/* r walks the row indices forwards; c starts at the last column index and is walked backwards */
5003: ISGetIndices(isrow,&rout); r = rout;
5004: ISGetIndices(iscol,&cout); c = cout + (n-1);
5006: /* forward solve the lower triangular */
5007: idx = 2*(*r++);
5008: t[0] = b[idx]; t[1] = b[1+idx];
5009: for (i=1; i<n; i++) {
5010: v = aa + 4*ai[i];
5011: vi = aj + ai[i];
/* number of blocks of row i stored before the diagonal position */
5012: nz = diag[i] - ai[i];
5013: idx = 2*(*r++);
5014: s1 = b[idx]; s2 = b[1+idx];
5015: while (nz--) {
5016: idx = 2*(*vi++);
5017: x1 = t[idx]; x2 = t[1+idx];
5018: s1 -= v[0]*x1 + v[2]*x2;
5019: s2 -= v[1]*x1 + v[3]*x2;
5020: v += 4;
5021: }
5022: idx = 2*i;
5023: t[idx] = s1; t[1+idx] = s2;
5024: }
5025: /* backward solve the upper triangular */
5026: for (i=n-1; i>=0; i--){
5027: v = aa + 4*diag[i] + 4;
5028: vi = aj + diag[i] + 1;
/* number of blocks of row i stored after the diagonal position */
5029: nz = ai[i+1] - diag[i] - 1;
5030: idt = 2*i;
5031: s1 = t[idt]; s2 = t[1+idt];
5032: while (nz--) {
5033: idx = 2*(*vi++);
5034: x1 = t[idx]; x2 = t[1+idx];
5035: s1 -= v[0]*x1 + v[2]*x2;
5036: s2 -= v[1]*x1 + v[3]*x2;
5037: v += 4;
5038: }
5039: idc = 2*(*c--);
/* multiply by the block stored at diag[i]; presumably the inverted diagonal block -- confirm against the factorization routine */
5040: v = aa + 4*diag[i];
5041: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
5042: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5043: }
5044: ISRestoreIndices(isrow,&rout);
5045: ISRestoreIndices(iscol,&cout);
5046: VecRestoreArray(bb,(PetscScalar**)&b);
5047: VecRestoreArray(xx,&x);
5048: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5049: return(0);
5050: }
/*
   MatSolve_SeqBAIJ_2 - Forward and backward triangular solves for block
   size 2, reading the upper-triangular part through a->diag (adiag) in
   decreasing-index order.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   Row i of the lower part occupies block indices ai[i] .. ai[i+1]-1.
   Row i of the upper part occupies adiag[i+1]+1 .. adiag[i]-1, and the
   block at index adiag[i] is applied last (v points at it once the inner
   loop of the backward solve has finished).

   The right-hand side is read through a->row, the result is written through
   a->col, and a->solve_work is the intermediate vector t.

   NOTE(review): ierr is declared but the return values of the Vec/IS calls
   are not checked in the statements shown here.
*/
5054: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
5055: {
5056: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
5057: IS iscol=a->col,isrow=a->row;
5058: PetscErrorCode ierr;
5059: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5060: PetscInt i,nz,idx,jdx,idt,idc,m;
5061: const PetscInt *r,*c,*rout,*cout;
5062: const MatScalar *aa=a->a,*v;
5063: PetscScalar *x,s1,s2,x1,x2,*t;
5064: const PetscScalar *b;
5067: VecGetArray(bb,(PetscScalar**)&b);
5068: VecGetArray(xx,&x);
5069: t = a->solve_work;
5071: ISGetIndices(isrow,&rout); r = rout;
5072: ISGetIndices(iscol,&cout); c = cout;
5074: /* forward solve the lower triangular */
5075: idx = 2*r[0];
5076: t[0] = b[idx]; t[1] = b[1+idx];
5077: for (i=1; i<n; i++) {
5078: v = aa + 4*ai[i];
5079: vi = aj + ai[i];
/* every block between ai[i] and ai[i+1] is used in the forward solve */
5080: nz = ai[i+1] - ai[i];
5081: idx = 2*r[i];
5082: s1 = b[idx]; s2 = b[1+idx];
5083: for(m=0;m<nz;m++){
5084: jdx = 2*vi[m];
5085: x1 = t[jdx]; x2 = t[1+jdx];
5086: s1 -= v[0]*x1 + v[2]*x2;
5087: s2 -= v[1]*x1 + v[3]*x2;
5088: v += 4;
5089: }
5090: idx = 2*i;
5091: t[idx] = s1; t[1+idx] = s2;
5092: }
5093: /* backward solve the upper triangular */
5094: for (i=n-1; i>=0; i--){
5095: v = aa + 4*(adiag[i+1]+1);
5096: vi = aj + adiag[i+1]+1;
/* blocks strictly between adiag[i+1] and adiag[i] */
5097: nz = adiag[i] - adiag[i+1] - 1;
5098: idt = 2*i;
5099: s1 = t[idt]; s2 = t[1+idt];
5100: for(m=0;m<nz;m++){
5101: idx = 2*vi[m];
5102: x1 = t[idx]; x2 = t[1+idx];
5103: s1 -= v[0]*x1 + v[2]*x2;
5104: s2 -= v[1]*x1 + v[3]*x2;
5105: v += 4;
5106: }
5107: idc = 2*c[i];
/* v now points at block index adiag[i]; presumably the inverted diagonal block -- confirm against the factorization routine */
5108: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
5109: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
5110: }
5111: ISRestoreIndices(isrow,&rout);
5112: ISRestoreIndices(iscol,&cout);
5113: VecRestoreArray(bb,(PetscScalar**)&b);
5114: VecRestoreArray(xx,&x);
5115: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5116: return(0);
5117: }
5119: /*
5120: Special case where the matrix was ILU(0) factored in the natural
5121: ordering. This eliminates the need for the column and row permutation.
5122: */
/*
   MatSolve_SeqBAIJ_2_NaturalOrdering_inplace - Block-size-2 forward and
   backward solves without row/column index sets: block i of bb is read
   directly and block i of xx is written directly, with xx doubling as the
   intermediate vector.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   Each 2x2 block is addressed column by column (v[0],v[2] multiply x1,x2
   for the first component).

   NOTE(review): ierr is declared but the return values of the Vec calls
   are not checked in the statements shown here.
*/
5125: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5126: {
5127: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
5128: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5129: PetscErrorCode ierr;
5130: const MatScalar *aa=a->a,*v;
5131: PetscScalar *x,s1,s2,x1,x2;
5132: const PetscScalar *b;
5133: PetscInt jdx,idt,idx,nz,i;
5136: VecGetArray(bb,(PetscScalar**)&b);
5137: VecGetArray(xx,&x);
5139: /* forward solve the lower triangular */
5140: idx = 0;
5141: x[0] = b[0]; x[1] = b[1];
5142: for (i=1; i<n; i++) {
5143: v = aa + 4*ai[i];
5144: vi = aj + ai[i];
/* number of blocks of row i stored before the diagonal position */
5145: nz = diag[i] - ai[i];
/* idx advances by one block per row, so idx == 2*i here */
5146: idx += 2;
5147: s1 = b[idx];s2 = b[1+idx];
5148: while (nz--) {
5149: jdx = 2*(*vi++);
5150: x1 = x[jdx];x2 = x[1+jdx];
5151: s1 -= v[0]*x1 + v[2]*x2;
5152: s2 -= v[1]*x1 + v[3]*x2;
5153: v += 4;
5154: }
5155: x[idx] = s1;
5156: x[1+idx] = s2;
5157: }
5158: /* backward solve the upper triangular */
5159: for (i=n-1; i>=0; i--){
5160: v = aa + 4*diag[i] + 4;
5161: vi = aj + diag[i] + 1;
/* number of blocks of row i stored after the diagonal position */
5162: nz = ai[i+1] - diag[i] - 1;
5163: idt = 2*i;
5164: s1 = x[idt]; s2 = x[1+idt];
5165: while (nz--) {
5166: idx = 2*(*vi++);
5167: x1 = x[idx]; x2 = x[1+idx];
5168: s1 -= v[0]*x1 + v[2]*x2;
5169: s2 -= v[1]*x1 + v[3]*x2;
5170: v += 4;
5171: }
/* multiply by the block stored at diag[i]; presumably the inverted diagonal block -- confirm against the factorization routine */
5172: v = aa + 4*diag[i];
5173: x[idt] = v[0]*s1 + v[2]*s2;
5174: x[1+idt] = v[1]*s1 + v[3]*s2;
5175: }
5177: VecRestoreArray(bb,(PetscScalar**)&b);
5178: VecRestoreArray(xx,&x);
5179: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5180: return(0);
5181: }
/*
   MatSolve_SeqBAIJ_2_NaturalOrdering - Block-size-2 forward and backward
   solves without row/column index sets, reading the upper-triangular part
   through a->diag (adiag) in decreasing-index order.  xx doubles as the
   intermediate vector.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   Row i of the lower part occupies block indices ai[i] .. ai[i+1]-1.
   Row i of the upper part occupies adiag[i+1]+1 .. adiag[i]-1, and the
   block at index adiag[i] is applied last.

   NOTE(review): ierr is declared but the return values of the Vec calls
   are not checked in the statements shown here.
*/
5185: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5186: {
5187: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
5188: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5189: PetscInt i,k,nz,idx,idt,jdx;
5190: PetscErrorCode ierr;
5191: const MatScalar *aa=a->a,*v;
5192: PetscScalar *x,s1,s2,x1,x2;
5193: const PetscScalar *b;
5194:
5196: VecGetArray(bb,(PetscScalar**)&b);
5197: VecGetArray(xx,&x);
5198: /* forward solve the lower triangular */
5199: idx = 0;
5200: x[0] = b[idx]; x[1] = b[1+idx];
5201: for (i=1; i<n; i++) {
5202: v = aa + 4*ai[i];
5203: vi = aj + ai[i];
/* every block between ai[i] and ai[i+1] is used in the forward solve */
5204: nz = ai[i+1] - ai[i];
5205: idx = 2*i;
5206: s1 = b[idx];s2 = b[1+idx];
5207: for(k=0;k<nz;k++){
5208: jdx = 2*vi[k];
5209: x1 = x[jdx];x2 = x[1+jdx];
5210: s1 -= v[0]*x1 + v[2]*x2;
5211: s2 -= v[1]*x1 + v[3]*x2;
5212: v += 4;
5213: }
5214: x[idx] = s1;
5215: x[1+idx] = s2;
5216: }
5217:
5218: /* backward solve the upper triangular */
5219: for (i=n-1; i>=0; i--){
5220: v = aa + 4*(adiag[i+1]+1);
5221: vi = aj + adiag[i+1]+1;
/* blocks strictly between adiag[i+1] and adiag[i] */
5222: nz = adiag[i] - adiag[i+1]-1;
5223: idt = 2*i;
5224: s1 = x[idt]; s2 = x[1+idt];
5225: for(k=0;k<nz;k++){
5226: idx = 2*vi[k];
5227: x1 = x[idx]; x2 = x[1+idx];
5228: s1 -= v[0]*x1 + v[2]*x2;
5229: s2 -= v[1]*x1 + v[3]*x2;
5230: v += 4;
5231: }
5232: /* x = inv_diagonal*x; v now points at block index adiag[i] */
5233: x[idt] = v[0]*s1 + v[2]*s2;
5234: x[1+idt] = v[1]*s1 + v[3]*s2;
5235: }
5237: VecRestoreArray(bb,(PetscScalar**)&b);
5238: VecRestoreArray(xx,&x);
5239: PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);
5240: return(0);
5241: }
/*
   MatSolve_SeqBAIJ_1_inplace - Forward and backward triangular solves for
   block size 1 (scalar entries), using the factor stored in A together with
   the diagonal positions in a->diag.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0; returns immediately, before touching bb or xx, when n == 0.

   The right-hand side is read through the row index set a->row and the
   result is written through the column index set a->col; a->solve_work is
   used as the intermediate vector t.

   NOTE(review): ierr is declared but the return values of the Vec/IS calls
   are not checked in the statements shown here.
*/
5245: PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5246: {
5247: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
5248: IS iscol=a->col,isrow=a->row;
5249: PetscErrorCode ierr;
5250: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5251: PetscInt i,nz;
5252: const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5253: const MatScalar *aa=a->a,*v;
5254: PetscScalar *x,s1,*t;
5255: const PetscScalar *b;
/* nothing to solve for an empty matrix */
5258: if (!n) return(0);
5260: VecGetArray(bb,(PetscScalar**)&b);
5261: VecGetArray(xx,&x);
5262: t = a->solve_work;
/* r walks the row indices forwards; c starts at the last column index and is walked backwards */
5264: ISGetIndices(isrow,&rout); r = rout;
5265: ISGetIndices(iscol,&cout); c = cout + (n-1);
5267: /* forward solve the lower triangular */
5268: t[0] = b[*r++];
5269: for (i=1; i<n; i++) {
5270: v = aa + ai[i];
5271: vi = aj + ai[i];
/* number of entries of row i stored before the diagonal position */
5272: nz = diag[i] - ai[i];
5273: s1 = b[*r++];
5274: while (nz--) {
5275: s1 -= (*v++)*t[*vi++];
5276: }
5277: t[i] = s1;
5278: }
5279: /* backward solve the upper triangular */
5280: for (i=n-1; i>=0; i--){
5281: v = aa + diag[i] + 1;
5282: vi = aj + diag[i] + 1;
/* number of entries of row i stored after the diagonal position */
5283: nz = ai[i+1] - diag[i] - 1;
5284: s1 = t[i];
5285: while (nz--) {
5286: s1 -= (*v++)*t[*vi++];
5287: }
/* multiply by the entry stored at diag[i]; presumably the inverted diagonal -- confirm against the factorization routine */
5288: x[*c--] = t[i] = aa[diag[i]]*s1;
5289: }
5291: ISRestoreIndices(isrow,&rout);
5292: ISRestoreIndices(iscol,&cout);
5293: VecRestoreArray(bb,(PetscScalar**)&b);
5294: VecRestoreArray(xx,&x);
5295: PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);
5296: return(0);
5297: }
5298: /*
5299: Special case where the matrix was ILU(0) factored in the natural
5300: ordering. This eliminates the need for the column and row permutation.
5301: */
/*
   MatSolve_SeqBAIJ_1_NaturalOrdering_inplace - Block-size-1 forward and
   backward solves without row/column index sets: entry i of bb is read
   directly and entry i of xx is written directly, with xx doubling as the
   intermediate vector.

   Input:  A  - matrix whose data (a->a, a->i, a->j, a->diag) is read
           bb - right-hand side
   Output: xx - result
   Returns 0.

   NOTE(review): ierr is declared but the return values of the Vec calls
   are not checked in the statements shown here.
   NOTE(review): there is no guard for n == 0 (MatSolve_SeqBAIJ_1_inplace
   has one); b[0] is read unconditionally.
*/
5304: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5305: {
5306: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
5307: const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5308: PetscErrorCode ierr;
5309: const MatScalar *aa=a->a,*v;
5310: PetscScalar *x;
5311: const PetscScalar *b;
5312: PetscScalar s1,x1;
5313: PetscInt jdx,idt,idx,nz,i;
5316: VecGetArray(bb,(PetscScalar**)&b);
5317: VecGetArray(xx,&x);
5319: /* forward solve the lower triangular */
5320: idx = 0;
5321: x[0] = b[0];
5322: for (i=1; i<n; i++) {
5323: v = aa + ai[i];
5324: vi = aj + ai[i];
/* number of entries of row i stored before the diagonal position */
5325: nz = diag[i] - ai[i];
/* idx advances by one per row, so idx == i here */
5326: idx += 1;
5327: s1 = b[idx];
5328: while (nz--) {
5329: jdx = *vi++;
5330: x1 = x[jdx];
5331: s1 -= v[0]*x1;
5332: v += 1;
5333: }
5334: x[idx] = s1;
5335: }
5336: /* backward solve the upper triangular */
5337: for (i=n-1; i>=0; i--){
5338: v = aa + diag[i] + 1;
5339: vi = aj + diag[i] + 1;
/* number of entries of row i stored after the diagonal position */
5340: nz = ai[i+1] - diag[i] - 1;
5341: idt = i;
5342: s1 = x[idt];
5343: while (nz--) {
5344: idx = *vi++;
5345: x1 = x[idx];
5346: s1 -= v[0]*x1;
5347: v += 1;
5348: }
/* multiply by the entry stored at diag[i]; presumably the inverted diagonal -- confirm against the factorization routine */
5349: v = aa + diag[i];
5350: x[idt] = v[0]*s1;
5351: }
5352: VecRestoreArray(bb,(PetscScalar**)&b);
5353: VecRestoreArray(xx,&x);
5354: PetscLogFlops(2.0*(a->nz) - A->cmap->n);
5355: return(0);
5356: }
5358: /* ----------------------------------------------------------------*/
/* Forward declaration; called by MatILUFactorSymbolic_SeqBAIJ_ilu0() below with MAT_DO_NOT_COPY_VALUES. */
5359: EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5363: /*
5364: This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5365: */
/*
   MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering - Numeric block factorization
   of A into B without row/column index sets (block row i of A is loaded into
   block row i of the factor).

   Input:  B    - factor matrix; its index arrays b->i, b->j, b->diag are read
                  and its value array b->a is filled
           A    - matrix to factor (a->a, a->i, a->j are read)
           info - only info->shiftamount is used, passed to the 15x15
                  diagonal-block inversion kernel
   Returns 0.

   For each block row i the routine
     1. zeroes the slots of the dense work row rtmp that belong to the
        L and U column indices of row i,
     2. copies row i of A into rtmp,
     3. eliminates with every earlier row listed in the L part,
     4. copies the L part, the diagonal block (inverted in place) and the
        U part from rtmp into b->a.
   The U part of row i lives at block indices bdiag[i+1]+1 .. bdiag[i]-1
   and the diagonal block at bdiag[i].

   On exit B's solve and solvetranspose function pointers are set and
   B is marked assembled.

   NOTE(review): the fixed-size locals ipvt[15] and work[225] match bs == 15
   and bs2 == 225, while the generic kernels are called with bs from
   A->rmap->bs; presumably the caller guarantees bs == 15.
   NOTE(review): sol_ver is read from the "-sol_ver" option but is not used
   afterwards in this routine.
   NOTE(review): ierr is declared but no return value is checked in the
   statements shown here.
*/
5366: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
5367: {
5368: Mat C=B;
5369: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5370: PetscErrorCode ierr;
5371: PetscInt i,j,k,ipvt[15];
5372: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5373: PetscInt nz,nzL,row;
5374: MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225];
5375: const MatScalar *v,*aa=a->a;
5376: PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg;
5377: PetscInt sol_ver;
5381: PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);
5383: /* generate work space needed by the factorization */
/* rtmp holds one dense block row (n blocks of bs2 entries); mwork is one block of scratch */
5384: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
5385: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5387: for (i=0; i<n; i++){
5388: /* zero rtmp */
5389: /* L part */
5390: nz = bi[i+1] - bi[i];
5391: bjtmp = bj + bi[i];
5392: for (j=0; j<nz; j++){
5393: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5394: }
5396: /* U part */
/* this count covers bdiag[i+1]+1 .. bdiag[i], i.e. it includes the diagonal position */
5397: nz = bdiag[i] - bdiag[i+1];
5398: bjtmp = bj + bdiag[i+1]+1;
5399: for (j=0; j<nz; j++){
5400: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5401: }
5402:
5403: /* load in initial (unfactored row) */
5404: nz = ai[i+1] - ai[i];
5405: ajtmp = aj + ai[i];
5406: v = aa + bs2*ai[i];
5407: for (j=0; j<nz; j++) {
5408: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
5409: }
5411: /* elimination */
5412: bjtmp = bj + bi[i];
5413: nzL = bi[i+1] - bi[i];
5414: for(k=0;k < nzL;k++) {
5415: row = bjtmp[k];
5416: pc = rtmp + bs2*row;
/* skip the update when every entry of the block pc is exactly zero */
5417: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5418: if (flg) {
/* pv is the diagonal block of the earlier row, which was inverted in place when that row was finished */
5419: pv = b->a + bs2*bdiag[row];
5420: Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5421: /*Kernel_A_gets_A_times_B_15(pc,pv,mwork);*/
5422: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
5423: pv = b->a + bs2*(bdiag[row+1]+1);
5424: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
5425: for (j=0; j<nz; j++) {
5426: vv = rtmp + bs2*pj[j];
5427: Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5428: /* Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
5429: pv += bs2;
5430: }
5431: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5432: }
5433: }
5435: /* finished row so stick it into b->a */
5436: /* L part */
5437: pv = b->a + bs2*bi[i] ;
5438: pj = b->j + bi[i] ;
5439: nz = bi[i+1] - bi[i];
5440: for (j=0; j<nz; j++) {
5441: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5442: }
5444: /* Mark diagonal and invert diagonal for simpler triangular solves */
5445: pv = b->a + bs2*bdiag[i];
5446: pj = b->j + bdiag[i];
5447: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5448: /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5449: Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);
5450:
5451: /* U part */
5452: pv = b->a + bs2*(bdiag[i+1]+1);
5453: pj = b->j + bdiag[i+1]+1;
5454: nz = bdiag[i] - bdiag[i+1] - 1;
5455: for (j=0; j<nz; j++){
5456: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5457: }
5458: }
5460: PetscFree2(rtmp,mwork);
5461: C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
/* NOTE(review): the routine installed as solvetranspose is not named as a transpose solve -- confirm this is intended */
5462: C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
5463: C->assembled = PETSC_TRUE;
5464: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5465: return(0);
5466: }
/*
   MatLUFactorNumeric_SeqBAIJ_N - Numeric block factorization of A into B for
   a block size taken from A->rmap->bs, using the row index set b->row and
   the index set b->icol stored in the factor.

   Input:  B    - factor matrix; its index arrays b->i, b->j, b->diag and
                  index sets b->row, b->icol are read and its value array
                  b->a is filled
           A    - matrix to factor (a->a, a->i, a->j are read)
           info - not referenced in the statements shown here
   Returns 0.

   For each block row i the routine
     1. zeroes the slots of the dense work row rtmp that belong to the
        L and U column indices of row i,
     2. copies row r[i] of A into rtmp, placing column c at slot ic[c],
     3. eliminates with every earlier row listed in the L part,
     4. copies the L part, the diagonal block (inverted in place) and the
        U part from rtmp into b->a.
   The U part of row i lives at block indices bdiag[i+1]+1 .. bdiag[i]-1
   and the diagonal block at bdiag[i].

   On exit the solve routine is chosen according to whether both index sets
   are identities, the transpose solve is set, and B is marked assembled.

   NOTE(review): ics is assigned from ic but not used afterwards.
   NOTE(review): no return value is checked in the statements shown here.
*/
5470: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5471: {
5472: Mat C=B;
5473: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5474: IS isrow = b->row,isicol = b->icol;
5476: const PetscInt *r,*ic,*ics;
5477: PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5478: PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5479: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5480: PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5481: MatScalar *v_work;
5482: PetscTruth col_identity,row_identity,both_identity;
5485: ISGetIndices(isrow,&r);
5486: ISGetIndices(isicol,&ic);
5487:
/* rtmp holds one dense block row (n blocks of bs2 entries) */
5488: PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);
5489: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
5490: ics = ic;
5492: /* generate work space needed by dense LU factorization */
5493: PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);
5495: for (i=0; i<n; i++){
5496: /* zero rtmp */
5497: /* L part */
5498: nz = bi[i+1] - bi[i];
5499: bjtmp = bj + bi[i];
5500: for (j=0; j<nz; j++){
5501: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5502: }
5504: /* U part */
/* this count covers bdiag[i+1]+1 .. bdiag[i], i.e. it includes the diagonal position */
5505: nz = bdiag[i] - bdiag[i+1];
5506: bjtmp = bj + bdiag[i+1]+1;
5507: for (j=0; j<nz; j++){
5508: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
5509: }
5510:
5511: /* load in initial (unfactored row) */
/* row r[i] of A is read; each column index is mapped through ic before being stored in rtmp */
5512: nz = ai[r[i]+1] - ai[r[i]];
5513: ajtmp = aj + ai[r[i]];
5514: v = aa + bs2*ai[r[i]];
5515: for (j=0; j<nz; j++) {
5516: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
5517: }
5519: /* elimination */
5520: bjtmp = bj + bi[i];
5521: nzL = bi[i+1] - bi[i];
5522: for(k=0;k < nzL;k++) {
5523: row = bjtmp[k];
5524: pc = rtmp + bs2*row;
/* skip the update when every entry of the block pc is exactly zero */
5525: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5526: if (flg) {
/* pv is the diagonal block of the earlier row, which was inverted in place when that row was finished */
5527: pv = b->a + bs2*bdiag[row];
5528: Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5529: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
5530: pv = b->a + bs2*(bdiag[row+1]+1);
5531: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
5532: for (j=0; j<nz; j++) {
5533: Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5534: }
5535: PetscLogFlops(2*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5536: }
5537: }
5539: /* finished row so stick it into b->a */
5540: /* L part */
5541: pv = b->a + bs2*bi[i] ;
5542: pj = b->j + bi[i] ;
5543: nz = bi[i+1] - bi[i];
5544: for (j=0; j<nz; j++) {
5545: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5546: }
5548: /* Mark diagonal and invert diagonal for simpler triangular solves */
5549: pv = b->a + bs2*bdiag[i];
5550: pj = b->j + bdiag[i];
5551: /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5552: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
5553: Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);
5554:
5555: /* U part */
5556: pv = b->a + bs2*(bdiag[i+1]+1);
5557: pj = b->j + bdiag[i+1]+1;
5558: nz = bdiag[i] - bdiag[i+1] - 1;
5559: for (j=0; j<nz; j++){
5560: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
5561: }
5562: }
5564: PetscFree(rtmp);
5565: PetscFree3(v_work,mwork,v_pivots);
5566: ISRestoreIndices(isicol,&ic);
5567: ISRestoreIndices(isrow,&r);
/* choose the solve routine: the natural-ordering variant is used only when both index sets are identities */
5569: ISIdentity(isrow,&row_identity);
5570: ISIdentity(isicol,&col_identity);
5571: both_identity = (PetscTruth) (row_identity && col_identity);
5572: if (both_identity){
5573: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5574: } else {
5575: C->ops->solve = MatSolve_SeqBAIJ_N;
5576: }
5577: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5578:
5579: C->assembled = PETSC_TRUE;
5580: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
5581: return(0);
5582: }
5584: /*
5585: ilu(0) with natural ordering under new data structure.
5586: See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5587: because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5588: */
/*
   MatILUFactorSymbolic_SeqBAIJ_ilu0 - Builds the index arrays of the factor
   `fact` so that it has exactly the nonzero block pattern of A, split into an
   L part followed by a U part.

   Input:  fact  - factor matrix to set up
           A     - matrix whose pattern (a->i, a->j, a->diag) is copied
           isrow, iscol, info - not referenced in the body
   Returns 0.

   Layout produced:
     - b->i[i] .. b->i[i+1]-1 hold the column indices of row i that come
       before the diagonal in A (the L part), rows in increasing order.
     - After the whole L part, rows are written from n-1 down to 0; each row
       contributes its column indices after the diagonal in A followed by the
       diagonal index i itself.  b->diag[i] is the position of that diagonal
       entry, and b->diag[n] = b->i[n]-1.
   b->a is allocated and zeroed; no numerical values are computed here.

   NOTE(review): b->a is allocated with element type PetscScalar but zeroed
   with sizeof(MatScalar); confirm the two types have the same size in every
   supported configuration.
   NOTE(review): ierr is declared but no return value is checked in the
   statements shown here.
*/
5592: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5593: {
5594:
5595: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5596: PetscErrorCode ierr;
5597: PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5598: PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp;
5601: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
5602: b = (Mat_SeqBAIJ*)(fact)->data;
5603:
5604: /* allocate matrix arrays for new data structure */
5605: PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);
5606: PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
5607: b->singlemalloc = PETSC_TRUE;
/* allocate the diagonal-position array only if the factor does not already have one */
5608: if (!b->diag){
5609: PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);
5610: PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));
5611: }
5612: bdiag = b->diag;
5613:
5614: if (n > 0) {
5615: PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));
5616: }
5617:
5618: /* set bi and bj with new data structure */
5619: bi = b->i;
5620: bj = b->j;
5622: /* L part */
5623: bi[0] = 0;
5624: for (i=0; i<n; i++){
/* entries of row i of A located before its diagonal */
5625: nz = adiag[i] - ai[i];
5626: bi[i+1] = bi[i] + nz;
5627: aj = a->j + ai[i];
5628: for (j=0; j<nz; j++){
5629: *bj = aj[j]; bj++;
5630: }
5631: }
5632:
5633: /* U part */
/* bj keeps advancing past the L part; bi_temp tracks the running position in b->j */
5634: bi_temp = bi[n];
5635: bdiag[n] = bi[n]-1;
5636: for (i=n-1; i>=0; i--){
/* entries of row i of A located after its diagonal */
5637: nz = ai[i+1] - adiag[i] - 1;
5638: bi_temp = bi_temp + nz + 1;
5639: aj = a->j + adiag[i] + 1;
5640: for (j=0; j<nz; j++){
5641: *bj = aj[j]; bj++;
5642: }
5643: /* diag[i] */
5644: *bj = i; bj++;
5645: bdiag[i] = bi_temp - 1;
5646: }
5647: return(0);
5648: }
5652: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5653: {
5654: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5655: IS isicol;
5656: PetscErrorCode ierr;
5657: const PetscInt *r,*ic;
5658: PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d;
5659: PetscInt *bi,*cols,nnz,*cols_lvl;
5660: PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5661: PetscInt i,levels,diagonal_fill;
5662: PetscTruth col_identity,row_identity,both_identity;
5663: PetscReal f;
5664: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5665: PetscBT lnkbt;
5666: PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr;
5667: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5668: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5669: PetscTruth missing;
5670: PetscInt bs=A->rmap->bs,bs2=a->bs2;
5673: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5674: MatMissingDiagonal(A,&missing,&d);
5675: if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5677: f = info->fill;
5678: levels = (PetscInt)info->levels;
5679: diagonal_fill = (PetscInt)info->diagonal_fill;
5680: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
5682: ISIdentity(isrow,&row_identity);
5683: ISIdentity(iscol,&col_identity);
5684: both_identity = (PetscTruth) (row_identity && col_identity);
5685:
5686: if (!levels && both_identity) {
5687: /* special case: ilu(0) with natural ordering */
5688: MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
5689: MatSeqBAIJSetNumericFactorization(fact,both_identity);
5691: fact->factor = MAT_FACTOR_ILU;
5692: (fact)->info.factor_mallocs = 0;
5693: (fact)->info.fill_ratio_given = info->fill;
5694: (fact)->info.fill_ratio_needed = 1.0;
5695: b = (Mat_SeqBAIJ*)(fact)->data;
5696: b->row = isrow;
5697: b->col = iscol;
5698: b->icol = isicol;
5699: PetscObjectReference((PetscObject)isrow);
5700: PetscObjectReference((PetscObject)iscol);
5701: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5702: PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5703: return(0);
5704: }
5705:
5706: ISGetIndices(isrow,&r);
5707: ISGetIndices(isicol,&ic);
5708:
5709: /* get new row pointers */
5710: PetscMalloc((n+1)*sizeof(PetscInt),&bi);
5711: bi[0] = 0;
5712: /* bdiag is location of diagonal in factor */
5713: PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);
5714: bdiag[0] = 0;
5716: PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);
5718: /* create a linked list for storing column indices of the active row */
5719: nlnk = n + 1;
5720: PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
5721:
5722: /* initial FreeSpace size is f*(ai[n]+1) */
5723: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);
5724: current_space = free_space;
5725: PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);
5726: current_space_lvl = free_space_lvl;
5727:
5728: for (i=0; i<n; i++) {
5729: nzi = 0;
5730: /* copy current row into linked list */
5731: nnz = ai[r[i]+1] - ai[r[i]];
5732: if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5733: cols = aj + ai[r[i]];
5734: lnk[i] = -1; /* marker to indicate if diagonal exists */
5735: PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);
5736: nzi += nlnk;
5738: /* make sure diagonal entry is included */
5739: if (diagonal_fill && lnk[i] == -1) {
5740: fm = n;
5741: while (lnk[fm] < i) fm = lnk[fm];
5742: lnk[i] = lnk[fm]; /* insert diagonal into linked list */
5743: lnk[fm] = i;
5744: lnk_lvl[i] = 0;
5745: nzi++; dcount++;
5746: }
5748: /* add pivot rows into the active row */
5749: nzbd = 0;
5750: prow = lnk[n];
5751: while (prow < i) {
5752: nnz = bdiag[prow];
5753: cols = bj_ptr[prow] + nnz + 1;
5754: cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5755: nnz = bi[prow+1] - bi[prow] - nnz - 1;
5756: PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);
5757: nzi += nlnk;
5758: prow = lnk[prow];
5759: nzbd++;
5760: }
5761: bdiag[i] = nzbd;
5762: bi[i+1] = bi[i] + nzi;
5764: /* if free space is not available, make more free space */
5765: if (current_space->local_remaining<nzi) {
5766: nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5767: PetscFreeSpaceGet(nnz,&current_space);
5768: PetscFreeSpaceGet(nnz,&current_space_lvl);
5769: reallocs++;
5770: }
5772: /* copy data into free_space and free_space_lvl, then initialize lnk */
5773: PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
5774: bj_ptr[i] = current_space->array;
5775: bjlvl_ptr[i] = current_space_lvl->array;
5777: /* make sure the active row i has diagonal entry */
5778: if (*(bj_ptr[i]+bdiag[i]) != i) {
5779: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5780: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5781: }
5783: current_space->array += nzi;
5784: current_space->local_used += nzi;
5785: current_space->local_remaining -= nzi;
5786: current_space_lvl->array += nzi;
5787: current_space_lvl->local_used += nzi;
5788: current_space_lvl->local_remaining -= nzi;
5789: }
5790:
5791: ISRestoreIndices(isrow,&r);
5792: ISRestoreIndices(isicol,&ic);
5794: /* destroy list of free space and other temporary arrays */
5795: PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);
5796:
5797: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5798: PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
5799:
5800: PetscIncompleteLLDestroy(lnk,lnkbt);
5801: PetscFreeSpaceDestroy(free_space_lvl);
5802: PetscFree2(bj_ptr,bjlvl_ptr);
5804: #if defined(PETSC_USE_INFO)
5805: {
5806: PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5807: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
5808: PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);
5809: PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);
5810: PetscInfo(A,"for best performance.\n");
5811: if (diagonal_fill) {
5812: PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);
5813: }
5814: }
5815: #endif
5817: /* put together the new matrix */
5818: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
5819: PetscLogObjectParent(fact,isicol);
5820: b = (Mat_SeqBAIJ*)(fact)->data;
5821: b->free_a = PETSC_TRUE;
5822: b->free_ij = PETSC_TRUE;
5823: b->singlemalloc = PETSC_FALSE;
5824: PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);
5825: b->j = bj;
5826: b->i = bi;
5827: b->diag = bdiag;
5828: b->free_diag = PETSC_TRUE;
5829: b->ilen = 0;
5830: b->imax = 0;
5831: b->row = isrow;
5832: b->col = iscol;
5833: PetscObjectReference((PetscObject)isrow);
5834: PetscObjectReference((PetscObject)iscol);
5835: b->icol = isicol;
5836: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
5837: /* In b structure: Free imax, ilen, old a, old j.
5838: Allocate bdiag, solve_work, new a, new j */
5839: PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
5840: b->maxnz = b->nz = bdiag[0]+1;
5841: fact->info.factor_mallocs = reallocs;
5842: fact->info.fill_ratio_given = f;
5843: fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5844: MatSeqBAIJSetNumericFactorization(fact,both_identity);
5845: return(0);
5846: }
5849: /*
5850: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5851: except that the data structure of Mat_SeqAIJ is slightly different.
5852: Not a good example of code reuse.
5853: */
/*
   MatILUFactorSymbolic_SeqBAIJ_inplace - builds the nonzero block pattern of a
   level-limited ILU factor of A and installs it in fact.  No numerical values
   are computed here; storage for them is only allocated.

   Input:
     fact  - matrix that receives the factor's structure (its data is accessed
             as Mat_SeqBAIJ)
     A     - matrix to factor; a->i and a->j supply its block-row pattern
     isrow - row permutation, read through ISGetIndices() as r[]
     iscol - column permutation; its inverse (isicol) is read as ic[]
     info  - supplies fill, levels, diagonal_fill and pivotinblocks

   Stored in fact->data by the general case:
     i    = ainew - row pointers of the factor pattern
     j    = ajnew - column indices, ascending within each row
     diag = dloc  - position of each row's diagonal entry inside j
     a            - room for bs2*ainew[n] MatScalar values (allocated, not filled)

   Returns 0 on normal completion.  The SETERRQ calls report a diagonal entry
   missing from A, an empty row, or a factor row without a diagonal entry.

   NOTE(review): no call in this listing has its return value checked - confirm
   whether the error checking was dropped when the listing was generated.
*/
5856: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5857: {
5858: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
5859: IS isicol;
5861: const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5862: PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5863: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5864: PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5865: PetscTruth col_identity,row_identity,both_identity,flg;
5866: PetscReal f;
/* reject A up front if it lacks a diagonal entry; dd is the row named in the message */
5869: MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
5870: if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5871:
/* unpack the factorization options; levels and diagonal_fill are truncated to PetscInt */
5872: f = info->fill;
5873: levels = (PetscInt)info->levels;
5874: diagonal_fill = (PetscInt)info->diagonal_fill;
/* isicol is the inverse of the column permutation; it ends up stored in b->icol */
5875: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
5877: ISIdentity(isrow,&row_identity);
5878: ISIdentity(iscol,&col_identity);
5879: both_identity = (PetscTruth) (row_identity && col_identity);
5881: if (!levels && both_identity) { /* special case copy the nonzero structure */
/* zero levels with identity orderings: take A's structure (values not copied) and return early */
5882: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
5883: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
5885: fact->factor = MAT_FACTOR_ILU;
5886: b = (Mat_SeqBAIJ*)fact->data;
5887: b->row = isrow;
5888: b->col = iscol;
/* fact keeps isrow and iscol, so take a reference on each */
5889: PetscObjectReference((PetscObject)isrow);
5890: PetscObjectReference((PetscObject)iscol);
5891: b->icol = isicol;
5892: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5893: PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);
5894: return(0);
5895: }
5897: /* general case perform the symbolic factorization */
5898: ISGetIndices(isrow,&r);
5899: ISGetIndices(isicol,&ic);
5901: /* get new row pointers */
5902: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
5903: ainew[0] = 0;
5904: /* don't know how many column pointers are needed so estimate */
5905: jmax = (PetscInt)(f*ai[n] + 1);
5906: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
5907: /* ajfill is level of fill for each fill entry */
5908: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
5909: /* fill is a linked list of nonzeros in active row */
/* fill[n] is the list head and the value n terminates the list; fill[c] is the column following c */
5910: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
5911: /* im is level for each filled value */
5912: PetscMalloc((n+1)*sizeof(PetscInt),&im);
5913: /* dloc is location of diagonal in factor */
/* inside the loop dloc[] is relative to the start of its row; it is made absolute further down */
5914: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
5915: dloc[0] = 0;
/* build the factor pattern one row at a time, in permuted order */
5916: for (prow=0; prow<n; prow++) {
5918: /* copy prow into linked list */
/* nzf counts the entries of the active row; it starts at the length of row r[prow] of A */
5919: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
5920: if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5921: xi = aj + ai[r[prow]];
/* empty list: the head points straight at the terminator */
5922: fill[n] = n;
5923: fill[prow] = -1; /* marker for diagonal entry */
5924: while (nz--) {
/* sorted insert of the permuted column ic[*xi], always searching from the head */
5925: fm = n;
5926: idx = ic[*xi++];
5927: do {
5928: m = fm;
5929: fm = fill[m];
5930: } while (fm < idx);
5931: fill[m] = idx;
5932: fill[idx] = fm;
/* entries that come from A itself carry level 0 */
5933: im[idx] = 0;
5934: }
5936: /* make sure diagonal entry is included */
/* fill[prow] still holds the -1 marker only if no entry of the row mapped to column prow */
5937: if (diagonal_fill && fill[prow] == -1) {
5938: fm = n;
5939: while (fill[fm] < prow) fm = fill[fm];
5940: fill[prow] = fill[fm]; /* insert diagonal into linked list */
5941: fill[fm] = prow;
5942: im[prow] = 0;
5943: nzf++;
5944: dcount++;
5945: }
/* merge earlier factor rows into the active row; nzi counts the list entries left of column prow */
5947: nzi = 0;
5948: row = fill[n];
5949: while (row < prow) {
/* fill created through pivot row `row` is one level deeper than the entry (prow,row) itself */
5950: incrlev = im[row] + 1;
5951: nz = dloc[row];
/* xi/flev walk the columns/levels of factor row `row` that lie after its diagonal */
5952: xi = ajnew + ainew[row] + nz + 1;
5953: flev = ajfill + ainew[row] + nz + 1;
5954: nnz = ainew[row+1] - ainew[row] - nz - 1;
/* fm is kept between iterations, so each search resumes where the previous one stopped */
5955: fm = row;
5956: while (nnz-- > 0) {
5957: idx = *xi++;
/* skip candidates whose resulting level would exceed the requested number of levels */
5958: if (*flev + incrlev > levels) {
5959: flev++;
5960: continue;
5961: }
5962: do {
5963: m = fm;
5964: fm = fill[m];
5965: } while (fm < idx);
5966: if (fm != idx) {
/* column idx not in the list yet: link it in between m and fm and record its level */
5967: im[idx] = *flev + incrlev;
5968: fill[m] = idx;
5969: fill[idx] = fm;
5970: fm = idx;
5971: nzf++;
5972: } else {
/* column already present: keep the smaller of the two levels */
5973: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5974: }
5975: flev++;
5976: }
/* advance to the next list entry, which may be one that was just inserted */
5977: row = fill[row];
5978: nzi++;
5979: }
5980: /* copy new filled row into permanent storage */
5981: ainew[prow+1] = ainew[prow] + nzf;
5982: if (ainew[prow+1] > jmax) {
5984: /* estimate how much additional space we will need */
5985: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5986: /* just double the memory each time */
5987: PetscInt maxadd = jmax;
5988: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
/* if doubling would still not hold this row, grow by (remaining rows)*(nzf+1) instead */
5989: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5990: jmax += maxadd;
5992: /* allocate a longer ajnew and ajfill */
/* only the ainew[prow] entries of the rows already stored are carried over */
5993: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
5994: PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));
5995: PetscFree(ajnew);
5996: ajnew = xitmp;
5997: PetscMalloc(jmax*sizeof(PetscInt),&xitmp);
5998: PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));
5999: PetscFree(ajfill);
6000: ajfill = xitmp;
6001: reallocate++; /* count how many reallocations are needed */
6002: }
/* write the list out in order: columns go to ajnew, their levels to ajfill */
6003: xitmp = ajnew + ainew[prow];
6004: flev = ajfill + ainew[prow];
6005: dloc[prow] = nzi;
6006: fm = fill[n];
6007: while (nzf--) {
6008: *xitmp++ = fm;
6009: *flev++ = im[fm];
6010: fm = fill[fm];
6011: }
6012: /* make sure row has diagonal entry */
/* NOTE(review): if the row has no entry at or beyond column prow, dloc[prow] equals the
   row length and this reads the slot just past the row - confirm that is in bounds */
6013: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6014: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
6015: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6016: }
6017: }
/* the levels and the work lists are only needed while the pattern is being built */
6018: PetscFree(ajfill);
6019: ISRestoreIndices(isrow,&r);
6020: ISRestoreIndices(isicol,&ic);
6021: PetscFree(fill);
6022: PetscFree(im);
6024: #if defined(PETSC_USE_INFO)
6025: {
/* af = entries in the factor pattern divided by entries in A's pattern */
6026: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6027: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
6028: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
6029: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
6030: PetscInfo(A,"for best performance.\n");
6031: if (diagonal_fill) {
6032: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
6033: }
6034: }
6035: #endif
6037: /* put together the new matrix */
6038: MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
6039: PetscLogObjectParent(fact,isicol);
6040: b = (Mat_SeqBAIJ*)fact->data;
6041: b->free_a = PETSC_TRUE;
6042: b->free_ij = PETSC_TRUE;
6043: b->singlemalloc = PETSC_FALSE;
/* value storage: bs2 scalars for each block of the pattern; contents are not set here */
6044: PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
6045: b->j = ajnew;
6046: b->i = ainew;
/* turn the row-relative diagonal offsets into absolute positions in b->j */
6047: for (i=0; i<n; i++) dloc[i] += ainew[i];
6048: b->diag = dloc;
6049: b->free_diag = PETSC_TRUE;
6050: b->ilen = 0;
6051: b->imax = 0;
6052: b->row = isrow;
6053: b->col = iscol;
6054: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
/* fact keeps isrow and iscol, so take a reference on each */
6055: PetscObjectReference((PetscObject)isrow);
6056: PetscObjectReference((PetscObject)iscol);
6057: b->icol = isicol;
6058: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
6059: /* In b structure: Free imax, ilen, old a, old j.
6060: Allocate dloc, solve_work, new a, new j */
6061: PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
6062: b->maxnz = b->nz = ainew[n];
6064: fact->info.factor_mallocs = reallocate;
6065: fact->info.fill_ratio_given = f;
/* NOTE(review): prow equals n once the row loop has run to completion, so ai[prow] here
   is ai[n], the same denominator as in the info block above - writing ai[n] would be clearer */
6066: fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
6068: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
6069: return(0);
6070: }
/*
   MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE - clears A's setunfactored
   operation and returns 0.

   Nothing else is executed: the code that would undo the column scaling of the
   index array is commented out.

   NOTE(review): the disabled loop indexes AJ with i, but the disabled snippet
   never assigns i; it would need a proper index before it could be re-enabled.
*/
6074: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
6075: {
6076: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
6077: /* int i,*AJ=a->j,nz=a->nz; */
6079: /* Undo Column scaling */
6080: /* while (nz--) { */
6081: /* AJ[i] = AJ[i]/4; */
6082: /* } */
6083: /* This should really invoke a push/pop logic, but we don't have that yet. */
/* the only effect of this routine: drop the setunfactored operation from A */
6084: A->ops->setunfactored = PETSC_NULL;
6085: return(0);
6086: }
/*
   MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj - widens the column
   indices of A in place and then clears A's setunfactored operation.

   The storage behind a->j is read as an array of unsigned short; for each of
   the a->nz positions the short value is written back as a full PetscInt at
   the same position.

   Returns 0.

   NOTE(review): the same memory is accessed through both unsigned short* and
   PetscInt*; confirm the build does not depend on strict-aliasing optimizations.
*/
6090: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
6091: {
6092: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
6093: PetscInt *AJ=a->j,nz=a->nz;
/* aj views the same storage as AJ, one unsigned short per index */
6094: unsigned short *aj=(unsigned short *)AJ;
6096: /* Is this really necessary? */
/* Walk from the last index down to the first.  Presumably PetscInt is wider than
   unsigned short, so the wide store at position nz only covers short slots at
   position nz or above, all of which have already been read - TODO confirm. */
6097: while (nz--) {
6098: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
6099: }
6100: A->ops->setunfactored = PETSC_NULL;
6101: return(0);
6102: }