Actual source code: sbaijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for SBAIJ format.
5: */
7: #include ../src/mat/impls/sbaij/seq/sbaij.h
8: #include ../src/mat/impls/baij/seq/baij.h
9: #include ../src/mat/blockinvert.h
13: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
16: IS isrow=a->row;
17: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
18: PetscErrorCode ierr;
19: const PetscInt *r;
20: PetscInt nz,*vj,k,idx,k1;
21: PetscInt bs=A->rmap->bs,bs2 = a->bs2;
22: MatScalar *aa=a->a,*v,*diag;
23: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
26: VecGetArray(bb,&b);
27: VecGetArray(xx,&x);
28: t = a->solve_work;
29: ISGetIndices(isrow,&r);
30: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
32: /* solve U^T * D * y = b by forward substitution */
33: xk = t;
34: for (k=0; k<mbs; k++) { /* t <- perm(b) */
35: idx = bs*r[k];
36: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37: }
38: for (k=0; k<mbs; k++){
39: v = aa + bs2*ai[k];
40: xk = t + k*bs; /* Dk*xk = k-th block of x */
41: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
42: nz = ai[k+1] - ai[k];
43: vj = aj + ai[k];
44: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
45: while (nz--) {
46: /* x(:) += U(k,:)^T*(Dk*xk) */
47: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48: vj++; xj = t + (*vj)*bs;
49: v += bs2;
50: }
51: /* xk = inv(Dk)*(Dk*xk) */
52: diag = aa+k*bs2; /* ptr to inv(Dk) */
53: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54: }
56: /* solve U*x = y by back substitution */
57: for (k=mbs-1; k>=0; k--){
58: v = aa + bs2*ai[k];
59: xk = t + k*bs; /* xk */
60: nz = ai[k+1] - ai[k];
61: vj = aj + ai[k];
62: xj = t + (*vj)*bs;
63: while (nz--) {
64: /* xk += U(k,:)*x(:) */
65: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66: vj++;
67: v += bs2; xj = t + (*vj)*bs;
68: }
69: idx = bs*r[k];
70: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71: }
73: PetscFree(xk_tmp);
74: ISRestoreIndices(isrow,&r);
75: VecRestoreArray(bb,&b);
76: VecRestoreArray(xx,&x);
77: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
78: return(0);
79: }
83: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
84: {
86: SETERRQ(1,"not implemented yet");
87: return(0);
88: }
92: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
93: {
95: SETERRQ(1,"not implemented yet");
96: return(0);
97: }
101: PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102: {
104: PetscInt nz,*vj,k;
105: PetscInt bs2 = bs*bs;
106: MatScalar *v,*diag;
107: PetscScalar *xk,*xj,*xk_tmp;
108:
110: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
111: for (k=0; k<mbs; k++){
112: v = aa + bs2*ai[k];
113: xk = x + k*bs; /* Dk*xk = k-th block of x */
114: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
115: nz = ai[k+1] - ai[k];
116: vj = aj + ai[k];
117: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
118: while (nz--) {
119: /* x(:) += U(k,:)^T*(Dk*xk) */
120: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121: vj++; xj = x + (*vj)*bs;
122: v += bs2;
123: }
124: /* xk = inv(Dk)*(Dk*xk) */
125: diag = aa+k*bs2; /* ptr to inv(Dk) */
126: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127: }
128: PetscFree(xk_tmp);
129: return(0);
130: }
134: PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135: {
136: PetscInt nz,*vj,k;
137: PetscInt bs2 = bs*bs;
138: MatScalar *v;
139: PetscScalar *xk,*xj;
142: for (k=mbs-1; k>=0; k--){
143: v = aa + bs2*ai[k];
144: xk = x + k*bs; /* xk */
145: nz = ai[k+1] - ai[k];
146: vj = aj + ai[k];
147: xj = x + (*vj)*bs;
148: while (nz--) {
149: /* xk += U(k,:)*x(:) */
150: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151: vj++;
152: v += bs2; xj = x + (*vj)*bs;
153: }
154: }
155: return(0);
156: }
160: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161: {
162: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
164: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
165: PetscInt bs=A->rmap->bs;
166: MatScalar *aa=a->a;
167: PetscScalar *x,*b;
170: VecGetArray(bb,&b);
171: VecGetArray(xx,&x);
173: /* solve U^T * D * y = b by forward substitution */
174: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
175: ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
177: /* solve U*x = y by back substitution */
178: BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
180: VecRestoreArray(bb,&b);
181: VecRestoreArray(xx,&x);
182: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
183: return(0);
184: }
188: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
189: {
190: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
192: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
193: PetscInt bs=A->rmap->bs;
194: MatScalar *aa=a->a;
195: PetscScalar *x,*b;
198: VecGetArray(bb,&b);
199: VecGetArray(xx,&x);
200: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
201: ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
202: VecRestoreArray(bb,&b);
203: VecRestoreArray(xx,&x);
204: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
205: return(0);
206: }
210: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
211: {
212: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
214: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
215: PetscInt bs=A->rmap->bs;
216: MatScalar *aa=a->a;
217: PetscScalar *x,*b;
220: VecGetArray(bb,&b);
221: VecGetArray(xx,&x);
222: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
223: BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
224: VecRestoreArray(bb,&b);
225: VecRestoreArray(xx,&x);
226: PetscLogFlops(2.0*bs2*(a->nz-mbs));
227: return(0);
228: }
232: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
233: {
234: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
235: IS isrow=a->row;
236: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
238: const PetscInt *r;
239: PetscInt nz,*vj,k,idx;
240: MatScalar *aa=a->a,*v,*d;
241: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
244: VecGetArray(bb,&b);
245: VecGetArray(xx,&x);
246: t = a->solve_work;
247: ISGetIndices(isrow,&r);
249: /* solve U^T * D * y = b by forward substitution */
250: tp = t;
251: for (k=0; k<mbs; k++) { /* t <- perm(b) */
252: idx = 7*r[k];
253: tp[0] = b[idx];
254: tp[1] = b[idx+1];
255: tp[2] = b[idx+2];
256: tp[3] = b[idx+3];
257: tp[4] = b[idx+4];
258: tp[5] = b[idx+5];
259: tp[6] = b[idx+6];
260: tp += 7;
261: }
262:
263: for (k=0; k<mbs; k++){
264: v = aa + 49*ai[k];
265: vj = aj + ai[k];
266: tp = t + k*7;
267: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
268: nz = ai[k+1] - ai[k];
269: tp = t + (*vj)*7;
270: while (nz--) {
271: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
272: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
273: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
274: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
275: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
276: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
277: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
278: vj++; tp = t + (*vj)*7;
279: v += 49;
280: }
282: /* xk = inv(Dk)*(Dk*xk) */
283: d = aa+k*49; /* ptr to inv(Dk) */
284: tp = t + k*7;
285: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292: }
294: /* solve U*x = y by back substitution */
295: for (k=mbs-1; k>=0; k--){
296: v = aa + 49*ai[k];
297: vj = aj + ai[k];
298: tp = t + k*7;
299: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
300: nz = ai[k+1] - ai[k];
301:
302: tp = t + (*vj)*7;
303: while (nz--) {
304: /* xk += U(k,:)*x(:) */
305: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312: vj++; tp = t + (*vj)*7;
313: v += 49;
314: }
315: tp = t + k*7;
316: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
317: idx = 7*r[k];
318: x[idx] = x0;
319: x[idx+1] = x1;
320: x[idx+2] = x2;
321: x[idx+3] = x3;
322: x[idx+4] = x4;
323: x[idx+5] = x5;
324: x[idx+6] = x6;
325: }
327: ISRestoreIndices(isrow,&r);
328: VecRestoreArray(bb,&b);
329: VecRestoreArray(xx,&x);
330: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
331: return(0);
332: }
336: PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
337: {
338: MatScalar *v,*d;
339: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
340: PetscInt nz,*vj,k;
343: for (k=0; k<mbs; k++){
344: v = aa + 49*ai[k];
345: xp = x + k*7;
346: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
347: nz = ai[k+1] - ai[k];
348: vj = aj + ai[k];
349: xp = x + (*vj)*7;
350: while (nz--) {
351: /* x(:) += U(k,:)^T*(Dk*xk) */
352: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
353: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
354: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
355: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
356: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
357: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
358: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
359: vj++; xp = x + (*vj)*7;
360: v += 49;
361: }
362: /* xk = inv(Dk)*(Dk*xk) */
363: d = aa+k*49; /* ptr to inv(Dk) */
364: xp = x + k*7;
365: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
366: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
367: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
368: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
369: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
370: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
371: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
372: }
373: return(0);
374: }
378: PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
379: {
380: MatScalar *v;
381: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
382: PetscInt nz,*vj,k;
385: for (k=mbs-1; k>=0; k--){
386: v = aa + 49*ai[k];
387: xp = x + k*7;
388: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
389: nz = ai[k+1] - ai[k];
390: vj = aj + ai[k];
391: xp = x + (*vj)*7;
392: while (nz--) {
393: /* xk += U(k,:)*x(:) */
394: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
395: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
396: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
397: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
398: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
399: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
400: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
401: vj++;
402: v += 49; xp = x + (*vj)*7;
403: }
404: xp = x + k*7;
405: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
406: }
407: return(0);
408: }
412: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
413: {
414: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
416: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
417: MatScalar *aa=a->a;
418: PetscScalar *x,*b;
421: VecGetArray(bb,&b);
422: VecGetArray(xx,&x);
423:
424: /* solve U^T * D * y = b by forward substitution */
425: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
426: ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
428: /* solve U*x = y by back substitution */
429: BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
431: VecRestoreArray(bb,&b);
432: VecRestoreArray(xx,&x);
433: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
434: return(0);
435: }
439: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
440: {
441: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
443: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
444: MatScalar *aa=a->a;
445: PetscScalar *x,*b;
448: VecGetArray(bb,&b);
449: VecGetArray(xx,&x);
450: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
451: ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
452: VecRestoreArray(bb,&b);
453: VecRestoreArray(xx,&x);
454: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
455: return(0);
456: }
460: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
461: {
462: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
464: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
465: MatScalar *aa=a->a;
466: PetscScalar *x,*b;
469: VecGetArray(bb,&b);
470: VecGetArray(xx,&x);
471: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
472: BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
473: VecRestoreArray(bb,&b);
474: VecRestoreArray(xx,&x);
475: PetscLogFlops(2.0*bs2*(a->nz-mbs));
476: return(0);
477: }
481: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
482: {
483: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
484: IS isrow=a->row;
485: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
487: const PetscInt *r;
488: PetscInt nz,*vj,k,idx;
489: MatScalar *aa=a->a,*v,*d;
490: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
493: VecGetArray(bb,&b);
494: VecGetArray(xx,&x);
495: t = a->solve_work;
496: ISGetIndices(isrow,&r);
498: /* solve U^T * D * y = b by forward substitution */
499: tp = t;
500: for (k=0; k<mbs; k++) { /* t <- perm(b) */
501: idx = 6*r[k];
502: tp[0] = b[idx];
503: tp[1] = b[idx+1];
504: tp[2] = b[idx+2];
505: tp[3] = b[idx+3];
506: tp[4] = b[idx+4];
507: tp[5] = b[idx+5];
508: tp += 6;
509: }
510:
511: for (k=0; k<mbs; k++){
512: v = aa + 36*ai[k];
513: vj = aj + ai[k];
514: tp = t + k*6;
515: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
516: nz = ai[k+1] - ai[k];
517: tp = t + (*vj)*6;
518: while (nz--) {
519: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
520: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
521: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
522: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
523: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
524: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
525: vj++; tp = t + (*vj)*6;
526: v += 36;
527: }
529: /* xk = inv(Dk)*(Dk*xk) */
530: d = aa+k*36; /* ptr to inv(Dk) */
531: tp = t + k*6;
532: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
533: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
534: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
535: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
536: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
537: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
538: }
540: /* solve U*x = y by back substitution */
541: for (k=mbs-1; k>=0; k--){
542: v = aa + 36*ai[k];
543: vj = aj + ai[k];
544: tp = t + k*6;
545: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
546: nz = ai[k+1] - ai[k];
547:
548: tp = t + (*vj)*6;
549: while (nz--) {
550: /* xk += U(k,:)*x(:) */
551: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
552: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
553: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
554: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
555: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
556: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
557: vj++; tp = t + (*vj)*6;
558: v += 36;
559: }
560: tp = t + k*6;
561: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
562: idx = 6*r[k];
563: x[idx] = x0;
564: x[idx+1] = x1;
565: x[idx+2] = x2;
566: x[idx+3] = x3;
567: x[idx+4] = x4;
568: x[idx+5] = x5;
569: }
571: ISRestoreIndices(isrow,&r);
572: VecRestoreArray(bb,&b);
573: VecRestoreArray(xx,&x);
574: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
575: return(0);
576: }
580: PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
581: {
582: MatScalar *v,*d;
583: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
584: PetscInt nz,*vj,k;
587: for (k=0; k<mbs; k++){
588: v = aa + 36*ai[k];
589: xp = x + k*6;
590: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
591: nz = ai[k+1] - ai[k];
592: vj = aj + ai[k];
593: xp = x + (*vj)*6;
594: while (nz--) {
595: /* x(:) += U(k,:)^T*(Dk*xk) */
596: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
597: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
598: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
599: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
600: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
601: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
602: vj++; xp = x + (*vj)*6;
603: v += 36;
604: }
605: /* xk = inv(Dk)*(Dk*xk) */
606: d = aa+k*36; /* ptr to inv(Dk) */
607: xp = x + k*6;
608: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
609: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
610: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
611: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
612: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
613: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
614: }
615: return(0);
616: }
619: PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
620: {
621: MatScalar *v;
622: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
623: PetscInt nz,*vj,k;
626: for (k=mbs-1; k>=0; k--){
627: v = aa + 36*ai[k];
628: xp = x + k*6;
629: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
630: nz = ai[k+1] - ai[k];
631: vj = aj + ai[k];
632: xp = x + (*vj)*6;
633: while (nz--) {
634: /* xk += U(k,:)*x(:) */
635: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
636: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
637: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
638: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
639: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
640: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
641: vj++;
642: v += 36; xp = x + (*vj)*6;
643: }
644: xp = x + k*6;
645: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
646: }
647: return(0);
648: }
653: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
654: {
655: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
656: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
657: MatScalar *aa=a->a;
658: PetscScalar *x,*b;
662: VecGetArray(bb,&b);
663: VecGetArray(xx,&x);
664:
665: /* solve U^T * D * y = b by forward substitution */
666: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
667: ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
669: /* solve U*x = y by back substitution */
670: BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
672: VecRestoreArray(bb,&b);
673: VecRestoreArray(xx,&x);
674: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
675: return(0);
676: }
680: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
681: {
682: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
683: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
684: MatScalar *aa=a->a;
685: PetscScalar *x,*b;
689: VecGetArray(bb,&b);
690: VecGetArray(xx,&x);
691: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
692: ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
693: VecRestoreArray(bb,&b);
694: VecRestoreArray(xx,&x);
695: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
696: return(0);
697: }
701: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
702: {
703: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
704: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
705: MatScalar *aa=a->a;
706: PetscScalar *x,*b;
710: VecGetArray(bb,&b);
711: VecGetArray(xx,&x);
712: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
713: BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
714: VecRestoreArray(bb,&b);
715: VecRestoreArray(xx,&x);
716: PetscLogFlops(2.0*bs2*(a->nz - mbs));
717: return(0);
718: }
722: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
723: {
724: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
725: IS isrow=a->row;
726: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
728: const PetscInt *r;
729: PetscInt nz,*vj,k,idx;
730: MatScalar *aa=a->a,*v,*diag;
731: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
734: VecGetArray(bb,&b);
735: VecGetArray(xx,&x);
736: t = a->solve_work;
737: ISGetIndices(isrow,&r);
739: /* solve U^T * D * y = b by forward substitution */
740: tp = t;
741: for (k=0; k<mbs; k++) { /* t <- perm(b) */
742: idx = 5*r[k];
743: tp[0] = b[idx];
744: tp[1] = b[idx+1];
745: tp[2] = b[idx+2];
746: tp[3] = b[idx+3];
747: tp[4] = b[idx+4];
748: tp += 5;
749: }
750:
751: for (k=0; k<mbs; k++){
752: v = aa + 25*ai[k];
753: vj = aj + ai[k];
754: tp = t + k*5;
755: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
756: nz = ai[k+1] - ai[k];
758: tp = t + (*vj)*5;
759: while (nz--) {
760: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
761: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
762: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
763: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
764: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
765: vj++; tp = t + (*vj)*5;
766: v += 25;
767: }
769: /* xk = inv(Dk)*(Dk*xk) */
770: diag = aa+k*25; /* ptr to inv(Dk) */
771: tp = t + k*5;
772: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
773: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
774: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
775: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
776: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
777: }
779: /* solve U*x = y by back substitution */
780: for (k=mbs-1; k>=0; k--){
781: v = aa + 25*ai[k];
782: vj = aj + ai[k];
783: tp = t + k*5;
784: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
785: nz = ai[k+1] - ai[k];
786:
787: tp = t + (*vj)*5;
788: while (nz--) {
789: /* xk += U(k,:)*x(:) */
790: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
791: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
792: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
793: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
794: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
795: vj++; tp = t + (*vj)*5;
796: v += 25;
797: }
798: tp = t + k*5;
799: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
800: idx = 5*r[k];
801: x[idx] = x0;
802: x[idx+1] = x1;
803: x[idx+2] = x2;
804: x[idx+3] = x3;
805: x[idx+4] = x4;
806: }
808: ISRestoreIndices(isrow,&r);
809: VecRestoreArray(bb,&b);
810: VecRestoreArray(xx,&x);
811: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
812: return(0);
813: }
817: PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
818: {
819: MatScalar *v,*diag;
820: PetscScalar *xp,x0,x1,x2,x3,x4;
821: PetscInt nz,*vj,k;
824: for (k=0; k<mbs; k++){
825: v = aa + 25*ai[k];
826: xp = x + k*5;
827: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
828: nz = ai[k+1] - ai[k];
829: vj = aj + ai[k];
830: xp = x + (*vj)*5;
831: while (nz--) {
832: /* x(:) += U(k,:)^T*(Dk*xk) */
833: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
834: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
835: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
836: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
837: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
838: vj++; xp = x + (*vj)*5;
839: v += 25;
840: }
841: /* xk = inv(Dk)*(Dk*xk) */
842: diag = aa+k*25; /* ptr to inv(Dk) */
843: xp = x + k*5;
844: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
845: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
846: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
847: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
848: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
849: }
850: return(0);
851: }
855: PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
856: {
857: MatScalar *v;
858: PetscScalar *xp,x0,x1,x2,x3,x4;
859: PetscInt nz,*vj,k;
862: for (k=mbs-1; k>=0; k--){
863: v = aa + 25*ai[k];
864: xp = x + k*5;
865: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
866: nz = ai[k+1] - ai[k];
867: vj = aj + ai[k];
868: xp = x + (*vj)*5;
869: while (nz--) {
870: /* xk += U(k,:)*x(:) */
871: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
872: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
873: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
874: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
875: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
876: vj++;
877: v += 25; xp = x + (*vj)*5;
878: }
879: xp = x + k*5;
880: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
881: }
882: return(0);
883: }
887: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
888: {
889: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
890: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
891: MatScalar *aa=a->a;
892: PetscScalar *x,*b;
896: VecGetArray(bb,&b);
897: VecGetArray(xx,&x);
899: /* solve U^T * D * y = b by forward substitution */
900: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
901: ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
903: /* solve U*x = y by back substitution */
904: BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
906: VecRestoreArray(bb,&b);
907: VecRestoreArray(xx,&x);
908: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
909: return(0);
910: }
914: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
915: {
916: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
917: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
918: MatScalar *aa=a->a;
919: PetscScalar *x,*b;
923: VecGetArray(bb,&b);
924: VecGetArray(xx,&x);
925: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
926: ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
927: VecRestoreArray(bb,&b);
928: VecRestoreArray(xx,&x);
929: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
930: return(0);
931: }
935: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
936: {
937: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
938: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
939: MatScalar *aa=a->a;
940: PetscScalar *x,*b;
944: VecGetArray(bb,&b);
945: VecGetArray(xx,&x);
946: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
947: BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
948: VecRestoreArray(bb,&b);
949: VecRestoreArray(xx,&x);
950: PetscLogFlops(2.0*bs2*(a->nz-mbs));
951: return(0);
952: }
956: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
957: {
958: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
959: IS isrow=a->row;
960: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
962: const PetscInt *r;
963: PetscInt nz,*vj,k,idx;
964: MatScalar *aa=a->a,*v,*diag;
965: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
968: VecGetArray(bb,&b);
969: VecGetArray(xx,&x);
970: t = a->solve_work;
971: ISGetIndices(isrow,&r);
973: /* solve U^T * D * y = b by forward substitution */
974: tp = t;
975: for (k=0; k<mbs; k++) { /* t <- perm(b) */
976: idx = 4*r[k];
977: tp[0] = b[idx];
978: tp[1] = b[idx+1];
979: tp[2] = b[idx+2];
980: tp[3] = b[idx+3];
981: tp += 4;
982: }
983:
984: for (k=0; k<mbs; k++){
985: v = aa + 16*ai[k];
986: vj = aj + ai[k];
987: tp = t + k*4;
988: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
989: nz = ai[k+1] - ai[k];
991: tp = t + (*vj)*4;
992: while (nz--) {
993: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
994: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
995: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
996: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
997: vj++; tp = t + (*vj)*4;
998: v += 16;
999: }
1001: /* xk = inv(Dk)*(Dk*xk) */
1002: diag = aa+k*16; /* ptr to inv(Dk) */
1003: tp = t + k*4;
1004: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1005: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1006: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1007: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1008: }
1010: /* solve U*x = y by back substitution */
1011: for (k=mbs-1; k>=0; k--){
1012: v = aa + 16*ai[k];
1013: vj = aj + ai[k];
1014: tp = t + k*4;
1015: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1016: nz = ai[k+1] - ai[k];
1017:
1018: tp = t + (*vj)*4;
1019: while (nz--) {
1020: /* xk += U(k,:)*x(:) */
1021: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1022: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1023: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1024: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1025: vj++; tp = t + (*vj)*4;
1026: v += 16;
1027: }
1028: tp = t + k*4;
1029: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1030: idx = 4*r[k];
1031: x[idx] = x0;
1032: x[idx+1] = x1;
1033: x[idx+2] = x2;
1034: x[idx+3] = x3;
1035: }
1037: ISRestoreIndices(isrow,&r);
1038: VecRestoreArray(bb,&b);
1039: VecRestoreArray(xx,&x);
1040: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1041: return(0);
1042: }
1046: PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1047: {
1048: MatScalar *v,*diag;
1049: PetscScalar *xp,x0,x1,x2,x3;
1050: PetscInt nz,*vj,k;
1053: for (k=0; k<mbs; k++){
1054: v = aa + 16*ai[k];
1055: xp = x + k*4;
1056: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1057: nz = ai[k+1] - ai[k];
1058: vj = aj + ai[k];
1059: xp = x + (*vj)*4;
1060: while (nz--) {
1061: /* x(:) += U(k,:)^T*(Dk*xk) */
1062: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1063: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1064: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1065: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1066: vj++; xp = x + (*vj)*4;
1067: v += 16;
1068: }
1069: /* xk = inv(Dk)*(Dk*xk) */
1070: diag = aa+k*16; /* ptr to inv(Dk) */
1071: xp = x + k*4;
1072: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1073: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1074: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1075: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1076: }
1077: return(0);
1078: }
1082: PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1083: {
1084: MatScalar *v;
1085: PetscScalar *xp,x0,x1,x2,x3;
1086: PetscInt nz,*vj,k;
1089: for (k=mbs-1; k>=0; k--){
1090: v = aa + 16*ai[k];
1091: xp = x + k*4;
1092: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1093: nz = ai[k+1] - ai[k];
1094: vj = aj + ai[k];
1095: xp = x + (*vj)*4;
1096: while (nz--) {
1097: /* xk += U(k,:)*x(:) */
1098: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1099: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1100: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1101: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1102: vj++;
1103: v += 16; xp = x + (*vj)*4;
1104: }
1105: xp = x + k*4;
1106: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1107: }
1108: return(0);
1109: }
1113: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1114: {
1115: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1116: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1117: MatScalar *aa=a->a;
1118: PetscScalar *x,*b;
1122: VecGetArray(bb,&b);
1123: VecGetArray(xx,&x);
1125: /* solve U^T * D * y = b by forward substitution */
1126: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1127: ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1129: /* solve U*x = y by back substitution */
1130: BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1131: VecRestoreArray(bb,&b);
1132: VecRestoreArray(xx,&x);
1133: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1134: return(0);
1135: }
1139: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1140: {
1141: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1142: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1143: MatScalar *aa=a->a;
1144: PetscScalar *x,*b;
1148: VecGetArray(bb,&b);
1149: VecGetArray(xx,&x);
1150: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1151: ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1152: VecRestoreArray(bb,&b);
1153: VecRestoreArray(xx,&x);
1154: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1155: return(0);
1156: }
1160: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1161: {
1162: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1163: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1164: MatScalar *aa=a->a;
1165: PetscScalar *x,*b;
1169: VecGetArray(bb,&b);
1170: VecGetArray(xx,&x);
1171: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1172: BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1173: VecRestoreArray(bb,&b);
1174: VecRestoreArray(xx,&x);
1175: PetscLogFlops(2.0*bs2*(a->nz-mbs));
1176: return(0);
1177: }
1181: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1182: {
1183: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1184: IS isrow=a->row;
1185: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1187: const PetscInt *r;
1188: PetscInt nz,*vj,k,idx;
1189: MatScalar *aa=a->a,*v,*diag;
1190: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
1193: VecGetArray(bb,&b);
1194: VecGetArray(xx,&x);
1195: t = a->solve_work;
1196: ISGetIndices(isrow,&r);
1198: /* solve U^T * D * y = b by forward substitution */
1199: tp = t;
1200: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1201: idx = 3*r[k];
1202: tp[0] = b[idx];
1203: tp[1] = b[idx+1];
1204: tp[2] = b[idx+2];
1205: tp += 3;
1206: }
1207:
1208: for (k=0; k<mbs; k++){
1209: v = aa + 9*ai[k];
1210: vj = aj + ai[k];
1211: tp = t + k*3;
1212: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1213: nz = ai[k+1] - ai[k];
1215: tp = t + (*vj)*3;
1216: while (nz--) {
1217: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1218: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1219: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1220: vj++; tp = t + (*vj)*3;
1221: v += 9;
1222: }
1224: /* xk = inv(Dk)*(Dk*xk) */
1225: diag = aa+k*9; /* ptr to inv(Dk) */
1226: tp = t + k*3;
1227: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1228: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1229: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1230: }
1232: /* solve U*x = y by back substitution */
1233: for (k=mbs-1; k>=0; k--){
1234: v = aa + 9*ai[k];
1235: vj = aj + ai[k];
1236: tp = t + k*3;
1237: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1238: nz = ai[k+1] - ai[k];
1239:
1240: tp = t + (*vj)*3;
1241: while (nz--) {
1242: /* xk += U(k,:)*x(:) */
1243: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1244: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1245: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1246: vj++; tp = t + (*vj)*3;
1247: v += 9;
1248: }
1249: tp = t + k*3;
1250: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1251: idx = 3*r[k];
1252: x[idx] = x0;
1253: x[idx+1] = x1;
1254: x[idx+2] = x2;
1255: }
1257: ISRestoreIndices(isrow,&r);
1258: VecRestoreArray(bb,&b);
1259: VecRestoreArray(xx,&x);
1260: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1261: return(0);
1262: }
1266: PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1267: {
1268: MatScalar *v,*diag;
1269: PetscScalar *xp,x0,x1,x2;
1270: PetscInt nz,*vj,k;
1273: for (k=0; k<mbs; k++){
1274: v = aa + 9*ai[k];
1275: xp = x + k*3;
1276: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1277: nz = ai[k+1] - ai[k];
1278: vj = aj + ai[k];
1279: xp = x + (*vj)*3;
1280: while (nz--) {
1281: /* x(:) += U(k,:)^T*(Dk*xk) */
1282: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1283: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1284: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1285: vj++; xp = x + (*vj)*3;
1286: v += 9;
1287: }
1288: /* xk = inv(Dk)*(Dk*xk) */
1289: diag = aa+k*9; /* ptr to inv(Dk) */
1290: xp = x + k*3;
1291: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1292: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1293: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1294: }
1295: return(0);
1296: }
1300: PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1301: {
1302: MatScalar *v;
1303: PetscScalar *xp,x0,x1,x2;
1304: PetscInt nz,*vj,k;
1307: for (k=mbs-1; k>=0; k--){
1308: v = aa + 9*ai[k];
1309: xp = x + k*3;
1310: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1311: nz = ai[k+1] - ai[k];
1312: vj = aj + ai[k];
1313: xp = x + (*vj)*3;
1314: while (nz--) {
1315: /* xk += U(k,:)*x(:) */
1316: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1317: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1318: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1319: vj++;
1320: v += 9; xp = x + (*vj)*3;
1321: }
1322: xp = x + k*3;
1323: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1324: }
1325: return(0);
1326: }
1330: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1331: {
1332: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1333: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1334: MatScalar *aa=a->a;
1335: PetscScalar *x,*b;
1337:
1339: VecGetArray(bb,&b);
1340: VecGetArray(xx,&x);
1342: /* solve U^T * D * y = b by forward substitution */
1343: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1344: ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1346: /* solve U*x = y by back substitution */
1347: BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1349: VecRestoreArray(bb,&b);
1350: VecRestoreArray(xx,&x);
1351: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1352: return(0);
1353: }
1357: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1358: {
1359: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1360: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1361: MatScalar *aa=a->a;
1362: PetscScalar *x,*b;
1366: VecGetArray(bb,&b);
1367: VecGetArray(xx,&x);
1368: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1369: ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1370: VecRestoreArray(bb,&b);
1371: VecRestoreArray(xx,&x);
1372: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1373: return(0);
1374: }
1378: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1379: {
1380: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1381: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1382: MatScalar *aa=a->a;
1383: PetscScalar *x,*b;
1387: VecGetArray(bb,&b);
1388: VecGetArray(xx,&x);
1389: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1390: BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1391: VecRestoreArray(bb,&b);
1392: VecRestoreArray(xx,&x);
1393: PetscLogFlops(2.0*bs2*(a->nz-mbs));
1394: return(0);
1395: }
1399: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1400: {
1401: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
1402: IS isrow=a->row;
1403: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1405: const PetscInt *r;
1406: PetscInt nz,*vj,k,k2,idx;
1407: MatScalar *aa=a->a,*v,*diag;
1408: PetscScalar *x,*b,x0,x1,*t;
1411: VecGetArray(bb,&b);
1412: VecGetArray(xx,&x);
1413: t = a->solve_work;
1414: ISGetIndices(isrow,&r);
1416: /* solve U^T * D * y = perm(b) by forward substitution */
1417: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1418: idx = 2*r[k];
1419: t[k*2] = b[idx];
1420: t[k*2+1] = b[idx+1];
1421: }
1422: for (k=0; k<mbs; k++){
1423: v = aa + 4*ai[k];
1424: vj = aj + ai[k];
1425: k2 = k*2;
1426: x0 = t[k2]; x1 = t[k2+1];
1427: nz = ai[k+1] - ai[k];
1428: while (nz--) {
1429: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1430: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1431: vj++; v += 4;
1432: }
1433: diag = aa+k*4; /* ptr to inv(Dk) */
1434: t[k2] = diag[0]*x0 + diag[2]*x1;
1435: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1436: }
1438: /* solve U*x = y by back substitution */
1439: for (k=mbs-1; k>=0; k--){
1440: v = aa + 4*ai[k];
1441: vj = aj + ai[k];
1442: k2 = k*2;
1443: x0 = t[k2]; x1 = t[k2+1];
1444: nz = ai[k+1] - ai[k];
1445: while (nz--) {
1446: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1447: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1448: vj++; v += 4;
1449: }
1450: t[k2] = x0;
1451: t[k2+1] = x1;
1452: idx = 2*r[k];
1453: x[idx] = x0;
1454: x[idx+1] = x1;
1455: }
1457: ISRestoreIndices(isrow,&r);
1458: VecRestoreArray(bb,&b);
1459: VecRestoreArray(xx,&x);
1460: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1461: return(0);
1462: }
1466: PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1467: {
1468: MatScalar *v,*diag;
1469: PetscScalar x0,x1;
1470: PetscInt nz,*vj,k,k2;
1473: for (k=0; k<mbs; k++){
1474: v = aa + 4*ai[k];
1475: vj = aj + ai[k];
1476: k2 = k*2;
1477: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1478: nz = ai[k+1] - ai[k];
1479:
1480: while (nz--) {
1481: /* x(:) += U(k,:)^T*(Dk*xk) */
1482: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1483: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1484: vj++; v += 4;
1485: }
1486: /* xk = inv(Dk)*(Dk*xk) */
1487: diag = aa+k*4; /* ptr to inv(Dk) */
1488: x[k2] = diag[0]*x0 + diag[2]*x1;
1489: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1490: }
1491: return(0);
1492: }
1496: PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1497: {
1498: MatScalar *v;
1499: PetscScalar x0,x1;
1500: PetscInt nz,*vj,k,k2;
1503: for (k=mbs-1; k>=0; k--){
1504: v = aa + 4*ai[k];
1505: vj = aj + ai[k];
1506: k2 = k*2;
1507: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1508: nz = ai[k+1] - ai[k];
1509: while (nz--) {
1510: /* xk += U(k,:)*x(:) */
1511: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1512: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1513: vj++; v += 4;
1514: }
1515: x[k2] = x0;
1516: x[k2+1] = x1;
1517: }
1518: return(0);
1519: }
1523: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1524: {
1525: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1526: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1527: MatScalar *aa=a->a;
1528: PetscScalar *x,*b;
1532: VecGetArray(bb,&b);
1533: VecGetArray(xx,&x);
1535: /* solve U^T * D * y = b by forward substitution */
1536: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1537: ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1539: /* solve U*x = y by back substitution */
1540: BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1542: VecRestoreArray(bb,&b);
1543: VecRestoreArray(xx,&x);
1544: PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1545: return(0);
1546: }
1550: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1551: {
1552: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1553: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1554: MatScalar *aa=a->a;
1555: PetscScalar *x,*b;
1559: VecGetArray(bb,&b);
1560: VecGetArray(xx,&x);
1561: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1562: ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1563: VecRestoreArray(bb,&b);
1564: VecRestoreArray(xx,&x);
1565: PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1566: return(0);
1567: }
1571: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1572: {
1573: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1574: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1575: MatScalar *aa=a->a;
1576: PetscScalar *x,*b;
1580: VecGetArray(bb,&b);
1581: VecGetArray(xx,&x);
1582: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1583: BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1584: VecRestoreArray(bb,&b);
1585: VecRestoreArray(xx,&x);
1586: PetscLogFlops(2.0*bs2*(a->nz - mbs));
1587: return(0);
1588: }
1592: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1593: {
1594: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1595: IS isrow=a->row;
1596: PetscErrorCode ierr;
1597: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1598: const MatScalar *aa=a->a,*v;
1599: const PetscScalar *b;
1600: PetscScalar *x,xk,*t;
1601: PetscInt nz,k,j;
1604: VecGetArray(bb,(PetscScalar**)&b);
1605: VecGetArray(xx,&x);
1606: t = a->solve_work;
1607: ISGetIndices(isrow,&rp);
1608:
1609: /* solve U^T*D*y = perm(b) by forward substitution */
1610: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1611: for (k=0; k<mbs; k++){
1612: v = aa + ai[k];
1613: vj = aj + ai[k];
1614: xk = t[k];
1615: nz = ai[k+1] - ai[k] - 1;
1616: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1617: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1618: }
1620: /* solve U*perm(x) = y by back substitution */
1621: for (k=mbs-1; k>=0; k--){
1622: v = aa + adiag[k] - 1;
1623: vj = aj + adiag[k] - 1;
1624: nz = ai[k+1] - ai[k] - 1;
1625: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1626: x[rp[k]] = t[k];
1627: }
1629: ISRestoreIndices(isrow,&rp);
1630: VecRestoreArray(bb,(PetscScalar**)&b);
1631: VecRestoreArray(xx,&x);
1632: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1633: return(0);
1634: }
1638: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1639: {
1640: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1641: IS isrow=a->row;
1642: PetscErrorCode ierr;
1643: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1644: const MatScalar *aa=a->a,*v;
1645: PetscScalar *x,*b,xk,*t;
1646: PetscInt nz,k;
1649: VecGetArray(bb,&b);
1650: VecGetArray(xx,&x);
1651: t = a->solve_work;
1652: ISGetIndices(isrow,&rp);
1653:
1654: /* solve U^T*D*y = perm(b) by forward substitution */
1655: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1656: for (k=0; k<mbs; k++){
1657: v = aa + ai[k] + 1;
1658: vj = aj + ai[k] + 1;
1659: xk = t[k];
1660: nz = ai[k+1] - ai[k] - 1;
1661: while (nz--) t[*vj++] += (*v++) * xk;
1662: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1663: }
1665: /* solve U*perm(x) = y by back substitution */
1666: for (k=mbs-1; k>=0; k--){
1667: v = aa + ai[k] + 1;
1668: vj = aj + ai[k] + 1;
1669: nz = ai[k+1] - ai[k] - 1;
1670: while (nz--) t[k] += (*v++) * t[*vj++];
1671: x[rp[k]] = t[k];
1672: }
1674: ISRestoreIndices(isrow,&rp);
1675: VecRestoreArray(bb,&b);
1676: VecRestoreArray(xx,&x);
1677: PetscLogFlops(4.0*a->nz - 3*mbs);
1678: return(0);
1679: }
1683: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1684: {
1685: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1686: IS isrow=a->row;
1687: PetscErrorCode ierr;
1688: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1689: const MatScalar *aa=a->a,*v;
1690: PetscReal diagk;
1691: PetscScalar *x,*b,xk;
1692: PetscInt nz,k;
1695: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1696: VecGetArray(bb,&b);
1697: VecGetArray(xx,&x);
1698: ISGetIndices(isrow,&rp);
1699:
1700: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1701: for (k=0; k<mbs; k++){
1702: v = aa + ai[k];
1703: vj = aj + ai[k];
1704: xk = x[k];
1705: nz = ai[k+1] - ai[k] - 1;
1706: while (nz--) x[*vj++] += (*v++) * xk;
1708: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1709: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1710: x[k] = xk*sqrt(diagk);
1711: }
1712: ISRestoreIndices(isrow,&rp);
1713: VecRestoreArray(bb,&b);
1714: VecRestoreArray(xx,&x);
1715: PetscLogFlops(2.0*a->nz - mbs);
1716: return(0);
1717: }
1721: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1722: {
1723: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1724: IS isrow=a->row;
1725: PetscErrorCode ierr;
1726: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1727: const MatScalar *aa=a->a,*v;
1728: PetscReal diagk;
1729: PetscScalar *x,*b,xk;
1730: PetscInt nz,k;
1733: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1734: VecGetArray(bb,&b);
1735: VecGetArray(xx,&x);
1736: ISGetIndices(isrow,&rp);
1737:
1738: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1739: for (k=0; k<mbs; k++){
1740: v = aa + ai[k] + 1;
1741: vj = aj + ai[k] + 1;
1742: xk = x[k];
1743: nz = ai[k+1] - ai[k] - 1;
1744: while (nz--) x[*vj++] += (*v++) * xk;
1746: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1747: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1748: x[k] = xk*sqrt(diagk);
1749: }
1750: ISRestoreIndices(isrow,&rp);
1751: VecRestoreArray(bb,&b);
1752: VecRestoreArray(xx,&x);
1753: PetscLogFlops(2.0*a->nz - mbs);
1754: return(0);
1755: }
1759: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1760: {
1761: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1762: IS isrow=a->row;
1763: PetscErrorCode ierr;
1764: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1765: const MatScalar *aa=a->a,*v;
1766: PetscReal diagk;
1767: PetscScalar *x,*b,*t;
1768: PetscInt nz,k;
1771: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1772: VecGetArray(bb,&b);
1773: VecGetArray(xx,&x);
1774: t = a->solve_work;
1775: ISGetIndices(isrow,&rp);
1777: for (k=mbs-1; k>=0; k--){
1778: v = aa + ai[k];
1779: vj = aj + ai[k];
1780: diagk = PetscRealPart(aa[adiag[k]]);
1781: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1782: t[k] = b[k] * sqrt(diagk);
1783: nz = ai[k+1] - ai[k] - 1;
1784: while (nz--) t[k] += (*v++) * t[*vj++];
1785: x[rp[k]] = t[k];
1786: }
1787: ISRestoreIndices(isrow,&rp);
1788: VecRestoreArray(bb,&b);
1789: VecRestoreArray(xx,&x);
1790: PetscLogFlops(2.0*a->nz - mbs);
1791: return(0);
1792: }
1796: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1797: {
1798: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1799: IS isrow=a->row;
1800: PetscErrorCode ierr;
1801: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1802: const MatScalar *aa=a->a,*v;
1803: PetscReal diagk;
1804: PetscScalar *x,*b,*t;
1805: PetscInt nz,k;
1808: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1809: VecGetArray(bb,&b);
1810: VecGetArray(xx,&x);
1811: t = a->solve_work;
1812: ISGetIndices(isrow,&rp);
1814: for (k=mbs-1; k>=0; k--){
1815: v = aa + ai[k] + 1;
1816: vj = aj + ai[k] + 1;
1817: diagk = PetscRealPart(aa[ai[k]]);
1818: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1819: t[k] = b[k] * sqrt(diagk);
1820: nz = ai[k+1] - ai[k] - 1;
1821: while (nz--) t[k] += (*v++) * t[*vj++];
1822: x[rp[k]] = t[k];
1823: }
1824: ISRestoreIndices(isrow,&rp);
1825: VecRestoreArray(bb,&b);
1826: VecRestoreArray(xx,&x);
1827: PetscLogFlops(2.0*a->nz - mbs);
1828: return(0);
1829: }
1833: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1834: {
1835: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1839: if (A->rmap->bs == 1) {
1840: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1841: } else {
1842: IS isrow=a->row;
1843: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1844: const MatScalar *aa=a->a,*v;
1845: PetscScalar *x,*b,*t;
1846: PetscInt nz,k,n,i,j;
1847: if (bb->n > a->solves_work_n) {
1848: PetscFree(a->solves_work);
1849: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1850: a->solves_work_n = bb->n;
1851: }
1852: n = bb->n;
1853: VecGetArray(bb->v,&b);
1854: VecGetArray(xx->v,&x);
1855: t = a->solves_work;
1857: ISGetIndices(isrow,&rp);
1858:
1859: /* solve U^T*D*y = perm(b) by forward substitution */
1860: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1861: for (k=0; k<mbs; k++){
1862: v = aa + ai[k];
1863: vj = aj + ai[k];
1864: nz = ai[k+1] - ai[k] - 1;
1865: for (j=0; j<nz; j++){
1866: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1867: v++;vj++;
1868: }
1869: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1870: }
1871:
1872: /* solve U*perm(x) = y by back substitution */
1873: for (k=mbs-1; k>=0; k--){
1874: v = aa + ai[k] - 1;
1875: vj = aj + ai[k] - 1;
1876: nz = ai[k+1] - ai[k] - 1;
1877: for (j=0; j<nz; j++){
1878: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1879: v++;vj++;
1880: }
1881: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1882: }
1884: ISRestoreIndices(isrow,&rp);
1885: VecRestoreArray(bb->v,&b);
1886: VecRestoreArray(xx->v,&x);
1887: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1888: }
1889: return(0);
1890: }
1894: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1895: {
1896: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1900: if (A->rmap->bs == 1) {
1901: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1902: } else {
1903: IS isrow=a->row;
1904: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1905: const MatScalar *aa=a->a,*v;
1906: PetscScalar *x,*b,*t;
1907: PetscInt nz,k,n,i;
1908: if (bb->n > a->solves_work_n) {
1909: PetscFree(a->solves_work);
1910: PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1911: a->solves_work_n = bb->n;
1912: }
1913: n = bb->n;
1914: VecGetArray(bb->v,&b);
1915: VecGetArray(xx->v,&x);
1916: t = a->solves_work;
1918: ISGetIndices(isrow,&rp);
1919:
1920: /* solve U^T*D*y = perm(b) by forward substitution */
1921: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1922: for (k=0; k<mbs; k++){
1923: v = aa + ai[k];
1924: vj = aj + ai[k];
1925: nz = ai[k+1] - ai[k];
1926: while (nz--) {
1927: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1928: v++;vj++;
1929: }
1930: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1931: }
1932:
1933: /* solve U*perm(x) = y by back substitution */
1934: for (k=mbs-1; k>=0; k--){
1935: v = aa + ai[k];
1936: vj = aj + ai[k];
1937: nz = ai[k+1] - ai[k];
1938: while (nz--) {
1939: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1940: v++;vj++;
1941: }
1942: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1943: }
1945: ISRestoreIndices(isrow,&rp);
1946: VecRestoreArray(bb->v,&b);
1947: VecRestoreArray(xx->v,&x);
1948: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1949: }
1950: return(0);
1951: }
1955: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1956: {
1957: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1958: PetscErrorCode ierr;
1959: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1960: const MatScalar *aa=a->a,*v;
1961: const PetscScalar *b;
1962: PetscScalar *x,xi;
1963: PetscInt nz,i,j;
1966: VecGetArray(bb,(PetscScalar**)&b);
1967: VecGetArray(xx,&x);
1968:
1969: /* solve U^T*D*y = b by forward substitution */
1970: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1971: for (i=0; i<mbs; i++){
1972: v = aa + ai[i];
1973: vj = aj + ai[i];
1974: xi = x[i];
1975: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1976: for (j=0; j<nz; j++){
1977: x[vj[j]] += v[j]* xi;
1978: }
1979: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1980: }
1982: /* solve U*x = y by backward substitution */
1983: for (i=mbs-2; i>=0; i--){
1984: xi = x[i];
1985: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1986: vj = aj + adiag[i] - 1;
1987: nz = ai[i+1] - ai[i] - 1;
1988: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1989: x[i] = xi;
1990: }
1991:
1992: VecRestoreArray(bb,(PetscScalar**)&b);
1993: VecRestoreArray(xx,&x);
1994: PetscLogFlops(4.0*a->nz - 3*mbs);
1995: return(0);
1996: }
2000: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2001: {
2002: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2004: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2005: MatScalar *aa=a->a,*v;
2006: PetscScalar *x,*b,xk;
2007: PetscInt nz,*vj,k;
2010: VecGetArray(bb,&b);
2011: VecGetArray(xx,&x);
2012:
2013: /* solve U^T*D*y = b by forward substitution */
2014: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2015: for (k=0; k<mbs; k++){
2016: v = aa + ai[k] + 1;
2017: vj = aj + ai[k] + 1;
2018: xk = x[k];
2019: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2020: while (nz--) x[*vj++] += (*v++) * xk;
2021: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2022: }
2024: /* solve U*x = y by back substitution */
2025: for (k=mbs-2; k>=0; k--){
2026: v = aa + ai[k] + 1;
2027: vj = aj + ai[k] + 1;
2028: xk = x[k];
2029: nz = ai[k+1] - ai[k] - 1;
2030: while (nz--) {
2031: xk += (*v++) * x[*vj++];
2032: }
2033: x[k] = xk;
2034: }
2036: VecRestoreArray(bb,&b);
2037: VecRestoreArray(xx,&x);
2038: PetscLogFlops(4.0*a->nz - 3*mbs);
2039: return(0);
2040: }
2044: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2045: {
2046: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2047: PetscErrorCode ierr;
2048: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2049: const MatScalar *aa=a->a,*v;
2050: PetscReal diagk;
2051: PetscScalar *x,*b;
2052: PetscInt nz,*vj,k;
2055: /* solve U^T*D^(1/2)*x = b by forward substitution */
2056: VecGetArray(bb,&b);
2057: VecGetArray(xx,&x);
2058: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2059: for (k=0; k<mbs; k++){
2060: v = aa + ai[k];
2061: vj = aj + ai[k];
2062: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2063: while (nz--) x[*vj++] += (*v++) * x[k];
2064: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2065: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2066: x[k] *= sqrt(diagk);
2067: }
2068: VecRestoreArray(bb,&b);
2069: VecRestoreArray(xx,&x);
2070: PetscLogFlops(2.0*a->nz - mbs);
2071: return(0);
2072: }
2076: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2077: {
2078: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2080: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2081: MatScalar *aa=a->a,*v;
2082: PetscReal diagk;
2083: PetscScalar *x,*b;
2084: PetscInt nz,*vj,k;
2087: /* solve U^T*D^(1/2)*x = b by forward substitution */
2088: VecGetArray(bb,&b);
2089: VecGetArray(xx,&x);
2090: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2091: for (k=0; k<mbs; k++){
2092: v = aa + ai[k] + 1;
2093: vj = aj + ai[k] + 1;
2094: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2095: while (nz--) x[*vj++] += (*v++) * x[k];
2096: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2097: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2098: x[k] *= sqrt(diagk);
2099: }
2100: VecRestoreArray(bb,&b);
2101: VecRestoreArray(xx,&x);
2102: PetscLogFlops(2.0*a->nz - mbs);
2103: return(0);
2104: }
2108: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2109: {
2110: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2112: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2113: MatScalar *aa=a->a,*v;
2114: PetscReal diagk;
2115: PetscScalar *x,*b;
2116: PetscInt nz,*vj,k;
2119: /* solve D^(1/2)*U*x = b by back substitution */
2120: VecGetArray(bb,&b);
2121: VecGetArray(xx,&x);
2123: for (k=mbs-1; k>=0; k--){
2124: v = aa + ai[k];
2125: vj = aj + ai[k];
2126: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2127: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2128: x[k] = sqrt(diagk)*b[k];
2129: nz = ai[k+1] - ai[k] - 1;
2130: while (nz--) x[k] += (*v++) * x[*vj++];
2131: }
2132: VecRestoreArray(bb,&b);
2133: VecRestoreArray(xx,&x);
2134: PetscLogFlops(2.0*a->nz - mbs);
2135: return(0);
2136: }
2140: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2141: {
2142: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2144: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
2145: MatScalar *aa=a->a,*v;
2146: PetscReal diagk;
2147: PetscScalar *x,*b;
2148: PetscInt nz,*vj,k;
2151: /* solve D^(1/2)*U*x = b by back substitution */
2152: VecGetArray(bb,&b);
2153: VecGetArray(xx,&x);
2155: for (k=mbs-1; k>=0; k--){
2156: v = aa + ai[k] + 1;
2157: vj = aj + ai[k] + 1;
2158: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2159: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2160: x[k] = sqrt(diagk)*b[k];
2161: nz = ai[k+1] - ai[k] - 1;
2162: while (nz--) x[k] += (*v++) * x[*vj++];
2163: }
2164: VecRestoreArray(bb,&b);
2165: VecRestoreArray(xx,&x);
2166: PetscLogFlops(2.0*a->nz - mbs);
2167: return(0);
2168: }
2170: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2173: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2174: {
2175: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2177: const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2178: PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2179: PetscInt m,reallocs = 0,*levtmp;
2180: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2181: PetscInt incrlev,*lev,shift,prow,nz;
2182: PetscReal f = info->fill,levels = info->levels;
2183: PetscTruth perm_identity;
2186: /* check whether perm is the identity mapping */
2187: ISIdentity(perm,&perm_identity);
2189: if (perm_identity){
2190: a->permute = PETSC_FALSE;
2191: ai = a->i; aj = a->j;
2192: } else { /* non-trivial permutation */
2193: SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2194: a->permute = PETSC_TRUE;
2195: MatReorderingSeqSBAIJ(A, perm);
2196: ai = a->inew; aj = a->jnew;
2197: }
2198:
2199: /* initialization */
2200: ISGetIndices(perm,&rip);
2201: umax = (PetscInt)(f*ai[mbs] + 1);
2202: PetscMalloc(umax*sizeof(PetscInt),&lev);
2203: umax += mbs + 1;
2204: shift = mbs + 1;
2205: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
2206: PetscMalloc(umax*sizeof(PetscInt),&ju);
2207: iu[0] = mbs + 1;
2208: juidx = mbs + 1;
2209: /* prowl: linked list for pivot row */
2210: PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);
2211: /* q: linked list for col index */
2212:
2213: for (i=0; i<mbs; i++){
2214: prowl[i] = mbs;
2215: q[i] = 0;
2216: }
2218: /* for each row k */
2219: for (k=0; k<mbs; k++){
2220: nzk = 0;
2221: q[k] = mbs;
2222: /* copy current row into linked list */
2223: nz = ai[rip[k]+1] - ai[rip[k]];
2224: j = ai[rip[k]];
2225: while (nz--){
2226: vj = rip[aj[j++]];
2227: if (vj > k){
2228: qm = k;
2229: do {
2230: m = qm; qm = q[m];
2231: } while(qm < vj);
2232: if (qm == vj) {
2233: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
2234: }
2235: nzk++;
2236: q[m] = vj;
2237: q[vj] = qm;
2238: levtmp[vj] = 0; /* initialize lev for nonzero element */
2239: }
2240: }
2242: /* modify nonzero structure of k-th row by computing fill-in
2243: for each row prow to be merged in */
2244: prow = k;
2245: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2246:
2247: while (prow < k){
2248: /* merge row prow into k-th row */
2249: jmin = iu[prow] + 1;
2250: jmax = iu[prow+1];
2251: qm = k;
2252: for (j=jmin; j<jmax; j++){
2253: incrlev = lev[j-shift] + 1;
2254: if (incrlev > levels) continue;
2255: vj = ju[j];
2256: do {
2257: m = qm; qm = q[m];
2258: } while (qm < vj);
2259: if (qm != vj){ /* a fill */
2260: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2261: levtmp[vj] = incrlev;
2262: } else {
2263: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2264: }
2265: }
2266: prow = prowl[prow]; /* next pivot row */
2267: }
2268:
2269: /* add k to row list for first nonzero element in k-th row */
2270: if (nzk > 1){
2271: i = q[k]; /* col value of first nonzero element in k_th row of U */
2272: prowl[k] = prowl[i]; prowl[i] = k;
2273: }
2274: iu[k+1] = iu[k] + nzk;
2276: /* allocate more space to ju and lev if needed */
2277: if (iu[k+1] > umax) {
2278: /* estimate how much additional space we will need */
2279: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2280: /* just double the memory each time */
2281: maxadd = umax;
2282: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2283: umax += maxadd;
2285: /* allocate a longer ju */
2286: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2287: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2288: PetscFree(ju);
2289: ju = jutmp;
2291: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2292: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2293: PetscFree(lev);
2294: lev = jutmp;
2295: reallocs += 2; /* count how many times we realloc */
2296: }
2298: /* save nonzero structure of k-th row in ju */
2299: i=k;
2300: while (nzk --) {
2301: i = q[i];
2302: ju[juidx] = i;
2303: lev[juidx-shift] = levtmp[i];
2304: juidx++;
2305: }
2306: }
2307:
2308: #if defined(PETSC_USE_INFO)
2309: if (ai[mbs] != 0) {
2310: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2311: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2312: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2313: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2314: PetscInfo(A,"for best performance.\n");
2315: } else {
2316: PetscInfo(A,"Empty matrix.\n");
2317: }
2318: #endif
2320: ISRestoreIndices(perm,&rip);
2321: PetscFree3(prowl,q,levtmp);
2322: PetscFree(lev);
2324: /* put together the new matrix */
2325: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);
2327: /* PetscLogObjectParent(B,iperm); */
2328: b = (Mat_SeqSBAIJ*)(B)->data;
2329: PetscFree2(b->imax,b->ilen);
2330: b->singlemalloc = PETSC_FALSE;
2331: b->free_a = PETSC_TRUE;
2332: b->free_ij = PETSC_TRUE;
2333: /* the next line frees the default space generated by the Create() */
2334: PetscFree3(b->a,b->j,b->i);
2335: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
2336: b->j = ju;
2337: b->i = iu;
2338: b->diag = 0;
2339: b->ilen = 0;
2340: b->imax = 0;
2341:
2342: if (b->row) {
2343: ISDestroy(b->row);
2344: }
2345: if (b->icol) {
2346: ISDestroy(b->icol);
2347: }
2348: b->row = perm;
2349: b->icol = perm;
2350: PetscObjectReference((PetscObject)perm);
2351: PetscObjectReference((PetscObject)perm);
2352: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2353: /* In b structure: Free imax, ilen, old a, old j.
2354: Allocate idnew, solve_work, new a, new j */
2355: PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2356: b->maxnz = b->nz = iu[mbs];
2357:
2358: (B)->info.factor_mallocs = reallocs;
2359: (B)->info.fill_ratio_given = f;
2360: if (ai[mbs] != 0) {
2361: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2362: } else {
2363: (B)->info.fill_ratio_needed = 0.0;
2364: }
2365: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2366: return(0);
2367: }
2369: /*
2370: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2371: */
2372: #include petscbt.h
2373: #include ../src/mat/utils/freespace.h
2376: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2377: {
2378: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2379: PetscErrorCode ierr;
2380: PetscTruth perm_identity,free_ij = PETSC_TRUE,missing;
2381: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2382: const PetscInt *rip;
2383: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2384: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2385: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2386: PetscReal fill=info->fill,levels=info->levels;
2387: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2388: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2389: PetscBT lnkbt;
2392: if (bs > 1){
2393: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2394: return(0);
2395: }
2396: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2397: MatMissingDiagonal(A,&missing,&d);
2398: if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2400: /* check whether perm is the identity mapping */
2401: ISIdentity(perm,&perm_identity);
2402: if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2403: a->permute = PETSC_FALSE;
2405: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2406: PetscMalloc((am+1)*sizeof(PetscInt),&udiag);
2407: ui[0] = 0;
2408:
2409: /* ICC(0) without matrix ordering: simply rearrange column indices */
2410: if (!levels){
2411: /* reuse the column pointers and row offsets for memory savings */
2412: for (i=0; i<am; i++) {
2413: ncols = ai[i+1] - ai[i];
2414: ui[i+1] = ui[i] + ncols;
2415: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2416: }
2417: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2418: cols = uj;
2419: for (i=0; i<am; i++) {
2420: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2421: ncols = ai[i+1] - ai[i] -1;
2422: for (j=0; j<ncols; j++) *cols++ = aj[j];
2423: *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2424: }
2425: } else { /* case: levels>0 */
2426: ISGetIndices(perm,&rip);
2428: /* initialization */
2429: /* jl: linked list for storing indices of the pivot rows
2430: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2431: PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2432: for (i=0; i<am; i++){
2433: jl[i] = am; il[i] = 0;
2434: }
2436: /* create and initialize a linked list for storing column indices of the active row k */
2437: nlnk = am + 1;
2438: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2440: /* initial FreeSpace size is fill*(ai[am]+1) */
2441: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2442: current_space = free_space;
2443: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2444: current_space_lvl = free_space_lvl;
2446: for (k=0; k<am; k++){ /* for each active row k */
2447: /* initialize lnk by the column indices of row k */
2448: nzk = 0;
2449: ncols = ai[k+1] - ai[k];
2450: if (!ncols) SETERRQ1(PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2451: cols = aj+ai[k];
2452: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2453: nzk += nlnk;
2455: /* update lnk by computing fill-in for each pivot row to be merged in */
2456: prow = jl[k]; /* 1st pivot row */
2457:
2458: while (prow < k){
2459: nextprow = jl[prow];
2460:
2461: /* merge prow into k-th row */
2462: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2463: jmax = ui[prow+1];
2464: ncols = jmax-jmin;
2465: i = jmin - ui[prow];
2467: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2468: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2469: j = *(uj - 1);
2470: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2471: nzk += nlnk;
2473: /* update il and jl for prow */
2474: if (jmin < jmax){
2475: il[prow] = jmin;
2476: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2477: }
2478: prow = nextprow;
2479: }
2481: /* if free space is not available, make more free space */
2482: if (current_space->local_remaining<nzk) {
2483: i = am - k + 1; /* num of unfactored rows */
2484: i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2485: PetscFreeSpaceGet(i,¤t_space);
2486: PetscFreeSpaceGet(i,¤t_space_lvl);
2487: reallocs++;
2488: }
2490: /* copy data into free_space and free_space_lvl, then initialize lnk */
2491: if (nzk == 0) SETERRQ1(PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2492: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2494: /* add the k-th row into il and jl */
2495: if (nzk > 1){
2496: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2497: jl[k] = jl[i]; jl[i] = k;
2498: il[k] = ui[k] + 1;
2499: }
2500: uj_ptr[k] = current_space->array;
2501: uj_lvl_ptr[k] = current_space_lvl->array;
2503: current_space->array += nzk;
2504: current_space->local_used += nzk;
2505: current_space->local_remaining -= nzk;
2506: current_space_lvl->array += nzk;
2507: current_space_lvl->local_used += nzk;
2508: current_space_lvl->local_remaining -= nzk;
2510: ui[k+1] = ui[k] + nzk;
2511: }
2513: #if defined(PETSC_USE_INFO)
2514: if (ai[am] != 0) {
2515: PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2516: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2517: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2518: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2519: } else {
2520: PetscInfo(A,"Empty matrix.\n");
2521: }
2522: #endif
2524: ISRestoreIndices(perm,&rip);
2525: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2527: /* destroy list of free space and other temporary array(s) */
2528: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2529: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2530: PetscIncompleteLLDestroy(lnk,lnkbt);
2531: PetscFreeSpaceDestroy(free_space_lvl);
2533: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2535: /* put together the new matrix in MATSEQSBAIJ format */
2536: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
2538: b = (Mat_SeqSBAIJ*)(fact)->data;
2539: PetscFree2(b->imax,b->ilen);
2540: b->singlemalloc = PETSC_FALSE;
2541: b->free_a = PETSC_TRUE;
2542: b->free_ij = free_ij;
2543: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2544: b->j = uj;
2545: b->i = ui;
2546: b->diag = udiag;
2547: b->free_diag = PETSC_TRUE;
2548: b->ilen = 0;
2549: b->imax = 0;
2550: b->row = perm;
2551: b->col = perm;
2552: PetscObjectReference((PetscObject)perm);
2553: PetscObjectReference((PetscObject)perm);
2554: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2555: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2556: PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2557: b->maxnz = b->nz = ui[am];
2558:
2559: fact->info.factor_mallocs = reallocs;
2560: fact->info.fill_ratio_given = fill;
2561: if (ai[am] != 0) {
2562: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2563: } else {
2564: fact->info.fill_ratio_needed = 0.0;
2565: }
2566: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2567: return(0);
2568: }
2572: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2573: {
2574: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2575: Mat_SeqSBAIJ *b;
2576: PetscErrorCode ierr;
2577: PetscTruth perm_identity,free_ij = PETSC_TRUE,missing;
2578: PetscInt bs=A->rmap->bs,am=a->mbs,d;
2579: const PetscInt *cols,*rip,*ai,*aj;
2580: PetscInt reallocs=0,i,*ui;
2581: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2582: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2583: PetscReal fill=info->fill,levels=info->levels,ratio_needed;
2584: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2585: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2586: PetscBT lnkbt;
2589: MatMissingDiagonal(A,&missing,&d);
2590: if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2592: /*
2593: This code originally uses Modified Sparse Row (MSR) storage
2594: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2595: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2596: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2597: thus the original code in MSR format is still used for these cases.
2598: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2599: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2600: */
2601: if (bs > 1){
2602: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2603: return(0);
2604: }
2606: /* check whether perm is the identity mapping */
2607: ISIdentity(perm,&perm_identity);
2608: if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format")
2609:
2610: /* special case that simply copies fill pattern */
2611: if (!levels ) {
2612: a->permute = PETSC_FALSE;
2613: /* reuse the column pointers and row offsets for memory savings */
2614: ui = a->i;
2615: uj = a->j;
2616: free_ij = PETSC_FALSE;
2617: ratio_needed = 1.0;
2618: } else { /* case: levels>0 */
2619: if (perm_identity){
2620: a->permute = PETSC_FALSE;
2621: ai = a->i; aj = a->j;
2622: }
2623: ISGetIndices(perm,&rip);
2625: /* initialization */
2626: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2627: ui[0] = 0;
2629: /* jl: linked list for storing indices of the pivot rows
2630: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2631: PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2632: for (i=0; i<am; i++){
2633: jl[i] = am; il[i] = 0;
2634: }
2636: /* create and initialize a linked list for storing column indices of the active row k */
2637: nlnk = am + 1;
2638: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2640: /* initial FreeSpace size is fill*(ai[am]+1) */
2641: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2642: current_space = free_space;
2643: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2644: current_space_lvl = free_space_lvl;
2646: for (k=0; k<am; k++){ /* for each active row k */
2647: /* initialize lnk by the column indices of row rip[k] */
2648: nzk = 0;
2649: ncols = ai[rip[k]+1] - ai[rip[k]];
2650: cols = aj+ai[rip[k]];
2651: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2652: nzk += nlnk;
2654: /* update lnk by computing fill-in for each pivot row to be merged in */
2655: prow = jl[k]; /* 1st pivot row */
2656:
2657: while (prow < k){
2658: nextprow = jl[prow];
2659:
2660: /* merge prow into k-th row */
2661: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2662: jmax = ui[prow+1];
2663: ncols = jmax-jmin;
2664: i = jmin - ui[prow];
2665: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2666: j = *(uj_lvl_ptr[prow] + i - 1);
2667: cols_lvl = uj_lvl_ptr[prow]+i;
2668: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2669: nzk += nlnk;
2671: /* update il and jl for prow */
2672: if (jmin < jmax){
2673: il[prow] = jmin;
2674: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2675: }
2676: prow = nextprow;
2677: }
2679: /* if free space is not available, make more free space */
2680: if (current_space->local_remaining<nzk) {
2681: i = am - k + 1; /* num of unfactored rows */
2682: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2683: PetscFreeSpaceGet(i,¤t_space);
2684: PetscFreeSpaceGet(i,¤t_space_lvl);
2685: reallocs++;
2686: }
2688: /* copy data into free_space and free_space_lvl, then initialize lnk */
2689: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2691: /* add the k-th row into il and jl */
2692: if (nzk-1 > 0){
2693: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2694: jl[k] = jl[i]; jl[i] = k;
2695: il[k] = ui[k] + 1;
2696: }
2697: uj_ptr[k] = current_space->array;
2698: uj_lvl_ptr[k] = current_space_lvl->array;
2700: current_space->array += nzk;
2701: current_space->local_used += nzk;
2702: current_space->local_remaining -= nzk;
2703: current_space_lvl->array += nzk;
2704: current_space_lvl->local_used += nzk;
2705: current_space_lvl->local_remaining -= nzk;
2707: ui[k+1] = ui[k] + nzk;
2708: }
2710: #if defined(PETSC_USE_INFO)
2711: if (ai[am] != 0) {
2712: PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2713: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2714: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2715: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2716: } else {
2717: PetscInfo(A,"Empty matrix.\n");
2718: }
2719: #endif
2721: ISRestoreIndices(perm,&rip);
2722: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2724: /* destroy list of free space and other temporary array(s) */
2725: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2726: PetscFreeSpaceContiguous(&free_space,uj);
2727: PetscIncompleteLLDestroy(lnk,lnkbt);
2728: PetscFreeSpaceDestroy(free_space_lvl);
2729: if (ai[am] != 0) {
2730: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731: } else {
2732: ratio_needed = 0.0;
2733: }
2734: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2736: /* put together the new matrix in MATSEQSBAIJ format */
2737: MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
2739: b = (Mat_SeqSBAIJ*)(fact)->data;
2740: PetscFree2(b->imax,b->ilen);
2741: b->singlemalloc = PETSC_FALSE;
2742: b->free_a = PETSC_TRUE;
2743: b->free_ij = free_ij;
2744: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2745: b->j = uj;
2746: b->i = ui;
2747: b->diag = 0;
2748: b->ilen = 0;
2749: b->imax = 0;
2750: b->row = perm;
2751: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2752: PetscObjectReference((PetscObject)perm);
2753: b->icol = perm;
2754: PetscObjectReference((PetscObject)perm);
2755: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2756: b->maxnz = b->nz = ui[am];
2757:
2758: fact->info.factor_mallocs = reallocs;
2759: fact->info.fill_ratio_given = fill;
2760: fact->info.fill_ratio_needed = ratio_needed;
2761: if (perm_identity){
2762: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2763: } else {
2764: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2765: }
2766: return(0);
2767: }