Actual source code: sbaijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for SBAIJ format. 
  5: */

 7:  #include ../src/mat/impls/sbaij/seq/sbaij.h
 8:  #include ../src/mat/impls/baij/seq/baij.h
 9:  #include ../src/mat/blockinvert.h

 13: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 14: {
 15:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
 16:   IS              isrow=a->row;
 17:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
 18:   PetscErrorCode  ierr;
 19:   const PetscInt  *r;
 20:   PetscInt        nz,*vj,k,idx,k1;
 21:   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
 22:   MatScalar       *aa=a->a,*v,*diag;
 23:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

 26:   VecGetArray(bb,&b);
 27:   VecGetArray(xx,&x);
 28:   t  = a->solve_work;
 29:   ISGetIndices(isrow,&r);
 30:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 32:   /* solve U^T * D * y = b by forward substitution */
 33:   xk = t;
 34:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 35:     idx   = bs*r[k];
 36:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 37:   }
 38:   for (k=0; k<mbs; k++){
 39:     v  = aa + bs2*ai[k];
 40:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
 41:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 42:     nz = ai[k+1] - ai[k];
 43:     vj = aj + ai[k];
 44:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
 45:     while (nz--) {
 46:       /* x(:) += U(k,:)^T*(Dk*xk) */
 47:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 48:       vj++; xj = t + (*vj)*bs;
 49:       v += bs2;
 50:     }
 51:     /* xk = inv(Dk)*(Dk*xk) */
 52:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 53:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 54:   }

 56:   /* solve U*x = y by back substitution */
 57:   for (k=mbs-1; k>=0; k--){
 58:     v  = aa + bs2*ai[k];
 59:     xk = t + k*bs;        /* xk */
 60:     nz = ai[k+1] - ai[k];
 61:     vj = aj + ai[k];
 62:     xj = t + (*vj)*bs;
 63:     while (nz--) {
 64:       /* xk += U(k,:)*x(:) */
 65:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 66:       vj++;
 67:       v += bs2; xj = t + (*vj)*bs;
 68:     }
 69:     idx   = bs*r[k];
 70:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 71:   }

 73:   PetscFree(xk_tmp);
 74:   ISRestoreIndices(isrow,&r);
 75:   VecRestoreArray(bb,&b);
 76:   VecRestoreArray(xx,&x);
 77:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 78:   return(0);
 79: }

 83: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 84: {
 86:   SETERRQ(1,"not implemented yet");
 87:   return(0);
 88: }

 92: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 93: {
 95:   SETERRQ(1,"not implemented yet");
 96:   return(0);
 97: }

101: PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102: {
104:   PetscInt       nz,*vj,k;
105:   PetscInt       bs2 = bs*bs;
106:   MatScalar      *v,*diag;
107:   PetscScalar    *xk,*xj,*xk_tmp;
108: 
110:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
111:   for (k=0; k<mbs; k++){
112:     v  = aa + bs2*ai[k];
113:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
114:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
115:     nz = ai[k+1] - ai[k];
116:     vj = aj + ai[k];
117:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
118:     while (nz--) {
119:       /* x(:) += U(k,:)^T*(Dk*xk) */
120:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121:       vj++; xj = x + (*vj)*bs;
122:       v += bs2;
123:     }
124:     /* xk = inv(Dk)*(Dk*xk) */
125:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
126:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127:   }
128:   PetscFree(xk_tmp);
129:   return(0);
130: }

134: PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135: {
136:   PetscInt       nz,*vj,k;
137:   PetscInt       bs2 = bs*bs;
138:   MatScalar      *v;
139:   PetscScalar    *xk,*xj;

142:   for (k=mbs-1; k>=0; k--){
143:     v  = aa + bs2*ai[k];
144:     xk = x + k*bs;        /* xk */
145:     nz = ai[k+1] - ai[k];
146:     vj = aj + ai[k];
147:     xj = x + (*vj)*bs;
148:     while (nz--) {
149:       /* xk += U(k,:)*x(:) */
150:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151:       vj++;
152:       v += bs2; xj = x + (*vj)*bs;
153:     }
154:   }
155:   return(0);
156: }

160: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161: {
162:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
164:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
165:   PetscInt       bs=A->rmap->bs;
166:   MatScalar      *aa=a->a;
167:   PetscScalar    *x,*b;

170:   VecGetArray(bb,&b);
171:   VecGetArray(xx,&x);

173:   /* solve U^T * D * y = b by forward substitution */
174:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
175:   ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);

177:   /* solve U*x = y by back substitution */
178:   BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);

180:   VecRestoreArray(bb,&b);
181:   VecRestoreArray(xx,&x);
182:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
183:   return(0);
184: }

188: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
189: {
190:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
192:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
193:   PetscInt       bs=A->rmap->bs;
194:   MatScalar      *aa=a->a;
195:   PetscScalar    *x,*b;

198:   VecGetArray(bb,&b);
199:   VecGetArray(xx,&x);
200:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
201:   ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
202:   VecRestoreArray(bb,&b);
203:   VecRestoreArray(xx,&x);
204:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
205:   return(0);
206: }

210: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
211: {
212:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
214:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
215:   PetscInt       bs=A->rmap->bs;
216:   MatScalar      *aa=a->a;
217:   PetscScalar    *x,*b;

220:   VecGetArray(bb,&b);
221:   VecGetArray(xx,&x);
222:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));
223:   BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);
224:   VecRestoreArray(bb,&b);
225:   VecRestoreArray(xx,&x);
226:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
227:   return(0);
228: }

232: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
233: {
234:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
235:   IS             isrow=a->row;
236:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
238:   const PetscInt *r;
239:   PetscInt       nz,*vj,k,idx;
240:   MatScalar      *aa=a->a,*v,*d;
241:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

244:   VecGetArray(bb,&b);
245:   VecGetArray(xx,&x);
246:   t  = a->solve_work;
247:   ISGetIndices(isrow,&r);

249:   /* solve U^T * D * y = b by forward substitution */
250:   tp = t;
251:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
252:     idx   = 7*r[k];
253:     tp[0] = b[idx];
254:     tp[1] = b[idx+1];
255:     tp[2] = b[idx+2];
256:     tp[3] = b[idx+3];
257:     tp[4] = b[idx+4];
258:     tp[5] = b[idx+5];
259:     tp[6] = b[idx+6];
260:     tp += 7;
261:   }
262: 
263:   for (k=0; k<mbs; k++){
264:     v  = aa + 49*ai[k];
265:     vj = aj + ai[k];
266:     tp = t + k*7;
267:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
268:     nz = ai[k+1] - ai[k];
269:     tp = t + (*vj)*7;
270:     while (nz--) {
271:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
272:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
273:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
274:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
275:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
276:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
277:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
278:       vj++; tp = t + (*vj)*7;
279:       v += 49;
280:     }

282:     /* xk = inv(Dk)*(Dk*xk) */
283:     d  = aa+k*49;          /* ptr to inv(Dk) */
284:     tp    = t + k*7;
285:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292:   }

294:   /* solve U*x = y by back substitution */
295:   for (k=mbs-1; k>=0; k--){
296:     v  = aa + 49*ai[k];
297:     vj = aj + ai[k];
298:     tp    = t + k*7;
299:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
300:     nz = ai[k+1] - ai[k];
301: 
302:     tp = t + (*vj)*7;
303:     while (nz--) {
304:       /* xk += U(k,:)*x(:) */
305:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312:       vj++; tp = t + (*vj)*7;
313:       v += 49;
314:     }
315:     tp    = t + k*7;
316:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
317:     idx   = 7*r[k];
318:     x[idx]     = x0;
319:     x[idx+1]   = x1;
320:     x[idx+2]   = x2;
321:     x[idx+3]   = x3;
322:     x[idx+4]   = x4;
323:     x[idx+5]   = x5;
324:     x[idx+6]   = x6;
325:   }

327:   ISRestoreIndices(isrow,&r);
328:   VecRestoreArray(bb,&b);
329:   VecRestoreArray(xx,&x);
330:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
331:   return(0);
332: }

336: PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
337: {
338:   MatScalar      *v,*d;
339:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
340:   PetscInt       nz,*vj,k;

343:   for (k=0; k<mbs; k++){
344:     v  = aa + 49*ai[k];
345:     xp = x + k*7;
346:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
347:     nz = ai[k+1] - ai[k];
348:     vj = aj + ai[k];
349:     xp = x + (*vj)*7;
350:     while (nz--) {
351:       /* x(:) += U(k,:)^T*(Dk*xk) */
352:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
353:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
354:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
355:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
356:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
357:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
358:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
359:       vj++; xp = x + (*vj)*7;
360:       v += 49;
361:     }
362:     /* xk = inv(Dk)*(Dk*xk) */
363:     d  = aa+k*49;          /* ptr to inv(Dk) */
364:     xp = x + k*7;
365:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
366:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
367:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
368:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
369:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
370:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
371:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
372:   }
373:   return(0);
374: }

378: PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
379: {
380:   MatScalar      *v;
381:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
382:   PetscInt       nz,*vj,k;

385:   for (k=mbs-1; k>=0; k--){
386:     v  = aa + 49*ai[k];
387:     xp = x + k*7;
388:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
389:     nz = ai[k+1] - ai[k];
390:     vj = aj + ai[k];
391:     xp = x + (*vj)*7;
392:     while (nz--) {
393:       /* xk += U(k,:)*x(:) */
394:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
395:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
396:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
397:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
398:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
399:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
400:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
401:       vj++;
402:       v += 49; xp = x + (*vj)*7;
403:     }
404:     xp = x + k*7;
405:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
406:   }
407:   return(0);
408: }

412: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
413: {
414:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
416:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
417:   MatScalar      *aa=a->a;
418:   PetscScalar    *x,*b;

421:   VecGetArray(bb,&b);
422:   VecGetArray(xx,&x);
423: 
424:   /* solve U^T * D * y = b by forward substitution */
425:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
426:   ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);

428:   /* solve U*x = y by back substitution */
429:   BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);

431:   VecRestoreArray(bb,&b);
432:   VecRestoreArray(xx,&x);
433:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
434:   return(0);
435: }

439: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
440: {
441:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
443:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
444:   MatScalar      *aa=a->a;
445:   PetscScalar    *x,*b;

448:   VecGetArray(bb,&b);
449:   VecGetArray(xx,&x);
450:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
451:   ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
452:   VecRestoreArray(bb,&b);
453:   VecRestoreArray(xx,&x);
454:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
455:   return(0);
456: }

460: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
461: {
462:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
464:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
465:   MatScalar      *aa=a->a;
466:   PetscScalar    *x,*b;

469:   VecGetArray(bb,&b);
470:   VecGetArray(xx,&x);
471:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));
472:   BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);
473:   VecRestoreArray(bb,&b);
474:   VecRestoreArray(xx,&x);
475:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
476:   return(0);
477: }

481: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
482: {
483:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
484:   IS             isrow=a->row;
485:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
487:   const PetscInt *r;
488:   PetscInt       nz,*vj,k,idx;
489:   MatScalar      *aa=a->a,*v,*d;
490:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

493:   VecGetArray(bb,&b);
494:   VecGetArray(xx,&x);
495:   t  = a->solve_work;
496:   ISGetIndices(isrow,&r);

498:   /* solve U^T * D * y = b by forward substitution */
499:   tp = t;
500:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
501:     idx   = 6*r[k];
502:     tp[0] = b[idx];
503:     tp[1] = b[idx+1];
504:     tp[2] = b[idx+2];
505:     tp[3] = b[idx+3];
506:     tp[4] = b[idx+4];
507:     tp[5] = b[idx+5];
508:     tp += 6;
509:   }
510: 
511:   for (k=0; k<mbs; k++){
512:     v  = aa + 36*ai[k];
513:     vj = aj + ai[k];
514:     tp = t + k*6;
515:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
516:     nz = ai[k+1] - ai[k];
517:     tp = t + (*vj)*6;
518:     while (nz--) {
519:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
520:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
521:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
522:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
523:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
524:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
525:       vj++; tp = t + (*vj)*6;
526:       v += 36;
527:     }

529:     /* xk = inv(Dk)*(Dk*xk) */
530:     d  = aa+k*36;          /* ptr to inv(Dk) */
531:     tp    = t + k*6;
532:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
533:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
534:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
535:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
536:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
537:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
538:   }

540:   /* solve U*x = y by back substitution */
541:   for (k=mbs-1; k>=0; k--){
542:     v  = aa + 36*ai[k];
543:     vj = aj + ai[k];
544:     tp    = t + k*6;
545:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
546:     nz = ai[k+1] - ai[k];
547: 
548:     tp = t + (*vj)*6;
549:     while (nz--) {
550:       /* xk += U(k,:)*x(:) */
551:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
552:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
553:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
554:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
555:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
556:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
557:       vj++; tp = t + (*vj)*6;
558:       v += 36;
559:     }
560:     tp    = t + k*6;
561:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
562:     idx   = 6*r[k];
563:     x[idx]     = x0;
564:     x[idx+1]   = x1;
565:     x[idx+2]   = x2;
566:     x[idx+3]   = x3;
567:     x[idx+4]   = x4;
568:     x[idx+5]   = x5;
569:   }

571:   ISRestoreIndices(isrow,&r);
572:   VecRestoreArray(bb,&b);
573:   VecRestoreArray(xx,&x);
574:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
575:   return(0);
576: }

580: PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
581: {
582:   MatScalar      *v,*d;
583:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
584:   PetscInt       nz,*vj,k;

587:   for (k=0; k<mbs; k++){
588:     v  = aa + 36*ai[k];
589:     xp = x + k*6;
590:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
591:     nz = ai[k+1] - ai[k];
592:     vj = aj + ai[k];
593:     xp = x + (*vj)*6;
594:     while (nz--) {
595:       /* x(:) += U(k,:)^T*(Dk*xk) */
596:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
597:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
598:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
599:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
600:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
601:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
602:       vj++; xp = x + (*vj)*6;
603:       v += 36;
604:     }
605:     /* xk = inv(Dk)*(Dk*xk) */
606:     d  = aa+k*36;          /* ptr to inv(Dk) */
607:     xp = x + k*6;
608:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
609:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
610:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
611:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
612:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
613:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
614:   }
615:   return(0);
616: }
619: PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
620: {
621:   MatScalar      *v;
622:   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
623:   PetscInt       nz,*vj,k;

626:   for (k=mbs-1; k>=0; k--){
627:     v  = aa + 36*ai[k];
628:     xp = x + k*6;
629:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
630:     nz = ai[k+1] - ai[k];
631:     vj = aj + ai[k];
632:     xp = x + (*vj)*6;
633:     while (nz--) {
634:       /* xk += U(k,:)*x(:) */
635:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
636:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
637:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
638:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
639:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
640:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
641:       vj++;
642:       v += 36; xp = x + (*vj)*6;
643:     }
644:     xp = x + k*6;
645:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
646:   }
647:   return(0);
648: }


653: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
654: {
655:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
656:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
657:   MatScalar      *aa=a->a;
658:   PetscScalar    *x,*b;

662:   VecGetArray(bb,&b);
663:   VecGetArray(xx,&x);
664: 
665:   /* solve U^T * D * y = b by forward substitution */
666:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
667:   ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);

669:   /* solve U*x = y by back substitution */
670:   BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);

672:   VecRestoreArray(bb,&b);
673:   VecRestoreArray(xx,&x);
674:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
675:   return(0);
676: }

680: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
681: {
682:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
683:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
684:   MatScalar      *aa=a->a;
685:   PetscScalar    *x,*b;

689:   VecGetArray(bb,&b);
690:   VecGetArray(xx,&x);
691:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
692:   ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
693:   VecRestoreArray(bb,&b);
694:   VecRestoreArray(xx,&x);
695:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
696:   return(0);
697: }

701: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
702: {
703:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
704:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
705:   MatScalar      *aa=a->a;
706:   PetscScalar    *x,*b;

710:   VecGetArray(bb,&b);
711:   VecGetArray(xx,&x);
712:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
713:   BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);
714:   VecRestoreArray(bb,&b);
715:   VecRestoreArray(xx,&x);
716:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
717:   return(0);
718: }

722: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
723: {
724:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
725:   IS             isrow=a->row;
726:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
728:   const PetscInt *r;
729:   PetscInt       nz,*vj,k,idx;
730:   MatScalar      *aa=a->a,*v,*diag;
731:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

734:   VecGetArray(bb,&b);
735:   VecGetArray(xx,&x);
736:   t  = a->solve_work;
737:   ISGetIndices(isrow,&r);

739:   /* solve U^T * D * y = b by forward substitution */
740:   tp = t;
741:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
742:     idx   = 5*r[k];
743:     tp[0] = b[idx];
744:     tp[1] = b[idx+1];
745:     tp[2] = b[idx+2];
746:     tp[3] = b[idx+3];
747:     tp[4] = b[idx+4];
748:     tp += 5;
749:   }
750: 
751:   for (k=0; k<mbs; k++){
752:     v  = aa + 25*ai[k];
753:     vj = aj + ai[k];
754:     tp = t + k*5;
755:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
756:     nz = ai[k+1] - ai[k];

758:     tp = t + (*vj)*5;
759:     while (nz--) {
760:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
761:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
762:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
763:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
764:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
765:       vj++; tp = t + (*vj)*5;
766:       v += 25;
767:     }

769:     /* xk = inv(Dk)*(Dk*xk) */
770:     diag  = aa+k*25;          /* ptr to inv(Dk) */
771:     tp    = t + k*5;
772:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
773:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
774:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
775:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
776:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
777:   }

779:   /* solve U*x = y by back substitution */
780:   for (k=mbs-1; k>=0; k--){
781:     v  = aa + 25*ai[k];
782:     vj = aj + ai[k];
783:     tp    = t + k*5;
784:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
785:     nz = ai[k+1] - ai[k];
786: 
787:     tp = t + (*vj)*5;
788:     while (nz--) {
789:       /* xk += U(k,:)*x(:) */
790:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
791:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
792:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
793:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
794:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
795:       vj++; tp = t + (*vj)*5;
796:       v += 25;
797:     }
798:     tp    = t + k*5;
799:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
800:     idx   = 5*r[k];
801:     x[idx]     = x0;
802:     x[idx+1]   = x1;
803:     x[idx+2]   = x2;
804:     x[idx+3]   = x3;
805:     x[idx+4]   = x4;
806:   }

808:   ISRestoreIndices(isrow,&r);
809:   VecRestoreArray(bb,&b);
810:   VecRestoreArray(xx,&x);
811:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
812:   return(0);
813: }

817: PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
818: {
819:   MatScalar      *v,*diag;
820:   PetscScalar    *xp,x0,x1,x2,x3,x4;
821:   PetscInt       nz,*vj,k;

824:   for (k=0; k<mbs; k++){
825:     v  = aa + 25*ai[k];
826:     xp = x + k*5;
827:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
828:     nz = ai[k+1] - ai[k];
829:     vj = aj + ai[k];
830:     xp = x + (*vj)*5;
831:     while (nz--) {
832:       /* x(:) += U(k,:)^T*(Dk*xk) */
833:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
834:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
835:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
836:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
837:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
838:       vj++; xp = x + (*vj)*5;
839:       v += 25;
840:     }
841:     /* xk = inv(Dk)*(Dk*xk) */
842:     diag = aa+k*25;          /* ptr to inv(Dk) */
843:     xp   = x + k*5;
844:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
845:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
846:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
847:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
848:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
849:   }
850:   return(0);
851: }

855: PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
856: {
857:   MatScalar      *v;
858:   PetscScalar    *xp,x0,x1,x2,x3,x4;
859:   PetscInt       nz,*vj,k;

862:   for (k=mbs-1; k>=0; k--){
863:     v  = aa + 25*ai[k];
864:     xp = x + k*5;
865:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
866:     nz = ai[k+1] - ai[k];
867:     vj = aj + ai[k];
868:     xp = x + (*vj)*5;
869:     while (nz--) {
870:       /* xk += U(k,:)*x(:) */
871:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
872:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
873:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
874:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
875:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
876:       vj++;
877:       v += 25; xp = x + (*vj)*5;
878:     }
879:     xp = x + k*5;
880:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
881:   }
882:   return(0);
883: }

887: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
888: {
889:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
890:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
891:   MatScalar      *aa=a->a;
892:   PetscScalar    *x,*b;

896:   VecGetArray(bb,&b);
897:   VecGetArray(xx,&x);

899:   /* solve U^T * D * y = b by forward substitution */
900:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
901:   ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);

903:   /* solve U*x = y by back substitution */
904:   BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);

906:   VecRestoreArray(bb,&b);
907:   VecRestoreArray(xx,&x);
908:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
909:   return(0);
910: }

914: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
915: {
916:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
917:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
918:   MatScalar      *aa=a->a;
919:   PetscScalar    *x,*b;

923:   VecGetArray(bb,&b);
924:   VecGetArray(xx,&x);
925:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
926:   ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
927:   VecRestoreArray(bb,&b);
928:   VecRestoreArray(xx,&x);
929:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
930:   return(0);
931: }

935: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
936: {
937:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
938:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
939:   MatScalar      *aa=a->a;
940:   PetscScalar    *x,*b;

944:   VecGetArray(bb,&b);
945:   VecGetArray(xx,&x);
946:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));
947:   BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);
948:   VecRestoreArray(bb,&b);
949:   VecRestoreArray(xx,&x);
950:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
951:   return(0);
952: }

956: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
957: {
958:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
959:   IS             isrow=a->row;
960:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
962:   const PetscInt *r;
963:   PetscInt       nz,*vj,k,idx;
964:   MatScalar      *aa=a->a,*v,*diag;
965:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

968:   VecGetArray(bb,&b);
969:   VecGetArray(xx,&x);
970:   t  = a->solve_work;
971:   ISGetIndices(isrow,&r);

973:   /* solve U^T * D * y = b by forward substitution */
974:   tp = t;
975:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
976:     idx   = 4*r[k];
977:     tp[0] = b[idx];
978:     tp[1] = b[idx+1];
979:     tp[2] = b[idx+2];
980:     tp[3] = b[idx+3];
981:     tp += 4;
982:   }
983: 
984:   for (k=0; k<mbs; k++){
985:     v  = aa + 16*ai[k];
986:     vj = aj + ai[k];
987:     tp = t + k*4;
988:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
989:     nz = ai[k+1] - ai[k];

991:     tp = t + (*vj)*4;
992:     while (nz--) {
993:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
994:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
995:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
996:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
997:       vj++; tp = t + (*vj)*4;
998:       v += 16;
999:     }

1001:     /* xk = inv(Dk)*(Dk*xk) */
1002:     diag  = aa+k*16;          /* ptr to inv(Dk) */
1003:     tp    = t + k*4;
1004:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1005:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1006:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1007:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1008:   }

1010:   /* solve U*x = y by back substitution */
1011:   for (k=mbs-1; k>=0; k--){
1012:     v  = aa + 16*ai[k];
1013:     vj = aj + ai[k];
1014:     tp    = t + k*4;
1015:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1016:     nz = ai[k+1] - ai[k];
1017: 
1018:     tp = t + (*vj)*4;
1019:     while (nz--) {
1020:       /* xk += U(k,:)*x(:) */
1021:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1022:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1023:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1024:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1025:       vj++; tp = t + (*vj)*4;
1026:       v += 16;
1027:     }
1028:     tp    = t + k*4;
1029:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1030:     idx        = 4*r[k];
1031:     x[idx]     = x0;
1032:     x[idx+1]   = x1;
1033:     x[idx+2]   = x2;
1034:     x[idx+3]   = x3;
1035:   }

1037:   ISRestoreIndices(isrow,&r);
1038:   VecRestoreArray(bb,&b);
1039:   VecRestoreArray(xx,&x);
1040:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1041:    return(0);
1042: }

1046: PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1047: {
1048:   MatScalar      *v,*diag;
1049:   PetscScalar    *xp,x0,x1,x2,x3;
1050:   PetscInt       nz,*vj,k;

1053:   for (k=0; k<mbs; k++){
1054:     v  = aa + 16*ai[k];
1055:     xp = x + k*4;
1056:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1057:     nz = ai[k+1] - ai[k];
1058:     vj = aj + ai[k];
1059:     xp = x + (*vj)*4;
1060:     while (nz--) {
1061:       /* x(:) += U(k,:)^T*(Dk*xk) */
1062:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1063:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1064:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1065:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1066:       vj++; xp = x + (*vj)*4;
1067:       v += 16;
1068:     }
1069:     /* xk = inv(Dk)*(Dk*xk) */
1070:     diag = aa+k*16;          /* ptr to inv(Dk) */
1071:     xp   = x + k*4;
1072:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1073:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1074:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1075:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1076:   }
1077:   return(0);
1078: }

1082: PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1083: {
1084:   MatScalar      *v;
1085:   PetscScalar    *xp,x0,x1,x2,x3;
1086:   PetscInt       nz,*vj,k;

1089:   for (k=mbs-1; k>=0; k--){
1090:     v  = aa + 16*ai[k];
1091:     xp = x + k*4;
1092:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1093:     nz = ai[k+1] - ai[k];
1094:     vj = aj + ai[k];
1095:     xp = x + (*vj)*4;
1096:     while (nz--) {
1097:       /* xk += U(k,:)*x(:) */
1098:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1099:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1100:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1101:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1102:       vj++;
1103:       v += 16; xp = x + (*vj)*4;
1104:     }
1105:     xp = x + k*4;
1106:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1107:   }
1108:   return(0);
1109: }

1113: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1114: {
1115:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1116:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1117:   MatScalar      *aa=a->a;
1118:   PetscScalar    *x,*b;

1122:   VecGetArray(bb,&b);
1123:   VecGetArray(xx,&x);

1125:   /* solve U^T * D * y = b by forward substitution */
1126:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1127:   ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);

1129:   /* solve U*x = y by back substitution */
1130:   BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1131:   VecRestoreArray(bb,&b);
1132:   VecRestoreArray(xx,&x);
1133:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1134:   return(0);
1135: }

1139: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1140: {
1141:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1142:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1143:   MatScalar      *aa=a->a;
1144:   PetscScalar    *x,*b;

1148:   VecGetArray(bb,&b);
1149:   VecGetArray(xx,&x);
1150:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
1151:   ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1152:   VecRestoreArray(bb,&b);
1153:   VecRestoreArray(xx,&x);
1154:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1155:   return(0);
1156: }

1160: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1161: {
1162:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1163:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1164:   MatScalar      *aa=a->a;
1165:   PetscScalar    *x,*b;

1169:   VecGetArray(bb,&b);
1170:   VecGetArray(xx,&x);
1171:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));
1172:   BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);
1173:   VecRestoreArray(bb,&b);
1174:   VecRestoreArray(xx,&x);
1175:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1176:   return(0);
1177: }

1181: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1182: {
1183:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1184:   IS             isrow=a->row;
1185:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1187:   const PetscInt *r;
1188:   PetscInt       nz,*vj,k,idx;
1189:   MatScalar      *aa=a->a,*v,*diag;
1190:   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;

1193:   VecGetArray(bb,&b);
1194:   VecGetArray(xx,&x);
1195:   t  = a->solve_work;
1196:   ISGetIndices(isrow,&r);

1198:   /* solve U^T * D * y = b by forward substitution */
1199:   tp = t;
1200:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1201:     idx   = 3*r[k];
1202:     tp[0] = b[idx];
1203:     tp[1] = b[idx+1];
1204:     tp[2] = b[idx+2];
1205:     tp += 3;
1206:   }
1207: 
1208:   for (k=0; k<mbs; k++){
1209:     v  = aa + 9*ai[k];
1210:     vj = aj + ai[k];
1211:     tp = t + k*3;
1212:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1213:     nz = ai[k+1] - ai[k];

1215:     tp = t + (*vj)*3;
1216:     while (nz--) {
1217:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1218:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1219:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1220:       vj++; tp = t + (*vj)*3;
1221:       v += 9;
1222:     }

1224:     /* xk = inv(Dk)*(Dk*xk) */
1225:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1226:     tp    = t + k*3;
1227:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1228:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1229:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1230:   }

1232:   /* solve U*x = y by back substitution */
1233:   for (k=mbs-1; k>=0; k--){
1234:     v  = aa + 9*ai[k];
1235:     vj = aj + ai[k];
1236:     tp    = t + k*3;
1237:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1238:     nz = ai[k+1] - ai[k];
1239: 
1240:     tp = t + (*vj)*3;
1241:     while (nz--) {
1242:       /* xk += U(k,:)*x(:) */
1243:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1244:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1245:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1246:       vj++; tp = t + (*vj)*3;
1247:       v += 9;
1248:     }
1249:     tp    = t + k*3;
1250:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1251:     idx      = 3*r[k];
1252:     x[idx]   = x0;
1253:     x[idx+1] = x1;
1254:     x[idx+2] = x2;
1255:   }

1257:   ISRestoreIndices(isrow,&r);
1258:   VecRestoreArray(bb,&b);
1259:   VecRestoreArray(xx,&x);
1260:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1261:   return(0);
1262: }

1266: PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1267: {
1268:   MatScalar      *v,*diag;
1269:   PetscScalar    *xp,x0,x1,x2;
1270:   PetscInt       nz,*vj,k;

1273:   for (k=0; k<mbs; k++){
1274:     v  = aa + 9*ai[k];
1275:     xp = x + k*3;
1276:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1277:     nz = ai[k+1] - ai[k];
1278:     vj = aj + ai[k];
1279:     xp = x + (*vj)*3;
1280:     while (nz--) {
1281:       /* x(:) += U(k,:)^T*(Dk*xk) */
1282:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1283:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1284:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1285:       vj++; xp = x + (*vj)*3;
1286:       v += 9;
1287:     }
1288:     /* xk = inv(Dk)*(Dk*xk) */
1289:     diag = aa+k*9;          /* ptr to inv(Dk) */
1290:     xp   = x + k*3;
1291:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1292:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1293:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1294:   }
1295:   return(0);
1296: }

1300: PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1301: {
1302:   MatScalar      *v;
1303:   PetscScalar    *xp,x0,x1,x2;
1304:   PetscInt       nz,*vj,k;

1307:   for (k=mbs-1; k>=0; k--){
1308:     v  = aa + 9*ai[k];
1309:     xp = x + k*3;
1310:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1311:     nz = ai[k+1] - ai[k];
1312:     vj = aj + ai[k];
1313:     xp = x + (*vj)*3;
1314:     while (nz--) {
1315:       /* xk += U(k,:)*x(:) */
1316:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1317:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1318:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1319:       vj++;
1320:       v += 9; xp = x + (*vj)*3;
1321:     }
1322:     xp = x + k*3;
1323:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1324:   }
1325:   return(0);
1326: }

1330: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1331: {
1332:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1333:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1334:   MatScalar      *aa=a->a;
1335:   PetscScalar    *x,*b;
1337: 
1339:   VecGetArray(bb,&b);
1340:   VecGetArray(xx,&x);

1342:   /* solve U^T * D * y = b by forward substitution */
1343:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1344:   ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);

1346:   /* solve U*x = y by back substitution */
1347:   BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);

1349:   VecRestoreArray(bb,&b);
1350:   VecRestoreArray(xx,&x);
1351:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1352:   return(0);
1353: }

1357: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1358: {
1359:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1360:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1361:   MatScalar      *aa=a->a;
1362:   PetscScalar    *x,*b;

1366:   VecGetArray(bb,&b);
1367:   VecGetArray(xx,&x);
1368:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1369:   ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1370:   VecRestoreArray(bb,&b);
1371:   VecRestoreArray(xx,&x);
1372:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1373:   return(0);
1374: }

1378: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1379: {
1380:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1381:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1382:   MatScalar      *aa=a->a;
1383:   PetscScalar    *x,*b;

1387:   VecGetArray(bb,&b);
1388:   VecGetArray(xx,&x);
1389:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
1390:   BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);
1391:   VecRestoreArray(bb,&b);
1392:   VecRestoreArray(xx,&x);
1393:   PetscLogFlops(2.0*bs2*(a->nz-mbs));
1394:   return(0);
1395: }

1399: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1400: {
1401:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1402:   IS             isrow=a->row;
1403:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1405:   const PetscInt *r;
1406:   PetscInt       nz,*vj,k,k2,idx;
1407:   MatScalar      *aa=a->a,*v,*diag;
1408:   PetscScalar    *x,*b,x0,x1,*t;

1411:   VecGetArray(bb,&b);
1412:   VecGetArray(xx,&x);
1413:   t  = a->solve_work;
1414:   ISGetIndices(isrow,&r);

1416:   /* solve U^T * D * y = perm(b) by forward substitution */
1417:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1418:     idx = 2*r[k];
1419:     t[k*2]   = b[idx];
1420:     t[k*2+1] = b[idx+1];
1421:   }
1422:   for (k=0; k<mbs; k++){
1423:     v  = aa + 4*ai[k];
1424:     vj = aj + ai[k];
1425:     k2 = k*2;
1426:     x0 = t[k2]; x1 = t[k2+1];
1427:     nz = ai[k+1] - ai[k];
1428:     while (nz--) {
1429:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1430:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1431:       vj++; v += 4;
1432:     }
1433:     diag = aa+k*4;  /* ptr to inv(Dk) */
1434:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1435:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1436:   }

1438:   /* solve U*x = y by back substitution */
1439:   for (k=mbs-1; k>=0; k--){
1440:     v  = aa + 4*ai[k];
1441:     vj = aj + ai[k];
1442:     k2 = k*2;
1443:     x0 = t[k2]; x1 = t[k2+1];
1444:     nz = ai[k+1] - ai[k];
1445:     while (nz--) {
1446:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1447:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1448:       vj++; v += 4;
1449:     }
1450:     t[k2]    = x0;
1451:     t[k2+1]  = x1;
1452:     idx      = 2*r[k];
1453:     x[idx]   = x0;
1454:     x[idx+1] = x1;
1455:   }

1457:   ISRestoreIndices(isrow,&r);
1458:   VecRestoreArray(bb,&b);
1459:   VecRestoreArray(xx,&x);
1460:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1461:   return(0);
1462: }

1466: PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1467: {
1468:   MatScalar      *v,*diag;
1469:   PetscScalar    x0,x1;
1470:   PetscInt       nz,*vj,k,k2;

1473:   for (k=0; k<mbs; k++){
1474:     v  = aa + 4*ai[k];
1475:     vj = aj + ai[k];
1476:     k2 = k*2;
1477:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1478:     nz = ai[k+1] - ai[k];
1479: 
1480:     while (nz--) {
1481:       /* x(:) += U(k,:)^T*(Dk*xk) */
1482:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1483:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1484:       vj++; v += 4;
1485:     }
1486:     /* xk = inv(Dk)*(Dk*xk) */
1487:     diag = aa+k*4;          /* ptr to inv(Dk) */
1488:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1489:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1490:   }
1491:   return(0);
1492: }

1496: PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1497: {
1498:   MatScalar      *v;
1499:   PetscScalar    x0,x1;
1500:   PetscInt       nz,*vj,k,k2;

1503:   for (k=mbs-1; k>=0; k--){
1504:     v  = aa + 4*ai[k];
1505:     vj = aj + ai[k];
1506:     k2 = k*2;
1507:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1508:     nz = ai[k+1] - ai[k];
1509:     while (nz--) {
1510:       /* xk += U(k,:)*x(:) */
1511:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1512:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1513:       vj++; v += 4;
1514:     }
1515:     x[k2]     = x0;
1516:     x[k2+1]   = x1;
1517:   }
1518:   return(0);
1519: }

1523: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1524: {
1525:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1526:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1527:   MatScalar      *aa=a->a;
1528:   PetscScalar    *x,*b;

1532:   VecGetArray(bb,&b);
1533:   VecGetArray(xx,&x);

1535:   /* solve U^T * D * y = b by forward substitution */
1536:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1537:   ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);

1539:   /* solve U*x = y by back substitution */
1540:   BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);

1542:   VecRestoreArray(bb,&b);
1543:   VecRestoreArray(xx,&x);
1544:   PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);
1545:   return(0);
1546: }

1550: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1551: {
1552:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1553:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1554:   MatScalar      *aa=a->a;
1555:   PetscScalar    *x,*b;

1559:   VecGetArray(bb,&b);
1560:   VecGetArray(xx,&x);
1561:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1562:   ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1563:   VecRestoreArray(bb,&b);
1564:   VecRestoreArray(xx,&x);
1565:   PetscLogFlops(2.0*bs2*a->nz - bs*mbs);
1566:   return(0);
1567: }

1571: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1572: {
1573:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1574:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1575:   MatScalar      *aa=a->a;
1576:   PetscScalar    *x,*b;

1580:   VecGetArray(bb,&b);
1581:   VecGetArray(xx,&x);
1582:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1583:   BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);
1584:   VecRestoreArray(bb,&b);
1585:   VecRestoreArray(xx,&x);
1586:   PetscLogFlops(2.0*bs2*(a->nz - mbs));
1587:   return(0);
1588: }

1592: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1593: {
1594:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1595:   IS                isrow=a->row;
1596:   PetscErrorCode    ierr;
1597:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1598:   const MatScalar   *aa=a->a,*v;
1599:   const PetscScalar *b;
1600:   PetscScalar       *x,xk,*t;
1601:   PetscInt          nz,k,j;

1604:   VecGetArray(bb,(PetscScalar**)&b);
1605:   VecGetArray(xx,&x);
1606:   t    = a->solve_work;
1607:   ISGetIndices(isrow,&rp);
1608: 
1609:   /* solve U^T*D*y = perm(b) by forward substitution */
1610:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1611:   for (k=0; k<mbs; k++){
1612:     v  = aa + ai[k];
1613:     vj = aj + ai[k];
1614:     xk = t[k];
1615:     nz = ai[k+1] - ai[k] - 1;
1616:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1617:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1618:   }

1620:   /* solve U*perm(x) = y by back substitution */
1621:   for (k=mbs-1; k>=0; k--){
1622:     v  = aa + adiag[k] - 1;
1623:     vj = aj + adiag[k] - 1;
1624:     nz = ai[k+1] - ai[k] - 1;
1625:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1626:     x[rp[k]] = t[k];
1627:   }

1629:   ISRestoreIndices(isrow,&rp);
1630:   VecRestoreArray(bb,(PetscScalar**)&b);
1631:   VecRestoreArray(xx,&x);
1632:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1633:   return(0);
1634: }

1638: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1639: {
1640:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1641:   IS              isrow=a->row;
1642:   PetscErrorCode  ierr;
1643:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1644:   const MatScalar *aa=a->a,*v;
1645:   PetscScalar     *x,*b,xk,*t;
1646:   PetscInt        nz,k;

1649:   VecGetArray(bb,&b);
1650:   VecGetArray(xx,&x);
1651:   t    = a->solve_work;
1652:   ISGetIndices(isrow,&rp);
1653: 
1654:   /* solve U^T*D*y = perm(b) by forward substitution */
1655:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1656:   for (k=0; k<mbs; k++){
1657:     v  = aa + ai[k] + 1;
1658:     vj = aj + ai[k] + 1;
1659:     xk = t[k];
1660:     nz = ai[k+1] - ai[k] - 1;
1661:     while (nz--) t[*vj++] += (*v++) * xk;
1662:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1663:   }

1665:   /* solve U*perm(x) = y by back substitution */
1666:   for (k=mbs-1; k>=0; k--){
1667:     v  = aa + ai[k] + 1;
1668:     vj = aj + ai[k] + 1;
1669:     nz = ai[k+1] - ai[k] - 1;
1670:     while (nz--) t[k] += (*v++) * t[*vj++];
1671:     x[rp[k]] = t[k];
1672:   }

1674:   ISRestoreIndices(isrow,&rp);
1675:   VecRestoreArray(bb,&b);
1676:   VecRestoreArray(xx,&x);
1677:   PetscLogFlops(4.0*a->nz - 3*mbs);
1678:   return(0);
1679: }

1683: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1684: {
1685:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1686:   IS              isrow=a->row;
1687:   PetscErrorCode  ierr;
1688:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1689:   const MatScalar *aa=a->a,*v;
1690:   PetscReal       diagk;
1691:   PetscScalar     *x,*b,xk;
1692:   PetscInt        nz,k;

1695:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1696:   VecGetArray(bb,&b);
1697:   VecGetArray(xx,&x);
1698:   ISGetIndices(isrow,&rp);
1699: 
1700:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1701:   for (k=0; k<mbs; k++){
1702:     v  = aa + ai[k];
1703:     vj = aj + ai[k];
1704:     xk = x[k];
1705:     nz = ai[k+1] - ai[k] - 1;
1706:     while (nz--) x[*vj++] += (*v++) * xk;

1708:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1709:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1710:     x[k] = xk*sqrt(diagk);
1711:   }
1712:   ISRestoreIndices(isrow,&rp);
1713:   VecRestoreArray(bb,&b);
1714:   VecRestoreArray(xx,&x);
1715:   PetscLogFlops(2.0*a->nz - mbs);
1716:   return(0);
1717: }

1721: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1722: {
1723:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1724:   IS              isrow=a->row;
1725:   PetscErrorCode  ierr;
1726:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1727:   const MatScalar *aa=a->a,*v;
1728:   PetscReal       diagk;
1729:   PetscScalar     *x,*b,xk;
1730:   PetscInt        nz,k;

1733:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1734:   VecGetArray(bb,&b);
1735:   VecGetArray(xx,&x);
1736:   ISGetIndices(isrow,&rp);
1737: 
1738:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1739:   for (k=0; k<mbs; k++){
1740:     v  = aa + ai[k] + 1;
1741:     vj = aj + ai[k] + 1;
1742:     xk = x[k];
1743:     nz = ai[k+1] - ai[k] - 1;
1744:     while (nz--) x[*vj++] += (*v++) * xk;

1746:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1747:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1748:     x[k] = xk*sqrt(diagk);
1749:   }
1750:   ISRestoreIndices(isrow,&rp);
1751:   VecRestoreArray(bb,&b);
1752:   VecRestoreArray(xx,&x);
1753:   PetscLogFlops(2.0*a->nz - mbs);
1754:   return(0);
1755: }

1759: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1760: {
1761:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1762:   IS              isrow=a->row;
1763:   PetscErrorCode  ierr;
1764:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1765:   const MatScalar *aa=a->a,*v;
1766:   PetscReal       diagk;
1767:   PetscScalar     *x,*b,*t;
1768:   PetscInt        nz,k;

1771:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1772:   VecGetArray(bb,&b);
1773:   VecGetArray(xx,&x);
1774:   t    = a->solve_work;
1775:   ISGetIndices(isrow,&rp);

1777:   for (k=mbs-1; k>=0; k--){
1778:     v  = aa + ai[k];
1779:     vj = aj + ai[k];
1780:     diagk = PetscRealPart(aa[adiag[k]]);
1781:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1782:     t[k] = b[k] * sqrt(diagk);
1783:     nz = ai[k+1] - ai[k] - 1;
1784:     while (nz--) t[k] += (*v++) * t[*vj++];
1785:     x[rp[k]] = t[k];
1786:   }
1787:   ISRestoreIndices(isrow,&rp);
1788:   VecRestoreArray(bb,&b);
1789:   VecRestoreArray(xx,&x);
1790:   PetscLogFlops(2.0*a->nz - mbs);
1791:   return(0);
1792: }

1796: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1797: {
1798:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1799:   IS              isrow=a->row;
1800:   PetscErrorCode  ierr;
1801:   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1802:   const MatScalar *aa=a->a,*v;
1803:   PetscReal       diagk;
1804:   PetscScalar     *x,*b,*t;
1805:   PetscInt        nz,k;

1808:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1809:   VecGetArray(bb,&b);
1810:   VecGetArray(xx,&x);
1811:   t    = a->solve_work;
1812:   ISGetIndices(isrow,&rp);

1814:   for (k=mbs-1; k>=0; k--){
1815:     v  = aa + ai[k] + 1;
1816:     vj = aj + ai[k] + 1;
1817:     diagk = PetscRealPart(aa[ai[k]]);
1818:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1819:     t[k] = b[k] * sqrt(diagk);
1820:     nz = ai[k+1] - ai[k] - 1;
1821:     while (nz--) t[k] += (*v++) * t[*vj++];
1822:     x[rp[k]] = t[k];
1823:   }
1824:   ISRestoreIndices(isrow,&rp);
1825:   VecRestoreArray(bb,&b);
1826:   VecRestoreArray(xx,&x);
1827:   PetscLogFlops(2.0*a->nz - mbs);
1828:   return(0);
1829: }

1833: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1834: {
1835:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1839:   if (A->rmap->bs == 1) {
1840:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1841:   } else {
1842:     IS              isrow=a->row;
1843:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1844:     const MatScalar *aa=a->a,*v;
1845:     PetscScalar     *x,*b,*t;
1846:     PetscInt        nz,k,n,i,j;
1847:     if (bb->n > a->solves_work_n) {
1848:       PetscFree(a->solves_work);
1849:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1850:       a->solves_work_n = bb->n;
1851:     }
1852:     n    = bb->n;
1853:     VecGetArray(bb->v,&b);
1854:     VecGetArray(xx->v,&x);
1855:     t    = a->solves_work;

1857:     ISGetIndices(isrow,&rp);
1858: 
1859:     /* solve U^T*D*y = perm(b) by forward substitution */
1860:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1861:     for (k=0; k<mbs; k++){
1862:       v  = aa + ai[k];
1863:       vj = aj + ai[k];
1864:       nz = ai[k+1] - ai[k] - 1;
1865:       for (j=0; j<nz; j++){
1866:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1867:         v++;vj++;
1868:       }
1869:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1870:     }
1871: 
1872:     /* solve U*perm(x) = y by back substitution */
1873:     for (k=mbs-1; k>=0; k--){
1874:       v  = aa + ai[k] - 1;
1875:       vj = aj + ai[k] - 1;
1876:       nz = ai[k+1] - ai[k] - 1;
1877:       for (j=0; j<nz; j++){
1878:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1879:         v++;vj++;
1880:       }
1881:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1882:     }

1884:     ISRestoreIndices(isrow,&rp);
1885:     VecRestoreArray(bb->v,&b);
1886:     VecRestoreArray(xx->v,&x);
1887:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1888:   }
1889:   return(0);
1890: }

/*
   MatSolves_SeqSBAIJ_1_inplace - Solves for a whole group of right-hand sides
   with the factor stored in A.

   Input:
     A  - matrix whose data pointer is read as a Mat_SeqSBAIJ factor
     bb - group of right-hand sides; bb->n is their count and bb->v holds the values
   Output:
     xx - group of solutions, written through xx->v

   When A->rmap->bs is 1 the call is forwarded to MatSolve_SeqSBAIJ_1_inplace()
   on bb->v and xx->v.  Otherwise all bb->n systems are swept together:
   right-hand side i is read starting at offset i*mbs of the bb->v array, and the
   work array a->solves_work keeps the n values that belong to one row next to
   each other (t[n*k+i] is row k of system i).

   Returns 0.
*/
1894: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1895: {
1896:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1900:   if (A->rmap->bs == 1) {
1901:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1902:   } else {
1903:     IS              isrow=a->row;
1904:     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1905:     const MatScalar *aa=a->a,*v;
1906:     PetscScalar     *x,*b,*t;
1907:     PetscInt        nz,k,n,i;
            /* the work array is kept between calls and only replaced when more
               right-hand sides are requested than it was last sized for */
1908:     if (bb->n > a->solves_work_n) {
1909:       PetscFree(a->solves_work);
1910:       PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);
1911:       a->solves_work_n = bb->n;
1912:     }
1913:     n    = bb->n;
1914:     VecGetArray(bb->v,&b);
1915:     VecGetArray(xx->v,&x);
1916:     t    = a->solves_work;

1918:     ISGetIndices(isrow,&rp);
1919: 
1920:     /* solve U^T*D*y = perm(b) by forward substitution */
1921:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1922:     for (k=0; k<mbs; k++){
              /* every stored entry ai[k] .. ai[k+1]-1 of row k is applied as a
                 coupling to row *vj; the diagonal factor is read from aa[k] instead */
1923:       v  = aa + ai[k];
1924:       vj = aj + ai[k];
1925:       nz = ai[k+1] - ai[k];
1926:       while (nz--) {
1927:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1928:         v++;vj++;
1929:       }
1930:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1931:     }
1932: 
1933:     /* solve U*perm(x) = y by back substitution */
1934:     for (k=mbs-1; k>=0; k--){
1935:       v  = aa + ai[k];
1936:       vj = aj + ai[k];
1937:       nz = ai[k+1] - ai[k];
1938:       while (nz--) {
1939:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1940:         v++;vj++;
1941:       }
              /* row k is final: scatter it back through the permutation, one
                 value per system, using the same i*mbs stride as the input */
1942:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1943:     }

1945:     ISRestoreIndices(isrow,&rp);
1946:     VecRestoreArray(bb->v,&b);
1947:     VecRestoreArray(xx->v,&x);
1948:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1949:   }
1950:   return(0);
1951: }

/*
   MatSolve_SeqSBAIJ_1_NaturalOrdering - Solve with the factor stored in A for
   block size 1 and no row permutation.

   Input:
     A  - factor, read as Mat_SeqSBAIJ; a->diag[i] locates the diagonal of row i
     bb - right-hand side
   Output:
     xx - solution; it is first overwritten with a copy of the mbs entries of bb
          and then updated in place

   Storage convention used here: the diagonal is the LAST entry of each row
   (index ai[i+1]-1, also reachable as a->diag[i]) and holds 1/D(i); the entries
   before it are the off-diagonal part of the row.

   Returns 0.
*/
1955: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1956: {
1957:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1958:   PetscErrorCode    ierr;
1959:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1960:   const MatScalar   *aa=a->a,*v;
1961:   const PetscScalar *b;
1962:   PetscScalar       *x,xi;
1963:   PetscInt          nz,i,j;

        /* b is declared const here, hence the cast when handing it to VecGetArray */
1966:   VecGetArray(bb,(PetscScalar**)&b);
1967:   VecGetArray(xx,&x);
1968: 
1969:   /* solve U^T*D*y = b by forward substitution */
1970:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1971:   for (i=0; i<mbs; i++){
1972:     v  = aa + ai[i];
1973:     vj = aj + ai[i];
1974:     xi = x[i];
1975:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1976:     for (j=0; j<nz; j++){
1977:       x[vj[j]] += v[j]* xi;
1978:     }
1979:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1980:   }

1982:   /* solve U*x = y by backward substitution */
        /* the loop starts at row mbs-2, so x[mbs-1] keeps the value produced by
           the forward sweep */
1983:   for (i=mbs-2; i>=0; i--){
1984:     xi = x[i];
1985:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1986:     vj = aj + adiag[i] - 1;
1987:     nz = ai[i+1] - ai[i] - 1;
          /* negative offsets walk the row backwards, from the entry just before
             the diagonal towards the start of the row */
1988:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1989:     x[i] = xi;
1990:   }
1991: 
1992:   VecRestoreArray(bb,(PetscScalar**)&b);
1993:   VecRestoreArray(xx,&x);
1994:   PetscLogFlops(4.0*a->nz - 3*mbs);
1995:   return(0);
1996: }

/*
   MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace - Solve with the factor stored in
   A for block size 1 and no row permutation, "inplace" storage layout.

   Input:
     A  - factor, read as Mat_SeqSBAIJ
     bb - right-hand side
   Output:
     xx - solution; it is first overwritten with a copy of the mbs entries of bb
          and then updated in place

   Storage convention used here: the diagonal is the FIRST entry of each row
   (aa[ai[k]]) and holds 1/D(k); the off-diagonal entries follow from ai[k]+1.

   Returns 0.
*/
2000: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2001: {
2002:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2004:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2005:   MatScalar      *aa=a->a,*v;
2006:   PetscScalar    *x,*b,xk;
2007:   PetscInt       nz,*vj,k;

2010:   VecGetArray(bb,&b);
2011:   VecGetArray(xx,&x);
2012: 
2013:   /* solve U^T*D*y = b by forward substitution */
2014:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2015:   for (k=0; k<mbs; k++){
          /* +1 skips the leading diagonal entry of the row */
2016:     v  = aa + ai[k] + 1;
2017:     vj = aj + ai[k] + 1;
2018:     xk = x[k];
2019:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2020:     while (nz--) x[*vj++] += (*v++) * xk;
2021:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2022:   }

2024:   /* solve U*x = y by back substitution */
        /* the loop starts at row mbs-2, so x[mbs-1] keeps the value produced by
           the forward sweep */
2025:   for (k=mbs-2; k>=0; k--){
2026:     v  = aa + ai[k] + 1;
2027:     vj = aj + ai[k] + 1;
2028:     xk = x[k];
2029:     nz = ai[k+1] - ai[k] - 1;
2030:     while (nz--) {
2031:       xk += (*v++) * x[*vj++];
2032:     }
2033:     x[k] = xk;
2034:   }

2036:   VecRestoreArray(bb,&b);
2037:   VecRestoreArray(xx,&x);
2038:   PetscLogFlops(4.0*a->nz - 3*mbs);
2039:   return(0);
2040: }

/*
   MatForwardSolve_SeqSBAIJ_1_NaturalOrdering - Forward-substitution half of the
   solve for block size 1 with no row permutation.

   Input:
     A  - factor, read as Mat_SeqSBAIJ; the entry at a->diag[k] is used as 1/D(k)
     bb - right-hand side
   Output:
     xx - result of the sweep; starts as a copy of the mbs entries of bb and each
          row is finally scaled by sqrt(1/D(k))

   Raises PETSC_ERR_SUP if a stored diagonal has a nonzero imaginary part or a
   negative real part, since its square root is required.  Returns 0 otherwise.
*/
2044: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2045: {
2046:   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
2047:   PetscErrorCode  ierr;
2048:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2049:   const MatScalar *aa=a->a,*v;
2050:   PetscReal       diagk;
2051:   PetscScalar     *x,*b;
2052:   PetscInt        nz,*vj,k;

2055:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2056:   VecGetArray(bb,&b);
2057:   VecGetArray(xx,&x);
2058:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2059:   for (k=0; k<mbs; k++){
2060:     v  = aa + ai[k];
2061:     vj = aj + ai[k];
2062:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
          /* push the current x[k] into the later rows coupled to row k */
2063:     while (nz--) x[*vj++] += (*v++) * x[k];
2064:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
          /* NOTE(review): this error exit happens before the VecRestoreArray calls below - confirm that is intended */
2065:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2066:     x[k] *= sqrt(diagk);
2067:   }
2068:   VecRestoreArray(bb,&b);
2069:   VecRestoreArray(xx,&x);
2070:   PetscLogFlops(2.0*a->nz - mbs);
2071:   return(0);
2072: }

/*
   MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace - Forward-substitution half
   of the solve for block size 1 with no row permutation, "inplace" layout.

   Input:
     A  - factor, read as Mat_SeqSBAIJ; the first entry of row k (aa[ai[k]]) is
          used as 1/D(k) and the off-diagonal entries follow it
     bb - right-hand side
   Output:
     xx - result of the sweep; starts as a copy of the mbs entries of bb and each
          row is finally scaled by sqrt(1/D(k))

   Raises PETSC_ERR_SUP if a stored diagonal has a nonzero imaginary part or a
   negative real part, since its square root is required.  Returns 0 otherwise.
*/
2076: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2077: {
2078:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2080:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2081:   MatScalar      *aa=a->a,*v;
2082:   PetscReal      diagk;
2083:   PetscScalar    *x,*b;
2084:   PetscInt       nz,*vj,k;

2087:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2088:   VecGetArray(bb,&b);
2089:   VecGetArray(xx,&x);
2090:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
2091:   for (k=0; k<mbs; k++){
          /* +1 skips the leading diagonal entry of the row */
2092:     v  = aa + ai[k] + 1;
2093:     vj = aj + ai[k] + 1;
2094:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
          /* push the current x[k] into the later rows coupled to row k */
2095:     while (nz--) x[*vj++] += (*v++) * x[k];
2096:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
          /* NOTE(review): this error exit happens before the VecRestoreArray calls below - confirm that is intended */
2097:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2098:     x[k] *= sqrt(diagk);
2099:   }
2100:   VecRestoreArray(bb,&b);
2101:   VecRestoreArray(xx,&x);
2102:   PetscLogFlops(2.0*a->nz - mbs);
2103:   return(0);
2104: }

/*
   MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering - Back-substitution half of the
   solve for block size 1 with no row permutation.

   Input:
     A  - factor, read as Mat_SeqSBAIJ; the entry at a->diag[k] is used as 1/D(k)
     bb - right-hand side
   Output:
     xx - result of the sweep; unlike the forward solve, bb is not copied into xx
          first - each x[k] is built directly from b[k] and the rows already done

   Raises PETSC_ERR_SUP if a stored diagonal has a nonzero imaginary part or a
   negative real part, since its square root is required.  Returns 0 otherwise.
*/
2108: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2109: {
2110:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2112:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2113:   MatScalar      *aa=a->a,*v;
2114:   PetscReal      diagk;
2115:   PetscScalar    *x,*b;
2116:   PetscInt       nz,*vj,k;

2119:   /* solve D^(1/2)*U*x = b by back substitution */
2120:   VecGetArray(bb,&b);
2121:   VecGetArray(xx,&x);

        /* rows are visited last to first, so every x[*vj] read below has already been computed */
2123:   for (k=mbs-1; k>=0; k--){
2124:     v  = aa + ai[k];
2125:     vj = aj + ai[k];
2126:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
          /* NOTE(review): this error exit happens before the VecRestoreArray calls below - confirm that is intended */
2127:     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2128:     x[k] = sqrt(diagk)*b[k];
2129:     nz = ai[k+1] - ai[k] - 1;
2130:     while (nz--) x[k] += (*v++) * x[*vj++];
2131:   }
2132:   VecRestoreArray(bb,&b);
2133:   VecRestoreArray(xx,&x);
2134:   PetscLogFlops(2.0*a->nz - mbs);
2135:   return(0);
2136: }

/*
   MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace - Back-substitution half
   of the solve for block size 1 with no row permutation, "inplace" layout.

   Input:
     A  - factor, read as Mat_SeqSBAIJ; the first entry of row k (aa[ai[k]]) is
          used as 1/D(k) and the off-diagonal entries follow it
     bb - right-hand side
   Output:
     xx - result of the sweep; each x[k] is built directly from b[k] and the rows
          already done

   Raises PETSC_ERR_SUP if a stored diagonal has a nonzero imaginary part or a
   negative real part, since its square root is required.  Returns 0 otherwise.
*/
2140: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2141: {
2142:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2144:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2145:   MatScalar      *aa=a->a,*v;
2146:   PetscReal      diagk;
2147:   PetscScalar    *x,*b;
2148:   PetscInt       nz,*vj,k;

2151:   /* solve D^(1/2)*U*x = b by back substitution */
2152:   VecGetArray(bb,&b);
2153:   VecGetArray(xx,&x);

        /* rows are visited last to first, so every x[*vj] read below has already been computed */
2155:   for (k=mbs-1; k>=0; k--){
          /* +1 skips the leading diagonal entry of the row */
2156:     v  = aa + ai[k] + 1;
2157:     vj = aj + ai[k] + 1;
2158:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
          /* NOTE(review): this error exit happens before the VecRestoreArray calls below - confirm that is intended */
2159:     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2160:     x[k] = sqrt(diagk)*b[k];
2161:     nz = ai[k+1] - ai[k] - 1;
2162:     while (nz--) x[k] += (*v++) * x[*vj++];
2163:   }
2164:   VecRestoreArray(bb,&b);
2165:   VecRestoreArray(xx,&x);
2166:   PetscLogFlops(2.0*a->nz - mbs);
2167:   return(0);
2168: }

2170: /* Use Modified Sparse Row (MSR) storage for u and ju; see Saad, p. 85 */
/*
   MatICCFactorSymbolic_SeqSBAIJ_MSR - Level-based symbolic incomplete Cholesky
   that produces the factor structure in Modified Sparse Row form.

   Input:
     B    - matrix that receives the factor structure
     A    - matrix to factor; read as Mat_SeqSBAIJ
     perm - ordering; must be the identity, otherwise PETSC_ERR_SUP is raised
     info - supplies fill (initial size estimate) and levels (fill-level limit)

   Layout produced: iu[0] = mbs+1, and the column indices of the strictly upper
   part of row k are ju[iu[k] .. iu[k+1]-1].  The first mbs+1 slots of ju are not
   used for column indices, which is why the level array lev is addressed with
   the offset `shift` (lev[j-shift] belongs to ju[j]).

   Ownership: iu and ju are handed to B (b->i, b->j); lev and the linked-list
   work arrays are freed before returning.  perm is referenced twice, once for
   b->row and once for b->icol.

   Raises PETSC_ERR_SUP for a non-identity perm and PETSC_ERR_PLIB when a row of
   A contains the same column twice.  Returns 0 otherwise.
*/
2173: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2174: {
2175:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2177:   const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2178:   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2179:   PetscInt       m,reallocs = 0,*levtmp;
2180:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2181:   PetscInt       incrlev,*lev,shift,prow,nz;
2182:   PetscReal      f = info->fill,levels = info->levels;
2183:   PetscTruth     perm_identity;

2186:   /* check whether perm is the identity mapping */
2187:   ISIdentity(perm,&perm_identity);

2189:   if (perm_identity){
2190:     a->permute = PETSC_FALSE;
2191:     ai = a->i; aj = a->j;
2192:   } else { /*  non-trivial permutation */
          /* reordering is rejected outright; the statements that used to follow
             this error (reorder A, switch to a->inew/a->jnew) could never run
             and have been dropped */
2193:     SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2197:   }
2198: 
2199:   /* initialization */
2200:   ISGetIndices(perm,&rip);
2201:   umax  = (PetscInt)(f*ai[mbs] + 1);
2202:   PetscMalloc(umax*sizeof(PetscInt),&lev);
        /* ju additionally carries mbs+1 leading slots in front of the column indices */
2203:   umax += mbs + 1;
2204:   shift = mbs + 1;
2205:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
2206:   PetscMalloc(umax*sizeof(PetscInt),&ju);
2207:   iu[0] = mbs + 1;
2208:   juidx = mbs + 1;
2209:   /* prowl: linked list for pivot row */
2210:   PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);
2211:   /* q: linked list for col index */
2212: 
2213:   for (i=0; i<mbs; i++){
          /* mbs acts as the end-of-list marker in prowl */
2214:     prowl[i] = mbs;
2215:     q[i] = 0;
2216:   }

2218:   /* for each row k */
2219:   for (k=0; k<mbs; k++){
2220:     nzk  = 0;
          /* start an empty sorted column list for row k: k -> mbs (end marker) */
2221:     q[k] = mbs;
2222:     /* copy current row into linked list */
2223:     nz = ai[rip[k]+1] - ai[rip[k]];
2224:     j = ai[rip[k]];
2225:     while (nz--){
2226:       vj = rip[aj[j++]];
            /* only strictly upper entries are recorded */
2227:       if (vj > k){
              /* walk the sorted list until the slot for vj is found (m precedes it) */
2228:         qm = k;
2229:         do {
2230:           m  = qm; qm = q[m];
2231:         } while(qm < vj);
2232:         if (qm == vj) {
2233:           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
2234:         }
2235:         nzk++;
2236:         q[m]       = vj;
2237:         q[vj]      = qm;
2238:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2239:       }
2240:     }

2242:     /* modify nonzero structure of k-th row by computing fill-in
2243:        for each row prow to be merged in */
2244:     prow = k;
2245:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2246: 
2247:     while (prow < k){
2248:       /* merge row prow into k-th row */
            /* +1 skips the first stored column of row prow */
2249:       jmin = iu[prow] + 1;
2250:       jmax = iu[prow+1];
2251:       qm = k;
2252:       for (j=jmin; j<jmax; j++){
              /* an entry reached through row prow is one level deeper than the
                 entry it comes from; entries beyond the level limit are skipped */
2253:         incrlev = lev[j-shift] + 1;
2254:         if (incrlev > levels) continue;
2255:         vj      = ju[j];
2256:         do {
2257:           m = qm; qm = q[m];
2258:         } while (qm < vj);
2259:         if (qm != vj){      /* a fill */
2260:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2261:           levtmp[vj] = incrlev;
2262:         } else {
                /* already present: keep the smaller level */
2263:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2264:         }
2265:       }
2266:       prow = prowl[prow]; /* next pivot row */
2267:     }
2268: 
2269:     /* add k to row list for first nonzero element in k-th row */
2270:     if (nzk > 1){
2271:       i = q[k]; /* col value of first nonzero element in k_th row of U */
2272:       prowl[k] = prowl[i]; prowl[i] = k;
2273:     }
2274:     iu[k+1] = iu[k] + nzk;

2276:     /* allocate more space to ju and lev if needed */
2277:     if (iu[k+1] > umax) {
2278:       /* estimate how much additional space we will need */
2279:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2280:       /* just double the memory each time */
2281:       maxadd = umax;
2282:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2283:       umax += maxadd;

2285:       /* allocate a longer ju */
2286:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2287:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
2288:       PetscFree(ju);
2289:       ju       = jutmp;

            /* same for lev; it holds `shift` fewer entries than ju */
2291:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
2292:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
2293:       PetscFree(lev);
2294:       lev      = jutmp;
2295:       reallocs += 2; /* count how many times we realloc */
2296:     }

2298:     /* save nonzero structure of k-th row in ju */
2299:     i=k;
2300:     while (nzk --) {
2301:       i                = q[i];
2302:       ju[juidx]        = i;
2303:       lev[juidx-shift] = levtmp[i];
2304:       juidx++;
2305:     }
2306:   }
2307: 
2308: #if defined(PETSC_USE_INFO)
2309:   if (ai[mbs] != 0) {
2310:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2311:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
2312:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2313:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
2314:     PetscInfo(A,"for best performance.\n");
2315:   } else {
2316:     PetscInfo(A,"Empty matrix.\n");
2317:   }
2318: #endif

2320:   ISRestoreIndices(perm,&rip);
2321:   PetscFree3(prowl,q,levtmp);
2322:   PetscFree(lev);

2324:   /* put together the new matrix */
2325:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);

2327:   /* PetscLogObjectParent(B,iperm); */
2328:   b    = (Mat_SeqSBAIJ*)(B)->data;
2329:   PetscFree2(b->imax,b->ilen);
2330:   b->singlemalloc = PETSC_FALSE;
2331:   b->free_a       = PETSC_TRUE;
2332:   b->free_ij      = PETSC_TRUE;
2333:   /* the next line frees the default space generated by the Create() */
2334:   PetscFree3(b->a,b->j,b->i);
2335:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
        /* B takes over the structure arrays built above */
2336:   b->j    = ju;
2337:   b->i    = iu;
2338:   b->diag = 0;
2339:   b->ilen = 0;
2340:   b->imax = 0;
2341: 
2342:   if (b->row) {
2343:     ISDestroy(b->row);
2344:   }
2345:   if (b->icol) {
2346:     ISDestroy(b->icol);
2347:   }
2348:   b->row  = perm;
2349:   b->icol = perm;
        /* one reference per stored pointer (row and icol) */
2350:   PetscObjectReference((PetscObject)perm);
2351:   PetscObjectReference((PetscObject)perm);
2352:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
2353:   /* In b structure:  Free imax, ilen, old a, old j.  
2354:      Allocate idnew, solve_work, new a, new j */
2355:   PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2356:   b->maxnz = b->nz = iu[mbs];
2357: 
2358:   (B)->info.factor_mallocs    = reallocs;
2359:   (B)->info.fill_ratio_given  = f;
2360:   if (ai[mbs] != 0) {
2361:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2362:   } else {
2363:     (B)->info.fill_ratio_needed = 0.0;
2364:   }
2365:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2366:   return(0);
2367: }

2369: /*
2370:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2371: */
2372:  #include petscbt.h
2373:  #include ../src/mat/utils/freespace.h
/*
   MatICCFactorSymbolic_SeqSBAIJ - Builds the nonzero structure of an incomplete
   Cholesky factor of A and installs it in fact.

   Input:
     A    - matrix to factor; read as Mat_SeqSBAIJ
     perm - ordering; anything other than the identity is rejected with PETSC_ERR_SUP
     info - supplies fill (initial space estimate) and levels (level-of-fill limit)
   Output:
     fact - receives the row offsets (b->i), column indices (b->j), diagonal
            positions (b->diag), a newly allocated value array, work space, fill
            statistics and the numeric-factorization routine

   Block sizes larger than 1 are handed to MatICCFactorSymbolic_SeqSBAIJ_inplace().
   For block size 1 the matrix must be square and must not miss a diagonal entry.

   In the levels == 0 path each row of A is copied with its diagonal moved to the
   LAST position of the row, and udiag[i] is the index of that entry.  For
   levels > 0 the arrays uj and udiag are filled by
   PetscFreeSpaceContiguous_Cholesky().

   Raises PETSC_ERR_ARG_WRONG (non-square matrix, or empty factor row),
   PETSC_ERR_ARG_WRONGSTATE (missing diagonal), PETSC_ERR_SUP (non-identity perm)
   and PETSC_ERR_MAT_CH_ZRPVT (empty row of A).  Returns 0 otherwise.
*/
2376: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2377: {
2378:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2379:   PetscErrorCode     ierr;
2380:   PetscTruth         perm_identity,free_ij = PETSC_TRUE,missing;
2381:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2382:   const PetscInt     *rip;
2383:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2384:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2385:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2386:   PetscReal          fill=info->fill,levels=info->levels;
2387:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2388:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2389:   PetscBT            lnkbt;

        /* block matrices are handled by the inplace variant */
2392:   if (bs > 1){
2393:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2394:     return(0);
2395:   }
2396:   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2397:   MatMissingDiagonal(A,&missing,&d);
2398:   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2400:   /* check whether perm is the identity mapping */
2401:   ISIdentity(perm,&perm_identity);
2402:   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2403:   a->permute = PETSC_FALSE;

2405:   PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2406:   PetscMalloc((am+1)*sizeof(PetscInt),&udiag);
2407:   ui[0] = 0;
2408: 
2409:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2410:   if (!levels){
2411:     /* same row lengths as A; new ui, udiag and uj arrays are built here */
2412:     for (i=0; i<am; i++) {
2413:       ncols    = ai[i+1] - ai[i];
2414:       ui[i+1]  = ui[i] + ncols;
2415:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2416:     }
2417:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2418:     cols = uj;
2419:     for (i=0; i<am; i++) {
2420:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2421:       ncols = ai[i+1] - ai[i] -1;
2422:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2423:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2424:     }
2425:   } else { /* case: levels>0 */
2426:     ISGetIndices(perm,&rip);

2428:     /* initialization */
2429:     /* jl: linked list for storing indices of the pivot rows 
2430:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2431:     PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2432:     for (i=0; i<am; i++){
            /* am acts as the end-of-list marker in jl */
2433:       jl[i] = am; il[i] = 0;
2434:     }

2436:     /* create and initialize a linked list for storing column indices of the active row k */
2437:     nlnk = am + 1;
2438:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2440:     /* initial FreeSpace size is fill*(ai[am]+1) */
2441:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2442:     current_space = free_space;
          /* a second pool of the same size holds the fill level of every stored column */
2443:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2444:     current_space_lvl = free_space_lvl;

2446:     for (k=0; k<am; k++){  /* for each active row k */
2447:       /* initialize lnk by the column indices of row k */
2448:       nzk   = 0;
2449:       ncols = ai[k+1] - ai[k];
2450:       if (!ncols) SETERRQ1(PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2451:       cols  = aj+ai[k];
2452:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
            /* nlnk is used after each list call as the number of entries that call added */
2453:       nzk += nlnk;

2455:       /* update lnk by computing fill-in for each pivot row to be merged in */
2456:       prow = jl[k]; /* 1st pivot row */
2457: 
2458:       while (prow < k){
              /* remember the successor now: jl[prow] is rewritten further down */
2459:         nextprow = jl[prow];
2460: 
2461:         /* merge prow into k-th row */
2462:         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2463:         jmax = ui[prow+1];
2464:         ncols = jmax-jmin;
2465:         i     = jmin - ui[prow];

2467:         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2468:         uj    = uj_lvl_ptr[prow] + i; /* levels of cols */
              /* j = level stored for the entry just before cols, i.e. the 1st nonzero of U(prow,k:am-1) */
2469:         j     = *(uj - 1);
2470:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2471:         nzk += nlnk;

2473:         /* update il and jl for prow */
2474:         if (jmin < jmax){
2475:           il[prow] = jmin;
2476:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2477:         }
2478:         prow = nextprow;
2479:       }

2481:       /* if free space is not available, make more free space */
2482:       if (current_space->local_remaining<nzk) {
2483:         i = am - k + 1; /* num of unfactored rows */
2484:         i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2485:         PetscFreeSpaceGet(i,&current_space);
2486:         PetscFreeSpaceGet(i,&current_space_lvl);
2487:         reallocs++;
2488:       }

2490:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2491:       if (nzk == 0) SETERRQ1(PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2492:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2494:       /* add the k-th row into il and jl */
2495:       if (nzk > 1){
2496:         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2497:         jl[k] = jl[i]; jl[i] = k;
2498:         il[k] = ui[k] + 1;
2499:       }
            /* remember where row k and its levels live inside the pools */
2500:       uj_ptr[k]     = current_space->array;
2501:       uj_lvl_ptr[k] = current_space_lvl->array;

            /* consume nzk slots from both pools */
2503:       current_space->array           += nzk;
2504:       current_space->local_used      += nzk;
2505:       current_space->local_remaining -= nzk;
2506:       current_space_lvl->array           += nzk;
2507:       current_space_lvl->local_used      += nzk;
2508:       current_space_lvl->local_remaining -= nzk;

2510:       ui[k+1] = ui[k] + nzk;
2511:     }

2513: #if defined(PETSC_USE_INFO)
2514:     if (ai[am] != 0) {
2515:       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2516:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2517:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2518:       PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2519:     } else {
2520:       PetscInfo(A,"Empty matrix.\n");
2521:     }
2522: #endif

2524:     ISRestoreIndices(perm,&rip);
2525:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2527:     /* destroy list of free space and other temporary array(s) */
2528:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2529:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2530:     PetscIncompleteLLDestroy(lnk,lnkbt);
2531:     PetscFreeSpaceDestroy(free_space_lvl);

2533:   } /* end of case: levels>0 */

2535:   /* put together the new matrix in MATSEQSBAIJ format */
2536:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

2538:   b = (Mat_SeqSBAIJ*)(fact)->data;
2539:   PetscFree2(b->imax,b->ilen);
2540:   b->singlemalloc = PETSC_FALSE;
2541:   b->free_a  = PETSC_TRUE;
2542:   b->free_ij = free_ij;
2543:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
        /* fact takes over the structure arrays built above */
2544:   b->j     = uj;
2545:   b->i     = ui;
2546:   b->diag  = udiag;
2547:   b->free_diag = PETSC_TRUE;
2548:   b->ilen  = 0;
2549:   b->imax  = 0;
2550:   b->row   = perm;
2551:   b->col   = perm;
        /* one reference per stored pointer (row and col) */
2552:   PetscObjectReference((PetscObject)perm);
2553:   PetscObjectReference((PetscObject)perm);
2554:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2555:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2556:   PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2557:   b->maxnz = b->nz = ui[am];
2558: 
2559:   fact->info.factor_mallocs    = reallocs;
2560:   fact->info.fill_ratio_given  = fill;
2561:   if (ai[am] != 0) {
2562:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2563:   } else {
2564:     fact->info.fill_ratio_needed = 0.0;
2565:   }
2566:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2567:   return(0);
2568: }

/*
   MatICCFactorSymbolic_SeqSBAIJ_inplace - Builds the nonzero structure of an
   incomplete Cholesky factor of A in the "inplace" layout and installs it in fact.

   Input:
     A    - matrix to factor; read as Mat_SeqSBAIJ; must not miss a diagonal entry
     perm - ordering; anything other than the identity is rejected with PETSC_ERR_SUP
     info - supplies fill (initial space estimate) and levels (level-of-fill limit)
   Output:
     fact - receives the row offsets (b->i), column indices (b->j), a newly
            allocated value array, work space, fill statistics and the
            numeric-factorization routine

   Block sizes larger than 1 are handed to MatICCFactorSymbolic_SeqSBAIJ_MSR().

   With levels == 0 no new structure is built: fact shares a->i and a->j with A,
   and b->free_ij is set to PETSC_FALSE for that case.  With levels > 0 new ui/uj
   arrays are built and fact is marked as owning them.

   Raises PETSC_ERR_ARG_WRONGSTATE (missing diagonal) and PETSC_ERR_SUP
   (non-identity perm).  Returns 0 otherwise.
*/
2572: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2573: {
2574:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2575:   Mat_SeqSBAIJ       *b;
2576:   PetscErrorCode     ierr;
2577:   PetscTruth         perm_identity,free_ij = PETSC_TRUE,missing;
2578:   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2579:   const PetscInt     *cols,*rip,*ai,*aj;
2580:   PetscInt           reallocs=0,i,*ui;
2581:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2582:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2583:   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2584:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2585:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2586:   PetscBT            lnkbt;

2589:   MatMissingDiagonal(A,&missing,&d);
2590:   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);

2592:   /*  
2593:    This code originally uses Modified Sparse Row (MSR) storage
2594:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2595:    Then it is rewritten so the factor B takes seqsbaij format. However the associated 
2596:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 
2597:    thus the original code in MSR format is still used for these cases. 
2598:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 
2599:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2600:   */
2601:   if (bs > 1){
2602:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2603:     return(0);
2604:   }

2606:   /* check whether perm is the identity mapping */
2607:   ISIdentity(perm,&perm_identity);
        /* statement is now terminated with ';' like every other SETERRQ in this file */
2608:   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2609: 
2610:   /* special case that simply copies fill pattern */
2611:   if (!levels ) {
2612:     a->permute = PETSC_FALSE;
2613:     /* reuse the column pointers and row offsets for memory savings */
2614:     ui           = a->i;
2615:     uj           = a->j;
          /* the arrays still belong to A, so fact must not free them */
2616:     free_ij      = PETSC_FALSE;
2617:     ratio_needed = 1.0;
2618:   } else { /* case: levels>0 */
          /* ai and aj are only assigned (and only used) inside this branch */
2619:     if (perm_identity){
2620:       a->permute = PETSC_FALSE;
2621:       ai = a->i; aj = a->j;
2622:     }
2623:     ISGetIndices(perm,&rip);

2625:     /* initialization */
2626:     PetscMalloc((am+1)*sizeof(PetscInt),&ui);
2627:     ui[0] = 0;

2629:     /* jl: linked list for storing indices of the pivot rows 
2630:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2631:     PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);
2632:     for (i=0; i<am; i++){
            /* am acts as the end-of-list marker in jl */
2633:       jl[i] = am; il[i] = 0;
2634:     }

2636:     /* create and initialize a linked list for storing column indices of the active row k */
2637:     nlnk = am + 1;
2638:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2640:     /* initial FreeSpace size is fill*(ai[am]+1) */
2641:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
2642:     current_space = free_space;
          /* a second pool of the same size holds the fill level of every stored column */
2643:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
2644:     current_space_lvl = free_space_lvl;

2646:     for (k=0; k<am; k++){  /* for each active row k */
2647:       /* initialize lnk by the column indices of row rip[k] */
2648:       nzk   = 0;
2649:       ncols = ai[rip[k]+1] - ai[rip[k]];
2650:       cols  = aj+ai[rip[k]];
2651:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
            /* nlnk is used after each list call as the number of entries that call added */
2652:       nzk += nlnk;

2654:       /* update lnk by computing fill-in for each pivot row to be merged in */
2655:       prow = jl[k]; /* 1st pivot row */
2656: 
2657:       while (prow < k){
              /* remember the successor now: jl[prow] is rewritten further down */
2658:         nextprow = jl[prow];
2659: 
2660:         /* merge prow into k-th row */
2661:         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2662:         jmax = ui[prow+1];
2663:         ncols = jmax-jmin;
2664:         i     = jmin - ui[prow];
2665:         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
              /* j = level stored for the entry just before cols, i.e. the 1st nonzero of U(prow,k:am-1) */
2666:         j     = *(uj_lvl_ptr[prow] + i - 1);
2667:         cols_lvl = uj_lvl_ptr[prow]+i;
2668:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2669:         nzk += nlnk;

2671:         /* update il and jl for prow */
2672:         if (jmin < jmax){
2673:           il[prow] = jmin;
2674:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2675:         }
2676:         prow = nextprow;
2677:       }

2679:       /* if free space is not available, make more free space */
2680:       if (current_space->local_remaining<nzk) {
2681:         i = am - k + 1; /* num of unfactored rows */
2682:         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2683:         PetscFreeSpaceGet(i,&current_space);
2684:         PetscFreeSpaceGet(i,&current_space_lvl);
2685:         reallocs++;
2686:       }

2688:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2689:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2691:       /* add the k-th row into il and jl */
2692:       if (nzk-1 > 0){
2693:         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2694:         jl[k] = jl[i]; jl[i] = k;
2695:         il[k] = ui[k] + 1;
2696:       }
            /* remember where row k and its levels live inside the pools */
2697:       uj_ptr[k]     = current_space->array;
2698:       uj_lvl_ptr[k] = current_space_lvl->array;

            /* consume nzk slots from both pools */
2700:       current_space->array           += nzk;
2701:       current_space->local_used      += nzk;
2702:       current_space->local_remaining -= nzk;
2703:       current_space_lvl->array           += nzk;
2704:       current_space_lvl->local_used      += nzk;
2705:       current_space_lvl->local_remaining -= nzk;

2707:       ui[k+1] = ui[k] + nzk;
2708:     }

2710: #if defined(PETSC_USE_INFO)
2711:     if (ai[am] != 0) {
2712:       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2713:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
2714:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
2715:       PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
2716:     } else {
2717:       PetscInfo(A,"Empty matrix.\n");
2718:     }
2719: #endif

2721:     ISRestoreIndices(perm,&rip);
2722:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2724:     /* destroy list of free space and other temporary array(s) */
2725:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
2726:     PetscFreeSpaceContiguous(&free_space,uj);
2727:     PetscIncompleteLLDestroy(lnk,lnkbt);
2728:     PetscFreeSpaceDestroy(free_space_lvl);
2729:     if (ai[am] != 0) {
2730:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731:     } else {
2732:       ratio_needed = 0.0;
2733:     }
2734:   } /* end of case: levels>0 */

2736:   /* put together the new matrix in MATSEQSBAIJ format */
2737:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

2739:   b = (Mat_SeqSBAIJ*)(fact)->data;
2740:   PetscFree2(b->imax,b->ilen);
2741:   b->singlemalloc = PETSC_FALSE;
2742:   b->free_a  = PETSC_TRUE;
2743:   b->free_ij = free_ij;
2744:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
2745:   b->j    = uj;
2746:   b->i    = ui;
2747:   b->diag    = 0;
2748:   b->ilen    = 0;
2749:   b->imax    = 0;
2750:   b->row     = perm;
2751:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
        /* one reference per stored pointer (row and icol) */
2752:   PetscObjectReference((PetscObject)perm);
2753:   b->icol = perm;
2754:   PetscObjectReference((PetscObject)perm);
2755:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
2756:   b->maxnz = b->nz = ui[am];
2757: 
2758:   fact->info.factor_mallocs    = reallocs;
2759:   fact->info.fill_ratio_given  = fill;
2760:   fact->info.fill_ratio_needed = ratio_needed;
        /* NOTE(review): a non-identity perm is rejected above, so the else branch
           looks unreachable - confirm before relying on it */
2761:   if (perm_identity){
2762:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2763:   } else {
2764:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2765:   }
2766:   return(0);
2767: }