Actual source code: dgefa4.c

  1: #define PETSCMAT_DLL

  3: /*
  4:        Inverts a 4 by 4 matrix using partial pivoting.

  6:        Used by the sparse factorization routines in 
  7:      src/mat/impls/baij/seq

  9:        This is a combination of the Linpack routines
 10:     dgefa() and dgedi() specialized for a size of 4.

 12: */
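/*
       The 16 MatScalar entries of a appear to be interpreted in Fortran
       (column-major) order, following the Linpack code this was derived from,
       and the matrix is overwritten in place by its inverse.  The shift
       argument, when nonzero, is used to replace an exactly zero pivot
       (see below).
*/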
 13:  #include "petscsys.h"

 17: PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift)
 18: {
 19:     PetscInt   i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
 20:     PetscInt   k4,j3;
 21:     MatScalar  *aa,*ax,*ay,work[16],stmp;
 22:     MatReal    tmp,max;

 24: /*     gaussian elimination with partial pivoting */

 27:     shift = .25*shift*(PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));
 28:     /* Parameter adjustments */
 29:     a       -= 5;
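    /* After this adjustment, entry (i,j), 1 <= i,j <= 4, of the column-major
       block is addressed as a[i + 4*j], matching the 1-based Fortran indexing
       of the original Linpack dgefa/dgedi.  The shift computed above is the
       caller's shift scaled by one quarter of the sum of the magnitudes of the
       diagonal entries a[0], a[5], a[10], a[15]. */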

 31:     for (k = 1; k <= 3; ++k) {
 32:         kp1 = k + 1;
 33:         k3  = 4*k;
 34:         k4  = k3 + k;
 35: /*        find l = pivot index */
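/*           (the entry of largest magnitude on or below the diagonal of
              column k; the selected row is recorded in ipvt[k-1] below) */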

 37:         i__2 = 5 - k;
 38:         aa = &a[k4];
 39:         max = PetscAbsScalar(aa[0]);
 40:         l = 1;
 41:         for (ll=1; ll<i__2; ll++) {
 42:           tmp = PetscAbsScalar(aa[ll]);
 43:           if (tmp > max) { max = tmp; l = ll+1;}
 44:         }
 45:         l       += k - 1;
 46:         ipvt[k-1] = l;

 48:         if (a[l + k3] == 0.0) {
 49:           if (shift == 0.0) {
 50:             SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
 51:           } else {
 52:             /* SHIFT is applied to a SINGLE diagonal entry; does this make any sense? */
 53:             a[l + k3] = shift;
 54:           }
 55:         }

 57: /*           interchange if necessary */

 59:         if (l != k) {
 60:           stmp      = a[l + k3];
 61:           a[l + k3] = a[k4];
 62:           a[k4]     = stmp;
 63:         }

 65: /*           compute multipliers */
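/*           (column k below the diagonal is overwritten with the negated
              multipliers -a(i,k)/a(k,k), as in the Linpack dgefa) */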

 67:         stmp = -1. / a[k4];
 68:         i__2 = 4 - k;
 69:         aa = &a[1 + k4];
 70:         for (ll=0; ll<i__2; ll++) {
 71:           aa[ll] *= stmp;
 72:         }

 74: /*           row elimination with column indexing */
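/*           (each remaining column j > k receives the rank-1 update
              a(k+1:4,j) += a(k,j)*multipliers, with the pivot-row swap in
              column j applied on the fly) */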

 76:         ax = &a[k4+1];
 77:         for (j = kp1; j <= 4; ++j) {
 78:             j3   = 4*j;
 79:             stmp = a[l + j3];
 80:             if (l != k) {
 81:               a[l + j3] = a[k + j3];
 82:               a[k + j3] = stmp;
 83:             }

 85:             i__3 = 4 - k;
 86:             ay = &a[1+k+j3];
 87:             for (ll=0; ll<i__3; ll++) {
 88:               ay[ll] += stmp*ax[ll];
 89:             }
 90:         }
 91:     }
 92:     ipvt[3] = 4;
 93:     if (a[20] == 0.0) {
 94:       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
 95:     }

 97:     /*
 98:          Now form the inverse 
 99:     */

101:    /*     compute inverse(u) */
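   /*     (U is inverted in place one column at a time: reciprocate the
          diagonal entry, scale the entries above it by -1/u(k,k), then
          update the columns to its right, as in the Linpack dgedi) */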

103:     for (k = 1; k <= 4; ++k) {
104:         k3    = 4*k;
105:         k4    = k3 + k;
106:         a[k4] = 1.0 / a[k4];
107:         stmp  = -a[k4];
108:         i__2  = k - 1;
109:         aa    = &a[k3 + 1];
110:         for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
111:         kp1 = k + 1;
112:         if (4 < kp1) continue;
113:         ax = aa;
114:         for (j = kp1; j <= 4; ++j) {
115:             j3        = 4*j;
116:             stmp      = a[k + j3];
117:             a[k + j3] = 0.0;
118:             ay        = &a[j3 + 1];
119:             for (ll=0; ll<k; ll++) {
120:               ay[ll] += stmp*ax[ll];
121:             }
122:         }
123:     }

125:    /*    form inverse(u)*inverse(l) */
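   /*    (working from the last column back: the multipliers stored below the
         diagonal of column k are copied into work[] and zeroed, the columns to
         the right are folded back in, and the row interchange recorded in
         ipvt[k-1] is undone as a column swap, leaving inverse(A) in a) */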

127:     for (kb = 1; kb <= 3; ++kb) {
128:         k   = 4 - kb;
129:         k3  = 4*k;
130:         kp1 = k + 1;
131:         aa  = a + k3;
132:         for (i = kp1; i <= 4; ++i) {
133:             work[i-1] = aa[i];
134:             aa[i]     = 0.0;
135:         }
136:         for (j = kp1; j <= 4; ++j) {
137:             stmp  = work[j-1];
138:             ax    = &a[4*j + 1];
139:             ay    = &a[k3 + 1];
140:             ay[0] += stmp*ax[0];
141:             ay[1] += stmp*ax[1];
142:             ay[2] += stmp*ax[2];
143:             ay[3] += stmp*ax[3];
144:         }
145:         l = ipvt[k-1];
146:         if (l != k) {
147:             ax = &a[k3 + 1];
148:             ay = &a[4*l + 1];
149:             stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
150:             stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
151:             stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
152:             stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
153:         }
154:     }
155:     return(0);
156: }
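/*
    A minimal usage sketch (the variable names here are illustrative only):
    invert a single 4x4 block, stored in Fortran (column-major) order, in
    place and with no pivot shift:

       MatScalar      block[16];
       PetscErrorCode ierr;
       ierr = Kernel_A_gets_inverse_A_4(block,0.0);CHKERRQ(ierr);
*/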

158: #if defined(PETSC_HAVE_SSE)
159: #include PETSC_HAVE_SSE

163: PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
164: {
165:   /* 
166:      This routine is converted from Intel's Small Matrix Library.
167:      See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
168:      Order Number: 245043-001
169:      March 1999
170:      http://www.intel.com

 172:      Inverse of a 4x4 matrix via Cramer's Rule:
173:      bool Invert4x4(SMLXMatrix &);
174:   */
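  /*
     Cramer's rule here means inverse(A) = adjugate(A)/det(A): the code builds
     the four rows of signed cofactors from products of 2x2 sub-determinants of
     the transposed matrix, takes det(A) as the dot product of the first row
     with its cofactors, and finally scales the cofactors by 1/det(A).
  */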

177:   SSE_SCOPE_BEGIN;
178:     SSE_INLINE_BEGIN_1(a)

180: /* ----------------------------------------------- */

182:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
183:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

185:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
186:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

188:       SSE_COPY_PS(XMM3,XMM0)
189:       SSE_SHUFFLE(XMM3,XMM5,0x88)

191:       SSE_SHUFFLE(XMM5,XMM0,0xDD)

193:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
194:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

196:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
197:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

199:       SSE_COPY_PS(XMM4,XMM0)
200:       SSE_SHUFFLE(XMM4,XMM6,0x88)

202:       SSE_SHUFFLE(XMM6,XMM0,0xDD)
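      /* At this point XMM3, XMM5, XMM4 and XMM6 hold the four rows of the
         transposed input (rows 1 and 3 in a half-rotated lane order that the
         0x4E shuffles below compensate for), as in the Intel reference code. */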

204: /* ----------------------------------------------- */

206:       SSE_COPY_PS(XMM7,XMM4)
207:       SSE_MULT_PS(XMM7,XMM6)

209:       SSE_SHUFFLE(XMM7,XMM7,0xB1)

211:       SSE_COPY_PS(XMM0,XMM5)
212:       SSE_MULT_PS(XMM0,XMM7)

214:       SSE_COPY_PS(XMM2,XMM3)
215:       SSE_MULT_PS(XMM2,XMM7)

217:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

219:       SSE_COPY_PS(XMM1,XMM5)
220:       SSE_MULT_PS(XMM1,XMM7)
221:       SSE_SUB_PS(XMM1,XMM0)

223:       SSE_MULT_PS(XMM7,XMM3)
224:       SSE_SUB_PS(XMM7,XMM2)

226:       SSE_SHUFFLE(XMM7,XMM7,0x4E)
227:       SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

229: /* ----------------------------------------------- */

231:       SSE_COPY_PS(XMM0,XMM5)
232:       SSE_MULT_PS(XMM0,XMM4)

234:       SSE_SHUFFLE(XMM0,XMM0,0xB1)

236:       SSE_COPY_PS(XMM2,XMM6)
237:       SSE_MULT_PS(XMM2,XMM0)
238:       SSE_ADD_PS(XMM2,XMM1)
239: 
240:       SSE_COPY_PS(XMM7,XMM3)
241:       SSE_MULT_PS(XMM7,XMM0)

243:       SSE_SHUFFLE(XMM0,XMM0,0x4E)

245:       SSE_COPY_PS(XMM1,XMM6)
246:       SSE_MULT_PS(XMM1,XMM0)
247:       SSE_SUB_PS(XMM2,XMM1)

249:       SSE_MULT_PS(XMM0,XMM3)
250:       SSE_SUB_PS(XMM0,XMM7)

252:       SSE_SHUFFLE(XMM0,XMM0,0x4E)
253:       SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

255:       /* ----------------------------------------------- */

257:       SSE_COPY_PS(XMM7,XMM5)
258:       SSE_SHUFFLE(XMM7,XMM5,0x4E)
259:       SSE_MULT_PS(XMM7,XMM6)

261:       SSE_SHUFFLE(XMM7,XMM7,0xB1)

263:       SSE_SHUFFLE(XMM4,XMM4,0x4E)

265:       SSE_COPY_PS(XMM0,XMM4)
266:       SSE_MULT_PS(XMM0,XMM7)
267:       SSE_ADD_PS(XMM0,XMM2)

269:       SSE_COPY_PS(XMM2,XMM3)
270:       SSE_MULT_PS(XMM2,XMM7)

272:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

274:       SSE_COPY_PS(XMM1,XMM4)
275:       SSE_MULT_PS(XMM1,XMM7)
276:       SSE_SUB_PS(XMM0,XMM1)
277:       SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

279:       SSE_MULT_PS(XMM7,XMM3)
280:       SSE_SUB_PS(XMM7,XMM2)

282:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

284:       /* ----------------------------------------------- */

286:       SSE_COPY_PS(XMM1,XMM3)
287:       SSE_MULT_PS(XMM1,XMM5)

289:       SSE_SHUFFLE(XMM1,XMM1,0xB1)

291:       SSE_COPY_PS(XMM0,XMM6)
292:       SSE_MULT_PS(XMM0,XMM1)
293:       SSE_ADD_PS(XMM0,XMM7)
294: 
295:       SSE_COPY_PS(XMM2,XMM4)
296:       SSE_MULT_PS(XMM2,XMM1)
297:       SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

299:       SSE_SHUFFLE(XMM1,XMM1,0x4E)

301:       SSE_COPY_PS(XMM7,XMM6)
302:       SSE_MULT_PS(XMM7,XMM1)
303:       SSE_SUB_PS(XMM7,XMM0)

305:       SSE_MULT_PS(XMM1,XMM4)
306:       SSE_SUB_PS(XMM2,XMM1)
307:       SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

309:       /* ----------------------------------------------- */

311:       SSE_COPY_PS(XMM1,XMM3)
312:       SSE_MULT_PS(XMM1,XMM6)

314:       SSE_SHUFFLE(XMM1,XMM1,0xB1)

316:       SSE_COPY_PS(XMM2,XMM4)
317:       SSE_MULT_PS(XMM2,XMM1)
318:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
319:       SSE_SUB_PS(XMM0,XMM2)

321:       SSE_COPY_PS(XMM2,XMM5)
322:       SSE_MULT_PS(XMM2,XMM1)
323:       SSE_ADD_PS(XMM2,XMM7)

325:       SSE_SHUFFLE(XMM1,XMM1,0x4E)

327:       SSE_COPY_PS(XMM7,XMM4)
328:       SSE_MULT_PS(XMM7,XMM1)
329:       SSE_ADD_PS(XMM7,XMM0)

331:       SSE_MULT_PS(XMM1,XMM5)
332:       SSE_SUB_PS(XMM2,XMM1)

334:       /* ----------------------------------------------- */

336:       SSE_MULT_PS(XMM4,XMM3)

338:       SSE_SHUFFLE(XMM4,XMM4,0xB1)

340:       SSE_COPY_PS(XMM1,XMM6)
341:       SSE_MULT_PS(XMM1,XMM4)
342:       SSE_ADD_PS(XMM1,XMM7)

344:       SSE_COPY_PS(XMM0,XMM5)
345:       SSE_MULT_PS(XMM0,XMM4)
346:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
347:       SSE_SUB_PS(XMM7,XMM0)

349:       SSE_SHUFFLE(XMM4,XMM4,0x4E)

351:       SSE_MULT_PS(XMM6,XMM4)
352:       SSE_SUB_PS(XMM1,XMM6)

354:       SSE_MULT_PS(XMM5,XMM4)
355:       SSE_ADD_PS(XMM5,XMM7)

357:       /* ----------------------------------------------- */
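      /* det(A) is formed as the dot product of the first transposed row with
         its cofactors (two shuffle/add steps do the horizontal sum); 1/det is
         then the RCPSS approximation refined by one Newton-Raphson step,
         x1 = x0*(2 - det*x0). */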

359:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
360:       SSE_MULT_PS(XMM3,XMM0)

362:       SSE_COPY_PS(XMM4,XMM3)
363:       SSE_SHUFFLE(XMM4,XMM3,0x4E)
364:       SSE_ADD_PS(XMM4,XMM3)

366:       SSE_COPY_PS(XMM6,XMM4)
367:       SSE_SHUFFLE(XMM6,XMM4,0xB1)
368:       SSE_ADD_SS(XMM6,XMM4)

370:       SSE_COPY_PS(XMM3,XMM6)
371:       SSE_RECIP_SS(XMM3,XMM6)
372:       SSE_COPY_SS(XMM4,XMM3)
373:       SSE_ADD_SS(XMM4,XMM3)
374:       SSE_MULT_SS(XMM3,XMM3)
375:       SSE_MULT_SS(XMM6,XMM3)
376:       SSE_SUB_SS(XMM4,XMM6)
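      /* XMM4 now holds the refined 1/det in its low lane; broadcast it to all
         four lanes, scale the cofactor rows by it, and store the result back
         over a[], so the array returns holding inverse(A). */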

378:       SSE_SHUFFLE(XMM4,XMM4,0x00)

380:       SSE_MULT_PS(XMM0,XMM4)
381:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
382:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

384:       SSE_MULT_PS(XMM1,XMM4)
385:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
386:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

388:       SSE_MULT_PS(XMM2,XMM4)
389:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
390:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

392:       SSE_MULT_PS(XMM4,XMM5)
393:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
394:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

396:       /* ----------------------------------------------- */

398:       SSE_INLINE_END_1;
399:   SSE_SCOPE_END;

401:   return(0);
402: }

404: #endif