Actual source code: baijfact11.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for BAIJ format.
5: */
6: #include "../src/mat/impls/baij/seq/baij.h"
7: #include "../src/mat/blockinvert.h"
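For context, here is a minimal sketch of how these block-size-4 numeric factorization routines are typically reached through PETSc's public interface. It is illustrative only: the example sizes and the ordering string are assumptions, error checking is omitted, and exact API spellings vary slightly between PETSc releases.

   Mat           A;             /* sequential BAIJ matrix with block size 4 */
   IS            isrow,iscol;   /* row and column orderings for the factorization */
   MatFactorInfo info;
   PetscInt      nb = 100;      /* number of block rows/columns (example value) */

   MatCreateSeqBAIJ(PETSC_COMM_SELF,4,4*nb,4*nb,5,PETSC_NULL,&A);
   /* ... fill with MatSetValuesBlocked(), then MatAssemblyBegin()/MatAssemblyEnd() ... */
   MatGetOrdering(A,"natural",&isrow,&iscol);
   MatFactorInfoInitialize(&info);
   MatLUFactor(A,isrow,iscol,&info);  /* for block size 4 this is expected to dispatch to a routine in this file */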
9: /* ------------------------------------------------------------*/
10: /*
11: Version for when blocks are 4 by 4
12: */
15: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
16: {
17: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
18: IS isrow = b->row,isicol = b->icol;
20: const PetscInt *r,*ic;
21: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
22: PetscInt *ajtmpold,*ajtmp,nz,row;
23: PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
24: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
25: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
26: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
27: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
28: MatScalar m13,m14,m15,m16;
29: MatScalar *ba = b->a,*aa = a->a;
30: PetscTruth pivotinblocks = b->pivotinblocks;
31: PetscReal shift = info->shiftamount;
34: ISGetIndices(isrow,&r);
35: ISGetIndices(isicol,&ic);
36: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
38: for (i=0; i<n; i++) {
39: nz = bi[i+1] - bi[i];
40: ajtmp = bj + bi[i];
41: for (j=0; j<nz; j++) {
42: x = rtmp+16*ajtmp[j];
43: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
44: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
45: }
46: /* load in initial (unfactored) row */
47: idx = r[i];
48: nz = ai[idx+1] - ai[idx];
49: ajtmpold = aj + ai[idx];
50: v = aa + 16*ai[idx];
51: for (j=0; j<nz; j++) {
52: x = rtmp+16*ic[ajtmpold[j]];
53: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
54: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
55: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
56: x[14] = v[14]; x[15] = v[15];
57: v += 16;
58: }
59: row = *ajtmp++;
60: while (row < i) {
61: pc = rtmp + 16*row;
62: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
63: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
64: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
65: p15 = pc[14]; p16 = pc[15];
66: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
67: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
68: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
69: || p16 != 0.0) {
70: pv = ba + 16*diag_offset[row];
71: pj = bj + diag_offset[row] + 1;
72: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
73: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
74: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
75: x15 = pv[14]; x16 = pv[15];
76: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
77: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
78: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
79: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
81: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
82: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
83: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
84: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
86: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
87: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
88: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
89: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
91: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
92: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
93: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
94: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
96: nz = bi[row+1] - diag_offset[row] - 1;
97: pv += 16;
98: for (j=0; j<nz; j++) {
99: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
100: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
101: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
102: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
103: x = rtmp + 16*pj[j];
104: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
105: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
106: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
107: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
109: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
110: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
111: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
112: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
114: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
115: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
116: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
117: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
119: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
120: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
121: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
122: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
124: pv += 16;
125: }
126: PetscLogFlops(128.0*nz+112.0);
127: }
128: row = *ajtmp++;
129: }
130: /* finished row so stick it into b->a */
131: pv = ba + 16*bi[i];
132: pj = bj + bi[i];
133: nz = bi[i+1] - bi[i];
134: for (j=0; j<nz; j++) {
135: x = rtmp+16*pj[j];
136: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
137: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
138: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
139: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
140: pv += 16;
141: }
142: /* invert diagonal block */
143: w = ba + 16*diag_offset[i];
144: if (pivotinblocks) {
145: Kernel_A_gets_inverse_A_4(w,shift);
146: } else {
147: Kernel_A_gets_inverse_A_4_nopivot(w);
148: }
149: }
151: PetscFree(rtmp);
152: ISRestoreIndices(isicol,&ic);
153: ISRestoreIndices(isrow,&r);
154: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
155: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156: C->assembled = PETSC_TRUE;
157: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
158: return(0);
159: }
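The next routine is a hand-inlined block-size-4 specialization of the generic factorization, replacing the dense-block kernels named in the comment below. As a readability aid, here is a plain-C sketch of what those 4x4 column-major block operations compute; these are illustrative reference versions only, not the actual Kernel_* implementations from blockinvert.h.

/* Sketch: A = A*B for 4x4 blocks stored column major, A[r+4*c] = entry (r,c). */
static void Sketch_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *work)
{
  PetscInt r,c,k;
  for (k=0; k<16; k++) work[k] = A[k];              /* save A; it is overwritten below */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      A[r+4*c] = 0.0;
      for (k=0; k<4; k++) A[r+4*c] += work[r+4*k]*B[k+4*c];
    }
  }
}

/* Sketch: A = A - B*C for 4x4 blocks, same storage convention. */
static void Sketch_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) A[r+4*c] -= B[r+4*k]*C[k+4*c];
    }
  }
}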
161: /* MatLUFactorNumeric_SeqBAIJ_4 -
162: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
163: Kernel_A_gets_A_times_B()
164: Kernel_A_gets_A_minus_B_times_C()
165: Kernel_A_gets_inverse_A()
166: */
170: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171: {
172: Mat C=B;
173: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
174: IS isrow = b->row,isicol = b->icol;
176: const PetscInt *r,*ic,*ics;
177: PetscInt i,j,k,nz,nzL,row;
178: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181: PetscInt flg;
182: PetscReal shift = info->shiftamount;
185: ISGetIndices(isrow,&r);
186: ISGetIndices(isicol,&ic);
188: /* generate work space needed by the factorization */
189: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
190: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
191: ics = ic;
193: for (i=0; i<n; i++){
194: /* zero rtmp */
195: /* L part */
196: nz = bi[i+1] - bi[i];
197: bjtmp = bj + bi[i];
198: for (j=0; j<nz; j++){
199: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
200: }
202: /* U part */
203: nz = bdiag[i]-bdiag[i+1];
204: bjtmp = bj + bdiag[i+1]+1;
205: for (j=0; j<nz; j++){
206: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
207: }
208:
209: /* load in initial (unfactored) row */
210: nz = ai[r[i]+1] - ai[r[i]];
211: ajtmp = aj + ai[r[i]];
212: v = aa + bs2*ai[r[i]];
213: for (j=0; j<nz; j++) {
214: PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));
215: }
217: /* elimination */
218: bjtmp = bj + bi[i];
219: nzL = bi[i+1] - bi[i];
220: for(k=0;k < nzL;k++) {
221: row = bjtmp[k];
222: pc = rtmp + bs2*row;
223: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
224: if (flg) {
225: pv = b->a + bs2*bdiag[row];
226: /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
227: Kernel_A_gets_A_times_B_4(pc,pv,mwork);
228:
229: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
230: pv = b->a + bs2*(bdiag[row+1]+1);
231: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
232: for (j=0; j<nz; j++) {
233: /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
234: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
235: v = rtmp + bs2*pj[j];
236: Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
237: pv += bs2;
238: }
239: PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
240: }
241: }
243: /* finished row so stick it into b->a */
244: /* L part */
245: pv = b->a + bs2*bi[i] ;
246: pj = b->j + bi[i] ;
247: nz = bi[i+1] - bi[i];
248: for (j=0; j<nz; j++) {
249: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
250: }
252: /* Mark diagonal and invert diagonal for simpler triangular solves */
253: pv = b->a + bs2*bdiag[i];
254: pj = b->j + bdiag[i];
255: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
256: /* Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work); */
257: Kernel_A_gets_inverse_A_4(pv,shift);
258:
259: /* U part */
260: pv = b->a + bs2*(bdiag[i+1]+1);
261: pj = b->j + bdiag[i+1]+1;
262: nz = bdiag[i] - bdiag[i+1] - 1;
263: for (j=0; j<nz; j++){
264: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
265: }
266: }
268: PetscFree2(rtmp,mwork);
269: ISRestoreIndices(isicol,&ic);
270: ISRestoreIndices(isrow,&r);
271: C->ops->solve = MatSolve_SeqBAIJ_4;
272: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
273: C->assembled = PETSC_TRUE;
274: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
275: return(0);
276: }
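A note on the factored-matrix layout used by MatLUFactorNumeric_SeqBAIJ_4 above (and by the non-inplace natural-ordering variant later in this file), as the index arithmetic indicates: the L part of block row i occupies b->j/b->a between bi[i] and bi[i+1], while the U part is addressed through bdiag, with the inverted diagonal block stored at bdiag[i]. A sketch of how row i of U is traversed, restating the pointer arithmetic above for orientation only:

   pj = b->j + bdiag[i+1] + 1;         /* first column index of the strictly upper part U(i,:) */
   pv = b->a + bs2*(bdiag[i+1] + 1);   /* corresponding 4x4 blocks                             */
   nz = bdiag[i] - bdiag[i+1] - 1;     /* number of off-diagonal blocks in U(i,:)              */
   /* the (already inverted) diagonal block of row i lives at b->a + bs2*bdiag[i] */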
280: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
281: {
282: /*
283: Default Version for when blocks are 4 by 4 Using natural ordering
284: */
285: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
287: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
288: PetscInt *ajtmpold,*ajtmp,nz,row;
289: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
290: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
291: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
292: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
293: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
294: MatScalar m13,m14,m15,m16;
295: MatScalar *ba = b->a,*aa = a->a;
296: PetscTruth pivotinblocks = b->pivotinblocks;
297: PetscReal shift = info->shiftamount;
300: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
302: for (i=0; i<n; i++) {
303: nz = bi[i+1] - bi[i];
304: ajtmp = bj + bi[i];
305: for (j=0; j<nz; j++) {
306: x = rtmp+16*ajtmp[j];
307: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
308: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
309: }
310: /* load in initial (unfactored) row */
311: nz = ai[i+1] - ai[i];
312: ajtmpold = aj + ai[i];
313: v = aa + 16*ai[i];
314: for (j=0; j<nz; j++) {
315: x = rtmp+16*ajtmpold[j];
316: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
317: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
318: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
319: x[14] = v[14]; x[15] = v[15];
320: v += 16;
321: }
322: row = *ajtmp++;
323: while (row < i) {
324: pc = rtmp + 16*row;
325: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
326: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
327: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
328: p15 = pc[14]; p16 = pc[15];
329: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
330: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
331: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
332: || p16 != 0.0) {
333: pv = ba + 16*diag_offset[row];
334: pj = bj + diag_offset[row] + 1;
335: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
336: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
337: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
338: x15 = pv[14]; x16 = pv[15];
339: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
340: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
341: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
342: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
344: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
345: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
346: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
347: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
349: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
350: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
351: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
352: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
354: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
355: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
356: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
357: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
358: nz = bi[row+1] - diag_offset[row] - 1;
359: pv += 16;
360: for (j=0; j<nz; j++) {
361: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
362: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
363: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
364: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
365: x = rtmp + 16*pj[j];
366: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
367: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
368: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
369: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
371: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
372: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
373: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
374: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
376: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
377: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
378: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
379: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
381: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
382: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
383: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
384: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
386: pv += 16;
387: }
388: PetscLogFlops(128.0*nz+112.0);
389: }
390: row = *ajtmp++;
391: }
392: /* finished row so stick it into b->a */
393: pv = ba + 16*bi[i];
394: pj = bj + bi[i];
395: nz = bi[i+1] - bi[i];
396: for (j=0; j<nz; j++) {
397: x = rtmp+16*pj[j];
398: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
399: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
400: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
401: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
402: pv += 16;
403: }
404: /* invert diagonal block */
405: w = ba + 16*diag_offset[i];
406: if (pivotinblocks) {
407: Kernel_A_gets_inverse_A_4(w,shift);
408: } else {
409: Kernel_A_gets_inverse_A_4_nopivot(w);
410: }
411: }
413: PetscFree(rtmp);
414: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
415: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
416: C->assembled = PETSC_TRUE;
417: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
418: return(0);
419: }
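The non-inplace natural-ordering variant that follows is MatLUFactorNumeric_SeqBAIJ_4 with the permutation arrays removed: under the natural ordering r[i] == i and ic[j] == j, so the initial row load reduces to a direct copy. This simply restates the code below, for orientation:

   nz    = ai[i+1] - ai[i];                 /* instead of ai[r[i]+1] - ai[r[i]] */
   ajtmp = aj + ai[i];
   v     = aa + bs2*ai[i];
   for (j=0; j<nz; j++) {
     PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));   /* no ic[] indirection */
   }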
421: /*
422: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
423: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
424: */
427: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
428: {
429: Mat C=B;
430: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
432: PetscInt i,j,k,nz,nzL,row;
433: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
434: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
435: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
436: PetscInt flg;
437: PetscReal shift = info->shiftamount;
440: /* generate work space needed by the factorization */
441: PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);
442: PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));
444: for (i=0; i<n; i++){
445: /* zero rtmp */
446: /* L part */
447: nz = bi[i+1] - bi[i];
448: bjtmp = bj + bi[i];
449: for (j=0; j<nz; j++){
450: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
451: }
453: /* U part */
454: nz = bdiag[i] - bdiag[i+1];
455: bjtmp = bj + bdiag[i+1]+1;
456: for (j=0; j<nz; j++){
457: PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));
458: }
459:
460: /* load in initial (unfactored) row */
461: nz = ai[i+1] - ai[i];
462: ajtmp = aj + ai[i];
463: v = aa + bs2*ai[i];
464: for (j=0; j<nz; j++) {
465: PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));
466: }
468: /* elimination */
469: bjtmp = bj + bi[i];
470: nzL = bi[i+1] - bi[i];
471: for(k=0;k < nzL;k++) {
472: row = bjtmp[k];
473: pc = rtmp + bs2*row;
474: for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
475: if (flg) {
476: pv = b->a + bs2*bdiag[row];
477: /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
478: Kernel_A_gets_A_times_B_4(pc,pv,mwork);
479:
480: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
481: pv = b->a + bs2*(bdiag[row+1]+1);
482: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
483: for (j=0; j<nz; j++) {
484: /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
485: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
486: v = rtmp + bs2*pj[j];
487: Kernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
488: pv += bs2;
489: }
490: PetscLogFlops(128*nz+112); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
491: }
492: }
494: /* finished row so stick it into b->a */
495: /* L part */
496: pv = b->a + bs2*bi[i] ;
497: pj = b->j + bi[i] ;
498: nz = bi[i+1] - bi[i];
499: for (j=0; j<nz; j++) {
500: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
501: }
503: /* Mark diagonal and invert diagonal for simpler triangular solves */
504: pv = b->a + bs2*bdiag[i];
505: pj = b->j + bdiag[i];
506: PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));
507: /* Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work); */
508: Kernel_A_gets_inverse_A_4(pv,shift);
509:
510: /* U part */
511: pv = b->a + bs2*(bdiag[i+1]+1);
512: pj = b->j + bdiag[i+1]+1;
513: nz = bdiag[i] - bdiag[i+1] - 1;
514: for (j=0; j<nz; j++){
515: PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));
516: }
517: }
518: PetscFree2(rtmp,mwork);
519: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
520: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
521: C->assembled = PETSC_TRUE;
522: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
523: return(0);
524: }
526: #if defined(PETSC_HAVE_SSE)
528: #include PETSC_HAVE_SSE
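The SSE variant below implements the same block algorithm with hand-written packed-single intrinsics wrapped in PETSc's SSE_* macros; it assumes single-precision MatScalar so that a 4x4 block fits in four 128-bit registers, and the LOADL/LOADH and STOREL/STOREH pairs move each block in 64-bit halves. The plain-C effect of the zeroing and copying idioms used below is simply (a sketch of the effect, not a drop-in replacement):

   /* effect of the XOR_PS + eight STOREL/STOREH stores on one block */
   PetscMemzero(rtmp+16*bjtmp[j],16*sizeof(MatScalar));

   /* effect of the eight LOADL/LOADH + STOREL/STOREH pairs that copy one block */
   PetscMemcpy(rtmp+16*ajtmpold[j],v,16*sizeof(MatScalar));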
530: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
533: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
534: {
535: Mat C = B;
536: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
537: int i,j,n = a->mbs;
538: int *bj = b->j,*bjtmp,*pj;
539: int row;
540: int *ajtmpold,nz,*bi=b->i;
541: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
542: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
543: MatScalar *ba = b->a,*aa = a->a;
544: int nonzero=0;
545: /* int nonzero=0,colscale = 16; */
546: PetscTruth pivotinblocks = b->pivotinblocks;
547: PetscReal shift = info->shiftamount;
550: SSE_SCOPE_BEGIN;
552: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
553: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
554: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
555: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
556: /* if ((unsigned long)bj==(unsigned long)aj) { */
557: /* colscale = 4; */
558: /* } */
559: for (i=0; i<n; i++) {
560: nz = bi[i+1] - bi[i];
561: bjtmp = bj + bi[i];
562: /* zero out the 4x4 block accumulators */
563: /* zero out one register */
564: XOR_PS(XMM7,XMM7);
565: for (j=0; j<nz; j++) {
566: x = rtmp+16*bjtmp[j];
567: /* x = rtmp+4*bjtmp[j]; */
568: SSE_INLINE_BEGIN_1(x)
569: /* Copy zero register to memory locations */
570: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
571: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
572: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
573: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
574: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
575: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
576: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
577: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
578: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
579: SSE_INLINE_END_1;
580: }
581: /* load in initial (unfactored) row */
582: nz = ai[i+1] - ai[i];
583: ajtmpold = aj + ai[i];
584: v = aa + 16*ai[i];
585: for (j=0; j<nz; j++) {
586: x = rtmp+16*ajtmpold[j];
587: /* x = rtmp+colscale*ajtmpold[j]; */
588: /* Copy v block into x block */
589: SSE_INLINE_BEGIN_2(v,x)
590: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
591: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
592: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
594: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
595: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
597: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
598: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
600: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
601: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
603: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
604: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
606: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
607: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
609: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
610: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
612: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
613: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
614: SSE_INLINE_END_2;
616: v += 16;
617: }
618: /* row = (*bjtmp++)/4; */
619: row = *bjtmp++;
620: while (row < i) {
621: pc = rtmp + 16*row;
622: SSE_INLINE_BEGIN_1(pc)
623: /* Load block from lower triangle */
624: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
625: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
626: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
628: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
629: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
631: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
632: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
634: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
635: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
637: /* Compare block to zero block */
639: SSE_COPY_PS(XMM4,XMM7)
640: SSE_CMPNEQ_PS(XMM4,XMM0)
642: SSE_COPY_PS(XMM5,XMM7)
643: SSE_CMPNEQ_PS(XMM5,XMM1)
645: SSE_COPY_PS(XMM6,XMM7)
646: SSE_CMPNEQ_PS(XMM6,XMM2)
648: SSE_CMPNEQ_PS(XMM7,XMM3)
650: /* Reduce the comparisons to one SSE register */
651: SSE_OR_PS(XMM6,XMM7)
652: SSE_OR_PS(XMM5,XMM4)
653: SSE_OR_PS(XMM5,XMM6)
654: SSE_INLINE_END_1;
656: /* Reduce the one SSE register to an integer register for branching */
657: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
658: MOVEMASK(nonzero,XMM5);
660: /* If block is nonzero ... */
661: if (nonzero) {
662: pv = ba + 16*diag_offset[row];
663: PREFETCH_L1(&pv[16]);
664: pj = bj + diag_offset[row] + 1;
666: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
667: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
668: /* but the diagonal was inverted already */
669: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
671: SSE_INLINE_BEGIN_2(pv,pc)
672: /* Column 0, product is accumulated in XMM4 */
673: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
674: SSE_SHUFFLE(XMM4,XMM4,0x00)
675: SSE_MULT_PS(XMM4,XMM0)
677: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
678: SSE_SHUFFLE(XMM5,XMM5,0x00)
679: SSE_MULT_PS(XMM5,XMM1)
680: SSE_ADD_PS(XMM4,XMM5)
682: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
683: SSE_SHUFFLE(XMM6,XMM6,0x00)
684: SSE_MULT_PS(XMM6,XMM2)
685: SSE_ADD_PS(XMM4,XMM6)
687: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
688: SSE_SHUFFLE(XMM7,XMM7,0x00)
689: SSE_MULT_PS(XMM7,XMM3)
690: SSE_ADD_PS(XMM4,XMM7)
692: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
693: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
695: /* Column 1, product is accumulated in XMM5 */
696: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
697: SSE_SHUFFLE(XMM5,XMM5,0x00)
698: SSE_MULT_PS(XMM5,XMM0)
700: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
701: SSE_SHUFFLE(XMM6,XMM6,0x00)
702: SSE_MULT_PS(XMM6,XMM1)
703: SSE_ADD_PS(XMM5,XMM6)
705: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
706: SSE_SHUFFLE(XMM7,XMM7,0x00)
707: SSE_MULT_PS(XMM7,XMM2)
708: SSE_ADD_PS(XMM5,XMM7)
710: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
711: SSE_SHUFFLE(XMM6,XMM6,0x00)
712: SSE_MULT_PS(XMM6,XMM3)
713: SSE_ADD_PS(XMM5,XMM6)
715: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
716: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
718: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
720: /* Column 2, product is accumulated in XMM6 */
721: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
722: SSE_SHUFFLE(XMM6,XMM6,0x00)
723: SSE_MULT_PS(XMM6,XMM0)
725: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
726: SSE_SHUFFLE(XMM7,XMM7,0x00)
727: SSE_MULT_PS(XMM7,XMM1)
728: SSE_ADD_PS(XMM6,XMM7)
730: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
731: SSE_SHUFFLE(XMM7,XMM7,0x00)
732: SSE_MULT_PS(XMM7,XMM2)
733: SSE_ADD_PS(XMM6,XMM7)
735: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
736: SSE_SHUFFLE(XMM7,XMM7,0x00)
737: SSE_MULT_PS(XMM7,XMM3)
738: SSE_ADD_PS(XMM6,XMM7)
739:
740: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
741: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
743: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
744: /* Column 3, product is accumulated in XMM0 */
745: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
746: SSE_SHUFFLE(XMM7,XMM7,0x00)
747: SSE_MULT_PS(XMM0,XMM7)
749: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
750: SSE_SHUFFLE(XMM7,XMM7,0x00)
751: SSE_MULT_PS(XMM1,XMM7)
752: SSE_ADD_PS(XMM0,XMM1)
754: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
755: SSE_SHUFFLE(XMM1,XMM1,0x00)
756: SSE_MULT_PS(XMM1,XMM2)
757: SSE_ADD_PS(XMM0,XMM1)
759: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
760: SSE_SHUFFLE(XMM7,XMM7,0x00)
761: SSE_MULT_PS(XMM3,XMM7)
762: SSE_ADD_PS(XMM0,XMM3)
764: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
765: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
767: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
768: /* This is code to be maintained and read by humans after all. */
769: /* Copy Multiplier Col 3 into XMM3 */
770: SSE_COPY_PS(XMM3,XMM0)
771: /* Copy Multiplier Col 2 into XMM2 */
772: SSE_COPY_PS(XMM2,XMM6)
773: /* Copy Multiplier Col 1 into XMM1 */
774: SSE_COPY_PS(XMM1,XMM5)
775: /* Copy Multiplier Col 0 into XMM0 */
776: SSE_COPY_PS(XMM0,XMM4)
777: SSE_INLINE_END_2;
779: /* Update the row: */
780: nz = bi[row+1] - diag_offset[row] - 1;
781: pv += 16;
782: for (j=0; j<nz; j++) {
783: PREFETCH_L1(&pv[16]);
784: x = rtmp + 16*pj[j];
785: /* x = rtmp + 4*pj[j]; */
787: /* X:=X-M*PV, One column at a time */
788: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
789: SSE_INLINE_BEGIN_2(x,pv)
790: /* Load First Column of X*/
791: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
792: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
794: /* Matrix-Vector Product: */
795: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
796: SSE_SHUFFLE(XMM5,XMM5,0x00)
797: SSE_MULT_PS(XMM5,XMM0)
798: SSE_SUB_PS(XMM4,XMM5)
800: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
801: SSE_SHUFFLE(XMM6,XMM6,0x00)
802: SSE_MULT_PS(XMM6,XMM1)
803: SSE_SUB_PS(XMM4,XMM6)
805: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
806: SSE_SHUFFLE(XMM7,XMM7,0x00)
807: SSE_MULT_PS(XMM7,XMM2)
808: SSE_SUB_PS(XMM4,XMM7)
810: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
811: SSE_SHUFFLE(XMM5,XMM5,0x00)
812: SSE_MULT_PS(XMM5,XMM3)
813: SSE_SUB_PS(XMM4,XMM5)
815: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
816: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
818: /* Second Column */
819: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
820: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
822: /* Matrix-Vector Product: */
823: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
824: SSE_SHUFFLE(XMM6,XMM6,0x00)
825: SSE_MULT_PS(XMM6,XMM0)
826: SSE_SUB_PS(XMM5,XMM6)
828: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
829: SSE_SHUFFLE(XMM7,XMM7,0x00)
830: SSE_MULT_PS(XMM7,XMM1)
831: SSE_SUB_PS(XMM5,XMM7)
833: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
834: SSE_SHUFFLE(XMM4,XMM4,0x00)
835: SSE_MULT_PS(XMM4,XMM2)
836: SSE_SUB_PS(XMM5,XMM4)
838: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
839: SSE_SHUFFLE(XMM6,XMM6,0x00)
840: SSE_MULT_PS(XMM6,XMM3)
841: SSE_SUB_PS(XMM5,XMM6)
842:
843: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
844: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
846: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
848: /* Third Column */
849: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
850: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
852: /* Matrix-Vector Product: */
853: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
854: SSE_SHUFFLE(XMM7,XMM7,0x00)
855: SSE_MULT_PS(XMM7,XMM0)
856: SSE_SUB_PS(XMM6,XMM7)
858: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
859: SSE_SHUFFLE(XMM4,XMM4,0x00)
860: SSE_MULT_PS(XMM4,XMM1)
861: SSE_SUB_PS(XMM6,XMM4)
863: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
864: SSE_SHUFFLE(XMM5,XMM5,0x00)
865: SSE_MULT_PS(XMM5,XMM2)
866: SSE_SUB_PS(XMM6,XMM5)
868: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
869: SSE_SHUFFLE(XMM7,XMM7,0x00)
870: SSE_MULT_PS(XMM7,XMM3)
871: SSE_SUB_PS(XMM6,XMM7)
872:
873: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
874: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
875:
876: /* Fourth Column */
877: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
878: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
880: /* Matrix-Vector Product: */
881: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
882: SSE_SHUFFLE(XMM5,XMM5,0x00)
883: SSE_MULT_PS(XMM5,XMM0)
884: SSE_SUB_PS(XMM4,XMM5)
886: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
887: SSE_SHUFFLE(XMM6,XMM6,0x00)
888: SSE_MULT_PS(XMM6,XMM1)
889: SSE_SUB_PS(XMM4,XMM6)
891: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
892: SSE_SHUFFLE(XMM7,XMM7,0x00)
893: SSE_MULT_PS(XMM7,XMM2)
894: SSE_SUB_PS(XMM4,XMM7)
896: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
897: SSE_SHUFFLE(XMM5,XMM5,0x00)
898: SSE_MULT_PS(XMM5,XMM3)
899: SSE_SUB_PS(XMM4,XMM5)
900:
901: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
902: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
903: SSE_INLINE_END_2;
904: pv += 16;
905: }
906: PetscLogFlops(128.0*nz+112.0);
907: }
908: row = *bjtmp++;
909: /* row = (*bjtmp++)/4; */
910: }
911: /* finished row so stick it into b->a */
912: pv = ba + 16*bi[i];
913: pj = bj + bi[i];
914: nz = bi[i+1] - bi[i];
916: /* Copy x block back into pv block */
917: for (j=0; j<nz; j++) {
918: x = rtmp+16*pj[j];
919: /* x = rtmp+4*pj[j]; */
921: SSE_INLINE_BEGIN_2(x,pv)
922: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
923: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
924: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
926: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
927: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
929: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
930: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
932: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
933: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
935: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
936: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
938: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
939: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
941: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
942: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
944: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
945: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
946: SSE_INLINE_END_2;
947: pv += 16;
948: }
949: /* invert diagonal block */
950: w = ba + 16*diag_offset[i];
951: if (pivotinblocks) {
952: Kernel_A_gets_inverse_A_4(w,shift);
953: } else {
954: Kernel_A_gets_inverse_A_4_nopivot(w);
955: }
956: /* Kernel_A_gets_inverse_A_4_SSE(w); */
957: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
958: }
960: PetscFree(rtmp);
961: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
962: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
963: C->assembled = PETSC_TRUE;
964: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
965: /* Flop Count from inverting diagonal blocks */
966: SSE_SCOPE_END;
967: return(0);
968: }
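The two _usj variants that follow differ only in reading the block-column index array b->j as unsigned short (presumably "usj" = unsigned short j), which halves the index memory traffic. That reinterpretation is only meaningful when the block-column indices have actually been packed into 16 bits, i.e. when the number of block rows is below 65536. A hypothetical guard, not present in this file, would look like:

   if (a->mbs > 65535) SETERRQ(PETSC_ERR_SUP,"unsigned short column indices require fewer than 65536 block rows");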
972: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
973: {
974: Mat A=C;
975: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
977: int i,j,n = a->mbs;
978: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
979: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
980: unsigned int row;
981: int nz,*bi=b->i;
982: int *diag_offset = b->diag,*ai=a->i;
983: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
984: MatScalar *ba = b->a,*aa = a->a;
985: int nonzero=0;
986: /* int nonzero=0,colscale = 16; */
987: PetscTruth pivotinblocks = b->pivotinblocks;
988: PetscReal shift = info->shiftamount;
991: SSE_SCOPE_BEGIN;
993: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
994: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
995: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
996: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
997: /* if ((unsigned long)bj==(unsigned long)aj) { */
998: /* colscale = 4; */
999: /* } */
1000:
1001: for (i=0; i<n; i++) {
1002: nz = bi[i+1] - bi[i];
1003: bjtmp = bj + bi[i];
1004: /* zero out the 4x4 block accumulators */
1005: /* zero out one register */
1006: XOR_PS(XMM7,XMM7);
1007: for (j=0; j<nz; j++) {
1008: x = rtmp+16*((unsigned int)bjtmp[j]);
1009: /* x = rtmp+4*bjtmp[j]; */
1010: SSE_INLINE_BEGIN_1(x)
1011: /* Copy zero register to memory locations */
1012: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1013: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1014: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1015: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1016: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1017: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1018: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1019: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1020: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1021: SSE_INLINE_END_1;
1022: }
1023: /* load in initial (unfactored) row */
1024: nz = ai[i+1] - ai[i];
1025: ajtmp = aj + ai[i];
1026: v = aa + 16*ai[i];
1027: for (j=0; j<nz; j++) {
1028: x = rtmp+16*((unsigned int)ajtmp[j]);
1029: /* x = rtmp+colscale*ajtmp[j]; */
1030: /* Copy v block into x block */
1031: SSE_INLINE_BEGIN_2(v,x)
1032: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1033: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1034: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1036: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1037: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1039: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1040: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1042: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1043: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1045: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1046: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1048: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1049: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1051: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1052: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1054: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1055: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1056: SSE_INLINE_END_2;
1058: v += 16;
1059: }
1060: /* row = (*bjtmp++)/4; */
1061: row = (unsigned int)(*bjtmp++);
1062: while (row < i) {
1063: pc = rtmp + 16*row;
1064: SSE_INLINE_BEGIN_1(pc)
1065: /* Load block from lower triangle */
1066: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1067: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1068: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1070: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1071: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1073: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1074: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1076: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1077: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1079: /* Compare block to zero block */
1081: SSE_COPY_PS(XMM4,XMM7)
1082: SSE_CMPNEQ_PS(XMM4,XMM0)
1084: SSE_COPY_PS(XMM5,XMM7)
1085: SSE_CMPNEQ_PS(XMM5,XMM1)
1087: SSE_COPY_PS(XMM6,XMM7)
1088: SSE_CMPNEQ_PS(XMM6,XMM2)
1090: SSE_CMPNEQ_PS(XMM7,XMM3)
1092: /* Reduce the comparisons to one SSE register */
1093: SSE_OR_PS(XMM6,XMM7)
1094: SSE_OR_PS(XMM5,XMM4)
1095: SSE_OR_PS(XMM5,XMM6)
1096: SSE_INLINE_END_1;
1098: /* Reduce the one SSE register to an integer register for branching */
1099: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1100: MOVEMASK(nonzero,XMM5);
1102: /* If block is nonzero ... */
1103: if (nonzero) {
1104: pv = ba + 16*diag_offset[row];
1105: PREFETCH_L1(&pv[16]);
1106: pj = bj + diag_offset[row] + 1;
1108: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1109: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1110: /* but the diagonal was inverted already */
1111: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1113: SSE_INLINE_BEGIN_2(pv,pc)
1114: /* Column 0, product is accumulated in XMM4 */
1115: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1116: SSE_SHUFFLE(XMM4,XMM4,0x00)
1117: SSE_MULT_PS(XMM4,XMM0)
1119: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1120: SSE_SHUFFLE(XMM5,XMM5,0x00)
1121: SSE_MULT_PS(XMM5,XMM1)
1122: SSE_ADD_PS(XMM4,XMM5)
1124: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1125: SSE_SHUFFLE(XMM6,XMM6,0x00)
1126: SSE_MULT_PS(XMM6,XMM2)
1127: SSE_ADD_PS(XMM4,XMM6)
1129: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1130: SSE_SHUFFLE(XMM7,XMM7,0x00)
1131: SSE_MULT_PS(XMM7,XMM3)
1132: SSE_ADD_PS(XMM4,XMM7)
1134: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1135: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1137: /* Column 1, product is accumulated in XMM5 */
1138: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1139: SSE_SHUFFLE(XMM5,XMM5,0x00)
1140: SSE_MULT_PS(XMM5,XMM0)
1142: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1143: SSE_SHUFFLE(XMM6,XMM6,0x00)
1144: SSE_MULT_PS(XMM6,XMM1)
1145: SSE_ADD_PS(XMM5,XMM6)
1147: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1148: SSE_SHUFFLE(XMM7,XMM7,0x00)
1149: SSE_MULT_PS(XMM7,XMM2)
1150: SSE_ADD_PS(XMM5,XMM7)
1152: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1153: SSE_SHUFFLE(XMM6,XMM6,0x00)
1154: SSE_MULT_PS(XMM6,XMM3)
1155: SSE_ADD_PS(XMM5,XMM6)
1157: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1158: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1160: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1162: /* Column 2, product is accumulated in XMM6 */
1163: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1164: SSE_SHUFFLE(XMM6,XMM6,0x00)
1165: SSE_MULT_PS(XMM6,XMM0)
1167: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1168: SSE_SHUFFLE(XMM7,XMM7,0x00)
1169: SSE_MULT_PS(XMM7,XMM1)
1170: SSE_ADD_PS(XMM6,XMM7)
1172: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1173: SSE_SHUFFLE(XMM7,XMM7,0x00)
1174: SSE_MULT_PS(XMM7,XMM2)
1175: SSE_ADD_PS(XMM6,XMM7)
1177: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1178: SSE_SHUFFLE(XMM7,XMM7,0x00)
1179: SSE_MULT_PS(XMM7,XMM3)
1180: SSE_ADD_PS(XMM6,XMM7)
1181:
1182: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1183: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1185: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1186: /* Column 3, product is accumulated in XMM0 */
1187: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1188: SSE_SHUFFLE(XMM7,XMM7,0x00)
1189: SSE_MULT_PS(XMM0,XMM7)
1191: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1192: SSE_SHUFFLE(XMM7,XMM7,0x00)
1193: SSE_MULT_PS(XMM1,XMM7)
1194: SSE_ADD_PS(XMM0,XMM1)
1196: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1197: SSE_SHUFFLE(XMM1,XMM1,0x00)
1198: SSE_MULT_PS(XMM1,XMM2)
1199: SSE_ADD_PS(XMM0,XMM1)
1201: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1202: SSE_SHUFFLE(XMM7,XMM7,0x00)
1203: SSE_MULT_PS(XMM3,XMM7)
1204: SSE_ADD_PS(XMM0,XMM3)
1206: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1207: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1209: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1210: /* This is code to be maintained and read by humans after all. */
1211: /* Copy Multiplier Col 3 into XMM3 */
1212: SSE_COPY_PS(XMM3,XMM0)
1213: /* Copy Multiplier Col 2 into XMM2 */
1214: SSE_COPY_PS(XMM2,XMM6)
1215: /* Copy Multiplier Col 1 into XMM1 */
1216: SSE_COPY_PS(XMM1,XMM5)
1217: /* Copy Multiplier Col 0 into XMM0 */
1218: SSE_COPY_PS(XMM0,XMM4)
1219: SSE_INLINE_END_2;
1221: /* Update the row: */
1222: nz = bi[row+1] - diag_offset[row] - 1;
1223: pv += 16;
1224: for (j=0; j<nz; j++) {
1225: PREFETCH_L1(&pv[16]);
1226: x = rtmp + 16*((unsigned int)pj[j]);
1227: /* x = rtmp + 4*pj[j]; */
1229: /* X:=X-M*PV, One column at a time */
1230: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1231: SSE_INLINE_BEGIN_2(x,pv)
1232: /* Load First Column of X*/
1233: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1234: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1236: /* Matrix-Vector Product: */
1237: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1238: SSE_SHUFFLE(XMM5,XMM5,0x00)
1239: SSE_MULT_PS(XMM5,XMM0)
1240: SSE_SUB_PS(XMM4,XMM5)
1242: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1243: SSE_SHUFFLE(XMM6,XMM6,0x00)
1244: SSE_MULT_PS(XMM6,XMM1)
1245: SSE_SUB_PS(XMM4,XMM6)
1247: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1248: SSE_SHUFFLE(XMM7,XMM7,0x00)
1249: SSE_MULT_PS(XMM7,XMM2)
1250: SSE_SUB_PS(XMM4,XMM7)
1252: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1253: SSE_SHUFFLE(XMM5,XMM5,0x00)
1254: SSE_MULT_PS(XMM5,XMM3)
1255: SSE_SUB_PS(XMM4,XMM5)
1257: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1258: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1260: /* Second Column */
1261: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1262: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1264: /* Matrix-Vector Product: */
1265: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1266: SSE_SHUFFLE(XMM6,XMM6,0x00)
1267: SSE_MULT_PS(XMM6,XMM0)
1268: SSE_SUB_PS(XMM5,XMM6)
1270: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1271: SSE_SHUFFLE(XMM7,XMM7,0x00)
1272: SSE_MULT_PS(XMM7,XMM1)
1273: SSE_SUB_PS(XMM5,XMM7)
1275: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1276: SSE_SHUFFLE(XMM4,XMM4,0x00)
1277: SSE_MULT_PS(XMM4,XMM2)
1278: SSE_SUB_PS(XMM5,XMM4)
1280: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1281: SSE_SHUFFLE(XMM6,XMM6,0x00)
1282: SSE_MULT_PS(XMM6,XMM3)
1283: SSE_SUB_PS(XMM5,XMM6)
1284:
1285: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1286: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1288: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1290: /* Third Column */
1291: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1292: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1294: /* Matrix-Vector Product: */
1295: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1296: SSE_SHUFFLE(XMM7,XMM7,0x00)
1297: SSE_MULT_PS(XMM7,XMM0)
1298: SSE_SUB_PS(XMM6,XMM7)
1300: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1301: SSE_SHUFFLE(XMM4,XMM4,0x00)
1302: SSE_MULT_PS(XMM4,XMM1)
1303: SSE_SUB_PS(XMM6,XMM4)
1305: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1306: SSE_SHUFFLE(XMM5,XMM5,0x00)
1307: SSE_MULT_PS(XMM5,XMM2)
1308: SSE_SUB_PS(XMM6,XMM5)
1310: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1311: SSE_SHUFFLE(XMM7,XMM7,0x00)
1312: SSE_MULT_PS(XMM7,XMM3)
1313: SSE_SUB_PS(XMM6,XMM7)
1314:
1315: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1316: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1317:
1318: /* Fourth Column */
1319: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1320: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1322: /* Matrix-Vector Product: */
1323: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1324: SSE_SHUFFLE(XMM5,XMM5,0x00)
1325: SSE_MULT_PS(XMM5,XMM0)
1326: SSE_SUB_PS(XMM4,XMM5)
1328: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1329: SSE_SHUFFLE(XMM6,XMM6,0x00)
1330: SSE_MULT_PS(XMM6,XMM1)
1331: SSE_SUB_PS(XMM4,XMM6)
1333: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1334: SSE_SHUFFLE(XMM7,XMM7,0x00)
1335: SSE_MULT_PS(XMM7,XMM2)
1336: SSE_SUB_PS(XMM4,XMM7)
1338: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1339: SSE_SHUFFLE(XMM5,XMM5,0x00)
1340: SSE_MULT_PS(XMM5,XMM3)
1341: SSE_SUB_PS(XMM4,XMM5)
1342:
1343: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1344: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1345: SSE_INLINE_END_2;
1346: pv += 16;
1347: }
1348: PetscLogFlops(128.0*nz+112.0);
1349: }
1350: row = (unsigned int)(*bjtmp++);
1351: /* row = (*bjtmp++)/4; */
1352: /* bjtmp++; */
1353: }
1354: /* finished row so stick it into b->a */
1355: pv = ba + 16*bi[i];
1356: pj = bj + bi[i];
1357: nz = bi[i+1] - bi[i];
1359: /* Copy x block back into pv block */
1360: for (j=0; j<nz; j++) {
1361: x = rtmp+16*((unsigned int)pj[j]);
1362: /* x = rtmp+4*pj[j]; */
1364: SSE_INLINE_BEGIN_2(x,pv)
1365: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1366: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1367: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1369: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1370: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1372: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1373: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1375: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1376: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1378: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1379: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1381: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1382: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1384: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1385: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1387: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1388: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1389: SSE_INLINE_END_2;
1390: pv += 16;
1391: }
1392: /* invert diagonal block */
1393: w = ba + 16*diag_offset[i];
1394: if (pivotinblocks) {
1395: Kernel_A_gets_inverse_A_4(w,shift);
1396: } else {
1397: Kernel_A_gets_inverse_A_4_nopivot(w);
1398: }
1399: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1400: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1401: }
1403: PetscFree(rtmp);
1404: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1405: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1406: C->assembled = PETSC_TRUE;
1407: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1408: /* Flop Count from inverting diagonal blocks */
1409: SSE_SCOPE_END;
1410: return(0);
1411: }
1415: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1416: {
1417: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1419: int i,j,n = a->mbs;
1420: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1421: unsigned int row;
1422: int *ajtmpold,nz,*bi=b->i;
1423: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1424: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1425: MatScalar *ba = b->a,*aa = a->a;
1426: int nonzero=0;
1427: /* int nonzero=0,colscale = 16; */
1428: PetscTruth pivotinblocks = b->pivotinblocks;
1429: PetscReal shift = info->shiftamount;
1432: SSE_SCOPE_BEGIN;
1434: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1435: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1436: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1437: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1438: /* if ((unsigned long)bj==(unsigned long)aj) { */
1439: /* colscale = 4; */
1440: /* } */
1441: if ((unsigned long)bj==(unsigned long)aj) {
1442: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info));
1443: }
1444:
1445: for (i=0; i<n; i++) {
1446: nz = bi[i+1] - bi[i];
1447: bjtmp = bj + bi[i];
1448: /* zero out the 4x4 block accumulators */
1449: /* zero out one register */
1450: XOR_PS(XMM7,XMM7);
1451: for (j=0; j<nz; j++) {
1452: x = rtmp+16*((unsigned int)bjtmp[j]);
1453: /* x = rtmp+4*bjtmp[j]; */
1454: SSE_INLINE_BEGIN_1(x)
1455: /* Copy zero register to memory locations */
1456: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1457: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1458: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1459: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1460: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1461: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1462: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1463: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1464: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1465: SSE_INLINE_END_1;
1466: }
1467: /* load in initial (unfactored) row */
1468: nz = ai[i+1] - ai[i];
1469: ajtmpold = aj + ai[i];
1470: v = aa + 16*ai[i];
1471: for (j=0; j<nz; j++) {
1472: x = rtmp+16*ajtmpold[j];
1473: /* x = rtmp+colscale*ajtmpold[j]; */
1474: /* Copy v block into x block */
1475: SSE_INLINE_BEGIN_2(v,x)
1476: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1477: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1478: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1480: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1481: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1483: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1484: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1486: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1487: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1489: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1490: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1492: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1493: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1495: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1496: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1498: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1499: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1500: SSE_INLINE_END_2;
1502: v += 16;
1503: }
1504: /* row = (*bjtmp++)/4; */
1505: row = (unsigned int)(*bjtmp++);
1506: while (row < i) {
1507: pc = rtmp + 16*row;
1508: SSE_INLINE_BEGIN_1(pc)
1509: /* Load block from lower triangle */
1510: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1511: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1512: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1514: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1515: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1517: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1518: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1520: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1521: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1523: /* Compare block to zero block */
1525: SSE_COPY_PS(XMM4,XMM7)
1526: SSE_CMPNEQ_PS(XMM4,XMM0)
1528: SSE_COPY_PS(XMM5,XMM7)
1529: SSE_CMPNEQ_PS(XMM5,XMM1)
1531: SSE_COPY_PS(XMM6,XMM7)
1532: SSE_CMPNEQ_PS(XMM6,XMM2)
1534: SSE_CMPNEQ_PS(XMM7,XMM3)
1536: /* Reduce the comparisons to one SSE register */
1537: SSE_OR_PS(XMM6,XMM7)
1538: SSE_OR_PS(XMM5,XMM4)
1539: SSE_OR_PS(XMM5,XMM6)
1540: SSE_INLINE_END_1;
1542: /* Reduce the one SSE register to an integer register for branching */
1543: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1544: MOVEMASK(nonzero,XMM5);
1546: /* If block is nonzero ... */
1547: if (nonzero) {
1548: pv = ba + 16*diag_offset[row];
1549: PREFETCH_L1(&pv[16]);
1550: pj = bj + diag_offset[row] + 1;
1552: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1553: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1554: /* but the diagonal was inverted already */
1555: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1557: SSE_INLINE_BEGIN_2(pv,pc)
1558: /* Column 0, product is accumulated in XMM4 */
1559: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1560: SSE_SHUFFLE(XMM4,XMM4,0x00)
1561: SSE_MULT_PS(XMM4,XMM0)
1563: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1564: SSE_SHUFFLE(XMM5,XMM5,0x00)
1565: SSE_MULT_PS(XMM5,XMM1)
1566: SSE_ADD_PS(XMM4,XMM5)
1568: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1569: SSE_SHUFFLE(XMM6,XMM6,0x00)
1570: SSE_MULT_PS(XMM6,XMM2)
1571: SSE_ADD_PS(XMM4,XMM6)
1573: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1574: SSE_SHUFFLE(XMM7,XMM7,0x00)
1575: SSE_MULT_PS(XMM7,XMM3)
1576: SSE_ADD_PS(XMM4,XMM7)
1578: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1579: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1581: /* Column 1, product is accumulated in XMM5 */
1582: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1583: SSE_SHUFFLE(XMM5,XMM5,0x00)
1584: SSE_MULT_PS(XMM5,XMM0)
1586: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1587: SSE_SHUFFLE(XMM6,XMM6,0x00)
1588: SSE_MULT_PS(XMM6,XMM1)
1589: SSE_ADD_PS(XMM5,XMM6)
1591: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1592: SSE_SHUFFLE(XMM7,XMM7,0x00)
1593: SSE_MULT_PS(XMM7,XMM2)
1594: SSE_ADD_PS(XMM5,XMM7)
1596: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1597: SSE_SHUFFLE(XMM6,XMM6,0x00)
1598: SSE_MULT_PS(XMM6,XMM3)
1599: SSE_ADD_PS(XMM5,XMM6)
1601: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1602: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1604: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1606: /* Column 2, product is accumulated in XMM6 */
1607: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1608: SSE_SHUFFLE(XMM6,XMM6,0x00)
1609: SSE_MULT_PS(XMM6,XMM0)
1611: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1612: SSE_SHUFFLE(XMM7,XMM7,0x00)
1613: SSE_MULT_PS(XMM7,XMM1)
1614: SSE_ADD_PS(XMM6,XMM7)
1616: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1617: SSE_SHUFFLE(XMM7,XMM7,0x00)
1618: SSE_MULT_PS(XMM7,XMM2)
1619: SSE_ADD_PS(XMM6,XMM7)
1621: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1622: SSE_SHUFFLE(XMM7,XMM7,0x00)
1623: SSE_MULT_PS(XMM7,XMM3)
1624: SSE_ADD_PS(XMM6,XMM7)
1625:
1626: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1627: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1629: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1630: /* Column 3, product is accumulated in XMM0 */
1631: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1632: SSE_SHUFFLE(XMM7,XMM7,0x00)
1633: SSE_MULT_PS(XMM0,XMM7)
1635: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1636: SSE_SHUFFLE(XMM7,XMM7,0x00)
1637: SSE_MULT_PS(XMM1,XMM7)
1638: SSE_ADD_PS(XMM0,XMM1)
1640: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1641: SSE_SHUFFLE(XMM1,XMM1,0x00)
1642: SSE_MULT_PS(XMM1,XMM2)
1643: SSE_ADD_PS(XMM0,XMM1)
1645: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1646: SSE_SHUFFLE(XMM7,XMM7,0x00)
1647: SSE_MULT_PS(XMM3,XMM7)
1648: SSE_ADD_PS(XMM0,XMM3)
1650: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1651: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1653: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1654: /* This is code to be maintained and read by humans after all. */
1655: /* Copy Multiplier Col 3 into XMM3 */
1656: SSE_COPY_PS(XMM3,XMM0)
1657: /* Copy Multiplier Col 2 into XMM2 */
1658: SSE_COPY_PS(XMM2,XMM6)
1659: /* Copy Multiplier Col 1 into XMM1 */
1660: SSE_COPY_PS(XMM1,XMM5)
1661: /* Copy Multiplier Col 0 into XMM0 */
1662: SSE_COPY_PS(XMM0,XMM4)
1663: SSE_INLINE_END_2;
1665: /* Update the row: */
1666: nz = bi[row+1] - diag_offset[row] - 1;
1667: pv += 16;
1668: for (j=0; j<nz; j++) {
1669: PREFETCH_L1(&pv[16]);
1670: x = rtmp + 16*((unsigned int)pj[j]);
1671: /* x = rtmp + 4*pj[j]; */
1673: /* X:=X-M*PV, One column at a time */
1674: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1675: SSE_INLINE_BEGIN_2(x,pv)
1676: /* Load First Column of X*/
1677: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1678: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1680: /* Matrix-Vector Product: */
1681: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1682: SSE_SHUFFLE(XMM5,XMM5,0x00)
1683: SSE_MULT_PS(XMM5,XMM0)
1684: SSE_SUB_PS(XMM4,XMM5)
1686: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1687: SSE_SHUFFLE(XMM6,XMM6,0x00)
1688: SSE_MULT_PS(XMM6,XMM1)
1689: SSE_SUB_PS(XMM4,XMM6)
1691: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1692: SSE_SHUFFLE(XMM7,XMM7,0x00)
1693: SSE_MULT_PS(XMM7,XMM2)
1694: SSE_SUB_PS(XMM4,XMM7)
1696: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1697: SSE_SHUFFLE(XMM5,XMM5,0x00)
1698: SSE_MULT_PS(XMM5,XMM3)
1699: SSE_SUB_PS(XMM4,XMM5)
1701: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1702: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1704: /* Second Column */
1705: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1706: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1708: /* Matrix-Vector Product: */
1709: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1710: SSE_SHUFFLE(XMM6,XMM6,0x00)
1711: SSE_MULT_PS(XMM6,XMM0)
1712: SSE_SUB_PS(XMM5,XMM6)
1714: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1715: SSE_SHUFFLE(XMM7,XMM7,0x00)
1716: SSE_MULT_PS(XMM7,XMM1)
1717: SSE_SUB_PS(XMM5,XMM7)
1719: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1720: SSE_SHUFFLE(XMM4,XMM4,0x00)
1721: SSE_MULT_PS(XMM4,XMM2)
1722: SSE_SUB_PS(XMM5,XMM4)
1724: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1725: SSE_SHUFFLE(XMM6,XMM6,0x00)
1726: SSE_MULT_PS(XMM6,XMM3)
1727: SSE_SUB_PS(XMM5,XMM6)
1728:
1729: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1730: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1732: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1734: /* Third Column */
1735: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1736: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1738: /* Matrix-Vector Product: */
1739: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1740: SSE_SHUFFLE(XMM7,XMM7,0x00)
1741: SSE_MULT_PS(XMM7,XMM0)
1742: SSE_SUB_PS(XMM6,XMM7)
1744: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1745: SSE_SHUFFLE(XMM4,XMM4,0x00)
1746: SSE_MULT_PS(XMM4,XMM1)
1747: SSE_SUB_PS(XMM6,XMM4)
1749: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1750: SSE_SHUFFLE(XMM5,XMM5,0x00)
1751: SSE_MULT_PS(XMM5,XMM2)
1752: SSE_SUB_PS(XMM6,XMM5)
1754: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1755: SSE_SHUFFLE(XMM7,XMM7,0x00)
1756: SSE_MULT_PS(XMM7,XMM3)
1757: SSE_SUB_PS(XMM6,XMM7)
1758:
1759: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1760: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1761:
1762: /* Fourth Column */
1763: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1764: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1766: /* Matrix-Vector Product: */
1767: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1768: SSE_SHUFFLE(XMM5,XMM5,0x00)
1769: SSE_MULT_PS(XMM5,XMM0)
1770: SSE_SUB_PS(XMM4,XMM5)
1772: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1773: SSE_SHUFFLE(XMM6,XMM6,0x00)
1774: SSE_MULT_PS(XMM6,XMM1)
1775: SSE_SUB_PS(XMM4,XMM6)
1777: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1778: SSE_SHUFFLE(XMM7,XMM7,0x00)
1779: SSE_MULT_PS(XMM7,XMM2)
1780: SSE_SUB_PS(XMM4,XMM7)
1782: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1783: SSE_SHUFFLE(XMM5,XMM5,0x00)
1784: SSE_MULT_PS(XMM5,XMM3)
1785: SSE_SUB_PS(XMM4,XMM5)
1786:
1787: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1788: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1789: SSE_INLINE_END_2;
1790: pv += 16;
1791: }
1792: PetscLogFlops(128.0*nz+112.0);
1793: }
1794: row = (unsigned int)(*bjtmp++);
1795: /* row = (*bjtmp++)/4; */
1796: /* bjtmp++; */
1797: }
1798: /* finished row so stick it into b->a */
1799: pv = ba + 16*bi[i];
1800: pj = bj + bi[i];
1801: nz = bi[i+1] - bi[i];
1803: /* Copy x block back into pv block */
1804: for (j=0; j<nz; j++) {
1805: x = rtmp+16*((unsigned int)pj[j]);
1806: /* x = rtmp+4*pj[j]; */
1808: SSE_INLINE_BEGIN_2(x,pv)
1809: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1810: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1811: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1813: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1814: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1816: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1817: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1819: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1820: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1822: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1823: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1825: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1826: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1828: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1829: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1831: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1832: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1833: SSE_INLINE_END_2;
1834: pv += 16;
1835: }
1836: /* invert diagonal block */
1837: w = ba + 16*diag_offset[i];
1838: if (pivotinblocks) {
1839: Kernel_A_gets_inverse_A_4(w,shift);
1840: } else {
1841: Kernel_A_gets_inverse_A_4_nopivot(w);
1842: }
1843: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1844: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1845: }
1847: PetscFree(rtmp);
1848: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1849: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1850: C->assembled = PETSC_TRUE;
1851: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1852: /* Flop Count from inverting diagonal blocks */
1853: SSE_SCOPE_END;
1854: return(0);
1855: }
1857: #endif