Actual source code: baijfact12.c
1: /*$Id: baijfact12.c,v 1.17 2001/08/31 16:22:11 bsmith Exp $*/
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include "src/mat/impls/baij/seq/baij.h"
6: #include "src/inline/ilu.h"
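/*
   This file provides the numeric LU factorization kernels for SeqBAIJ matrices
   with a fixed block size of 4 and natural (identity) ordering: a portable C
   version and, when PETSC_HAVE_SSE is defined, SSE variants, including "usj"
   variants that read the block-column index arrays as unsigned shorts.
*/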
10: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,Mat *B)
11: {
12: /*
13: Default version for when blocks are 4 by 4, using natural ordering
14: */
15: Mat C = *B;
16: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
17: int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
18: int *ajtmpold,*ajtmp,nz,row;
19: int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
20: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
21: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
22: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
23: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
24: MatScalar m13,m14,m15,m16;
25: MatScalar *ba = b->a,*aa = a->a;
26: PetscTruth pivotinblocks = b->pivotinblocks;
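/*
   rtmp is a dense work row of 4x4 blocks, one 16-entry block per block column.
   For each block row i: zero the blocks of the factored sparsity pattern, copy
   in row i of A, eliminate against every earlier row that appears in the
   pattern, copy the result into b->a, and finally invert the diagonal block in
   place so that later rows can multiply by it rather than solve with it.
*/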
29: ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
31: for (i=0; i<n; i++) {
32: nz = bi[i+1] - bi[i];
33: ajtmp = bj + bi[i];
34: for (j=0; j<nz; j++) {
35: x = rtmp+16*ajtmp[j];
36: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
37: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
38: }
39: /* load in the initial (unfactored) row */
40: nz = ai[i+1] - ai[i];
41: ajtmpold = aj + ai[i];
42: v = aa + 16*ai[i];
43: for (j=0; j<nz; j++) {
44: x = rtmp+16*ajtmpold[j];
45: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
46: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
47: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
48: x[14] = v[14]; x[15] = v[15];
49: v += 16;
50: }
51: row = *ajtmp++;
52: while (row < i) {
53: pc = rtmp + 16*row;
54: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
55: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
56: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
57: p15 = pc[14]; p16 = pc[15];
58: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
59: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
60: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
61: || p16 != 0.0) {
62: pv = ba + 16*diag_offset[row];
63: pj = bj + diag_offset[row] + 1;
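/* pv points at the already-inverted diagonal block of row `row'; form the
   multiplier block pc = pc * inv(diagonal block) as a 4x4 matrix-matrix
   product (the formulas below correspond to column-major storage of the
   entries within each block). */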
64: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
65: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
66: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
67: x15 = pv[14]; x16 = pv[15];
68: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
69: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
70: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
71: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
73: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
74: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
75: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
76: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
78: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
79: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
80: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
81: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
83: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
84: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
85: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
86: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
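/* Update the rest of the work row: for every block to the right of the
   diagonal in row `row', subtract multiplier*block from the corresponding
   block of rtmp (X := X - M*PV, as in the SSE version below). */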
87: nz = bi[row+1] - diag_offset[row] - 1;
88: pv += 16;
89: for (j=0; j<nz; j++) {
90: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
91: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
92: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
93: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
94: x = rtmp + 16*pj[j];
95: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
96: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
97: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
98: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
100: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
101: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
102: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
103: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
105: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
106: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
107: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
108: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
110: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
111: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
112: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
113: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
115: pv += 16;
116: }
117: PetscLogFlops(128*nz+112);
118: }
119: row = *ajtmp++;
120: }
121: /* finished row so stick it into b->a */
122: pv = ba + 16*bi[i];
123: pj = bj + bi[i];
124: nz = bi[i+1] - bi[i];
125: for (j=0; j<nz; j++) {
126: x = rtmp+16*pj[j];
127: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
128: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
129: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
130: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
131: pv += 16;
132: }
133: /* invert diagonal block */
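/* The 4x4 diagonal block is inverted in place; pivotinblocks selects whether
   the inversion uses partial pivoting within the block. */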
134: w = ba + 16*diag_offset[i];
135: if (pivotinblocks) {
136: Kernel_A_gets_inverse_A_4(w);
137: } else {
138: Kernel_A_gets_inverse_A_4_nopivot(w);
139: }
140: }
142: ierr = PetscFree(rtmp);CHKERRQ(ierr);
143: C->factor = FACTOR_LU;
144: C->assembled = PETSC_TRUE;
145: PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
146: return(0);
147: }
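/*
   Illustrative sketch only (not part of the PETSc source): a plain-C version of
   the 4x4 block update X := X - M*V that the fully unrolled statements above
   implement. The hypothetical helper below assumes the entries of each block
   are stored column-major, i.e. entry (r,c) of a block lives at offset 4*c+r,
   which is consistent with the unrolled formulas.
*/
static void Sketch_Block4_MultSub(MatScalar *x,const MatScalar *m,const MatScalar *v)
{
  /* hypothetical helper, for illustration only */
  int r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) {
        x[4*c+r] -= m[4*k+r]*v[4*c+k]; /* X(r,c) -= M(r,k)*V(k,c) */
      }
    }
  }
}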
150: #if defined(PETSC_HAVE_SSE)
152: #include PETSC_HAVE_SSE
154: /* SSE version for when blocks are 4 by 4, using natural ordering */
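/*
   Note: the *_PS wrappers below operate on packed single-precision values, so
   these SSE kernels assume MatScalar is a 4-byte float; aa, ba, and rtmp must
   also be 16-byte aligned, which is checked explicitly at the top of each
   routine.
*/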
157: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,Mat *B)
158: {
159: Mat C = *B;
160: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
161: int ierr,i,j,n = a->mbs;
162: int *bj = b->j,*bjtmp,*pj;
163: int row;
164: int *ajtmpold,nz,*bi=b->i;
165: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
166: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
167: MatScalar *ba = b->a,*aa = a->a;
168: int nonzero=0;
169: /* int nonzero=0,colscale = 16; */
170: PetscTruth pivotinblocks = b->pivotinblocks;
173: SSE_SCOPE_BEGIN;
175: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
176: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
177: ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
178: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
179: /* if ((unsigned long)bj==(unsigned long)aj) { */
180: /* colscale = 4; */
181: /* } */
182: for (i=0; i<n; i++) {
183: nz = bi[i+1] - bi[i];
184: bjtmp = bj + bi[i];
185: /* zero out the 4x4 block accumulators */
186: /* zero out one register */
187: XOR_PS(XMM7,XMM7);
188: for (j=0; j<nz; j++) {
189: x = rtmp+16*bjtmp[j];
190: /* x = rtmp+4*bjtmp[j]; */
191: SSE_INLINE_BEGIN_1(x)
192: /* Copy zero register to memory locations */
193: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
194: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
195: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
196: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
197: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
198: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
199: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
200: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
201: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
202: SSE_INLINE_END_1;
203: }
204: /* load in the initial (unfactored) row */
205: nz = ai[i+1] - ai[i];
206: ajtmpold = aj + ai[i];
207: v = aa + 16*ai[i];
208: for (j=0; j<nz; j++) {
209: x = rtmp+16*ajtmpold[j];
210: /* x = rtmp+colscale*ajtmpold[j]; */
211: /* Copy v block into x block */
212: SSE_INLINE_BEGIN_2(v,x)
213: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
214: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
215: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
217: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
218: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
220: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
221: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
223: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
224: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
226: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
227: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
229: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
230: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
232: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
233: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
235: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
236: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
237: SSE_INLINE_END_2;
239: v += 16;
240: }
241: /* row = (*bjtmp++)/4; */
242: row = *bjtmp++;
243: while (row < i) {
244: pc = rtmp + 16*row;
245: SSE_INLINE_BEGIN_1(pc)
246: /* Load block from lower triangle */
247: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
248: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
249: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
251: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
252: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
254: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
255: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
257: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
258: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
260: /* Compare block to zero block */
262: SSE_COPY_PS(XMM4,XMM7)
263: SSE_CMPNEQ_PS(XMM4,XMM0)
265: SSE_COPY_PS(XMM5,XMM7)
266: SSE_CMPNEQ_PS(XMM5,XMM1)
268: SSE_COPY_PS(XMM6,XMM7)
269: SSE_CMPNEQ_PS(XMM6,XMM2)
271: SSE_CMPNEQ_PS(XMM7,XMM3)
273: /* Reduce the comparisons to one SSE register */
274: SSE_OR_PS(XMM6,XMM7)
275: SSE_OR_PS(XMM5,XMM4)
276: SSE_OR_PS(XMM5,XMM6)
277: SSE_INLINE_END_1;
279: /* Reduce the one SSE register to an integer register for branching */
280: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
281: MOVEMASK(nonzero,XMM5);
283: /* If block is nonzero ... */
284: if (nonzero) {
285: pv = ba + 16*diag_offset[row];
286: PREFETCH_L1(&pv[16]);
287: pj = bj + diag_offset[row] + 1;
289: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
290: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
291: /* but the diagonal was inverted already */
292: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
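/* Pattern for each column: broadcast one scalar of the inverted diagonal
   block across a register (SHUFFLE with mask 0x00), multiply it against the
   matching column of the block held in XMM0-XMM3, and accumulate. */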
294: SSE_INLINE_BEGIN_2(pv,pc)
295: /* Column 0, product is accumulated in XMM4 */
296: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
297: SSE_SHUFFLE(XMM4,XMM4,0x00)
298: SSE_MULT_PS(XMM4,XMM0)
300: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
301: SSE_SHUFFLE(XMM5,XMM5,0x00)
302: SSE_MULT_PS(XMM5,XMM1)
303: SSE_ADD_PS(XMM4,XMM5)
305: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
306: SSE_SHUFFLE(XMM6,XMM6,0x00)
307: SSE_MULT_PS(XMM6,XMM2)
308: SSE_ADD_PS(XMM4,XMM6)
310: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
311: SSE_SHUFFLE(XMM7,XMM7,0x00)
312: SSE_MULT_PS(XMM7,XMM3)
313: SSE_ADD_PS(XMM4,XMM7)
315: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
316: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
318: /* Column 1, product is accumulated in XMM5 */
319: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
320: SSE_SHUFFLE(XMM5,XMM5,0x00)
321: SSE_MULT_PS(XMM5,XMM0)
323: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
324: SSE_SHUFFLE(XMM6,XMM6,0x00)
325: SSE_MULT_PS(XMM6,XMM1)
326: SSE_ADD_PS(XMM5,XMM6)
328: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
329: SSE_SHUFFLE(XMM7,XMM7,0x00)
330: SSE_MULT_PS(XMM7,XMM2)
331: SSE_ADD_PS(XMM5,XMM7)
333: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
334: SSE_SHUFFLE(XMM6,XMM6,0x00)
335: SSE_MULT_PS(XMM6,XMM3)
336: SSE_ADD_PS(XMM5,XMM6)
338: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
339: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
341: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
343: /* Column 2, product is accumulated in XMM6 */
344: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
345: SSE_SHUFFLE(XMM6,XMM6,0x00)
346: SSE_MULT_PS(XMM6,XMM0)
348: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
349: SSE_SHUFFLE(XMM7,XMM7,0x00)
350: SSE_MULT_PS(XMM7,XMM1)
351: SSE_ADD_PS(XMM6,XMM7)
353: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
354: SSE_SHUFFLE(XMM7,XMM7,0x00)
355: SSE_MULT_PS(XMM7,XMM2)
356: SSE_ADD_PS(XMM6,XMM7)
358: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
359: SSE_SHUFFLE(XMM7,XMM7,0x00)
360: SSE_MULT_PS(XMM7,XMM3)
361: SSE_ADD_PS(XMM6,XMM7)
362:
363: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
364: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
366: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
367: /* Column 3, product is accumulated in XMM0 */
368: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
369: SSE_SHUFFLE(XMM7,XMM7,0x00)
370: SSE_MULT_PS(XMM0,XMM7)
372: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
373: SSE_SHUFFLE(XMM7,XMM7,0x00)
374: SSE_MULT_PS(XMM1,XMM7)
375: SSE_ADD_PS(XMM0,XMM1)
377: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
378: SSE_SHUFFLE(XMM1,XMM1,0x00)
379: SSE_MULT_PS(XMM1,XMM2)
380: SSE_ADD_PS(XMM0,XMM1)
382: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
383: SSE_SHUFFLE(XMM7,XMM7,0x00)
384: SSE_MULT_PS(XMM3,XMM7)
385: SSE_ADD_PS(XMM0,XMM3)
387: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
388: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
390: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
391: /* This is code to be maintained and read by humans after all. */
392: /* Copy Multiplier Col 3 into XMM3 */
393: SSE_COPY_PS(XMM3,XMM0)
394: /* Copy Multiplier Col 2 into XMM2 */
395: SSE_COPY_PS(XMM2,XMM6)
396: /* Copy Multiplier Col 1 into XMM1 */
397: SSE_COPY_PS(XMM1,XMM5)
398: /* Copy Multiplier Col 0 into XMM0 */
399: SSE_COPY_PS(XMM0,XMM4)
400: SSE_INLINE_END_2;
402: /* Update the row: */
403: nz = bi[row+1] - diag_offset[row] - 1;
404: pv += 16;
405: for (j=0; j<nz; j++) {
406: PREFETCH_L1(&pv[16]);
407: x = rtmp + 16*pj[j];
408: /* x = rtmp + 4*pj[j]; */
410: /* X:=X-M*PV, One column at a time */
411: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
412: SSE_INLINE_BEGIN_2(x,pv)
413: /* Load First Column of X*/
414: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
415: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
417: /* Matrix-Vector Product: */
418: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
419: SSE_SHUFFLE(XMM5,XMM5,0x00)
420: SSE_MULT_PS(XMM5,XMM0)
421: SSE_SUB_PS(XMM4,XMM5)
423: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
424: SSE_SHUFFLE(XMM6,XMM6,0x00)
425: SSE_MULT_PS(XMM6,XMM1)
426: SSE_SUB_PS(XMM4,XMM6)
428: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
429: SSE_SHUFFLE(XMM7,XMM7,0x00)
430: SSE_MULT_PS(XMM7,XMM2)
431: SSE_SUB_PS(XMM4,XMM7)
433: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
434: SSE_SHUFFLE(XMM5,XMM5,0x00)
435: SSE_MULT_PS(XMM5,XMM3)
436: SSE_SUB_PS(XMM4,XMM5)
438: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
439: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
441: /* Second Column */
442: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
443: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
445: /* Matrix-Vector Product: */
446: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
447: SSE_SHUFFLE(XMM6,XMM6,0x00)
448: SSE_MULT_PS(XMM6,XMM0)
449: SSE_SUB_PS(XMM5,XMM6)
451: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
452: SSE_SHUFFLE(XMM7,XMM7,0x00)
453: SSE_MULT_PS(XMM7,XMM1)
454: SSE_SUB_PS(XMM5,XMM7)
456: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
457: SSE_SHUFFLE(XMM4,XMM4,0x00)
458: SSE_MULT_PS(XMM4,XMM2)
459: SSE_SUB_PS(XMM5,XMM4)
461: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
462: SSE_SHUFFLE(XMM6,XMM6,0x00)
463: SSE_MULT_PS(XMM6,XMM3)
464: SSE_SUB_PS(XMM5,XMM6)
465:
466: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
467: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
469: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
471: /* Third Column */
472: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
473: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
475: /* Matrix-Vector Product: */
476: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
477: SSE_SHUFFLE(XMM7,XMM7,0x00)
478: SSE_MULT_PS(XMM7,XMM0)
479: SSE_SUB_PS(XMM6,XMM7)
481: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
482: SSE_SHUFFLE(XMM4,XMM4,0x00)
483: SSE_MULT_PS(XMM4,XMM1)
484: SSE_SUB_PS(XMM6,XMM4)
486: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
487: SSE_SHUFFLE(XMM5,XMM5,0x00)
488: SSE_MULT_PS(XMM5,XMM2)
489: SSE_SUB_PS(XMM6,XMM5)
491: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
492: SSE_SHUFFLE(XMM7,XMM7,0x00)
493: SSE_MULT_PS(XMM7,XMM3)
494: SSE_SUB_PS(XMM6,XMM7)
495:
496: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
497: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
498:
499: /* Fourth Column */
500: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
501: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
503: /* Matrix-Vector Product: */
504: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
505: SSE_SHUFFLE(XMM5,XMM5,0x00)
506: SSE_MULT_PS(XMM5,XMM0)
507: SSE_SUB_PS(XMM4,XMM5)
509: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
510: SSE_SHUFFLE(XMM6,XMM6,0x00)
511: SSE_MULT_PS(XMM6,XMM1)
512: SSE_SUB_PS(XMM4,XMM6)
514: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
515: SSE_SHUFFLE(XMM7,XMM7,0x00)
516: SSE_MULT_PS(XMM7,XMM2)
517: SSE_SUB_PS(XMM4,XMM7)
519: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
520: SSE_SHUFFLE(XMM5,XMM5,0x00)
521: SSE_MULT_PS(XMM5,XMM3)
522: SSE_SUB_PS(XMM4,XMM5)
523:
524: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
525: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
526: SSE_INLINE_END_2;
527: pv += 16;
528: }
529: PetscLogFlops(128*nz+112);
530: }
531: row = *bjtmp++;
532: /* row = (*bjtmp++)/4; */
533: }
534: /* finished row so stick it into b->a */
535: pv = ba + 16*bi[i];
536: pj = bj + bi[i];
537: nz = bi[i+1] - bi[i];
539: /* Copy x block back into pv block */
540: for (j=0; j<nz; j++) {
541: x = rtmp+16*pj[j];
542: /* x = rtmp+4*pj[j]; */
544: SSE_INLINE_BEGIN_2(x,pv)
545: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
546: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
547: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
549: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
550: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
552: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
553: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
555: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
556: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
558: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
559: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
561: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
562: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
564: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
565: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
567: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
568: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
569: SSE_INLINE_END_2;
570: pv += 16;
571: }
572: /* invert diagonal block */
573: w = ba + 16*diag_offset[i];
574: if (pivotinblocks) {
575: Kernel_A_gets_inverse_A_4(w);
576: } else {
577: Kernel_A_gets_inverse_A_4_nopivot(w);
578: }
579: /* Kernel_A_gets_inverse_A_4_SSE(w); */
580: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
581: }
583: ierr = PetscFree(rtmp);CHKERRQ(ierr);
584: C->factor = FACTOR_LU;
585: C->assembled = PETSC_TRUE;
586: PetscLogFlops(1.3333*64*b->mbs);
587: /* Flop Count from inverting diagonal blocks */
588: SSE_SCOPE_END;
589: return(0);
590: }
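/*
   The *_usj variants below implement the same factorization but read the
   block-column index arrays as unsigned short (16-bit) values, which presumably
   halves the index traffic and requires the number of block rows to fit in an
   unsigned short. The _Inplace variant covers the case where the factor shares
   its index arrays with the original matrix (bj == aj).
*/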
594: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat *B)
595: {
596: Mat A=*B,C=*B;
597: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
598: int ierr,i,j,n = a->mbs;
599: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
600: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
601: unsigned int row;
602: int nz,*bi=b->i;
603: int *diag_offset = b->diag,*ai=a->i;
604: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
605: MatScalar *ba = b->a,*aa = a->a;
606: int nonzero=0;
607: /* int nonzero=0,colscale = 16; */
608: PetscTruth pivotinblocks = b->pivotinblocks;
611: SSE_SCOPE_BEGIN;
613: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
614: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
615: ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
616: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
617: /* if ((unsigned long)bj==(unsigned long)aj) { */
618: /* colscale = 4; */
619: /* } */
620:
621: for (i=0; i<n; i++) {
622: nz = bi[i+1] - bi[i];
623: bjtmp = bj + bi[i];
624: /* zero out the 4x4 block accumulators */
625: /* zero out one register */
626: XOR_PS(XMM7,XMM7);
627: for (j=0; j<nz; j++) {
628: x = rtmp+16*((unsigned int)bjtmp[j]);
629: /* x = rtmp+4*bjtmp[j]; */
630: SSE_INLINE_BEGIN_1(x)
631: /* Copy zero register to memory locations */
632: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
633: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
634: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
635: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
636: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
637: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
638: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
639: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
640: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
641: SSE_INLINE_END_1;
642: }
643: /* load in the initial (unfactored) row */
644: nz = ai[i+1] - ai[i];
645: ajtmp = aj + ai[i];
646: v = aa + 16*ai[i];
647: for (j=0; j<nz; j++) {
648: x = rtmp+16*((unsigned int)ajtmp[j]);
649: /* x = rtmp+colscale*ajtmp[j]; */
650: /* Copy v block into x block */
651: SSE_INLINE_BEGIN_2(v,x)
652: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
653: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
654: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
656: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
657: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
659: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
660: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
662: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
663: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
665: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
666: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
668: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
669: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
671: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
672: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
674: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
675: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
676: SSE_INLINE_END_2;
678: v += 16;
679: }
680: /* row = (*bjtmp++)/4; */
681: row = (unsigned int)(*bjtmp++);
682: while (row < i) {
683: pc = rtmp + 16*row;
684: SSE_INLINE_BEGIN_1(pc)
685: /* Load block from lower triangle */
686: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
687: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
688: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
690: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
691: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
693: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
694: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
696: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
697: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
699: /* Compare block to zero block */
701: SSE_COPY_PS(XMM4,XMM7)
702: SSE_CMPNEQ_PS(XMM4,XMM0)
704: SSE_COPY_PS(XMM5,XMM7)
705: SSE_CMPNEQ_PS(XMM5,XMM1)
707: SSE_COPY_PS(XMM6,XMM7)
708: SSE_CMPNEQ_PS(XMM6,XMM2)
710: SSE_CMPNEQ_PS(XMM7,XMM3)
712: /* Reduce the comparisons to one SSE register */
713: SSE_OR_PS(XMM6,XMM7)
714: SSE_OR_PS(XMM5,XMM4)
715: SSE_OR_PS(XMM5,XMM6)
716: SSE_INLINE_END_1;
718: /* Reduce the one SSE register to an integer register for branching */
719: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
720: MOVEMASK(nonzero,XMM5);
722: /* If block is nonzero ... */
723: if (nonzero) {
724: pv = ba + 16*diag_offset[row];
725: PREFETCH_L1(&pv[16]);
726: pj = bj + diag_offset[row] + 1;
728: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
729: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
730: /* but the diagonal was inverted already */
731: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
733: SSE_INLINE_BEGIN_2(pv,pc)
734: /* Column 0, product is accumulated in XMM4 */
735: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
736: SSE_SHUFFLE(XMM4,XMM4,0x00)
737: SSE_MULT_PS(XMM4,XMM0)
739: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
740: SSE_SHUFFLE(XMM5,XMM5,0x00)
741: SSE_MULT_PS(XMM5,XMM1)
742: SSE_ADD_PS(XMM4,XMM5)
744: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
745: SSE_SHUFFLE(XMM6,XMM6,0x00)
746: SSE_MULT_PS(XMM6,XMM2)
747: SSE_ADD_PS(XMM4,XMM6)
749: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
750: SSE_SHUFFLE(XMM7,XMM7,0x00)
751: SSE_MULT_PS(XMM7,XMM3)
752: SSE_ADD_PS(XMM4,XMM7)
754: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
755: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
757: /* Column 1, product is accumulated in XMM5 */
758: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
759: SSE_SHUFFLE(XMM5,XMM5,0x00)
760: SSE_MULT_PS(XMM5,XMM0)
762: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
763: SSE_SHUFFLE(XMM6,XMM6,0x00)
764: SSE_MULT_PS(XMM6,XMM1)
765: SSE_ADD_PS(XMM5,XMM6)
767: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
768: SSE_SHUFFLE(XMM7,XMM7,0x00)
769: SSE_MULT_PS(XMM7,XMM2)
770: SSE_ADD_PS(XMM5,XMM7)
772: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
773: SSE_SHUFFLE(XMM6,XMM6,0x00)
774: SSE_MULT_PS(XMM6,XMM3)
775: SSE_ADD_PS(XMM5,XMM6)
777: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
778: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
780: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
782: /* Column 2, product is accumulated in XMM6 */
783: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
784: SSE_SHUFFLE(XMM6,XMM6,0x00)
785: SSE_MULT_PS(XMM6,XMM0)
787: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
788: SSE_SHUFFLE(XMM7,XMM7,0x00)
789: SSE_MULT_PS(XMM7,XMM1)
790: SSE_ADD_PS(XMM6,XMM7)
792: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
793: SSE_SHUFFLE(XMM7,XMM7,0x00)
794: SSE_MULT_PS(XMM7,XMM2)
795: SSE_ADD_PS(XMM6,XMM7)
797: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
798: SSE_SHUFFLE(XMM7,XMM7,0x00)
799: SSE_MULT_PS(XMM7,XMM3)
800: SSE_ADD_PS(XMM6,XMM7)
801:
802: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
803: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
805: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
806: /* Column 3, product is accumulated in XMM0 */
807: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
808: SSE_SHUFFLE(XMM7,XMM7,0x00)
809: SSE_MULT_PS(XMM0,XMM7)
811: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
812: SSE_SHUFFLE(XMM7,XMM7,0x00)
813: SSE_MULT_PS(XMM1,XMM7)
814: SSE_ADD_PS(XMM0,XMM1)
816: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
817: SSE_SHUFFLE(XMM1,XMM1,0x00)
818: SSE_MULT_PS(XMM1,XMM2)
819: SSE_ADD_PS(XMM0,XMM1)
821: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
822: SSE_SHUFFLE(XMM7,XMM7,0x00)
823: SSE_MULT_PS(XMM3,XMM7)
824: SSE_ADD_PS(XMM0,XMM3)
826: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
827: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
829: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
830: /* This is code to be maintained and read by humans after all. */
831: /* Copy Multiplier Col 3 into XMM3 */
832: SSE_COPY_PS(XMM3,XMM0)
833: /* Copy Multiplier Col 2 into XMM2 */
834: SSE_COPY_PS(XMM2,XMM6)
835: /* Copy Multiplier Col 1 into XMM1 */
836: SSE_COPY_PS(XMM1,XMM5)
837: /* Copy Multiplier Col 0 into XMM0 */
838: SSE_COPY_PS(XMM0,XMM4)
839: SSE_INLINE_END_2;
841: /* Update the row: */
842: nz = bi[row+1] - diag_offset[row] - 1;
843: pv += 16;
844: for (j=0; j<nz; j++) {
845: PREFETCH_L1(&pv[16]);
846: x = rtmp + 16*((unsigned int)pj[j]);
847: /* x = rtmp + 4*pj[j]; */
849: /* X:=X-M*PV, One column at a time */
850: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
851: SSE_INLINE_BEGIN_2(x,pv)
852: /* Load First Column of X*/
853: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
854: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
856: /* Matrix-Vector Product: */
857: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
858: SSE_SHUFFLE(XMM5,XMM5,0x00)
859: SSE_MULT_PS(XMM5,XMM0)
860: SSE_SUB_PS(XMM4,XMM5)
862: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
863: SSE_SHUFFLE(XMM6,XMM6,0x00)
864: SSE_MULT_PS(XMM6,XMM1)
865: SSE_SUB_PS(XMM4,XMM6)
867: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
868: SSE_SHUFFLE(XMM7,XMM7,0x00)
869: SSE_MULT_PS(XMM7,XMM2)
870: SSE_SUB_PS(XMM4,XMM7)
872: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
873: SSE_SHUFFLE(XMM5,XMM5,0x00)
874: SSE_MULT_PS(XMM5,XMM3)
875: SSE_SUB_PS(XMM4,XMM5)
877: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
878: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
880: /* Second Column */
881: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
882: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
884: /* Matrix-Vector Product: */
885: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
886: SSE_SHUFFLE(XMM6,XMM6,0x00)
887: SSE_MULT_PS(XMM6,XMM0)
888: SSE_SUB_PS(XMM5,XMM6)
890: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
891: SSE_SHUFFLE(XMM7,XMM7,0x00)
892: SSE_MULT_PS(XMM7,XMM1)
893: SSE_SUB_PS(XMM5,XMM7)
895: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
896: SSE_SHUFFLE(XMM4,XMM4,0x00)
897: SSE_MULT_PS(XMM4,XMM2)
898: SSE_SUB_PS(XMM5,XMM4)
900: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
901: SSE_SHUFFLE(XMM6,XMM6,0x00)
902: SSE_MULT_PS(XMM6,XMM3)
903: SSE_SUB_PS(XMM5,XMM6)
904:
905: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
906: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
908: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
910: /* Third Column */
911: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
912: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
914: /* Matrix-Vector Product: */
915: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
916: SSE_SHUFFLE(XMM7,XMM7,0x00)
917: SSE_MULT_PS(XMM7,XMM0)
918: SSE_SUB_PS(XMM6,XMM7)
920: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
921: SSE_SHUFFLE(XMM4,XMM4,0x00)
922: SSE_MULT_PS(XMM4,XMM1)
923: SSE_SUB_PS(XMM6,XMM4)
925: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
926: SSE_SHUFFLE(XMM5,XMM5,0x00)
927: SSE_MULT_PS(XMM5,XMM2)
928: SSE_SUB_PS(XMM6,XMM5)
930: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
931: SSE_SHUFFLE(XMM7,XMM7,0x00)
932: SSE_MULT_PS(XMM7,XMM3)
933: SSE_SUB_PS(XMM6,XMM7)
934:
935: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
936: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
937:
938: /* Fourth Column */
939: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
940: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
942: /* Matrix-Vector Product: */
943: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
944: SSE_SHUFFLE(XMM5,XMM5,0x00)
945: SSE_MULT_PS(XMM5,XMM0)
946: SSE_SUB_PS(XMM4,XMM5)
948: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
949: SSE_SHUFFLE(XMM6,XMM6,0x00)
950: SSE_MULT_PS(XMM6,XMM1)
951: SSE_SUB_PS(XMM4,XMM6)
953: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
954: SSE_SHUFFLE(XMM7,XMM7,0x00)
955: SSE_MULT_PS(XMM7,XMM2)
956: SSE_SUB_PS(XMM4,XMM7)
958: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
959: SSE_SHUFFLE(XMM5,XMM5,0x00)
960: SSE_MULT_PS(XMM5,XMM3)
961: SSE_SUB_PS(XMM4,XMM5)
962:
963: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
964: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
965: SSE_INLINE_END_2;
966: pv += 16;
967: }
968: PetscLogFlops(128*nz+112);
969: }
970: row = (unsigned int)(*bjtmp++);
971: /* row = (*bjtmp++)/4; */
972: /* bjtmp++; */
973: }
974: /* finished row so stick it into b->a */
975: pv = ba + 16*bi[i];
976: pj = bj + bi[i];
977: nz = bi[i+1] - bi[i];
979: /* Copy x block back into pv block */
980: for (j=0; j<nz; j++) {
981: x = rtmp+16*((unsigned int)pj[j]);
982: /* x = rtmp+4*pj[j]; */
984: SSE_INLINE_BEGIN_2(x,pv)
985: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
986: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
987: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
989: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
990: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
992: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
993: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
995: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
996: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
998: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
999: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1001: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1002: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1004: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1005: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1007: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1008: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1009: SSE_INLINE_END_2;
1010: pv += 16;
1011: }
1012: /* invert diagonal block */
1013: w = ba + 16*diag_offset[i];
1014: if (pivotinblocks) {
1015: Kernel_A_gets_inverse_A_4(w);
1016: } else {
1017: Kernel_A_gets_inverse_A_4_nopivot(w);
1018: }
1019: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1020: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1021: }
1023: ierr = PetscFree(rtmp);CHKERRQ(ierr);
1024: C->factor = FACTOR_LU;
1025: C->assembled = PETSC_TRUE;
1026: PetscLogFlops(1.3333*64*b->mbs);
1027: /* Flop Count from inverting diagonal blocks */
1028: SSE_SCOPE_END;
1029: return(0);
1030: }
1034: int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A,Mat *B)
1035: {
1036: Mat C = *B;
1037: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1038: int ierr,i,j,n = a->mbs;
1039: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1040: unsigned int row;
1041: int *ajtmpold,nz,*bi=b->i;
1042: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1043: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1044: MatScalar *ba = b->a,*aa = a->a;
1045: int nonzero=0;
1046: /* int nonzero=0,colscale = 16; */
1047: PetscTruth pivotinblocks = b->pivotinblocks;
1050: SSE_SCOPE_BEGIN;
1052: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1053: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1054: ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1055: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1056: /* if ((unsigned long)bj==(unsigned long)aj) { */
1057: /* colscale = 4; */
1058: /* } */
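/* If the factor shares its column-index arrays with A, the matrix is being
   factored in place; hand off to the in-place usj kernel above. */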
1059: if ((unsigned long)bj==(unsigned long)aj) {
1060: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(B));
1061: }
1062:
1063: for (i=0; i<n; i++) {
1064: nz = bi[i+1] - bi[i];
1065: bjtmp = bj + bi[i];
1066: /* zero out the 4x4 block accumulators */
1067: /* zero out one register */
1068: XOR_PS(XMM7,XMM7);
1069: for (j=0; j<nz; j++) {
1070: x = rtmp+16*((unsigned int)bjtmp[j]);
1071: /* x = rtmp+4*bjtmp[j]; */
1072: SSE_INLINE_BEGIN_1(x)
1073: /* Copy zero register to memory locations */
1074: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1075: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1076: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1077: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1078: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1079: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1080: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1081: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1082: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1083: SSE_INLINE_END_1;
1084: }
1085: /* load in the initial (unfactored) row */
1086: nz = ai[i+1] - ai[i];
1087: ajtmpold = aj + ai[i];
1088: v = aa + 16*ai[i];
1089: for (j=0; j<nz; j++) {
1090: x = rtmp+16*ajtmpold[j];
1091: /* x = rtmp+colscale*ajtmpold[j]; */
1092: /* Copy v block into x block */
1093: SSE_INLINE_BEGIN_2(v,x)
1094: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1095: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1098: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1099: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1101: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1102: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1104: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1105: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1107: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1108: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1110: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1111: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1113: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1114: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1116: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1117: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1118: SSE_INLINE_END_2;
1120: v += 16;
1121: }
1122: /* row = (*bjtmp++)/4; */
1123: row = (unsigned int)(*bjtmp++);
1124: while (row < i) {
1125: pc = rtmp + 16*row;
1126: SSE_INLINE_BEGIN_1(pc)
1127: /* Load block from lower triangle */
1128: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1129: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1130: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1132: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1133: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1135: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1136: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1138: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1139: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1141: /* Compare block to zero block */
1143: SSE_COPY_PS(XMM4,XMM7)
1144: SSE_CMPNEQ_PS(XMM4,XMM0)
1146: SSE_COPY_PS(XMM5,XMM7)
1147: SSE_CMPNEQ_PS(XMM5,XMM1)
1149: SSE_COPY_PS(XMM6,XMM7)
1150: SSE_CMPNEQ_PS(XMM6,XMM2)
1152: SSE_CMPNEQ_PS(XMM7,XMM3)
1154: /* Reduce the comparisons to one SSE register */
1155: SSE_OR_PS(XMM6,XMM7)
1156: SSE_OR_PS(XMM5,XMM4)
1157: SSE_OR_PS(XMM5,XMM6)
1158: SSE_INLINE_END_1;
1160: /* Reduce the one SSE register to an integer register for branching */
1161: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1162: MOVEMASK(nonzero,XMM5);
1164: /* If block is nonzero ... */
1165: if (nonzero) {
1166: pv = ba + 16*diag_offset[row];
1167: PREFETCH_L1(&pv[16]);
1168: pj = bj + diag_offset[row] + 1;
1170: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1171: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1172: /* but the diagonal was inverted already */
1173: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1175: SSE_INLINE_BEGIN_2(pv,pc)
1176: /* Column 0, product is accumulated in XMM4 */
1177: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1178: SSE_SHUFFLE(XMM4,XMM4,0x00)
1179: SSE_MULT_PS(XMM4,XMM0)
1181: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1182: SSE_SHUFFLE(XMM5,XMM5,0x00)
1183: SSE_MULT_PS(XMM5,XMM1)
1184: SSE_ADD_PS(XMM4,XMM5)
1186: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1187: SSE_SHUFFLE(XMM6,XMM6,0x00)
1188: SSE_MULT_PS(XMM6,XMM2)
1189: SSE_ADD_PS(XMM4,XMM6)
1191: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1192: SSE_SHUFFLE(XMM7,XMM7,0x00)
1193: SSE_MULT_PS(XMM7,XMM3)
1194: SSE_ADD_PS(XMM4,XMM7)
1196: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1197: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1199: /* Column 1, product is accumulated in XMM5 */
1200: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1201: SSE_SHUFFLE(XMM5,XMM5,0x00)
1202: SSE_MULT_PS(XMM5,XMM0)
1204: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1205: SSE_SHUFFLE(XMM6,XMM6,0x00)
1206: SSE_MULT_PS(XMM6,XMM1)
1207: SSE_ADD_PS(XMM5,XMM6)
1209: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1210: SSE_SHUFFLE(XMM7,XMM7,0x00)
1211: SSE_MULT_PS(XMM7,XMM2)
1212: SSE_ADD_PS(XMM5,XMM7)
1214: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1215: SSE_SHUFFLE(XMM6,XMM6,0x00)
1216: SSE_MULT_PS(XMM6,XMM3)
1217: SSE_ADD_PS(XMM5,XMM6)
1219: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1220: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1222: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1224: /* Column 2, product is accumulated in XMM6 */
1225: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1226: SSE_SHUFFLE(XMM6,XMM6,0x00)
1227: SSE_MULT_PS(XMM6,XMM0)
1229: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1230: SSE_SHUFFLE(XMM7,XMM7,0x00)
1231: SSE_MULT_PS(XMM7,XMM1)
1232: SSE_ADD_PS(XMM6,XMM7)
1234: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1235: SSE_SHUFFLE(XMM7,XMM7,0x00)
1236: SSE_MULT_PS(XMM7,XMM2)
1237: SSE_ADD_PS(XMM6,XMM7)
1239: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1240: SSE_SHUFFLE(XMM7,XMM7,0x00)
1241: SSE_MULT_PS(XMM7,XMM3)
1242: SSE_ADD_PS(XMM6,XMM7)
1243:
1244: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1245: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1247: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1248: /* Column 3, product is accumulated in XMM0 */
1249: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1250: SSE_SHUFFLE(XMM7,XMM7,0x00)
1251: SSE_MULT_PS(XMM0,XMM7)
1253: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1254: SSE_SHUFFLE(XMM7,XMM7,0x00)
1255: SSE_MULT_PS(XMM1,XMM7)
1256: SSE_ADD_PS(XMM0,XMM1)
1258: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1259: SSE_SHUFFLE(XMM1,XMM1,0x00)
1260: SSE_MULT_PS(XMM1,XMM2)
1261: SSE_ADD_PS(XMM0,XMM1)
1263: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1264: SSE_SHUFFLE(XMM7,XMM7,0x00)
1265: SSE_MULT_PS(XMM3,XMM7)
1266: SSE_ADD_PS(XMM0,XMM3)
1268: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1269: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1271: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1272: /* This is code to be maintained and read by humans after all. */
1273: /* Copy Multiplier Col 3 into XMM3 */
1274: SSE_COPY_PS(XMM3,XMM0)
1275: /* Copy Multiplier Col 2 into XMM2 */
1276: SSE_COPY_PS(XMM2,XMM6)
1277: /* Copy Multiplier Col 1 into XMM1 */
1278: SSE_COPY_PS(XMM1,XMM5)
1279: /* Copy Multiplier Col 0 into XMM0 */
1280: SSE_COPY_PS(XMM0,XMM4)
1281: SSE_INLINE_END_2;
1283: /* Update the row: */
1284: nz = bi[row+1] - diag_offset[row] - 1;
1285: pv += 16;
1286: for (j=0; j<nz; j++) {
1287: PREFETCH_L1(&pv[16]);
1288: x = rtmp + 16*((unsigned int)pj[j]);
1289: /* x = rtmp + 4*pj[j]; */
1291: /* X:=X-M*PV, One column at a time */
1292: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1293: SSE_INLINE_BEGIN_2(x,pv)
1294: /* Load First Column of X*/
1295: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1296: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1298: /* Matrix-Vector Product: */
1299: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1300: SSE_SHUFFLE(XMM5,XMM5,0x00)
1301: SSE_MULT_PS(XMM5,XMM0)
1302: SSE_SUB_PS(XMM4,XMM5)
1304: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1305: SSE_SHUFFLE(XMM6,XMM6,0x00)
1306: SSE_MULT_PS(XMM6,XMM1)
1307: SSE_SUB_PS(XMM4,XMM6)
1309: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1310: SSE_SHUFFLE(XMM7,XMM7,0x00)
1311: SSE_MULT_PS(XMM7,XMM2)
1312: SSE_SUB_PS(XMM4,XMM7)
1314: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1315: SSE_SHUFFLE(XMM5,XMM5,0x00)
1316: SSE_MULT_PS(XMM5,XMM3)
1317: SSE_SUB_PS(XMM4,XMM5)
1319: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1320: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1322: /* Second Column */
1323: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1324: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1326: /* Matrix-Vector Product: */
1327: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1328: SSE_SHUFFLE(XMM6,XMM6,0x00)
1329: SSE_MULT_PS(XMM6,XMM0)
1330: SSE_SUB_PS(XMM5,XMM6)
1332: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1333: SSE_SHUFFLE(XMM7,XMM7,0x00)
1334: SSE_MULT_PS(XMM7,XMM1)
1335: SSE_SUB_PS(XMM5,XMM7)
1337: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1338: SSE_SHUFFLE(XMM4,XMM4,0x00)
1339: SSE_MULT_PS(XMM4,XMM2)
1340: SSE_SUB_PS(XMM5,XMM4)
1342: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1343: SSE_SHUFFLE(XMM6,XMM6,0x00)
1344: SSE_MULT_PS(XMM6,XMM3)
1345: SSE_SUB_PS(XMM5,XMM6)
1346:
1347: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1348: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1350: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1352: /* Third Column */
1353: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1354: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1356: /* Matrix-Vector Product: */
1357: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1358: SSE_SHUFFLE(XMM7,XMM7,0x00)
1359: SSE_MULT_PS(XMM7,XMM0)
1360: SSE_SUB_PS(XMM6,XMM7)
1362: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1363: SSE_SHUFFLE(XMM4,XMM4,0x00)
1364: SSE_MULT_PS(XMM4,XMM1)
1365: SSE_SUB_PS(XMM6,XMM4)
1367: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1368: SSE_SHUFFLE(XMM5,XMM5,0x00)
1369: SSE_MULT_PS(XMM5,XMM2)
1370: SSE_SUB_PS(XMM6,XMM5)
1372: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1373: SSE_SHUFFLE(XMM7,XMM7,0x00)
1374: SSE_MULT_PS(XMM7,XMM3)
1375: SSE_SUB_PS(XMM6,XMM7)
1376:
1377: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1378: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1379:
1380: /* Fourth Column */
1381: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1382: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1384: /* Matrix-Vector Product: */
1385: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1386: SSE_SHUFFLE(XMM5,XMM5,0x00)
1387: SSE_MULT_PS(XMM5,XMM0)
1388: SSE_SUB_PS(XMM4,XMM5)
1390: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1391: SSE_SHUFFLE(XMM6,XMM6,0x00)
1392: SSE_MULT_PS(XMM6,XMM1)
1393: SSE_SUB_PS(XMM4,XMM6)
1395: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1396: SSE_SHUFFLE(XMM7,XMM7,0x00)
1397: SSE_MULT_PS(XMM7,XMM2)
1398: SSE_SUB_PS(XMM4,XMM7)
1400: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1401: SSE_SHUFFLE(XMM5,XMM5,0x00)
1402: SSE_MULT_PS(XMM5,XMM3)
1403: SSE_SUB_PS(XMM4,XMM5)
1404:
1405: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1406: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1407: SSE_INLINE_END_2;
1408: pv += 16;
1409: }
1410: PetscLogFlops(128*nz+112);
1411: }
1412: row = (unsigned int)(*bjtmp++);
1413: /* row = (*bjtmp++)/4; */
1414: /* bjtmp++; */
1415: }
1416: /* finished row so stick it into b->a */
1417: pv = ba + 16*bi[i];
1418: pj = bj + bi[i];
1419: nz = bi[i+1] - bi[i];
1421: /* Copy x block back into pv block */
1422: for (j=0; j<nz; j++) {
1423: x = rtmp+16*((unsigned int)pj[j]);
1424: /* x = rtmp+4*pj[j]; */
1426: SSE_INLINE_BEGIN_2(x,pv)
1427: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1428: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1429: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1431: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1432: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1434: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1435: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1437: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1438: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1440: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1441: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1443: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1444: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1446: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1447: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1449: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1450: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1451: SSE_INLINE_END_2;
1452: pv += 16;
1453: }
1454: /* invert diagonal block */
1455: w = ba + 16*diag_offset[i];
1456: if (pivotinblocks) {
1457: Kernel_A_gets_inverse_A_4(w);
1458: } else {
1459: Kernel_A_gets_inverse_A_4_nopivot(w);
1460: }
1461: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1462: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1463: }
1465: ierr = PetscFree(rtmp);CHKERRQ(ierr);
1466: C->factor = FACTOR_LU;
1467: C->assembled = PETSC_TRUE;
1468: PetscLogFlops(1.3333*64*b->mbs);
1469: /* Flop Count from inverting diagonal blocks */
1470: SSE_SCOPE_END;
1471: return(0);
1472: }
1474: #endif