/** SSE helper macros and memory accessors (summary of what is visible here).
    - __OGRE_SIMD_ALIGN_STACK / __OGRE_SIMD_ALIGN_ATTRIBUTE: chosen per compiler.
      The Intel branch defines __OGRE_SIMD_ALIGN_STACK() as _alloca(16); the
      32-bit x86 GCC/Clang branch defines __OGRE_SIMD_ALIGN_ATTRIBUTE as
      __attribute__((force_align_arg_pointer)); the other visible definitions
      of __OGRE_SIMD_ALIGN_ATTRIBUTE are empty.
    - <xmmintrin.h> is included only when OGRE_DOUBLE_PRECISION == 0 and
      OGRE_CPU == OGRE_CPU_X86.
    - __MM_RSQRT_PS(x): defined twice, once as _mm_rsqrt_ps(x) and once as
      __mm_rsqrt_nr_ps(x); the condition selecting between them is not visible
      in this excerpt.
    - __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3): in-place transpose of four __m128
      rows using unpacklo/unpackhi followed by movelh/movehl.
    - __MM_TRANSPOSE4x3_PS / __MM_TRANSPOSE3x4_PS(v0, v1, v2): rearrange three
      __m128 values in place through two rounds of _mm_shuffle_ps.
      NOTE(review): presumably these convert between packed xyz triples and
      per-component registers - confirm against the callers.
    - __MM_SELECT(v, fp): copies element fp of v into all four lanes.
    - __MM_ACCUM4_PS / __MM_ACCUM3_PS: lane-wise sum of four / three values.
    - __MM_DOT4x4_PS: a0*b0 + a1*b1 + a2*b2 + a3*b3 (lane-wise).
    - __MM_DOT4x3_PS: r0*v0 + r1*v1 + r2*v2 + r3 (lane-wise).
    - __MM_DOT3x3_PS: r0*v0 + r1*v1 + r2*v2 (lane-wise).
    - __MM_MADD_PS / __MM_MADD_SS(a, b, c): a*b + c, packed / scalar form.
    - __MM_LERP_PS / __MM_LERP_SS(t, a, b): (b - a)*t + a, packed / scalar form.
    - __MM_LOAD_PS(p) / __MM_STORE_PS(p, v): read / write *p through a pointer
      cast to __m128. NOTE(review): presumably p must be 16-byte aligned -
      confirm.
*/
28 #ifndef __SIMDHelper_H__ 29 #define __SIMDHelper_H__ 46 #if defined(__INTEL_COMPILER) 49 #define __OGRE_SIMD_ALIGN_STACK() _alloca(16) 50 #define __OGRE_SIMD_ALIGN_ATTRIBUTE 52 #elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64) 54 #define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer)) 56 #elif defined(_MSC_VER) 58 #define __OGRE_SIMD_ALIGN_ATTRIBUTE 61 #define __OGRE_SIMD_ALIGN_ATTRIBUTE 71 #if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86 76 #include <xmmintrin.h> 79 #endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86 109 #define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x) 111 #define __MM_RSQRT_PS(x) __mm_rsqrt_nr_ps(x) // Implemented below 122 #define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3) \ 124 __m128 tmp3, tmp2, tmp1, tmp0; \ 131 tmp0 = _mm_unpacklo_ps(r0, r1); \ 132 tmp2 = _mm_unpackhi_ps(r0, r1); \ 133 tmp1 = _mm_unpacklo_ps(r2, r3); \ 134 tmp3 = _mm_unpackhi_ps(r2, r3); \ 136 r0 = _mm_movelh_ps(tmp0, tmp1); \ 137 r1 = _mm_movehl_ps(tmp1, tmp0); \ 138 r2 = _mm_movelh_ps(tmp2, tmp3); \ 139 r3 = _mm_movehl_ps(tmp3, tmp2); \ 150 #define __MM_TRANSPOSE4x3_PS(v0, v1, v2) \ 152 __m128 tmp0, tmp1, tmp2; \ 158 tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0)); \ 159 tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1)); \ 160 tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2)); \ 162 v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0)); \ 163 v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); \ 164 v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1)); \ 174 #define __MM_TRANSPOSE3x4_PS(v0, v1, v2) \ 176 __m128 tmp0, tmp1, tmp2; \ 182 tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1)); \ 183 tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1)); \ 184 tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0)); \ 186 v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0)); \ 187 v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); \ 
188 v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3)); \ 194 #define __MM_SELECT(v, fp) \ 195 _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp))) 198 #define __MM_ACCUM4_PS(a, b, c, d) \ 199 _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d)) 204 #define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3) \ 205 __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3)) 210 #define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \ 211 __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3) 214 #define __MM_ACCUM3_PS(a, b, c) \ 215 _mm_add_ps(_mm_add_ps(a, b), c) 220 #define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \ 221 __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2)) 224 #define __MM_MADD_PS(a, b, c) \ 225 _mm_add_ps(_mm_mul_ps(a, b), c) 228 #define __MM_LERP_PS(t, a, b) \ 229 __MM_MADD_PS(_mm_sub_ps(b, a), t, a) 232 #define __MM_MADD_SS(a, b, c) \ 233 _mm_add_ss(_mm_mul_ss(a, b), c) 236 #define __MM_LERP_SS(t, a, b) \ 237 __MM_MADD_SS(_mm_sub_ss(b, a), t, a) 240 #define __MM_LOAD_PS(p) \ 241 (*(const __m128*)(p)) 244 #define __MM_STORE_PS(p, v) \ 245 (*(__m128*)(p) = (v)) 250 template <
bool aligned = false>
/** Memory accessor selected by the boolean template flag `aligned`, which
    defaults to false. The primary template's visible load uses _mm_loadu_ps.
*/
251 struct SSEMemoryAccessor
// load (signature not visible in this excerpt): returns the four floats at p
// through the unaligned-load intrinsic.
255 return _mm_loadu_ps(p);
// store: writes v to p. The body is not visible in this excerpt.
// NOTE(review): presumably an unaligned store to mirror load - confirm.
257 static FORCEINLINE void store(
float *p,
const __m128& v)
/** SSEMemoryAccessor variant for the `true` (aligned) case.
    NOTE(review): the `template <>` introducer is not visible in this excerpt.
*/
264 struct SSEMemoryAccessor<true>
// load: yields a const reference to the __m128 at p by expanding
// __MM_LOAD_PS(p), i.e. a pointer cast followed by a dereference; no copy is
// made. NOTE(review): presumably p must be 16-byte aligned - confirm.
266 static FORCEINLINE const __m128& load(
const float *p)
268 return __MM_LOAD_PS(p);
// store: writes v to p. The body is not visible in this excerpt.
// NOTE(review): presumably goes through __MM_STORE_PS - confirm.
270 static FORCEINLINE void store(
float *p,
const __m128& v)
278 static FORCEINLINE bool _isAlignedForSSE(
const void *p)
280 return (((
size_t)p) & 15) == 0;
286 static FORCEINLINE __m128 __mm_rsqrt_nr_ps(
const __m128& x)
288 static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
289 static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
290 __m128 t = _mm_rsqrt_ps(x);
291 return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
292 _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
/** Stack-alignment check macro.
    The first definition asserts that _isAlignedForSSE(&test) holds; the
    declaration of `test` is not visible in this excerpt. The definition after
    the `#else // !OGRE_DEBUG_MODE` expands to nothing.
    The trailing #endif directives close the OGRE_DEBUG_MODE, __OGRE_HAVE_SSE
    and include-guard conditionals, as their own trailing comments state.
*/
297 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() \ 300 assert(_isAlignedForSSE(&test)); \ 303 #else // !OGRE_DEBUG_MODE 304 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() 306 #endif // OGRE_DEBUG_MODE 309 #endif // __OGRE_HAVE_SSE 315 #endif // __SIMDHelper_H__