diff --git a/src/raymath.h b/src/raymath.h index 67756a6d0..8d5b1b2a9 100644 --- a/src/raymath.h +++ b/src/raymath.h @@ -19,17 +19,22 @@ * * CONFIGURATION: * #define RAYMATH_IMPLEMENTATION -* Generates the implementation of the library into the included file. +* Generates the implementation of the library into the included file * If not defined, the library is in header only mode and can be included in other headers -* or source files without problems. But only ONE file should hold the implementation. +* or source files without problems. But only ONE file should hold the implementation * * #define RAYMATH_STATIC_INLINE -* Define static inline functions code, so #include header suffices for use. -* This may use up lots of memory. +* Define static inline functions code, so #include header suffices for use +* This may use up lots of memory * * #define RAYMATH_DISABLE_CPP_OPERATORS * Disables C++ operator overloads for raymath types. * +* #define RAYMATH_USE_SIMD_INTRINSICS +* Try to enable SIMD intrinsics for MatrixMultiply() +* Note that users enabling it must be aware of the target platform where application will +* run to support the selected SIMD intrinsic, for now, only SSE is supported +* * LICENSE: zlib/libpng * * Copyright (c) 2015-2025 Ramon Santamaria (@raysan5) @@ -79,7 +84,6 @@ #endif #endif - //---------------------------------------------------------------------------------- // Defines and Macros //---------------------------------------------------------------------------------- @@ -170,9 +174,35 @@ typedef struct float16 { #include // Required for: sinf(), cosf(), tan(), atan2f(), sqrtf(), floor(), fminf(), fmaxf(), fabsf() -#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) - #include - #define RAYMATH_SSE_ENABLED +#if defined(RAYMATH_USE_SIMD_INTRINSICS) + // SIMD is used on the most costly raymath function MatrixMultiply() + // NOTE: Only SSE intrinsics support implemented + // TODO: Consider support for other SIMD 
intrinsics + /* + #if defined(__SSE4_2__) + #define SW_HAS_SSE42 + #include + #elif defined(__SSE4_1__) + #define SW_HAS_SSE41 + #include + #elif defined(__SSSE3__) + #define SW_HAS_SSSE3 + #include + #elif defined(__SSE3__) + #define SW_HAS_SSE3 + #include + #elif defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) // SSE2 x64 + #define SW_HAS_SSE2 + #include + #elif defined(__SSE__) + #define SW_HAS_SSE + #include + #endif + */ + #if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) + #include + #define RAYMATH_SSE_ENABLED + #endif #endif //---------------------------------------------------------------------------------- @@ -1652,18 +1682,20 @@ RMAPI Matrix MatrixSubtract(Matrix left, Matrix right) RMAPI Matrix MatrixMultiply(Matrix left, Matrix right) { Matrix result = { 0 }; -#ifdef RAYMATH_SSE_ENABLED - // Load left side and right side. + +#if defined(RAYMATH_SSE_ENABLED) + // Load left side and right side __m128 c0 = _mm_set_ps(right.m12, right.m8, right.m4, right.m0); __m128 c1 = _mm_set_ps(right.m13, right.m9, right.m5, right.m1); __m128 c2 = _mm_set_ps(right.m14, right.m10, right.m6, right.m2); __m128 c3 = _mm_set_ps(right.m15, right.m11, right.m7, right.m3); - // Transpose so c0..c3 become *rows* of the right matrix in semantic order. 
+ + // Transpose so c0..c3 become *rows* of the right matrix in semantic order _MM_TRANSPOSE4_PS(c0, c1, c2, c3); + float tmp[4] = { 0 }; __m128 row; - float tmp[4]; - + // Row 0 of result: [m0, m1, m2, m3] row = _mm_mul_ps(_mm_set1_ps(left.m0), c0); row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(left.m1), c1)); @@ -1707,7 +1739,6 @@ RMAPI Matrix MatrixMultiply(Matrix left, Matrix right) result.m13 = tmp[1]; result.m14 = tmp[2]; result.m15 = tmp[3]; - #else result.m0 = left.m0*right.m0 + left.m1*right.m4 + left.m2*right.m8 + left.m3*right.m12; result.m1 = left.m0*right.m1 + left.m1*right.m5 + left.m2*right.m9 + left.m3*right.m13; @@ -1726,6 +1757,7 @@ RMAPI Matrix MatrixMultiply(Matrix left, Matrix right) result.m14 = left.m12*right.m2 + left.m13*right.m6 + left.m14*right.m10 + left.m15*right.m14; result.m15 = left.m12*right.m3 + left.m13*right.m7 + left.m14*right.m11 + left.m15*right.m15; #endif + return result; }