Skip to content

Revelator darkmod AVX* changes #631

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5411625
AVX* changes to idlib
revelator Nov 17, 2024
eee2aaf
cmake changes for avx* support
revelator Nov 17, 2024
ca93e90
Changes to use the New AVX* based culling
revelator Nov 17, 2024
29de621
new detection routine for AVX changes.
revelator Nov 17, 2024
0a1f2a3
enums for new cpu types
revelator Nov 18, 2024
f50471b
Hopefully fixed some mistakes
revelator Nov 18, 2024
8dcbdc2
more forgotten changes...
revelator Nov 18, 2024
d0fc68c
trying to fix avx build failure on nix compiler
revelator Nov 18, 2024
59afa20
revert addition of ALIGNTYPE16
revelator Nov 18, 2024
f23f59c
changed ALIGNTYPE16 to ALIGN16 in DAZ detection
revelator Nov 18, 2024
5071ad5
Hybrid GLSL ARB2 renderer
revelator Nov 18, 2024
4d876e0
externs and shader code for GLSL
revelator Nov 18, 2024
a2d68e3
VBO changes
revelator Nov 18, 2024
ff4ab55
Another batch of misses
revelator Nov 18, 2024
bdae6b7
myGlMultiMatrix name change and SSE intrinsics
revelator Nov 18, 2024
453e122
myGlMultiMatrix name change
revelator Nov 18, 2024
92bf94a
missing type for DepthBoundsTest
revelator Nov 18, 2024
246602a
Try to make gcc happy
revelator Nov 18, 2024
ddb3af0
cast from const char to char gcc error
revelator Nov 18, 2024
3f3d2b7
turn of some warnings that make no sense
revelator Nov 18, 2024
50b1ed4
bring draft in line with current
revelator Nov 20, 2024
f7c55ee
further draft changes
revelator Nov 20, 2024
158c17b
and hpåefully the last batch needed for the draft
revelator Nov 20, 2024
6edbc44
mac build error with sse
revelator Nov 20, 2024
2240b7d
minor whoops in disabling SSE intrinsics for ppc
revelator Nov 21, 2024
0edfbdc
DWORD is not recognized
revelator Nov 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
524 changes: 411 additions & 113 deletions neo/idlib/math/Simd.cpp

Large diffs are not rendered by default.

17 changes: 2 additions & 15 deletions neo/idlib/math/Simd_AVX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,6 @@ If you have questions concerning this license or the applicable additional terms
//
//===============================================================

#if defined(__GNUC__) && defined(__SSE3__)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removing all this probably won't work either, esp. on non-x86 CPUs


/*
============
idSIMD_SSE3::GetName
============
*/
const char *idSIMD_AVX::GetName( void ) const {
return "MMX & SSE & SSE2 & SSE3 & AVX";
}

#elif defined(_MSC_VER) && defined(_M_IX86)

#include <immintrin.h>

#include "idlib/geometry/DrawVert.h"
Expand Down Expand Up @@ -76,6 +63,7 @@ void VPCALL idSIMD_AVX::CullByFrustum( idDrawVert *verts, const int numVerts, co
const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
const __m256 eps = _mm256_set1_ps( epsilon );
const byte mask6 = ( 1 << 6 ) - 1;

for( int j = 0; j < numVerts; j++ ) {
idVec3 &vec = verts[j].xyz;
__m256 vX = _mm256_set1_ps( vec.x );
Expand Down Expand Up @@ -111,6 +99,7 @@ void VPCALL idSIMD_AVX::CullByFrustum2( idDrawVert *verts, const int numVerts, c
const __m256 eps = _mm256_set1_ps( epsilon );
static const __m256 epsM = _mm256_set1_ps( -epsilon );
const short mask6 = ( 1 << 6 ) - 1;

for( int j = 0; j < numVerts; j++ ) {
idVec3 &vec = verts[j].xyz;
__m256 vX = _mm256_set1_ps( vec.x );
Expand All @@ -133,5 +122,3 @@ void VPCALL idSIMD_AVX::CullByFrustum2( idDrawVert *verts, const int numVerts, c
}
_mm256_zeroupper();
}

#endif
5 changes: 1 addition & 4 deletions neo/idlib/math/Simd_AVX.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,10 @@

class idSIMD_AVX : public idSIMD_SSE3 {
public:
#if defined(__GNUC__) && defined(__AVX__)
virtual const char *VPCALL GetName( void ) const;
#elif defined(_MSC_VER) && defined(_M_IX86)
// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used)
virtual const char *VPCALL GetName( void ) const;
virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
#endif
};

#endif /* !__MATH_SIMD_AVX_H__ */
20 changes: 3 additions & 17 deletions neo/idlib/math/Simd_AVX2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,10 @@ If you have questions concerning this license or the applicable additional terms

//===============================================================
//
// AVX implementation of idSIMDProcessor
// AVX2 implementation of idSIMDProcessor
//
//===============================================================

#if defined(__GNUC__) && defined(__SSE3__)

/*
============
idSIMD_SSE3::GetName
============
*/
const char *idSIMD_AVX:2:GetName( void ) const {
return "MMX & SSE & SSE2 & SSE3 & AVX & AVX2";
}

#elif defined(_MSC_VER) && defined(_M_IX86)

#include <immintrin.h>

#include "idlib/geometry/DrawVert.h"
Expand Down Expand Up @@ -76,6 +63,7 @@ void VPCALL idSIMD_AVX2::CullByFrustum( idDrawVert *verts, const int numVerts, c
const __m256 fD = _mm256_set_ps( 0, 0, frustum[5][3], frustum[4][3], frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
const __m256 eps = _mm256_set1_ps( epsilon );
const byte mask6 = (1 << 6) - 1;

for ( int j = 0; j < numVerts; j++ ) {
idVec3 &vec = verts[j].xyz;
__m256 vX = _mm256_set1_ps( vec.x );
Expand Down Expand Up @@ -105,6 +93,7 @@ void VPCALL idSIMD_AVX2::CullByFrustum2( idDrawVert *verts, const int numVerts,
const __m256 eps = _mm256_set1_ps( epsilon );
static const __m256 epsM = _mm256_set1_ps( -epsilon );
const short mask6 = (1 << 6) - 1;

for ( int j = 0; j < numVerts; j++ ) {
idVec3 &vec = verts[j].xyz;
__m256 vX = _mm256_set1_ps( vec.x );
Expand All @@ -121,6 +110,3 @@ void VPCALL idSIMD_AVX2::CullByFrustum2( idDrawVert *verts, const int numVerts,
}
_mm256_zeroupper();
}

#endif

5 changes: 1 addition & 4 deletions neo/idlib/math/Simd_AVX2.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,10 @@

class idSIMD_AVX2 : public idSIMD_AVX {
public:
#if defined(__GNUC__) && defined(__AVX__)
virtual const char *VPCALL GetName( void ) const;
#elif defined(_MSC_VER) && defined(_M_IX86)
// Revelator: these work whether gcc clang or msvc in x86 or x64 (no inline assembly used)
virtual const char *VPCALL GetName( void ) const;
virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
#endif
};

#endif /* !__MATH_SIMD_AVX2_H__ */
2 changes: 0 additions & 2 deletions neo/idlib/math/Simd_AltiVec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,6 @@ inline vector float VectorSin16( vector float v ) {
#if 0
// load up half PI and use it to calculate the rest of the values. This is
// sometimes cheaper than loading them from memory

vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
vector float PI = vec_add( halfPI, halfPI );
vector float oneandhalfPI = vec_add( PI, halfPI );
Expand Down Expand Up @@ -603,7 +602,6 @@ inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *
#endif
}


// End Helper Functions

#ifdef ENABLE_SIMPLE_MATH
Expand Down
100 changes: 50 additions & 50 deletions neo/idlib/math/Simd_SSE.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ If you have questions concerning this license or the applicable additional terms
*/

#include "sys/platform.h"
#include "idlib/geometry/DrawVert.h"
#include "idlib/math/Simd_SSE.h"

//===============================================================
Expand All @@ -36,6 +35,16 @@ If you have questions concerning this license or the applicable additional terms
// E
//===============================================================

#include <xmmintrin.h>

#include "idlib/geometry/DrawVert.h"
#include "idlib/geometry/JointTransform.h"
#include "idlib/math/Vector.h"
#include "idlib/math/Matrix.h"
#include "idlib/math/Quat.h"
#include "idlib/math/Plane.h"
#include "renderer/Model.h"

#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
Expand All @@ -46,8 +55,6 @@ If you have questions concerning this license or the applicable additional terms

#if defined(__GNUC__) && defined(__SSE__)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))

Expand Down Expand Up @@ -626,15 +633,6 @@ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *

#elif defined(_MSC_VER) && defined(_M_IX86)

#include <xmmintrin.h>

#include "idlib/geometry/JointTransform.h"
#include "idlib/math/Vector.h"
#include "idlib/math/Matrix.h"
#include "idlib/math/Quat.h"
#include "idlib/math/Plane.h"
#include "renderer/Model.h"

#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))

Expand Down Expand Up @@ -18093,6 +18091,8 @@ void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuf
#endif
}

#endif /* _MSC_VER */

/*
============
idSIMD_SSE::CullByFrustum
Expand All @@ -18107,31 +18107,32 @@ void VPCALL idSIMD_SSE::CullByFrustum( idDrawVert *verts, const int numVerts, co
__m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
__m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
__m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );

for ( int j = 0; j < numVerts; j++ ) {
idVec3 &vec = verts[j].xyz;
__m128 vX = _mm_set1_ps( vec.x );
__m128 vY = _mm_set1_ps( vec.y );
__m128 vZ = _mm_set1_ps( vec.z );
__m128 d14 = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps( fA14, vX ),
_mm_mul_ps( fB14, vY )
),
_mm_add_ps(
_mm_mul_ps( fC14, vZ ),
fD14
)
);
_mm_add_ps(
_mm_mul_ps( fA14, vX ),
_mm_mul_ps( fB14, vY )
),
_mm_add_ps(
_mm_mul_ps( fC14, vZ ),
fD14
)
);
__m128 d56 = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps( fA56, vX ),
_mm_mul_ps( fB56, vY )
),
_mm_add_ps(
_mm_mul_ps( fC56, vZ ),
fD56
)
);
_mm_add_ps(
_mm_mul_ps( fA56, vX ),
_mm_mul_ps( fB56, vY )
),
_mm_add_ps(
_mm_mul_ps( fC56, vZ ),
fD56
)
);
const short mask6 = ( 1 << 6 ) - 1;
__m128 eps = _mm_set1_ps( epsilon );
int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );
Expand All @@ -18155,31 +18156,32 @@ void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, c
__m128 fC56 = _mm_set_ps( 0, 0, frustum[5][2], frustum[4][2] );
__m128 fD14 = _mm_set_ps( frustum[3][3], frustum[2][3], frustum[1][3], frustum[0][3] );
__m128 fD56 = _mm_set_ps( 0, 0, frustum[5][3], frustum[4][3] );

for ( int j = 0; j < numVerts; j++ ) {
idVec3 &vec = verts[j].xyz;
__m128 vX = _mm_set1_ps( vec.x );
__m128 vY = _mm_set1_ps( vec.y );
__m128 vZ = _mm_set1_ps( vec.z );
__m128 d14 = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps( fA14, vX ),
_mm_mul_ps( fB14, vY )
),
_mm_add_ps(
_mm_mul_ps( fC14, vZ ),
fD14
)
);
_mm_add_ps(
_mm_mul_ps( fA14, vX ),
_mm_mul_ps( fB14, vY )
),
_mm_add_ps(
_mm_mul_ps( fC14, vZ ),
fD14
)
);
__m128 d56 = _mm_add_ps(
_mm_add_ps(
_mm_mul_ps( fA56, vX ),
_mm_mul_ps( fB56, vY )
),
_mm_add_ps(
_mm_mul_ps( fC56, vZ ),
fD56
)
);
_mm_add_ps(
_mm_mul_ps( fA56, vX ),
_mm_mul_ps( fB56, vY )
),
_mm_add_ps(
_mm_mul_ps( fC56, vZ ),
fD56
)
);
const short mask6 = ( 1 << 6 ) - 1;
__m128 eps = _mm_set1_ps( epsilon );
int mask_lo14 = _mm_movemask_ps( _mm_cmplt_ps( d14, eps ) );
Expand All @@ -18192,5 +18194,3 @@ void VPCALL idSIMD_SSE::CullByFrustum2( idDrawVert *verts, const int numVerts, c
pointCull[j] = mask_lo & mask6 | ( mask_hi & mask6 ) << 6;
}
}

#endif /* _MSC_VER */
15 changes: 9 additions & 6 deletions neo/idlib/math/Simd_SSE.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class idSIMD_SSE : public idSIMD_MMX {
using idSIMD_MMX::MinMax;

virtual const char *VPCALL GetName( void ) const;
virtual void VPCALL Dot( float *dst, const idPlane &constant,const idDrawVert *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count );
virtual void VPCALL MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count );
virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count );

Expand All @@ -69,9 +69,9 @@ class idSIMD_SSE : public idSIMD_MMX {
virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count );
virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count );
virtual void VPCALL Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant,const idVec3 *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant,const idPlane *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant,const idDrawVert *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant, const idPlane *src, const int count );
virtual void VPCALL Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count );
virtual void VPCALL Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count );
virtual void VPCALL Dot( float &dot, const float *src1, const float *src2, const int count );

Expand Down Expand Up @@ -143,9 +143,12 @@ class idSIMD_SSE : public idSIMD_MMX {
virtual void VPCALL MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] );
virtual void VPCALL MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples );

virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );
#endif

// Revelator: these work whether in gcc clang or msvc x86 or x64 (no inline assembly used)
virtual void VPCALL CullByFrustum( idDrawVert *verts, const int numVerts, const idPlane frustum[6], byte *pointCull, float epsilon );
virtual void VPCALL CullByFrustum2( idDrawVert *verts, const int numVerts, const idPlane frustum[6], unsigned short *pointCull, float epsilon );

};

#endif /* !__MATH_SIMD_SSE_H__ */
Loading