Photon Engine 2.0.0-beta
A physically based renderer.
Loading...
Searching...
No Matches
TBvhSimdComputingContext.h
Go to the documentation of this file.
1#pragma once
2
5#include "Math/TVector3.h"
6
7#include <Common/primitive_type.h>
8#include <Common/config.h>
9#include <Common/compiler.h>
10#include <Common/utility.h>
11#include <Common/memory.h>
12#include <Common/assertion.h>
13
14#include <cstddef>
15#include <type_traits>
16#include <array>
17#include <limits>
18#include <concepts>
19#include <utility>
20
21#if PH_USE_SIMD
22#if PH_COMPILER_IS_MSVC
23#include <immintrin.h>
24#else
25#include <x86intrin.h>
26#endif
27#endif
28
29namespace ph::math
30{
31
namespace detail::bvh
{

/*! @brief Maps a batch size `N` to a SIMD float vector type of that width.
The unspecialized template maps to `void`, meaning no suitable SIMD type is
available for that width (callers are expected to check before use).
*/
template<std::size_t N>
struct TFloatN
{
	using Type = void;
};

#if PH_USE_SSE
// 4-wide single-precision batch, available with SSE.
template<>
struct TFloatN<4>
{
	using Type = __m128;
};
#endif

#if PH_USE_AVX
// 8-wide single-precision batch, available with AVX.
template<>
struct TFloatN<8>
{
	using Type = __m256;
};
#endif

}// end namespace detail::bvh
58
62template<std::size_t N, typename Index>
64{
65 // Developer note: Try not to mix SSE and AVX intrinsics, see https://stackoverflow.com/questions/41303780/why-is-this-sse-code-6-times-slower-without-vzeroupper-on-skylake.
66
67 static_assert(N >= 2);
68 static_assert(std::is_unsigned_v<Index>);
69
70#if PH_USE_AVX && PH_USE_SSE
71 inline static constexpr std::size_t BATCH_SIZE = N <= 4 ? 4 : 8;
72#elif PH_USE_SSE
73 inline static constexpr std::size_t BATCH_SIZE = 4;
74#elif PH_USE_AVX
75 inline static constexpr std::size_t BATCH_SIZE = 8;
76#else
77 inline static constexpr std::size_t BATCH_SIZE = 4;
78#endif
79
80 // The batched float type
82
83 // Number of batches
84 inline static constexpr std::size_t B = N % BATCH_SIZE ? N / BATCH_SIZE + 1 : N / BATCH_SIZE;
85
86 // Ensure proper alignment
87#if PH_USE_AVX || PH_USE_SSE
88 static_assert(alignof(BFloat) >= sizeof(BFloat));
89#endif
90
91public:
97 static constexpr bool isSupported()
98 {
99 constexpr bool hasSseSupport = PH_USE_SSE4_1 && std::is_same_v<real, float32>;
100 constexpr bool hasAvxSupport = PH_USE_AVX && std::is_same_v<real, float32>;
101 return hasSseSupport || hasAvxSupport;
102 }
103
104public:
105 [[PH_ALWAYS_INLINE]]
107 const TVector3<float32>& segmentOrigin,
108 const TVector3<float32>& rcpSegmentDir)
109 {
110 for(std::size_t di = 0; di < 3; ++di)
111 {
112 if constexpr(BATCH_SIZE == 4)
113 {
114#if PH_USE_SSE
115 m_segmentOrigins[di] = _mm_set1_ps(segmentOrigin[di]);
116 m_rcpSegmentDirs[di] = _mm_set1_ps(rcpSegmentDir[di]);
117#endif
118 }
119 else if constexpr(BATCH_SIZE == 8)
120 {
121#if PH_USE_AVX
122 for(std::size_t di = 0; di < 3; ++di)
123 {
124 m_segmentOrigins[di] = _mm256_set1_ps(segmentOrigin[di]);
125 m_rcpSegmentDirs[di] = _mm256_set1_ps(rcpSegmentDir[di]);
126 }
127#endif
128 }
129 }
130 }
131
	/*! @brief Cache the child AABBs of `node` for subsequent batched intersections.
	Stores the AABB extents axis-by-axis (SoA layout), one SIMD batch per
	`BATCH_SIZE` children.
	*/
	[[PH_ALWAYS_INLINE]]
	void setNode(const TWideBvhNode<N, Index>& node) requires std::is_same_v<real, float32>
	{
		// Unused batch slots (when `N` is not a multiple of `BATCH_SIZE`) are padded
		// with empty AABBs so they can never register a hit.
		const auto& emptyAABB = AABB3D::makeEmpty();

		for(std::size_t di = 0; di < 3; ++di)
		{
			for(std::size_t ci = 0; ci < N; ci += BATCH_SIZE)
			{
				// `N` not divisible by `BATCH_SIZE`: an aligned batch load could read past
				// the node's vertex arrays, so gather each child's component individually.
				if constexpr(N % BATCH_SIZE)
				{
					const auto& aabb0 = node.getAABB(ci);
					const auto& aabb1 = ci + 1 < N ? node.getAABB(ci + 1) : emptyAABB;
					const auto& aabb2 = ci + 2 < N ? node.getAABB(ci + 2) : emptyAABB;
					const auto& aabb3 = ci + 3 < N ? node.getAABB(ci + 3) : emptyAABB;
					// aabb4..aabb7 are only consumed by the 8-wide (AVX) path below.
					const auto& aabb4 = ci + 4 < N ? node.getAABB(ci + 4) : emptyAABB;
					const auto& aabb5 = ci + 5 < N ? node.getAABB(ci + 5) : emptyAABB;
					const auto& aabb6 = ci + 6 < N ? node.getAABB(ci + 6) : emptyAABB;
					const auto& aabb7 = ci + 7 < N ? node.getAABB(ci + 7) : emptyAABB;

					if constexpr(BATCH_SIZE == 4)
					{
#if PH_USE_SSE
						m_aabbMins[di][ci / 4] = _mm_setr_ps(
							aabb0.getMinVertex()[di],
							aabb1.getMinVertex()[di],
							aabb2.getMinVertex()[di],
							aabb3.getMinVertex()[di]);
						m_aabbMaxs[di][ci / 4] = _mm_setr_ps(
							aabb0.getMaxVertex()[di],
							aabb1.getMaxVertex()[di],
							aabb2.getMaxVertex()[di],
							aabb3.getMaxVertex()[di]);
#endif
					}
					else if constexpr(BATCH_SIZE == 8)
					{
#if PH_USE_AVX
						m_aabbMins[di][ci / 8] = _mm256_setr_ps(
							aabb0.getMinVertex()[di],
							aabb1.getMinVertex()[di],
							aabb2.getMinVertex()[di],
							aabb3.getMinVertex()[di],
							aabb4.getMinVertex()[di],
							aabb5.getMinVertex()[di],
							aabb6.getMinVertex()[di],
							aabb7.getMinVertex()[di]);
						m_aabbMaxs[di][ci / 8] = _mm256_setr_ps(
							aabb0.getMaxVertex()[di],
							aabb1.getMaxVertex()[di],
							aabb2.getMaxVertex()[di],
							aabb3.getMaxVertex()[di],
							aabb4.getMaxVertex()[di],
							aabb5.getMaxVertex()[di],
							aabb6.getMaxVertex()[di],
							aabb7.getMaxVertex()[di]);
#endif
					}
				}
				// `N` divisible by `BATCH_SIZE`: the node's SoA vertex arrays can be loaded
				// a whole aligned batch at a time.
				else
				{
					if constexpr(BATCH_SIZE == 4)
					{
#if PH_USE_SSE
						// Aligned load requires at least 16-byte alignment of the SoA view.
						PH_ASSERT_GE(node.SOA_VIEW_ALIGNMENT, 16);
						m_aabbMins[di][ci / 4] = _mm_load_ps(&(node.getMinVerticesOnAxis(di)[ci]));
						m_aabbMaxs[di][ci / 4] = _mm_load_ps(&(node.getMaxVerticesOnAxis(di)[ci]));
#endif
					}
					else if constexpr(BATCH_SIZE == 8)
					{
#if PH_USE_AVX
						// Aligned load requires at least 32-byte alignment of the SoA view.
						PH_ASSERT_GE(node.SOA_VIEW_ALIGNMENT, 32);
						m_aabbMins[di][ci / 8] = _mm256_load_ps(&(node.getMinVerticesOnAxis(di)[ci]));
						m_aabbMaxs[di][ci / 8] = _mm256_load_ps(&(node.getMaxVerticesOnAxis(di)[ci]));
#endif
					}
				}
			}
		}
	}
213
214 template<bool IS_ROBUST = true>
215 [[PH_ALWAYS_INLINE]]
216 void intersectAabbVolumes(const float32 segmentMinT, const float32 segmentMaxT)
217 {
218 // The implementation is similar to `TAABB3D<T>::intersectVolumeTavian()` and
219 // `TAABB3D<T>::intersectVolumeRobust()`
220
221 if constexpr(BATCH_SIZE == 4)
222#if PH_USE_SSE
223 {
224 m_aabbMinTs = make_array<__m128, B>(_mm_set1_ps(segmentMinT));
225 m_aabbMaxTs = make_array<__m128, B>(_mm_set1_ps(segmentMaxT));
226#endif
227 }
228 else if(BATCH_SIZE == 8)
229 {
230#if PH_USE_AVX
231 m_aabbMinTs = make_array<__m256, B>(_mm256_set1_ps(segmentMinT));
232 m_aabbMaxTs = make_array<__m256, B>(_mm256_set1_ps(segmentMaxT));
233#endif
234 }
235
236 for(std::size_t di = 0; di < 3; ++di)
237 {
238 for(std::size_t bi = 0; bi < B; ++bi)
239 {
240 if constexpr(BATCH_SIZE == 4)
241 {
242#if PH_USE_SSE
243 const __m128 t1 =
244 _mm_mul_ps(_mm_sub_ps(m_aabbMins[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
245 const __m128 t2 =
246 _mm_mul_ps(_mm_sub_ps(m_aabbMaxs[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
247
248 const __m128 minT = _mm_min_ps(t1, t2);
249 const __m128 maxT = _mm_max_ps(t1, t2);
250
251 // Safe max: fallback to `segmentMinT` in case of NaN
252 m_aabbMinTs[bi] = _mm_max_ps(minT, m_aabbMinTs[bi]);
253
254 // Safe min: fallback to `segmentMaxT` in case of NaN
255 m_aabbMaxTs[bi] = _mm_min_ps(maxT, m_aabbMaxTs[bi]);
256#endif
257 }
258 else if constexpr(BATCH_SIZE == 8)
259 {
260#if PH_USE_AVX
261 const __m256 t1 =
262 _mm256_mul_ps(_mm256_sub_ps(m_aabbMins[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
263 const __m256 t2 =
264 _mm256_mul_ps(_mm256_sub_ps(m_aabbMaxs[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
265
266 const __m256 minT = _mm256_min_ps(t1, t2);
267 const __m256 maxT = _mm256_max_ps(t1, t2);
268
269 // Safe max: fallback to `segmentMinT` in case of NaN
270 m_aabbMinTs[bi] = _mm256_max_ps(minT, m_aabbMinTs[bi]);
271
272 // Safe min: fallback to `segmentMaxT` in case of NaN
273 m_aabbMaxTs[bi] = _mm256_min_ps(maxT, m_aabbMaxTs[bi]);
274#endif
275 }
276
277 // The following links have more information on the behavior of MINPS and MAXPS
278 // (they all satisfy the safe requirement)
279 // https://www.felixcloutier.com/x86/minps
280 // https://tavianator.com/2015/ray_box_nan.html
281 }
282 }
283
284 if constexpr(IS_ROBUST)
285 {
286 constexpr auto multiplier = std::numeric_limits<float>::epsilon() * 2 + 1;
287
288 for(std::size_t bi = 0; bi < B; ++bi)
289 {
290 if constexpr(BATCH_SIZE == 4)
291 {
292#if PH_USE_SSE
293 m_aabbMaxTs[bi] = _mm_mul_ps(m_aabbMaxTs[bi], _mm_set1_ps(multiplier));
294#endif
295 }
296 else if constexpr(BATCH_SIZE == 8)
297 {
298#if PH_USE_AVX
299 m_aabbMaxTs[bi] = _mm256_mul_ps(m_aabbMaxTs[bi], _mm256_set1_ps(multiplier));
300#endif
301 }
302 }
303 }
304 }
305
309 template<std::unsigned_integral MaskType = uint32>
310 [[PH_ALWAYS_INLINE]]
311 auto getIntersectResultAsMask() const
312 -> MaskType
313 {
314 static_assert(N <= sizeof_in_bits<MaskType>(), "Need more bits for `MaskType`.");
315
316 MaskType hitMask = 0;
317
318 for(std::size_t bi = 0; bi < B; ++bi)
319 {
320 if constexpr(BATCH_SIZE == 4)
321 {
322#if PH_USE_SSE
323 hitMask <<= 4;
324 hitMask |= _mm_movemask_ps(_mm_cmple_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi]));
325#endif
326 }
327 else if constexpr(BATCH_SIZE == 8)
328 {
329#if PH_USE_AVX
330 hitMask <<= 8;
331 hitMask |= _mm256_movemask_ps(_mm_cmple_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi]));
332#endif
333 }
334 }
335
336 return hitMask;
337 }
338
339 [[PH_ALWAYS_INLINE]]
340 auto getIntersectResultAsMinTsOr(const float32 missValue) const
341 -> TAlignedArray<float32, B * BATCH_SIZE, sizeof(float32) * BATCH_SIZE>
342 {
343 TAlignedArray<float32, B * BATCH_SIZE, sizeof(float32) * BATCH_SIZE> results;
344
345 // Perform `value = aabbMinT <= aabbMaxTs ? aabbMaxTs : missValue`
346 for(std::size_t bi = 0; bi < B; ++bi)
347 {
348 if constexpr(BATCH_SIZE == 4)
349 {
350#if PH_USE_SSE4_1
351 const __m128 cmpleMask = _mm_cmple_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi]);
352 const __m128 values = _mm_blendv_ps(_mm_set1_ps(missValue), m_aabbMinTs[bi], cmpleMask);
353
354 _mm_store_ps(&(results[bi * 4]), values);
355#endif
356 }
357 else if constexpr(BATCH_SIZE == 8)
359#if PH_USE_AVX
360 const __m256 cmpleMask = _mm256_cmp_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi], _CMP_LE_OQ);
361 const __m256 values = _mm256_blendv_ps(_mm256_set1_ps(missValue), m_aabbMinTs[bi], cmpleMask);
362
363 _mm256_store_ps(&(results[bi * 8]), values);
364#endif
365 }
366 }
367
368#if !PH_USE_SSE4_1 && !PH_USE_AVX
369 results.fill(missValue);
370#endif
371
372 return results;
373 }
374
#if PH_COMPILER_IS_GNU
#pragma GCC diagnostic push

// g++ 14 will emit "-Wignored-attributes" warnings for `BFloat`, see
// https://stackoverflow.com/questions/41676311/implication-of-gcc-warning-ignoring-attributes-on-template-argument-wignored.
// Ignoring for now as tests are passed.
#if __GNUC__ == 14
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#endif

#if PH_USE_AVX || PH_USE_SSE
private:
	// Per-axis (x, y, z) child AABB extents, one `BFloat` batch per `BATCH_SIZE` children
	// (SoA layout, filled by `setNode()`).
	std::array<std::array<BFloat, B>, 3> m_aabbMins;
	std::array<std::array<BFloat, B>, 3> m_aabbMaxs;

	// Per-axis segment origin and reciprocal direction, broadcast across a whole batch
	// (filled by `setSegment()`).
	std::array<BFloat, 3> m_segmentOrigins;
	std::array<BFloat, 3> m_rcpSegmentDirs;

	// Running parametric hit interval per batch, produced by `intersectAabbVolumes()`.
	std::array<BFloat, B> m_aabbMinTs;
	std::array<BFloat, B> m_aabbMaxTs;
#endif

#if PH_COMPILER_IS_GNU
#pragma GCC diagnostic pop
#endif
402};
403
404}// end namespace ph::math
static TAABB3D makeEmpty()
Definition TAABB3D.ipp:15
A SIMD computing context for BVH. Use isSupported() to check the availability of the required hardwar...
Definition TBvhSimdComputingContext.h:64
void setSegment(const TVector3< float32 > &segmentOrigin, const TVector3< float32 > &rcpSegmentDir)
Definition TBvhSimdComputingContext.h:106
void setNode(const TWideBvhNode< N, Index > &node)
Definition TBvhSimdComputingContext.h:133
static constexpr bool isSupported()
Definition TBvhSimdComputingContext.h:97
void intersectAabbVolumes(const float32 segmentMinT, const float32 segmentMaxT)
Definition TBvhSimdComputingContext.h:216
Represents a 3-D vector.
Definition TVector3.h:17
Definition TWideBvhNode.h:26
Math functions and utilities.
Definition TransformInfo.h:10
class ph::math::TBvhSimdComputingContext MaskType
class ph::math::TBvhSimdComputingContext * BATCH_SIZE
Definition TBvhSimdComputingContext.h:37
void Type
Definition TBvhSimdComputingContext.h:38