Photon Engine 2.0.0-beta
A physically based renderer.
Loading...
Searching...
No Matches
TBvhSimdComputingContext.h
Go to the documentation of this file.
1#pragma once
2
5#include "Math/TVector3.h"
6
7#include <Common/primitive_type.h>
8#include <Common/config.h>
9#include <Common/compiler.h>
10#include <Common/utility.h>
11#include <Common/memory.h>
12#include <Common/assertion.h>
13
14#include <cstddef>
15#include <type_traits>
16#include <array>
17#include <limits>
18#include <concepts>
19#include <utility>
20
21#if PH_USE_SIMD
22#if PH_COMPILER_IS_MSVC
23#include <immintrin.h>
24#else
25#include <x86intrin.h>
26#endif
27#endif
28
29namespace ph::math
30{
31
namespace detail::bvh
{

/*! @brief Maps a batch size `N` to a SIMD float vector type of that width.
The unspecialized template maps to `void`, meaning no suitable SIMD type is
available for that width (callers are expected to check before use).
*/
template<std::size_t N>
struct TFloatN
{
	using Type = void;
};

#if PH_USE_SSE
// 4-wide single-precision batch, available with SSE.
template<>
struct TFloatN<4>
{
	using Type = __m128;
};
#endif

#if PH_USE_AVX
// 8-wide single-precision batch, available with AVX.
template<>
struct TFloatN<8>
{
	using Type = __m256;
};
#endif

}// end namespace detail::bvh
58
62template<std::size_t N, typename Index>
64{
65 // Developer note: Try not to mix SSE and AVX intrinsics, see https://stackoverflow.com/questions/41303780/why-is-this-sse-code-6-times-slower-without-vzeroupper-on-skylake.
66
67 static_assert(N >= 2);
68 static_assert(std::is_unsigned_v<Index>);
69
70#if PH_USE_AVX && PH_USE_SSE
71 inline static constexpr std::size_t BATCH_SIZE = N <= 4 ? 4 : 8;
72#elif PH_USE_SSE
73 inline static constexpr std::size_t BATCH_SIZE = 4;
74#elif PH_USE_AVX
75 inline static constexpr std::size_t BATCH_SIZE = 8;
76#else
77 inline static constexpr std::size_t BATCH_SIZE = 4;
78#endif
79
80 // The batched float type
82
83 // Number of batches
84 inline static constexpr std::size_t B = N % BATCH_SIZE ? N / BATCH_SIZE + 1 : N / BATCH_SIZE;
85
86 // Ensure proper alignment
87#if PH_USE_AVX || PH_USE_SSE
88 static_assert(alignof(BFloat) >= sizeof(BFloat));
89#endif
90
91public:
97 static constexpr bool isSupported()
98 {
99 constexpr bool hasSseSupport = PH_USE_SSE4_1 && std::is_same_v<real, float32>;
100 constexpr bool hasAvxSupport = PH_USE_AVX && std::is_same_v<real, float32>;
101 return hasSseSupport || hasAvxSupport;
102 }
103
104public:
105 [[PH_ALWAYS_INLINE]]
107 const TVector3<float32>& segmentOrigin,
108 const TVector3<float32>& rcpSegmentDir)
109 {
110 for(std::size_t di = 0; di < 3; ++di)
111 {
112 if constexpr(BATCH_SIZE == 4)
113 {
114#if PH_USE_SSE
115 m_segmentOrigins[di] = _mm_set1_ps(segmentOrigin[di]);
116 m_rcpSegmentDirs[di] = _mm_set1_ps(rcpSegmentDir[di]);
117#endif
118 }
119 else if constexpr(BATCH_SIZE == 8)
120 {
121#if PH_USE_AVX
122 for(std::size_t di = 0; di < 3; ++di)
123 {
124 m_segmentOrigins[di] = _mm256_set1_ps(segmentOrigin[di]);
125 m_rcpSegmentDirs[di] = _mm256_set1_ps(rcpSegmentDir[di]);
126 }
127#endif
128 }
129 }
130 }
131
	/*! @brief Cache the child AABBs of `node` for subsequent batched intersections.
	Stores the AABB extents axis-by-axis (SoA layout), one SIMD batch per
	`BATCH_SIZE` children.
	*/
	[[PH_ALWAYS_INLINE]]
	void setNode(const TWideBvhNode<N, Index>& node) requires std::is_same_v<real, float32>
	{
		// Unused batch slots (when `N` is not a multiple of `BATCH_SIZE`) are padded
		// with empty AABBs so they can never register a hit.
		const auto& emptyAABB = AABB3D::makeEmpty();

		for(std::size_t di = 0; di < 3; ++di)
		{
			for(std::size_t ci = 0; ci < N; ci += BATCH_SIZE)
			{
				// `N` not divisible by `BATCH_SIZE`: an aligned batch load could read past
				// the node's vertex arrays, so gather each child's component individually.
				if constexpr(N % BATCH_SIZE)
				{
					const auto& aabb0 = node.getAABB(ci);
					const auto& aabb1 = ci + 1 < N ? node.getAABB(ci + 1) : emptyAABB;
					const auto& aabb2 = ci + 2 < N ? node.getAABB(ci + 2) : emptyAABB;
					const auto& aabb3 = ci + 3 < N ? node.getAABB(ci + 3) : emptyAABB;
					// aabb4..aabb7 are only consumed by the 8-wide (AVX) path below.
					const auto& aabb4 = ci + 4 < N ? node.getAABB(ci + 4) : emptyAABB;
					const auto& aabb5 = ci + 5 < N ? node.getAABB(ci + 5) : emptyAABB;
					const auto& aabb6 = ci + 6 < N ? node.getAABB(ci + 6) : emptyAABB;
					const auto& aabb7 = ci + 7 < N ? node.getAABB(ci + 7) : emptyAABB;

					if constexpr(BATCH_SIZE == 4)
					{
#if PH_USE_SSE
						m_aabbMins[di][ci / 4] = _mm_setr_ps(
							aabb0.getMinVertex()[di],
							aabb1.getMinVertex()[di],
							aabb2.getMinVertex()[di],
							aabb3.getMinVertex()[di]);
						m_aabbMaxs[di][ci / 4] = _mm_setr_ps(
							aabb0.getMaxVertex()[di],
							aabb1.getMaxVertex()[di],
							aabb2.getMaxVertex()[di],
							aabb3.getMaxVertex()[di]);
#endif
					}
					else if constexpr(BATCH_SIZE == 8)
					{
#if PH_USE_AVX
						m_aabbMins[di][ci / 8] = _mm256_setr_ps(
							aabb0.getMinVertex()[di],
							aabb1.getMinVertex()[di],
							aabb2.getMinVertex()[di],
							aabb3.getMinVertex()[di],
							aabb4.getMinVertex()[di],
							aabb5.getMinVertex()[di],
							aabb6.getMinVertex()[di],
							aabb7.getMinVertex()[di]);
						m_aabbMaxs[di][ci / 8] = _mm256_setr_ps(
							aabb0.getMaxVertex()[di],
							aabb1.getMaxVertex()[di],
							aabb2.getMaxVertex()[di],
							aabb3.getMaxVertex()[di],
							aabb4.getMaxVertex()[di],
							aabb5.getMaxVertex()[di],
							aabb6.getMaxVertex()[di],
							aabb7.getMaxVertex()[di]);
#endif
					}
				}
				// `N` divisible by `BATCH_SIZE`: the node's SoA vertex arrays can be loaded
				// a whole aligned batch at a time.
				else
				{
					if constexpr(BATCH_SIZE == 4)
					{
#if PH_USE_SSE
						// Aligned load requires at least 16-byte alignment of the SoA view.
						PH_ASSERT_GE(node.SOA_VIEW_ALIGNMENT, 16);
						m_aabbMins[di][ci / 4] = _mm_load_ps(&(node.getMinVerticesOnAxis(di)[ci]));
						m_aabbMaxs[di][ci / 4] = _mm_load_ps(&(node.getMaxVerticesOnAxis(di)[ci]));
#endif
					}
					else if constexpr(BATCH_SIZE == 8)
					{
#if PH_USE_AVX
						// Aligned load requires at least 32-byte alignment of the SoA view.
						PH_ASSERT_GE(node.SOA_VIEW_ALIGNMENT, 32);
						m_aabbMins[di][ci / 8] = _mm256_load_ps(&(node.getMinVerticesOnAxis(di)[ci]));
						m_aabbMaxs[di][ci / 8] = _mm256_load_ps(&(node.getMaxVerticesOnAxis(di)[ci]));
#endif
					}
				}
			}
		}
	}
213
214 template<bool IS_ROBUST = true>
215 [[PH_ALWAYS_INLINE]]
216 void intersectAabbVolumes(const float32 segmentMinT, const float32 segmentMaxT)
217 {
218 // The implementation is similar to `TAABB3D<T>::intersectVolumeTavian()` and
219 // `TAABB3D<T>::intersectVolumeRobust()`
220
221 if constexpr(BATCH_SIZE == 4)
222#if PH_USE_SSE
223 {
224 m_aabbMinTs = make_array<__m128, B>(_mm_set1_ps(segmentMinT));
225 m_aabbMaxTs = make_array<__m128, B>(_mm_set1_ps(segmentMaxT));
226#endif
227 }
228 else if(BATCH_SIZE == 8)
229 {
230#if PH_USE_AVX
231 m_aabbMinTs = make_array<__m256, B>(_mm256_set1_ps(segmentMinT));
232 m_aabbMaxTs = make_array<__m256, B>(_mm256_set1_ps(segmentMaxT));
233#endif
234 }
235
236 for(std::size_t di = 0; di < 3; ++di)
237 {
238 for(std::size_t bi = 0; bi < B; ++bi)
239 {
240 if constexpr(BATCH_SIZE == 4)
241 {
242#if PH_USE_SSE
243 const __m128 t1 =
244 _mm_mul_ps(_mm_sub_ps(m_aabbMins[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
245 const __m128 t2 =
246 _mm_mul_ps(_mm_sub_ps(m_aabbMaxs[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
247
248 const __m128 minT = _mm_min_ps(t1, t2);
249 const __m128 maxT = _mm_max_ps(t1, t2);
250
251 // Safe max: fallback to `segmentMinT` in case of NaN
252 m_aabbMinTs[bi] = _mm_max_ps(minT, m_aabbMinTs[bi]);
253
254 // Safe min: fallback to `segmentMaxT` in case of NaN
255 m_aabbMaxTs[bi] = _mm_min_ps(maxT, m_aabbMaxTs[bi]);
256#endif
257 }
258 else if constexpr(BATCH_SIZE == 8)
259 {
260#if PH_USE_AVX
261 const __m256 t1 =
262 _mm256_mul_ps(_mm256_sub_ps(m_aabbMins[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
263 const __m256 t2 =
264 _mm256_mul_ps(_mm256_sub_ps(m_aabbMaxs[di][bi], m_segmentOrigins[di]), m_rcpSegmentDirs[di]);
265
266 const __m256 minT = _mm256_min_ps(t1, t2);
267 const __m256 maxT = _mm256_max_ps(t1, t2);
268
269 // Safe max: fallback to `segmentMinT` in case of NaN
270 m_aabbMinTs[bi] = _mm256_max_ps(minT, m_aabbMinTs[bi]);
271
272 // Safe min: fallback to `segmentMaxT` in case of NaN
273 m_aabbMaxTs[bi] = _mm256_min_ps(maxT, m_aabbMaxTs[bi]);
274#endif
275 }
276
277 // The following links have more information on the behavior of MINPS and MAXPS
278 // (they all satisfy the safe requirement)
279 // https://www.felixcloutier.com/x86/minps
280 // https://tavianator.com/2015/ray_box_nan.html
281 }
282 }
283
284 if constexpr(IS_ROBUST)
285 {
286 constexpr auto multiplier = std::numeric_limits<float>::epsilon() * 2 + 1;
287
288 for(std::size_t bi = 0; bi < B; ++bi)
289 {
290 if constexpr(BATCH_SIZE == 4)
291 {
292#if PH_USE_SSE
293 m_aabbMaxTs[bi] = _mm_mul_ps(m_aabbMaxTs[bi], _mm_set1_ps(multiplier));
294#endif
295 }
296 else if constexpr(BATCH_SIZE == 8)
297 {
298#if PH_USE_AVX
299 m_aabbMaxTs[bi] = _mm256_mul_ps(m_aabbMaxTs[bi], _mm256_set1_ps(multiplier));
300#endif
301 }
302 }
303 }
304 }
305
309 template<std::unsigned_integral MaskType = uint32>
310 [[PH_ALWAYS_INLINE]]
311 auto getIntersectResultAsMask() const
312 -> MaskType
313 {
314 static_assert(N <= sizeof_in_bits<MaskType>(), "Need more bits for `MaskType`.");
315
316 MaskType hitMask = 0;
317
318 for(std::size_t bi = 0; bi < B; ++bi)
319 {
320 if constexpr(BATCH_SIZE == 4)
321 {
322#if PH_USE_SSE
323 hitMask <<= 4;
324 hitMask |= _mm_movemask_ps(_mm_cmple_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi]));
325#endif
326 }
327 else if constexpr(BATCH_SIZE == 8)
328 {
329#if PH_USE_AVX
330 hitMask <<= 8;
331 hitMask |= _mm256_movemask_ps(_mm_cmple_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi]));
332#endif
333 }
334 }
335
336 return hitMask;
337 }
338
339 [[PH_ALWAYS_INLINE]]
340 auto getIntersectResultAsMinTsOr(const float32 missValue) const
341 -> TAlignedArray<float32, B * BATCH_SIZE, sizeof(float32) * BATCH_SIZE>
342 {
343 TAlignedArray<float32, B * BATCH_SIZE, sizeof(float32) * BATCH_SIZE> results;
344
345 // Perform `value = aabbMinT <= aabbMaxTs ? aabbMaxTs : missValue`
346 for(std::size_t bi = 0; bi < B; ++bi)
347 {
348 if constexpr(BATCH_SIZE == 4)
349 {
350#if PH_USE_SSE4_1
351 const __m128 cmpleMask = _mm_cmple_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi]);
352 const __m128 values = _mm_blendv_ps(_mm_set1_ps(missValue), m_aabbMinTs[bi], cmpleMask);
353
354 _mm_store_ps(&(results[bi * 4]), values);
355#endif
356 }
357 else if constexpr(BATCH_SIZE == 8)
359#if PH_USE_AVX
360 const __m256 cmpleMask = _mm256_cmp_ps(m_aabbMinTs[bi], m_aabbMaxTs[bi], _CMP_LE_OQ);
361 const __m256 values = _mm256_blendv_ps(_mm256_set1_ps(missValue), m_aabbMinTs[bi], cmpleMask);
362
363 _mm256_store_ps(&(results[bi * 8]), values);
364#endif
365 }
366 }
367
368#if !PH_USE_SSE4_1 && !PH_USE_AVX
369 results.fill(missValue);
370#endif
371
372 return results;
373 }
374
#if PH_COMPILER_IS_GNU
#pragma GCC diagnostic push

// g++ 14 will emit "-Wignored-attributes" warnings for `BFloat`, see
// https://stackoverflow.com/questions/41676311/implication-of-gcc-warning-ignoring-attributes-on-template-argument-wignored.
// Ignoring for now as tests are passed.
#if __GNUC__ == 14
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#endif

#if PH_USE_AVX || PH_USE_SSE
private:
	// Per-axis (x, y, z) child AABB extents, one `BFloat` batch per `BATCH_SIZE` children
	// (SoA layout, filled by `setNode()`).
	std::array<std::array<BFloat, B>, 3> m_aabbMins;
	std::array<std::array<BFloat, B>, 3> m_aabbMaxs;

	// Per-axis segment origin and reciprocal direction, broadcast across a whole batch
	// (filled by `setSegment()`).
	std::array<BFloat, 3> m_segmentOrigins;
	std::array<BFloat, 3> m_rcpSegmentDirs;

	// Running parametric hit interval per batch, produced by `intersectAabbVolumes()`.
	std::array<BFloat, B> m_aabbMinTs;
	std::array<BFloat, B> m_aabbMaxTs;
#endif

#if PH_COMPILER_IS_GNU
#pragma GCC diagnostic pop
#endif
402};
403
404}// end namespace ph::math
static TAABB3D makeEmpty()
Definition TAABB3D.ipp:15
A SIMD computing context for BVH. Use isSupported() to check the availability of the required hardwar...
Definition TBvhSimdComputingContext.h:64
void setSegment(const TVector3< float32 > &segmentOrigin, const TVector3< float32 > &rcpSegmentDir)
Definition TBvhSimdComputingContext.h:106
void setNode(const TWideBvhNode< N, Index > &node)
Definition TBvhSimdComputingContext.h:133
static constexpr bool isSupported()
Definition TBvhSimdComputingContext.h:97
void intersectAabbVolumes(const float32 segmentMinT, const float32 segmentMaxT)
Definition TBvhSimdComputingContext.h:216
Represents a 3-D vector.
Definition TVector3.h:17
Definition TWideBvhNode.h:26
Math functions and utilities.
Definition TransformInfo.h:10
class ph::math::TBvhSimdComputingContext MaskType
class ph::math::TBvhSimdComputingContext * BATCH_SIZE
Definition TBvhSimdComputingContext.h:37
void Type
Definition TBvhSimdComputingContext.h:38