43#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
46#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
#if defined(USE_AVX_DYNAMIC) || defined(USE_EMM_DYNAMIC)
// Returns a reference to the flag that is only compiled into the
// *_DYNAMIC build configurations. The flag is a function-local static that
// starts out as true; callers may store to it through the returned reference.
// NOTE(review): presumably the dynamic dispatch wrappers read this flag to
// pick the SIMD or the scalar code path - confirm against the callers.
inline std::atomic<bool>& use_simd()
{
    // Brace-initialisation works for std::atomic in every language version
    // (copy-initialisation needs C++17 guaranteed elision).
    static std::atomic<bool> sUseSimd{ true };
    return sUseSimd;
}
#endif
#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
// Returns lane `index` of a 4 x double AVX register; lane 0 is the lowest
// 64 bits. `index` must be in [0, 3] - it is not range-checked.
//
// The lanes are spilled to a small aligned array instead of using the
// MSVC-only `.m256d_f64[]` member or the GCC/Clang-only vector subscript, so
// the same code compiles on every toolchain that provides <immintrin.h>.
inline double to_scalar(__m256d const& avxRegister, std::size_t index)
{
    alignas(32) double lanes[4];
    _mm256_store_pd(lanes, avxRegister);
    return lanes[index];
}
#endif
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
// Returns 32-bit lane `index` of an SSE2 integer register; lane 0 is the
// lowest 32 bits. `index` must be in [0, 3] - it is not range-checked.
//
// The register is spilled to a uint32_t array rather than subscripted:
// on GCC/Clang __m128i is a vector of two 64-bit integers, so
// `emmRegister[index]` would select 64-bit lanes (and be out of range for
// index 2 and 3), while `.m128i_u32[]` exists only on MSVC.
inline uint32_t to_scalar(__m128i const& emmRegister, std::size_t index)
{
    alignas(16) uint32_t lanes[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(lanes), emmRegister);
    return lanes[index];
}
#endif
#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
// AVX implementation of the 4-component dot product
//     x1 * x2 + y1 * y2 + z1 * z2 + w1 * w2.
// The four products are computed with one packed multiply and then summed
// lane by lane.
inline double avx_simd_fma_4d(
    double x1, double x2,
    double y1, double y2,
    double z1, double z2,
    double w1, double w2)
{
    // _mm256_setr_pd puts its first argument in lane 0 (_mm256_set_pd fills
    // from the highest lane down), so the lanes are x, y, z, w and the sum
    // below is accumulated in the same order as the scalar fallback.
    __m256d const lhs = _mm256_setr_pd(x1, y1, z1, w1);
    __m256d const rhs = _mm256_setr_pd(x2, y2, z2, w2);
    __m256d const ans = _mm256_mul_pd(lhs, rhs);
    return to_scalar(ans, 0) + to_scalar(ans, 1) + to_scalar(ans, 2) + to_scalar(ans, 3);
}
#endif
// Scalar fallback for the 4-component dot product: returns
//     x1 * x2 + y1 * y2 + z1 * z2 + w1 * w2
// evaluated left to right, without any SIMD instructions.
inline double fake_simd_fma_4d(
    double x1, double x2,
    double y1, double y2,
    double z1, double z2,
    double w1, double w2)
{
    return x1 * x2 + y1 * y2 + z1 * z2 + w1 * w2;
}
// simd_fma_4d: public name of the 4-component dot product.
//   USE_AVX          - alias for avx_simd_fma_4d (always SIMD).
//   USE_AVX_DYNAMIC  - function that picks SIMD or scalar at run time
//                      according to use_simd().
//   otherwise        - alias for fake_simd_fma_4d (always scalar).
#if defined(USE_AVX)
    #define simd_fma_4d avx_simd_fma_4d
#elif defined(USE_AVX_DYNAMIC)
inline double simd_fma_4d(
    double x1, double x2,
    double y1, double y2,
    double z1, double z2,
    double w1, double w2)
{
    if (use_simd())
        return avx_simd_fma_4d(x1, x2, y1, y2, z1, z2, w1, w2);
    return fake_simd_fma_4d(x1, x2, y1, y2, z1, z2, w1, w2);
}
#else
    #define simd_fma_4d fake_simd_fma_4d
#endif
#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
// AVX implementation of the 4-component element-wise product.
// Outputs: a = x1 * x2, b = y1 * y2, c = z1 * z2, d = w1 * w2.
inline void avx_simd_mul_4d(
    double x1, double x2,
    double y1, double y2,
    double z1, double z2,
    double w1, double w2,
    double& a, double& b, double& c, double& d)
{
    // _mm256_setr_pd puts its first argument in lane 0. With _mm256_set_pd
    // (which fills from the highest lane down) lane 0 held the w product, so
    // `a` received w1 * w2 and `d` received x1 * x2.
    __m256d const lhs = _mm256_setr_pd(x1, y1, z1, w1);
    __m256d const rhs = _mm256_setr_pd(x2, y2, z2, w2);
    __m256d const ans = _mm256_mul_pd(lhs, rhs);
    a = to_scalar(ans, 0);
    b = to_scalar(ans, 1);
    c = to_scalar(ans, 2);
    d = to_scalar(ans, 3);
}
#endif
// Scalar fallback for the 4-component element-wise product.
// Outputs: a = x1 * x2, b = y1 * y2, c = z1 * z2, d = w1 * w2.
inline void fake_simd_mul_4d(
    double x1, double x2,
    double y1, double y2,
    double z1, double z2,
    double w1, double w2,
    double& a, double& b, double& c, double& d)
{
    a = x1 * x2;
    b = y1 * y2;
    c = z1 * z2;
    d = w1 * w2;
}
// simd_mul_4d: public name of the 4-component element-wise product.
//   USE_AVX          - alias for avx_simd_mul_4d (always SIMD).
//   USE_AVX_DYNAMIC  - function that picks SIMD or scalar at run time
//                      according to use_simd().
//   otherwise        - alias for fake_simd_mul_4d (always scalar).
#if defined(USE_AVX)
    #define simd_mul_4d avx_simd_mul_4d
#elif defined(USE_AVX_DYNAMIC)
inline void simd_mul_4d(
    double x1, double x2,
    double y1, double y2,
    double z1, double z2,
    double w1, double w2,
    double& a, double& b, double& c, double& d)
{
    if (use_simd())
        avx_simd_mul_4d(x1, x2, y1, y2, z1, z2, w1, w2, a, b, c, d);
    else
        fake_simd_mul_4d(x1, x2, y1, y2, z1, z2, w1, w2, a, b, c, d);
}
#else
    #define simd_mul_4d fake_simd_mul_4d
#endif
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
namespace detail
{
    // Returns the calling thread's 128-bit generator state. The state is a
    // thread_local object, so every thread has its own copy; it is
    // zero-initialised until a seeding routine writes to it.
    inline __m128i& simd_rand_seed()
    {
        alignas(16) thread_local __m128i tSeed;
        return tSeed;
    }
}
#endif
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
// Seeds the calling thread's SSE2 generator state from `seed`.
// _mm_set_epi32 takes its arguments from the highest lane down, so the
// state becomes { lane3 = seed, lane2 = seed + 1, lane1 = seed,
// lane0 = seed + 1 }.
inline void emm_simd_srand(uint32_t seed)
{
    // The intrinsic takes int; make the (bit-preserving) conversion explicit.
    int const even = static_cast<int>(seed);
    int const odd = static_cast<int>(seed + 1);
    detail::simd_rand_seed() = _mm_set_epi32(even, odd, even, odd);
}
#endif
// simd_srand: public name of the seeding routine.
//   USE_EMM          - alias for emm_simd_srand (always SIMD).
//   USE_EMM_DYNAMIC  - function that picks SIMD or scalar at run time
//                      according to use_simd().
//   otherwise        - alias for fake_simd_srand (always scalar).
#if defined(USE_EMM)
    #define simd_srand emm_simd_srand
#elif defined(USE_EMM_DYNAMIC)
inline void simd_srand(uint32_t seed)
{
    if (use_simd())
        emm_simd_srand(seed);
    else
        fake_simd_srand(seed);
}
#else
    #define simd_srand fake_simd_srand
#endif
// Seeds the generator from a thread id: `seed` is hashed with
// std::hash<std::thread::id>, the hash is narrowed to uint32_t (upper bits
// are discarded where std::size_t is wider) and passed on to simd_srand.
211 simd_srand(
static_cast<uint32_t
>(std::hash<std::thread::id>{}(seed)));
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
// Returns the next 32-bit pseudo-random value for the calling thread.
//
// One SSE2 step advances all four 32-bit lanes of the thread's state
// (detail::simd_rand_seed()) as state = state * multiplier + increment, each
// lane with its own multiplier/increment pair. The four new lane values are
// buffered and handed out one per call; a new step runs only when the buffer
// is exhausted.
inline uint32_t emm_simd_rand()
{
    thread_local std::array<uint32_t, 4> result = {};
    thread_local std::size_t resultCounter = 4; // 4 == buffer exhausted
    if (resultCounter < 4)
        return result[resultCounter++];

    alignas(16) static const uint32_t mult[4] = { 214013, 17405, 214013, 69069 };
    alignas(16) static const uint32_t gadd[4] = { 2531011, 10395331, 13737667, 1 };
    alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 };

    __m128i const adder = _mm_load_si128(reinterpret_cast<__m128i const*>(gadd));
    __m128i multiplier = _mm_load_si128(reinterpret_cast<__m128i const*>(mult));
    __m128i const mod_mask = _mm_load_si128(reinterpret_cast<__m128i const*>(mask));

    __m128i& seed = detail::simd_rand_seed();

    // _mm_mul_epu32 only multiplies lanes 0 and 2 (giving two 64-bit
    // products), so a copy with neighbouring lanes swapped carries lanes 1
    // and 3 through a second multiply.
    __m128i cur_seed_split = _mm_shuffle_epi32(seed, _MM_SHUFFLE(2, 3, 0, 1));

    seed = _mm_mul_epu32(seed, multiplier);

    multiplier = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));
    cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);

    // Keep the low 32 bits of each 64-bit product.
    seed = _mm_and_si128(seed, mod_mask);

    cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);
    // Swap back so the second pass lands in lanes 1 and 3 again.
    cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));

    seed = _mm_or_si128(seed, cur_seed_split);
    seed = _mm_add_epi32(seed, adder);

    // Lane i of the new state becomes result[i].
    _mm_storeu_si128(reinterpret_cast<__m128i*>(result.data()), seed);

    // Hand out element 0 now and continue from element 1 next time, so no
    // value is returned twice and the buffer is never indexed at 4.
    resultCounter = 1;
    return result[0];
}
#endif
// Scalar fallback random source. Draws four values from std::rand() at a
// time into a per-thread buffer and hands them out one per call, so the
// values returned are exactly the std::rand() sequence, in order.
inline uint32_t fake_simd_rand()
{
    thread_local std::array<uint32_t, 4> result = {};
    thread_local std::size_t resultCounter = 4; // 4 == buffer exhausted
    if (resultCounter < 4)
        return result[resultCounter++];

    for (auto& value : result)
        value = static_cast<uint32_t>(std::rand());

    // Hand out element 0 now and continue from element 1 next time, so no
    // value is returned twice and the buffer is never indexed at 4.
    resultCounter = 1;
    return result[0];
}
// simd_rand: public name of the random source.
//   USE_EMM          - alias for emm_simd_rand (always SIMD).
//   USE_EMM_DYNAMIC  - function that picks SIMD or scalar at run time
//                      according to use_simd().
//   otherwise        - alias for fake_simd_rand (always scalar). This must
//                      be the generator, not the seeding routine
//                      fake_simd_srand, which takes an argument and returns
//                      nothing.
#if defined(USE_EMM)
    #define simd_rand emm_simd_rand
#elif defined(USE_EMM_DYNAMIC)
inline uint32_t simd_rand()
{
    if (use_simd())
        return emm_simd_rand();
    return fake_simd_rand();
}
#else
    #define simd_rand fake_simd_rand
#endif
// Bounded draw: takes the next simd_rand() value modulo aUpper (converted to
// uint32_t) and converts the remainder to T.
// NOTE(review): an aUpper that converts to 0 would make the modulo divide by
// zero - confirm callers never pass it.
288 template <
typename T>
291 return static_cast<T
>(
simd_rand() %
static_cast<uint32_t
>(aUpper));
// Bare signatures (no bodies, no terminating semicolons) of the four scalar
// fallback functions: fake_simd_fma_4d, fake_simd_srand, fake_simd_rand and
// fake_simd_mul_4d.
double fake_simd_fma_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2)
void fake_simd_srand(uint32_t seed)
uint32_t fake_simd_rand()
void fake_simd_mul_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2, double &a, double &b, double &c, double &d)