neoGFX
Cross-platform C++ app/game engine
Loading...
Searching...
No Matches
simd.hpp
Go to the documentation of this file.
1// simd.hpp
2/*
3 * Copyright (c) 2020 Leigh Johnston.
4 *
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * * Neither the name of Leigh Johnston nor the names of any
19 * other contributors to this software may be used to endorse or
20 * promote products derived from this software without specific prior
21 * written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
24 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
25 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
27 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 */
35
36#pragma once
37
38#include <neolib/neolib.hpp>
39#include <atomic>
40#include <array>
41#include <thread>
42#include <cstdlib>
43#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
44#include <immintrin.h>
45#endif
46#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
47#include <emmintrin.h>
48#endif
49
50namespace neolib
51{
#if defined(USE_AVX_DYNAMIC) || defined(USE_EMM_DYNAMIC)
    // Global runtime switch for the SIMD code paths: when the engine is built
    // for dynamic SIMD selection, callers may clear this flag to force the
    // scalar fallbacks. Function-local static gives thread-safe lazy init.
    inline std::atomic<bool>& use_simd()
    {
        static std::atomic<bool> sEnabled{ true };
        return sEnabled;
    }
#endif
59
#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
    // Extract one double lane from an AVX register; index 0 is the lowest
    // lane (i.e. the LAST argument passed to _mm256_set_pd).
    // MSVC exposes the lanes via the m256d_f64 union member; GCC/Clang allow
    // subscripting the vector type directly.
    inline double to_scalar(__m256d const& avxRegister, std::size_t index)
    {
#ifdef _WIN32
        return avxRegister.m256d_f64[index];
#else
        return avxRegister[index];
#endif
    }
#endif
70
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
    // Extract one 32-bit lane from an SSE2 integer register; index 0 is the
    // lowest lane. MSVC exposes the lanes via the m128i_u32 union member;
    // GCC/Clang allow subscripting the vector type directly (note that on
    // those compilers __m128i subscripts as 64-bit lanes — assumes callers
    // only use indices that agree across platforms; TODO confirm).
    inline uint32_t to_scalar(__m128i const& emmRegister, std::size_t index)
    {
#ifdef _WIN32
        return emmRegister.m128i_u32[index];
#else
        return emmRegister[index];
#endif
    }
#endif
81
#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
    // AVX implementation of a 4-component dot product: lane-wise multiply of
    // (x1,y1,z1,w1) by (x2,y2,z2,w2), then a scalar horizontal sum.
    // NOTE(review): despite the "fma" name no fused multiply-add intrinsic is
    // used (_mm256_mul_pd plus scalar adds). Also _mm256_set_pd stores its
    // first argument in the highest lane, so the sum below runs w,z,y,x —
    // the reverse of fake_simd_fma_4d's left-to-right order; results can
    // differ from the scalar path in the last bits of precision.
    inline double avx_simd_fma_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2)
    {
        alignas(32) __m256d lhs = _mm256_set_pd(x1, y1, z1, w1);
        alignas(32) __m256d rhs = _mm256_set_pd(x2, y2, z2, w2);
        alignas(32) __m256d ans = _mm256_mul_pd(lhs, rhs);
        return to_scalar(ans, 0) + to_scalar(ans, 1) + to_scalar(ans, 2) + to_scalar(ans, 3);
    }
#endif
91
92 inline double fake_simd_fma_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2)
93 {
94 return x1 * x2 + y1 * y2 + z1 * z2 + w1 * w2;
95 }
96
#if defined(USE_AVX)
    #define simd_fma_4d avx_simd_fma_4d
#elif defined(USE_AVX_DYNAMIC)
    // Dynamic dispatch for the 4D dot product: honour the runtime
    // use_simd() switch, falling back to the scalar implementation.
    inline double simd_fma_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2)
    {
        return use_simd() ?
            avx_simd_fma_4d(x1, x2, y1, y2, z1, z2, w1, w2) :
            fake_simd_fma_4d(x1, x2, y1, y2, z1, z2, w1, w2);
    }
#else
    #define simd_fma_4d fake_simd_fma_4d
#endif
110
#if defined(USE_AVX) || defined(USE_AVX_DYNAMIC)
    // AVX component-wise multiply of (x1,y1,z1,w1) by (x2,y2,z2,w2), writing
    // the four products to a..d in component order (a = x1*x2, ..., d = w1*w2).
    //
    // Bug fix: the original used _mm256_set_pd, which stores its FIRST
    // argument in the HIGHEST lane, and then read lanes back in ascending
    // order — so a..d received the products reversed (a = w1*w2, d = x1*x2),
    // disagreeing with fake_simd_mul_4d. _mm256_setr_pd stores arguments in
    // ascending lane order, making lane i the i-th component.
    inline void avx_simd_mul_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2, double& a, double& b, double& c, double& d)
    {
        alignas(32) __m256d lhs = _mm256_setr_pd(x1, y1, z1, w1);
        alignas(32) __m256d rhs = _mm256_setr_pd(x2, y2, z2, w2);
        alignas(32) __m256d ans = _mm256_mul_pd(lhs, rhs);
        a = to_scalar(ans, 0);
        b = to_scalar(ans, 1);
        c = to_scalar(ans, 2);
        d = to_scalar(ans, 3);
    }
#endif
123
124 inline void fake_simd_mul_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2, double& a, double& b, double& c, double& d)
125 {
126 a = x1 * x2;
127 b = y1 * y2;
128 c = z1 * z2;
129 d = w1 * w2;
130 }
131
#if defined(USE_AVX)
    #define simd_mul_4d avx_simd_mul_4d
#elif defined(USE_AVX_DYNAMIC)
    // Dynamic dispatch for the component-wise multiply: honour the runtime
    // use_simd() switch, falling back to the scalar implementation.
    inline void simd_mul_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2, double& a, double& b, double& c, double& d)
    {
        if (!use_simd())
        {
            fake_simd_mul_4d(x1, x2, y1, y2, z1, z2, w1, w2, a, b, c, d);
            return;
        }
        avx_simd_mul_4d(x1, x2, y1, y2, z1, z2, w1, w2, a, b, c, d);
    }
#else
    #define simd_mul_4d fake_simd_mul_4d
#endif
145
147 // The Software is provided "AS IS" and possibly with faults.
148 // Intel disclaims any and all warranties and guarantees, express, implied or
149 // otherwise, arising, with respect to the software delivered hereunder,
150 // including but not limited to the warranty of merchantability, the warranty
151 // of fitness for a particular purpose, and any warranty of non-infringement
152 // of the intellectual property rights of any third party.
153 // Intel neither assumes nor authorizes any person to assume for it any other
154 // liability. Customer will use the software at its own risk. Intel will not
155 // be liable to customer for any direct or indirect damages incurred in using
156 // the software. In no event will Intel be liable for loss of profits, loss of
157 // use, loss of data, business interruption, nor for punitive, incidental,
158 // consequential, or special damages of any kind, even if advised of
159 // the possibility of such damages.
160 //
161 // Copyright (c) 2003 Intel Corporation
162 //
163 // Third-party brands and names are the property of their respective owners
164 //
166 // Random Number Generation for SSE / SSE2
167 // Source File
168 // Version 0.1
169 // Author Kipp Owens, Rajiv Parikh
171
    namespace detail
    {
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
        // Per-thread 128-bit seed state for the SSE random number generator
        // below. thread_local gives each thread its own independent stream;
        // as an object with thread storage duration it starts zero-initialized,
        // so call simd_srand() first to obtain a useful sequence.
        inline __m128i& simd_rand_seed()
        {
            alignas(16) thread_local __m128i tSeed;
            return tSeed;
        }
#endif
    }
182
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
    // SSE implementation of simd_srand: initializes this thread's 128-bit
    // seed state from a 32-bit seed. seed and seed + 1 are interleaved across
    // the four lanes — presumably to give the four parallel generator streams
    // in emm_simd_rand() distinct starting values (matches Intel's rand_sse
    // sample credited above).
    inline void emm_simd_srand(uint32_t seed)
    {
        detail::simd_rand_seed() = _mm_set_epi32(seed, seed + 1, seed, seed + 1);
    }
#endif
189
190 inline void fake_simd_srand(uint32_t seed)
191 {
192 std::srand(seed);
193 }
194
#if defined(USE_EMM)
    #define simd_srand emm_simd_srand
#elif defined(USE_EMM_DYNAMIC)
    // Dynamic dispatch for seeding: honour the runtime use_simd() switch,
    // falling back to the C runtime PRNG.
    inline void simd_srand(uint32_t seed)
    {
        if (!use_simd())
        {
            fake_simd_srand(seed);
            return;
        }
        emm_simd_srand(seed);
    }
#else
    #define simd_srand fake_simd_srand
#endif
208
209 inline void simd_srand(std::thread::id seed)
210 {
211 simd_srand(static_cast<uint32_t>(std::hash<std::thread::id>{}(seed)));
212 }
213
#if defined(USE_EMM) || defined(USE_EMM_DYNAMIC)
    // SSE2 pseudo-random generator adapted from Intel's rand_sse sample (see
    // the notice above): four interleaved linear congruential generators are
    // advanced in parallel, producing four 32-bit values per step. Values are
    // cached per thread and handed out one at a time.
    //
    // Bug fix: after refilling the batch the original reset resultCounter to
    // 0 and returned result[0] WITHOUT advancing the counter, so the first
    // value of every batch of four was returned twice in a row. The counter
    // now advances past the value being returned. The unused sra_mask/masklo
    // (a leftover from Intel's ranged-rand variant) have also been removed.
    inline uint32_t emm_simd_rand()
    {
        thread_local std::array<uint32_t, 4> result = {};
        thread_local std::size_t resultCounter = 4;
        if (resultCounter < 4)
            return result[resultCounter++];
        alignas(16) __m128i cur_seed_split;
        alignas(16) __m128i multiplier;
        alignas(16) __m128i adder;
        alignas(16) __m128i mod_mask;
        alignas(16) __m128i ans;
        // Classic LCG multipliers and increments, one per stream.
        alignas(16) static const uint32_t mult[4] =
            { 214013, 17405, 214013, 69069 };
        alignas(16) static const uint32_t gadd[4] =
            { 2531011, 10395331, 13737667, 1 };
        alignas(16) static const uint32_t mask[4] =
            { 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 };

        adder = _mm_load_si128((__m128i*) gadd);
        multiplier = _mm_load_si128((__m128i*) mult);
        mod_mask = _mm_load_si128((__m128i*) mask);

        // _mm_mul_epu32 only multiplies lanes 0 and 2, so also process a
        // lane-swapped copy of the seed to cover lanes 1 and 3.
        cur_seed_split = _mm_shuffle_epi32(detail::simd_rand_seed(), _MM_SHUFFLE(2, 3, 0, 1));

        detail::simd_rand_seed() = _mm_mul_epu32(detail::simd_rand_seed(), multiplier);

        multiplier = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));
        cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);

        // Keep only the low 32 bits of each 64-bit product, recombine the two
        // halves into a single 4-lane state, then add the increments.
        detail::simd_rand_seed() = _mm_and_si128(detail::simd_rand_seed(), mod_mask);

        cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);
        cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));

        detail::simd_rand_seed() = _mm_or_si128(detail::simd_rand_seed(), cur_seed_split);
        detail::simd_rand_seed() = _mm_add_epi32(detail::simd_rand_seed(), adder);

        _mm_storeu_si128(&ans, detail::simd_rand_seed());
        result = { to_scalar(ans, 0), to_scalar(ans, 1), to_scalar(ans, 2), to_scalar(ans, 3) };
        resultCounter = 1;
        return result[0];
    }
#endif
262
263 inline uint32_t fake_simd_rand()
264 {
265 thread_local std::array<uint32_t, 4> result = {};
266 thread_local std::size_t resultCounter = 4;
267 if (resultCounter < 4)
268 return result[resultCounter++];
269 result = { static_cast<uint32_t>(std::rand()), static_cast<uint32_t>(std::rand()), static_cast<uint32_t>(std::rand()), static_cast<uint32_t>(std::rand()) };
270 resultCounter = 0;
271 return result[resultCounter];
272 }
273
#if defined(USE_EMM)
    #define simd_rand emm_simd_rand
#elif defined(USE_EMM_DYNAMIC)
    // Dynamic dispatch for the random generator: honour the runtime
    // use_simd() switch, falling back to the C runtime PRNG.
    inline uint32_t simd_rand()
    {
        if (use_simd())
            return emm_simd_rand();
        else
            return fake_simd_rand();
    }
#else
    // Bug fix: this previously expanded to fake_simd_srand — the SEEDING
    // function (void, takes a uint32_t) — so any non-EMM build using
    // simd_rand() would fail to compile or misbehave.
    #define simd_rand fake_simd_rand
#endif
287
288 template <typename T>
289 inline T simd_rand(T aUpper)
290 {
291 return static_cast<T>(simd_rand() % static_cast<uint32_t>(aUpper));
292 }
293}
double fake_simd_fma_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2)
Definition simd.hpp:92
void fake_simd_srand(uint32_t seed)
Definition simd.hpp:190
uint32_t fake_simd_rand()
Definition simd.hpp:263
void fake_simd_mul_4d(double x1, double x2, double y1, double y2, double z1, double z2, double w1, double w2, double &a, double &b, double &c, double &d)
Definition simd.hpp:124
#define simd_fma_4d
Definition simd.hpp:108
#define simd_rand
Definition simd.hpp:285
#define simd_mul_4d
Definition simd.hpp:143
#define simd_srand
Definition simd.hpp:206