$darkmode
Eigen-unsupported  5.0.1-dev
TensorRandom.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13 
14 // IWYU pragma: private
15 #include "./InternalHeaderCheck.h"
16 
17 namespace Eigen {
18 namespace internal {
19 
20 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {
21 #if defined(EIGEN_GPU_COMPILE_PHASE)
22  // We don't support 3d kernels since we currently only use 1 and
23  // 2d kernels.
24  gpu_assert(threadIdx.z == 0);
25  return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26 #else
27  // Rely on Eigen's random implementation.
28  return random<uint64_t>();
29 #endif
30 }
31 
32 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
33  // TODO: Unify with the implementation in the non blocking thread pool.
34  uint64_t current = *state;
35  // Update the internal state
36  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37  // Generate the random output (using the PCG-XSH-RS scheme)
38  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39 }
40 
41 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
42  seed = seed ? seed : get_random_seed();
43  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44 }
45 
46 template <typename T>
47 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
48  unsigned rnd = PCG_XSH_RS_generator(state, stream);
49  return static_cast<T>(rnd);
50 }
51 
52 template <>
53 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool RandomToTypeUniform<bool>(uint64_t* state, uint64_t stream) {
54  unsigned rnd = PCG_XSH_RS_generator(state, stream);
55  return (rnd & 0x1) != 0;
56 }
57 
58 template <>
59 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
60  // Generate 10 random bits for the mantissa, merge with exponent.
61  unsigned rnd = PCG_XSH_RS_generator(state, stream);
62  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
63  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
64  // Return the final result
65  return result - Eigen::half(1.0f);
66 }
67 
68 template <>
69 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state,
70  uint64_t stream) {
71  // Generate 7 random bits for the mantissa, merge with exponent.
72  unsigned rnd = PCG_XSH_RS_generator(state, stream);
73  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
74  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
75  // Return the final result
76  return result - Eigen::bfloat16(1.0f);
77 }
78 
79 template <>
80 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
81  typedef union {
82  uint32_t raw;
83  float fp;
84  } internal;
85  internal result;
86  // Generate 23 random bits for the mantissa mantissa
87  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
88  result.raw = rnd & 0x7fffffu;
89  // Set the exponent
90  result.raw |= (static_cast<uint32_t>(127) << 23);
91  // Return the final result
92  return result.fp - 1.0f;
93 }
94 
95 template <>
96 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
97  typedef union {
98  uint64_t raw;
99  double dp;
100  } internal;
101  internal result;
102  result.raw = 0;
103  // Generate 52 random bits for the mantissa
104  // First generate the upper 20 bits
105  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
106  // The generate the lower 32 bits
107  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
108  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
109  // Set the exponent
110  result.raw |= (static_cast<uint64_t>(1023) << 52);
111  // Return the final result
112  return result.dp - 1.0;
113 }
114 
115 template <>
116 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
117  uint64_t stream) {
118  return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
119 }
120 template <>
121 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
122  uint64_t stream) {
123  return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
124 }
125 
126 template <typename T>
127 class UniformRandomGenerator {
128  public:
129  static constexpr bool PacketAccess = true;
130 
131  // Uses the given "seed" if non-zero, otherwise uses a random seed.
132  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
133  m_state = PCG_XSH_RS_state(seed);
134 #ifdef EIGEN_USE_SYCL
135  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
136  // Therefore, we need two steps to initializate the m_state.
137  // IN SYCL, the constructor of the functor is s called on the CPU
138  // and we get the clock seed here from the CPU. However, This seed is
139  // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
140  // and only available on the Operator() function (which is called on the GPU).
141  // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
142  // thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each
143  // thread adds the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the
144  // construction similar to CUDA Therefore, the thread Id injection is not available at this stage.
145  // However when the operator() is called the thread ID will be available. So inside the operator,
146  // we add the thrreadID, BlockId,... (which is equivalent of i)
147  // to the seed and construct the unique m_state per thead similar to cuda.
148  m_exec_once = false;
149 #endif
150  }
151  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
152  m_state = other.m_state;
153 #ifdef EIGEN_USE_SYCL
154  m_exec_once = other.m_exec_once;
155 #endif
156  }
157 
158  template <typename Index>
159  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
160 #ifdef EIGEN_USE_SYCL
161  if (!m_exec_once) {
162  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
163  // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
164  m_state += (i * 6364136223846793005ULL);
165  m_exec_once = true;
166  }
167 #endif
168  T result = RandomToTypeUniform<T>(&m_state, i);
169  return result;
170  }
171 
172  template <typename Packet, typename Index>
173  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
174  const int packetSize = internal::unpacket_traits<Packet>::size;
175  EIGEN_ALIGN_MAX T values[packetSize];
176 #ifdef EIGEN_USE_SYCL
177  if (!m_exec_once) {
178  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
179  m_state += (i * 6364136223846793005ULL);
180  m_exec_once = true;
181  }
182 #endif
183  EIGEN_UNROLL_LOOP
184  for (int j = 0; j < packetSize; ++j) {
185  values[j] = RandomToTypeUniform<T>(&m_state, i);
186  }
187  return internal::pload<Packet>(values);
188  }
189 
190  private:
191  mutable uint64_t m_state;
192 #ifdef EIGEN_USE_SYCL
193  mutable bool m_exec_once;
194 #endif
195 };
196 
197 template <typename Scalar>
198 struct functor_traits<UniformRandomGenerator<Scalar> > {
199  enum {
200  // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
201  Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
202  PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
203  };
204 };
205 
206 template <typename T>
207 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
208  // Use the ratio of uniform method to generate numbers following a normal
209  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
210  // details.
211  T u, v, q;
212  do {
213  u = RandomToTypeUniform<T>(state, stream);
214  v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
215  const T x = u - T(0.449871);
216  const T y = numext::abs(v) + T(0.386595);
217  q = x * x + y * (T(0.196) * y - T(0.25472) * x);
218  } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));
219 
220  return v / u;
221 }
222 
223 template <>
224 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
225  uint64_t stream) {
226  return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
227 }
228 template <>
229 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
230  uint64_t stream) {
231  return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
232 }
233 
234 template <typename T>
235 class NormalRandomGenerator {
236  public:
237  static constexpr bool PacketAccess = true;
238 
239  // Uses the given "seed" if non-zero, otherwise uses a random seed.
240  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
241  m_state = PCG_XSH_RS_state(seed);
242 #ifdef EIGEN_USE_SYCL
243  // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
244  // Therefore, we need two steps to initializate the m_state.
245  // IN SYCL, the constructor of the functor is s called on the CPU
246  // and we get the clock seed here from the CPU. However, This seed is
247  // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
248  // and only available on the Operator() function (which is called on the GPU).
249  // Therefore, the thread Id injection is not available at this stage. However when the operator()
250  // is called the thread ID will be available. So inside the operator,
251  // we add the thrreadID, BlockId,... (which is equivalent of i)
252  // to the seed and construct the unique m_state per thead similar to cuda.
253  m_exec_once = false;
254 #endif
255  }
256  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
257  m_state = other.m_state;
258 #ifdef EIGEN_USE_SYCL
259  m_exec_once = other.m_exec_once;
260 #endif
261  }
262 
263  template <typename Index>
264  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
265 #ifdef EIGEN_USE_SYCL
266  if (!m_exec_once) {
267  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
268  m_state += (i * 6364136223846793005ULL);
269  m_exec_once = true;
270  }
271 #endif
272  T result = RandomToTypeNormal<T>(&m_state, i);
273  return result;
274  }
275 
276  template <typename Packet, typename Index>
277  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
278  const int packetSize = internal::unpacket_traits<Packet>::size;
279  EIGEN_ALIGN_MAX T values[packetSize];
280 #ifdef EIGEN_USE_SYCL
281  if (!m_exec_once) {
282  // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
283  m_state += (i * 6364136223846793005ULL);
284  m_exec_once = true;
285  }
286 #endif
287  EIGEN_UNROLL_LOOP
288  for (int j = 0; j < packetSize; ++j) {
289  values[j] = RandomToTypeNormal<T>(&m_state, i);
290  }
291  return internal::pload<Packet>(values);
292  }
293 
294  private:
295  mutable uint64_t m_state;
296 #ifdef EIGEN_USE_SYCL
297  mutable bool m_exec_once;
298 #endif
299 };
300 
301 template <typename Scalar>
302 struct functor_traits<NormalRandomGenerator<Scalar> > {
303  enum {
304  // On average, we need to generate about 3 random numbers
305  // 15 mul, 8 add, 1.5 logs
306  Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
307  8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
308  PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
309  };
310 };
311 
312 } // end namespace internal
313 } // end namespace Eigen
314 
315 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index