$darkmode
Eigen-unsupported  5.0.1-dev
TensorDeviceGpu.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
11 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 #include "../../../../../Eigen/src/Core/util/GpuHipCudaDefines.inc"
17 
18 namespace Eigen {
19 
20 static const int kGpuScratchSize = 1024;
21 
22 // This defines an interface that GPUDevice can take to use
23 // HIP / CUDA streams underneath.
24 class StreamInterface {
25  public:
26  virtual ~StreamInterface() {}
27 
28  virtual const gpuStream_t& stream() const = 0;
29  virtual const gpuDeviceProp_t& deviceProperties() const = 0;
30 
31  // Allocate memory on the actual device where the computation will run
32  virtual void* allocate(size_t num_bytes) const = 0;
33  virtual void deallocate(void* buffer) const = 0;
34 
35  // Return a scratchpad buffer of size 1k
36  virtual void* scratchpad() const = 0;
37 
38  // Return a semaphore. The semaphore is initially initialized to 0, and
39  // each kernel using it is responsible for resetting to 0 upon completion
40  // to maintain the invariant that the semaphore is always equal to 0 upon
41  // each kernel start.
42  virtual unsigned int* semaphore() const = 0;
43 };
44 
45 class GpuDeviceProperties {
46  public:
47  GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}
48 
49  ~GpuDeviceProperties() {
50  if (device_properties_) {
51  delete[] device_properties_;
52  }
53  }
54 
55  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; }
56 
57  EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; }
58 
59  void initialize() {
60  if (!initialized_) {
61  // Attempts to ensure proper behavior in the case of multiple threads
62  // calling this function simultaneously. This would be trivial to
63  // implement if we could use std::mutex, but unfortunately mutex don't
64  // compile with nvcc, so we resort to atomics and thread fences instead.
65  // Note that if the caller uses a compiler that doesn't support c++11 we
66  // can't ensure that the initialization is thread safe.
67  if (first_.exchange(false)) {
68  // We're the first thread to reach this point.
69  int num_devices;
70  gpuError_t status = gpuGetDeviceCount(&num_devices);
71  if (status != gpuSuccess) {
72  std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
73  gpu_assert(status == gpuSuccess);
74  }
75  device_properties_ = new gpuDeviceProp_t[num_devices];
76  for (int i = 0; i < num_devices; ++i) {
77  status = gpuGetDeviceProperties(&device_properties_[i], i);
78  if (status != gpuSuccess) {
79  std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
80  gpu_assert(status == gpuSuccess);
81  }
82  }
83 
84  std::atomic_thread_fence(std::memory_order_release);
85  initialized_ = true;
86  } else {
87  // Wait for the other thread to inititialize the properties.
88  while (!initialized_) {
89  std::atomic_thread_fence(std::memory_order_acquire);
90  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
91  }
92  }
93  }
94  }
95 
96  private:
97  volatile bool initialized_;
98  std::atomic<bool> first_;
99  gpuDeviceProp_t* device_properties_;
100 };
101 
102 EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
103  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
104  if (!deviceProperties->isInitialized()) {
105  deviceProperties->initialize();
106  }
107  return *deviceProperties;
108 }
109 
110 EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
111  return GetGpuDeviceProperties().get(device);
112 }
113 
114 static const gpuStream_t default_stream = gpuStreamDefault;
115 
116 class GpuStreamDevice : public StreamInterface {
117  public:
118  // Use the default stream on the current device
119  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
120  gpuError_t status = gpuGetDevice(&device_);
121  if (status != gpuSuccess) {
122  std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl;
123  gpu_assert(status == gpuSuccess);
124  }
125  }
126  // Use the default stream on the specified device
127  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
128  // Use the specified stream. Note that it's the
129  // caller responsibility to ensure that the stream can run on
130  // the specified device. If no device is specified the code
131  // assumes that the stream is associated to the current gpu device.
132  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
133  : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
134  if (device < 0) {
135  gpuError_t status = gpuGetDevice(&device_);
136  if (status != gpuSuccess) {
137  std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl;
138  gpu_assert(status == gpuSuccess);
139  }
140  } else {
141  int num_devices;
142  gpuError_t err = gpuGetDeviceCount(&num_devices);
143  EIGEN_UNUSED_VARIABLE(err)
144  gpu_assert(err == gpuSuccess);
145  gpu_assert(device < num_devices);
146  device_ = device;
147  }
148  }
149 
150  virtual ~GpuStreamDevice() {
151  if (scratch_) {
152  deallocate(scratch_);
153  }
154  }
155 
156  const gpuStream_t& stream() const { return *stream_; }
157  const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); }
158  virtual void* allocate(size_t num_bytes) const {
159  gpuError_t err = gpuSetDevice(device_);
160  EIGEN_UNUSED_VARIABLE(err)
161  gpu_assert(err == gpuSuccess);
162  void* result;
163  err = gpuMalloc(&result, num_bytes);
164  gpu_assert(err == gpuSuccess);
165  gpu_assert(result != NULL);
166  return result;
167  }
168  virtual void deallocate(void* buffer) const {
169  gpuError_t err = gpuSetDevice(device_);
170  EIGEN_UNUSED_VARIABLE(err)
171  gpu_assert(err == gpuSuccess);
172  gpu_assert(buffer != NULL);
173  err = gpuFree(buffer);
174  gpu_assert(err == gpuSuccess);
175  }
176 
177  virtual void* scratchpad() const {
178  if (scratch_ == NULL) {
179  scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
180  }
181  return scratch_;
182  }
183 
184  virtual unsigned int* semaphore() const {
185  if (semaphore_ == NULL) {
186  char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
187  semaphore_ = reinterpret_cast<unsigned int*>(scratch);
188  gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
189  EIGEN_UNUSED_VARIABLE(err)
190  gpu_assert(err == gpuSuccess);
191  }
192  return semaphore_;
193  }
194 
195  private:
196  const gpuStream_t* stream_;
197  int device_;
198  mutable void* scratch_;
199  mutable unsigned int* semaphore_;
200 };
201 
202 struct GpuDevice {
203  // The StreamInterface is not owned: the caller is
204  // responsible for its initialization and eventual destruction.
205  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); }
206  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
207  eigen_assert(stream);
208  }
209  // TODO(bsteiner): This is an internal API, we should not expose it.
210  EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); }
211 
212  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); }
213 
214  EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); }
215 
216  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); }
217 
218  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); }
219 
220  template <typename Type>
221  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
222  return data;
223  }
224 
225  EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }
226 
227  EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); }
228 
229  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
230 #ifndef EIGEN_GPU_COMPILE_PHASE
231  gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream());
232  EIGEN_UNUSED_VARIABLE(err)
233  gpu_assert(err == gpuSuccess);
234 #else
235  EIGEN_UNUSED_VARIABLE(dst);
236  EIGEN_UNUSED_VARIABLE(src);
237  EIGEN_UNUSED_VARIABLE(n);
238  eigen_assert(false && "The default device should be used instead to generate kernel code");
239 #endif
240  }
241 
242  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
243  gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
244  EIGEN_UNUSED_VARIABLE(err)
245  gpu_assert(err == gpuSuccess);
246  }
247 
248  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
249  gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
250  EIGEN_UNUSED_VARIABLE(err)
251  gpu_assert(err == gpuSuccess);
252  }
253 
254  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
255 #ifndef EIGEN_GPU_COMPILE_PHASE
256  gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
257  EIGEN_UNUSED_VARIABLE(err)
258  gpu_assert(err == gpuSuccess);
259 #else
260  EIGEN_UNUSED_VARIABLE(buffer)
261  EIGEN_UNUSED_VARIABLE(c)
262  EIGEN_UNUSED_VARIABLE(n)
263  eigen_assert(false && "The default device should be used instead to generate kernel code");
264 #endif
265  }
266 
267  template <typename T>
268  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
269 #ifndef EIGEN_GPU_COMPILE_PHASE
270  const size_t count = end - begin;
271  // Split value into bytes and run memset with stride.
272  const int value_size = sizeof(value);
273  char* buffer = (char*)begin;
274  char* value_bytes = (char*)(&value);
275  gpuError_t err;
276  EIGEN_UNUSED_VARIABLE(err)
277 
278  // If all value bytes are equal, then a single memset can be much faster.
279  bool use_single_memset = true;
280  for (int i = 1; i < value_size; ++i) {
281  if (value_bytes[i] != value_bytes[0]) {
282  use_single_memset = false;
283  }
284  }
285 
286  if (use_single_memset) {
287  err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
288  gpu_assert(err == gpuSuccess);
289  } else {
290  for (int b = 0; b < value_size; ++b) {
291  err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream());
292  gpu_assert(err == gpuSuccess);
293  }
294  }
295 #else
296  EIGEN_UNUSED_VARIABLE(begin)
297  EIGEN_UNUSED_VARIABLE(end)
298  EIGEN_UNUSED_VARIABLE(value)
299  eigen_assert(false && "The default device should be used instead to generate kernel code");
300 #endif
301  }
302 
303  EIGEN_STRONG_INLINE size_t numThreads() const {
304  // FIXME
305  return 32;
306  }
307 
308  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
309  // FIXME
310  return 48 * 1024;
311  }
312 
313  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
314  // We won't try to take advantage of the l2 cache for the time being, and
315  // there is no l3 cache on hip/cuda devices.
316  return firstLevelCacheSize();
317  }
318 
319  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
320 #ifndef EIGEN_GPU_COMPILE_PHASE
321  gpuError_t err = gpuStreamSynchronize(stream_->stream());
322  if (err != gpuSuccess) {
323  std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl;
324  gpu_assert(err == gpuSuccess);
325  }
326 #else
327  gpu_assert(false && "The default device should be used instead to generate kernel code");
328 #endif
329  }
330 
331  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; }
332  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; }
333  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
334  return stream_->deviceProperties().maxThreadsPerMultiProcessor;
335  }
336  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
337  return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
338  }
339  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; }
340  EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; }
341 
342  EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; }
343 
344  // This function checks if the GPU runtime recorded an error for the
345  // underlying stream device.
346  inline bool ok() const {
347 #ifdef EIGEN_GPUCC
348  gpuError_t error = gpuStreamQuery(stream_->stream());
349  return (error == gpuSuccess) || (error == gpuErrorNotReady);
350 #else
351  return false;
352 #endif
353  }
354 
355  private:
356  const StreamInterface* stream_;
357  int max_blocks_;
358 };
359 
360 #if defined(EIGEN_HIPCC)
361 
362 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
363  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
364  gpu_assert(hipGetLastError() == hipSuccess);
365 
366 #else
367 
368 #define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
369  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
370  gpu_assert(cudaGetLastError() == cudaSuccess);
371 
372 #endif
373 
374 // FIXME: Should be device and kernel specific.
375 #ifdef EIGEN_GPUCC
376 static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
377 #ifndef EIGEN_GPU_COMPILE_PHASE
378  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
379  EIGEN_UNUSED_VARIABLE(status)
380  gpu_assert(status == gpuSuccess);
381 #else
382  EIGEN_UNUSED_VARIABLE(config)
383 #endif
384 }
385 #endif
386 
387 } // end namespace Eigen
388 
389 // undefine all the gpu* macros we defined at the beginning of the file
390 #include "../../../../../Eigen/src/Core/util/GpuHipCudaUndefines.inc"
391 
392 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
static constexpr lastp1_t end
Namespace containing all symbols from the Eigen library.