$darkmode
Eigen  5.0.1-dev
GeneralMatrixMatrix.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
11 #define EIGEN_GENERAL_MATRIX_MATRIX_H
12 
13 // IWYU pragma: private
14 #include "../InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 namespace internal {
19 
20 template <typename LhsScalar_, typename RhsScalar_>
21 class level3_blocking;
22 
23 /* Specialization for a row-major destination matrix => simple transposition of the product */
24 template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
25  int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride>
26 struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder,
27  ConjugateRhs, RowMajor, ResInnerStride> {
28  typedef gebp_traits<RhsScalar, LhsScalar> Traits;
29 
30  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
31  static EIGEN_STRONG_INLINE void run(Index rows, Index cols, Index depth, const LhsScalar* lhs, Index lhsStride,
32  const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr,
33  Index resStride, ResScalar alpha, level3_blocking<RhsScalar, LhsScalar>& blocking,
34  GemmParallelInfo<Index>* info = 0) {
35  // transpose the product such that the result is column major
36  general_matrix_matrix_product<Index, RhsScalar, RhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateRhs,
37  LhsScalar, LhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateLhs, ColMajor,
38  ResInnerStride>::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr,
39  resStride, alpha, blocking, info);
40  }
41 };
42 
43 /* Specialization for a col-major destination matrix
44  * => Blocking algorithm following Goto's paper */
45 template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
46  int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride>
47 struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder,
48  ConjugateRhs, ColMajor, ResInnerStride> {
49  typedef gebp_traits<LhsScalar, RhsScalar> Traits;
50 
51  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
52  static void run(Index rows, Index cols, Index depth, const LhsScalar* lhs_, Index lhsStride, const RhsScalar* rhs_,
53  Index rhsStride, ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha,
54  level3_blocking<LhsScalar, RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) {
55  typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
56  typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
57  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
58  LhsMapper lhs(lhs_, lhsStride);
59  RhsMapper rhs(rhs_, rhsStride);
60  ResMapper res(res_, resStride, resIncr);
61 
62  Index kc = blocking.kc(); // cache block size along the K direction
63  Index mc = (std::min)(rows, blocking.mc()); // cache block size along the M direction
64  Index nc = (std::min)(cols, blocking.nc()); // cache block size along the N direction
65 
66  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
67  LhsStorageOrder>
68  pack_lhs;
69  gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
70  gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
71 
72 #if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
73  if (info) {
74  // this is the parallel version!
75  int tid = info->logical_thread_id;
76  int threads = info->num_threads;
77 
78  LhsScalar* blockA = blocking.blockA();
79  eigen_internal_assert(blockA != 0);
80 
81  std::size_t sizeB = kc * nc;
82  ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
83 
84  // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
85  for (Index k = 0; k < depth; k += kc) {
86  const Index actual_kc = (std::min)(k + kc, depth) - k; // => rows of B', and cols of the A'
87 
88  // In order to reduce the chance that a thread has to wait for the other,
89  // let's start by packing B'.
90  pack_rhs(blockB, rhs.getSubMapper(k, 0), actual_kc, nc);
91 
92  // Pack A_k to A' in a parallel fashion:
93  // each thread packs the sub block A_k,i to A'_i where i is the thread id.
94 
95  // However, before copying to A'_i, we have to make sure that no other thread is still using it,
96  // i.e., we test that info->task_info[tid].users equals 0.
97  // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to
98  // use it.
99  while (info->task_info[tid].users != 0) {
100  std::this_thread::yield();
101  }
102  info->task_info[tid].users = threads;
103 
104  pack_lhs(blockA + info->task_info[tid].lhs_start * actual_kc,
105  lhs.getSubMapper(info->task_info[tid].lhs_start, k), actual_kc, info->task_info[tid].lhs_length);
106 
107  // Notify the other threads that the part A'_i is ready to go.
108  info->task_info[tid].sync = k;
109 
110  // Computes C_i += A' * B' per A'_i
111  for (int shift = 0; shift < threads; ++shift) {
112  int i = (tid + shift) % threads;
113 
114  // At this point we have to make sure that A'_i has been updated by the thread i,
115  // we use testAndSetOrdered to mimic a volatile access.
116  // However, no need to wait for the B' part which has been updated by the current thread!
117  if (shift > 0) {
118  while (info->task_info[i].sync != k) {
119  std::this_thread::yield();
120  }
121  }
122 
123  gebp(res.getSubMapper(info->task_info[i].lhs_start, 0), blockA + info->task_info[i].lhs_start * actual_kc,
124  blockB, info->task_info[i].lhs_length, actual_kc, nc, alpha);
125  }
126 
127  // Then keep going as usual with the remaining B'
128  for (Index j = nc; j < cols; j += nc) {
129  const Index actual_nc = (std::min)(j + nc, cols) - j;
130 
131  // pack B_k,j to B'
132  pack_rhs(blockB, rhs.getSubMapper(k, j), actual_kc, actual_nc);
133 
134  // C_j += A' * B'
135  gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
136  }
137 
138  // Release all the sub blocks A'_i of A' for the current thread,
139  // i.e., we simply decrement the number of users by 1
140  for (Index i = 0; i < threads; ++i) info->task_info[i].users -= 1;
141  }
142  } else
143 #endif // defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
144  {
145  EIGEN_UNUSED_VARIABLE(info);
146 
147  // this is the sequential version!
148  std::size_t sizeA = kc * mc;
149  std::size_t sizeB = kc * nc;
150 
151  ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
152  ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
153 
154  const bool pack_rhs_once = mc != rows && kc == depth && nc == cols;
155 
156  // For each horizontal panel of the rhs, and corresponding panel of the lhs...
157  for (Index i2 = 0; i2 < rows; i2 += mc) {
158  const Index actual_mc = (std::min)(i2 + mc, rows) - i2;
159 
160  for (Index k2 = 0; k2 < depth; k2 += kc) {
161  const Index actual_kc = (std::min)(k2 + kc, depth) - k2;
162 
163  // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
164  // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
165  // Note that this panel will be read as many times as the number of blocks in the rhs's
166  // horizontal panel which is, in practice, a very low number.
167  pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
168 
169  // For each kc x nc block of the rhs's horizontal panel...
170  for (Index j2 = 0; j2 < cols; j2 += nc) {
171  const Index actual_nc = (std::min)(j2 + nc, cols) - j2;
172 
173  // We pack the rhs's block into a sequential chunk of memory (L2 caching)
174  // Note that this block will be read a very high number of times, which is equal to the number of
175  // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
176  if ((!pack_rhs_once) || i2 == 0) pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc);
177 
178  // Everything is packed, we can now call the panel * block kernel:
179  gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
180  }
181  }
182  }
183  }
184  }
185 };
186 
187 /*********************************************************************************
188  * Specialization of generic_product_impl for "large" GEMM, i.e.,
189  * implementation of the high level wrapper to general_matrix_matrix_product
190  **********************************************************************************/
191 
192 template <typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest,
193  typename BlockingType>
194 struct gemm_functor {
195  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
196  : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {}
197 
198  void initParallelSession(Index num_threads) const {
199  m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
200  m_blocking.allocateA();
201  }
202 
203  void operator()(Index row, Index rows, Index col = 0, Index cols = -1, GemmParallelInfo<Index>* info = 0) const {
204  if (cols == -1) cols = m_rhs.cols();
205 
206  Gemm::run(rows, cols, m_lhs.cols(), &m_lhs.coeffRef(row, 0), m_lhs.outerStride(), &m_rhs.coeffRef(0, col),
207  m_rhs.outerStride(), (Scalar*)&(m_dest.coeffRef(row, col)), m_dest.innerStride(), m_dest.outerStride(),
208  m_actualAlpha, m_blocking, info);
209  }
210 
211  typedef typename Gemm::Traits Traits;
212 
213  protected:
214  const Lhs& m_lhs;
215  const Rhs& m_rhs;
216  Dest& m_dest;
217  Scalar m_actualAlpha;
218  BlockingType& m_blocking;
219 };
220 
221 template <int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth,
222  int KcFactor = 1, bool FiniteAtCompileTime = MaxRows != Dynamic && MaxCols != Dynamic && MaxDepth != Dynamic>
223 class gemm_blocking_space;
224 
225 template <typename LhsScalar_, typename RhsScalar_>
226 class level3_blocking {
227  typedef LhsScalar_ LhsScalar;
228  typedef RhsScalar_ RhsScalar;
229 
230  protected:
231  LhsScalar* m_blockA;
232  RhsScalar* m_blockB;
233 
234  Index m_mc;
235  Index m_nc;
236  Index m_kc;
237 
238  public:
239  level3_blocking() : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {}
240 
241  inline Index mc() const { return m_mc; }
242  inline Index nc() const { return m_nc; }
243  inline Index kc() const { return m_kc; }
244 
245  inline LhsScalar* blockA() { return m_blockA; }
246  inline RhsScalar* blockB() { return m_blockB; }
247 };
248 
249 template <int StorageOrder, typename LhsScalar_, typename RhsScalar_, int MaxRows, int MaxCols, int MaxDepth,
250  int KcFactor>
251 class gemm_blocking_space<StorageOrder, LhsScalar_, RhsScalar_, MaxRows, MaxCols, MaxDepth, KcFactor,
252  true /* == FiniteAtCompileTime */>
253  : public level3_blocking<std::conditional_t<StorageOrder == RowMajor, RhsScalar_, LhsScalar_>,
254  std::conditional_t<StorageOrder == RowMajor, LhsScalar_, RhsScalar_>> {
255  enum {
256  Transpose = StorageOrder == RowMajor,
257  ActualRows = Transpose ? MaxCols : MaxRows,
258  ActualCols = Transpose ? MaxRows : MaxCols
259  };
260  typedef std::conditional_t<Transpose, RhsScalar_, LhsScalar_> LhsScalar;
261  typedef std::conditional_t<Transpose, LhsScalar_, RhsScalar_> RhsScalar;
262  enum { SizeA = ActualRows * MaxDepth, SizeB = ActualCols * MaxDepth };
263 
264 #if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
265  EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
266  EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
267 #else
268  EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES - 1];
269  EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES - 1];
270 #endif
271 
272  public:
273  gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/,
274  bool /*full_rows = false*/) {
275  this->m_mc = ActualRows;
276  this->m_nc = ActualCols;
277  this->m_kc = MaxDepth;
278 #if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
279  this->m_blockA = m_staticA;
280  this->m_blockB = m_staticB;
281 #else
282  this->m_blockA = reinterpret_cast<LhsScalar*>((std::uintptr_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES - 1)) &
283  ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1));
284  this->m_blockB = reinterpret_cast<RhsScalar*>((std::uintptr_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES - 1)) &
285  ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1));
286 #endif
287  }
288 
289  void initParallel(Index, Index, Index, Index) {}
290 
291  inline void allocateA() {}
292  inline void allocateB() {}
293  inline void allocateAll() {}
294 };
295 
296 template <int StorageOrder, typename LhsScalar_, typename RhsScalar_, int MaxRows, int MaxCols, int MaxDepth,
297  int KcFactor>
298 class gemm_blocking_space<StorageOrder, LhsScalar_, RhsScalar_, MaxRows, MaxCols, MaxDepth, KcFactor, false>
299  : public level3_blocking<std::conditional_t<StorageOrder == RowMajor, RhsScalar_, LhsScalar_>,
300  std::conditional_t<StorageOrder == RowMajor, LhsScalar_, RhsScalar_>> {
301  enum { Transpose = StorageOrder == RowMajor };
302  typedef std::conditional_t<Transpose, RhsScalar_, LhsScalar_> LhsScalar;
303  typedef std::conditional_t<Transpose, LhsScalar_, RhsScalar_> RhsScalar;
304 
305  Index m_sizeA;
306  Index m_sizeB;
307 
308  public:
309  gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) {
310  this->m_mc = Transpose ? cols : rows;
311  this->m_nc = Transpose ? rows : cols;
312  this->m_kc = depth;
313 
314  if (l3_blocking) {
315  computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
316  } else // no l3 blocking
317  {
318  Index n = this->m_nc;
319  computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, this->m_mc, n, num_threads);
320  }
321 
322  m_sizeA = this->m_mc * this->m_kc;
323  m_sizeB = this->m_kc * this->m_nc;
324  }
325 
326  void initParallel(Index rows, Index cols, Index depth, Index num_threads) {
327  this->m_mc = Transpose ? cols : rows;
328  this->m_nc = Transpose ? rows : cols;
329  this->m_kc = depth;
330 
331  eigen_internal_assert(this->m_blockA == 0 && this->m_blockB == 0);
332  Index m = this->m_mc;
333  computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, m, this->m_nc, num_threads);
334  m_sizeA = this->m_mc * this->m_kc;
335  m_sizeB = this->m_kc * this->m_nc;
336  }
337 
338  void allocateA() {
339  if (this->m_blockA == 0) this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
340  }
341 
342  void allocateB() {
343  if (this->m_blockB == 0) this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
344  }
345 
346  void allocateAll() {
347  allocateA();
348  allocateB();
349  }
350 
351  ~gemm_blocking_space() {
352  aligned_delete(this->m_blockA, m_sizeA);
353  aligned_delete(this->m_blockB, m_sizeB);
354  }
355 };
356 
357 } // end namespace internal
358 
359 namespace internal {
360 
361 template <typename Lhs, typename Rhs>
362 struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>
363  : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>> {
364  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
365  typedef typename Lhs::Scalar LhsScalar;
366  typedef typename Rhs::Scalar RhsScalar;
367 
368  typedef internal::blas_traits<Lhs> LhsBlasTraits;
369  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
370  typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
371 
372  typedef internal::blas_traits<Rhs> RhsBlasTraits;
373  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
374  typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
375 
376  enum { MaxDepthAtCompileTime = min_size_prefer_fixed(Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) };
377 
378  typedef generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> lazyproduct;
379 
380  template <typename Dst>
381  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
382  // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
383  // to determine the following heuristic.
384  // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
385  // unless it has been specialized by the user or for a given architecture.
386  // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
387  // I'm not sure it is still required.
388  if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
389  lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>());
390  else {
391  dst.setZero();
392  scaleAndAddTo(dst, lhs, rhs, Scalar(1));
393  }
394  }
395 
396  template <typename Dst>
397  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
398  if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
399  lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>());
400  else
401  scaleAndAddTo(dst, lhs, rhs, Scalar(1));
402  }
403 
404  template <typename Dst>
405  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
406  if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
407  lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar, Scalar>());
408  else
409  scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
410  }
411 
412  template <typename Dest>
413  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
414  eigen_assert(dst.rows() == a_lhs.rows() && dst.cols() == a_rhs.cols());
415  if (a_lhs.cols() == 0 || a_lhs.rows() == 0 || a_rhs.cols() == 0) return;
416 
417  if (dst.cols() == 1) {
418  // Fallback to GEMV if either the lhs or rhs is a runtime vector
419  typename Dest::ColXpr dst_vec(dst.col(0));
420  return internal::generic_product_impl<Lhs, typename Rhs::ConstColXpr, DenseShape, DenseShape,
421  GemvProduct>::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
422  } else if (dst.rows() == 1) {
423  // Fallback to GEMV if either the lhs or rhs is a runtime vector
424  typename Dest::RowXpr dst_vec(dst.row(0));
425  return internal::generic_product_impl<typename Lhs::ConstRowXpr, Rhs, DenseShape, DenseShape,
426  GemvProduct>::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
427  }
428 
429  add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
430  add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
431 
432  Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
433 
434  typedef internal::gemm_blocking_space<(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar,
435  Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, MaxDepthAtCompileTime>
436  BlockingType;
437 
438  typedef internal::gemm_functor<
439  Scalar, Index,
440  internal::general_matrix_matrix_product<
441  Index, LhsScalar, (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor,
442  bool(LhsBlasTraits::NeedToConjugate), RhsScalar,
443  (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
444  (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>,
445  ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType>
446  GemmFunctor;
447 
448  BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
449  internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || Dest::MaxRowsAtCompileTime == Dynamic)>(
450  GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(),
451  Dest::Flags & RowMajorBit);
452  }
453 };
454 
455 } // end namespace internal
456 
457 } // end namespace Eigen
458 
459 #endif // EIGEN_GENERAL_MATRIX_MATRIX_H
Definition: Constants.h:318
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1
Definition: BFloat16.h:231
const unsigned int RowMajorBit
Definition: Constants.h:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:82
Definition: Constants.h:320
const int Dynamic
Definition: Constants.h:25