$darkmode
#include <Eigen/src/Core/arch/AVX512/TrsmKernel.h>
Unrolls for triSolveKernel
Idea: 1) Load a block of right-hand sides to registers in RHSInPacket (using loadRHS). 2) Do triangular solve with RHSInPacket and a small block of A (triangular matrix) stored in AInPacket (using triSolveMicroKernel). 3) Store final results (in avx registers) back into memory (using storeRHS).
RHSInPacket uses at most EIGEN_AVX_MAX_NUM_ACC avx registers and AInPacket uses at most EIGEN_AVX_MAX_NUM_ROW registers.
Static Public Member Functions | |
| template<int64_t currM, int64_t endK, int64_t counter> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 &&currM >=0)> | aux_divRHSByDiag (PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ROW > &AInPacket) |
| template<bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_loadRHS (Scalar *B_arr, int64_t LDB, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, int64_t rem=0) |
| template<bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_storeRHS (Scalar *B_arr, int64_t LDB, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, int64_t rem=0) |
| template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t counter, int64_t numK> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_triSolveMicroKernel (Scalar *A_arr, int64_t LDA, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ROW > &AInPacket) |
| template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK, int64_t counter, int64_t currentM> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_updateRHS (Scalar *A_arr, int64_t LDA, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ROW > &AInPacket) |
| template<int64_t currM, int64_t endK> | |
| static EIGEN_ALWAYS_INLINE void | divRHSByDiag (PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ROW > &AInPacket) |
| template<bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false> | |
| static EIGEN_ALWAYS_INLINE void | loadRHS (Scalar *B_arr, int64_t LDB, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, int64_t rem=0) |
| template<bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false> | |
| static EIGEN_ALWAYS_INLINE void | storeRHS (Scalar *B_arr, int64_t LDB, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, int64_t rem=0) |
| template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t numK> | |
| static EIGEN_ALWAYS_INLINE void | triSolveMicroKernel (Scalar *A_arr, int64_t LDA, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ROW > &AInPacket) |
| template<bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t startM, int64_t endM, int64_t endK, int64_t currentM> | |
| static EIGEN_ALWAYS_INLINE void | updateRHS (Scalar *A_arr, int64_t LDA, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ACC > &RHSInPacket, PacketBlock< vec, EIGEN_AVX_MAX_NUM_ROW > &AInPacket) |
|
inline static |
aux_divRHSByDiag
currM may be -1; the (currM >= 0) condition in enable_if checks for this.
1-D unroll for(startK = 0; startK < endK; startK++)
|
inline static |
aux_loadRHS
2-D unroll for(startM = 0; startM < endM; startM++) for(startK = 0; startK < endK; startK++)
|
inline static |
aux_storeRHS
2-D unroll for(startM = 0; startM < endM; startM++) for(startK = 0; startK < endK; startK++)
|
inline static |
aux_triSolveMicroKernel
1-D unroll for(startM = 0; startM < endM; startM++)
|
inline static |
aux_updateRHS
2-D unroll for(startM = initM; startM < endM; startM++) for(startK = 0; startK < endK; startK++)
|
inline static |
Only used if the triangular matrix has non-unit diagonal values.
|
inline static |
Load an endM x endK block of B into RHSInPacket. Masked loads are used for cases where endK is not a multiple of PacketSize.
|
inline static |
Store the endM x endK block held in RHSInPacket back to B. Masked stores are used for cases where endK is not a multiple of PacketSize.
|
inline static |
endM: dimension of A, 1 <= endM <= EIGEN_AVX_MAX_NUM_ROW. numK: number of avx registers to use for each row of B (e.g. fp32: 48 rhs => 3 avx registers used), 1 <= numK <= 3. isFWDSolve: true => forward substitution, false => backward substitution. isUnitDiag: true => triangular matrix has unit diagonal.
|
inline static |
Update right-hand sides (stored in avx registers). Traverses along the column A_{i,currentM}, where currentM <= i <= endM, broadcasting each value to AInPacket.