$darkmode
Eigen  5.0.1-dev
MathFunctions.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2007 Julien Pommier
5 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 /* The sin and cos functions of this file come from
12  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
13  */
14 
15 #ifndef EIGEN_MATH_FUNCTIONS_SSE_H
16 #define EIGEN_MATH_FUNCTIONS_SSE_H
17 
18 // IWYU pragma: private
19 #include "../../InternalHeaderCheck.h"
20 
21 namespace Eigen {
22 
23 namespace internal {
24 
// Invoke the generic math-function instantiation macros for the two SSE
// floating-point packet types used in this file.
// NOTE(review): the macros are defined outside this file; presumably they emit
// the generic float/double math specializations for the given packet type --
// confirm against their definition.
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4f)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2d)
27 
28 // Notice that for newer processors, it is counterproductive to use Newton
29 // iteration for square root. In particular, Skylake and Zen2 processors
30 // have approximately doubled throughput of the _mm_sqrt_ps instruction
31 // compared to their predecessors.
32 template <>
33 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
34  return _mm_sqrt_ps(x);
35 }
36 template <>
37 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
38  return _mm_sqrt_pd(x);
39 }
40 template <>
41 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt<Packet16b>(const Packet16b& x) {
42  return x;
43 }
44 
45 #if EIGEN_FAST_MATH
46 // Even on Skylake, using Newton iteration is a win for reciprocal square root.
47 template <>
48 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt<Packet4f>(const Packet4f& x) {
49  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
50 }
51 
52 #ifdef EIGEN_VECTORIZE_FMA
53 // Trying to speed up reciprocal using Newton-Raphson is counterproductive
54 // unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a)) is
55 // 30% faster.
56 template <>
57 EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
58  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
59 }
60 #endif
61 
62 #endif
63 
64 } // end namespace internal
65 
66 namespace numext {
67 
68 template <>
69 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float& x) {
70  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
71 }
72 
73 template <>
74 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double& x) {
75 #if EIGEN_COMP_GNUC_STRICT
76  // This works around a GCC bug generating poor code for _mm_sqrt_pd
77  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
78  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
79 #else
80  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
81 #endif
82 }
83 
84 } // namespace numext
85 
86 } // end namespace Eigen
87 
88 #endif // EIGEN_MATH_FUNCTIONS_SSE_H
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_sqrt_op< typename Derived::Scalar >, const Derived > sqrt(const Eigen::ArrayBase< Derived > &x)
Namespace containing all symbols from the Eigen library.
Definition: B01_Experimental.dox:1