// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #ifndef EIGEN_COMPLEX_SSE_H #define EIGEN_COMPLEX_SSE_H namespace Eigen { namespace internal { //---------- float ---------- struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} __m128 v; }; template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, HasAdd = 1, HasSub = 1, HasMul = 1, HasDiv = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, HasMax = 0, HasSetLinear = 0 }; }; template<> struct unpacket_traits { typedef std::complex type; enum {size=2}; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000)); return Packet2cf(_mm_xor_ps(a.v,mask)); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); return Packet2cf(_mm_xor_ps(a.v,mask)); } template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2)))); // return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), // _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), // vec4f_swizzle1(b.v, 1, 0, 3, 2)))); #else const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000)); return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); #endif } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { Packet2cf res; #if EIGEN_GNUC_AT_MOST(4,2) // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2 res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast(&from)); #elif EIGEN_GNUC_AT_LEAST(4,6) // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" res.v = _mm_loadl_pi(res.v, (const __m64*)&from); #pragma GCC diagnostic pop #else res.v = _mm_loadl_pi(res.v, (const __m64*)&from); #endif return Packet2cf(_mm_movelh_ps(res.v,res.v)); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { #if EIGEN_GNUC_AT_MOST(4,3) // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares... // This workaround also fix invalid code generation with gcc 4.3 EIGEN_ALIGN16 std::complex res[2]; _mm_store_ps((float*)res, a.v); return res[0]; #else std::complex res; _mm_storel_pi((__m64*)&res, a.v); return res; #endif } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); } template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) { return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } template struct palign_impl { static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) { if (Offset==1) { first.v = _mm_movehl_ps(first.v, first.v); first.v = _mm_movelh_ps(first.v, second.v); } } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { #ifdef EIGEN_VECTORIZE_SSE3 return internal::pmul(a, pconj(b)); #else const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)))); #endif } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { #ifdef EIGEN_VECTORIZE_SSE3 return internal::pmul(pconj(a), b); #else const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); #endif } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { #ifdef EIGEN_VECTORIZE_SSE3 return pconj(internal::pmul(a, b)); #else const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)))); #endif } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const { return Packet2cf(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const { return Packet2cf(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 Packet2cf res = conj_helper().pmul(a,b); __m128 s = _mm_mul_ps(b.v,b.v); return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); } EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) { return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); } //---------- double ---------- struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} __m128d v; }; template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; enum { Vectorizable = 1, AlignedOnScalar = 0, size = 1, HasAdd = 1, HasSub = 1, HasMul = 1, HasDiv = 1, HasNegate = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, HasMax = 0, HasSetLinear = 0 }; }; template<> struct unpacket_traits { typedef std::complex type; enum {size=1}; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); return Packet1cd(_mm_xor_pd(a.v,mask)); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)))); #else const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)), mask))); #endif } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } // FIXME force unaligned store, this is a temporary fix template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { EIGEN_ALIGN16 double res[2]; _mm_store_pd(res, a.v); return std::complex(res[0],res[1]); } template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } template struct palign_impl { static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) { // FIXME is it sure we never have to align a Packet1cd? // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { #ifdef EIGEN_VECTORIZE_SSE3 return internal::pmul(a, pconj(b)); #else const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)))); #endif } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { #ifdef EIGEN_VECTORIZE_SSE3 return internal::pmul(pconj(a), b); #else const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)), mask))); #endif } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { #ifdef EIGEN_VECTORIZE_SSE3 return pconj(internal::pmul(a, b)); #else const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)))); #endif } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const { return Packet1cd(Eigen::internal::pmul(x, y.v)); } }; template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const { return Packet1cd(Eigen::internal::pmul(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 Packet1cd res = conj_helper().pmul(a,b); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { return Packet1cd(preverse(x.v)); } } // end namespace internal } // end namespace Eigen #endif // EIGEN_COMPLEX_SSE_H