.. _program_listing_file_include_vec.h: Program Listing for File vec.h ============================== |exhale_lsh| :ref:`Return to documentation for file ` (``include/vec.h``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #ifndef vec_h #define vec_h #include #if __MIC__ | __AVX512F__ const int SIMD_BYTES = 64; #elif __AVX__ const int SIMD_BYTES = 32; #elif __SSE__ const int SIMD_BYTES = 16; #else #error no SIMD #endif #ifndef EXAFMM_RSQRT_APPROX #define EXAFMM_RSQRT_APPROX 1 #endif namespace exafmm_t { #ifndef __CUDACC__ template class vec { private: T data[N]; public: vec(){} vec(const T &v) { for (int i=0; i=(const T v) { for (int i=0; i= v; return *this; } const vec &operator<=(const T v) { for (int i=0; i=(const vec & v) { for (int i=0; i= v[i]; return *this; } const vec &operator<=(const vec & v) { for (int i=0; i(const T v) const { return vec(*this) >= v; } vec operator<(const T v) const { return vec(*this) <= v; } vec operator&(const T v) const { return vec(*this) &= v; } vec operator|(const T v) const { return vec(*this) |= v; } vec operator+(const vec & v) const { return vec(*this) += v; } vec operator-(const vec & v) const { return vec(*this) -= v; } vec operator*(const vec & v) const { return vec(*this) *= v; } vec operator/(const vec & v) const { return vec(*this) /= v; } vec operator>(const vec & v) const { return vec(*this) >= v; } vec operator<(const vec & v) const { return vec(*this) <= v; } vec operator&(const vec & v) const { return vec(*this) &= v; } vec operator|(const vec & v) const { return vec(*this) |= v; } vec operator-() const { vec temp; for (int i=0; i w[i] ? v[i] : w[i]; return temp; } friend T min(const vec & v) { T temp = v[0]; for (int i=1; i v[i] ? temp : v[i]; return temp; } friend vec sin(const vec & v) { vec temp; for (int i=0; i w[i] / 2) { v[i] -= w[i]; iw |= 1 << i; } } return iw; } friend void unwrap(vec & v, const vec & w, const int & iw) { for (int i=0; i> i) & 1) v[i] += (v[i] > 0 ? -w[i] : w[i]); } } }; #else #if EXAFMM_VEC_VERBOSE #pragma message("Overloading vector operators for CUDA") #endif #include "unroll.h" template class vec { private: T data[N]; public: __host__ __device__ __forceinline__ vec(){} __host__ __device__ __forceinline__ vec(const T &v) { Unroll,T,N>::loop(data,v); } __host__ __device__ __forceinline__ vec(const vec &v) { Unroll,T,N>::loop(data,v); } __host__ __device__ __forceinline__ vec(const float4 &v) { data[0] = v.x; data[1] = v.y; data[2] = v.z; data[3] = v.w; } __host__ __device__ __forceinline__ vec(const float x, const float y, const float z, const float w) { data[0] = x; data[1] = y; data[2] = z; data[3] = w; } __host__ __device__ __forceinline__ vec(const float x, const float y, const float z) { data[0] = x; data[1] = y; data[2] = z; } __host__ __device__ __forceinline__ ~vec(){} __host__ __device__ __forceinline__ const vec &operator=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator+=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator-=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator*=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator/=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator>=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator<=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator&=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator|=(const T v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator+=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator-=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator*=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator/=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator>=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator<=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator&=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ const vec &operator|=(const vec & v) { Unroll,T,N>::loop(data,v); return *this; } __host__ __device__ __forceinline__ vec operator+(const T v) const { return vec(*this) += v; } __host__ __device__ __forceinline__ vec operator-(const T v) const { return vec(*this) -= v; } __host__ __device__ __forceinline__ vec operator*(const T v) const { return vec(*this) *= v; } __host__ __device__ __forceinline__ vec operator/(const T v) const { return vec(*this) /= v; } __host__ __device__ __forceinline__ vec operator>(const T v) const { return vec(*this) >= v; } __host__ __device__ __forceinline__ vec operator<(const T v) const { return vec(*this) <= v; } __host__ __device__ __forceinline__ vec operator&(const T v) const { return vec(*this) &= v; } __host__ __device__ __forceinline__ vec operator|(const T v) const { return vec(*this) |= v; } __host__ __device__ __forceinline__ vec operator+(const vec & v) const { return vec(*this) += v; } __host__ __device__ __forceinline__ vec operator-(const vec & v) const { return vec(*this) -= v; } __host__ __device__ __forceinline__ vec operator*(const vec & v) const { return vec(*this) *= v; } __host__ __device__ __forceinline__ vec operator/(const vec & v) const { return vec(*this) /= v; } __host__ __device__ __forceinline__ vec operator>(const vec & v) const { return vec(*this) >= v; } __host__ __device__ __forceinline__ vec operator<(const vec & v) const { return vec(*this) <= v; } __host__ __device__ __forceinline__ vec operator&(const vec & v) const { return vec(*this) &= v; } __host__ __device__ __forceinline__ vec operator|(const vec & v) const { return vec(*this) |= v; } __host__ __device__ __forceinline__ vec operator-() const { vec temp; Unroll,T,N>::loop(temp,data); return temp; } __host__ __device__ __forceinline__ T &operator[](int i) { return data[i]; } __host__ __device__ __forceinline__ const T &operator[](int i) const { return data[i]; } __host__ __device__ __forceinline__ operator T* () {return data;} __host__ __device__ __forceinline__ friend T min(const vec & v) { T temp; for (int i=0; i v[i] ? temp : v[i]; return temp; } __host__ __device__ __forceinline__ operator const T* () const {return data;} __host__ __device__ __forceinline__ friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i,T,N>::reduce(v); } __host__ __device__ __forceinline__ friend T norm(const vec & v) { return sum(v * v); } __host__ __device__ __forceinline__ friend vec min(const vec & v, const vec & w) { vec temp; for (int i=0; i w[i] ? v[i] : w[i]; return temp; } __host__ __device__ __forceinline__ friend T min(const vec & v) { T temp = v[0]; for (int i=1; i v[i] ? temp : v[i]; return temp; } __device__ __forceinline__ friend vec abs(const vec & v) { vec temp; Unroll,T,N>::loop(temp,v); return temp; } __device__ __forceinline__ friend vec rsqrt(const vec & v) { vec temp; Unroll,T,N>::loop(temp,v); return temp; } __host__ __device__ __forceinline__ friend vec sin(const vec & v) { vec temp; Unroll,T,N>::loop(temp,v); return temp; } __host__ __device__ __forceinline__ friend vec cos(const vec & v) { vec temp; Unroll,T,N>::loop(temp,v); return temp; } __host__ __device__ __forceinline__ friend void sincos(vec & s, vec & c, const vec & v) { Unroll,T,N>::loop(s,c,v); } __host__ __device__ __forceinline__ friend vec exp(const vec & v) { vec temp; Unroll,T,N>::loop(temp,v); return temp; } __host__ __device__ __forceinline__ friend int wrap(vec & v, const T & w) { int iw = 0; for (int i=0; i w / 2) { v[i] -= w; iw |= 1 << i; } } return iw; } __host__ __device__ __forceinline__ friend void unwrap(vec & v, const T & w, const int & iw) { for (int i=0; i> i) & 1) v[i] += (v[i] > 0 ? -w : w); } } }; #endif #if defined __MIC__ || defined __AVX512F__ #if EXAFMM_VEC_VERBOSE #pragma message("Overloading vector operators for AVX512/MIC") #endif #include template<> class vec<16,float> { private: union { __m512 data; float array[16]; }; public: vec(){} vec(const float v) { data = _mm512_set1_ps(v); } vec(const __m512 v) { data = v; } vec(const vec & v) { data = v.data; } vec(const float a, const float b, const float c, const float d, const float e, const float f, const float g, const float h, const float i, const float j, const float k, const float l, const float m, const float n, const float o, const float p) { data = _mm512_setr_ps(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p); } vec(const float* a, const int size) { int offset = size / (int)sizeof(float); data = _mm512_setr_ps(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset), *(a + 4 * offset), *(a + 5 * offset), *(a + 6 * offset), *(a + 7 * offset), *(a + 8 * offset), *(a + 9 * offset), *(a + 10 * offset), *(a + 11 * offset), *(a + 12 * offset), *(a + 13 * offset), *(a + 14 * offset), *(a + 15 * offset)); } ~vec(){} const vec &operator=(const float v) { data = _mm512_set1_ps(v); return *this; } const vec &operator=(const vec & v) { data = v.data; return *this; } const vec &operator+=(const vec & v) { data = _mm512_add_ps(data,v.data); return *this; } const vec &operator-=(const vec & v) { data = _mm512_sub_ps(data,v.data); return *this; } const vec &operator*=(const vec & v) { data = _mm512_mul_ps(data,v.data); return *this; } const vec &operator/=(const vec & v) { data = _mm512_div_ps(data,v.data); return *this; } const vec &operator&=(const __mmask16 & v) { data = _mm512_mask_mov_ps(_mm512_setzero_ps(),v,data); return *this; } vec operator+(const vec & v) const { return vec(_mm512_add_ps(data,v.data)); } vec operator-(const vec & v) const { return vec(_mm512_sub_ps(data,v.data)); } vec operator*(const vec & v) const { return vec(_mm512_mul_ps(data,v.data)); } vec operator/(const vec & v) const { return vec(_mm512_div_ps(data,v.data)); } __mmask16 operator>(const vec & v) const { return _mm512_cmp_ps_mask(data,v.data,_MM_CMPINT_GT); } __mmask16 operator<(const vec & v) const { return _mm512_cmp_ps_mask(data,v.data,_MM_CMPINT_LT); } vec operator-() const { return vec(_mm512_sub_ps(_mm512_setzero_ps(),data)); } float &operator[](int i) { return array[i]; } const float &operator[](int i) const { return array[i]; } friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i<16; i++) s << v[i] << ' '; return s; } friend vec min(const vec & v, const vec & w) { return vec(_mm512_min_ps(v.data,w.data)); } friend vec max(const vec & v, const vec & w) { return vec(_mm512_max_ps(v.data,w.data)); } friend vec rsqrt(const vec & v) { #if EXAFMM_RSQRT_APPROX vec three = 3.0; vec twelve = 12.0; #ifdef __MIC__ vec temp = vec(_mm512_rsqrt23_ps(v.data)); #else vec temp = vec(_mm512_rsqrt14_ps(v.data)); #endif temp *= (three - temp * temp * v); temp *= (twelve - temp * temp * v); return temp; #else vec one = 1; return vec(_mm512_div_ps(one.data,_mm512_sqrt_ps(v.data))); #endif } #ifdef __INTEL_COMPILER friend vec sin(const vec & v) { return vec(_mm512_sin_ps(v.data)); } friend vec cos(const vec & v) { return vec(_mm512_cos_ps(v.data)); } friend vec exp(const vec & v) { return vec(_mm512_exp_ps(v.data)); } #else friend vec sin(const vec & v) { vec temp = _mm512_setr_ps(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]), std::sin(v[4]), std::sin(v[5]), std::sin(v[6]), std::sin(v[7]), std::sin(v[8]), std::sin(v[9]), std::sin(v[10]), std::sin(v[11]), std::sin(v[12]), std::sin(v[13]), std::sin(v[14]), std::sin(v[15])); return temp; } friend vec cos(const vec & v) { vec temp = _mm512_setr_ps(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]), std::cos(v[4]), std::cos(v[5]), std::cos(v[6]), std::cos(v[7]), std::cos(v[8]), std::cos(v[9]), std::cos(v[10]), std::cos(v[11]), std::cos(v[12]), std::cos(v[13]), std::cos(v[14]), std::cos(v[15])); return temp; } friend vec exp(const vec & v) { vec temp = _mm512_setr_ps(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]), std::exp(v[4]), std::exp(v[5]), std::exp(v[6]), std::exp(v[7]), std::exp(v[8]), std::exp(v[9]), std::exp(v[10]), std::exp(v[11]), std::exp(v[12]), std::exp(v[13]), std::exp(v[14]), std::exp(v[15])); return temp; } #endif }; template<> class vec<8,double> { private: union { __m512d data; double array[8]; }; public: vec(){} vec(const double v) { data = _mm512_set1_pd(v); } vec(const __m512d v) { data = v; } vec(const vec & v) { data = v.data; } vec(const double a, const double b, const double c, const double d, const double e, const double f, const double g, const double h) { data = _mm512_setr_pd(a,b,c,d,e,f,g,h); } vec(const double* a) { data = _mm512_setr_pd(*a, *(a + 1), *(a + 2), *(a + 3), *(a + 4), *(a + 5), *(a + 6), *(a + 7)); } vec(const double* a, const int size) { int offset = size / (int)sizeof(double); data = _mm512_setr_pd(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset), *(a + 4 * offset), *(a + 5 * offset), *(a + 6 * offset), *(a + 7 * offset)); } ~vec(){} const vec &operator=(const double v) { data = _mm512_set1_pd(v); return *this; } const vec &operator=(const vec & v) { data = v.data; return *this; } const vec &operator+=(const vec & v) { data = _mm512_add_pd(data,v.data); return *this; } const vec &operator-=(const vec & v) { data = _mm512_sub_pd(data,v.data); return *this; } const vec &operator*=(const vec & v) { data = _mm512_mul_pd(data,v.data); return *this; } const vec &operator/=(const vec & v) { data = _mm512_div_pd(data,v.data); return *this; } const vec &operator&=(const __mmask8 & v) { data = _mm512_mask_mov_pd(_mm512_setzero_pd(),v,data); return *this; } vec operator+(const vec & v) const { return vec(_mm512_add_pd(data,v.data)); } vec operator-(const vec & v) const { return vec(_mm512_sub_pd(data,v.data)); } vec operator*(const vec & v) const { return vec(_mm512_mul_pd(data,v.data)); } vec operator/(const vec & v) const { return vec(_mm512_div_pd(data,v.data)); } __mmask8 operator>(const vec & v) const { return _mm512_cmp_pd_mask(data,v.data,_MM_CMPINT_GT); } __mmask8 operator<(const vec & v) const { return _mm512_cmp_pd_mask(data,v.data,_MM_CMPINT_LT); } vec operator-() const { return vec(_mm512_sub_pd(_mm512_setzero_pd(),data)); } double &operator[](int i) { return array[i]; } const double &operator[](int i) const { return array[i]; } friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i<8; i++) s << v[i] << ' '; return s; } friend vec min(const vec & v, const vec & w) { return vec(_mm512_min_pd(v.data,w.data)); } friend vec max(const vec & v, const vec & w) { return vec(_mm512_max_pd(v.data,w.data)); } friend vec rsqrt(const vec & v) { #if EXAFMM_RSQRT_APPROX #ifdef __MIC__ vec temp = vec(_mm512_cvtps_pd(_mm256_rsqrt_ps(_mm512_cvtpd_ps(v.data)))); #else vec temp = vec(_mm512_cvtps_pd(_mm256_rsqrt_ps(_mm512_cvtpd_ps(v.data)))); #endif vec three = 3.0; vec twelve = 12.0; temp *= (three - temp * temp * v); temp *= (twelve - temp * temp * v); return temp; #else vec one = 1; return vec(_mm512_div_pd(one.data,_mm512_sqrt_pd(v.data))); #endif } #ifdef __INTEL_COMPILER friend vec sin(const vec & v) { return vec(_mm512_sin_pd(v.data)); } friend vec cos(const vec & v) { return vec(_mm512_cos_pd(v.data)); } friend vec exp(const vec & v) { return vec(_mm512_exp_pd(v.data)); } #else friend vec sin(const vec & v) { vec temp = _mm512_setr_pd(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]), std::sin(v[4]), std::sin(v[5]), std::sin(v[6]), std::sin(v[7])); return temp; } friend vec cos(const vec & v) { vec temp = _mm512_setr_pd(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]), std::cos(v[4]), std::cos(v[5]), std::cos(v[6]), std::cos(v[7])); return temp; } friend vec exp(const vec & v) { vec temp = _mm512_setr_pd(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]), std::exp(v[4]), std::exp(v[5]), std::exp(v[6]), std::exp(v[7])); return temp; } #endif }; #endif #ifdef __AVX__ #if EXAFMM_VEC_VERBOSE #pragma message("Overloading vector operators for AVX") #endif #include template<> class vec<8,float> { private: union { __m256 data; float array[8]; }; public: vec(){} vec(const float v) { data = _mm256_set1_ps(v); } vec(const __m256 v) { data = v; } vec(const vec & v) { data = v.data; } vec(const float a, const float b, const float c, const float d, const float e, const float f, const float g, const float h) { data = _mm256_setr_ps(a,b,c,d,e,f,g,h); } vec(const float* a, const int size) { int offset = size / (int)sizeof(float); data = _mm256_setr_ps(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset), *(a + 4 * offset), *(a + 5 * offset), *(a + 6 * offset), *(a + 7 * offset)); } ~vec(){} const vec &operator=(const float v) { data = _mm256_set1_ps(v); return *this; } const vec &operator=(const vec & v) { data = v.data; return *this; } const vec &operator+=(const vec & v) { data = _mm256_add_ps(data,v.data); return *this; } const vec &operator-=(const vec & v) { data = _mm256_sub_ps(data,v.data); return *this; } const vec &operator*=(const vec & v) { data = _mm256_mul_ps(data,v.data); return *this; } const vec &operator/=(const vec & v) { data = _mm256_div_ps(data,v.data); return *this; } const vec &operator&=(const vec & v) { data = _mm256_and_ps(data,v.data); return *this; } vec operator+(const vec & v) const { return vec(_mm256_add_ps(data,v.data)); } vec operator-(const vec & v) const { return vec(_mm256_sub_ps(data,v.data)); } vec operator*(const vec & v) const { return vec(_mm256_mul_ps(data,v.data)); } vec operator/(const vec & v) const { return vec(_mm256_div_ps(data,v.data)); } vec operator>(const vec & v) const { return vec(_mm256_cmp_ps(data,v.data,_CMP_GT_OQ)); } vec operator<(const vec & v) const { return vec(_mm256_cmp_ps(data,v.data,_CMP_LT_OQ)); } vec operator-() const { return vec(_mm256_sub_ps(_mm256_setzero_ps(),data)); } float &operator[](int i) { return array[i]; } const float &operator[](int i) const { return array[i]; } friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i<8; i++) s << v[i] << ' '; return s; } friend float sum(const vec & v) { union { __m256 temp; float out[8]; }; temp = _mm256_permute2f128_ps(v.data,v.data,1); temp = _mm256_add_ps(temp,v.data); temp = _mm256_hadd_ps(temp,temp); temp = _mm256_hadd_ps(temp,temp); return out[0]; } friend float norm(const vec & v) { union { __m256 temp; float out[8]; }; temp = _mm256_mul_ps(v.data,v.data); __m256 perm = _mm256_permute2f128_ps(temp,temp,1); temp = _mm256_add_ps(temp,perm); temp = _mm256_hadd_ps(temp,temp); temp = _mm256_hadd_ps(temp,temp); return out[0]; } friend vec min(const vec & v, const vec & w) { return vec(_mm256_min_ps(v.data,w.data)); } friend vec max(const vec & v, const vec & w) { return vec(_mm256_max_ps(v.data,w.data)); } friend vec rsqrt(const vec & v) { #if EXAFMM_RSQRT_APPROX vec three = 3.0f; vec twelve = 12.0f; vec temp = vec(_mm256_rsqrt_ps(v.data)); temp *= (three - temp * temp * v); temp *= (twelve - temp * temp * v); return temp; #else vec one = 1; return vec(_mm256_div_ps(one.data,_mm256_sqrt_ps(v.data))); #endif } #ifdef __INTEL_COMPILER friend vec sin(const vec & v) { return vec(_mm256_sin_ps(v.data)); } friend vec cos(const vec & v) { return vec(_mm256_cos_ps(v.data)); } friend vec exp(const vec & v) { return vec(_mm256_exp_ps(v.data)); } #else friend vec sin(const vec & v) { vec temp = _mm256_setr_ps(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]), std::sin(v[4]), std::sin(v[5]), std::sin(v[6]), std::sin(v[7])); return temp; } friend vec cos(const vec & v) { vec temp = _mm256_setr_ps(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]), std::cos(v[4]), std::cos(v[5]), std::cos(v[6]), std::cos(v[7])); return temp; } friend vec exp(const vec & v) { vec temp = _mm256_setr_ps(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]), std::exp(v[4]), std::exp(v[5]), std::exp(v[6]), std::exp(v[7])); return temp; } #endif }; template<> class vec<4,double> { private: union { __m256d data; double array[4]; }; public: vec(){} vec(const double v) { data = _mm256_set1_pd(v); } vec(const __m256d v) { data = v; } vec(const vec & v) { data = v.data; } vec(const double a, const double b, const double c, const double d) { data = _mm256_setr_pd(a,b,c,d); } vec(const double* a) { data = _mm256_setr_pd(*a, *(a + 1), *(a + 2), *(a + 3)); } vec(const double* a, const int size) { int offset = size / (int)sizeof(double); data = _mm256_setr_pd(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset)); } ~vec(){} const vec &operator=(const double v) { data = _mm256_set1_pd(v); return *this; } const vec &operator=(const vec & v) { data = v.data; return *this; } const vec &operator+=(const vec & v) { data = _mm256_add_pd(data,v.data); return *this; } const vec &operator-=(const vec & v) { data = _mm256_sub_pd(data,v.data); return *this; } const vec &operator*=(const vec & v) { data = _mm256_mul_pd(data,v.data); return *this; } const vec &operator/=(const vec & v) { data = _mm256_div_pd(data,v.data); return *this; } const vec &operator&=(const vec & v) { data = _mm256_and_pd(data,v.data); return *this; } vec operator+(const vec & v) const { return vec(_mm256_add_pd(data,v.data)); } vec operator-(const vec & v) const { return vec(_mm256_sub_pd(data,v.data)); } vec operator*(const vec & v) const { return vec(_mm256_mul_pd(data,v.data)); } vec operator/(const vec & v) const { return vec(_mm256_div_pd(data,v.data)); } vec operator>(const vec & v) const { return vec(_mm256_cmp_pd(data,v.data,_CMP_GT_OQ)); } vec operator<(const vec & v) const { return vec(_mm256_cmp_pd(data,v.data,_CMP_LT_OQ)); } vec operator-() const { return vec(_mm256_sub_pd(_mm256_setzero_pd(),data)); } double &operator[](int i) { return array[i]; } const double &operator[](int i) const { return array[i]; } friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i<4; i++) s << v[i] << ' '; return s; } friend double sum(const vec & v) { union { __m256d temp; double out[4]; }; temp = _mm256_permute2f128_pd(v.data,v.data,1); temp = _mm256_add_pd(temp,v.data); temp = _mm256_hadd_pd(temp,temp); return out[0]; } friend double norm(const vec & v) { union { __m256d temp; double out[4]; }; temp = _mm256_mul_pd(v.data,v.data); __m256d perm = _mm256_permute2f128_pd(temp,temp,1); temp = _mm256_add_pd(temp,perm); temp = _mm256_hadd_pd(temp,temp); return out[0]; } friend vec min(const vec & v, const vec & w) { return vec(_mm256_min_pd(v.data,w.data)); } friend vec max(const vec & v, const vec & w) { return vec(_mm256_max_pd(v.data,w.data)); } friend vec rsqrt(const vec & v) { #if EXAFMM_RSQRT_APPROX vec temp = vec(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(v.data)))); vec three = 3.0; vec twelve = 12.0; temp *= (three - temp * temp * v); temp *= (twelve - temp * temp * v); return temp; #else vec one = 1; return vec(_mm256_div_pd(one.data,_mm256_sqrt_pd(v.data))); #endif } #ifdef __INTEL_COMPILER friend vec sin(const vec & v) { return vec(_mm256_sin_pd(v.data)); } friend vec cos(const vec & v) { return vec(_mm256_cos_pd(v.data)); } friend vec exp(const vec & v) { return vec(_mm256_exp_pd(v.data)); } #else friend vec sin(const vec & v) { vec temp = _mm256_setr_pd(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3])); return temp; } friend vec cos(const vec & v) { vec temp = _mm256_setr_pd(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3])); return temp; } friend vec exp(const vec & v) { vec temp = _mm256_setr_pd(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3])); return temp; } #endif }; #endif #ifdef __SSE__ #if EXAFMM_VEC_VERBOSE #pragma message("Overloading vector operators for SSE") #endif #include template<> class vec<4,float> { private: union { __m128 data; float array[4]; }; public: vec(){} vec(const float v) { data = _mm_set1_ps(v); } vec(const __m128 v) { data = v; } vec(const vec & v) { data = v.data; } vec(const float a, const float b, const float c, const float d) { data = _mm_setr_ps(a,b,c,d); } vec(const float* a, const int size) { int offset = size / (int)sizeof(float); data = _mm_setr_ps(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset)); } ~vec(){} const vec &operator=(const float v) { data = _mm_set1_ps(v); return *this; } const vec &operator=(const vec & v) { data = v.data; return *this; } const vec &operator+=(const vec & v) { data = _mm_add_ps(data,v.data); return *this; } const vec &operator-=(const vec & v) { data = _mm_sub_ps(data,v.data); return *this; } const vec &operator*=(const vec & v) { data = _mm_mul_ps(data,v.data); return *this; } const vec &operator/=(const vec & v) { data = _mm_div_ps(data,v.data); return *this; } const vec &operator&=(const vec & v) { data = _mm_and_ps(data,v.data); return *this; } vec operator+(const vec & v) const { return vec(_mm_add_ps(data,v.data)); } vec operator-(const vec & v) const { return vec(_mm_sub_ps(data,v.data)); } vec operator*(const vec & v) const { return vec(_mm_mul_ps(data,v.data)); } vec operator/(const vec & v) const { return vec(_mm_div_ps(data,v.data)); } vec operator>(const vec & v) const { return vec(_mm_cmpgt_ps(data,v.data)); } vec operator<(const vec & v) const { return vec(_mm_cmplt_ps(data,v.data)); } vec operator-() const { return vec(_mm_sub_ps(_mm_setzero_ps(),data)); } float &operator[](int i) { return array[i]; } const float &operator[](int i) const { return array[i]; } friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i<4; i++) s << v[i] << ' '; return s; } friend float sum(const vec & v) { union { __m128 temp; float out[4]; }; temp = _mm_hadd_ps(v.data,v.data); temp = _mm_hadd_ps(temp,temp); return out[0]; } friend float norm(const vec & v) { union { __m128 temp; float out[4]; }; temp = _mm_mul_ps(v.data,v.data); temp = _mm_hadd_ps(temp,temp); temp = _mm_hadd_ps(temp,temp); return out[0]; } friend vec min(const vec & v, const vec & w) { return vec(_mm_min_ps(v.data,w.data)); } friend vec max(const vec & v, const vec & w) { return vec(_mm_max_ps(v.data,w.data)); } friend vec rsqrt(const vec & v) { #if EXAFMM_RSQRT_APPROX vec temp = vec(_mm_rsqrt_ps(v.data)); vec three = 3.0f; vec twelve = 12.0f; temp *= (three - temp * temp * v); temp *= (twelve - temp * temp * v); return temp; #else vec one = 1; return vec(_mm_div_ps(one.data,_mm_sqrt_ps(v.data))); #endif } #ifdef __INTEL_COMPILER friend vec sin(const vec &v) { return vec(_mm_sin_ps(v.data)); } friend vec cos(const vec &v) { return vec(_mm_cos_ps(v.data)); } friend vec exp(const vec &v) { return vec(_mm_exp_ps(v.data)); } #else friend vec sin(const vec & v) { vec temp = _mm_setr_ps(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3])); return temp; } friend vec cos(const vec & v) { vec temp = _mm_setr_ps(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3])); return temp; } friend vec exp(const vec & v) { vec temp = _mm_setr_ps(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3])); return temp; } #endif }; template<> class vec<2,double> { private: union { __m128d data; double array[2]; }; public: vec(){} vec(const double v) { data = _mm_set1_pd(v); } vec(const __m128d v) { data = v; } vec(const vec & v) { data = v.data; } vec(const double a, const double b) { data = _mm_setr_pd(a,b); } vec(const double* a, const int size) { int offset = size / (int)sizeof(double); data = _mm_setr_pd(*a, *(a + 1 * offset)); } ~vec(){} const vec &operator=(const double v) { data = _mm_set1_pd(v); return *this; } const vec &operator=(const vec & v) { data = v.data; return *this; } const vec &operator+=(const vec & v) { data = _mm_add_pd(data,v.data); return *this; } const vec &operator-=(const vec & v) { data = _mm_sub_pd(data,v.data); return *this; } const vec &operator*=(const vec & v) { data = _mm_mul_pd(data,v.data); return *this; } const vec &operator/=(const vec & v) { data = _mm_div_pd(data,v.data); return *this; } const vec &operator&=(const vec & v) { data = _mm_and_pd(data,v.data); return *this; } vec operator+(const vec & v) const { return vec(_mm_add_pd(data,v.data)); } vec operator-(const vec & v) const { return vec(_mm_sub_pd(data,v.data)); } vec operator*(const vec & v) const { return vec(_mm_mul_pd(data,v.data)); } vec operator/(const vec & v) const { return vec(_mm_div_pd(data,v.data)); } vec operator>(const vec & v) const { return vec(_mm_cmpgt_pd(data,v.data)); } vec operator<(const vec & v) const { return vec(_mm_cmplt_pd(data,v.data)); } vec operator-() const { return vec(_mm_sub_pd(_mm_setzero_pd(),data)); } double &operator[](int i) { return array[i]; } const double &operator[](int i) const { return array[i]; } friend std::ostream &operator<<(std::ostream & s, const vec & v) { for (int i=0; i<2; i++) s << v[i] << ' '; return s; } friend double sum(const vec & v) { union { __m128d temp; double out[2]; }; temp = _mm_hadd_pd(v.data,v.data); return out[0]; } friend double norm(const vec & v) { union { __m128d temp; double out[2]; }; temp = _mm_mul_pd(v.data,v.data); temp = _mm_hadd_pd(temp,temp); return out[0]; } friend vec min(const vec & v, const vec & w) { return vec(_mm_min_pd(v.data,w.data)); } friend vec max(const vec & v, const vec & w) { return vec(_mm_max_pd(v.data,w.data)); } friend vec rsqrt(const vec & v) { #if EXAFMM_RSQRT_APPROX vec temp = vec(_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(v.data)))); vec three = 3.0; vec twelve = 12.0; temp *= (three - temp * temp * v); temp *= (twelve - temp * temp * v); return temp; #else vec one = 1; return vec(_mm_div_pd(one.data,_mm_sqrt_pd(v.data))); #endif } #ifdef __INTEL_COMPILER friend vec sin(const vec &v) { return vec(_mm_sin_pd(v.data)); } friend vec cos(const vec &v) { return vec(_mm_cos_pd(v.data)); } friend vec exp(const vec &v) { return vec(_mm_exp_pd(v.data)); } #else friend vec sin(const vec & v) { vec temp = _mm_setr_pd(std::sin(v[0]), std::sin(v[1])); return temp; } friend vec cos(const vec & v) { vec temp = _mm_setr_pd(std::cos(v[0]), std::cos(v[1])); return temp; } friend vec exp(const vec & v) { vec temp = _mm_setr_pd(std::exp(v[0]), std::exp(v[1])); return temp; } #endif }; #endif } #endif