Program Listing for File vec.h¶
↰ Return to documentation for file (include/vec.h
)
#ifndef vec_h
#define vec_h
#include <ostream>
#if __MIC__ | __AVX512F__
const int SIMD_BYTES = 64;
#elif __AVX__
const int SIMD_BYTES = 32;
#elif __SSE__
const int SIMD_BYTES = 16;
#else
#error no SIMD
#endif
#ifndef EXAFMM_RSQRT_APPROX
#define EXAFMM_RSQRT_APPROX 1
#endif
namespace exafmm_t {
#ifndef __CUDACC__
template<int N, typename T>
class vec {
private:
T data[N];
public:
vec(){}
vec(const T &v) {
for (int i=0; i<N; i++) data[i] = v;
}
vec(const vec &v) {
for (int i=0; i<N; i++) data[i] = v[i];
}
~vec(){}
const vec &operator=(const T v) {
for (int i=0; i<N; i++) data[i] = v;
return *this;
}
const vec &operator+=(const T v) {
for (int i=0; i<N; i++) data[i] += v;
return *this;
}
const vec &operator-=(const T v) {
for (int i=0; i<N; i++) data[i] -= v;
return *this;
}
const vec &operator*=(const T v) {
for (int i=0; i<N; i++) data[i] *= v;
return *this;
}
const vec &operator/=(const T v) {
for (int i=0; i<N; i++) data[i] /= v;
return *this;
}
const vec &operator>=(const T v) {
for (int i=0; i<N; i++) data[i] >= v;
return *this;
}
const vec &operator<=(const T v) {
for (int i=0; i<N; i++) data[i] <= v;
return *this;
}
const vec &operator&=(const T v) {
for (int i=0; i<N; i++) data[i] &= v;
return *this;
}
const vec &operator|=(const T v) {
for (int i=0; i<N; i++) data[i] |= v;
return *this;
}
const vec &operator=(const vec & v) {
for (int i=0; i<N; i++) data[i] = v[i];
return *this;
}
const vec &operator+=(const vec & v) {
for (int i=0; i<N; i++) data[i] += v[i];
return *this;
}
const vec &operator-=(const vec & v) {
for (int i=0; i<N; i++) data[i] -= v[i];
return *this;
}
const vec &operator*=(const vec & v) {
for (int i=0; i<N; i++) data[i] *= v[i];
return *this;
}
const vec &operator/=(const vec & v) {
for (int i=0; i<N; i++) data[i] /= v[i];
return *this;
}
const vec &operator>=(const vec & v) {
for (int i=0; i<N; i++) data[i] >= v[i];
return *this;
}
const vec &operator<=(const vec & v) {
for (int i=0; i<N; i++) data[i] <= v[i];
return *this;
}
const vec &operator&=(const vec & v) {
for (int i=0; i<N; i++) data[i] &= v[i];
return *this;
}
const vec &operator|=(const vec & v) {
for (int i=0; i<N; i++) data[i] |= v[i];
return *this;
}
vec operator+(const T v) const {
return vec(*this) += v;
}
vec operator-(const T v) const {
return vec(*this) -= v;
}
vec operator*(const T v) const {
return vec(*this) *= v;
}
vec operator/(const T v) const {
return vec(*this) /= v;
}
vec operator>(const T v) const {
return vec(*this) >= v;
}
vec operator<(const T v) const {
return vec(*this) <= v;
}
vec operator&(const T v) const {
return vec(*this) &= v;
}
vec operator|(const T v) const {
return vec(*this) |= v;
}
vec operator+(const vec & v) const {
return vec(*this) += v;
}
vec operator-(const vec & v) const {
return vec(*this) -= v;
}
vec operator*(const vec & v) const {
return vec(*this) *= v;
}
vec operator/(const vec & v) const {
return vec(*this) /= v;
}
vec operator>(const vec & v) const {
return vec(*this) >= v;
}
vec operator<(const vec & v) const {
return vec(*this) <= v;
}
vec operator&(const vec & v) const {
return vec(*this) &= v;
}
vec operator|(const vec & v) const {
return vec(*this) |= v;
}
vec operator-() const {
vec temp;
for (int i=0; i<N; i++) temp[i] = -data[i];
return temp;
}
T &operator[](int i) {
return data[i];
}
const T &operator[](int i) const {
return data[i];
}
operator T* () {return data;}
operator const T* () const {return data;}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<N; i++) s << v[i] << ' ';
return s;
}
friend T sum(const vec & v) {
T temp = 0;
for (int i=0; i<N; i++) temp += v[i];
return temp;
}
friend T norm(const vec & v) {
T temp = 0;
for (int i=0; i<N; i++) temp += v[i] * v[i];
return temp;
}
friend vec min(const vec & v, const vec & w) {
vec temp;
for (int i=0; i<N; i++) temp[i] = v[i] < w[i] ? v[i] : w[i];
return temp;
}
friend vec max(const vec & v, const vec & w) {
vec temp;
for (int i=0; i<N; i++) temp[i] = v[i] > w[i] ? v[i] : w[i];
return temp;
}
friend T min(const vec & v) {
T temp = v[0];
for (int i=1; i<N; i++) temp = temp < v[i] ? temp : v[i];
return temp;
}
friend T max(const vec & v) {
T temp = v[0];
for (int i=1; i<N; i++) temp = temp > v[i] ? temp : v[i];
return temp;
}
friend vec sin(const vec & v) {
vec temp;
for (int i=0; i<N; i++) temp[i] = sin(v[i]);
return temp;
}
friend vec cos(const vec & v) {
vec temp;
for (int i=0; i<N; i++) temp[i] = cos(v[i]);
return temp;
}
friend vec exp(const vec & v) {
vec temp;
for (int i=0; i<N; i++) temp[i] = exp(v[i]);
return temp;
}
friend int wrap(vec & v, const vec & w) {
int iw = 0;
for (int i=0; i<N; i++) {
if(v[i] < -w[i] / 2) {
v[i] += w[i];
iw |= 1 << i;
}
if(v[i] > w[i] / 2) {
v[i] -= w[i];
iw |= 1 << i;
}
}
return iw;
}
friend void unwrap(vec & v, const vec & w, const int & iw) {
for (int i=0; i<N; i++) {
if((iw >> i) & 1) v[i] += (v[i] > 0 ? -w[i] : w[i]);
}
}
};
#else
#if EXAFMM_VEC_VERBOSE
#pragma message("Overloading vector operators for CUDA")
#endif
#include "unroll.h"
template<int N, typename T>
class vec {
private:
T data[N];
public:
__host__ __device__ __forceinline__
vec(){}
__host__ __device__ __forceinline__
vec(const T &v) {
Unroll<Ops::Assign<T>,T,N>::loop(data,v);
}
__host__ __device__ __forceinline__
vec(const vec &v) {
Unroll<Ops::Assign<T>,T,N>::loop(data,v);
}
__host__ __device__ __forceinline__
vec(const float4 &v) {
data[0] = v.x;
data[1] = v.y;
data[2] = v.z;
data[3] = v.w;
}
__host__ __device__ __forceinline__
vec(const float x, const float y, const float z, const float w) {
data[0] = x;
data[1] = y;
data[2] = z;
data[3] = w;
}
__host__ __device__ __forceinline__
vec(const float x, const float y, const float z) {
data[0] = x;
data[1] = y;
data[2] = z;
}
__host__ __device__ __forceinline__
~vec(){}
__host__ __device__ __forceinline__
const vec &operator=(const T v) {
Unroll<Ops::Assign<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator+=(const T v) {
Unroll<Ops::Add<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator-=(const T v) {
Unroll<Ops::Sub<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator*=(const T v) {
Unroll<Ops::Mul<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator/=(const T v) {
Unroll<Ops::Div<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator>=(const T v) {
Unroll<Ops::Gt<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator<=(const T v) {
Unroll<Ops::Lt<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator&=(const T v) {
Unroll<Ops::And<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator|=(const T v) {
Unroll<Ops::Or<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator=(const vec & v) {
Unroll<Ops::Assign<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator+=(const vec & v) {
Unroll<Ops::Add<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator-=(const vec & v) {
Unroll<Ops::Sub<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator*=(const vec & v) {
Unroll<Ops::Mul<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator/=(const vec & v) {
Unroll<Ops::Div<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator>=(const vec & v) {
Unroll<Ops::Gt<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator<=(const vec & v) {
Unroll<Ops::Lt<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator&=(const vec & v) {
Unroll<Ops::And<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
const vec &operator|=(const vec & v) {
Unroll<Ops::Or<T>,T,N>::loop(data,v);
return *this;
}
__host__ __device__ __forceinline__
vec operator+(const T v) const {
return vec(*this) += v;
}
__host__ __device__ __forceinline__
vec operator-(const T v) const {
return vec(*this) -= v;
}
__host__ __device__ __forceinline__
vec operator*(const T v) const {
return vec(*this) *= v;
}
__host__ __device__ __forceinline__
vec operator/(const T v) const {
return vec(*this) /= v;
}
__host__ __device__ __forceinline__
vec operator>(const T v) const {
return vec(*this) >= v;
}
__host__ __device__ __forceinline__
vec operator<(const T v) const {
return vec(*this) <= v;
}
__host__ __device__ __forceinline__
vec operator&(const T v) const {
return vec(*this) &= v;
}
__host__ __device__ __forceinline__
vec operator|(const T v) const {
return vec(*this) |= v;
}
__host__ __device__ __forceinline__
vec operator+(const vec & v) const {
return vec(*this) += v;
}
__host__ __device__ __forceinline__
vec operator-(const vec & v) const {
return vec(*this) -= v;
}
__host__ __device__ __forceinline__
vec operator*(const vec & v) const {
return vec(*this) *= v;
}
__host__ __device__ __forceinline__
vec operator/(const vec & v) const {
return vec(*this) /= v;
}
__host__ __device__ __forceinline__
vec operator>(const vec & v) const {
return vec(*this) >= v;
}
__host__ __device__ __forceinline__
vec operator<(const vec & v) const {
return vec(*this) <= v;
}
__host__ __device__ __forceinline__
vec operator&(const vec & v) const {
return vec(*this) &= v;
}
__host__ __device__ __forceinline__
vec operator|(const vec & v) const {
return vec(*this) |= v;
}
__host__ __device__ __forceinline__
vec operator-() const {
vec temp;
Unroll<Ops::Negate<T>,T,N>::loop(temp,data);
return temp;
}
__host__ __device__ __forceinline__
T &operator[](int i) {
return data[i];
}
__host__ __device__ __forceinline__
const T &operator[](int i) const {
return data[i];
}
__host__ __device__ __forceinline__
operator T* () {return data;}
__host__ __device__ __forceinline__
friend T min(const vec & v) {
T temp;
for (int i=0; i<N; i++) temp = temp < v[i] ? temp : v[i];
return temp;
}
__host__ __device__ __forceinline__
friend T max(const vec & v) {
T temp;
for (int i=0; i<N; i++) temp = temp > v[i] ? temp : v[i];
return temp;
} __host__ __device__ __forceinline__
operator const T* () const {return data;}
__host__ __device__ __forceinline__
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<N; i++) s << v[i] << ' ';
return s;
}
__host__ __device__ __forceinline__
friend T sum(const vec & v) {
return Unroll<Ops::Add<T>,T,N>::reduce(v);
}
__host__ __device__ __forceinline__
friend T norm(const vec & v) {
return sum(v * v);
}
__host__ __device__ __forceinline__
friend vec min(const vec & v, const vec & w) {
vec temp;
for (int i=0; i<N; i++) temp[i] = v[i] < w[i] ? v[i] : w[i];
return temp;
}
__host__ __device__ __forceinline__
friend vec max(const vec & v, const vec & w) {
vec temp;
for (int i=0; i<N; i++) temp[i] = v[i] > w[i] ? v[i] : w[i];
return temp;
}
__host__ __device__ __forceinline__
friend T min(const vec & v) {
T temp = v[0];
for (int i=1; i<N; i++) temp = temp < v[i] ? temp : v[i];
return temp;
}
__host__ __device__ __forceinline__
friend T max(const vec & v) {
T temp = v[0];
for (int i=1; i<N; i++) temp = temp > v[i] ? temp : v[i];
return temp;
}
__device__ __forceinline__
friend vec abs(const vec & v) {
vec temp;
Unroll<Ops::Abs<T>,T,N>::loop(temp,v);
return temp;
}
__device__ __forceinline__
friend vec rsqrt(const vec & v) {
vec temp;
Unroll<Ops::Rsqrt<T>,T,N>::loop(temp,v);
return temp;
}
__host__ __device__ __forceinline__
friend vec sin(const vec & v) {
vec temp;
Unroll<Ops::Sin<T>,T,N>::loop(temp,v);
return temp;
}
__host__ __device__ __forceinline__
friend vec cos(const vec & v) {
vec temp;
Unroll<Ops::Cos<T>,T,N>::loop(temp,v);
return temp;
}
__host__ __device__ __forceinline__
friend void sincos(vec & s, vec & c, const vec & v) {
Unroll<Ops::SinCos<T>,T,N>::loop(s,c,v);
}
__host__ __device__ __forceinline__
friend vec exp(const vec & v) {
vec temp;
Unroll<Ops::Exp<T>,T,N>::loop(temp,v);
return temp;
}
__host__ __device__ __forceinline__
friend int wrap(vec & v, const T & w) {
int iw = 0;
for (int i=0; i<N; i++) {
if(v[i] < -w / 2) {
v[i] += w;
iw |= 1 << i;
}
if(v[i] > w / 2) {
v[i] -= w;
iw |= 1 << i;
}
}
return iw;
}
__host__ __device__ __forceinline__
friend void unwrap(vec & v, const T & w, const int & iw) {
for (int i=0; i<N; i++) {
if((iw >> i) & 1) v[i] += (v[i] > 0 ? -w : w);
}
}
};
#endif
#if defined __MIC__ || defined __AVX512F__
#if EXAFMM_VEC_VERBOSE
#pragma message("Overloading vector operators for AVX512/MIC")
#endif
#include <immintrin.h>
template<>
class vec<16,float> {
private:
union {
__m512 data;
float array[16];
};
public:
vec(){}
vec(const float v) {
data = _mm512_set1_ps(v);
}
vec(const __m512 v) {
data = v;
}
vec(const vec & v) {
data = v.data;
}
vec(const float a, const float b, const float c, const float d,
const float e, const float f, const float g, const float h,
const float i, const float j, const float k, const float l,
const float m, const float n, const float o, const float p) {
data = _mm512_setr_ps(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p);
}
vec(const float* a, const int size) {
int offset = size / (int)sizeof(float);
data = _mm512_setr_ps(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset), *(a + 4 * offset), *(a + 5 * offset), *(a + 6 * offset), *(a + 7 * offset), *(a + 8 * offset), *(a + 9 * offset), *(a + 10 * offset), *(a + 11 * offset), *(a + 12 * offset), *(a + 13 * offset), *(a + 14 * offset), *(a + 15 * offset));
}
~vec(){}
const vec &operator=(const float v) {
data = _mm512_set1_ps(v);
return *this;
}
const vec &operator=(const vec & v) {
data = v.data;
return *this;
}
const vec &operator+=(const vec & v) {
data = _mm512_add_ps(data,v.data);
return *this;
}
const vec &operator-=(const vec & v) {
data = _mm512_sub_ps(data,v.data);
return *this;
}
const vec &operator*=(const vec & v) {
data = _mm512_mul_ps(data,v.data);
return *this;
}
const vec &operator/=(const vec & v) {
data = _mm512_div_ps(data,v.data);
return *this;
}
const vec &operator&=(const __mmask16 & v) {
data = _mm512_mask_mov_ps(_mm512_setzero_ps(),v,data);
return *this;
}
vec operator+(const vec & v) const {
return vec(_mm512_add_ps(data,v.data));
}
vec operator-(const vec & v) const {
return vec(_mm512_sub_ps(data,v.data));
}
vec operator*(const vec & v) const {
return vec(_mm512_mul_ps(data,v.data));
}
vec operator/(const vec & v) const {
return vec(_mm512_div_ps(data,v.data));
}
__mmask16 operator>(const vec & v) const {
return _mm512_cmp_ps_mask(data,v.data,_MM_CMPINT_GT);
}
__mmask16 operator<(const vec & v) const {
return _mm512_cmp_ps_mask(data,v.data,_MM_CMPINT_LT);
}
vec operator-() const {
return vec(_mm512_sub_ps(_mm512_setzero_ps(),data));
}
float &operator[](int i) {
return array[i];
}
const float &operator[](int i) const {
return array[i];
}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<16; i++) s << v[i] << ' ';
return s;
}
friend vec min(const vec & v, const vec & w) {
return vec(_mm512_min_ps(v.data,w.data));
}
friend vec max(const vec & v, const vec & w) {
return vec(_mm512_max_ps(v.data,w.data));
}
friend vec rsqrt(const vec & v) {
#if EXAFMM_RSQRT_APPROX
vec three = 3.0;
vec twelve = 12.0;
#ifdef __MIC__
vec temp = vec(_mm512_rsqrt23_ps(v.data));
#else
vec temp = vec(_mm512_rsqrt14_ps(v.data));
#endif
temp *= (three - temp * temp * v);
temp *= (twelve - temp * temp * v);
return temp;
#else
vec one = 1;
return vec(_mm512_div_ps(one.data,_mm512_sqrt_ps(v.data)));
#endif
}
#ifdef __INTEL_COMPILER
friend vec sin(const vec & v) {
return vec(_mm512_sin_ps(v.data));
}
friend vec cos(const vec & v) {
return vec(_mm512_cos_ps(v.data));
}
friend vec exp(const vec & v) {
return vec(_mm512_exp_ps(v.data));
}
#else
friend vec sin(const vec & v) {
vec temp = _mm512_setr_ps(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]),
std::sin(v[4]), std::sin(v[5]), std::sin(v[6]), std::sin(v[7]),
std::sin(v[8]), std::sin(v[9]), std::sin(v[10]), std::sin(v[11]),
std::sin(v[12]), std::sin(v[13]), std::sin(v[14]), std::sin(v[15]));
return temp;
}
friend vec cos(const vec & v) {
vec temp = _mm512_setr_ps(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]),
std::cos(v[4]), std::cos(v[5]), std::cos(v[6]), std::cos(v[7]),
std::cos(v[8]), std::cos(v[9]), std::cos(v[10]), std::cos(v[11]),
std::cos(v[12]), std::cos(v[13]), std::cos(v[14]), std::cos(v[15]));
return temp;
}
friend vec exp(const vec & v) {
vec temp = _mm512_setr_ps(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]),
std::exp(v[4]), std::exp(v[5]), std::exp(v[6]), std::exp(v[7]),
std::exp(v[8]), std::exp(v[9]), std::exp(v[10]), std::exp(v[11]),
std::exp(v[12]), std::exp(v[13]), std::exp(v[14]), std::exp(v[15]));
return temp;
}
#endif
};
template<>
class vec<8,double> {
private:
union {
__m512d data;
double array[8];
};
public:
vec(){}
vec(const double v) {
data = _mm512_set1_pd(v);
}
vec(const __m512d v) {
data = v;
}
vec(const vec & v) {
data = v.data;
}
vec(const double a, const double b, const double c, const double d,
const double e, const double f, const double g, const double h) {
data = _mm512_setr_pd(a,b,c,d,e,f,g,h);
}
vec(const double* a) {
data = _mm512_setr_pd(*a, *(a + 1), *(a + 2), *(a + 3), *(a + 4), *(a + 5), *(a + 6), *(a + 7));
}
vec(const double* a, const int size) {
int offset = size / (int)sizeof(double);
data = _mm512_setr_pd(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset), *(a + 4 * offset), *(a + 5 * offset), *(a + 6 * offset), *(a + 7 * offset));
}
~vec(){}
const vec &operator=(const double v) {
data = _mm512_set1_pd(v);
return *this;
}
const vec &operator=(const vec & v) {
data = v.data;
return *this;
}
const vec &operator+=(const vec & v) {
data = _mm512_add_pd(data,v.data);
return *this;
}
const vec &operator-=(const vec & v) {
data = _mm512_sub_pd(data,v.data);
return *this;
}
const vec &operator*=(const vec & v) {
data = _mm512_mul_pd(data,v.data);
return *this;
}
const vec &operator/=(const vec & v) {
data = _mm512_div_pd(data,v.data);
return *this;
}
const vec &operator&=(const __mmask8 & v) {
data = _mm512_mask_mov_pd(_mm512_setzero_pd(),v,data);
return *this;
}
vec operator+(const vec & v) const {
return vec(_mm512_add_pd(data,v.data));
}
vec operator-(const vec & v) const {
return vec(_mm512_sub_pd(data,v.data));
}
vec operator*(const vec & v) const {
return vec(_mm512_mul_pd(data,v.data));
}
vec operator/(const vec & v) const {
return vec(_mm512_div_pd(data,v.data));
}
__mmask8 operator>(const vec & v) const {
return _mm512_cmp_pd_mask(data,v.data,_MM_CMPINT_GT);
}
__mmask8 operator<(const vec & v) const {
return _mm512_cmp_pd_mask(data,v.data,_MM_CMPINT_LT);
}
vec operator-() const {
return vec(_mm512_sub_pd(_mm512_setzero_pd(),data));
}
double &operator[](int i) {
return array[i];
}
const double &operator[](int i) const {
return array[i];
}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<8; i++) s << v[i] << ' ';
return s;
}
friend vec min(const vec & v, const vec & w) {
return vec(_mm512_min_pd(v.data,w.data));
}
friend vec max(const vec & v, const vec & w) {
return vec(_mm512_max_pd(v.data,w.data));
}
friend vec rsqrt(const vec & v) {
#if EXAFMM_RSQRT_APPROX
#ifdef __MIC__
vec temp = vec(_mm512_cvtps_pd(_mm256_rsqrt_ps(_mm512_cvtpd_ps(v.data))));
#else
vec temp = vec(_mm512_cvtps_pd(_mm256_rsqrt_ps(_mm512_cvtpd_ps(v.data))));
#endif
vec three = 3.0;
vec twelve = 12.0;
temp *= (three - temp * temp * v);
temp *= (twelve - temp * temp * v);
return temp;
#else
vec one = 1;
return vec(_mm512_div_pd(one.data,_mm512_sqrt_pd(v.data)));
#endif
}
#ifdef __INTEL_COMPILER
friend vec sin(const vec & v) {
return vec(_mm512_sin_pd(v.data));
}
friend vec cos(const vec & v) {
return vec(_mm512_cos_pd(v.data));
}
friend vec exp(const vec & v) {
return vec(_mm512_exp_pd(v.data));
}
#else
friend vec sin(const vec & v) {
vec temp = _mm512_setr_pd(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]),
std::sin(v[4]), std::sin(v[5]), std::sin(v[6]), std::sin(v[7]));
return temp;
}
friend vec cos(const vec & v) {
vec temp = _mm512_setr_pd(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]),
std::cos(v[4]), std::cos(v[5]), std::cos(v[6]), std::cos(v[7]));
return temp;
}
friend vec exp(const vec & v) {
vec temp = _mm512_setr_pd(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]),
std::exp(v[4]), std::exp(v[5]), std::exp(v[6]), std::exp(v[7]));
return temp;
}
#endif
};
#endif
#ifdef __AVX__
#if EXAFMM_VEC_VERBOSE
#pragma message("Overloading vector operators for AVX")
#endif
#include <immintrin.h>
template<>
class vec<8,float> {
private:
union {
__m256 data;
float array[8];
};
public:
vec(){}
vec(const float v) {
data = _mm256_set1_ps(v);
}
vec(const __m256 v) {
data = v;
}
vec(const vec & v) {
data = v.data;
}
vec(const float a, const float b, const float c, const float d,
const float e, const float f, const float g, const float h) {
data = _mm256_setr_ps(a,b,c,d,e,f,g,h);
}
vec(const float* a, const int size) {
int offset = size / (int)sizeof(float);
data = _mm256_setr_ps(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset), *(a + 4 * offset), *(a + 5 * offset), *(a + 6 * offset), *(a + 7 * offset));
}
~vec(){}
const vec &operator=(const float v) {
data = _mm256_set1_ps(v);
return *this;
}
const vec &operator=(const vec & v) {
data = v.data;
return *this;
}
const vec &operator+=(const vec & v) {
data = _mm256_add_ps(data,v.data);
return *this;
}
const vec &operator-=(const vec & v) {
data = _mm256_sub_ps(data,v.data);
return *this;
}
const vec &operator*=(const vec & v) {
data = _mm256_mul_ps(data,v.data);
return *this;
}
const vec &operator/=(const vec & v) {
data = _mm256_div_ps(data,v.data);
return *this;
}
const vec &operator&=(const vec & v) {
data = _mm256_and_ps(data,v.data);
return *this;
}
vec operator+(const vec & v) const {
return vec(_mm256_add_ps(data,v.data));
}
vec operator-(const vec & v) const {
return vec(_mm256_sub_ps(data,v.data));
}
vec operator*(const vec & v) const {
return vec(_mm256_mul_ps(data,v.data));
}
vec operator/(const vec & v) const {
return vec(_mm256_div_ps(data,v.data));
}
vec operator>(const vec & v) const {
return vec(_mm256_cmp_ps(data,v.data,_CMP_GT_OQ));
}
vec operator<(const vec & v) const {
return vec(_mm256_cmp_ps(data,v.data,_CMP_LT_OQ));
}
vec operator-() const {
return vec(_mm256_sub_ps(_mm256_setzero_ps(),data));
}
float &operator[](int i) {
return array[i];
}
const float &operator[](int i) const {
return array[i];
}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<8; i++) s << v[i] << ' ';
return s;
}
friend float sum(const vec & v) {
union {
__m256 temp;
float out[8];
};
temp = _mm256_permute2f128_ps(v.data,v.data,1);
temp = _mm256_add_ps(temp,v.data);
temp = _mm256_hadd_ps(temp,temp);
temp = _mm256_hadd_ps(temp,temp);
return out[0];
}
friend float norm(const vec & v) {
union {
__m256 temp;
float out[8];
};
temp = _mm256_mul_ps(v.data,v.data);
__m256 perm = _mm256_permute2f128_ps(temp,temp,1);
temp = _mm256_add_ps(temp,perm);
temp = _mm256_hadd_ps(temp,temp);
temp = _mm256_hadd_ps(temp,temp);
return out[0];
}
friend vec min(const vec & v, const vec & w) {
return vec(_mm256_min_ps(v.data,w.data));
}
friend vec max(const vec & v, const vec & w) {
return vec(_mm256_max_ps(v.data,w.data));
}
friend vec rsqrt(const vec & v) {
#if EXAFMM_RSQRT_APPROX
vec three = 3.0f;
vec twelve = 12.0f;
vec temp = vec(_mm256_rsqrt_ps(v.data));
temp *= (three - temp * temp * v);
temp *= (twelve - temp * temp * v);
return temp;
#else
vec one = 1;
return vec(_mm256_div_ps(one.data,_mm256_sqrt_ps(v.data)));
#endif
}
#ifdef __INTEL_COMPILER
friend vec sin(const vec & v) {
return vec(_mm256_sin_ps(v.data));
}
friend vec cos(const vec & v) {
return vec(_mm256_cos_ps(v.data));
}
friend vec exp(const vec & v) {
return vec(_mm256_exp_ps(v.data));
}
#else
friend vec sin(const vec & v) {
vec temp = _mm256_setr_ps(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]),
std::sin(v[4]), std::sin(v[5]), std::sin(v[6]), std::sin(v[7]));
return temp;
}
friend vec cos(const vec & v) {
vec temp = _mm256_setr_ps(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]),
std::cos(v[4]), std::cos(v[5]), std::cos(v[6]), std::cos(v[7]));
return temp;
}
friend vec exp(const vec & v) {
vec temp = _mm256_setr_ps(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]),
std::exp(v[4]), std::exp(v[5]), std::exp(v[6]), std::exp(v[7]));
return temp;
}
#endif
};
template<>
class vec<4,double> {
private:
union {
__m256d data;
double array[4];
};
public:
vec(){}
vec(const double v) {
data = _mm256_set1_pd(v);
}
vec(const __m256d v) {
data = v;
}
vec(const vec & v) {
data = v.data;
}
vec(const double a, const double b, const double c, const double d) {
data = _mm256_setr_pd(a,b,c,d);
}
vec(const double* a) {
data = _mm256_setr_pd(*a, *(a + 1), *(a + 2), *(a + 3));
}
vec(const double* a, const int size) {
int offset = size / (int)sizeof(double);
data = _mm256_setr_pd(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset));
}
~vec(){}
const vec &operator=(const double v) {
data = _mm256_set1_pd(v);
return *this;
}
const vec &operator=(const vec & v) {
data = v.data;
return *this;
}
const vec &operator+=(const vec & v) {
data = _mm256_add_pd(data,v.data);
return *this;
}
const vec &operator-=(const vec & v) {
data = _mm256_sub_pd(data,v.data);
return *this;
}
const vec &operator*=(const vec & v) {
data = _mm256_mul_pd(data,v.data);
return *this;
}
const vec &operator/=(const vec & v) {
data = _mm256_div_pd(data,v.data);
return *this;
}
const vec &operator&=(const vec & v) {
data = _mm256_and_pd(data,v.data);
return *this;
}
vec operator+(const vec & v) const {
return vec(_mm256_add_pd(data,v.data));
}
vec operator-(const vec & v) const {
return vec(_mm256_sub_pd(data,v.data));
}
vec operator*(const vec & v) const {
return vec(_mm256_mul_pd(data,v.data));
}
vec operator/(const vec & v) const {
return vec(_mm256_div_pd(data,v.data));
}
vec operator>(const vec & v) const {
return vec(_mm256_cmp_pd(data,v.data,_CMP_GT_OQ));
}
vec operator<(const vec & v) const {
return vec(_mm256_cmp_pd(data,v.data,_CMP_LT_OQ));
}
vec operator-() const {
return vec(_mm256_sub_pd(_mm256_setzero_pd(),data));
}
double &operator[](int i) {
return array[i];
}
const double &operator[](int i) const {
return array[i];
}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<4; i++) s << v[i] << ' ';
return s;
}
friend double sum(const vec & v) {
union {
__m256d temp;
double out[4];
};
temp = _mm256_permute2f128_pd(v.data,v.data,1);
temp = _mm256_add_pd(temp,v.data);
temp = _mm256_hadd_pd(temp,temp);
return out[0];
}
friend double norm(const vec & v) {
union {
__m256d temp;
double out[4];
};
temp = _mm256_mul_pd(v.data,v.data);
__m256d perm = _mm256_permute2f128_pd(temp,temp,1);
temp = _mm256_add_pd(temp,perm);
temp = _mm256_hadd_pd(temp,temp);
return out[0];
}
friend vec min(const vec & v, const vec & w) {
return vec(_mm256_min_pd(v.data,w.data));
}
friend vec max(const vec & v, const vec & w) {
return vec(_mm256_max_pd(v.data,w.data));
}
friend vec rsqrt(const vec & v) {
#if EXAFMM_RSQRT_APPROX
vec temp = vec(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(v.data))));
vec three = 3.0;
vec twelve = 12.0;
temp *= (three - temp * temp * v);
temp *= (twelve - temp * temp * v);
return temp;
#else
vec one = 1;
return vec(_mm256_div_pd(one.data,_mm256_sqrt_pd(v.data)));
#endif
}
#ifdef __INTEL_COMPILER
friend vec sin(const vec & v) {
return vec(_mm256_sin_pd(v.data));
}
friend vec cos(const vec & v) {
return vec(_mm256_cos_pd(v.data));
}
friend vec exp(const vec & v) {
return vec(_mm256_exp_pd(v.data));
}
#else
friend vec sin(const vec & v) {
vec temp = _mm256_setr_pd(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]));
return temp;
}
friend vec cos(const vec & v) {
vec temp = _mm256_setr_pd(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]));
return temp;
}
friend vec exp(const vec & v) {
vec temp = _mm256_setr_pd(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]));
return temp;
}
#endif
};
#endif
#ifdef __SSE__
#if EXAFMM_VEC_VERBOSE
#pragma message("Overloading vector operators for SSE")
#endif
#include <pmmintrin.h>
template<>
class vec<4,float> {
private:
union {
__m128 data;
float array[4];
};
public:
vec(){}
vec(const float v) {
data = _mm_set1_ps(v);
}
vec(const __m128 v) {
data = v;
}
vec(const vec & v) {
data = v.data;
}
vec(const float a, const float b, const float c, const float d) {
data = _mm_setr_ps(a,b,c,d);
}
vec(const float* a, const int size) {
int offset = size / (int)sizeof(float);
data = _mm_setr_ps(*a, *(a + 1 * offset), *(a + 2 * offset), *(a + 3 * offset));
}
~vec(){}
const vec &operator=(const float v) {
data = _mm_set1_ps(v);
return *this;
}
const vec &operator=(const vec & v) {
data = v.data;
return *this;
}
const vec &operator+=(const vec & v) {
data = _mm_add_ps(data,v.data);
return *this;
}
const vec &operator-=(const vec & v) {
data = _mm_sub_ps(data,v.data);
return *this;
}
const vec &operator*=(const vec & v) {
data = _mm_mul_ps(data,v.data);
return *this;
}
const vec &operator/=(const vec & v) {
data = _mm_div_ps(data,v.data);
return *this;
}
const vec &operator&=(const vec & v) {
data = _mm_and_ps(data,v.data);
return *this;
}
vec operator+(const vec & v) const {
return vec(_mm_add_ps(data,v.data));
}
vec operator-(const vec & v) const {
return vec(_mm_sub_ps(data,v.data));
}
vec operator*(const vec & v) const {
return vec(_mm_mul_ps(data,v.data));
}
vec operator/(const vec & v) const {
return vec(_mm_div_ps(data,v.data));
}
vec operator>(const vec & v) const {
return vec(_mm_cmpgt_ps(data,v.data));
}
vec operator<(const vec & v) const {
return vec(_mm_cmplt_ps(data,v.data));
}
vec operator-() const {
return vec(_mm_sub_ps(_mm_setzero_ps(),data));
}
float &operator[](int i) {
return array[i];
}
const float &operator[](int i) const {
return array[i];
}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<4; i++) s << v[i] << ' ';
return s;
}
friend float sum(const vec & v) {
union {
__m128 temp;
float out[4];
};
temp = _mm_hadd_ps(v.data,v.data);
temp = _mm_hadd_ps(temp,temp);
return out[0];
}
friend float norm(const vec & v) {
union {
__m128 temp;
float out[4];
};
temp = _mm_mul_ps(v.data,v.data);
temp = _mm_hadd_ps(temp,temp);
temp = _mm_hadd_ps(temp,temp);
return out[0];
}
friend vec min(const vec & v, const vec & w) {
return vec(_mm_min_ps(v.data,w.data));
}
friend vec max(const vec & v, const vec & w) {
return vec(_mm_max_ps(v.data,w.data));
}
friend vec rsqrt(const vec & v) {
#if EXAFMM_RSQRT_APPROX
vec temp = vec(_mm_rsqrt_ps(v.data));
vec three = 3.0f;
vec twelve = 12.0f;
temp *= (three - temp * temp * v);
temp *= (twelve - temp * temp * v);
return temp;
#else
vec one = 1;
return vec(_mm_div_ps(one.data,_mm_sqrt_ps(v.data)));
#endif
}
#ifdef __INTEL_COMPILER
friend vec sin(const vec &v) {
return vec(_mm_sin_ps(v.data));
}
friend vec cos(const vec &v) {
return vec(_mm_cos_ps(v.data));
}
friend vec exp(const vec &v) {
return vec(_mm_exp_ps(v.data));
}
#else
friend vec sin(const vec & v) {
vec temp = _mm_setr_ps(std::sin(v[0]), std::sin(v[1]), std::sin(v[2]), std::sin(v[3]));
return temp;
}
friend vec cos(const vec & v) {
vec temp = _mm_setr_ps(std::cos(v[0]), std::cos(v[1]), std::cos(v[2]), std::cos(v[3]));
return temp;
}
friend vec exp(const vec & v) {
vec temp = _mm_setr_ps(std::exp(v[0]), std::exp(v[1]), std::exp(v[2]), std::exp(v[3]));
return temp;
}
#endif
};
template<>
class vec<2,double> {
private:
union {
__m128d data;
double array[2];
};
public:
vec(){}
vec(const double v) {
data = _mm_set1_pd(v);
}
vec(const __m128d v) {
data = v;
}
vec(const vec & v) {
data = v.data;
}
vec(const double a, const double b) {
data = _mm_setr_pd(a,b);
}
vec(const double* a, const int size) {
int offset = size / (int)sizeof(double);
data = _mm_setr_pd(*a, *(a + 1 * offset));
}
~vec(){}
const vec &operator=(const double v) {
data = _mm_set1_pd(v);
return *this;
}
const vec &operator=(const vec & v) {
data = v.data;
return *this;
}
const vec &operator+=(const vec & v) {
data = _mm_add_pd(data,v.data);
return *this;
}
const vec &operator-=(const vec & v) {
data = _mm_sub_pd(data,v.data);
return *this;
}
const vec &operator*=(const vec & v) {
data = _mm_mul_pd(data,v.data);
return *this;
}
const vec &operator/=(const vec & v) {
data = _mm_div_pd(data,v.data);
return *this;
}
const vec &operator&=(const vec & v) {
data = _mm_and_pd(data,v.data);
return *this;
}
vec operator+(const vec & v) const {
return vec(_mm_add_pd(data,v.data));
}
vec operator-(const vec & v) const {
return vec(_mm_sub_pd(data,v.data));
}
vec operator*(const vec & v) const {
return vec(_mm_mul_pd(data,v.data));
}
vec operator/(const vec & v) const {
return vec(_mm_div_pd(data,v.data));
}
vec operator>(const vec & v) const {
return vec(_mm_cmpgt_pd(data,v.data));
}
vec operator<(const vec & v) const {
return vec(_mm_cmplt_pd(data,v.data));
}
vec operator-() const {
return vec(_mm_sub_pd(_mm_setzero_pd(),data));
}
double &operator[](int i) {
return array[i];
}
const double &operator[](int i) const {
return array[i];
}
friend std::ostream &operator<<(std::ostream & s, const vec & v) {
for (int i=0; i<2; i++) s << v[i] << ' ';
return s;
}
friend double sum(const vec & v) {
union {
__m128d temp;
double out[2];
};
temp = _mm_hadd_pd(v.data,v.data);
return out[0];
}
friend double norm(const vec & v) {
union {
__m128d temp;
double out[2];
};
temp = _mm_mul_pd(v.data,v.data);
temp = _mm_hadd_pd(temp,temp);
return out[0];
}
friend vec min(const vec & v, const vec & w) {
return vec(_mm_min_pd(v.data,w.data));
}
friend vec max(const vec & v, const vec & w) {
return vec(_mm_max_pd(v.data,w.data));
}
friend vec rsqrt(const vec & v) {
#if EXAFMM_RSQRT_APPROX
vec temp = vec(_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(v.data))));
vec three = 3.0;
vec twelve = 12.0;
temp *= (three - temp * temp * v);
temp *= (twelve - temp * temp * v);
return temp;
#else
vec one = 1;
return vec(_mm_div_pd(one.data,_mm_sqrt_pd(v.data)));
#endif
}
#ifdef __INTEL_COMPILER
friend vec sin(const vec &v) {
return vec(_mm_sin_pd(v.data));
}
friend vec cos(const vec &v) {
return vec(_mm_cos_pd(v.data));
}
friend vec exp(const vec &v) {
return vec(_mm_exp_pd(v.data));
}
#else
friend vec sin(const vec & v) {
vec temp = _mm_setr_pd(std::sin(v[0]), std::sin(v[1]));
return temp;
}
friend vec cos(const vec & v) {
vec temp = _mm_setr_pd(std::cos(v[0]), std::cos(v[1]));
return temp;
}
friend vec exp(const vec & v) {
vec temp = _mm_setr_pd(std::exp(v[0]), std::exp(v[1]));
return temp;
}
#endif
};
#endif
}
#endif