Program Listing for File helmholtz.h

Return to documentation for file (include/helmholtz.h)

#ifndef helmholtz_h
#define helmholtz_h
#include "exafmm_t.h"
#include "fmm.h"
#include "geometry.h"
#include "intrinsics.h"
#include "timer.h"

namespace exafmm_t {
  class HelmholtzFmm : public Fmm<complex_t> {
  public:
    complex_t wavek;

    HelmholtzFmm() {}
    HelmholtzFmm(int p_, int ncrit_, complex_t wavek_, std::string filename_=std::string()) :
      Fmm<complex_t>(p_, ncrit_, filename_)
    {
      wavek = wavek_;
      if (this->filename.empty()) {
        this->filename = std::string("helmholtz_") + (std::is_same<real_t, float>::value ? "f" : "d")
                       + std::string("_p") + std::to_string(p) + std::string(".dat");
      }
    }

    void potential_P2P(RealVec& src_coord, ComplexVec& src_value, RealVec& trg_coord, ComplexVec& trg_value) {
      simdvec zero((real_t)0);
      real_t newton_coef = 16;
      simdvec coef(real_t(1.0/(4*PI*newton_coef)));
      simdvec k_real(wavek.real()/newton_coef);
      simdvec k_imag(wavek.imag()/newton_coef);
      int nsrcs = src_coord.size() / 3;
      int ntrgs = trg_coord.size() / 3;
      int t;
      const complex_t I(0, 1);
      for (t=0; t+NSIMD<=ntrgs; t+=NSIMD) {
        simdvec tx(&trg_coord[3*t+0], 3*(int)sizeof(real_t));
        simdvec ty(&trg_coord[3*t+1], 3*(int)sizeof(real_t));
        simdvec tz(&trg_coord[3*t+2], 3*(int)sizeof(real_t));
        simdvec tv_real(zero);
        simdvec tv_imag(zero);
        for (int s=0; s<nsrcs; s++) {
          simdvec sx(src_coord[3*s+0]);
          sx -= tx;
          simdvec sy(src_coord[3*s+1]);
          sy -= ty;
          simdvec sz(src_coord[3*s+2]);
          sz -= tz;
          simdvec sv_real(src_value[s].real());
          simdvec sv_imag(src_value[s].imag());
          simdvec r2(zero);
          r2 += sx * sx;
          r2 += sy * sy;
          r2 += sz * sz;
          simdvec invr = rsqrt(r2);   // invr = newton_coef * 1/r
          invr &= r2 > zero;

          simdvec r = r2 * invr;
          simdvec kr_real = k_real * r;  // k_real * r
          simdvec kr_imag = k_imag * r;  // k_imag * r
          simdvec e_ikr = exp(-kr_imag); // exp(-k_imag*r)
          // simdvec kr = k * r2 * invr;   // newton_coefs in k & invr cancel out
          simdvec G_real = e_ikr * cos(kr_real) * invr;  // G = e^(ikr) / r
          simdvec G_imag = e_ikr * sin(kr_real) * invr;  // invr carries newton_coef
          tv_real += sv_real*G_real - sv_imag*G_imag;  // p += G * q
          tv_imag += sv_real*G_imag + sv_imag*G_real;
        }
        tv_real *= coef;  // coef carries 1/(4*PI) and offsets newton_coef in invr
        tv_imag *= coef;
        for (int m=0; m<NSIMD && (t+m)<ntrgs; m++) {
          trg_value[t+m] += complex_t(tv_real[m], tv_imag[m]);
        }
      }
      for (; t<ntrgs; t++) {
        complex_t potential(0, 0);
        for (int s=0; s<nsrcs; s++) {
          vec3 dx;
          for (int d=0; d<3; d++)
            dx[d] = trg_coord[3*t+d] - src_coord[3*s+d];
          real_t r2 = norm(dx);
          if (r2 != 0) {
            real_t r = std::sqrt(r2);
            potential += std::exp(I * r * wavek) * src_value[s] / r;
          }
        }
        trg_value[t] += potential / (4*PI);
      }
    }

    void gradient_P2P(RealVec& src_coord, ComplexVec& src_value, RealVec& trg_coord, ComplexVec& trg_value) {
      simdvec zero((real_t)0);
      simdvec one((real_t)1);
      real_t newton_coef = 16;   // comes from Newton's method in simd rsqrt function
      simdvec coef(real_t(1.0/(4*PI*newton_coef)));
      simdvec k_real(wavek.real()/newton_coef);
      simdvec k_imag(wavek.imag()/newton_coef);
      simdvec newton_offset(real_t(1.0/(newton_coef*newton_coef)));   // offset invr2 term
      int nsrcs = src_coord.size() / 3;
      int ntrgs = trg_coord.size() / 3;
      int t;
      const complex_t I(0, 1);
      for (t=0; t+NSIMD<=ntrgs; t+=NSIMD) {
        simdvec tx(&trg_coord[3*t+0], 3*(int)sizeof(real_t));
        simdvec ty(&trg_coord[3*t+1], 3*(int)sizeof(real_t));
        simdvec tz(&trg_coord[3*t+2], 3*(int)sizeof(real_t));
        simdvec tv0_real(zero);
        simdvec tv0_imag(zero);
        simdvec tv1_real(zero);
        simdvec tv1_imag(zero);
        simdvec tv2_real(zero);
        simdvec tv2_imag(zero);
        simdvec tv3_real(zero);
        simdvec tv3_imag(zero);
        for (int s=0; s<nsrcs; s++) {
          simdvec sx(src_coord[3*s+0]);
          sx -= tx;    // sx is negative dx
          simdvec sy(src_coord[3*s+1]);
          sy -= ty;
          simdvec sz(src_coord[3*s+2]);
          sz -= tz;
          simdvec sv_real(src_value[s].real());
          simdvec sv_imag(src_value[s].imag());
          simdvec r2(zero);
          r2 += sx * sx;
          r2 += sy * sy;
          r2 += sz * sz;
          simdvec invr = rsqrt(r2);
          invr &= r2 > zero;

          simdvec r = r2 * invr;
          simdvec kr_real = k_real * r;  // k_real * r
          simdvec kr_imag = k_imag * r;  // k_imag * r
          simdvec e_ikr = exp(-kr_imag); // exp(-k_imag*r)
          // simdvec kr = k * r2 * invr;   // newton_coefs in k & invr cancel out
          simdvec G_real = e_ikr * cos(kr_real) * invr;  // G = e^(ikr) / r
          simdvec G_imag = e_ikr * sin(kr_real) * invr;  // invr carries newton_coef
          simdvec potential_real = sv_real*G_real - sv_imag*G_imag;    // p = G * q
          simdvec potential_imag = sv_real*G_imag + sv_imag*G_real;
          tv0_real += potential_real;
          tv0_imag += potential_imag;

          // coefg = (1+k_imag*r)/r2 - k_real/r*I (considering the negative sign in sx)
          simdvec coefg_real = (one+kr_imag) * invr * invr * newton_offset;
          simdvec coefg_imag = - k_real * invr;
          // gradient = coefg * potential * dx
          simdvec gradient_real = coefg_real*potential_real - coefg_imag*potential_imag;
          simdvec gradient_imag = coefg_real*potential_imag + coefg_imag*potential_real;
          tv1_real += sx * gradient_real;
          tv1_imag += sx * gradient_imag;
          tv2_real += sy * gradient_real;
          tv2_imag += sy * gradient_imag;
          tv3_real += sz * gradient_real;
          tv3_imag += sz * gradient_imag;
        }
        tv0_real *= coef;
        tv0_imag *= coef;
        tv1_real *= coef;
        tv1_imag *= coef;
        tv2_real *= coef;
        tv2_imag *= coef;
        tv3_real *= coef;
        tv3_imag *= coef;
        for (int m=0; m<NSIMD && (t+m)<ntrgs; m++) {
          trg_value[4*(t+m)+0] += complex_t(tv0_real[m], tv0_imag[m]);
          trg_value[4*(t+m)+1] += complex_t(tv1_real[m], tv1_imag[m]);
          trg_value[4*(t+m)+2] += complex_t(tv2_real[m], tv2_imag[m]);
          trg_value[4*(t+m)+3] += complex_t(tv3_real[m], tv3_imag[m]);
        }
      }
      for (; t<ntrgs; t++) {
        complex_t potential(0, 0);
        cvec3 gradient = complex_t(0., 0.);
        for (int s=0; s<nsrcs; s++) {
          vec3 dx;
          for (int d=0; d<3; d++)
            dx[d] = trg_coord[3*t+d] - src_coord[3*s+d];
          real_t r2 = norm(dx);
          if (r2 != 0) {
            real_t r = std::sqrt(r2);
            complex_t potential_ij = std::exp(I * r * wavek) * src_value[s] / r;
            complex_t coefg = (I*wavek/r - 1/r2) * potential_ij;
            potential += potential_ij;
            for (int d=0; d<3; d++)
              gradient[d] += coefg * dx[d];
          }
        }
        trg_value[4*t+0] += potential / (4*PI);
        trg_value[4*t+1] += gradient[0] / (4*PI);
        trg_value[4*t+2] += gradient[1] / (4*PI);
        trg_value[4*t+3] += gradient[2] / (4*PI);
      }
    }
  };
}  // end namespace exafmm_t
#endif