Program Listing for File fmm.h

Return to documentation for file (include/fmm.h)

#ifndef fmm_h
#define fmm_h
#include <cstring>      // std::memset
#include <fstream>      // std::ofstream
#include <type_traits>  // std::is_same
#include "fmm_base.h"
#include "intrinsics.h"
#include "math_wrapper.h"

namespace exafmm_t {
  template <typename T>
  class Fmm : public FmmBase<T> {
  public:
    /* precomputation matrices */
    std::vector<std::vector<T>> matrix_UC2E_U;
    std::vector<std::vector<T>> matrix_UC2E_V;
    std::vector<std::vector<T>> matrix_DC2E_U;
    std::vector<std::vector<T>> matrix_DC2E_V;
    std::vector<std::vector<std::vector<T>>> matrix_M2M;
    std::vector<std::vector<std::vector<T>>> matrix_L2L;

    std::vector<M2LData> m2ldata;

    /* constructors */
    Fmm() {}

    Fmm(int p_, int ncrit_, std::string filename_=std::string()) : FmmBase<T>(p_, ncrit_, filename_) {}

    /* precomputation */
    void initialize_matrix() {
      int& nsurf_ = this->nsurf;
      int& depth_ = this->depth;
      matrix_UC2E_V.resize(depth_+1, std::vector<T>(nsurf_*nsurf_));
      matrix_UC2E_U.resize(depth_+1, std::vector<T>(nsurf_*nsurf_));
      matrix_DC2E_V.resize(depth_+1, std::vector<T>(nsurf_*nsurf_));
      matrix_DC2E_U.resize(depth_+1, std::vector<T>(nsurf_*nsurf_));
      matrix_M2M.resize(depth_+1);
      matrix_L2L.resize(depth_+1);
      for (int level=0; level<=depth_; ++level) {
        matrix_M2M[level].resize(REL_COORD[M2M_Type].size(), std::vector<T>(nsurf_*nsurf_));
        matrix_L2L[level].resize(REL_COORD[L2L_Type].size(), std::vector<T>(nsurf_*nsurf_));
      }
    }

    void precompute_M2M() {
      int& nsurf_ = this->nsurf;
      real_t parent_coord[3] = {0, 0, 0};
      for (int level=0; level<=this->depth; level++) {
        RealVec parent_up_check_surf = surface(this->p, this->r0, level, parent_coord, 2.95);
        real_t s = this->r0 * powf(0.5, level+1);
        int npos = REL_COORD[M2M_Type].size();  // number of relative positions
#pragma omp parallel for
        for(int i=0; i<npos; i++) {
          // compute kernel matrix
          ivec3& coord = REL_COORD[M2M_Type][i];
          real_t child_coord[3] = {parent_coord[0] + coord[0]*s,
                                   parent_coord[1] + coord[1]*s,
                                   parent_coord[2] + coord[2]*s};
          RealVec child_up_equiv_surf = surface(this->p, this->r0, level+1, child_coord, 1.05);
          std::vector<T> matrix_pc2ce(nsurf_*nsurf_);
          this->kernel_matrix(parent_up_check_surf, child_up_equiv_surf, matrix_pc2ce);
          // M2M
          std::vector<T> buffer(nsurf_*nsurf_);
          gemm(nsurf_, nsurf_, nsurf_, &(matrix_UC2E_U[level][0]), &matrix_pc2ce[0], &buffer[0]);
          gemm(nsurf_, nsurf_, nsurf_, &(matrix_UC2E_V[level][0]), &buffer[0], &(matrix_M2M[level][i][0]));
          // L2L
          matrix_pc2ce = transpose(matrix_pc2ce, nsurf_, nsurf_);
          gemm(nsurf_, nsurf_, nsurf_, &matrix_pc2ce[0], &(matrix_DC2E_V[level][0]), &buffer[0]);
          gemm(nsurf_, nsurf_, nsurf_, &buffer[0], &(matrix_DC2E_U[level][0]), &(matrix_L2L[level][i][0]));
        }
      }
    }

    void precompute_check2equiv() {}

    void precompute_M2L(std::ofstream& file) {}

    void save_matrix(std::ofstream& file) {
      file.write(reinterpret_cast<char*>(&this->r0), sizeof(real_t));  // r0
      size_t size = this->nsurf * this->nsurf;
      for(int l=0; l<=this->depth; l++) {
        // UC2E, DC2E
        file.write(reinterpret_cast<char*>(&matrix_UC2E_U[l][0]), size*sizeof(T));
        file.write(reinterpret_cast<char*>(&matrix_UC2E_V[l][0]), size*sizeof(T));
        file.write(reinterpret_cast<char*>(&matrix_DC2E_U[l][0]), size*sizeof(T));
        file.write(reinterpret_cast<char*>(&matrix_DC2E_V[l][0]), size*sizeof(T));
        // M2M, L2L
        for (auto & vec : matrix_M2M[l]) {
          file.write(reinterpret_cast<char*>(&vec[0]), size*sizeof(T));
        }
        for (auto & vec : matrix_L2L[l]) {
          file.write(reinterpret_cast<char*>(&vec[0]), size*sizeof(T));
        }
      }
    }

    void load_matrix() {
      int& nsurf_ = this->nsurf;
      int& depth_ = this->depth;
      size_t size_M2L = this->nfreq * 2 * NCHILD * NCHILD;
      size_t file_size = (2*REL_COORD[M2M_Type].size()+4) * nsurf_ * nsurf_ * (depth_+1) * sizeof(T)
                       + REL_COORD[M2L_Type].size() * size_M2L * depth_ * sizeof(real_t)
                       + 1 * sizeof(real_t);   // +1 denotes r0
      std::ifstream file(this->filename, std::ifstream::binary);
      if (file.good()) {
        file.seekg(0, file.end);
        if (size_t(file.tellg()) == file_size) {   // if file size is correct
          file.seekg(0, file.beg);  // move the position back to the beginning
          real_t r0_;
          file.read(reinterpret_cast<char*>(&r0_), sizeof(real_t));
          if (this->r0 == r0_) {    // if radius match
            size_t size = nsurf_ * nsurf_;
            for (int l=0; l<=depth_; l++) {
              // UC2E, DC2E
              file.read(reinterpret_cast<char*>(&matrix_UC2E_U[l][0]), size*sizeof(T));
              file.read(reinterpret_cast<char*>(&matrix_UC2E_V[l][0]), size*sizeof(T));
              file.read(reinterpret_cast<char*>(&matrix_DC2E_U[l][0]), size*sizeof(T));
              file.read(reinterpret_cast<char*>(&matrix_DC2E_V[l][0]), size*sizeof(T));
              // M2M, L2L
              for (auto& vec : matrix_M2M[l]) {
                file.read(reinterpret_cast<char*>(&vec[0]), size*sizeof(T));
              }
              for (auto& vec : matrix_L2L[l]) {
                file.read(reinterpret_cast<char*>(&vec[0]), size*sizeof(T));
              }
            }
            this->is_precomputed = true;
          }
        }
      }
      file.close();
    }

    void precompute() {
      initialize_matrix();
      load_matrix();
      if (!this->is_precomputed) {
        precompute_check2equiv();
        precompute_M2M();
        std::remove(this->filename.c_str());
        std::ofstream file(this->filename, std::ofstream::binary);
        save_matrix(file);
        precompute_M2L(file);
        file.close();
      }
    }

    void P2M(NodePtrs<T>& leafs) {
      int& nsurf_ = this->nsurf;
      real_t c[3] = {0,0,0};
      std::vector<RealVec> up_check_surf;
      up_check_surf.resize(this->depth+1);
      for (int level=0; level<=this->depth; level++) {
        up_check_surf[level].resize(nsurf_*3);
        up_check_surf[level] = surface(this->p, this->r0, level, c, 2.95);
      }
#pragma omp parallel for
      for (size_t i=0; i<leafs.size(); i++) {
        Node<T>* leaf = leafs[i];
        int level = leaf->level;
        // calculate upward check potential induced by sources' charges
        RealVec check_coord(nsurf_*3);
        for (int k=0; k<nsurf_; k++) {
          check_coord[3*k+0] = up_check_surf[level][3*k+0] + leaf->x[0];
          check_coord[3*k+1] = up_check_surf[level][3*k+1] + leaf->x[1];
          check_coord[3*k+2] = up_check_surf[level][3*k+2] + leaf->x[2];
        }
        this->potential_P2P(leaf->src_coord, leaf->src_value,
                            check_coord, leaf->up_equiv);
        std::vector<T> buffer(nsurf_);
        std::vector<T> equiv(nsurf_);
        gemv(nsurf_, nsurf_, &(matrix_UC2E_U[level][0]), &(leaf->up_equiv[0]), &buffer[0]);
        gemv(nsurf_, nsurf_, &(matrix_UC2E_V[level][0]), &buffer[0], &equiv[0]);
        for (int k=0; k<nsurf_; k++)
          leaf->up_equiv[k] = equiv[k];
      }
    }

    void L2P(NodePtrs<T>& leafs) {
      int& nsurf_ = this->nsurf;
      real_t c[3] = {0,0,0};
      std::vector<RealVec> dn_equiv_surf;
      dn_equiv_surf.resize(this->depth+1);
      for (int level=0; level<=this->depth; level++) {
        dn_equiv_surf[level].resize(nsurf_*3);
        dn_equiv_surf[level] = surface(this->p, this->r0, level, c, 2.95);
      }
#pragma omp parallel for
      for (size_t i=0; i<leafs.size(); i++) {
        Node<T>* leaf = leafs[i];
        int level = leaf->level;
        // down check surface potential -> equivalent surface charge
        std::vector<T> buffer(nsurf_);
        std::vector<T> equiv(nsurf_);
        gemv(nsurf_, nsurf_, &(matrix_DC2E_U[level][0]), &(leaf->dn_equiv[0]), &buffer[0]);
        gemv(nsurf_, nsurf_, &(matrix_DC2E_V[level][0]), &buffer[0], &equiv[0]);
        for (int k=0; k<nsurf_; k++)
          leaf->dn_equiv[k] = equiv[k];
        // equivalent surface charge -> target potential
        RealVec equiv_coord(nsurf_*3);
        for (int k=0; k<nsurf_; k++) {
          equiv_coord[3*k+0] = dn_equiv_surf[level][3*k+0] + leaf->x[0];
          equiv_coord[3*k+1] = dn_equiv_surf[level][3*k+1] + leaf->x[1];
          equiv_coord[3*k+2] = dn_equiv_surf[level][3*k+2] + leaf->x[2];
        }
        this->gradient_P2P(equiv_coord, leaf->dn_equiv,
                           leaf->trg_coord, leaf->trg_value);
      }
    }

    void M2M(Node<T>* node) {
      int& nsurf_ = this->nsurf;
      if (node->is_leaf) return;
      for (int octant=0; octant<8; octant++) {
        if (node->children[octant])
#pragma omp task untied
          M2M(node->children[octant]);
      }
#pragma omp taskwait
      for (int octant=0; octant<8; octant++) {
        if (node->children[octant]) {
          Node<T>* child = node->children[octant];
          std::vector<T> buffer(nsurf_);
          int level = node->level;
          gemv(nsurf_, nsurf_, &(matrix_M2M[level][octant][0]), &child->up_equiv[0], &buffer[0]);
          for (int k=0; k<nsurf_; k++) {
            node->up_equiv[k] += buffer[k];
          }
        }
      }
    }

    void L2L(Node<T>* node) {
      int& nsurf_ = this->nsurf;
      if (node->is_leaf) return;
      for (int octant=0; octant<8; octant++) {
        if (node->children[octant]) {
          Node<T>* child = node->children[octant];
          std::vector<T> buffer(nsurf_);
          int level = node->level;
          gemv(nsurf_, nsurf_, &(matrix_L2L[level][octant][0]), &node->dn_equiv[0], &buffer[0]);
          for (int k=0; k<nsurf_; k++)
            child->dn_equiv[k] += buffer[k];
        }
      }
      for (int octant=0; octant<8; octant++) {
        if (node->children[octant])
#pragma omp task untied
          L2L(node->children[octant]);
      }
#pragma omp taskwait
    }

    void M2L_setup(NodePtrs<T>& nonleafs) {
      int& nsurf_ = this->nsurf;
      int& depth_ = this->depth;
      int npos = REL_COORD[M2L_Type].size();  // number of M2L relative positions
      m2ldata.resize(depth_);                  // initialize m2ldata

      // construct lists of target nodes for M2L operator at each level
      std::vector<NodePtrs<T>> trg_nodes(depth_);
      for (size_t i=0; i<nonleafs.size(); i++) {
        trg_nodes[nonleafs[i]->level].push_back(nonleafs[i]);
      }

      // prepare for m2ldata for each level
      for (int l=0; l<depth_; l++) {
        // construct M2L source nodes for current level
        std::set<Node<T>*> src_nodes_;
        for (size_t i=0; i<trg_nodes[l].size(); i++) {
          NodePtrs<T>& M2L_list = trg_nodes[l][i]->M2L_list;
          for (int k=0; k<npos; k++) {
            if (M2L_list[k])
              src_nodes_.insert(M2L_list[k]);
          }
        }
        NodePtrs<T> src_nodes;
        auto it = src_nodes_.begin();
        for (; it!=src_nodes_.end(); it++) {
          src_nodes.push_back(*it);
        }
        // prepare the indices of src_nodes & trg_nodes in all_up_equiv & all_dn_equiv
        std::vector<size_t> fft_offset(src_nodes.size());       // displacement in all_up_equiv
        std::vector<size_t> ifft_offset(trg_nodes[l].size());  // displacement in all_dn_equiv
        for (size_t i=0; i<src_nodes.size(); i++) {
          fft_offset[i] = src_nodes[i]->children[0]->idx * nsurf_;
        }
        for (size_t i=0; i<trg_nodes[l].size(); i++) {
          ifft_offset[i] = trg_nodes[l][i]->children[0]->idx * nsurf_;
        }

        // calculate interaction_offset_f & interaction_count_offset
        std::vector<size_t> interaction_offset_f;
        std::vector<size_t> interaction_count_offset;
        for (size_t i=0; i<src_nodes.size(); i++) {
          src_nodes[i]->idx_M2L = i;  // node_id: node's index in src_nodes list
        }
        size_t nblk_trg = trg_nodes[l].size() * sizeof(real_t) / CACHE_SIZE;
        if (nblk_trg==0) nblk_trg = 1;
        size_t interaction_count_offset_ = 0;
        size_t fft_size = 2 * NCHILD * this->nfreq;
        for (size_t iblk_trg=0; iblk_trg<nblk_trg; iblk_trg++) {
          size_t blk_start = (trg_nodes[l].size()* iblk_trg   ) / nblk_trg;
          size_t blk_end   = (trg_nodes[l].size()*(iblk_trg+1)) / nblk_trg;
          for (int k=0; k<npos; k++) {
            for (size_t i=blk_start; i<blk_end; i++) {
              NodePtrs<T>& M2L_list = trg_nodes[l][i]->M2L_list;
              if (M2L_list[k]) {
                interaction_offset_f.push_back(M2L_list[k]->idx_M2L * fft_size);   // src_node's displacement in fft_in
                interaction_offset_f.push_back(        i           * fft_size);   // trg_node's displacement in fft_out
                interaction_count_offset_++;
              }
            }
            interaction_count_offset.push_back(interaction_count_offset_);
          }
        }
        m2ldata[l].fft_offset = fft_offset;
        m2ldata[l].ifft_offset = ifft_offset;
        m2ldata[l].interaction_offset_f = interaction_offset_f;
        m2ldata[l].interaction_count_offset = interaction_count_offset;
      }
    }

  void hadamard_product(std::vector<size_t>& interaction_count_offset, std::vector<size_t>& interaction_offset_f,
                        AlignedVec& fft_in, AlignedVec& fft_out, std::vector<AlignedVec>& matrix_M2L) {
      size_t fft_size = 2 * NCHILD * this->nfreq;
      AlignedVec zero_vec0(fft_size, 0.);
      AlignedVec zero_vec1(fft_size, 0.);

      size_t npos = matrix_M2L.size();
      size_t nblk_inter = interaction_count_offset.size();   // num of blocks of interactions
      size_t nblk_trg = nblk_inter / npos;                   // num of blocks based on trg_nodes
      int BLOCK_SIZE = CACHE_SIZE * 2 / sizeof(real_t);
      std::vector<real_t*> IN_(BLOCK_SIZE*nblk_inter);
      std::vector<real_t*> OUT_(BLOCK_SIZE*nblk_inter);

      // initialize fft_out with zero
#pragma omp parallel for
      for (size_t i=0; i<fft_out.capacity()/fft_size; ++i) {
        std::memset(fft_out.data()+i*fft_size, 0, fft_size*sizeof(real_t));
      }

#pragma omp parallel for
      for (size_t iblk_inter=0; iblk_inter<nblk_inter; iblk_inter++) {
        size_t interaction_count_offset0 = (iblk_inter==0 ? 0 : interaction_count_offset[iblk_inter-1]);
        size_t interaction_count_offset1 = interaction_count_offset[iblk_inter];
        size_t interaction_count = interaction_count_offset1 - interaction_count_offset0;
        for (size_t j=0; j<interaction_count; j++) {
          IN_ [BLOCK_SIZE*iblk_inter+j] = &fft_in[interaction_offset_f[(interaction_count_offset0+j)*2+0]];
          OUT_[BLOCK_SIZE*iblk_inter+j] = &fft_out[interaction_offset_f[(interaction_count_offset0+j)*2+1]];
        }
        IN_ [BLOCK_SIZE*iblk_inter+interaction_count] = &zero_vec0[0];
        OUT_[BLOCK_SIZE*iblk_inter+interaction_count] = &zero_vec1[0];
      }

      for (size_t iblk_trg=0; iblk_trg<nblk_trg; iblk_trg++) {
#pragma omp parallel for
        for (int k=0; k<this->nfreq; k++) {
          for (size_t ipos=0; ipos<npos; ipos++) {
            size_t iblk_inter = iblk_trg*npos + ipos;
            size_t interaction_count_offset0 = (iblk_inter==0 ? 0 : interaction_count_offset[iblk_inter-1]);
            size_t interaction_count_offset1 = interaction_count_offset[iblk_inter];
            size_t interaction_count = interaction_count_offset1 - interaction_count_offset0;
            real_t** IN = &IN_[BLOCK_SIZE*iblk_inter];
            real_t** OUT= &OUT_[BLOCK_SIZE*iblk_inter];
            real_t* M = &matrix_M2L[ipos][k*2*NCHILD*NCHILD]; // k-th freq's (row) offset in matrix_M2L
            for (size_t j=0; j<interaction_count; j+=2) {
              real_t* M_   = M;
              real_t* IN0  = IN [j+0] + k*NCHILD*2;   // go to k-th freq chunk
              real_t* IN1  = IN [j+1] + k*NCHILD*2;
              real_t* OUT0 = OUT[j+0] + k*NCHILD*2;
              real_t* OUT1 = OUT[j+1] + k*NCHILD*2;
              matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
            }
          }
        }
      }
    }

    void fft_up_equiv(std::vector<size_t>& fft_offset, std::vector<T>& all_up_equiv, AlignedVec& fft_in) {}

    void ifft_dn_check(std::vector<size_t>& ifft_offset, AlignedVec& fft_out, std::vector<T>& all_dn_equiv) {}

    void M2L(Nodes<T>& nodes) {
      int& nsurf_ = this->nsurf;
      int& nfreq_ = this->nfreq;
      int fft_size = 2 * NCHILD * nfreq_;
      int nnodes = nodes.size();
      int npos = REL_COORD[M2L_Type].size();   // number of relative positions

      // allocate memory
      std::vector<T> all_up_equiv, all_dn_equiv;
      all_up_equiv.reserve(nnodes*nsurf_);
      all_dn_equiv.reserve(nnodes*nsurf_);
      std::vector<AlignedVec> matrix_M2L(npos, AlignedVec(fft_size*NCHILD, 0));

      // setup ifstream of M2L precomputation matrix
      std::ifstream ifile(this->filename, std::ifstream::binary);
      ifile.seekg(0, ifile.end);
      size_t fsize = ifile.tellg();   // file size in bytes
      size_t msize = NCHILD * NCHILD * nfreq_ * 2 * sizeof(real_t);   // size in bytes for each M2L matrix
      ifile.seekg(fsize - this->depth*npos*msize, ifile.beg);   // go to the start of M2L section

      // collect all upward equivalent charges
#pragma omp parallel for collapse(2)
      for (int i=0; i<nnodes; ++i) {
        for (int j=0; j<nsurf_; ++j) {
          all_up_equiv[i*nsurf_+j] = nodes[i].up_equiv[j];
          all_dn_equiv[i*nsurf_+j] = nodes[i].dn_equiv[j];
        }
      }
      // FFT-accelerate M2L
      for (int l=0; l<this->depth; ++l) {
        // load M2L matrix for current level
        for (int i=0; i<npos; ++i) {
          ifile.read(reinterpret_cast<char*>(matrix_M2L[i].data()), msize);
        }
        AlignedVec fft_in, fft_out;
        fft_in.reserve(m2ldata[l].fft_offset.size()*fft_size);
        fft_out.reserve(m2ldata[l].ifft_offset.size()*fft_size);
        fft_up_equiv(m2ldata[l].fft_offset, all_up_equiv, fft_in);
        hadamard_product(m2ldata[l].interaction_count_offset,
                         m2ldata[l].interaction_offset_f,
                         fft_in, fft_out, matrix_M2L);
        ifft_dn_check(m2ldata[l].ifft_offset, fft_out, all_dn_equiv);
      }
      // update all downward check potentials
#pragma omp parallel for collapse(2)
      for (int i=0; i<nnodes; ++i) {
        for (int j=0; j<nsurf_; ++j) {
          nodes[i].dn_equiv[j] = all_dn_equiv[i*nsurf_+j];
        }
      }
      ifile.close();   // close ifstream
    }
  };

  template <>
  void Fmm<real_t>::precompute_check2equiv() {
    real_t c[3] = {0, 0, 0};
    int& nsurf_ = this->nsurf;
#pragma omp parallel for
    for (int level=0; level<=this->depth; ++level) {
      // compute kernel matrix
      RealVec up_check_surf = surface(this->p, this->r0, level, c, 2.95);
      RealVec up_equiv_surf = surface(this->p, this->r0, level, c, 1.05);
      RealVec matrix_c2e(nsurf_*nsurf_);  // UC2UE
      this->kernel_matrix(up_check_surf, up_equiv_surf, matrix_c2e);

      // svd
      RealVec S(nsurf_*nsurf_);  // singular values
      RealVec U(nsurf_*nsurf_), VT(nsurf_*nsurf_);
      svd(nsurf_, nsurf_, &matrix_c2e[0], &S[0], &U[0], &VT[0]);

      // pseudo-inverse
      real_t max_S = 0;
      for (int i=0; i<nsurf_; i++) {
        max_S = fabs(S[i*nsurf_+i])>max_S ? fabs(S[i*nsurf_+i]) : max_S;
      }
      for (int i=0; i<nsurf_; i++) {
        S[i*nsurf_+i] = S[i*nsurf_+i]>EPS*max_S*4 ? 1.0/S[i*nsurf_+i] : 0.0;
      }
      RealVec V = transpose(VT, nsurf_, nsurf_);
      matrix_UC2E_U[level] = transpose(U, nsurf_, nsurf_);
      gemm(nsurf_, nsurf_, nsurf_, &V[0], &S[0], &(matrix_UC2E_V[level][0]));
      matrix_DC2E_U[level] = VT;
      gemm(nsurf_, nsurf_, nsurf_, &U[0], &S[0], &(matrix_DC2E_V[level][0]));
    }
  }

  template <>
  void Fmm<complex_t>::precompute_check2equiv() {
    real_t c[3] = {0, 0, 0};
    int& nsurf_ = this->nsurf;
#pragma omp parallel for
    for (int level=0; level<=this->depth; ++level) {
      // compute kernel matrix
      RealVec up_check_surf = surface(this->p, this->r0, level, c, 2.95);
      RealVec up_equiv_surf = surface(this->p, this->r0, level, c, 1.05);
      ComplexVec matrix_c2e(nsurf_*nsurf_);  // UC2UE
      this->kernel_matrix(up_check_surf, up_equiv_surf, matrix_c2e);

      // svd
      RealVec S(nsurf_*nsurf_);  // singular values
      ComplexVec U(nsurf_*nsurf_), VH(nsurf_*nsurf_);
      svd(nsurf_, nsurf_, &matrix_c2e[0], &S[0], &U[0], &VH[0]);

      // pseudo-inverse
      real_t max_S = 0;
      for (int i=0; i<nsurf_; i++) {
        max_S = fabs(S[i*nsurf_+i])>max_S ? fabs(S[i*nsurf_+i]) : max_S;
      }
      for (int i=0; i<nsurf_; i++) {
        S[i*nsurf_+i] = S[i*nsurf_+i]>EPS*max_S*4 ? 1.0/S[i*nsurf_+i] : 0.0;
      }
      ComplexVec S_(nsurf_*nsurf_);
      for (size_t i=0; i<S_.size(); i++) {   // convert S to complex type
        S_[i] = S[i];
      }
      ComplexVec V = conjugate_transpose(VH, nsurf_, nsurf_);
      ComplexVec UH = conjugate_transpose(U, nsurf_, nsurf_);
      matrix_UC2E_U[level] = UH;
      gemm(nsurf_, nsurf_, nsurf_, &V[0], &S_[0], &(matrix_UC2E_V[level][0]));
      matrix_DC2E_U[level] = transpose(V, nsurf_, nsurf_);
      ComplexVec UHT = transpose(UH, nsurf_, nsurf_);
      gemm(nsurf_, nsurf_, nsurf_, &UHT[0], &S_[0], &(matrix_DC2E_V[level][0]));
    }
  }

  template <>
  void Fmm<real_t>::precompute_M2L(std::ofstream& file) {
    int n1 = this->p * 2;
    int& nconv_ = this->nconv;
    int& nfreq_ = this->nfreq;
    int fft_size = 2 * nfreq_ * NCHILD * NCHILD;
    std::vector<RealVec> matrix_M2L_Helper(REL_COORD[M2L_Helper_Type].size(),
                                           RealVec(2*nfreq_));
    std::vector<AlignedVec> matrix_M2L(REL_COORD[M2L_Type].size(), AlignedVec(fft_size));
    // create fft plan
    RealVec fftw_in(nconv_);
    RealVec fftw_out(2*nfreq_);
    int dim[3] = {n1, n1, n1};
    fft_plan plan = fft_plan_dft_r2c(3, dim, fftw_in.data(), reinterpret_cast<fft_complex*>(fftw_out.data()), FFTW_ESTIMATE);
    RealVec trg_coord(3,0);
    for (int l=1; l<this->depth+1; ++l) {
      // compute M2L kernel matrix, perform DFT
#pragma omp parallel for
      for (size_t i=0; i<REL_COORD[M2L_Helper_Type].size(); ++i) {
        real_t coord[3];
        for (int d=0; d<3; d++) {
          coord[d] = REL_COORD[M2L_Helper_Type][i][d] * this->r0 * powf(0.5, l-1);  // relative coords
        }
        RealVec conv_coord = convolution_grid(this->p, this->r0, l, coord);   // convolution grid
        RealVec conv_value(nconv_);   // potentials on convolution grid
        this->kernel_matrix(conv_coord, trg_coord, conv_value);
        fft_execute_dft_r2c(plan, conv_value.data(), reinterpret_cast<fft_complex*>(matrix_M2L_Helper[i].data()));
      }
      // convert M2L_Helper to M2L and reorder data layout to improve locality
#pragma omp parallel for
      for (size_t i=0; i<REL_COORD[M2L_Type].size(); ++i) {
        for (int j=0; j<NCHILD*NCHILD; j++) {   // loop over child's relative positions
          int child_rel_idx = M2L_INDEX_MAP[i][j];
          if (child_rel_idx != -1) {
            for (int k=0; k<nfreq_; k++) {   // loop over frequencies
              int new_idx = k*(2*NCHILD*NCHILD) + 2*j;
              matrix_M2L[i][new_idx+0] = matrix_M2L_Helper[child_rel_idx][k*2+0] / nconv_;   // real
              matrix_M2L[i][new_idx+1] = matrix_M2L_Helper[child_rel_idx][k*2+1] / nconv_;   // imag
            }
          }
        }
      }
      // write to file
      for(auto& vec : matrix_M2L) {
        file.write(reinterpret_cast<char*>(vec.data()), fft_size*sizeof(real_t));
      }
    }
    // destroy fftw plan
    fft_destroy_plan(plan);
  }

  template <>
  void Fmm<complex_t>::precompute_M2L(std::ofstream& file) {
    int n1 = this->p * 2;
    int& nconv_ = this->nconv;
    int& nfreq_ = this->nfreq;
    int fft_size = 2 * nfreq_ * NCHILD * NCHILD;
    std::vector<RealVec> matrix_M2L_Helper(REL_COORD[M2L_Helper_Type].size(),
                                           RealVec(2*nfreq_));
    std::vector<AlignedVec> matrix_M2L(REL_COORD[M2L_Type].size(), AlignedVec(fft_size));
    // create fft plan
    RealVec fftw_in(nconv_);
    RealVec fftw_out(2*nfreq_);
    int dim[3] = {n1, n1, n1};
    fft_plan plan = fft_plan_dft(3, dim,
                                 reinterpret_cast<fft_complex*>(fftw_in.data()),
                                 reinterpret_cast<fft_complex*>(fftw_out.data()),
                                 FFTW_FORWARD, FFTW_ESTIMATE);
    RealVec trg_coord(3,0);
    for (int l=1; l<this->depth+1; ++l) {
      // compute M2L kernel matrix, perform DFT
#pragma omp parallel for
      for (size_t i=0; i<REL_COORD[M2L_Helper_Type].size(); ++i) {
        real_t coord[3];
        for (int d=0; d<3; d++) {
          coord[d] = REL_COORD[M2L_Helper_Type][i][d] * this->r0 * powf(0.5, l-1);  // relative coords
        }
        RealVec conv_coord = convolution_grid(this->p, this->r0, l, coord);   // convolution grid
        ComplexVec conv_value(nconv_);   // potentials on convolution grid
        this->kernel_matrix(conv_coord, trg_coord, conv_value);
        fft_execute_dft(plan, reinterpret_cast<fft_complex*>(conv_value.data()),
                              reinterpret_cast<fft_complex*>(matrix_M2L_Helper[i].data()));
      }
      // convert M2L_Helper to M2L and reorder data layout to improve locality
#pragma omp parallel for
      for (size_t i=0; i<REL_COORD[M2L_Type].size(); ++i) {
        for (int j=0; j<NCHILD*NCHILD; j++) {   // loop over child's relative positions
          int child_rel_idx = M2L_INDEX_MAP[i][j];
          if (child_rel_idx != -1) {
            for (int k=0; k<nfreq_; k++) {   // loop over frequencies
              int new_idx = k*(2*NCHILD*NCHILD) + 2*j;
              matrix_M2L[i][new_idx+0] = matrix_M2L_Helper[child_rel_idx][k*2+0] / nconv_;   // real
              matrix_M2L[i][new_idx+1] = matrix_M2L_Helper[child_rel_idx][k*2+1] / nconv_;   // imag
            }
          }
        }
      }
      // write to file
      for(auto& vec : matrix_M2L) {
        file.write(reinterpret_cast<char*>(vec.data()), fft_size*sizeof(real_t));
      }
    }
    // destroy fftw plan
    fft_destroy_plan(plan);
  }

  template <>
  void Fmm<real_t>::fft_up_equiv(std::vector<size_t>& fft_offset, RealVec& all_up_equiv, AlignedVec& fft_in) {
    int& nsurf_ = this->nsurf;
    int& nconv_ = this->nconv;
    int& nfreq_ = this->nfreq;
    int n1 = this->p * 2;
    auto map = generate_surf2conv_up(p);

    size_t fft_size = 2 * NCHILD * nfreq_;
    AlignedVec fftw_in(nconv_ * NCHILD);
    AlignedVec fftw_out(fft_size);
    int dim[3] = {n1, n1, n1};
    fft_plan plan = fft_plan_many_dft_r2c(3, dim, NCHILD,
                                          (real_t*)&fftw_in[0], nullptr, 1, nconv_,
                                          (fft_complex*)(&fftw_out[0]), nullptr, 1, nfreq_,
                                          FFTW_ESTIMATE);

#pragma omp parallel for
    for (size_t node_idx=0; node_idx<fft_offset.size(); node_idx++) {
      RealVec buffer(fft_size, 0);
      RealVec equiv_t(NCHILD*nconv_, 0.);

      real_t* up_equiv = &all_up_equiv[fft_offset[node_idx]];  // offset ptr of node's 8 child's up_equiv in all_up_equiv, size=8*nsurf_
      real_t* up_equiv_f = &fft_in[fft_size*node_idx];   // offset ptr of node_idx in fft_in vector, size=fftsize

      for (int k=0; k<nsurf_; k++) {
        size_t idx = map[k];
        for (int j=0; j<NCHILD; j++)
          equiv_t[idx+j*nconv_] = up_equiv[j*nsurf_+k];
      }
      fft_execute_dft_r2c(plan, &equiv_t[0], (fft_complex*)&buffer[0]);
      for (int k=0; k<nfreq_; k++) {
        for (int j=0; j<NCHILD; j++) {
          up_equiv_f[2*(NCHILD*k+j)+0] = buffer[2*(nfreq_*j+k)+0];
          up_equiv_f[2*(NCHILD*k+j)+1] = buffer[2*(nfreq_*j+k)+1];
        }
      }
    }
    fft_destroy_plan(plan);
  }

  template <>
  void Fmm<complex_t>::fft_up_equiv(std::vector<size_t>& fft_offset, ComplexVec& all_up_equiv, AlignedVec& fft_in) {
    int& nsurf_ = this->nsurf;
    int& nconv_ = this->nconv;
    int& nfreq_ = this->nfreq;
    int n1 = this->p * 2;
    auto map = generate_surf2conv_up(p);

    size_t fft_size = 2 * NCHILD * nfreq_;
    ComplexVec fftw_in(nconv_ * NCHILD);
    AlignedVec fftw_out(fft_size);
    int dim[3] = {n1, n1, n1};

    fft_plan plan = fft_plan_many_dft(3, dim, NCHILD, reinterpret_cast<fft_complex*>(&fftw_in[0]),
                                      nullptr, 1, nconv_, (fft_complex*)(&fftw_out[0]), nullptr, 1, nfreq_,
                                      FFTW_FORWARD, FFTW_ESTIMATE);

#pragma omp parallel for
    for (size_t node_idx=0; node_idx<fft_offset.size(); node_idx++) {
      RealVec buffer(fft_size, 0);
      ComplexVec equiv_t(NCHILD*nconv_, complex_t(0.,0.));

      complex_t* up_equiv = &all_up_equiv[fft_offset[node_idx]];  // offset ptr of node's 8 child's up_equiv in all_up_equiv, size=8*nsurf_
      real_t* up_equiv_f = &fft_in[fft_size*node_idx];   // offset ptr of node_idx in fft_in vector, size=fftsize

      for (int k=0; k<nsurf_; k++) {
        size_t idx = map[k];
        for (int j=0; j<NCHILD; j++)
          equiv_t[idx+j*nconv_] = up_equiv[j*nsurf_+k];
      }
      fft_execute_dft(plan, reinterpret_cast<fft_complex*>(&equiv_t[0]), (fft_complex*)&buffer[0]);
      for (int k=0; k<nfreq_; k++) {
        for (int j=0; j<NCHILD; j++) {
          up_equiv_f[2*(NCHILD*k+j)+0] = buffer[2*(nfreq_*j+k)+0];
          up_equiv_f[2*(NCHILD*k+j)+1] = buffer[2*(nfreq_*j+k)+1];
        }
      }
    }
    fft_destroy_plan(plan);
  }

  template <>
  void Fmm<real_t>::ifft_dn_check(std::vector<size_t>& ifft_offset, AlignedVec& fft_out, RealVec& all_dn_equiv) {
    int& nsurf_ = this->nsurf;
    int& nconv_ = this->nconv;
    int& nfreq_ = this->nfreq;
    int n1 = this->p * 2;
    auto map = generate_surf2conv_dn(p);

    size_t fft_size = 2 * NCHILD * nfreq_;
    AlignedVec fftw_in(fft_size);
    AlignedVec fftw_out(nconv_ * NCHILD);
    int dim[3] = {n1, n1, n1};

    fft_plan plan = fft_plan_many_dft_c2r(3, dim, NCHILD,
                    (fft_complex*)(&fftw_in[0]), nullptr, 1, nfreq_,
                    (real_t*)(&fftw_out[0]), nullptr, 1, nconv_,
                    FFTW_ESTIMATE);

#pragma omp parallel for
    for (size_t node_idx=0; node_idx<ifft_offset.size(); node_idx++) {
      RealVec buffer0(fft_size, 0);
      RealVec buffer1(fft_size, 0);
      real_t* dn_check_f = &fft_out[fft_size*node_idx];  // offset ptr for node_idx in fft_out vector, size=fftsize
      real_t* dn_equiv = &all_dn_equiv[ifft_offset[node_idx]];  // offset ptr for node_idx's child's dn_equiv in all_dn_equiv, size=numChilds * nsurf_
      for (int k=0; k<nfreq_; k++)
        for (int j=0; j<NCHILD; j++) {
          buffer0[2*(nfreq_*j+k)+0] = dn_check_f[2*(NCHILD*k+j)+0];
          buffer0[2*(nfreq_*j+k)+1] = dn_check_f[2*(NCHILD*k+j)+1];
        }
      fft_execute_dft_c2r(plan, (fft_complex*)&buffer0[0], (real_t*)(&buffer1[0]));
      for (int k=0; k<nsurf_; k++) {
        size_t idx = map[k];
        for (int j=0; j<NCHILD; j++)
          dn_equiv[nsurf_*j+k] += buffer1[idx+j*nconv_];
      }
    }
    fft_destroy_plan(plan);
  }

  template <>
  void Fmm<complex_t>::ifft_dn_check(std::vector<size_t>& ifft_offset, AlignedVec& fft_out, ComplexVec& all_dn_equiv) {
    int& nsurf_ = this->nsurf;
    int& nconv_ = this->nconv;
    int& nfreq_ = this->nfreq;
    int n1 = this->p * 2;
    auto map = generate_surf2conv_dn(p);

    size_t fft_size = 2 * NCHILD * nfreq_;
    AlignedVec fftw_in(fft_size);
    ComplexVec fftw_out(nconv_*NCHILD);
    int dim[3] = {n1, n1, n1};

    fft_plan plan = fft_plan_many_dft(3, dim, NCHILD, (fft_complex*)(&fftw_in[0]), nullptr, 1, nfreq_,
                                      reinterpret_cast<fft_complex*>(&fftw_out[0]), nullptr, 1, nconv_,
                                      FFTW_BACKWARD, FFTW_ESTIMATE);

#pragma omp parallel for
    for (size_t node_idx=0; node_idx<ifft_offset.size(); node_idx++) {
      RealVec buffer0(fft_size, 0);
      ComplexVec buffer1(NCHILD*nconv_, 0);
      real_t* dn_check_f = &fft_out[fft_size*node_idx];
      complex_t* dn_equiv = &all_dn_equiv[ifft_offset[node_idx]];
      for (int k=0; k<nfreq_; k++)
        for (int j=0; j<NCHILD; j++) {
          buffer0[2*(nfreq_*j+k)+0] = dn_check_f[2*(NCHILD*k+j)+0];
          buffer0[2*(nfreq_*j+k)+1] = dn_check_f[2*(NCHILD*k+j)+1];
        }
      fft_execute_dft(plan, (fft_complex*)&buffer0[0], reinterpret_cast<fft_complex*>(&buffer1[0]));
      for (int k=0; k<nsurf_; k++) {
        size_t idx = map[k];
        for (int j=0; j<NCHILD; j++)
          dn_equiv[nsurf_*j+k]+=buffer1[idx+j*nconv_];
      }
    }
    fft_destroy_plan(plan);
  }
}  // end namespace
#endif