.. _program_listing_file_include_fmm_scale_invariant.h: Program Listing for File fmm_scale_invariant.h ============================================== |exhale_lsh| :ref:`Return to documentation for file ` (``include/fmm_scale_invariant.h``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #ifndef fmm_scale_invariant_h #define fmm_scale_invariant_h #include // std::memset #include // std::ofstream #include // std::is_same #include "fmm_base.h" #include "intrinsics.h" #include "math_wrapper.h" namespace exafmm_t { template class FmmScaleInvariant : public FmmBase { public: /* precomputation matrices */ std::vector matrix_UC2E_U; std::vector matrix_UC2E_V; std::vector matrix_DC2E_U; std::vector matrix_DC2E_V; std::vector> matrix_M2M; std::vector> matrix_L2L; std::vector matrix_M2L; M2LData m2ldata; /* constructors */ FmmScaleInvariant() {} FmmScaleInvariant(int p_, int ncrit_, std::string filename_=std::string()) : FmmBase(p_, ncrit_, filename_) {} /* precomputation */ void initialize_matrix() { size_t size = this->nfreq * 2 * NCHILD * NCHILD; // size of each M2L precomputation matrix int& nsurf_ = this->nsurf; matrix_UC2E_U.resize(nsurf_*nsurf_); matrix_UC2E_V.resize(nsurf_*nsurf_); matrix_DC2E_U.resize(nsurf_*nsurf_); matrix_DC2E_V.resize(nsurf_*nsurf_); matrix_M2M.resize(REL_COORD[M2M_Type].size(), std::vector(nsurf_*nsurf_)); matrix_L2L.resize(REL_COORD[L2L_Type].size(), std::vector(nsurf_*nsurf_)); matrix_M2L.resize(REL_COORD[M2L_Type].size(), AlignedVec(size)); } void precompute_M2M() { int& nsurf_ = this->nsurf; int npos = REL_COORD[M2M_Type].size(); // number of relative positions int level = 0; real_t parent_coord[3] = {0, 0, 0}; RealVec parent_up_check_surf = surface(this->p, this->r0, level, parent_coord, 2.95); real_t s = this->r0 * powf(0.5, level+1); #pragma omp parallel for for (int i=0; ip, this->r0, level+1, child_coord, 1.05); std::vector matrix_pc2ce(nsurf_*nsurf_); this->kernel_matrix(parent_up_check_surf, child_up_equiv_surf, matrix_pc2ce); // M2M std::vector buffer(nsurf_*nsurf_); gemm(nsurf_, nsurf_, nsurf_, &matrix_UC2E_U[0], &matrix_pc2ce[0], &buffer[0]); gemm(nsurf_, nsurf_, nsurf_, &matrix_UC2E_V[0], &buffer[0], &(matrix_M2M[i][0])); // L2L matrix_pc2ce = transpose(matrix_pc2ce, nsurf_, nsurf_); gemm(nsurf_, nsurf_, nsurf_, &matrix_pc2ce[0], &matrix_DC2E_V[0], &buffer[0]); gemm(nsurf_, nsurf_, nsurf_, &buffer[0], &matrix_DC2E_U[0], &(matrix_L2L[i][0])); } } void precompute_check2equiv() {} void precompute_M2L() {} void save_matrix() { std::remove(this->filename.c_str()); std::ofstream file(this->filename, std::ofstream::binary); // r0 file.write(reinterpret_cast(&this->r0), sizeof(real_t)); size_t size = this->nsurf * this->nsurf; // UC2E, DC2E file.write(reinterpret_cast(&matrix_UC2E_U[0]), size*sizeof(T)); file.write(reinterpret_cast(&matrix_UC2E_V[0]), size*sizeof(T)); file.write(reinterpret_cast(&matrix_DC2E_U[0]), size*sizeof(T)); file.write(reinterpret_cast(&matrix_DC2E_V[0]), size*sizeof(T)); // M2M, L2L for (auto & vec : matrix_M2M) { file.write(reinterpret_cast(&vec[0]), size*sizeof(T)); } for (auto & vec : matrix_L2L) { file.write(reinterpret_cast(&vec[0]), size*sizeof(T)); } // M2L size = this->nfreq * 2 * NCHILD * NCHILD; for (auto & vec : matrix_M2L) { file.write(reinterpret_cast(&vec[0]), size*sizeof(real_t)); } file.close(); } void load_matrix() { size_t size_M2L = this->nfreq * 2 * NCHILD * NCHILD; size_t file_size = (2*REL_COORD[M2M_Type].size()+4) * this->nsurf * this->nsurf * sizeof(T) + REL_COORD[M2L_Type].size() * size_M2L * sizeof(real_t) + 1 * sizeof(real_t); // +1 denotes r0 std::ifstream file(this->filename, std::ifstream::binary); if (file.good()) { file.seekg(0, file.end); if (size_t(file.tellg()) == file_size) { // if file size is correct file.seekg(0, file.beg); // move the position back to the beginning real_t r0_; file.read(reinterpret_cast(&r0_), sizeof(real_t)); if (this->r0 == r0_) { // if radius match size_t size = this->nsurf * this->nsurf; // UC2E, DC2E file.read(reinterpret_cast(&matrix_UC2E_U[0]), size*sizeof(T)); file.read(reinterpret_cast(&matrix_UC2E_V[0]), size*sizeof(T)); file.read(reinterpret_cast(&matrix_DC2E_U[0]), size*sizeof(T)); file.read(reinterpret_cast(&matrix_DC2E_V[0]), size*sizeof(T)); // M2M, L2L for (auto & vec : matrix_M2M) { file.read(reinterpret_cast(&vec[0]), size*sizeof(T)); } for (auto & vec : matrix_L2L) { file.read(reinterpret_cast(&vec[0]), size*sizeof(T)); } // M2L for (auto & vec : matrix_M2L) { file.read(reinterpret_cast(&vec[0]), size_M2L*sizeof(real_t)); } this->is_precomputed = true; } } } file.close(); } void precompute() { initialize_matrix(); load_matrix(); if (!this->is_precomputed) { precompute_check2equiv(); precompute_M2M(); precompute_M2L(); save_matrix(); } } void P2M(NodePtrs& leafs) { int& nsurf_ = this->nsurf; real_t c[3] = {0,0,0}; std::vector up_check_surf; up_check_surf.resize(this->depth+1); for (int level=0; level<=this->depth; level++) { up_check_surf[level].resize(nsurf_*3); up_check_surf[level] = surface(this->p, this->r0, level, c, 2.95); } #pragma omp parallel for for (size_t i=0; i* leaf = leafs[i]; int level = leaf->level; real_t scale = pow(0.5, level); // scaling factor of UC2UE precomputation matrix // calculate upward check potential induced by sources' charges RealVec check_coord(nsurf_*3); for (int k=0; kx[0]; check_coord[3*k+1] = up_check_surf[level][3*k+1] + leaf->x[1]; check_coord[3*k+2] = up_check_surf[level][3*k+2] + leaf->x[2]; } this->potential_P2P(leaf->src_coord, leaf->src_value, check_coord, leaf->up_equiv); // convert upward check potential to upward equivalent charge std::vector buffer(nsurf_); std::vector equiv(nsurf_); gemv(nsurf_, nsurf_, &matrix_UC2E_U[0], &(leaf->up_equiv[0]), &buffer[0]); gemv(nsurf_, nsurf_, &matrix_UC2E_V[0], &buffer[0], &equiv[0]); // scale the check-to-equivalent conversion (precomputation) for (int k=0; kup_equiv[k] = scale * equiv[k]; } } void L2P(NodePtrs& leafs) { int& nsurf_ = this->nsurf; real_t c[3] = {0.0}; std::vector dn_equiv_surf; dn_equiv_surf.resize(this->depth+1); for (int level=0; level<=this->depth; level++) { dn_equiv_surf[level].resize(nsurf_*3); dn_equiv_surf[level] = surface(this->p, this->r0, level, c, 2.95); } #pragma omp parallel for for (size_t i=0; i* leaf = leafs[i]; int level = leaf->level; real_t scale = pow(0.5, level); // convert downward check potential to downward equivalent charge std::vector buffer(nsurf_); std::vector equiv(nsurf_); gemv(nsurf_, nsurf_, &matrix_DC2E_U[0], &(leaf->dn_equiv[0]), &buffer[0]); gemv(nsurf_, nsurf_, &matrix_DC2E_V[0], &buffer[0], &equiv[0]); // scale the check-to-equivalent conversion (precomputation) for (int k=0; kdn_equiv[k] = scale * equiv[k]; // calculate targets' potential & gradient induced by downward equivalent charge RealVec equiv_coord(nsurf_*3); for (int k=0; kx[0]; equiv_coord[3*k+1] = dn_equiv_surf[level][3*k+1] + leaf->x[1]; equiv_coord[3*k+2] = dn_equiv_surf[level][3*k+2] + leaf->x[2]; } this->gradient_P2P(equiv_coord, leaf->dn_equiv, leaf->trg_coord, leaf->trg_value); } } void M2M(Node* node) { int& nsurf_ = this->nsurf; if (node->is_leaf) return; for (int octant=0; octant<8; octant++) { if (node->children[octant]) #pragma omp task untied M2M(node->children[octant]); } #pragma omp taskwait // evaluate parent's upward equivalent charge from child's upward equivalent charge for (int octant=0; octant<8; octant++) { if (node->children[octant]) { Node* child = node->children[octant]; std::vector buffer(nsurf_); gemv(nsurf_, nsurf_, &(matrix_M2M[octant][0]), &child->up_equiv[0], &buffer[0]); for (int k=0; kup_equiv[k] += buffer[k]; } } } } void L2L(Node* node) { int& nsurf_ = this->nsurf; if (node->is_leaf) return; // evaluate child's downward check potential from parent's downward check potential for (int octant=0; octant<8; octant++) { if (node->children[octant]) { Node* child = node->children[octant]; std::vector buffer(nsurf_); gemv(nsurf_, nsurf_, &(matrix_L2L[octant][0]), &node->dn_equiv[0], &buffer[0]); for (int k=0; kdn_equiv[k] += buffer[k]; } } for (int octant=0; octant<8; octant++) { if (node->children[octant]) #pragma omp task untied L2L(node->children[octant]); } #pragma omp taskwait } void M2L_setup(NodePtrs nonleafs) { int& nsurf_ = this->nsurf; int npos = REL_COORD[M2L_Type].size(); // number of M2L relative positions // construct lists of source nodes and target nodes for M2L operator NodePtrs& trg_nodes = nonleafs; std::set*> src_nodes_; for (size_t i=0; i& M2L_list = trg_nodes[i]->M2L_list; for (int k=0; k src_nodes; auto it = src_nodes_.begin(); for (; it!=src_nodes_.end(); it++) { src_nodes.push_back(*it); } // prepare the indices of src_nodes & trg_nodes in all_up_equiv & all_dn_equiv std::vector fft_offset(src_nodes.size()); std::vector ifft_offset(trg_nodes.size()); RealVec ifft_scale(trg_nodes.size()); for (size_t i=0; ichildren[0]->idx * nsurf_; } for (size_t i=0; ilevel+1; ifft_offset[i] = trg_nodes[i]->children[0]->idx * nsurf_; ifft_scale[i] = powf(2.0, level); } // calculate interaction_offset_f & interaction_count_offset std::vector interaction_offset_f; std::vector interaction_count_offset; for (size_t i=0; iidx_M2L = i; } size_t nblk_trg = trg_nodes.size() * sizeof(real_t) / CACHE_SIZE; if (nblk_trg==0) nblk_trg = 1; size_t interaction_count_offset_ = 0; size_t fft_size = 2 * NCHILD * this->nfreq; for (size_t iblk_trg=0; iblk_trg& M2L_list = trg_nodes[i]->M2L_list; if (M2L_list[k]) { interaction_offset_f.push_back(M2L_list[k]->idx_M2L * fft_size); interaction_offset_f.push_back( i * fft_size); interaction_count_offset_++; } } interaction_count_offset.push_back(interaction_count_offset_); } } m2ldata.fft_offset = fft_offset; m2ldata.ifft_offset = ifft_offset; m2ldata.ifft_scale = ifft_scale; m2ldata.interaction_offset_f = interaction_offset_f; m2ldata.interaction_count_offset = interaction_count_offset; } void hadamard_product(std::vector& interaction_count_offset, std::vector& interaction_offset_f, AlignedVec& fft_in, AlignedVec& fft_out) { size_t fft_size = 2 * NCHILD * this->nfreq; AlignedVec zero_vec0(fft_size, 0.); AlignedVec zero_vec1(fft_size, 0.); size_t npos = matrix_M2L.size(); size_t nblk_inter = interaction_count_offset.size(); // num of blocks of interactions size_t nblk_trg = nblk_inter / npos; // num of blocks based on trg_nodes int BLOCK_SIZE = CACHE_SIZE * 2 / sizeof(real_t); std::vector IN_(BLOCK_SIZE*nblk_inter); std::vector OUT_(BLOCK_SIZE*nblk_inter); // initialize fft_out with zero #pragma omp parallel for for (size_t i=0; infreq; k++) { for (size_t ipos=0; ipos< npos; ipos++) { size_t iblk_inter = iblk_trg*npos+ipos; size_t interaction_count_offset0 = (iblk_inter==0 ? 0 : interaction_count_offset[iblk_inter-1]); size_t interaction_count_offset1 = interaction_count_offset[iblk_inter] ; size_t interaction_count = interaction_count_offset1 - interaction_count_offset0; real_t** IN = &IN_[BLOCK_SIZE*iblk_inter]; real_t** OUT= &OUT_[BLOCK_SIZE*iblk_inter]; real_t* M = &matrix_M2L[ipos][k*2*NCHILD*NCHILD]; // k-th freq's (row) offset in matrix_M2L[ipos] for (size_t j=0; jnfreq); } void fft_up_equiv(std::vector& fft_offset, RealVec& all_up_equiv, AlignedVec& fft_in) {} void ifft_dn_check(std::vector& ifft_offset, RealVec& ifft_scal, AlignedVec& fft_out, RealVec& all_dn_equiv) {} void M2L(Nodes& nodes) { int& nsurf_ = this->nsurf; size_t fft_size = 2 * NCHILD * this->nfreq; int nnodes = nodes.size(); // allocate memory std::vector all_up_equiv, all_dn_equiv; all_up_equiv.reserve(nnodes*nsurf_); // use reserve() to avoid the overhead of calling constructor all_dn_equiv.reserve(nnodes*nsurf_); // use pointer instead of iterator to access elements AlignedVec fft_in, fft_out; fft_in.reserve(m2ldata.fft_offset.size()*fft_size); fft_out.reserve(m2ldata.ifft_offset.size()*fft_size); // gather all upward equivalent charges #pragma omp parallel for collapse(2) for (int i=0; i void FmmScaleInvariant::precompute_check2equiv() { int level = 0; real_t c[3] = {0, 0, 0}; int& nsurf_ = this->nsurf; // compute kernel matrix RealVec up_check_surf = surface(this->p, this->r0, level, c, 2.95); RealVec up_equiv_surf = surface(this->p, this->r0, level, c, 1.05); RealVec matrix_c2e(nsurf_*nsurf_); // UC2UE this->kernel_matrix(up_check_surf, up_equiv_surf, matrix_c2e); // svd RealVec S(nsurf_*nsurf_); // singular values RealVec U(nsurf_*nsurf_), VH(nsurf_*nsurf_); svd(nsurf_, nsurf_, &matrix_c2e[0], &S[0], &U[0], &VH[0]); // pseudo-inverse real_t max_S = 0; for (int i=0; imax_S ? fabs(S[i*nsurf_+i]) : max_S; } for (int i=0; iEPS*max_S*4 ? 1.0/S[i*nsurf_+i] : 0.0; } RealVec V = transpose(VH, nsurf_, nsurf_); matrix_UC2E_U = transpose(U, nsurf_, nsurf_); gemm(nsurf_, nsurf_, nsurf_, &V[0], &S[0], &matrix_UC2E_V[0]); matrix_DC2E_U = VH; gemm(nsurf_, nsurf_, nsurf_, &U[0], &S[0], &matrix_DC2E_V[0]); } template <> void FmmScaleInvariant::precompute_check2equiv() { int level = 0; real_t c[3] = {0, 0, 0}; int& nsurf_ = this->nsurf; // compute kernel matrix RealVec up_check_surf = surface(this->p, this->r0, level, c, 2.95); RealVec up_equiv_surf = surface(this->p, this->r0, level, c, 1.05); ComplexVec matrix_c2e(nsurf_*nsurf_); // UC2UE this->kernel_matrix(up_check_surf, up_equiv_surf, matrix_c2e); // svd RealVec S(nsurf_*nsurf_); // singular values ComplexVec U(nsurf_*nsurf_), VH(nsurf_*nsurf_); svd(nsurf_, nsurf_, &matrix_c2e[0], &S[0], &U[0], &VH[0]); // pseudo-inverse real_t max_S = 0; for (int i=0; imax_S ? fabs(S[i*nsurf_+i]) : max_S; } for (int i=0; iEPS*max_S*4 ? 1.0/S[i*nsurf_+i] : 0.0; } ComplexVec S_(nsurf_*nsurf_); for (size_t i=0; i void FmmScaleInvariant::precompute_M2L() { int n1 = this->p * 2; int& nconv_ = this->nconv; int& nfreq_ = this->nfreq; std::vector matrix_M2L_Helper(REL_COORD[M2L_Helper_Type].size(), RealVec(2*nfreq_)); // create fft plan RealVec fftw_in(nconv_); RealVec fftw_out(2*nfreq_); int dim[3] = {n1, n1, n1}; fft_plan plan = fft_plan_dft_r2c(3, dim, fftw_in.data(), reinterpret_cast(fftw_out.data()), FFTW_ESTIMATE); // compute M2L kernel matrix, perform DFT RealVec trg_coord(3,0); #pragma omp parallel for for (size_t i=0; ir0 / 0.5; // relative coords } RealVec conv_coord = convolution_grid(this->p, this->r0, 0, coord); // convolution grid RealVec conv_value(nconv_); // potentials on convolution grid this->kernel_matrix(conv_coord, trg_coord, conv_value); fft_execute_dft_r2c(plan, conv_value.data(), reinterpret_cast(matrix_M2L_Helper[i].data())); } // convert M2L_Helper to M2L and reorder data layout to improve locality #pragma omp parallel for for (size_t i=0; i void FmmScaleInvariant::fft_up_equiv(std::vector& fft_offset, RealVec& all_up_equiv, AlignedVec& fft_in) { int& nsurf_ = this->nsurf; int& nconv_ = this->nconv; int& nfreq_ = this->nfreq; int n1 = 2 * this->p; auto map = generate_surf2conv_up(this->p); size_t fft_size = 2 * NCHILD * nfreq_; AlignedVec fftw_in(nconv_ * NCHILD); AlignedVec fftw_out(fft_size); int dim[3] = {n1, n1, n1}; fft_plan plan = fft_plan_many_dft_r2c(3, dim, NCHILD, (real_t*)&fftw_in[0], nullptr, 1, nconv_, (fft_complex*)(&fftw_out[0]), nullptr, 1, nfreq_, FFTW_ESTIMATE); #pragma omp parallel for for (size_t node_idx=0; node_idx void FmmScaleInvariant::ifft_dn_check(std::vector& ifft_offset, RealVec& ifft_scal, AlignedVec& fft_out, RealVec& all_dn_equiv) { int& nsurf_ = this->nsurf; int& nconv_ = this->nconv; int& nfreq_ = this->nfreq; int n1 = 2 * this->p; auto map = generate_surf2conv_dn(this->p); size_t fft_size = 2 * NCHILD * nfreq_; AlignedVec fftw_in(fft_size); AlignedVec fftw_out(nconv_ * NCHILD); int dim[3] = {n1, n1, n1}; fft_plan plan = fft_plan_many_dft_c2r(3, dim, NCHILD, (fft_complex*)&fftw_in[0], nullptr, 1, nfreq_, (real_t*)(&fftw_out[0]), nullptr, 1, nconv_, FFTW_ESTIMATE); #pragma omp parallel for for (size_t node_idx=0; node_idx