28 #ifndef _MATRIX_SCALAPACK_IMPL_HPP_ 
   29 #define _MATRIX_SCALAPACK_IMPL_HPP_ 
   42 #include "../complex.hpp" 
   43 #include "../mpi_wrapper.hpp" 
   48 int numroc_(
int* M, 
int* MB, 
int* prow, 
int* irsrc, 
int* nprow);
 
   49 void descinit_(
int desca[], 
int* M, 
int* N, 
int* MB, 
int* NB, 
int* irsrc,
 
   50                int* icsrc, 
int* ictxt, 
int* lld, 
int* info);
 
   65 const size_t Matrix<C>::BLOCK_SIZE = 16;
 
   86     : grid(comm), desc(9) {
 
  102 template <
typename C>
 
  107 template <
typename C>
 
  112 template <
typename C>
 
  120 template <
typename C>
 
  125 template <
typename C>
 
  130 template <
typename C>
 
  135 template <
typename C>
 
  143 template <
typename C>
 
  148 template <
typename C>
 
  159 template <
typename C>
 
  164 template <
typename C>
 
  166                                     size_t& g_col)
 const {
 
  167   const int lld = get_lld();
 
  168   const size_t l_row = i % lld;
 
  169   const size_t l_col = i / lld;
 
  170   prep_local_to_global();
 
  171   g_row = global_row[l_row];
 
  172   g_col = global_col[l_col];
 
  175 template <
typename C>
 
  178   const int lld = get_lld();
 
  179   const int nprow = grid.nprow;
 
  180   const int npcol = grid.npcol;
 
  181   const int myprow = grid.myprow;
 
  182   const int mypcol = grid.mypcol;
 
  184   size_t b_row = g_row / BLOCK_SIZE;
 
  185   size_t p_row = b_row % nprow;
 
  186   size_t b_col = g_col / BLOCK_SIZE;
 
  187   size_t p_col = b_col % npcol;
 
  188   if (p_row != 
size_t(myprow) || p_col != 
size_t(mypcol)) 
return false;
 
  190   size_t l_row = (b_row / nprow) * BLOCK_SIZE + g_row % BLOCK_SIZE;
 
  191   size_t l_col = (b_col / npcol) * BLOCK_SIZE + g_col % BLOCK_SIZE;
 
  192   i = l_row + l_col * lld;  
 
  196 template <
typename C>
 
  198                                       int& comm_rank, 
size_t& lindex)
 const {
 
  199   prep_global_to_local();
 
  200   const int myprow = proc_row[g_row];
 
  201   const int mypcol = proc_col[g_col];
 
  202   comm_rank = grid.mpirank(myprow, mypcol);
 
  203   lindex = local_row[g_row] + local_col[g_col] * lld_list[myprow];
 
  206 template <
typename C>
 
  208   return local_row_size_;
 
  211 template <
typename C>
 
  213   return local_col_size_;
 
  216 template <
typename C>
 
  218   return lindex % (
static_cast<size_t>(get_lld()));
 
  221 template <
typename C>
 
  223   return lindex / (
static_cast<size_t>(get_lld()));
 
  226 template <
typename C>
 
  228   return BLOCK_SIZE * ((l_row / BLOCK_SIZE) * grid.nprow + grid.myprow) +
 
  229          (l_row % BLOCK_SIZE);
 
  232 template <
typename C>
 
  234   return BLOCK_SIZE * ((l_col / BLOCK_SIZE) * grid.npcol + grid.mypcol) +
 
  235          (l_col % BLOCK_SIZE);
 
  239 template <
typename C>
 
  241   if (has_local_to_global) 
return;
 
  243   const int nprow = grid.nprow;
 
  244   const int npcol = grid.npcol;
 
  245   const int myprow = grid.myprow;
 
  246   const int mypcol = grid.mypcol;
 
  247   const size_t size_row = get_lld();
 
  248   const size_t size_col = local_size() / size_row;
 
  249   global_row.resize(size_row);
 
  250   global_col.resize(size_col);
 
  252 #pragma omp parallel default(shared) 
  255     for (
size_t l_row = 0; l_row < size_row; ++l_row) {
 
  256       global_row[l_row] = BLOCK_SIZE * ((l_row / BLOCK_SIZE) * nprow + myprow) +
 
  257                           (l_row % BLOCK_SIZE);
 
  260     for (
size_t l_col = 0; l_col < size_col; ++l_col) {
 
  261       global_col[l_col] = BLOCK_SIZE * ((l_col / BLOCK_SIZE) * npcol + mypcol) +
 
  262                           (l_col % BLOCK_SIZE);
 
  266   has_local_to_global = 
true;
 
  270 template <
typename C>
 
  272   if (has_global_to_local) 
return;
 
  274   const int nprow = grid.nprow;
 
  275   const int npcol = grid.npcol;
 
  276   const size_t size_row = n_row();
 
  277   const size_t size_col = n_col();
 
  278   const size_t nblocks = size_row / BLOCK_SIZE;
 
  279   const size_t extra_blocks = nblocks % nprow;
 
  280   const size_t locr_offset = (nblocks / nprow) * BLOCK_SIZE;
 
  282   local_row.resize(size_row);
 
  283   local_col.resize(size_col);
 
  284   proc_row.resize(size_row);
 
  285   proc_col.resize(size_col);
 
  286   lld_list.resize(nprow);
 
  288 #pragma omp parallel default(shared) 
  291     for (
size_t g_row = 0; g_row < size_row; ++g_row) {
 
  292       size_t b_row = g_row / BLOCK_SIZE;
 
  293       proc_row[g_row] = b_row % nprow;
 
  294       local_row[g_row] = (b_row / nprow) * BLOCK_SIZE + g_row % BLOCK_SIZE;
 
  298     for (
size_t g_col = 0; g_col < size_col; ++g_col) {
 
  299       size_t b_col = g_col / BLOCK_SIZE;
 
  300       proc_col[g_col] = b_col % npcol;
 
  301       local_col[g_col] = (b_col / npcol) * BLOCK_SIZE + g_col % BLOCK_SIZE;
 
  305     for (
size_t myprow = 0; myprow < size_t(nprow); ++myprow) {
 
  307       size_t locr = locr_offset;
 
  308       if (myprow < extra_blocks)
 
  310       else if (myprow == extra_blocks)
 
  311         locr += size_row % BLOCK_SIZE;
 
  313       lld_list[myprow] = (locr < 1) ? 1 : locr;
 
  317   has_global_to_local = 
true;
 
  320 template <
typename C>
 
  324   int xMB = BLOCK_SIZE;
 
  325   int xNB = BLOCK_SIZE;
 
  326   int irsrc = 0, icsrc = 0;
 
  327   int locr = 
numroc_(&xM, &xMB, &(grid.myprow), &irsrc, &(grid.nprow));
 
  328   int locc = 
numroc_(&xN, &xNB, &(grid.mypcol), &icsrc, &(grid.npcol));
 
  329   int lld = (locr < 1) ? 1 : locr;
 
  331   descinit_(&(desc[0]), &xM, &xN, &xMB, &xNB, &irsrc, &icsrc, &(grid.ictxt),
 
  333   V.resize(locr * locc);
 
  334   local_row_size_ = 
static_cast<size_t>(locr);
 
  335   local_col_size_ = 
static_cast<size_t>(locc);
 
  336   has_local_to_global = 
false;
 
  337   has_global_to_local = 
false;
 
  350 template <
typename C>
 
  352   out << 
"Matrix: prow= " << grid.myprow << 
" pcol= " << grid.mypcol
 
  353       << 
" local_size= " << local_size() << 
" lld(locr)= " << get_lld()
 
  354       << 
" locc= " << local_size() / get_lld() << 
"\n";
 
  357 template <
typename C>
 
  360   for (
size_t i = 0; i < V.size(); ++i) V[i] += rhs[i];
 
  364 template <
typename C>
 
  367   for (
size_t i = 0; i < V.size(); ++i) V[i] -= rhs[i];
 
  371 template <
typename C>
 
  373   for (
size_t i = 0; i < V.size(); ++i) V[i] *= rhs;
 
  377 template <
typename C>
 
  379   for (
size_t i = 0; i < V.size(); ++i) V[i] /= rhs;
 
  383 template <
typename C>
 
  384 template <
typename UnaryOperation>
 
  386   std::transform(V.begin(), V.end(), V.begin(), op);
 
  390 template <
typename C>
 
  393   Matrix<C> M_new(get_comm(), n_col(), n_row());
 
  396   std::vector<int> dest_rank(local_size());
 
  397   std::vector<unsigned long int> local_position(local_size());
 
  399   for (
size_t i = 0; i < local_size(); i++) {
 
  401     size_t pos, g_col, g_row;
 
  403     global_index(i, g_row, g_col);
 
  406     local_position[i] = pos;
 
  416 template <
typename C>
 
  418   const size_t n = n_row() * n_col();
 
  419   const size_t nr = n_row();
 
  420   std::vector<C> vec(n);
 
  422   for (
size_t i = 0; i < local_size(); ++i) {
 
  423     global_index(i, g_row, g_col);
 
  424     vec[g_row + g_col * nr] = V[i];
 
  429 template <
typename C>
 
  431   MPI_Barrier(get_comm());
 
  434 template <
typename C>
 
  439 template <
typename C>
 
  440 template <
typename D>
 
  445 template <
typename C>
 
  447   const size_t size_row = get_lld();
 
  448   const size_t size_col = local_size() / size_row;
 
  449   prep_local_to_global();
 
  451   fout << 
"local_size= " << local_size() << 
"\n";
 
  452   fout << 
"local_n_row= " << size_row << 
"\n";
 
  453   fout << 
"local_n_col= " << size_col << 
"\n";
 
  454   fout << 
"[global_row]\n";
 
  455   for (
size_t i = 0; i < size_row; ++i) {
 
  456     fout << 
" " << global_row[i];
 
  457     if (i % BLOCK_SIZE == BLOCK_SIZE - 1) fout << 
"\n";
 
  460   fout << 
"[global_col]\n";
 
  461   for (
size_t i = 0; i < size_col; ++i) {
 
  462     fout << 
" " << global_col[i];
 
  463     if (i % BLOCK_SIZE == BLOCK_SIZE - 1) fout << 
"\n";
 
  471 template <
typename C>
 
  473                          const std::vector<size_t>& local_position,
 
  480   const C* mat = M.
head();
 
  481   C* mat_new = M_new.
head();
 
  484   assert(dest_rank.size() == local_size);
 
  485   assert(local_position.size() == local_size);
 
  487   const int proc_size = mpisize + 1;
 
  488   int* send_counts = 
new int[proc_size];
 
  489   int* send_displs = 
new int[proc_size];
 
  490   int* recv_counts = 
new int[proc_size];
 
  491   int* recv_displs = 
new int[proc_size];
 
  492   for (
int rank = 0; rank < proc_size; ++rank) {
 
  493     send_counts[rank] = 0;
 
  494     send_displs[rank] = 0;
 
  495     recv_counts[rank] = 0;
 
  496     recv_displs[rank] = 0;
 
  499   for (
size_t i = 0; i < local_size; ++i) {
 
  500     send_counts[dest_rank[i]] += 1;
 
  503   MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm);
 
  505   for (
int rank = 0; rank < mpisize; ++rank) {
 
  506     send_displs[rank + 1] = send_counts[rank] + send_displs[rank];
 
  507     recv_displs[rank + 1] = recv_counts[rank] + recv_displs[rank];
 
  510   const int send_size = local_size;  
 
  511   const int recv_size = recv_displs[mpisize];
 
  512   unsigned long int* send_pos = 
new unsigned long int[send_size];
 
  513   unsigned long int* recv_pos = 
new unsigned long int[recv_size];
 
  514   C* send_value = 
new C[send_size];
 
  515   C* recv_value = 
new C[recv_size];
 
  519   int* pack_idx = 
new int[proc_size];
 
  520   for (
int rank = 0; rank < proc_size; ++rank) {
 
  521     pack_idx[rank] = send_displs[rank];
 
  523   for (
size_t i = 0; i < local_size; ++i) {
 
  524     const int rank = dest_rank[i];
 
  525     const int idx = pack_idx[rank];
 
  526     send_value[idx] = mat[i];
 
  527     send_pos[idx] = local_position[i];
 
  534                          recv_counts, recv_displs, comm);
 
  536                          recv_counts, recv_displs, comm);
 
  539   for (
int i = 0; i < recv_size; ++i) {
 
  540     mat_new[recv_pos[i]] = recv_value[i];
 
  548   delete[] send_counts;
 
  549   delete[] send_displs;
 
  550   delete[] recv_counts;
 
  551   delete[] recv_displs;
 
  554 template <
typename C>
 
  556                          const std::vector<size_t>& local_position,
 
  558   const MPI_Comm comm = M_new.
get_comm();
 
  561   const size_t local_size = V.size();
 
  563   const C* mat = &(V[0]);
 
  564   C* mat_new = M_new.
head();
 
  567   assert(dest_rank.size() == local_size);
 
  568   assert(local_position.size() == local_size);
 
  570   const int proc_size = mpisize + 1;
 
  571   int* send_counts = 
new int[proc_size];
 
  572   int* send_displs = 
new int[proc_size];
 
  573   int* recv_counts = 
new int[proc_size];
 
  574   int* recv_displs = 
new int[proc_size];
 
  575   for (
int rank = 0; rank < proc_size; ++rank) {
 
  576     send_counts[rank] = 0;
 
  577     send_displs[rank] = 0;
 
  578     recv_counts[rank] = 0;
 
  579     recv_displs[rank] = 0;
 
  582   for (
size_t i = 0; i < local_size; ++i) {
 
  583     send_counts[dest_rank[i]] += 1;
 
  586   MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm);
 
  588   for (
int rank = 0; rank < mpisize; ++rank) {
 
  589     send_displs[rank + 1] = send_counts[rank] + send_displs[rank];
 
  590     recv_displs[rank + 1] = recv_counts[rank] + recv_displs[rank];
 
  593   const int send_size = local_size;  
 
  594   const int recv_size = recv_displs[mpisize];
 
  595   unsigned long int* send_pos = 
new unsigned long int[send_size];
 
  596   unsigned long int* recv_pos = 
new unsigned long int[recv_size];
 
  597   C* send_value = 
new C[send_size];
 
  598   C* recv_value = 
new C[recv_size];
 
  602   int* pack_idx = 
new int[proc_size];
 
  603   for (
int rank = 0; rank < proc_size; ++rank) {
 
  604     pack_idx[rank] = send_displs[rank];
 
  606   for (
size_t i = 0; i < local_size; ++i) {
 
  607     const int rank = dest_rank[i];
 
  608     const int idx = pack_idx[rank];
 
  609     send_value[idx] = mat[i];
 
  610     send_pos[idx] = local_position[i];
 
  617                          recv_counts, recv_displs, comm);
 
  619                          recv_counts, recv_displs, comm);
 
  622   for (
int i = 0; i < recv_size; ++i) {
 
  623     mat_new[recv_pos[i]] = recv_value[i];
 
  631   delete[] send_counts;
 
  632   delete[] send_displs;
 
  633   delete[] recv_counts;
 
  634   delete[] recv_displs;
 
  637 template <
typename C>
 
  639                      const std::vector<size_t>& local_position,
 
  647   assert(dest_rank.size() == local_size);
 
  648   assert(local_position.size() == local_size);
 
  650   std::vector<int> send_size_list(mpisize, 0);
 
  651   for (
size_t i = 0; i < local_size; ++i) {
 
  652     const int rank = dest_rank[i];
 
  653     if (rank >= 0 && rank < mpisize) send_size_list[dest_rank[i]] += 1;
 
  656   const C* mat = M.
head();
 
  657   C* mat_new = M_new.
head();
 
  659   for (
int step = 0; step < mpisize; ++step) {
 
  660     int dest = (mpirank + step) % mpisize;
 
  661     int source = (mpirank + mpisize - step) % mpisize;
 
  662     if (dest == mpirank) {
 
  663       for (
size_t i = 0; i < local_size; i++) {
 
  664         if (dest_rank[i] == mpirank) {
 
  665           mat_new[local_position[i]] += mat[i];
 
  670       int send_size = send_size_list[dest];
 
  674       MPI_Sendrecv(&send_size, 1, MPI_INT, dest, tag, &recv_size, 1, MPI_INT,
 
  675                    source, tag, comm, &status);
 
  678       std::vector<unsigned long int> send_pos(send_size);
 
  679       std::vector<C> send_value(send_size);
 
  681       for (
size_t i = 0; i < local_size; i++) {
 
  682         if (dest_rank[i] == dest) {
 
  683           send_pos[k] = local_position[i];
 
  684           send_value[k] = mat[i];
 
  690       std::vector<unsigned long int> recv_pos(recv_size);
 
  691       std::vector<C> recv_value(recv_size);
 
  692       MPI_Sendrecv(&(send_pos[0]), send_size, MPI_UNSIGNED_LONG, dest, tag,
 
  693                    &(recv_pos[0]), recv_size, MPI_UNSIGNED_LONG, source, tag,
 
  699       for (
int i = 0; i < recv_size; ++i) {
 
  700         mat_new[recv_pos[i]] += recv_value[i];
 
  706 template <
typename C>
 
  711   for (
size_t i = 0; i < n; ++i) {
 
  712     send = 
std::max(send, std::abs(a[i]));
 
  714   MPI_Allreduce(&send, &recv, 1, MPI_DOUBLE, MPI_MAX, a.
get_comm());
 
  718 template <
typename C>
 
  721   double send = DBL_MAX;
 
  723   for (
size_t i = 0; i < n; ++i) {
 
  724     send = 
std::min(send, std::abs(a[i]));
 
  726   MPI_Allreduce(&send, &recv, 1, MPI_DOUBLE, MPI_MIN, a.
get_comm());
 
Non-distributed matrix using LAPACK.
Definition: matrix_lapack.hpp:44
 
Distributed matrix using ScaLAPACK.
Definition: matrix_scalapack.hpp:47
 
int n_row() const
Definition: matrix_scalapack_impl.hpp:144
 
Matrix & operator*=(C rhs)
Definition: matrix_scalapack_impl.hpp:372
 
const int * descriptor() const
Definition: matrix_scalapack_impl.hpp:136
 
void bcast(D *buffer, int count, int root) const
Definition: matrix_scalapack_impl.hpp:441
 
void print_info(std::ostream &) const
Definition: matrix_scalapack_impl.hpp:351
 
void prep_local_to_global() const
Preprocess for fast conversion from local index to global one.
Definition: matrix_scalapack_impl.hpp:240
 
size_t local_row_index(size_t lindex) const
Definition: matrix_scalapack_impl.hpp:217
 
bool local_index(size_t g_row, size_t g_col, size_t &i) const
Definition: matrix_scalapack_impl.hpp:176
 
void global_index(size_t i, size_t &g_row, size_t &g_col) const
Definition: matrix_scalapack_impl.hpp:165
 
int n_col() const
Definition: matrix_scalapack_impl.hpp:149
 
size_t local_row_size() const
Definition: matrix_scalapack_impl.hpp:207
 
Matrix & operator-=(const Matrix &rhs)
Definition: matrix_scalapack_impl.hpp:365
 
const C * head() const
Definition: matrix_scalapack_impl.hpp:103
 
Matrix()
Definition: matrix_scalapack_impl.hpp:70
 
Matrix & map(UnaryOperation op)
 
void init(size_t n_row, size_t n_col)
Definition: matrix_scalapack_impl.hpp:321
 
C allreduce_sum(C value) const
Definition: matrix_scalapack_impl.hpp:435
 
std::vector< C > flatten()
Definition: matrix_scalapack_impl.hpp:417
 
const MPI_Comm & get_comm() const
Definition: matrix_scalapack_impl.hpp:121
 
Matrix & operator/=(C rhs)
Definition: matrix_scalapack_impl.hpp:378
 
void barrier() const
Definition: matrix_scalapack_impl.hpp:430
 
void local_position(size_t g_row, size_t g_col, int &comm_rank, size_t &lindex) const
Definition: matrix_scalapack_impl.hpp:197
 
const Matrix transpose()
Definition: matrix_scalapack_impl.hpp:391
 
void save_index(const std::string &filename) const
Definition: matrix_scalapack_impl.hpp:446
 
size_t local_col_index(size_t lindex) const
Definition: matrix_scalapack_impl.hpp:222
 
size_t local_size() const
Definition: matrix_scalapack_impl.hpp:113
 
size_t global_row_index(size_t lindex_row) const
Definition: matrix_scalapack_impl.hpp:227
 
int get_comm_size() const
Definition: matrix_scalapack_impl.hpp:126
 
int get_comm_rank() const
Definition: matrix_scalapack_impl.hpp:131
 
void prep_global_to_local() const
Preprocess for fast conversion from global index to local one.
Definition: matrix_scalapack_impl.hpp:271
 
Matrix & operator+=(const Matrix &rhs)
Definition: matrix_scalapack_impl.hpp:358
 
const C & operator[](size_t i) const
Definition: matrix_scalapack_impl.hpp:93
 
size_t local_col_size() const
Definition: matrix_scalapack_impl.hpp:212
 
size_t global_col_index(size_t lindex_col) const
Definition: matrix_scalapack_impl.hpp:233
 
std::string filename(const std::string &prefix, int proc_size)
Definition: common.hpp:32
 
void descinit_(int desca[], int *M, int *N, int *MB, int *NB, int *irsrc, int *icsrc, int *ictxt, int *lld, int *info)
 
int numroc_(int *M, int *MB, int *prow, int *irsrc, int *nprow)
 
double min(const Matrix< C > &a)
 
double max(const Matrix< C > &a)
 
std::vector< C > allreduce_vec(const std::vector< C > &vec, const MPI_Comm &comm)
Calculate a summation of each element of vector over MPI communicator.
 
void bcast(C *buffer, int count, int root, const MPI_Comm &comm)
Wrapper of MPI_Bcast.
 
void alltoallv(const C *sendbuf, const int *sendcounts, const int *sdispls, C *recvbuf, const int *recvcounts, const int *rdispls, const MPI_Comm &comm)
Wrapper of MPI_Alltoallv.
 
void send_recv_vector(const std::vector< C > &send_vec, int dest, int sendtag, std::vector< C > &recv_vec, int source, int recvtag, const MPI_Comm &comm, MPI_Status &status)
Wrapper of MPI_Sendrecv.
 
C allreduce_sum(C val, const MPI_Comm &comm)
Calculate a summation over MPI communicator.
 
double min_abs(const Matrix< C > &a)
Definition: matrix_scalapack_impl.hpp:719
 
void replace_matrix_data(const Matrix< C > &M, const std::vector< int > &dest_rank, const std::vector< size_t > &local_position, Matrix< C > &M_new)
Definition: matrix_scalapack_impl.hpp:472
 
void sum_matrix_data(const Matrix< C > &M, const std::vector< int > &dest_rank, const std::vector< size_t > &local_position, Matrix< C > &M_new)
Definition: matrix_scalapack_impl.hpp:638
 
double max_abs(const Matrix< C > &a)
Definition: matrix_scalapack_impl.hpp:707
 
Definition: complex.hpp:34