28 #ifndef _MATRIX_SCALAPACK_IMPL_HPP_
29 #define _MATRIX_SCALAPACK_IMPL_HPP_
42 #include "../complex.hpp"
43 #include "../mpi_wrapper.hpp"
48 int numroc_(
int* M,
int* MB,
int* prow,
int* irsrc,
int* nprow);
49 void descinit_(
int desca[],
int* M,
int* N,
int* MB,
int* NB,
int* irsrc,
50 int* icsrc,
int* ictxt,
int* lld,
int* info);
65 const size_t Matrix<C>::BLOCK_SIZE = 16;
86 : grid(comm), desc(9) {
102 template <
typename C>
107 template <
typename C>
112 template <
typename C>
120 template <
typename C>
125 template <
typename C>
130 template <
typename C>
135 template <
typename C>
143 template <
typename C>
148 template <
typename C>
159 template <
typename C>
164 template <
typename C>
166 size_t& g_col)
const {
167 const int lld = get_lld();
168 const size_t l_row = i % lld;
169 const size_t l_col = i / lld;
170 prep_local_to_global();
171 g_row = global_row[l_row];
172 g_col = global_col[l_col];
175 template <
typename C>
178 const int lld = get_lld();
179 const int nprow = grid.nprow;
180 const int npcol = grid.npcol;
181 const int myprow = grid.myprow;
182 const int mypcol = grid.mypcol;
184 size_t b_row = g_row / BLOCK_SIZE;
185 size_t p_row = b_row % nprow;
186 size_t b_col = g_col / BLOCK_SIZE;
187 size_t p_col = b_col % npcol;
188 if (p_row !=
size_t(myprow) || p_col !=
size_t(mypcol))
return false;
190 size_t l_row = (b_row / nprow) * BLOCK_SIZE + g_row % BLOCK_SIZE;
191 size_t l_col = (b_col / npcol) * BLOCK_SIZE + g_col % BLOCK_SIZE;
192 i = l_row + l_col * lld;
196 template <
typename C>
198 int& comm_rank,
size_t& lindex)
const {
199 prep_global_to_local();
200 const int myprow = proc_row[g_row];
201 const int mypcol = proc_col[g_col];
202 comm_rank = grid.mpirank(myprow, mypcol);
203 lindex = local_row[g_row] + local_col[g_col] * lld_list[myprow];
206 template <
typename C>
208 return local_row_size_;
211 template <
typename C>
213 return local_col_size_;
216 template <
typename C>
218 return lindex % (
static_cast<size_t>(get_lld()));
221 template <
typename C>
223 return lindex / (
static_cast<size_t>(get_lld()));
226 template <
typename C>
228 return BLOCK_SIZE * ((l_row / BLOCK_SIZE) * grid.nprow + grid.myprow) +
229 (l_row % BLOCK_SIZE);
232 template <
typename C>
234 return BLOCK_SIZE * ((l_col / BLOCK_SIZE) * grid.npcol + grid.mypcol) +
235 (l_col % BLOCK_SIZE);
239 template <
typename C>
241 if (has_local_to_global)
return;
243 const int nprow = grid.nprow;
244 const int npcol = grid.npcol;
245 const int myprow = grid.myprow;
246 const int mypcol = grid.mypcol;
247 const size_t size_row = get_lld();
248 const size_t size_col = local_size() / size_row;
249 global_row.resize(size_row);
250 global_col.resize(size_col);
252 #pragma omp parallel default(shared)
255 for (
size_t l_row = 0; l_row < size_row; ++l_row) {
256 global_row[l_row] = BLOCK_SIZE * ((l_row / BLOCK_SIZE) * nprow + myprow) +
257 (l_row % BLOCK_SIZE);
260 for (
size_t l_col = 0; l_col < size_col; ++l_col) {
261 global_col[l_col] = BLOCK_SIZE * ((l_col / BLOCK_SIZE) * npcol + mypcol) +
262 (l_col % BLOCK_SIZE);
266 has_local_to_global =
true;
270 template <
typename C>
272 if (has_global_to_local)
return;
274 const int nprow = grid.nprow;
275 const int npcol = grid.npcol;
276 const size_t size_row = n_row();
277 const size_t size_col = n_col();
278 const size_t nblocks = size_row / BLOCK_SIZE;
279 const size_t extra_blocks = nblocks % nprow;
280 const size_t locr_offset = (nblocks / nprow) * BLOCK_SIZE;
282 local_row.resize(size_row);
283 local_col.resize(size_col);
284 proc_row.resize(size_row);
285 proc_col.resize(size_col);
286 lld_list.resize(nprow);
288 #pragma omp parallel default(shared)
291 for (
size_t g_row = 0; g_row < size_row; ++g_row) {
292 size_t b_row = g_row / BLOCK_SIZE;
293 proc_row[g_row] = b_row % nprow;
294 local_row[g_row] = (b_row / nprow) * BLOCK_SIZE + g_row % BLOCK_SIZE;
298 for (
size_t g_col = 0; g_col < size_col; ++g_col) {
299 size_t b_col = g_col / BLOCK_SIZE;
300 proc_col[g_col] = b_col % npcol;
301 local_col[g_col] = (b_col / npcol) * BLOCK_SIZE + g_col % BLOCK_SIZE;
305 for (
size_t myprow = 0; myprow < size_t(nprow); ++myprow) {
307 size_t locr = locr_offset;
308 if (myprow < extra_blocks)
310 else if (myprow == extra_blocks)
311 locr += size_row % BLOCK_SIZE;
313 lld_list[myprow] = (locr < 1) ? 1 : locr;
317 has_global_to_local =
true;
320 template <
typename C>
324 int xMB = BLOCK_SIZE;
325 int xNB = BLOCK_SIZE;
326 int irsrc = 0, icsrc = 0;
327 int locr =
numroc_(&xM, &xMB, &(grid.myprow), &irsrc, &(grid.nprow));
328 int locc =
numroc_(&xN, &xNB, &(grid.mypcol), &icsrc, &(grid.npcol));
329 int lld = (locr < 1) ? 1 : locr;
331 descinit_(&(desc[0]), &xM, &xN, &xMB, &xNB, &irsrc, &icsrc, &(grid.ictxt),
333 V.resize(locr * locc);
334 local_row_size_ =
static_cast<size_t>(locr);
335 local_col_size_ =
static_cast<size_t>(locc);
336 has_local_to_global =
false;
337 has_global_to_local =
false;
350 template <
typename C>
352 out <<
"Matrix: prow= " << grid.myprow <<
" pcol= " << grid.mypcol
353 <<
" local_size= " << local_size() <<
" lld(locr)= " << get_lld()
354 <<
" locc= " << local_size() / get_lld() <<
"\n";
357 template <
typename C>
360 for (
size_t i = 0; i < V.size(); ++i) V[i] += rhs[i];
364 template <
typename C>
367 for (
size_t i = 0; i < V.size(); ++i) V[i] -= rhs[i];
371 template <
typename C>
373 for (
size_t i = 0; i < V.size(); ++i) V[i] *= rhs;
377 template <
typename C>
379 for (
size_t i = 0; i < V.size(); ++i) V[i] /= rhs;
383 template <
typename C>
384 template <
typename UnaryOperation>
386 std::transform(V.begin(), V.end(), V.begin(), op);
390 template <
typename C>
393 Matrix<C> M_new(get_comm(), n_col(), n_row());
396 std::vector<int> dest_rank(local_size());
397 std::vector<unsigned long int> local_position(local_size());
399 for (
size_t i = 0; i < local_size(); i++) {
401 size_t pos, g_col, g_row;
403 global_index(i, g_row, g_col);
406 local_position[i] = pos;
416 template <
typename C>
418 const size_t n = n_row() * n_col();
419 const size_t nr = n_row();
420 std::vector<C> vec(n);
422 for (
size_t i = 0; i < local_size(); ++i) {
423 global_index(i, g_row, g_col);
424 vec[g_row + g_col * nr] = V[i];
429 template <
typename C>
431 MPI_Barrier(get_comm());
434 template <
typename C>
439 template <
typename C>
440 template <
typename D>
445 template <
typename C>
447 const size_t size_row = get_lld();
448 const size_t size_col = local_size() / size_row;
449 prep_local_to_global();
451 fout <<
"local_size= " << local_size() <<
"\n";
452 fout <<
"local_n_row= " << size_row <<
"\n";
453 fout <<
"local_n_col= " << size_col <<
"\n";
454 fout <<
"[global_row]\n";
455 for (
size_t i = 0; i < size_row; ++i) {
456 fout <<
" " << global_row[i];
457 if (i % BLOCK_SIZE == BLOCK_SIZE - 1) fout <<
"\n";
460 fout <<
"[global_col]\n";
461 for (
size_t i = 0; i < size_col; ++i) {
462 fout <<
" " << global_col[i];
463 if (i % BLOCK_SIZE == BLOCK_SIZE - 1) fout <<
"\n";
471 template <
typename C>
473 const std::vector<size_t>& local_position,
480 const C* mat = M.
head();
481 C* mat_new = M_new.
head();
484 assert(dest_rank.size() == local_size);
485 assert(local_position.size() == local_size);
487 const int proc_size = mpisize + 1;
488 int* send_counts =
new int[proc_size];
489 int* send_displs =
new int[proc_size];
490 int* recv_counts =
new int[proc_size];
491 int* recv_displs =
new int[proc_size];
492 for (
int rank = 0; rank < proc_size; ++rank) {
493 send_counts[rank] = 0;
494 send_displs[rank] = 0;
495 recv_counts[rank] = 0;
496 recv_displs[rank] = 0;
499 for (
size_t i = 0; i < local_size; ++i) {
500 send_counts[dest_rank[i]] += 1;
503 MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm);
505 for (
int rank = 0; rank < mpisize; ++rank) {
506 send_displs[rank + 1] = send_counts[rank] + send_displs[rank];
507 recv_displs[rank + 1] = recv_counts[rank] + recv_displs[rank];
510 const int send_size = local_size;
511 const int recv_size = recv_displs[mpisize];
512 unsigned long int* send_pos =
new unsigned long int[send_size];
513 unsigned long int* recv_pos =
new unsigned long int[recv_size];
514 C* send_value =
new C[send_size];
515 C* recv_value =
new C[recv_size];
519 int* pack_idx =
new int[proc_size];
520 for (
int rank = 0; rank < proc_size; ++rank) {
521 pack_idx[rank] = send_displs[rank];
523 for (
size_t i = 0; i < local_size; ++i) {
524 const int rank = dest_rank[i];
525 const int idx = pack_idx[rank];
526 send_value[idx] = mat[i];
527 send_pos[idx] = local_position[i];
534 recv_counts, recv_displs, comm);
536 recv_counts, recv_displs, comm);
539 for (
int i = 0; i < recv_size; ++i) {
540 mat_new[recv_pos[i]] = recv_value[i];
548 delete[] send_counts;
549 delete[] send_displs;
550 delete[] recv_counts;
551 delete[] recv_displs;
554 template <
typename C>
556 const std::vector<size_t>& local_position,
558 const MPI_Comm comm = M_new.
get_comm();
561 const size_t local_size = V.size();
563 const C* mat = &(V[0]);
564 C* mat_new = M_new.
head();
567 assert(dest_rank.size() == local_size);
568 assert(local_position.size() == local_size);
570 const int proc_size = mpisize + 1;
571 int* send_counts =
new int[proc_size];
572 int* send_displs =
new int[proc_size];
573 int* recv_counts =
new int[proc_size];
574 int* recv_displs =
new int[proc_size];
575 for (
int rank = 0; rank < proc_size; ++rank) {
576 send_counts[rank] = 0;
577 send_displs[rank] = 0;
578 recv_counts[rank] = 0;
579 recv_displs[rank] = 0;
582 for (
size_t i = 0; i < local_size; ++i) {
583 send_counts[dest_rank[i]] += 1;
586 MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm);
588 for (
int rank = 0; rank < mpisize; ++rank) {
589 send_displs[rank + 1] = send_counts[rank] + send_displs[rank];
590 recv_displs[rank + 1] = recv_counts[rank] + recv_displs[rank];
593 const int send_size = local_size;
594 const int recv_size = recv_displs[mpisize];
595 unsigned long int* send_pos =
new unsigned long int[send_size];
596 unsigned long int* recv_pos =
new unsigned long int[recv_size];
597 C* send_value =
new C[send_size];
598 C* recv_value =
new C[recv_size];
602 int* pack_idx =
new int[proc_size];
603 for (
int rank = 0; rank < proc_size; ++rank) {
604 pack_idx[rank] = send_displs[rank];
606 for (
size_t i = 0; i < local_size; ++i) {
607 const int rank = dest_rank[i];
608 const int idx = pack_idx[rank];
609 send_value[idx] = mat[i];
610 send_pos[idx] = local_position[i];
617 recv_counts, recv_displs, comm);
619 recv_counts, recv_displs, comm);
622 for (
int i = 0; i < recv_size; ++i) {
623 mat_new[recv_pos[i]] = recv_value[i];
631 delete[] send_counts;
632 delete[] send_displs;
633 delete[] recv_counts;
634 delete[] recv_displs;
637 template <
typename C>
639 const std::vector<size_t>& local_position,
647 assert(dest_rank.size() == local_size);
648 assert(local_position.size() == local_size);
650 std::vector<int> send_size_list(mpisize, 0);
651 for (
size_t i = 0; i < local_size; ++i) {
652 const int rank = dest_rank[i];
653 if (rank >= 0 && rank < mpisize) send_size_list[dest_rank[i]] += 1;
656 const C* mat = M.
head();
657 C* mat_new = M_new.
head();
659 for (
int step = 0; step < mpisize; ++step) {
660 int dest = (mpirank + step) % mpisize;
661 int source = (mpirank + mpisize - step) % mpisize;
662 if (dest == mpirank) {
663 for (
size_t i = 0; i < local_size; i++) {
664 if (dest_rank[i] == mpirank) {
665 mat_new[local_position[i]] += mat[i];
670 int send_size = send_size_list[dest];
674 MPI_Sendrecv(&send_size, 1, MPI_INT, dest, tag, &recv_size, 1, MPI_INT,
675 source, tag, comm, &status);
678 std::vector<unsigned long int> send_pos(send_size);
679 std::vector<C> send_value(send_size);
681 for (
size_t i = 0; i < local_size; i++) {
682 if (dest_rank[i] == dest) {
683 send_pos[k] = local_position[i];
684 send_value[k] = mat[i];
690 std::vector<unsigned long int> recv_pos(recv_size);
691 std::vector<C> recv_value(recv_size);
692 MPI_Sendrecv(&(send_pos[0]), send_size, MPI_UNSIGNED_LONG, dest, tag,
693 &(recv_pos[0]), recv_size, MPI_UNSIGNED_LONG, source, tag,
699 for (
int i = 0; i < recv_size; ++i) {
700 mat_new[recv_pos[i]] += recv_value[i];
706 template <
typename C>
711 for (
size_t i = 0; i < n; ++i) {
712 send =
std::max(send, std::abs(a[i]));
714 MPI_Allreduce(&send, &recv, 1, MPI_DOUBLE, MPI_MAX, a.
get_comm());
718 template <
typename C>
721 double send = DBL_MAX;
723 for (
size_t i = 0; i < n; ++i) {
724 send =
std::min(send, std::abs(a[i]));
726 MPI_Allreduce(&send, &recv, 1, MPI_DOUBLE, MPI_MIN, a.
get_comm());
Non-distributed matrix using LAPACK.
Definition: matrix_lapack.hpp:44
Distributed matrix using ScaLAPACK.
Definition: matrix_scalapack.hpp:47
int n_row() const
Definition: matrix_scalapack_impl.hpp:144
Matrix & operator*=(C rhs)
Definition: matrix_scalapack_impl.hpp:372
const int * descriptor() const
Definition: matrix_scalapack_impl.hpp:136
void bcast(D *buffer, int count, int root) const
Definition: matrix_scalapack_impl.hpp:441
void print_info(std::ostream &) const
Definition: matrix_scalapack_impl.hpp:351
void prep_local_to_global() const
Preprocess for fast conversion from local index to global one.
Definition: matrix_scalapack_impl.hpp:240
size_t local_row_index(size_t lindex) const
Definition: matrix_scalapack_impl.hpp:217
bool local_index(size_t g_row, size_t g_col, size_t &i) const
Definition: matrix_scalapack_impl.hpp:176
void global_index(size_t i, size_t &g_row, size_t &g_col) const
Definition: matrix_scalapack_impl.hpp:165
int n_col() const
Definition: matrix_scalapack_impl.hpp:149
size_t local_row_size() const
Definition: matrix_scalapack_impl.hpp:207
Matrix & operator-=(const Matrix &rhs)
Definition: matrix_scalapack_impl.hpp:365
const C * head() const
Definition: matrix_scalapack_impl.hpp:103
Matrix()
Definition: matrix_scalapack_impl.hpp:70
Matrix & map(UnaryOperation op)
void init(size_t n_row, size_t n_col)
Definition: matrix_scalapack_impl.hpp:321
C allreduce_sum(C value) const
Definition: matrix_scalapack_impl.hpp:435
std::vector< C > flatten()
Definition: matrix_scalapack_impl.hpp:417
const MPI_Comm & get_comm() const
Definition: matrix_scalapack_impl.hpp:121
Matrix & operator/=(C rhs)
Definition: matrix_scalapack_impl.hpp:378
void barrier() const
Definition: matrix_scalapack_impl.hpp:430
void local_position(size_t g_row, size_t g_col, int &comm_rank, size_t &lindex) const
Definition: matrix_scalapack_impl.hpp:197
const Matrix transpose()
Definition: matrix_scalapack_impl.hpp:391
void save_index(const std::string &filename) const
Definition: matrix_scalapack_impl.hpp:446
size_t local_col_index(size_t lindex) const
Definition: matrix_scalapack_impl.hpp:222
size_t local_size() const
Definition: matrix_scalapack_impl.hpp:113
size_t global_row_index(size_t lindex_row) const
Definition: matrix_scalapack_impl.hpp:227
int get_comm_size() const
Definition: matrix_scalapack_impl.hpp:126
int get_comm_rank() const
Definition: matrix_scalapack_impl.hpp:131
void prep_global_to_local() const
Preprocess for fast conversion from global index to local one.
Definition: matrix_scalapack_impl.hpp:271
Matrix & operator+=(const Matrix &rhs)
Definition: matrix_scalapack_impl.hpp:358
const C & operator[](size_t i) const
Definition: matrix_scalapack_impl.hpp:93
size_t local_col_size() const
Definition: matrix_scalapack_impl.hpp:212
size_t global_col_index(size_t lindex_col) const
Definition: matrix_scalapack_impl.hpp:233
std::string filename(const std::string &prefix, int proc_size)
Definition: common.hpp:32
void descinit_(int desca[], int *M, int *N, int *MB, int *NB, int *irsrc, int *icsrc, int *ictxt, int *lld, int *info)
int numroc_(int *M, int *MB, int *prow, int *irsrc, int *nprow)
double min(const Matrix< C > &a)
double max(const Matrix< C > &a)
std::vector< C > allreduce_vec(const std::vector< C > &vec, const MPI_Comm &comm)
Calculate a summation of each element of vector over MPI communicator.
void bcast(C *buffer, int count, int root, const MPI_Comm &comm)
Wrapper of MPI_Bcast.
void alltoallv(const C *sendbuf, const int *sendcounts, const int *sdispls, C *recvbuf, const int *recvcounts, const int *rdispls, const MPI_Comm &comm)
Wrapper of MPI_Alltoallv.
void send_recv_vector(const std::vector< C > &send_vec, int dest, int sendtag, std::vector< C > &recv_vec, int source, int recvtag, const MPI_Comm &comm, MPI_Status &status)
Wrapper of MPI_Sendrecv.
C allreduce_sum(C val, const MPI_Comm &comm)
Calculate a summation over MPI communicator.
double min_abs(const Matrix< C > &a)
Definition: matrix_scalapack_impl.hpp:719
void replace_matrix_data(const Matrix< C > &M, const std::vector< int > &dest_rank, const std::vector< size_t > &local_position, Matrix< C > &M_new)
Definition: matrix_scalapack_impl.hpp:472
void sum_matrix_data(const Matrix< C > &M, const std::vector< int > &dest_rank, const std::vector< size_t > &local_position, Matrix< C > &M_new)
Definition: matrix_scalapack_impl.hpp:638
double max_abs(const Matrix< C > &a)
Definition: matrix_scalapack_impl.hpp:707
Definition: complex.hpp:34