MPI-SCATCI 2.0
An MPI version of SCATCI
DistributedMatrix_module.f90
! Copyright 2019
!
! For a comprehensive list of the developers that contributed to these codes
! see the UK-AMOR website.
!
! This file is part of UKRmol-in (UKRmol+ suite).
!
! UKRmol-in is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! UKRmol-in is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License
! along with UKRmol-in (in source/COPYING). Alternatively, you can also visit
! <https://www.gnu.org/licenses/>.

!> Distributed matrix module.
module distributedmatrix_module

    use const_gbl, only: stdout
    use consts_mpi_ci, only: mat_dense
    use precisn, only: longint, wp
    use mpi_gbl, only: nprocs, mpi_xermsg, mpi_reduceall_min, mpi_reduceall_max, &
                       mpi_mod_rotate_arrays_around_ring, mpi_reduceall_inplace_sum_cfp, mpi_reduceall_sum_cfp
    use basematrix_module, only: basematrix
    use matrixcache_module, only: matrixcache
    use memorymanager_module, only: master_memory
    use parallelization_module, only: grid => process_grid
    use timing_module, only: master_timer

    implicit none

    public distributedmatrix

    private

    !> Distributed matrix type.
    type, abstract, extends(basematrix) :: distributedmatrix
        !> Temporary cache for elements that have not yet been placed in their final storage.
        type(matrixcache) :: temp_cache

        real(wp) :: memory_scale = 0.75_wp

        integer :: continuum_counter
        integer :: l2_counter
        integer :: start_continuum_update
        integer :: start_l2_update

    contains

        procedure, public :: print => print_distributed
        procedure, public :: update_continuum => update_continuum_distributed
        procedure, public :: update_pure_l2 => update_l2_distributed

        procedure, public :: initialize_struct_self => initialize_struct_distributed

        procedure, public :: setup_diag_matrix

        procedure, public :: construct_self => construct_mat_distributed
        procedure, public :: insert_matelem_self => insert_matelem_distributed
        procedure, public :: get_matelem_self => get_matelem_distributed
        procedure, public :: clear_self => clear_distributed
        procedure, public :: destroy_self => destroy_distributed
        procedure, public :: finalize_matrix => finalize_distributed
        procedure, public :: finalize_matrix_self
        procedure, public :: destroy_matrix
        procedure, public :: clear_matrix

        procedure, public :: insert_into_diag_matrix
        procedure, private :: convert_temp_cache_to_array
        procedure, private :: update_counter
    end type

contains

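    !> Construct the distributed matrix: initialize the temporary element cache and make sure it starts empty.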
    subroutine construct_mat_distributed (this)
        class(distributedmatrix) :: this

        write (stdout, "('Constructing Distributed matrix')")

        !Initialize the temporary cache
        call this % temp_cache % construct

        !Start from an empty cache
        call this % temp_cache % clear

    end subroutine construct_mat_distributed

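    !> Initialize the matrix structure: clear the temporary cache, set up the final storage via
    !> setup_diag_matrix, reset the continuum and L2 counters and compute the update thresholds.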
    subroutine initialize_struct_distributed (this, matrix_size, matrix_type, block_size)
        class(distributedmatrix) :: this
        integer, intent(in) :: matrix_size, matrix_type, block_size
        integer :: ifail

        call this % temp_cache % clear

        this % memory_scale = 0.75_wp
        call this % setup_diag_matrix(matrix_size, matrix_type, block_size)

        this % continuum_counter = 0
        this % L2_counter = 0

        !This is an estimate based on the number of L2*continuum functions.
        !We will use a rough estimate, which will be the dimension of the matrix * 100.
        !Let's figure out when we need to perform a continuum update.

        call this % update_counter

    end subroutine initialize_struct_distributed

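    !> Insert a matrix element. In serial runs the element goes straight into the final storage.
    !> In parallel runs, elements of classes other than 2 and 8 are buffered in the temporary cache
    !> (unless the matrix is dense); the remaining elements are thresholded, stored locally when this
    !> process owns them, and cached otherwise.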
    subroutine insert_matelem_distributed (this, i, j, coefficient, class, thresh)
        class(distributedmatrix) :: this
        integer, intent(in) :: i, j, class
        real(wp), intent(in) :: coefficient, thresh
        logical :: dummy
        integer :: row, column

        if (nprocs <= 1) then
            dummy = this % insert_into_diag_matrix(i, j, coefficient)
            return
        end if

        if (class /= 8 .and. class /= 2 .and. this % matrix_type /= mat_dense) then
            call this % temp_cache % insert_into_cache(i, j, coefficient)
        else
            !Is it within the threshold?
            if (abs(coefficient) < thresh) return

            !Does it belong to me?
            if (this % insert_into_diag_matrix(i, j, coefficient)) return

            !Otherwise we insert it into the temporary cache
            call this % temp_cache % insert_into_cache(i, j, coefficient)
        end if

    end subroutine insert_matelem_distributed

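    !> Element retrieval is not implemented for the abstract distributed matrix:
    !> returns i = j = -1 and a zero coefficient.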
    subroutine get_matelem_distributed (this, idx, i, j, coeff)
        class(distributedmatrix) :: this
        integer, intent(in) :: idx
        integer, intent(out) :: i, j
        real(wp), intent(out) :: coeff

        i = -1
        j = -1
        coeff = 0.0_wp

    end subroutine get_matelem_distributed

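    !> Set up the final storage used for diagonalization. No-op here; concrete distributed
    !> matrix types override this to allocate their own storage.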
    subroutine setup_diag_matrix (this, matrix_size, matrix_type, block_size)
        class(distributedmatrix) :: this
        integer, intent(in) :: matrix_size, matrix_type, block_size
    end subroutine setup_diag_matrix

    !> This inserts an element into the hard storage, which is considered the final location
    !> before diagonalization. Concrete types override this; the base implementation aborts.
    logical function insert_into_diag_matrix (this, row, column, coefficient)
        class(distributedmatrix) :: this
        integer, intent(in) :: row, column
        real(wp), intent(in) :: coefficient

        insert_into_diag_matrix = .false.
        call mpi_xermsg('DistributedMatrix_module', 'insert_into_diag_matrix', 'Not implemented', 1, 1)

    end function insert_into_diag_matrix

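    !> Reduce the buffered continuum elements. Once the continuum counter reaches its memory-based
    !> threshold (or when force_update is set), the cached coefficients are summed across all processes
    !> chunk by chunk and each element is then offered to the local final storage; the temporary cache
    !> is emptied afterwards.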
    subroutine update_continuum_distributed (this, force_update)
        class(distributedmatrix) :: this
        logical, intent(in) :: force_update

        real(wp), allocatable :: matrix_coeffs(:)
        real(wp) :: coeff

        integer :: number_of_chunks, num_elements, nelms_chunk, num_elems, i, j, ido, jdo, ierr, error, length, temp
        logical :: dummy

        if (this % temp_cache % is_empty()) return

        if (nprocs <= 1) return

        this % continuum_counter = this % continuum_counter + 1

        if (this % continuum_counter < this % start_continuum_update .and. .not. force_update) return

        this % continuum_counter = 0

        call master_timer % start_timer("Update Continuum")

        !Otherwise we will need to do an MPI reduction on all elements.
        !Here we make the assumption that all the processes have the same elements (which they should in this case).

        number_of_chunks = this % temp_cache % num_matrix_chunks

        write (stdout, "('Number of elements to reduce = ',i8)") this % temp_cache % get_size()

        !Loop through each chunk and reduce

        do ido = 1, number_of_chunks
            !Get the number of elements in the matrix chunk
            nelms_chunk = this % temp_cache % matrix_arrays(ido) % num_elems
            allocate(matrix_coeffs(nelms_chunk))

            !Reduce them all
            !call mpi_reduce_inplace_sum_cfp(matrix_coeffs, nelms_chunk)
            !For some strange reason MPI_IN_PLACE does not work here -_-

            call mpi_reduceall_sum_cfp(this % temp_cache % matrix_arrays(ido) % coefficient(1:nelms_chunk), &
                                       matrix_coeffs, nelms_chunk, grid % gcomm)

            this % temp_cache % matrix_arrays(ido) % coefficient(1:nelms_chunk) = matrix_coeffs(1:nelms_chunk)

            deallocate(matrix_coeffs)
        end do

        num_elems = this % temp_cache % get_size()

        !Insert what is in the temp cache
        do ido = 1, num_elems
            call this % temp_cache % get_from_cache(ido, i, j, coeff)
            dummy = this % insert_into_diag_matrix(i, j, coeff)
        end do

        !Clear it
        call this % temp_cache % clear_and_shrink
        call master_timer % stop_timer("Update Continuum")
        call this % update_counter

    end subroutine update_continuum_distributed

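    !> Flatten the temporary cache into index/coefficient arrays and release the cache storage.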
    subroutine convert_temp_cache_to_array (this, matrix_ij, matrix_coeffs)

        class(distributedmatrix) :: this
        integer(longint), intent(inout) :: matrix_ij(:,:)
        real(wp), intent(inout) :: matrix_coeffs(:)

        integer :: ido, i, j

        write (stdout, "('Converting cache to array')")
        do ido = 1, this % temp_cache % n
            call this % temp_cache % get_from_cache(ido, i, j, matrix_coeffs(ido))
            matrix_ij(ido, 1) = i
            matrix_ij(ido, 2) = j
        end do
        write (stdout, "('done')")
        call this % temp_cache % clear_and_shrink

    end subroutine convert_temp_cache_to_array

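    !> Redistribute the buffered pure L2 elements. When the largest per-process cache reaches its
    !> threshold (or when force_update is set), every process unpacks its cache into index/coefficient
    !> arrays and the arrays are passed around the process ring; at each step a process keeps the
    !> elements it owns. Serial runs simply flush the cache into the local storage.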
    subroutine update_l2_distributed (this, force_update, count_)
        class(distributedmatrix) :: this
        logical, intent(in) :: force_update
        real(wp), allocatable :: matrix_coeffs(:)
        integer, optional :: count_

        integer(longint), allocatable, target :: matrix_ij(:,:)
        integer(longint), pointer :: mat_ptr(:)
        integer(longint) :: my_num_of_elements, procs_num_of_elements, largest_num_of_elems

        integer :: count_amount, ido, proc_id, i, j, ierr
        logical :: dummy
        real(wp) :: coeff

        count_amount = 1

        if (present(count_)) count_amount = count_

        this % L2_counter = this % L2_counter + count_amount

        if (this % L2_counter < this % start_L2_update .and. .not. force_update) return

        my_num_of_elements = this % temp_cache % get_size()

        if (nprocs <= 1) then
            do ido = 1, my_num_of_elements
                call this % temp_cache % get_from_cache(ido, i, j, coeff)
                dummy = this % insert_into_diag_matrix(i, j, coeff)
            end do

            call this % temp_cache % clear
            return
        end if

        call mpi_reduceall_max(my_num_of_elements, largest_num_of_elems, grid % gcomm)
        !No point in continuing if no process has filled its cache yet
        if (largest_num_of_elems < this % start_L2_update .and. .not. force_update) then
            this % L2_counter = largest_num_of_elems
            return
        end if

        this % L2_counter = 0

        call this % temp_cache % shrink_capacity

        !Let's start off with a much simpler method and move on to something more complex:
        !first we find the largest number of elements across the processes.

        write (stdout, "('The largest number of elements is ',i10,' mine is ',i10)") largest_num_of_elems, my_num_of_elements
        flush(stdout)
        !Now we allocate the needed space

        if (largest_num_of_elems == 0) return
        write (stdout, "('Starting L2 update')")
        flush(stdout)

        call master_timer % start_timer("Update L2")
        allocate(matrix_ij(largest_num_of_elems, 2), matrix_coeffs(largest_num_of_elems), stat = ierr)

        call master_memory % track_memory(kind(matrix_ij), size(matrix_ij), ierr, "DIST::L2UPDATE::MAT_IJ")
        call master_memory % track_memory(kind(matrix_coeffs), size(matrix_coeffs), ierr, "DIST::L2UPDATE::MAT_COEFFS")

        if (ierr /= 0) then
            write (stdout, "('Memory allocation error during update!')")
            stop "Memory allocation error"
        end if

        matrix_ij = 0
        matrix_coeffs = 0.0_wp

        mat_ptr(1:largest_num_of_elems*2) => matrix_ij(:,:)

        call this % convert_temp_cache_to_array(matrix_ij(:,:), matrix_coeffs(:))
        call this % temp_cache % clear_and_shrink

        procs_num_of_elements = my_num_of_elements

        !What is nice is that we can guarantee that a process's cache does not contain coefficients that belong to itself
        do proc_id = 1, grid % gprows * grid % gpcols - 1
            call mpi_mod_rotate_arrays_around_ring(procs_num_of_elements, mat_ptr, &
                                                   matrix_coeffs, largest_num_of_elems, grid % gcomm)
            do ido = 1, procs_num_of_elements
                dummy = this % insert_into_diag_matrix(int(matrix_ij(ido,1)), int(matrix_ij(ido,2)), matrix_coeffs(ido))
            end do
        end do

        call master_memory % free_memory(kind(matrix_ij), size(matrix_ij))
        call master_memory % free_memory(kind(matrix_coeffs), size(matrix_coeffs))

        !Free the space
        deallocate(matrix_ij, matrix_coeffs)

        !Reduce the temporary cache to free more space
        call this % temp_cache % clear_and_shrink
        call master_timer % stop_timer("Update L2")

        write (stdout, "('Finished L2 update')")
        flush(stdout)

        call this % update_counter

    end subroutine update_l2_distributed

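    !> Hook for clearing the concrete storage; no-op for the abstract type.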
    subroutine clear_matrix (this)
        class(distributedmatrix) :: this
    end subroutine clear_matrix


    !> Hook for finalizing the concrete storage; no-op for the abstract type.
    subroutine finalize_matrix_self (this)
        class(distributedmatrix) :: this
    end subroutine finalize_matrix_self

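    !> Finalize the matrix: sum the stored diagonal across all processes, then let the
    !> concrete type finalize its own storage.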
    subroutine finalize_distributed (this)
        class(distributedmatrix) :: this

        call mpi_reduceall_inplace_sum_cfp(this % diagonal, this % matrix_dimension, grid % gcomm)
        call this % finalize_matrix_self

    end subroutine finalize_distributed


    !> Hook for destroying the concrete storage; no-op for the abstract type.
    subroutine destroy_matrix (this)
        class(distributedmatrix) :: this
    end subroutine destroy_matrix

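    !> Print the contents of the temporary cache.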
    subroutine print_distributed (this)
        class(distributedmatrix) :: this

        write (stdout, "('-------TEMP CACHE---------')")

        call this % temp_cache % print

    end subroutine print_distributed


    !> Clear both the temporary cache and the concrete storage.
    subroutine clear_distributed (this)
        class(distributedmatrix) :: this

        call this % temp_cache % clear_and_shrink
        call this % clear_matrix

    end subroutine clear_distributed


    !> Destroy the matrix: clear it, then destroy the temporary cache and the concrete storage.
    subroutine destroy_distributed (this)
        class(distributedmatrix) :: this

        call this % clear
        call this % temp_cache % destroy
        call this % destroy_matrix

    end subroutine destroy_distributed

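    !> Recompute the update thresholds. The size of one cached element is estimated as two integers
    !> plus one real(wp) coefficient plus a few bytes of overhead (the kind values are used as byte
    !> counts, which holds for the common compiler configurations). The scaled available memory,
    !> divided by twice this size, gives an element count; the minimum over all processes becomes
    !> the trigger for both the continuum and the L2 updates.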
    subroutine update_counter (this)
        class(distributedmatrix) :: this
        integer :: ifail, per_elm, dummy_int, c_update, l_update
        real(wp) :: dummy_real

        this % continuum_counter = 0
        this % L2_counter = 0

        !This is an estimate based on the number of L2*continuum functions.
        !We will use a rough estimate, which will be the dimension of the matrix * 100.
        !Let's figure out when we need to perform a continuum update.

        per_elm = kind(dummy_int) * 2 + kind(dummy_real) + 4

        c_update = master_memory % get_scaled_available_memory(this % memory_scale) / (per_elm * 2)
        l_update = master_memory % get_scaled_available_memory(this % memory_scale) / (per_elm * 2)

        call mpi_reduceall_min(c_update, this % start_continuum_update, grid % gcomm)
        call mpi_reduceall_min(l_update, this % start_L2_update, grid % gcomm)

        call master_memory % print_memory_report

        write (stdout, "(2i16,' updates will occur at continuum = ',i12,' and L2 = ',i12)") &
            c_update, l_update, this % start_continuum_update, this % start_L2_update

    end subroutine update_counter

end module distributedmatrix_module
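
The sketch below is not part of DistributedMatrix_module.f90; it only illustrates how a concrete matrix type is expected to plug into distributedmatrix, assuming basematrix leaves no further deferred bindings to satisfy. The module and type names (example_dense_matrix_module, example_dense_matrix) are hypothetical: a real implementation would override setup_diag_matrix to allocate its distributed storage and insert_into_diag_matrix to accept only the elements owned by the calling process, returning .false. for the rest so that distributedmatrix caches them for a later continuum or L2 update.

module example_dense_matrix_module

    use precisn, only: wp
    use distributedmatrix_module, only: distributedmatrix

    implicit none

    !> Toy concrete matrix that keeps every accepted element in one local dense array.
    type, extends(distributedmatrix) :: example_dense_matrix
        real(wp), allocatable :: a(:,:)
    contains
        procedure, public :: setup_diag_matrix => setup_example
        procedure, public :: insert_into_diag_matrix => insert_example
    end type example_dense_matrix

contains

    !> Allocate the final storage for the requested matrix size.
    subroutine setup_example (this, matrix_size, matrix_type, block_size)
        class(example_dense_matrix) :: this
        integer, intent(in) :: matrix_size, matrix_type, block_size

        allocate(this % a(matrix_size, matrix_size))
        this % a = 0.0_wp
    end subroutine setup_example

    !> Accumulate an element locally; a distributed implementation would return .false.
    !> for rows/columns owned by another process.
    logical function insert_example (this, row, column, coefficient)
        class(example_dense_matrix) :: this
        integer, intent(in) :: row, column
        real(wp), intent(in) :: coefficient

        this % a(row, column) = this % a(row, column) + coefficient
        insert_example = .true.
    end function insert_example

end module example_dense_matrix_module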