MPI-SCATCI 2.0
An MPI version of SCATCI
DistributedMatrix_module.f90
! Copyright 2019
!
! For a comprehensive list of the developers that contributed to these codes
! see the UK-AMOR website.
!
! This file is part of UKRmol-in (UKRmol+ suite).
!
! UKRmol-in is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! UKRmol-in is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License
! along with UKRmol-in (in source/COPYING). Alternatively, you can also visit
! <https://www.gnu.org/licenses/>.

!> \brief   Distributed matrix module
!> \authors A Al-Refaie
!> \date    2017
!>
!> Provides the DistributedMatrix type used by the parallel diagonalizers. Other specialized
!> matrix types are based on this one; see \ref SCALAPACKMatrix_module::SCALAPACKMatrix and
!> \ref SLEPCMatrix_module::SLEPCMatrix.
!>
!> \note 16/01/2019 - Jakub Benda: Uniform coding style and expanded documentation.
!>
module distributedmatrix_module

    use const_gbl, only: stdout
    use consts_mpi_ci, only: mat_dense
    use precisn, only: longint, wp
    use mpi_gbl, only: nprocs, mpi_xermsg, mpi_reduceall_min, mpi_reduceall_max, &
                       mpi_mod_rotate_arrays_around_ring, mpi_reduceall_inplace_sum_cfp, &
                       mpi_reduceall_sum_cfp
    use basematrix_module, only: basematrix
    use matrixcache_module, only: matrixcache
    use memorymanager_module, only: master_memory
    use parallelization_module, only: grid => process_grid
    use timing_module, only: master_timer

    implicit none

    public distributedmatrix

    private

    !> \brief   Distributed matrix type
    !> \authors A Al-Refaie
    !> \date    2017
    !>
    !> Distributed matrix is used for construction of the Hamiltonian in distributed (MPI) mode.
    !>
    type, abstract, extends(basematrix) :: distributedmatrix
        !> This cache holds the temporary matrix values before they are sent to the relevant process.
        type(matrixcache) :: temp_cache

        !> Fraction of the available memory the caches may fill before an update is forced.
        real(wp) :: memory_scale = 0.75_wp

        integer :: continuum_counter
        integer :: l2_counter
        integer :: start_continuum_update
        integer :: start_l2_update

    contains

        procedure, public :: print => print_distributed
        procedure, public :: update_continuum => update_continuum_distributed
        procedure, public :: update_pure_l2 => update_l2_distributed

        procedure, public :: initialize_struct_self => initialize_struct_distributed

        procedure, public :: setup_diag_matrix

        procedure, public :: construct_self => construct_mat_distributed
        procedure, public :: insert_matelem_self => insert_matelem_distributed
        procedure, public :: get_matelem_self => get_matelem_distributed
        procedure, public :: clear_self => clear_distributed
        procedure, public :: destroy_self => destroy_distributed
        procedure, public :: finalize_matrix => finalize_distributed
        procedure, public :: finalize_matrix_self
        procedure, public :: destroy_matrix
        procedure, public :: clear_matrix

        procedure, public :: insert_into_diag_matrix
        procedure, private :: convert_temp_cache_to_array
        procedure, private :: update_counter
    end type

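    !> \brief Illustrative subclass skeleton
    !>
    !> Concrete implementations (\ref SCALAPACKMatrix_module::SCALAPACKMatrix,
    !> \ref SLEPCMatrix_module::SLEPCMatrix) extend this type and override the storage
    !> hooks. A minimal sketch with hypothetical names, not actual library code:
    !>
    !> \code
    !> type, extends(distributedmatrix) :: examplematrix
    !> contains
    !>     procedure, public :: setup_diag_matrix       => setup_example_storage
    !>     procedure, public :: insert_into_diag_matrix => insert_into_example_storage
    !>     procedure, public :: finalize_matrix_self    => finalize_example_storage
    !> end type
    !> \endcode
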
contains

    subroutine construct_mat_distributed (this)
        class(distributedmatrix) :: this

        write (stdout, "('Constructing Distributed matrix')")

        !Initialize the temporary cache
        call this % temp_cache % construct

        !Start from an empty cache
        call this % temp_cache % clear

    end subroutine construct_mat_distributed


    subroutine initialize_struct_distributed (this, matrix_size, matrix_type, block_size)
        class(distributedmatrix) :: this
        integer, intent(in) :: matrix_size, matrix_type, block_size
        integer :: ifail

        call this % temp_cache % clear

        this % memory_scale = 0.75_wp
        call this % setup_diag_matrix(matrix_size, matrix_type, block_size)

        this % continuum_counter = 0
        this % L2_counter = 0

        !Estimate, from the available memory, how many cached elements we can
        !accumulate before a continuum or L2 update has to be performed.
        call this % update_counter

    end subroutine initialize_struct_distributed


    subroutine insert_matelem_distributed (this, i, j, coefficient, class, thresh)
        class(distributedmatrix) :: this
        integer, intent(in) :: i, j, class
        real(wp), intent(in) :: coefficient, thresh
        logical :: dummy
        integer :: row, column

        if (nprocs <= 1) then
            dummy = this % insert_into_diag_matrix(i, j, coefficient)
            return
        end if

        if (class /= 8 .and. class /= 2 .and. this % matrix_type /= mat_dense) then
            call this % temp_cache % insert_into_cache(i, j, coefficient)
        else
            !Is it within the threshold?
            if (abs(coefficient) < thresh) return

            !Does it belong to me?
            if (this % insert_into_diag_matrix(i, j, coefficient)) return

            !Otherwise we insert it into the temporary cache
            call this % temp_cache % insert_into_cache(i, j, coefficient)
        end if

    end subroutine insert_matelem_distributed
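
    !> Typical assembly flow for a distributed Hamiltonian build (a sketch; 'H' is a
    !> hypothetical instance of a concrete subclass):
    !>
    !> \code
    !> call H % insert_matelem_self(i, j, coefficient, class, thresh)  ! local store or temp cache
    !> call H % update_continuum(force_update = .false.)               ! periodic allreduce of cached terms
    !> call H % update_pure_l2(force_update = .true.)                  ! ring exchange of off-process terms
    !> call H % finalize_matrix                                        ! sum the diagonal, finish storage
    !> \endcode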


    !> Distributed matrices do not support random access to individual elements,
    !> so this routine only returns sentinel values.
    subroutine get_matelem_distributed (this, idx, i, j, coeff)
        class(distributedmatrix) :: this
        integer, intent(in) :: idx
        integer, intent(out) :: i, j
        real(wp), intent(out) :: coeff

        i = -1
        j = -1
        coeff = 0

    end subroutine get_matelem_distributed


    subroutine setup_diag_matrix (this, matrix_size, matrix_type, block_size)
        class(distributedmatrix) :: this
        integer, intent(in) :: matrix_size, matrix_type, block_size
    end subroutine setup_diag_matrix


    !> Inserts an element into the hard storage, which is considered the final location before diagonalization.
    !> It also checks whether the element lies within the allowed (locally owned) range and reports whether
    !> it was successfully inserted.
    logical function insert_into_diag_matrix (this, row, column, coefficient)
        class(distributedmatrix) :: this
        integer, intent(in) :: row, column
        real(wp), intent(in) :: coefficient

        insert_into_diag_matrix = .false.
        call mpi_xermsg('DistributedMatrix_module', 'insert_into_diag_matrix', 'Not implemented', 1, 1)

    end function insert_into_diag_matrix
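
    !> In the concrete types the ownership test depends on the storage layout. For a
    !> ScaLAPACK-style two-dimensional block-cyclic distribution it could look like the
    !> following sketch (block_size, nprow, npcol, myrow and mycol are assumed names,
    !> not part of this module):
    !>
    !> \code
    !> owner_row = mod((row - 1) / block_size, nprow)
    !> owner_col = mod((column - 1) / block_size, npcol)
    !> is_mine   = owner_row == myrow .and. owner_col == mycol
    !> \endcode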


    subroutine update_continuum_distributed (this, force_update)
        class(distributedmatrix) :: this
        logical, intent(in) :: force_update

        real(wp), allocatable :: matrix_coeffs(:)
        real(wp) :: coeff

        integer :: number_of_chunks, num_elements, nelms_chunk, num_elems, i, j, ido, jdo, ierr, error, length, temp
        logical :: dummy

        if (this % temp_cache % is_empty()) return

        if (nprocs <= 1) return

        this % continuum_counter = this % continuum_counter + 1

        if (this % continuum_counter < this % start_continuum_update .and. .not. force_update) return

        this % continuum_counter = 0

        call master_timer % start_timer("Update Continuum")

        !Otherwise we need to do an MPI reduction on all elements.
        !Here we assume that all processes hold the same set of elements (which they should in this case).

        number_of_chunks = this % temp_cache % num_matrix_chunks

        write (stdout, "('Number of elements to reduce = ',i8)") this % temp_cache % get_size()

        !Loop through each chunk and reduce

        do ido = 1, number_of_chunks
            !Get the number of elements in the matrix chunk
            nelms_chunk = this % temp_cache % matrix_arrays(ido) % num_elems
            allocate(matrix_coeffs(nelms_chunk))

            !Reduce them all. An in-place reduction (MPI_IN_PLACE) did not work
            !here, so we reduce into a separate buffer and copy back.
            call mpi_reduceall_sum_cfp(this % temp_cache % matrix_arrays(ido) % coefficient(1:nelms_chunk), &
                                       matrix_coeffs, nelms_chunk, grid % gcomm)

            this % temp_cache % matrix_arrays(ido) % coefficient(1:nelms_chunk) = matrix_coeffs(1:nelms_chunk)

            deallocate(matrix_coeffs)
        end do

        num_elems = this % temp_cache % get_size()

        !Insert what is in the temp cache
        do ido = 1, num_elems
            call this % temp_cache % get_from_cache(ido, i, j, coeff)
            dummy = this % insert_into_diag_matrix(i, j, coeff)
        end do

        !Clear it
        call this % temp_cache % clear_and_shrink
        call master_timer % stop_timer("Update Continuum")
        call this % update_counter

    end subroutine update_continuum_distributed
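
    !> Each per-chunk reduction above is, in effect, an out-of-place allreduce. Assuming
    !> wp corresponds to double precision, the raw MPI equivalent would read:
    !>
    !> \code
    !> call MPI_Allreduce(sendbuf, recvbuf, nelms_chunk, MPI_DOUBLE_PRECISION, MPI_SUM, &
    !>                    grid % gcomm, ierr)
    !> \endcode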


    subroutine convert_temp_cache_to_array (this, matrix_ij, matrix_coeffs)

        class(distributedmatrix) :: this
        integer(longint), intent(inout) :: matrix_ij(:,:)
        real(wp), intent(inout) :: matrix_coeffs(:)

        integer :: ido, i, j

        write (stdout, "('Converting cache to array')")
        do ido = 1, this % temp_cache % n
            call this % temp_cache % get_from_cache(ido, i, j, matrix_coeffs(ido))
            matrix_ij(ido, 1) = i
            matrix_ij(ido, 2) = j
        end do
        write (stdout, "('done')")
        call this % temp_cache % clear_and_shrink

    end subroutine convert_temp_cache_to_array


    subroutine update_l2_distributed (this, force_update, count_)
        class(distributedmatrix) :: this
        logical, intent(in) :: force_update
        real(wp), allocatable :: matrix_coeffs(:)
        integer, optional :: count_

        integer(longint), allocatable, target :: matrix_ij(:,:)
        integer(longint), pointer :: mat_ptr(:)
        integer(longint) :: my_num_of_elements, procs_num_of_elements, largest_num_of_elems

        integer :: count_amount, ido, proc_id, i, j, ierr
        logical :: dummy
        real(wp) :: coeff

        count_amount = 1

        if (present(count_)) count_amount = count_

        this % L2_counter = this % L2_counter + count_amount

        if (this % L2_counter < this % start_L2_update .and. .not. force_update) return

        my_num_of_elements = this % temp_cache % get_size()

        if (nprocs <= 1) then
            do ido = 1, my_num_of_elements
                call this % temp_cache % get_from_cache(ido, i, j, coeff)
                dummy = this % insert_into_diag_matrix(i, j, coeff)
            end do

            call this % temp_cache % clear
            return
        end if

        call mpi_reduceall_max(my_num_of_elements, largest_num_of_elems, grid % gcomm)
        !No point in updating if no cache is full yet
        if (largest_num_of_elems < this % start_L2_update .and. .not. force_update) then
            this % L2_counter = largest_num_of_elems
            return
        end if

        this % L2_counter = 0

        call this % temp_cache % shrink_capacity

        !Start off with a much simpler method and move on to something more complex later.
        !First find the largest number of elements across all processes.

        write (stdout, "('The largest number of elements is ',i10,' mine is ',i10)") largest_num_of_elems, my_num_of_elements
        flush(stdout)
        !Now allocate the needed space

        if (largest_num_of_elems == 0) return
        write (stdout, "('Starting L2 update')")
        flush(stdout)

        call master_timer % start_timer("Update L2")
        allocate(matrix_ij(largest_num_of_elems, 2), matrix_coeffs(largest_num_of_elems), stat = ierr)

        call master_memory % track_memory(storage_size(matrix_ij)/8, size(matrix_ij), ierr, "DIST::L2UPDATE::MAT_IJ")
        call master_memory % track_memory(storage_size(matrix_coeffs)/8, size(matrix_coeffs), ierr, "DIST::L2UPDATE::MAT_COEFFS")

        if (ierr /= 0) then
            write (stdout, "('Memory allocation error during update!')")
            stop "Memory allocation error"
        end if

        matrix_ij = 0
        matrix_coeffs = 0.0_wp

        mat_ptr(1:largest_num_of_elems*2) => matrix_ij(:,:)

        call this % convert_temp_cache_to_array(matrix_ij(:,:), matrix_coeffs(:))
        call this % temp_cache % clear_and_shrink

        procs_num_of_elements = my_num_of_elements

        !Conveniently, we can guarantee that a process's cache never contains
        !coefficients that belong to the process itself, so every received
        !element only needs the ownership test.
        do proc_id = 1, grid % gprows * grid % gpcols - 1
            call mpi_mod_rotate_arrays_around_ring(procs_num_of_elements, mat_ptr, &
                                                   matrix_coeffs, largest_num_of_elems, grid % gcomm)
            do ido = 1, procs_num_of_elements
                dummy = this % insert_into_diag_matrix(int(matrix_ij(ido,1)), int(matrix_ij(ido,2)), matrix_coeffs(ido))
            end do
        end do

        call master_memory % free_memory(storage_size(matrix_ij)/8, size(matrix_ij))
        call master_memory % free_memory(storage_size(matrix_coeffs)/8, size(matrix_coeffs))

        !Free the space
        deallocate(matrix_ij, matrix_coeffs)

        !Shrink the temporary cache to free more space
        call this % temp_cache % clear_and_shrink
        call master_timer % stop_timer("Update L2")

        write (stdout, "('Finished L2 update')")
        flush(stdout)

        call this % update_counter

    end subroutine update_l2_distributed
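
    !> The rotation loop above implements a ring exchange: in each of the nprocs-1 steps
    !> every process forwards its current buffer to its ring neighbour and receives the
    !> neighbour's, so each cached element visits every process exactly once and is stored
    !> wherever the ownership test succeeds. Schematically (a sketch, not the wrapper's
    !> actual interface):
    !>
    !> \code
    !> do step = 1, nprocs - 1
    !>     ! send current buffer to rank mod(me + 1, nprocs),
    !>     ! receive a new one from rank mod(me - 1 + nprocs, nprocs)
    !>     do ido = 1, received_count
    !>         dummy = this % insert_into_diag_matrix(rows(ido), cols(ido), coeffs(ido))
    !>     end do
    !> end do
    !> \endcode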


    !> No-op in the base type; concrete implementations override this to clear their storage.
    subroutine clear_matrix (this)
        class(distributedmatrix) :: this
    end subroutine clear_matrix


    !> No-op in the base type; concrete implementations override this to finish their storage.
    subroutine finalize_matrix_self (this)
        class(distributedmatrix) :: this
    end subroutine finalize_matrix_self


    subroutine finalize_distributed (this)
        class(distributedmatrix) :: this

        !Sum the locally accumulated diagonal across all processes, then let the
        !concrete type finish its own storage.
        call mpi_reduceall_inplace_sum_cfp(this % diagonal, this % matrix_dimension, grid % gcomm)
        call this % finalize_matrix_self

    end subroutine finalize_distributed
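
    !> Assuming the wrapper used above maps onto a standard in-place allreduce, its raw
    !> MPI equivalent for a double-precision wp would be:
    !>
    !> \code
    !> call MPI_Allreduce(MPI_IN_PLACE, this % diagonal, this % matrix_dimension, &
    !>                    MPI_DOUBLE_PRECISION, MPI_SUM, grid % gcomm, ierr)
    !> \endcode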


    !> No-op in the base type; concrete implementations override this to release their storage.
    subroutine destroy_matrix (this)
        class(distributedmatrix) :: this
    end subroutine destroy_matrix


    subroutine print_distributed (this)
        class(distributedmatrix) :: this

        write (stdout, "('-------TEMP CACHE---------')")

        call this % temp_cache % print

    end subroutine print_distributed


    subroutine clear_distributed (this)
        class(distributedmatrix) :: this

        call this % temp_cache % clear_and_shrink
        call this % clear_matrix

    end subroutine clear_distributed


    subroutine destroy_distributed (this)
        class(distributedmatrix) :: this

        call this % clear
        call this % temp_cache % destroy
        call this % destroy_matrix

    end subroutine destroy_distributed


    subroutine update_counter (this)
        class(distributedmatrix) :: this
        integer :: ifail, per_elm, dummy_int, c_update, l_update
        real(wp) :: dummy_real

        this % continuum_counter = 0
        this % L2_counter = 0

        !Estimate, from the memory available to this process, how many cached
        !elements we can afford before forcing a continuum or L2 update.
        !Each cached element stores two integers and one real plus overhead;
        !the kind values below coincide with the byte sizes of the default
        !integer and wp real kinds on the supported compilers.
        per_elm = kind(dummy_int) * 2 + kind(dummy_real) + 4

        c_update = master_memory % get_scaled_available_memory(this % memory_scale) / (per_elm * 2)
        l_update = master_memory % get_scaled_available_memory(this % memory_scale) / (per_elm * 2)

        !Use the smallest estimate across the grid so that all processes update together
        call mpi_reduceall_min(c_update, this % start_continuum_update, grid % gcomm)
        call mpi_reduceall_min(l_update, this % start_L2_update, grid % gcomm)

        call master_memory % print_memory_report

        write (stdout, "(2i16,' updates will occur at continuum = ',i12,' and L2 = ',i12)") &
            c_update, l_update, this % start_continuum_update, this % start_L2_update

    end subroutine update_counter
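
    !> As a worked example of the estimate above: with a 4-byte default integer and an
    !> 8-byte real(wp), per_elm = 4*2 + 8 + 4 = 20 bytes per cached element. With a
    !> hypothetical 8 GiB of scaled available memory per process, the threshold becomes:
    !>
    !> \code
    !> integer(longint) :: avail
    !> avail = 8589934592_longint     ! 8 GiB, purely illustrative
    !> c_update = avail / (20 * 2)    ! about 2.1e8 cached elements
    !> \endcode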

end module distributedmatrix_module