Skip to content

Commit

Permalink
Updated internal block size for FM matrices
Browse files Browse the repository at this point in the history
- Input keyword defaults now rely on internal setting (NROW/COL_BLOCKS).
- Use m_cpuid_vlen to make block size multiple of vector-length.
  • Loading branch information
hfp committed Dec 4, 2024
1 parent 7442a40 commit b34a976
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 33 deletions.
50 changes: 26 additions & 24 deletions src/fm/cp_fm_struct.F
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ MODULE cp_fm_struct
cp_logger_type,&
cp_to_string
USE kinds, ONLY: dp
USE machine, ONLY: m_flush
USE machine, ONLY: m_cpuid_vlen,&
m_flush
USE message_passing, ONLY: mp_para_env_release,&
mp_para_env_type
#include "../base/base_uses.f90"
Expand All @@ -33,9 +34,8 @@ MODULE cp_fm_struct
! the default blacs block sizes
! consider using #ifdefs to give them the optimal values
! these can be changed using scf_control
! *** these are used by default
INTEGER, PRIVATE :: optimal_blacs_col_block_size = 32
INTEGER, PRIVATE :: optimal_blacs_row_block_size = 32
INTEGER, PRIVATE :: optimal_blacs_col_block_size = 64
INTEGER, PRIVATE :: optimal_blacs_row_block_size = 64
LOGICAL, PRIVATE :: force_block_size = .FALSE.

PUBLIC :: cp_fm_struct_type, cp_fm_struct_p_type
Expand Down Expand Up @@ -142,7 +142,7 @@ SUBROUTINE cp_fm_struct_create(fmstruct, para_env, context, nrow_global, &
LOGICAL, OPTIONAL, INTENT(in) :: square_blocks
LOGICAL, OPTIONAL, INTENT(in) :: force_block

INTEGER :: dumblock, i
INTEGER :: dumblock, i, vlen
#if defined(__parallel)
INTEGER :: iunit, stat
INTEGER, EXTERNAL :: numroc
Expand Down Expand Up @@ -193,32 +193,34 @@ SUBROUTINE cp_fm_struct_create(fmstruct, para_env, context, nrow_global, &
fmstruct%ncol_global = ncol_global
END IF

! try to avoid small left-over blocks (anyway naive)
IF (PRESENT(nrow_block)) THEN
IF (nrow_block > 0) & ! allows setting the number of blocks to -1 to explicitly set to auto
fmstruct%nrow_block = nrow_block
END IF
! allow a zero or negative block size to be passed (requests the default block size)
IF (PRESENT(nrow_block)) IF(0 < nrow_block) fmstruct%nrow_block = nrow_block
IF (PRESENT(ncol_block)) IF(0 < ncol_block) fmstruct%ncol_block = ncol_block

! unless the block size is forced, adjust it (e.g., to avoid small remainder blocks)
IF (.NOT. my_force_block) THEN
dumblock = CEILING(REAL(fmstruct%nrow_global, KIND=dp)/ &
REAL(fmstruct%context%num_pe(1), KIND=dp))
fmstruct%nrow_block = MAX(1, MIN(fmstruct%nrow_block, dumblock))
END IF
IF (PRESENT(ncol_block)) THEN
IF (ncol_block > 0) & ! allows setting the number of blocks to -1 to explicitly set to auto
fmstruct%ncol_block = ncol_block
END IF
IF (.NOT. my_force_block) THEN
dumblock = CEILING(REAL(fmstruct%ncol_global, KIND=dp)/ &
REAL(fmstruct%context%num_pe(2), KIND=dp))
fmstruct%ncol_block = MAX(1, MIN(fmstruct%ncol_block, dumblock))

! square matrix -> square blocks (in the past/otherwise, some ops failed)
my_square_blocks = fmstruct%nrow_global == fmstruct%ncol_global
IF (PRESENT(square_blocks) .AND. .NOT. my_square_blocks) THEN
my_square_blocks = square_blocks
END IF
IF (my_square_blocks) THEN
fmstruct%nrow_block = MIN(fmstruct%nrow_block, fmstruct%ncol_block)
fmstruct%ncol_block = fmstruct%nrow_block
END IF
END IF

! square matrix -> square blocks (otherwise some op fail)
my_square_blocks = fmstruct%nrow_global == fmstruct%ncol_global
IF (PRESENT(square_blocks)) my_square_blocks = square_blocks
IF (my_square_blocks) THEN
fmstruct%nrow_block = MIN(fmstruct%nrow_block, fmstruct%ncol_block)
fmstruct%ncol_block = fmstruct%nrow_block
vlen = m_cpuid_vlen()
IF (1 < vlen) THEN ! always make blocks a multiple of vlen
fmstruct%nrow_block = (fmstruct%nrow_block + vlen - 1)/vlen*vlen
fmstruct%ncol_block = (fmstruct%ncol_block + vlen - 1)/vlen*vlen
END IF

ALLOCATE (fmstruct%nrow_locals(0:(fmstruct%context%num_pe(1) - 1)), &
Expand Down Expand Up @@ -610,8 +612,8 @@ SUBROUTINE cp_fm_struct_config(nrow_block, ncol_block, force_block)
INTEGER, INTENT(IN), OPTIONAL :: nrow_block, ncol_block
LOGICAL, INTENT(IN), OPTIONAL :: force_block

IF (PRESENT(ncol_block)) optimal_blacs_col_block_size = ncol_block
IF (PRESENT(nrow_block)) optimal_blacs_row_block_size = nrow_block
IF (PRESENT(ncol_block)) IF(0 < ncol_block) optimal_blacs_col_block_size = ncol_block
IF (PRESENT(nrow_block)) IF(0 < nrow_block) optimal_blacs_row_block_size = nrow_block
IF (PRESENT(force_block)) force_block_size = force_block

END SUBROUTINE cp_fm_struct_config
Expand Down
22 changes: 13 additions & 9 deletions src/input_cp2k_global.F
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ MODULE input_cp2k_global
USE cp_fm_elpa, ONLY: elpa_kernel_descriptions,&
elpa_kernel_ids,&
elpa_kernel_names
USE cp_fm_struct, ONLY: cp_fm_struct_get_ncol_block,&
cp_fm_struct_get_nrow_block
USE cp_output_handling, ONLY: add_last_numeric,&
cp_print_key_section_create,&
debug_print_level,&
Expand Down Expand Up @@ -283,7 +285,7 @@ SUBROUTINE create_global_section(section)
"larger allowed grids, or grids that more precisely match a given cutoff. "// &
"IMPORTANT NOTE: in this case, the actual grids used in CP2K depends on the FFT library. "// &
"A change of FFT library must therefore be considered equivalent to a change of basis, "// &
"which implies a change of total energy. ", &
"which implies a change of total energy.", &
usage="EXTENDED_FFT_LENGTHS", &
default_l_val=.FALSE., lone_keyword_l_val=.TRUE.)
CALL section_add_keyword(section, keyword)
Expand Down Expand Up @@ -419,7 +421,7 @@ SUBROUTINE create_global_section(section)
CALL keyword_release(keyword)

CALL keyword_create(keyword, __LOCATION__, name="TRACE", &
description="If a debug trace of the execution of the program should be written ", &
description="If a debug trace of the execution of the program should be written", &
usage="TRACE", &
default_l_val=.FALSE., lone_keyword_l_val=.TRUE.)
CALL section_add_keyword(section, keyword)
Expand Down Expand Up @@ -490,7 +492,7 @@ SUBROUTINE create_global_section(section)

CALL keyword_create(keyword, __LOCATION__, name="SAVE_MEM", &
description="Some sections of the input structure are deallocated when not needed,"// &
" and reallocated only when used. This reduces the required maximum memory ", &
" and reallocated only when used. This reduces the required maximum memory.", &
usage="SAVE_MEM", &
default_l_val=.FALSE., lone_keyword_l_val=.TRUE.)
CALL section_add_keyword(section, keyword)
Expand Down Expand Up @@ -639,21 +641,23 @@ SUBROUTINE create_fm_section(section)

CALL keyword_create(keyword, __LOCATION__, name="NROW_BLOCKS", &
description="Defines the number of rows per scalapack block in "// &
"the creation of block cyclic dense matrices ", &
default_i_val=64)
"the creation of block cyclic dense matrices. "// &
"Use an internal default if zero or negative.", &
default_i_val=cp_fm_struct_get_nrow_block())
CALL section_add_keyword(section, keyword)
CALL keyword_release(keyword)

CALL keyword_create(keyword, __LOCATION__, name="NCOL_BLOCKS", &
description="Defines the number of columns per scalapack block in "// &
"the creation of vlock cyclic dense matrices ", &
default_i_val=64)
"the creation of vlock cyclic dense matrices. "// &
"Use an internal default if zero or negative.", &
default_i_val=cp_fm_struct_get_ncol_block())
CALL section_add_keyword(section, keyword)
CALL keyword_release(keyword)

CALL keyword_create(keyword, __LOCATION__, name="FORCE_BLOCK_SIZE", &
description="Ensure for small matrices that the layout is compatible "// &
"with bigger ones, i.e. no subdivision is performed (can break LAPACK!!!).", &
"with bigger ones, i.e. no subdivision is performed (can break LAPACK).", &
usage="FORCE_BLOCK_SIZE", &
default_l_val=.FALSE., lone_keyword_l_val=.TRUE.)
CALL section_add_keyword(section, keyword)
Expand Down Expand Up @@ -737,7 +741,7 @@ SUBROUTINE create_fm_diag_rules_section(section)
description="Controls how to perform redistribution when ELPA is used for diagonalization. "// &
"By default, redistribution is always performed using the defined rules. "// &
"By turning off this keyword, matrices are redistributed only to prevent crashes in the ELPA "// &
"library which happens when the original matrix is distributed over too many processors. ", &
"library which happens when the original matrix is distributed over too many processors.", &
usage="ELPA_FORCE_REDISTRIBUTE", type_of_var=logical_t, &
default_l_val=.TRUE., lone_keyword_l_val=.TRUE.)
CALL section_add_keyword(section, keyword)
Expand Down

0 comments on commit b34a976

Please sign in to comment.