Skip to content

Commit

Permalink
Merge pull request #23 from smoors/lmod_cache_from_EB
Browse files Browse the repository at this point in the history
submit lmod cache job from EB end_hook
  • Loading branch information
wpoely86 authored Jun 12, 2024
2 parents efd40de + d879706 commit eec1073
Show file tree
Hide file tree
Showing 10 changed files with 57 additions and 27 deletions.
17 changes: 6 additions & 11 deletions bin/submit_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from build_tools.bwraptools import bwrap_prefix, rsync_copy
from build_tools.clusters import ARCHS, PARTITIONS
from build_tools.filetools import APPS_BRUSSEL, get_module
from build_tools.lmodtools import LMOD_CACHE_CLUSTERS, submit_lmod_cache_job
from build_tools.lmodtools import submit_lmod_cache_job
from build_tools.softinstall import mk_job_name, set_toolchain_generation, submit_build_job

# repositories with easyconfigs
Expand Down Expand Up @@ -60,6 +60,7 @@ def main():

# Default job options
job = {
'lmod_cache': '1',
'langcode': 'en_US.utf8',
'cluster': 'hydra',
'target_arch': None,
Expand All @@ -72,6 +73,7 @@ def main():
# Easybuild default paths
# start using environment from local machine, job scripts get custom paths
ebconf = {
'accept-eula-for': 'Intel-oneAPI,CUDA',
'robot-paths': ":".join([os.path.join(VSCSOFTSTACK_ROOT, repo) for repo in EASYCONFIG_REPOS]),
'include-easyblocks': os.path.join(VSCSOFTSTACK_ROOT, EASYBLOCK_REPO),
'sourcepath': '/apps/brussel/sources:/apps/gent/source',
Expand Down Expand Up @@ -229,8 +231,8 @@ def main():
logger.error("Failed to get module name/version for %s", easyconfig)
sys.exit(1)

lmod_cache = not opts.options.skip_lmod_cache
if not lmod_cache:
if opts.options.skip_lmod_cache:
job['lmod_cache'] = ''
logger.info("Not running Lmod cache after installation")

# ---> main build + lmod cache loop <--- #
Expand Down Expand Up @@ -326,7 +328,7 @@ def main():

ec, buildjob_out = submit_build_job(
job_options,
opts.options.keep,
keep_job=opts.options.keep,
sub_options=opts.options.extra_sub_flags,
cluster=job_options['cluster'],
local_exec=local_exec,
Expand All @@ -337,13 +339,6 @@ def main():
logger.error("Failed to submit or run build job for '%s': %s", easyconfig, buildjob_out)
sys.exit(1)

# submit lmod cache job(s)
if lmod_cache and job_options['cluster'] in LMOD_CACHE_CLUSTERS:
jobids_depend = None
if buildjob_out and not dry_run and not local_exec:
jobids_depend = [buildjob_out.rstrip().split(';')[0]]
submit_lmod_cache_job(host_partition, jobids_depend, dry_run=dry_run)


if __name__ == '__main__':
main()
21 changes: 21 additions & 0 deletions src/build_tools/hooks_hydra.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@

import os

from vsc.utils import fancylogger

from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS
from easybuild.tools import LooseVersion
from easybuild.tools.hooks import SANITYCHECK_STEP

from build_tools.clusters import ARCHS
from build_tools.ib_modules import IB_MODULE_SOFTWARE, IB_MODULE_SUFFIX, IB_OPT_MARK
from build_tools.lmodtools import submit_lmod_cache_job

# permission groups for licensed software
SOFTWARE_GROUPS = {
Expand Down Expand Up @@ -368,3 +371,21 @@ def pre_module_hook(self, *args, **kwargs): # pylint: disable=unused-argument
############################

self.cfg.enable_templating = en_templ


def end_hook():
    """
    EasyBuild end hook: runs shortly before EasyBuild completes.

    Enables info-level logging to stdout and, unless disabled through the
    environment, submits an Lmod cache job for the partition of the current Slurm job.

    Environment variables read:
      - BUILD_TOOLS_RUN_LMOD_CACHE: the cache job is only submitted when this is '1' (default when unset)
      - SLURM_JOB_PARTITION: partition passed to the Lmod cache job; when unset or empty
        the submission is skipped
    """
    log = fancylogger.getLogger()
    fancylogger.logToScreen(True, stdout=True)
    fancylogger.setLogLevelInfo()

    # Lmod cache refresh was switched off for this build: nothing to do
    if os.getenv('BUILD_TOOLS_RUN_LMOD_CACHE', '1') != '1':
        return

    slurm_partition = os.getenv('SLURM_JOB_PARTITION')
    if not slurm_partition:
        log.info('[end hook] Skipping Lmod cache job: not in a Slurm job')
        return

    log.info('[end hook] Submitting Lmod cache job for partition %s', slurm_partition)
    # cluster=False: submit without loading a cluster module in the job
    submit_lmod_cache_job(slurm_partition, cluster=False)
2 changes: 2 additions & 0 deletions src/build_tools/jobtemplate.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
fi
# set environment
export BUILD_TOOLS_LOAD_DUMMY_MODULES=1
export BUILD_TOOLS_RUN_LMOD_CACHE=${lmod_cache}
export LANG=${langcode}
export PATH=$$PREFIX_EB/easybuild-framework:$$PATH
export PYTHONPATH=$$PREFIX_EB/easybuild-easyconfigs:$$PREFIX_EB/easybuild-easyblocks:$$PREFIX_EB/easybuild-framework:$$PREFIX_EB/vsc-base/lib
Expand Down
14 changes: 9 additions & 5 deletions src/build_tools/lmodtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,20 @@
"""


def submit_lmod_cache_job(partition, jobids_depend=None, *args, **kwargs):
def submit_lmod_cache_job(partition, jobids_depend=None, cluster=None, **kwargs):
"""
Run Lmod cache in a Slurm job
:param jobids_depend: list of strings: jobids on which to set job dependency
:param mod_basedir: the module basedir
:param partition: the partition to submit the job to
:param jobids_depend: list of strings: jobids on which to set job dependency
:param cluster: the Slurm cluster to submit the job to.
if cluster is None, load the cluster module corresponding to the current partition
if cluster is False, don’t purge/load a cluster module (use the currently active cluster)
"""

archdir = PARTITIONS[partition]['arch']
cluster = PARTITIONS[partition]['cluster']
if cluster is None:
cluster = PARTITIONS[partition]['cluster']

cache_cmd = [
'/usr/libexec/lmod/run_lmod_cache.py',
Expand All @@ -67,7 +71,7 @@ def submit_lmod_cache_job(partition, jobids_depend=None, *args, **kwargs):

logger.info(
"Refreshing Lmod cache on partition %s for architecture %s", partition or 'default', archdir or 'default')
ec, out = submit_job_script(job_file, cluster=cluster, *args, **kwargs)
ec, out = submit_job_script(job_file, cluster=cluster, **kwargs)

if ec != 0:
logger.error("Failed to submit Lmod cache job: %s", out)
Expand Down
2 changes: 1 addition & 1 deletion src/build_tools/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@author: Alex Domingo (Vrije Universiteit Brussel)
"""

VERSION = '3.0.0'
VERSION = '3.1.0'

AUTHOR = {
'wp': 'Ward Poelmans',
Expand Down
18 changes: 10 additions & 8 deletions src/build_tools/softinstall.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,28 +106,30 @@ def submit_job_script(job_file, sub_options='', cluster='hydra', local_exec=Fals
:param dry_run: print submit command
"""

submit_cmd = []
# switch to corresponding cluster and submit
submit_cmd = ["module --force purge"]
submit_cmd.append("module load cluster/%s" % cluster)
submit_cmd.append("sbatch --parsable %s %s" % (sub_options, job_file))
if cluster:
submit_cmd.append("module --force purge")
submit_cmd.append(f"module load cluster/{cluster}")

submit_cmd.append(f"sbatch --parsable {sub_options} {job_file}")
submit_cmd = " && ".join(submit_cmd)

if dry_run:
log_msg = "(DRY RUN) Job submission command: %s" % submit_cmd
log_msg = f"(DRY RUN) Job submission command: {submit_cmd}"
logger.info(log_msg)
ec, out = 0, log_msg
elif local_exec:
logger.debug("Local execution of job script: %s", job_file)
ec, out = RunLoopStdout.run("bash %s" % job_file)
ec, out = RunLoopStdout.run(f"bash {job_file}")
else:
logger.debug("Job submission command: %s", submit_cmd)
ec, out = RunNoShell.run('bash -c "%s"' % submit_cmd)
ec, out = RunNoShell.run(f'bash -c "{submit_cmd}"')

return ec, out


def submit_build_job(job_options, keep_job=False, *args, **kargs):
def submit_build_job(job_options, keep_job=False, **kwargs):
"""
Generate job script from BUILD_JOB template and submit it with Slurm to target cluster
:param job_options: dict with options to pass to job template
Expand All @@ -138,7 +140,7 @@ def submit_build_job(job_options, keep_job=False, *args, **kargs):
job_file = write_tempfile(job_script)
logger.debug("Job script written to %s", job_file)

ec, out = submit_job_script(job_file, *args, **kargs)
ec, out = submit_job_script(job_file, **kwargs)

if not keep_job:
try:
Expand Down
2 changes: 2 additions & 0 deletions tests/input/build_job_01.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ if [ -z $PREFIX_EB ]; then
fi

# set environment
export BUILD_TOOLS_LOAD_DUMMY_MODULES=1
export BUILD_TOOLS_RUN_LMOD_CACHE=1
export LANG=C
export PATH=$PREFIX_EB/easybuild-framework:$PATH
export PYTHONPATH=$PREFIX_EB/easybuild-easyconfigs:$PREFIX_EB/easybuild-easyblocks:$PREFIX_EB/easybuild-framework:$PREFIX_EB/vsc-base/lib
Expand Down
2 changes: 2 additions & 0 deletions tests/input/build_job_02.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ if [ -z $PREFIX_EB ]; then
fi

# set environment
export BUILD_TOOLS_LOAD_DUMMY_MODULES=1
export BUILD_TOOLS_RUN_LMOD_CACHE=
export LANG=C
export PATH=$PREFIX_EB/easybuild-framework:$PATH
export PYTHONPATH=$PREFIX_EB/easybuild-easyconfigs:$PREFIX_EB/easybuild-easyblocks:$PREFIX_EB/easybuild-framework:$PREFIX_EB/vsc-base/lib
Expand Down
2 changes: 1 addition & 1 deletion tests/test_lmodtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
def test_submit_lmod_cache_job(inputdir):
job_script = 'lmod_cache_job_01.sh'

_, out = submit_lmod_cache_job(jobids_depend=['123', '456'], partition='skylake_mpi', dry_run=True)
_, out = submit_lmod_cache_job('skylake_mpi', jobids_depend=['123', '456'], dry_run=True)

new_job = out.split(' ')[-1]
with open(new_job) as nj:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_softinstall.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def test_mk_job_name(test_name):
'eb_installpath': '/apps/brussel/${VSC_OS_LOCAL}/skylake',
'tmp': '/tmp/eb-test-build',
'postinstall': '',
'lmod_cache': '1',
}),
('build_job_02.sh', {
'job_name': 'test-job-gpu',
Expand All @@ -113,6 +114,7 @@ def test_mk_job_name(test_name):
'eb_installpath': '/apps/brussel/${VSC_OS_LOCAL}/zen2-ib',
'tmp': '/tmp/eb-test-build',
'postinstall': 'rsync src dest',
'lmod_cache': '',
}),
]
)
Expand All @@ -122,7 +124,7 @@ def test_submit_build_job(inputdir, test_job):
cluster = 'hydra'

ec, out = softinstall.submit_build_job(
job_options, True, sub_options, cluster=cluster, local_exec=False, dry_run=True
job_options, keep_job=True, sub_options=sub_options, cluster=cluster, local_exec=False, dry_run=True
)

new_job = out.split(' ')[-1]
Expand Down

0 comments on commit eec1073

Please sign in to comment.