From bd817a6c65ba437301072630ede9147452b64c6d Mon Sep 17 00:00:00 2001 From: "Dong H. Ahn" Date: Fri, 6 May 2016 14:39:52 -0700 Subject: [PATCH] Cray support: Added Cray XE/XK/XC support Adapt Cray's @agontarek PR #7 to the HEAD of the master branch. Thus, the contributor of this patch is actually Cray's @agontarek. - ChangeLog - Added change notes. - config/x_ac_platform.m4 - Fixed bugs/added updated support for Cray. - configure.ac - Added pkg-config check. Newer Cray systems use pkg-config to retrieve alps dependencies. - launchmon/src/linux/sdbg_linux_launchmon.cxx - Fixed misplaced brace bug, added support for Cray on demand MPIR proctable acquisition. - etc/rm_alps.conf - Updated Cray RM config. - launchmon/src/sdbg_rm_map.cxx - Added Cray support. - launchmon/src/sdbg_rm_map.hxx - Added Cray support. - tools/alps/src/Makefile.am - Updated Cray support. - tools/alps/src/alps_fe_colocator.cxx - Updated Cray support. --- AUTHORS | 7 +- ChangeLog | 12 ++ config/x_ac_platform.m4 | 21 ++- configure.ac | 7 + etc/rm_alps.conf | 8 +- launchmon/src/linux/sdbg_linux_launchmon.cxx | 154 ++++++++++--------- launchmon/src/sdbg_rm_map.cxx | 40 ++++- launchmon/src/sdbg_rm_map.hxx | 4 + tools/alps/src/Makefile.am | 12 +- tools/alps/src/alps_fe_colocator.cxx | 6 +- 10 files changed, 180 insertions(+), 91 deletions(-) diff --git a/AUTHORS b/AUTHORS index 6e0e32b..dc391e7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,4 @@ -Dong H. Ahn -Matt LeGendre -Gregory Lee +Dong H. 
Ahn (LLNL) +Matt LeGendre (LLNL) +Gregory Lee (LLNL) +Andrew Gontarek (Cray Inc) diff --git a/ChangeLog b/ChangeLog index 24da4a3..106fa4b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2016-05-05 Andrew Gontarek + * AUTHORS, + config/x_ac_platform.m4, + configure.ac, + etc/rm_alps.conf, + launchmon/src/linux/sdbg_linux_launchmon.cxx, + launchmon/src/sdbg_rm_map.cxx, + launchmon/src/sdbg_rm_map.hxx, + tools/alps/src/Makefile.am, + tools/alps/src/alps_fe_colocator.cxx: + Added Cray XE/XK/XC support. + 2016-05-04 Dong H. Ahn * git show --stat 42a9e 37e25 3b491 \ bc791 fecc9 dac19 81dcf 07417 39efd 6727: diff --git a/config/x_ac_platform.m4 b/config/x_ac_platform.m4 index cff9552..e9c2818 100644 --- a/config/x_ac_platform.m4 +++ b/config/x_ac_platform.m4 @@ -28,6 +28,7 @@ # -------------------------------------------------------------------------------- # # Update Log: +# Feb 20 2015 andrewg@cray.com: Fixes for Cray systems. # Jun 11 2008 DHA: File created. # @@ -115,12 +116,24 @@ AC_DEFUN([X_AC_PLATFORM], [ ac_target_rm="bgqrm" fi elif test "x$ac_target_os" = "xlinux" -a "x$ac_target_isa" = "xx86_64"; then - if test ! -z "/use/bin/aprun" -a -f "/usr/bin/aprun"; then + if test -f "/opt/cray/alps/default/bin/aprun"; then + dnl This is the new OBS system AC_DEFINE(SUB_ARCH_ALPS,1,[Define 1 for SUB_ARCH_ALPS]) AC_DEFINE(RM_BE_STUB_CMD, "alps_be_starter", [be starter stub location]) AC_DEFINE(RM_FE_COLOC_CMD, "alps_fe_colocator", [bulk launcher location]) - AC_SUBST(RMINC,"/usr/include/alps") - AC_SUBST(RMLIB,"/usr/lib/alps/libalps.a") + PKG_CHECK_MODULES([CRAY_ALPS], [cray-alps]) + AC_SUBST(ARCHHEADER,"/") + AC_SUBST(ARCHLIB,"/") + ac_target_rm="alps" + elif test -f "/usr/bin/aprun"; then + dnl This is the old system. 
We hack things to work without pkg-config + AC_DEFINE(SUB_ARCH_ALPS,1,[Define 1 for SUB_ARCH_ALPS]) + AC_DEFINE(RM_BE_STUB_CMD, "alps_be_starter", [be starter stub location]) + AC_DEFINE(RM_FE_COLOC_CMD, "alps_fe_colocator", [bulk launcher location]) + AC_SUBST(CRAY_ALPS_CFLAGS,"-I/usr/include") + AC_SUBST(CRAY_ALPS_LIBS,"-L/usr/lib/alps -lalps") + AC_SUBST(ARCHHEADER,"/") + AC_SUBST(ARCHLIB,"/") ac_target_rm="alps" else AC_SUBST(ARCHHEADER,"/") @@ -133,7 +146,7 @@ AC_DEFUN([X_AC_PLATFORM], [ AC_DEFINE_UNQUOTED(TARGET_OS_ISA_STRING, "$ac_target_os-$ac_target_isa", [Define os-isa string]) AC_DEFINE_UNQUOTED(TARGET_RM_STRING, "$ac_target_rm" ,[Define rm string]) - AM_CONDITIONAL([WITH_ALPS], [test "x$ac_target_rm" = "alps"]) + AM_CONDITIONAL([WITH_ALPS], [test "x$ac_target_rm" = "xalps"]) AM_CONDITIONAL([WITH_CIOD], [test "x$ac_target_rm" = "xbglrm" \ -o "x$ac_target_rm" = "xbgprm" \ -o "x$ac_target_rm" = "xbgqrm"]) diff --git a/configure.ac b/configure.ac index 0295587..a4abc84 100644 --- a/configure.ac +++ b/configure.ac @@ -29,6 +29,7 @@ dnl ---------------------------------------------------------------------------- dnl dnl Update Log: dnl Apr 28 2016 DHA: Modernize autotool support +dnl Feb 20 2015 andrewg@cray.com: Added Cray support. dnl Apr 30 2014 DHA: Declare 1.0.1 -- this doesn't have Cray support though. 
dnl Apr 15 2014 DHA: Drop tools/cobo/test dnl Mar 10 2014 MPL: Add secure handshake @@ -78,6 +79,12 @@ AC_CONFIG_HEADER([config.h]) dnl config.guess and config.sub must be distributed AC_CANONICAL_SYSTEM +dnl +dnl Check for pkg-config +dnl +PKG_PROG_PKG_CONFIG + + dnl ----------------------------------------------- dnl Automake support dnl ----------------------------------------------- diff --git a/etc/rm_alps.conf b/etc/rm_alps.conf index 053aecc..b290e1b 100644 --- a/etc/rm_alps.conf +++ b/etc/rm_alps.conf @@ -28,6 +28,7 @@ ##-------------------------------------------------------------------------------- ## ## Update Log: +## Feb 20 2015 andrewg@cray.com: Updated for XE/XK/XC systems ## Oct 3 2011 DHA: Created file. ## ## @@ -45,12 +46,13 @@ ## RM=alps -RM_MPIR=STD +RM_MPIR=STD_CRAY RM_launcher=aprun RM_launcher=aprun.orig -RM_launcher_id=RM_launcher|sym|run_aprun +RM_launcher=alps_fe_colocator +RM_launcher_id=RM_launcher|sym|AprunUsageMsg RM_jobid=RM_launcher|pid RM_launch_helper=alps_fe_colocator RM_signal_for_kill=SIGINT RM_fail_detection=false -RM_launch_str=--be_starter=%b --apid=%j %d %o --lmonsharedsec=%s --lmonsecchk=%c +RM_launch_str=--be_starter=%b --apid=%j --daemon=%d %o --lmonsharedsec=%s --lmonsecchk=%c diff --git a/launchmon/src/linux/sdbg_linux_launchmon.cxx b/launchmon/src/linux/sdbg_linux_launchmon.cxx index 3cedee9..d95b4d5 100644 --- a/launchmon/src/linux/sdbg_linux_launchmon.cxx +++ b/launchmon/src/linux/sdbg_linux_launchmon.cxx @@ -26,6 +26,8 @@ *-------------------------------------------------------------------------------- * * Update Log: + * Feb 20 2015 andrewg@cray.com: Added support for RMs that build the + * proctable on demand. Fixed a misplaced brace bug. * Oct 26 2012 DHA: Removed catch clauses for deprecated thread tracers * exceptions. * Jul 31 2012 DHA: Added a fix for a thread race-related hang problem. 
@@ -848,7 +850,7 @@ linux_launchmon_t::acquire_proctable ( // fetching the RPDTAB size // symbol_base_t debug_ps - = main_im->get_a_symbol ( p.get_launch_proctable_size() ); + = main_im->get_a_symbol ( p.get_launch_proctable_size() ); if (!debug_ps && p.get_myrmso_image()) { @@ -866,10 +868,10 @@ linux_launchmon_t::acquire_proctable ( set_pcount (local_pcount); if (get_pcount() <= 0 ) { - self_trace_t::trace ( - true, - MODULENAME, 1, - "MPIR_proctable_size is negative"); + self_trace_t::trace ( + true, + MODULENAME, 1, + "MPIR_proctable_size is negative"); return false; } @@ -880,15 +882,15 @@ linux_launchmon_t::acquire_proctable ( // perform separate read operations using those addresses. // MPIR_PROCDESC* launcher_proctable - = (MPIR_PROCDESC *) malloc (sizeof (MPIR_PROCDESC) * get_pcount()); + = (MPIR_PROCDESC *) malloc (sizeof (MPIR_PROCDESC) * get_pcount()); if (!launcher_proctable) { - self_trace_t::trace ( - true, - MODULENAME, - 1, - "Out of memory!"); + self_trace_t::trace ( + true, + MODULENAME, + 1, + "Out of memory!"); return false; } @@ -920,72 +922,72 @@ linux_launchmon_t::acquire_proctable ( // maxcount = (unsigned long long) get_pcount(); for ( i = 0; i < maxcount; ++i ) - { - MPIR_PROCDESC_EXT* an_entry - = (MPIR_PROCDESC_EXT* ) malloc(sizeof(MPIR_PROCDESC_EXT)); + { + MPIR_PROCDESC_EXT* an_entry + = (MPIR_PROCDESC_EXT* ) malloc(sizeof(MPIR_PROCDESC_EXT)); if (!an_entry) { - self_trace_t::trace ( - true, - MODULENAME, 1, - "Out of memory!"); + self_trace_t::trace ( + true, + MODULENAME, 1, + "Out of memory!"); return false; } - // - // allocating storages for "an_entry" - // - an_entry->pd.host_name + // + // allocating storages for "an_entry" + // + an_entry->pd.host_name = (char*) malloc(MAX_STRING_SIZE); - an_entry->pd.executable_name + an_entry->pd.executable_name = (char*) malloc(MAX_STRING_SIZE); #if SUB_ARCH_BGQ - an_entry->cnodeid + an_entry->cnodeid = launcher_proctable[i].pid; #else - an_entry->pd.pid + an_entry->pd.pid = 
launcher_proctable[i].pid; - an_entry->cnodeid = -1; + an_entry->cnodeid = -1; #endif - an_entry->mpirank = i; /* The mpi rank is the index into the global tab */ + an_entry->mpirank = i; /* The mpi rank is the index into the global tab */ #if SUB_ARCH_BGQ - an_entry->pd.pid = i; /* The mpi rank is the index into the global tab */ + an_entry->pd.pid = i; /* The mpi rank is the index into the global tab */ #endif - // - // memory-fetching to get the "host_name" // - get_tracer()->tracer_read_string(p, + // memory-fetching to get the "host_name" + // + get_tracer()->tracer_read_string(p, (T_VA) launcher_proctable[i].host_name, (void*) (an_entry->pd.host_name), MAX_STRING_SIZE, use_cxt ); - // - // memory-fetching to get the "executable name" - // - get_tracer()->tracer_read_string(p, - (T_VA)launcher_proctable[i].executable_name, + // + // memory-fetching to get the "executable name" + // + get_tracer()->tracer_read_string(p, + (T_VA)launcher_proctable[i].executable_name, (void*)an_entry->pd.executable_name, MAX_STRING_SIZE, use_cxt ); - get_proctable_copy()[an_entry->pd.host_name].push_back(an_entry); - } + get_proctable_copy()[an_entry->pd.host_name].push_back(an_entry); + } free ( launcher_proctable ); if ( get_proctable_copy().empty() ) - { - self_trace_t::trace ( LEVELCHK(level1), - MODULENAME, 1, - "proctable is empty!"); + { + self_trace_t::trace ( LEVELCHK(level1), + MODULENAME, 1, + "proctable is empty!"); - return LAUNCHMON_FAILED; - } + return false; + } if (p.get_myopts()->get_my_rmconfig()->is_rid_sup()) { @@ -1027,13 +1029,13 @@ linux_launchmon_t::acquire_proctable ( // -1 is the init value that SLURM sets internally // for "totalview_jobid" if ( get_resid() == -1 ) - { - self_trace_t::trace ( LEVELCHK(level1), - MODULENAME, 1, - "resource ID is not valid!"); + { + self_trace_t::trace ( LEVELCHK(level1), + MODULENAME, 1, + "resource ID is not valid!"); - return LAUNCHMON_FAILED; - } + return false; + } } else if (r_mgr.get_job_id().dtype == integer32) { 
@@ -1048,14 +1050,13 @@ linux_launchmon_t::acquire_proctable ( set_resid(int_val); p.set_rid(get_resid()); } + } else if (p.get_myopts()->get_my_rmconfig()->is_rid_via_pid()) { set_resid (p.get_pid(false)); p.set_rid (get_resid()); } } - } - #if MEASURE_TRACING_COST c_end_ts = gettimeofdayD(); @@ -1067,7 +1068,7 @@ linux_launchmon_t::acquire_proctable ( } #endif - return LAUNCHMON_OK; + return true; } catch ( symtab_exception_t e ) { @@ -2011,28 +2012,39 @@ linux_launchmon_t::handle_trap_after_attach_event ( else { // - // Without MPIR Colocation service, you would have - // Proctable available on attach and you would be - // ready to launch daemon at this point - // - // - acquire_proctable ( p, use_cxt ); - ship_proctab_msg ( lmonp_proctable_avail ); - ship_resourcehandle_msg ( lmonp_resourcehandle_avail, get_resid() ); - ship_rminfo_msg ( lmonp_rminfo, - (int) p.get_pid(false), - p.rmgr()->get_resource_manager().get_rm()); - say_fetofe_msg ( lmonp_stop_at_first_attach ); - launch_tool_daemons(p); - get_tracer()->tracer_continue (p, use_cxt); - } + // Some RMs will build the Proctable on demand. If so, continue until + // the MPIR_Breakpoint is hit. 
+ // + if (p.rmgr()->is_cont_on_att()) + { + get_tracer()->tracer_continue (p, use_cxt); + } + else + { + // + // Without MPIR Colocation service, you would have + // Proctable available on attach and you would be + // ready to launch daemon at this point + // + // + acquire_proctable ( p, use_cxt ); + ship_proctab_msg ( lmonp_proctable_avail ); + ship_resourcehandle_msg ( lmonp_resourcehandle_avail, get_resid() ); + ship_rminfo_msg ( lmonp_rminfo, + (int) p.get_pid(false), + p.rmgr()->get_resource_manager().get_rm()); + say_fetofe_msg ( lmonp_stop_at_first_attach ); + launch_tool_daemons(p); + get_tracer()->tracer_continue (p, use_cxt); + } + } { - self_trace_t::trace ( + self_trace_t::trace ( LEVELCHK(level2), - MODULENAME, + MODULENAME, 0, - "trap after attach event handler completed."); + "trap after attach event handler completed."); } #if MEASURE_TRACING_COST diff --git a/launchmon/src/sdbg_rm_map.cxx b/launchmon/src/sdbg_rm_map.cxx index 9d72a64..ba80033 100644 --- a/launchmon/src/sdbg_rm_map.cxx +++ b/launchmon/src/sdbg_rm_map.cxx @@ -26,6 +26,9 @@ *-------------------------------------------------------------------------------- * * Update Log: + * Feb 20 2015 andrewg@cray.com: Added support for RMs that build the + * proctable on demand. Checks added for launch helpers + * that are included with launchmon. * Oct 06 2011 DHA: Created file. */ @@ -259,6 +262,10 @@ resource_manager_t::fill_mpir_type(const std::string &v) { mpir = standard; } + else if (v == std::string("STD_CRAY")) + { + mpir = x_cray; + } else if (v == std::string("STD_COLOC")) { mpir = x_coloc; @@ -436,7 +443,31 @@ resource_manager_t::fill_launch_helper(const std::string &v) else { launch_helper.launch_method = launch_helper_method; - launch_helper.launcher_command = v; + // Some RMs will package their launch helper with launchmon. If v is not + // found, then we will test to see if it is included with launchmon. 
+ std::string v2 = v; + if (access(v.c_str(), R_OK) < 0) + { + char *pref; + // + // FIXME: DHA May 6 2016. This probably won't work with the + // new in-tree testing support introduced with PR #12. + // + if (pref = getenv("LMON_PREFIX")) + { + v2.clear(); + v2 = std::string(pref) + std::string("/bin/") + v; + // Check for packaged launch helper, if it is not found then we + // assume that v is found in PATH. + if (access(v2.c_str(), R_OK) < 0) + { + // reset back to v, it is in PATH. + v2.clear(); + v2 = v; + } + } + } + launch_helper.launcher_command = v2; } } @@ -799,6 +830,13 @@ rc_rm_t::is_attfifo_sup() } +bool +rc_rm_t::is_cont_on_att() +{ + return (resource_manager.get_mpir() == x_cray); +} + + bool rc_rm_t::is_rid_sup() { diff --git a/launchmon/src/sdbg_rm_map.hxx b/launchmon/src/sdbg_rm_map.hxx index 6c48745..34625c2 100644 --- a/launchmon/src/sdbg_rm_map.hxx +++ b/launchmon/src/sdbg_rm_map.hxx @@ -26,6 +26,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * Feb 20 2015 andrewg@cray.com: Added Cray support. 
* Dec 07 2012 DHA: init_rm_instance returns a meaningful return code * Oct 06 2011 DHA: Restructuring to support * runtime detection of target resource managers @@ -62,6 +63,7 @@ enum mpir_catalogue_e { standard, + x_cray, x_coloc, x_fifo, x_coloc_fifo, @@ -288,6 +290,8 @@ public: bool is_attfifo_sup(); + bool is_cont_on_att(); + bool is_rid_sup(); bool is_rid_via_symbol(); diff --git a/tools/alps/src/Makefile.am b/tools/alps/src/Makefile.am index d29b48b..52787fc 100644 --- a/tools/alps/src/Makefile.am +++ b/tools/alps/src/Makefile.am @@ -28,6 +28,7 @@ ##-------------------------------------------------------------------------------- ## ## Update Log: +## May 05 2016 DHA: Adapt @agontarek's Cray port patch ## Apr 29 2016 DHA: Mondernize build support ## Nov 23 2010 DHA: Changed bin_SCRIPTS to etc_SCRIPTS for CLE dso list ## Nov 04 2010 DHA: Added lib_SCRIPTS @@ -36,16 +37,15 @@ ## AM_CPPFLAGS = \ - -I$(top_srcdir) \ - -I$(top_srcdir)/launchmon/src \ - -I$(top_srcdir)/@LMONAPILOC@ \ - -I@RMINC@ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/launchmon/src \ + -I$(abs_top_srcdir)/@LMONAPILOC@ bin_PROGRAMS = alps_be_starter alps_fe_colocator alps_be_starter_SOURCES = alps_be_starter.c alps_fe_colocator_SOURCES = alps_fe_colocator.cxx -alps_fe_colocator_CXXFLAGS = @LNCHR_BIT_FLAGS@ -alps_fe_colocator_LDADD = @LIBELF@ @RMLIB@ +alps_fe_colocator_CXXFLAGS = @LNCHR_BIT_FLAGS@ $(CRAY_ALPS_CFLAGS) +alps_fe_colocator_LDADD = @LIBELF@ $(CRAY_ALPS_LIBS) alps_be_starter$(EXEEXT): alps_be_starter.c Makefile $(MPICC) $(DEFS) $(DEFAULT_INCLUDES) $(AM_CFLAGS) $(CFLAGS) -o $@ $< diff --git a/tools/alps/src/alps_fe_colocator.cxx b/tools/alps/src/alps_fe_colocator.cxx index 3657859..58f128a 100644 --- a/tools/alps/src/alps_fe_colocator.cxx +++ b/tools/alps/src/alps_fe_colocator.cxx @@ -26,6 +26,7 @@ *-------------------------------------------------------------------------------- * * Update Log: + * May 05 2016 DHA: Applied @agontarek's Cray patch * Nov 05 2010 DHA: Support for 
excluding existing system DSOs from * being broadcast. Credit to Andrew Gontarek at Cray * for providing DSO list for CLE3.1 and CLE2.2 @@ -39,7 +40,6 @@ extern "C" { # include -# include # include # include # include @@ -60,8 +60,8 @@ extern "C" { # else # error libelf.h is required #endif -# include "apInfo.h" -# include "libalps.h" +# include "alps/apInfo.h" +# include "alps/libalps.h" extern char *alpsGetMyNid(int *nid); }