-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Cray support: Added Cray XE/XK/XC support
Adapt PR #7 from Cray's @agontarek to the HEAD of the master branch; the actual contributor of this patch is therefore Cray's @agontarek. - ChangeLog - Added change notes. - config/x_ac_platform.m4 - Fixed bugs and updated support for Cray. - configure.ac - Added pkg-config check. Newer Cray systems use pkg-config to retrieve ALPS dependencies. - launchmon/src/linux/sdbg_linux_launchmon.cxx - Fixed a misplaced-brace bug and added support for Cray's on-demand MPIR proctable acquisition. - launchmon/src/rm_alps.conf - Updated Cray RM config. - launchmon/src/sdbg_rm_map.cxx - Added Cray support. - launchmon/src/sdbg_rm_map.hxx - Added Cray support. - tools/alps/src/Makefile.am - Updated Cray support. - tools/alps/src/alps_fe_colocator.cxx - Updated Cray support.
- Loading branch information
Showing
10 changed files
with
180 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
Dong H. Ahn | ||
Matt LeGendre | ||
Gregory Lee | ||
Dong H. Ahn (LLNL) | ||
Matt LeGendre (LLNL) | ||
Gregory Lee (LLNL) | ||
Andrew Gontarek (Cray Inc) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,15 @@ | ||
2016-05-05 Andrew Gontarek <[email protected]> | ||
* AUTHORS, | ||
config/x_ac_platform.m4, | ||
configure.ac, | ||
etc/rm_alps.conf, | ||
launchmon/src/linux/sdbg_linux_launchmon.cxx, | ||
launchmon/src/sdbg_rm_map.cxx, | ||
launchmon/src/sdbg_rm_map.hxx, | ||
tools/alps/src/Makefile.am, | ||
tools/alps/src/alps_fe_colocator.cxx: | ||
Added Cray XE/XK/XC support. | ||
|
||
2016-05-04 Dong H. Ahn <[email protected]> | ||
* git show --stat 42a9e 37e25 3b491 \ | ||
bc791 fecc9 dac19 81dcf 07417 39efd 6727: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
# -------------------------------------------------------------------------------- | ||
# | ||
# Update Log: | ||
# Feb 20 2015 [email protected]: Fixes for Cray systems. | ||
# Jun 11 2008 DHA: File created. | ||
# | ||
|
||
|
@@ -115,12 +116,24 @@ AC_DEFUN([X_AC_PLATFORM], [ | |
ac_target_rm="bgqrm" | ||
fi | ||
elif test "x$ac_target_os" = "xlinux" -a "x$ac_target_isa" = "xx86_64"; then | ||
if test ! -z "/use/bin/aprun" -a -f "/usr/bin/aprun"; then | ||
if test -f "/opt/cray/alps/default/bin/aprun"; then | ||
dnl This is the new OBS system | ||
AC_DEFINE(SUB_ARCH_ALPS,1,[Define 1 for SUB_ARCH_ALPS]) | ||
AC_DEFINE(RM_BE_STUB_CMD, "alps_be_starter", [be starter stub location]) | ||
AC_DEFINE(RM_FE_COLOC_CMD, "alps_fe_colocator", [bulk launcher location]) | ||
AC_SUBST(RMINC,"/usr/include/alps") | ||
AC_SUBST(RMLIB,"/usr/lib/alps/libalps.a") | ||
PKG_CHECK_MODULES([CRAY_ALPS], [cray-alps]) | ||
AC_SUBST(ARCHHEADER,"/") | ||
AC_SUBST(ARCHLIB,"/") | ||
ac_target_rm="alps" | ||
elif test -f "/usr/bin/aprun"; then | ||
dnl This is the old system. We hack things to work without pkg-config | ||
AC_DEFINE(SUB_ARCH_ALPS,1,[Define 1 for SUB_ARCH_ALPS]) | ||
AC_DEFINE(RM_BE_STUB_CMD, "alps_be_starter", [be starter stub location]) | ||
AC_DEFINE(RM_FE_COLOC_CMD, "alps_fe_colocator", [bulk launcher location]) | ||
AC_SUBST(CRAY_ALPS_CFLAGS,"-I/usr/include") | ||
AC_SUBST(CRAY_ALPS_LIBS,"-L/usr/lib/alps -lalps") | ||
AC_SUBST(ARCHHEADER,"/") | ||
AC_SUBST(ARCHLIB,"/") | ||
ac_target_rm="alps" | ||
else | ||
AC_SUBST(ARCHHEADER,"/") | ||
|
@@ -133,7 +146,7 @@ AC_DEFUN([X_AC_PLATFORM], [ | |
AC_DEFINE_UNQUOTED(TARGET_OS_ISA_STRING, "$ac_target_os-$ac_target_isa", [Define os-isa string]) | ||
AC_DEFINE_UNQUOTED(TARGET_RM_STRING, "$ac_target_rm" ,[Define rm string]) | ||
AM_CONDITIONAL([WITH_ALPS], [test "x$ac_target_rm" = "alps"]) | ||
AM_CONDITIONAL([WITH_ALPS], [test "x$ac_target_rm" = "xalps"]) | ||
AM_CONDITIONAL([WITH_CIOD], [test "x$ac_target_rm" = "xbglrm" \ | ||
-o "x$ac_target_rm" = "xbgprm" \ | ||
-o "x$ac_target_rm" = "xbgqrm"]) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ dnl ---------------------------------------------------------------------------- | |
dnl | ||
dnl Update Log: | ||
dnl Apr 28 2016 DHA: Modernize autotool support | ||
dnl Feb 20 2015 [email protected]: Added Cray support. | ||
dnl Apr 30 2014 DHA: Declare 1.0.1 -- this doesn't have Cray support though. | ||
dnl Apr 15 2014 DHA: Drop tools/cobo/test | ||
dnl Mar 10 2014 MPL: Add secure handshake | ||
|
@@ -78,6 +79,12 @@ AC_CONFIG_HEADER([config.h]) | |
dnl config.guess and config.sub must be distributed | ||
AC_CANONICAL_SYSTEM | ||
|
||
dnl | ||
dnl Check for pkg-config | ||
dnl | ||
PKG_PROG_PKG_CONFIG | ||
|
||
|
||
dnl ----------------------------------------------- | ||
dnl Automake support | ||
dnl ----------------------------------------------- | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
##-------------------------------------------------------------------------------- | ||
## | ||
## Update Log: | ||
## Feb 20 2015 [email protected]: Updated for XE/XK/XC systems | ||
## Oct 3 2011 DHA: Created file. | ||
## | ||
## | ||
|
@@ -45,12 +46,13 @@ | |
## | ||
|
||
RM=alps | ||
RM_MPIR=STD | ||
RM_MPIR=STD_CRAY | ||
RM_launcher=aprun | ||
RM_launcher=aprun.orig | ||
RM_launcher_id=RM_launcher|sym|run_aprun | ||
RM_launcher=alps_fe_colocator | ||
RM_launcher_id=RM_launcher|sym|AprunUsageMsg | ||
RM_jobid=RM_launcher|pid | ||
RM_launch_helper=alps_fe_colocator | ||
RM_signal_for_kill=SIGINT | ||
RM_fail_detection=false | ||
RM_launch_str=--be_starter=%b --apid=%j %d %o --lmonsharedsec=%s --lmonsecchk=%c | ||
RM_launch_str=--be_starter=%b --apid=%j --daemon=%d %o --lmonsharedsec=%s --lmonsecchk=%c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,8 @@ | |
*-------------------------------------------------------------------------------- | ||
* | ||
* Update Log: | ||
* Feb 20 2015 [email protected]: Added support for RMs that build the | ||
* proctable on demand. Fixed a misplaced brace bug. | ||
* Oct 26 2012 DHA: Removed catch clauses for deprecated thread tracers | ||
* exceptions. | ||
* Jul 31 2012 DHA: Added a fix for a thread race-related hang problem. | ||
|
@@ -848,7 +850,7 @@ linux_launchmon_t::acquire_proctable ( | |
// fetching the RPDTAB size | ||
// | ||
symbol_base_t<T_VA> debug_ps | ||
= main_im->get_a_symbol ( p.get_launch_proctable_size() ); | ||
= main_im->get_a_symbol ( p.get_launch_proctable_size() ); | ||
|
||
if (!debug_ps && p.get_myrmso_image()) | ||
{ | ||
|
@@ -866,10 +868,10 @@ linux_launchmon_t::acquire_proctable ( | |
set_pcount (local_pcount); | ||
if (get_pcount() <= 0 ) | ||
{ | ||
self_trace_t::trace ( | ||
true, | ||
MODULENAME, 1, | ||
"MPIR_proctable_size is negative"); | ||
self_trace_t::trace ( | ||
true, | ||
MODULENAME, 1, | ||
"MPIR_proctable_size is negative"); | ||
return false; | ||
} | ||
|
||
|
@@ -880,15 +882,15 @@ linux_launchmon_t::acquire_proctable ( | |
// perform separate read operations using those addresses. | ||
// | ||
MPIR_PROCDESC* launcher_proctable | ||
= (MPIR_PROCDESC *) malloc (sizeof (MPIR_PROCDESC) * get_pcount()); | ||
= (MPIR_PROCDESC *) malloc (sizeof (MPIR_PROCDESC) * get_pcount()); | ||
|
||
if (!launcher_proctable) | ||
{ | ||
self_trace_t::trace ( | ||
true, | ||
MODULENAME, | ||
1, | ||
"Out of memory!"); | ||
self_trace_t::trace ( | ||
true, | ||
MODULENAME, | ||
1, | ||
"Out of memory!"); | ||
|
||
return false; | ||
} | ||
|
@@ -920,72 +922,72 @@ linux_launchmon_t::acquire_proctable ( | |
// | ||
maxcount = (unsigned long long) get_pcount(); | ||
for ( i = 0; i < maxcount; ++i ) | ||
{ | ||
MPIR_PROCDESC_EXT* an_entry | ||
= (MPIR_PROCDESC_EXT* ) malloc(sizeof(MPIR_PROCDESC_EXT)); | ||
{ | ||
MPIR_PROCDESC_EXT* an_entry | ||
= (MPIR_PROCDESC_EXT* ) malloc(sizeof(MPIR_PROCDESC_EXT)); | ||
|
||
if (!an_entry) | ||
{ | ||
self_trace_t::trace ( | ||
true, | ||
MODULENAME, 1, | ||
"Out of memory!"); | ||
self_trace_t::trace ( | ||
true, | ||
MODULENAME, 1, | ||
"Out of memory!"); | ||
|
||
return false; | ||
} | ||
|
||
// | ||
// allocating storages for "an_entry" | ||
// | ||
an_entry->pd.host_name | ||
// | ||
// allocating storages for "an_entry" | ||
// | ||
an_entry->pd.host_name | ||
= (char*) malloc(MAX_STRING_SIZE); | ||
an_entry->pd.executable_name | ||
an_entry->pd.executable_name | ||
= (char*) malloc(MAX_STRING_SIZE); | ||
#if SUB_ARCH_BGQ | ||
an_entry->cnodeid | ||
an_entry->cnodeid | ||
= launcher_proctable[i].pid; | ||
#else | ||
an_entry->pd.pid | ||
an_entry->pd.pid | ||
= launcher_proctable[i].pid; | ||
an_entry->cnodeid = -1; | ||
an_entry->cnodeid = -1; | ||
#endif | ||
an_entry->mpirank = i; /* The mpi rank is the index into the global tab */ | ||
an_entry->mpirank = i; /* The mpi rank is the index into the global tab */ | ||
|
||
#if SUB_ARCH_BGQ | ||
an_entry->pd.pid = i; /* The mpi rank is the index into the global tab */ | ||
an_entry->pd.pid = i; /* The mpi rank is the index into the global tab */ | ||
#endif | ||
|
||
// | ||
// memory-fetching to get the "host_name" | ||
// | ||
get_tracer()->tracer_read_string(p, | ||
// memory-fetching to get the "host_name" | ||
// | ||
get_tracer()->tracer_read_string(p, | ||
(T_VA) launcher_proctable[i].host_name, | ||
(void*) (an_entry->pd.host_name), | ||
MAX_STRING_SIZE, | ||
use_cxt ); | ||
|
||
// | ||
// memory-fetching to get the "executable name" | ||
// | ||
get_tracer()->tracer_read_string(p, | ||
(T_VA)launcher_proctable[i].executable_name, | ||
// | ||
// memory-fetching to get the "executable name" | ||
// | ||
get_tracer()->tracer_read_string(p, | ||
(T_VA)launcher_proctable[i].executable_name, | ||
(void*)an_entry->pd.executable_name, | ||
MAX_STRING_SIZE, | ||
use_cxt ); | ||
|
||
get_proctable_copy()[an_entry->pd.host_name].push_back(an_entry); | ||
} | ||
get_proctable_copy()[an_entry->pd.host_name].push_back(an_entry); | ||
} | ||
|
||
free ( launcher_proctable ); | ||
|
||
if ( get_proctable_copy().empty() ) | ||
{ | ||
self_trace_t::trace ( LEVELCHK(level1), | ||
MODULENAME, 1, | ||
"proctable is empty!"); | ||
{ | ||
self_trace_t::trace ( LEVELCHK(level1), | ||
MODULENAME, 1, | ||
"proctable is empty!"); | ||
|
||
return LAUNCHMON_FAILED; | ||
} | ||
return false; | ||
} | ||
|
||
if (p.get_myopts()->get_my_rmconfig()->is_rid_sup()) | ||
{ | ||
|
@@ -1027,13 +1029,13 @@ linux_launchmon_t::acquire_proctable ( | |
// -1 is the init value that SLURM sets internally | ||
// for "totalview_jobid" | ||
if ( get_resid() == -1 ) | ||
{ | ||
self_trace_t::trace ( LEVELCHK(level1), | ||
MODULENAME, 1, | ||
"resource ID is not valid!"); | ||
{ | ||
self_trace_t::trace ( LEVELCHK(level1), | ||
MODULENAME, 1, | ||
"resource ID is not valid!"); | ||
|
||
return LAUNCHMON_FAILED; | ||
} | ||
return false; | ||
} | ||
} | ||
else if (r_mgr.get_job_id().dtype == integer32) | ||
{ | ||
|
@@ -1048,14 +1050,13 @@ linux_launchmon_t::acquire_proctable ( | |
set_resid(int_val); | ||
p.set_rid(get_resid()); | ||
} | ||
} | ||
else if (p.get_myopts()->get_my_rmconfig()->is_rid_via_pid()) | ||
{ | ||
set_resid (p.get_pid(false)); | ||
p.set_rid (get_resid()); | ||
} | ||
} | ||
} | ||
|
||
|
||
#if MEASURE_TRACING_COST | ||
c_end_ts = gettimeofdayD(); | ||
|
@@ -1067,7 +1068,7 @@ linux_launchmon_t::acquire_proctable ( | |
} | ||
#endif | ||
|
||
return LAUNCHMON_OK; | ||
return true; | ||
} | ||
catch ( symtab_exception_t e ) | ||
{ | ||
|
@@ -2011,28 +2012,39 @@ linux_launchmon_t::handle_trap_after_attach_event ( | |
else | ||
{ | ||
// | ||
// Without MPIR Colocation service, you would have | ||
// Proctable available on attach and you would be | ||
// ready to launch daemon at this point | ||
// | ||
// | ||
acquire_proctable ( p, use_cxt ); | ||
ship_proctab_msg ( lmonp_proctable_avail ); | ||
ship_resourcehandle_msg ( lmonp_resourcehandle_avail, get_resid() ); | ||
ship_rminfo_msg ( lmonp_rminfo, | ||
(int) p.get_pid(false), | ||
p.rmgr()->get_resource_manager().get_rm()); | ||
say_fetofe_msg ( lmonp_stop_at_first_attach ); | ||
launch_tool_daemons(p); | ||
get_tracer()->tracer_continue (p, use_cxt); | ||
} | ||
// Some RMs will build the Proctable on demand. If so, continue until | ||
// the MPIR_Breakpoint is hit. | ||
// | ||
if (p.rmgr()->is_cont_on_att()) | ||
{ | ||
get_tracer()->tracer_continue (p, use_cxt); | ||
} | ||
else | ||
{ | ||
// | ||
// Without MPIR Colocation service, you would have | ||
// Proctable available on attach and you would be | ||
// ready to launch daemon at this point | ||
// | ||
// | ||
acquire_proctable ( p, use_cxt ); | ||
ship_proctab_msg ( lmonp_proctable_avail ); | ||
ship_resourcehandle_msg ( lmonp_resourcehandle_avail, get_resid() ); | ||
ship_rminfo_msg ( lmonp_rminfo, | ||
(int) p.get_pid(false), | ||
p.rmgr()->get_resource_manager().get_rm()); | ||
say_fetofe_msg ( lmonp_stop_at_first_attach ); | ||
launch_tool_daemons(p); | ||
get_tracer()->tracer_continue (p, use_cxt); | ||
} | ||
} | ||
|
||
{ | ||
self_trace_t::trace ( | ||
self_trace_t::trace ( | ||
LEVELCHK(level2), | ||
MODULENAME, | ||
MODULENAME, | ||
0, | ||
"trap after attach event handler completed."); | ||
"trap after attach event handler completed."); | ||
} | ||
|
||
#if MEASURE_TRACING_COST | ||
|
Oops, something went wrong.