Skip to content

Commit

Permalink
Merge pull request #20 from dongahn/adapt_cray_pr
Browse files Browse the repository at this point in the history
Cray support: Added Cray XE/XK/XC support
  • Loading branch information
lee218llnl committed May 16, 2016
2 parents 172bba2 + bd817a6 commit 0cae76f
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 91 deletions.
7 changes: 4 additions & 3 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Dong H. Ahn
Matt LeGendre
Gregory Lee
Dong H. Ahn (LLNL)
Matt LeGendre (LLNL)
Gregory Lee (LLNL)
Andrew Gontarek (Cray Inc)
12 changes: 12 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
2016-05-05 Andrew Gontarek <[email protected]>
* AUTHORS,
config/x_ac_platform.m4,
configure.ac,
etc/rm_alps.conf,
launchmon/src/linux/sdbg_linux_launchmon.cxx,
launchmon/src/sdbg_rm_map.cxx,
launchmon/src/sdbg_rm_map.hxx,
tools/alps/src/Makefile.am,
tools/alps/src/alps_fe_colocator.cxx:
Added Cray XE/XK/XC support.

2016-05-04 Dong H. Ahn <[email protected]>
* git show --stat 42a9e 37e25 3b491 \
bc791 fecc9 dac19 81dcf 07417 39efd 6727:
Expand Down
21 changes: 17 additions & 4 deletions config/x_ac_platform.m4
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# --------------------------------------------------------------------------------
#
# Update Log:
# Feb 20 2015 [email protected]: Fixes for Cray systems.
# Jun 11 2008 DHA: File created.
#

Expand Down Expand Up @@ -115,12 +116,24 @@ AC_DEFUN([X_AC_PLATFORM], [
ac_target_rm="bgqrm"
fi
elif test "x$ac_target_os" = "xlinux" -a "x$ac_target_isa" = "xx86_64"; then
if test ! -z "/use/bin/aprun" -a -f "/usr/bin/aprun"; then
if test -f "/opt/cray/alps/default/bin/aprun"; then
dnl This is the new OBS system
AC_DEFINE(SUB_ARCH_ALPS,1,[Define 1 for SUB_ARCH_ALPS])
AC_DEFINE(RM_BE_STUB_CMD, "alps_be_starter", [be starter stub location])
AC_DEFINE(RM_FE_COLOC_CMD, "alps_fe_colocator", [bulk launcher location])
AC_SUBST(RMINC,"/usr/include/alps")
AC_SUBST(RMLIB,"/usr/lib/alps/libalps.a")
PKG_CHECK_MODULES([CRAY_ALPS], [cray-alps])
AC_SUBST(ARCHHEADER,"/")
AC_SUBST(ARCHLIB,"/")
ac_target_rm="alps"
elif test -f "/usr/bin/aprun"; then
dnl This is the old system. We hack things to work without pkg-config
AC_DEFINE(SUB_ARCH_ALPS,1,[Define 1 for SUB_ARCH_ALPS])
AC_DEFINE(RM_BE_STUB_CMD, "alps_be_starter", [be starter stub location])
AC_DEFINE(RM_FE_COLOC_CMD, "alps_fe_colocator", [bulk launcher location])
AC_SUBST(CRAY_ALPS_CFLAGS,"-I/usr/include")
AC_SUBST(CRAY_ALPS_LIBS,"-L/usr/lib/alps -lalps")
AC_SUBST(ARCHHEADER,"/")
AC_SUBST(ARCHLIB,"/")
ac_target_rm="alps"
else
AC_SUBST(ARCHHEADER,"/")
Expand All @@ -133,7 +146,7 @@ AC_DEFUN([X_AC_PLATFORM], [
AC_DEFINE_UNQUOTED(TARGET_OS_ISA_STRING, "$ac_target_os-$ac_target_isa", [Define os-isa string])
AC_DEFINE_UNQUOTED(TARGET_RM_STRING, "$ac_target_rm" ,[Define rm string])
AM_CONDITIONAL([WITH_ALPS], [test "x$ac_target_rm" = "alps"])
AM_CONDITIONAL([WITH_ALPS], [test "x$ac_target_rm" = "xalps"])
AM_CONDITIONAL([WITH_CIOD], [test "x$ac_target_rm" = "xbglrm" \
-o "x$ac_target_rm" = "xbgprm" \
-o "x$ac_target_rm" = "xbgqrm"])
Expand Down
7 changes: 7 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dnl ----------------------------------------------------------------------------
dnl
dnl Update Log:
dnl Apr 28 2016 DHA: Modernize autotool support
dnl Feb 20 2015 [email protected]: Added Cray support.
dnl Apr 30 2014 DHA: Declare 1.0.1 -- this doesn't have Cray support though.
dnl Apr 15 2014 DHA: Drop tools/cobo/test
dnl Mar 10 2014 MPL: Add secure handshake
Expand Down Expand Up @@ -78,6 +79,12 @@ AC_CONFIG_HEADER([config.h])
dnl config.guess and config.sub must be distributed
AC_CANONICAL_SYSTEM

dnl
dnl Check for pkg-config
dnl
PKG_PROG_PKG_CONFIG


dnl -----------------------------------------------
dnl Automake support
dnl -----------------------------------------------
Expand Down
8 changes: 5 additions & 3 deletions etc/rm_alps.conf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
##--------------------------------------------------------------------------------
##
## Update Log:
## Feb 20 2015 [email protected]: Updated for XE/XK/XC systems
## Oct 3 2011 DHA: Created file.
##
##
Expand All @@ -45,12 +46,13 @@
##

RM=alps
RM_MPIR=STD
RM_MPIR=STD_CRAY
RM_launcher=aprun
RM_launcher=aprun.orig
RM_launcher_id=RM_launcher|sym|run_aprun
RM_launcher=alps_fe_colocator
RM_launcher_id=RM_launcher|sym|AprunUsageMsg
RM_jobid=RM_launcher|pid
RM_launch_helper=alps_fe_colocator
RM_signal_for_kill=SIGINT
RM_fail_detection=false
RM_launch_str=--be_starter=%b --apid=%j %d %o --lmonsharedsec=%s --lmonsecchk=%c
RM_launch_str=--be_starter=%b --apid=%j --daemon=%d %o --lmonsharedsec=%s --lmonsecchk=%c
154 changes: 83 additions & 71 deletions launchmon/src/linux/sdbg_linux_launchmon.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
*--------------------------------------------------------------------------------
*
* Update Log:
* Feb 20 2015 [email protected]: Added support for RMs that build the
* proctable on demand. Fixed a misplaced brace bug.
* Oct 26 2012 DHA: Removed catch clauses for deprecated thread tracers
* exceptions.
* Jul 31 2012 DHA: Added a fix for a thread race-related hang problem.
Expand Down Expand Up @@ -848,7 +850,7 @@ linux_launchmon_t::acquire_proctable (
// fetching the RPDTAB size
//
symbol_base_t<T_VA> debug_ps
= main_im->get_a_symbol ( p.get_launch_proctable_size() );
= main_im->get_a_symbol ( p.get_launch_proctable_size() );

if (!debug_ps && p.get_myrmso_image())
{
Expand All @@ -866,10 +868,10 @@ linux_launchmon_t::acquire_proctable (
set_pcount (local_pcount);
if (get_pcount() <= 0 )
{
self_trace_t::trace (
true,
MODULENAME, 1,
"MPIR_proctable_size is negative");
self_trace_t::trace (
true,
MODULENAME, 1,
"MPIR_proctable_size is negative");
return false;
}

Expand All @@ -880,15 +882,15 @@ linux_launchmon_t::acquire_proctable (
// perform separate read operations using those addresses.
//
MPIR_PROCDESC* launcher_proctable
= (MPIR_PROCDESC *) malloc (sizeof (MPIR_PROCDESC) * get_pcount());
= (MPIR_PROCDESC *) malloc (sizeof (MPIR_PROCDESC) * get_pcount());

if (!launcher_proctable)
{
self_trace_t::trace (
true,
MODULENAME,
1,
"Out of memory!");
self_trace_t::trace (
true,
MODULENAME,
1,
"Out of memory!");

return false;
}
Expand Down Expand Up @@ -920,72 +922,72 @@ linux_launchmon_t::acquire_proctable (
//
maxcount = (unsigned long long) get_pcount();
for ( i = 0; i < maxcount; ++i )
{
MPIR_PROCDESC_EXT* an_entry
= (MPIR_PROCDESC_EXT* ) malloc(sizeof(MPIR_PROCDESC_EXT));
{
MPIR_PROCDESC_EXT* an_entry
= (MPIR_PROCDESC_EXT* ) malloc(sizeof(MPIR_PROCDESC_EXT));

if (!an_entry)
{
self_trace_t::trace (
true,
MODULENAME, 1,
"Out of memory!");
self_trace_t::trace (
true,
MODULENAME, 1,
"Out of memory!");

return false;
}

//
// allocating storages for "an_entry"
//
an_entry->pd.host_name
//
// allocating storages for "an_entry"
//
an_entry->pd.host_name
= (char*) malloc(MAX_STRING_SIZE);
an_entry->pd.executable_name
an_entry->pd.executable_name
= (char*) malloc(MAX_STRING_SIZE);
#if SUB_ARCH_BGQ
an_entry->cnodeid
an_entry->cnodeid
= launcher_proctable[i].pid;
#else
an_entry->pd.pid
an_entry->pd.pid
= launcher_proctable[i].pid;
an_entry->cnodeid = -1;
an_entry->cnodeid = -1;
#endif
an_entry->mpirank = i; /* The mpi rank is the index into the global tab */
an_entry->mpirank = i; /* The mpi rank is the index into the global tab */

#if SUB_ARCH_BGQ
an_entry->pd.pid = i; /* The mpi rank is the index into the global tab */
an_entry->pd.pid = i; /* The mpi rank is the index into the global tab */
#endif

//
// memory-fetching to get the "host_name"
//
get_tracer()->tracer_read_string(p,
// memory-fetching to get the "host_name"
//
get_tracer()->tracer_read_string(p,
(T_VA) launcher_proctable[i].host_name,
(void*) (an_entry->pd.host_name),
MAX_STRING_SIZE,
use_cxt );

//
// memory-fetching to get the "executable name"
//
get_tracer()->tracer_read_string(p,
(T_VA)launcher_proctable[i].executable_name,
//
// memory-fetching to get the "executable name"
//
get_tracer()->tracer_read_string(p,
(T_VA)launcher_proctable[i].executable_name,
(void*)an_entry->pd.executable_name,
MAX_STRING_SIZE,
use_cxt );

get_proctable_copy()[an_entry->pd.host_name].push_back(an_entry);
}
get_proctable_copy()[an_entry->pd.host_name].push_back(an_entry);
}

free ( launcher_proctable );

if ( get_proctable_copy().empty() )
{
self_trace_t::trace ( LEVELCHK(level1),
MODULENAME, 1,
"proctable is empty!");
{
self_trace_t::trace ( LEVELCHK(level1),
MODULENAME, 1,
"proctable is empty!");

return LAUNCHMON_FAILED;
}
return false;
}

if (p.get_myopts()->get_my_rmconfig()->is_rid_sup())
{
Expand Down Expand Up @@ -1027,13 +1029,13 @@ linux_launchmon_t::acquire_proctable (
// -1 is the init value that SLURM sets internally
// for "totalview_jobid"
if ( get_resid() == -1 )
{
self_trace_t::trace ( LEVELCHK(level1),
MODULENAME, 1,
"resource ID is not valid!");
{
self_trace_t::trace ( LEVELCHK(level1),
MODULENAME, 1,
"resource ID is not valid!");

return LAUNCHMON_FAILED;
}
return false;
}
}
else if (r_mgr.get_job_id().dtype == integer32)
{
Expand All @@ -1048,14 +1050,13 @@ linux_launchmon_t::acquire_proctable (
set_resid(int_val);
p.set_rid(get_resid());
}
}
else if (p.get_myopts()->get_my_rmconfig()->is_rid_via_pid())
{
set_resid (p.get_pid(false));
p.set_rid (get_resid());
}
}
}


#if MEASURE_TRACING_COST
c_end_ts = gettimeofdayD();
Expand All @@ -1067,7 +1068,7 @@ linux_launchmon_t::acquire_proctable (
}
#endif

return LAUNCHMON_OK;
return true;
}
catch ( symtab_exception_t e )
{
Expand Down Expand Up @@ -2011,28 +2012,39 @@ linux_launchmon_t::handle_trap_after_attach_event (
else
{
//
// Without the MPIR colocation service, the proctable is
// already available on attach, so the tool daemons can
// be launched at this point.
//
//
acquire_proctable ( p, use_cxt );
ship_proctab_msg ( lmonp_proctable_avail );
ship_resourcehandle_msg ( lmonp_resourcehandle_avail, get_resid() );
ship_rminfo_msg ( lmonp_rminfo,
(int) p.get_pid(false),
p.rmgr()->get_resource_manager().get_rm());
say_fetofe_msg ( lmonp_stop_at_first_attach );
launch_tool_daemons(p);
get_tracer()->tracer_continue (p, use_cxt);
}
// Some RMs will build the Proctable on demand. If so, continue until
// the MPIR_Breakpoint is hit.
//
if (p.rmgr()->is_cont_on_att())
{
get_tracer()->tracer_continue (p, use_cxt);
}
else
{
//
// Without the MPIR colocation service, the proctable is
// already available on attach, so the tool daemons can
// be launched at this point.
//
//
acquire_proctable ( p, use_cxt );
ship_proctab_msg ( lmonp_proctable_avail );
ship_resourcehandle_msg ( lmonp_resourcehandle_avail, get_resid() );
ship_rminfo_msg ( lmonp_rminfo,
(int) p.get_pid(false),
p.rmgr()->get_resource_manager().get_rm());
say_fetofe_msg ( lmonp_stop_at_first_attach );
launch_tool_daemons(p);
get_tracer()->tracer_continue (p, use_cxt);
}
}

{
self_trace_t::trace (
self_trace_t::trace (
LEVELCHK(level2),
MODULENAME,
MODULENAME,
0,
"trap after attach event handler completed.");
"trap after attach event handler completed.");
}

#if MEASURE_TRACING_COST
Expand Down
Loading

0 comments on commit 0cae76f

Please sign in to comment.