Skip to content

Commit

Permalink
Several bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
janciesko committed Sep 12, 2024
1 parent 7674899 commit c0b4e92
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 32 deletions.
8 changes: 3 additions & 5 deletions benchmarks/access_overhead/access_overhead.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ struct Access<ViewType_t, typename std::enable_if_t<!std::is_same<
Kokkos::parallel_for("access_overhead-init", policy_init_t(0, N), *this);
Kokkos::fence();
#ifdef KRS_ENABLE_NVSHMEMSPACE
nvshmem_barrier_all(); // Not sure why this impacts perf
//nvshmem_barrier_all(); // Not sure why this impacts perf
#endif

time_a = timer.seconds();
for (int i = 0; i < iters; i++) {
Kokkos::parallel_for("access_overhead", policy_update_t(0, N), *this);
RemoteSpace_t().fence();
Kokkos::fence();
}
time_b = timer.seconds();
time += time_b - time_a;
Expand Down Expand Up @@ -183,7 +183,7 @@ struct Access<ViewType_t, typename std::enable_if_t<std::is_same<
time_a = timer.seconds();
for (int i = 0; i < iters; i++) {
Kokkos::parallel_for("access_overhead", policy_update_t(0, N), *this);
RemoteSpace_t().fence();
Kokkos::fence();
}
time_b = timer.seconds();
time += time_b - time_a;
Expand Down Expand Up @@ -248,8 +248,6 @@ int main(int argc, char *argv[]) {
}
} while (false);

Kokkos::fence();

Kokkos::finalize();
#ifdef KRS_ENABLE_SHMEMSPACE
shmem_finalize();
Expand Down
11 changes: 6 additions & 5 deletions benchmarks/access_overhead/access_overhead_p2p.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,14 @@
#include <type_traits>
#include <string>

#define LDC_LEAGUE_SIZE 4096
#define LDC_LEAGUE_SIZE 2
#define LDC_TEAM_SIZE 1
//#define CHECK_FOR_CORRECTNESS

#define CHECK_FOR_CORRECTNESS

using RemoteSpace_t = Kokkos::Experimental::DefaultRemoteMemorySpace;
using RemoteView_t = Kokkos::View<double *, RemoteSpace_t>;
using PlainView_t = Kokkos::View<double *, Kokkos::LayoutLeft>;
using PlainView_t = Kokkos::View<double *>;
using UnmanagedView_t =
Kokkos::View<double *, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
using HostView_t = typename RemoteView_t::HostMirror;
Expand All @@ -53,8 +54,8 @@ using policy_check_t = Kokkos::RangePolicy<CheckTag, size_t>;

// Default values
#define default_Mode 0
#define default_N 134217728
#define default_Iters 3
#define default_N 128
#define default_Iters 1
#define default_RmaOp RMA_GET
#define TAG 0

Expand Down
8 changes: 5 additions & 3 deletions benchmarks/access_overhead/scripts/run_over_size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ FILENAME="${BENCHMARK}_${HASH}.res"
echo $FILENAME
echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME

export NVSHMEM_SYMMETRIC_SIZE=12884901888

#run test over size
SIZE=$DEFAULT_SIZE
for S in $(seq 1 21); do
for reps in $(seq 1 3); do
./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME
CUDA_VISIBLE_DEVICES=1 ./$BENCHMARK -N $SIZE -I $ITERS -M 0 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done
Expand All @@ -28,7 +30,7 @@ done
let SIZE=$DEFAULT_SIZE
for S in $(seq 1 21); do
for reps in $(seq 1 3); do
./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME
CUDA_VISIBLE_DEVICES=1 ./$BENCHMARK -N $SIZE -I $ITERS -M 1 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done
Expand All @@ -37,7 +39,7 @@ done
let SIZE=$DEFAULT_SIZE
for S in $(seq 1 21); do
for reps in $(seq 1 3); do
./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME
CUDA_VISIBLE_DEVICES=1 ./$BENCHMARK -N $SIZE -I $ITERS -M 2 | tee -a $FILENAME
done
let SIZE=$SIZE*2
done
7 changes: 4 additions & 3 deletions benchmarks/access_overhead/scripts/run_over_size_p2p.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
BENCHMARK=$1
HOST1=$2
HOST2=$3
DEFAULT_SIZE=33554432 #128
DEFAULT_SIZE=128

#exports
export OMP_PROC_BIND=spread
Expand All @@ -13,7 +13,7 @@ ITERS=30

#NVLink (=||=)
DEVICE_ID_1=0
DEVICE_ID_2=1
DEVICE_ID_2=0

#XBus (Summit-like systems)
#DEVICE_ID_1=0
Expand All @@ -28,7 +28,8 @@ FILENAME="${BENCHMARK}_${HASH}_p2p.res"
echo $FILENAME
echo "name,type,N,size,iters,time,gups,bw" | tee $FILENAME
VARS0="--bind-to core --map-by socket"
VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/11.8.0/gcc/9.3.0/base/c3ajoqf/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888"
#VARS1="-x LD_LIBRARY_PATH=/projects/ppc64le-pwr9-rhel8/tpls/cuda/11.8.0/gcc/9.3.0/base/c3ajoqf/lib64/:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888"
VARS1="--oversubscribe -x LD_LIBRARY_PATH=/home/jciesko/software/nvshmem_src_3.0.6-4_blake/install/lib:$LD_LIBRARY_PATH -x NVSHMEM_SYMMETRIC_SIZE=12884901888"

# Some more potential optimizations
#VARS1="" #-x UCX_WARN_UNUSED_ENV_VARS=n -x HCOLL_RCACHE=^ucs -x \
Expand Down
2 changes: 2 additions & 0 deletions src/core/Kokkos_RemoteSpaces_LocalDeepCopy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous(
auto team_range = Kokkos::pair(size_type(start_offset),
size_type(start_offset + team_block));

if(team_range.first == team_range.second) return; //nothing to be done

// Construct per-team subviews
auto src_subview = Kokkos::Impl::get_local_subview(src, team_range);
auto dst_subview = Kokkos::Impl::get_local_subview(dst, team_range);
Expand Down
7 changes: 2 additions & 5 deletions src/core/Kokkos_RemoteSpaces_ViewMapping.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,8 @@ class ViewMapping<

typename view_type::size_type offset;
offset = switch_to_local_indexing ? total_offset : local_offset;
dst.remote_view_props.total_offset = offset;

dst.remote_view_props.total_offset = total_offset;
#ifdef KRS_ENABLE_MPISPACE
// Subviews propagate MPI_Window of the original view
dst.m_handle = ViewDataHandle<DstTraits>::assign(
Expand Down Expand Up @@ -397,10 +397,7 @@ class ViewMapping<Traits, Kokkos::Experimental::RemoteSpaceSpecializeTag> {

KOKKOS_INLINE_FUNCTION
auto get_ptr() const {
if (remote_view_props.using_local_indexing)
return handle().ptr + remote_view_props.total_offset;
else
return handle().ptr;
}

template <typename T = Traits>
Expand Down
22 changes: 11 additions & 11 deletions unit_tests/Test_LocalDeepCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -383,7 +383,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -412,7 +412,7 @@ void test_localdeepcopy_withSubview(
// Copy from next
if (my_rank % 2 == 0) {
Kokkos::parallel_for(
"Team", TeamPolicy_t(team_sizes::big, 1),
"Team", TeamPolicy_t(team_sizes::small, 1),
KOKKOS_LAMBDA(typename TeamPolicy_t::member_type team) {
Kokkos::Experimental::RemoteSpaces::local_deep_copy(
team, v_R_subview_local, v_R_subview_next);
Expand Down Expand Up @@ -464,7 +464,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -554,7 +554,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -634,7 +634,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -722,7 +722,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -806,7 +806,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -896,7 +896,7 @@ void test_localdeepcopy_withSubview(
prev_rank = (my_rank - 1) < 0 ? num_ranks - 1 : my_rank - 1;
next_rank = (my_rank + 1) % num_ranks;

if (num_ranks % 2 && num_ranks > 1) return; // skip
if (num_ranks % 2 || num_ranks < 2) return; // skip

using ViewRemote_t = Kokkos::View<Data_t ***, Layout_t, Space_B>;
using ViewHost_t = typename ViewRemote_t::HostMirror;
Expand Down Expand Up @@ -988,10 +988,10 @@ void test_localdeepcopy_withSubview(
test_localdeepcopy_withSubview<int, Kokkos::HostSpace, RemoteSpace_t, TEAM, \
get_op, IS_RANGES, LAYOUT>(12, 15); \
/* 2D with Subviews (put block transfer) */ \
test_localdeepcopy_withSubview<int, Kokkos::HostSpace, RemoteSpace_t, TEAM, \
test_localdeepcopy_withSubview<int, Kokkos::HostSpace, RemoteSpace_t, TEAM, \
put_op, IS_RANGES, LAYOUT>(5, 16); \
/* 2D with Subviews (get block transfer)*/ \
test_localdeepcopy_withSubview<int64_t, Kokkos::HostSpace, RemoteSpace_t, \
test_localdeepcopy_withSubview<int64_t, Kokkos::HostSpace, RemoteSpace_t, \
TEAM, get_op, IS_RANGES, LAYOUT>(12, 15); \
/* 2D with Subviews (put block transfer)*/ \
test_localdeepcopy_withSubview<int64_t, Kokkos::HostSpace, RemoteSpace_t, \
Expand Down

0 comments on commit c0b4e92

Please sign in to comment.