Skip to content

Commit

Permalink
i#5843 scheduler: Only switch on long-latency syscalls (#6458)
Browse files Browse the repository at this point in the history
Rather than context switching on every syscall labeled maybe-blocking,
the scheduler uses the now-available syscall latency to decide whether
the syscall should block and result in a context switch.

Adds two new command line options, -sched_syscall_switch_us (default
500us) and -sched_blocking_switch_us (default 100us), and corresponding
scheduler_t inputs, to control the latency thresholds. To avoid relying
too much on the maybe-blocking labels, we do consider a
very-high-latency syscall not marked as maybe-blocking to result in
a context switch.

Adds a new schedule_stats unit test.

Tested in a large proprietary app where this reduces the context switch
rate from ~100x too high down to ~10x too high. The next step of adding
i/o wait times should further improve the representativeness.

Issue: #5843
  • Loading branch information
derekbruening authored Nov 16, 2023
1 parent 81fdbd7 commit 5990405
Show file tree
Hide file tree
Showing 11 changed files with 349 additions and 35 deletions.
13 changes: 13 additions & 0 deletions clients/drcachesim/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,9 @@ add_subdirectory(tools/external)
# We build larger executables here. All tests are added in suite/tests/ except unit tests.
# Be sure to give the targets qualified test names ("tool.drcache*...").

# XXX: Try to add a macro add_drcachesim_test() to share common pieces
# of these executables.

if (BUILD_TESTS)
add_executable(tool.reuse_distance.unit_tests tests/reuse_distance_test.cpp)
target_link_libraries(tool.reuse_distance.unit_tests drmemtrace_reuse_distance
Expand Down Expand Up @@ -907,6 +910,16 @@ if (BUILD_TESTS)
set_tests_properties(tool.drcachesim.invariant_checker_test PROPERTIES
TIMEOUT ${test_seconds})

add_executable(tool.drcachesim.schedule_stats_test tests/schedule_stats_test.cpp)
configure_DynamoRIO_standalone(tool.drcachesim.schedule_stats_test)
add_win32_flags(tool.drcachesim.schedule_stats_test)
target_link_libraries(tool.drcachesim.schedule_stats_test drmemtrace_schedule_stats
drmemtrace_static drmemtrace_analyzer test_helpers)
add_test(NAME tool.drcachesim.schedule_stats_test
COMMAND tool.drcachesim.schedule_stats_test)
set_tests_properties(tool.drcachesim.schedule_stats_test PROPERTIES
TIMEOUT ${test_seconds})

add_executable(tool.drcacheoff.view_test tests/view_test.cpp reader/file_reader.cpp)
configure_DynamoRIO_standalone(tool.drcacheoff.view_test)
add_win32_flags(tool.drcacheoff.view_test)
Expand Down
2 changes: 2 additions & 0 deletions clients/drcachesim/analyzer_multi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@ analyzer_multi_t::init_dynamic_schedule()
sched_ops.quantum_duration = op_sched_quantum.get_value();
if (op_sched_time.get_value())
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value();
sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value();
#ifdef HAS_ZIP
if (!op_record_file.get_value().empty()) {
record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value()));
Expand Down
13 changes: 13 additions & 0 deletions clients/drcachesim/common/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,19 @@ droption_t<bool> op_sched_order_time(DROPTION_SCOPE_ALL, "sched_order_time", tru
"Applies to -core_sharded and -core_serial. "
"Whether to honor recorded timestamps for ordering");

droption_t<uint64_t> op_sched_syscall_switch_us(
DROPTION_SCOPE_ALL, "sched_syscall_switch_us", 500,
"Minimum latency to consider any syscall as incurring a context switch.",
"Minimum latency in timestamp units (us) to consider any syscall as incurring "
"a context switch. Applies to -core_sharded and -core_serial. ");

droption_t<uint64_t> op_sched_blocking_switch_us(
DROPTION_SCOPE_ALL, "sched_blocking_switch_us", 100,
"Minimum latency to consider a maybe-blocking syscall as incurring a context switch.",
"Minimum latency in timestamp units (us) to consider any syscall that is marked as "
"maybe-blocking to incur a context switch. Applies to -core_sharded and "
"-core_serial. ");

#ifdef HAS_ZIP
droption_t<std::string> op_record_file(DROPTION_SCOPE_FRONTEND, "record_file", "",
"Path for storing record of schedule",
Expand Down
2 changes: 2 additions & 0 deletions clients/drcachesim/common/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ extern dynamorio::droption::droption_t<bool> op_core_serial;
extern dynamorio::droption::droption_t<int64_t> op_sched_quantum;
extern dynamorio::droption::droption_t<bool> op_sched_time;
extern dynamorio::droption::droption_t<bool> op_sched_order_time;
extern dynamorio::droption::droption_t<uint64_t> op_sched_syscall_switch_us;
extern dynamorio::droption::droption_t<uint64_t> op_sched_blocking_switch_us;
#ifdef HAS_ZIP
extern dynamorio::droption::droption_t<std::string> op_record_file;
extern dynamorio::droption::droption_t<std::string> op_replay_file;
Expand Down
64 changes: 51 additions & 13 deletions clients/drcachesim/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1564,6 +1564,27 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue(
return res;
}

template <typename RecordType, typename ReaderType>
bool
scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *input)
{
uint64_t post_time = input->reader->get_last_timestamp();
assert(input->processing_syscall || input->processing_maybe_blocking_syscall);
if (input->reader->get_version() < TRACE_ENTRY_VERSION_FREQUENT_TIMESTAMPS) {
// This is a legacy trace that does not have timestamps bracketing syscalls.
// We switch on every maybe-blocking syscall in this case.
return input->processing_maybe_blocking_syscall;
}
assert(input->pre_syscall_timestamp > 0);
assert(input->pre_syscall_timestamp < post_time);
uint64_t latency = post_time - input->pre_syscall_timestamp;
VPRINT(this, 3, "input %d %ssyscall latency: %" PRIu64 "\n", input->index,
input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency);
return (input->processing_maybe_blocking_syscall &&
latency >= options_.blocking_switch_threshold) ||
latency >= options_.syscall_switch_threshold;
}

template <typename RecordType, typename ReaderType>
typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
Expand Down Expand Up @@ -2006,11 +2027,16 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
} else if (options_.mapping == MAP_TO_ANY_OUTPUT) {
trace_marker_type_t marker_type;
uintptr_t marker_value;
if (input->processing_blocking_syscall) {
// While regular traces typically always have a syscall marker when there's a
// maybe-blocking marker, some tests and synthetic traces have just the maybe
// so we check both.
if (input->processing_syscall || input->processing_maybe_blocking_syscall) {
// Wait until we're past all the markers associated with the syscall.
// XXX: We may prefer to stop before the return value marker for futex,
// or a kernel xfer marker, but our recorded format is on instr
// boundaries so we live with those being before the switch.
// XXX: Once we insert kernel traces, we may have to try harder
// to stop before the post-syscall records.
if (record_type_is_marker(record, marker_type, marker_value) &&
marker_type == TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH) {
memref_tid_t target_tid = marker_value;
Expand All @@ -2025,18 +2051,33 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
input->switch_to_input = it->second;
}
} else if (record_type_is_instr(record)) {
// Assume it will block and we should switch to a different input.
need_new_input = true;
in_wait_state = true;
input->processing_blocking_syscall = false;
VPRINT(this, 3, "next_record[%d]: hit blocking syscall in input %d\n",
output, input->index);
if (syscall_incurs_switch(input)) {
// Model as blocking and should switch to a different input.
need_new_input = true;
in_wait_state = true;
VPRINT(this, 3,
"next_record[%d]: hit blocking syscall in input %d\n",
output, input->index);
}
input->processing_syscall = false;
input->processing_maybe_blocking_syscall = false;
input->pre_syscall_timestamp = 0;
}
}
if (record_type_is_marker(record, marker_type, marker_value) &&
marker_type == TRACE_MARKER_TYPE_SYSCALL) {
input->processing_syscall = true;
input->pre_syscall_timestamp = input->reader->get_last_timestamp();
} else if (record_type_is_marker(record, marker_type, marker_value) &&
marker_type == TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL) {
input->processing_blocking_syscall = true;
} else if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
record_type_is_instr(record)) {
input->processing_maybe_blocking_syscall = true;
// Generally we should already have the timestamp from a just-prior
// syscall marker, but we support tests and other synthetic sequences
// with just a maybe-blocking.
input->pre_syscall_timestamp = input->reader->get_last_timestamp();
}
if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
record_type_is_instr(record)) {
++input->instrs_in_quantum;
if (input->instrs_in_quantum > options_.quantum_duration) {
// We again prefer to switch to another input even if the current
Expand All @@ -2045,9 +2086,6 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
need_new_input = true;
}
} else if (options_.quantum_unit == QUANTUM_TIME) {
// The above if-else cases are all either for non-instrs or
// QUANTUM_INSTRUCTIONS, except the blocking syscall next instr which is
// already switching: so an else{} works here.
if (cur_time == 0 || cur_time < input->start_time_in_quantum) {
VPRINT(this, 1,
"next_record[%d]: invalid time %" PRIu64 " vs start %" PRIu64
Expand Down
24 changes: 22 additions & 2 deletions clients/drcachesim/scheduler/scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,20 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
* of traced cores).
*/
archive_istream_t *replay_as_traced_istream = nullptr;
/**
* Determines the minimum latency in the unit of the trace's timestamps
* (microseconds) for which a non-maybe-blocking system call (one without
* a #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker) will be treated as
* blocking and trigger a context switch.
*/
uint64_t syscall_switch_threshold = 500;
/**
* Determines the minimum latency in the unit of the trace's timestamps
* (microseconds) for which a maybe-blocking system call (one with
* a #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker) will be treated as
* blocking and trigger a context switch.
*/
uint64_t blocking_switch_threshold = 100;
};

/**
Expand Down Expand Up @@ -965,8 +979,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
bool order_by_timestamp = false;
// Global ready queue counter used to provide FIFO for same-priority inputs.
uint64_t queue_counter = 0;
// Used to switch on the instruction *after* a blocking syscall.
bool processing_blocking_syscall = false;
// Used to switch on the instruction *after* a long-latency syscall.
bool processing_syscall = false;
bool processing_maybe_blocking_syscall = false;
uint64_t pre_syscall_timestamp = 0;
// Use for special kernel features where one thread specifies a target
// thread to replace it.
input_ordinal_t switch_to_input = INVALID_INPUT_ORDINAL;
Expand Down Expand Up @@ -1275,6 +1291,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
void
add_to_ready_queue(input_info_t *input);

// The input's lock must be held by the caller.
bool
syscall_incurs_switch(input_info_t *input);

// sched_lock_ must be held by the caller.
// "for_output" is which output stream is looking for a new input; only an
// input which is able to run on that output will be selected.
Expand Down
20 changes: 10 additions & 10 deletions clients/drcachesim/tests/schedule_stats_nopreempt.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Core #0 counts:
0 direct switch requests
0 waits
Core #1 counts:
2 threads
. threads
*[0-9]* instructions
. total context switches
0.0[0-9.]* CSPKI \(context switches per 1000 instructions\)
Expand All @@ -38,35 +38,35 @@ Core #1 counts:
0 direct context switches
100.00% voluntary switches
0.00% direct switches
.. system calls
*[0-9]* system calls
. maybe-blocking system calls
0 direct switch requests
0 waits
Core #2 counts:
2 threads
. threads
*[0-9]* instructions
1 total context switches
. total context switches
0.0[0-9.]* CSPKI \(context switches per 1000 instructions\)
*[0-9]* instructions per context switch
1 voluntary context switches
. voluntary context switches
0 direct context switches
100.00% voluntary switches
0.00% direct switches
.. system calls
*[0-9]* system calls
. maybe-blocking system calls
0 direct switch requests
0 waits
Core #3 counts:
2 threads
. threads
*[0-9]* instructions
1 total context switches
. total context switches
0.0[0-9.]* CSPKI \(context switches per 1000 instructions\)
*[0-9]* instructions per context switch
1 voluntary context switches
. voluntary context switches
0 direct context switches
100.00% voluntary switches
0.00% direct switches
.. system calls
*[0-9]* system calls
. maybe-blocking system calls
0 direct switch requests
0 waits
Expand Down
Loading

0 comments on commit 5990405

Please sign in to comment.