Skip to content

Commit

Permalink
i#6471 sched idle: Add idle time from blocking syscalls (#6494)
Browse files Browse the repository at this point in the history
Adds a longer waiting period on blocking syscalls using either the provided
output time or, for instruction-based quanta, a count of queue selections
before a blocked input is actually selected. Since the scheduler does
not have timer interrupts or regular points of control and relies on its
user calling it, idle inputs are kept on the ready queue and are checked
for becoming unblocked when the ready queue is queried.

The wait duration is set based on the "wait time factor", which is the
syscall latency divided by the context switch threshold, multiplied by a
user-provided "block_time_scale" option which can be used to scale up or
down the durations.

The wait duration is erased on a direct switch to an input.

Adds a new replay record type to represent idle time on replay.

Augments the unit tests to include blocking high-latency syscalls to
test the new feature in various sub-tests.

Issue: #6471
  • Loading branch information
derekbruening authored Dec 7, 2023
1 parent d833458 commit 110ca5e
Show file tree
Hide file tree
Showing 7 changed files with 502 additions and 131 deletions.
1 change: 1 addition & 0 deletions clients/drcachesim/analyzer_multi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ analyzer_multi_t::init_dynamic_schedule()
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value();
sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value();
sched_ops.block_time_scale = op_sched_block_scale.get_value();
#ifdef HAS_ZIP
if (!op_record_file.get_value().empty()) {
record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value()));
Expand Down
5 changes: 5 additions & 0 deletions clients/drcachesim/common/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,11 @@ droption_t<uint64_t> op_sched_blocking_switch_us(
"maybe-blocking to incur a context switch. Applies to -core_sharded and "
"-core_serial. ");

// Command-line option "sched_block_scale": a floating-point scale factor
// (default 1.) for how long an input stays unscheduled after a blocking
// syscall; a higher value keeps inputs unscheduled for longer.
// The two string arguments are the option's descriptive help text.
droption_t<double>
op_sched_block_scale(DROPTION_SCOPE_ALL, "sched_block_scale", 1.,
"Input block time scale factor",
"A higher value here results in blocking syscalls "
"keeping inputs unscheduled for longer.");
#ifdef HAS_ZIP
droption_t<std::string> op_record_file(DROPTION_SCOPE_FRONTEND, "record_file", "",
"Path for storing record of schedule",
Expand Down
1 change: 1 addition & 0 deletions clients/drcachesim/common/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ extern dynamorio::droption::droption_t<bool> op_sched_time;
extern dynamorio::droption::droption_t<bool> op_sched_order_time;
extern dynamorio::droption::droption_t<uint64_t> op_sched_syscall_switch_us;
extern dynamorio::droption::droption_t<uint64_t> op_sched_blocking_switch_us;
extern dynamorio::droption::droption_t<double> op_sched_block_scale;
#ifdef HAS_ZIP
extern dynamorio::droption::droption_t<std::string> op_record_file;
extern dynamorio::droption::droption_t<std::string> op_replay_file;
Expand Down
335 changes: 255 additions & 80 deletions clients/drcachesim/scheduler/scheduler.cpp

Large diffs are not rendered by default.

54 changes: 47 additions & 7 deletions clients/drcachesim/scheduler/scheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,18 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
* blocking and trigger a context switch.
*/
uint64_t blocking_switch_threshold = 100;
/**
* Controls the amount of time inputs are considered blocked at a syscall whose
* latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. A
* "block time factor" is computed from the syscall latency divided by either
* #syscall_switch_threshold or #blocking_switch_threshold. This factor is
* multiplied by this field #block_time_scale to produce a final value. For
* #QUANTUM_TIME, that final value's amount of time, as reported by the time
* parameter to next_record(), must pass before the input is no longer considered
* blocked. For instruction quanta, that final value's count of scheduler
* selections must occur before the input is actually selected.
*/
double block_time_scale = 1.;
};

/**
Expand Down Expand Up @@ -956,6 +968,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
// While the scheduler only hands an input to one output at a time, during
// scheduling decisions one thread may need to access another's fields.
// We use a unique_ptr to make this moveable for vector storage.
// For inputs not actively assigned to a core but sitting in the ready_queue,
// sched_lock_ suffices to synchronize access.
std::unique_ptr<std::mutex> lock;
// A tid can be duplicated across workloads so we need the pair of
// workload index + tid to identify the original input.
Expand Down Expand Up @@ -1002,6 +1016,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
bool switching_pre_instruction = false;
// Used for time-based quanta.
uint64_t start_time_in_quantum = 0;
// These fields model waiting at a blocking syscall.
double block_time_factor = 0.;
uint64_t blocked_start_time = 0; // For QUANTUM_TIME only.
};

// Format for recording a schedule to disk. A separate sequence of these records
Expand All @@ -1019,14 +1036,17 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
FOOTER, // The final entry in the component. Other fields are ignored.
SKIP, // Skip ahead to the next region of interest.
SYNTHETIC_END, // A synthetic thread exit record must be supplied.
// Indicates that the output is idle. The value.idle_duration field holds
// a duration in microseconds.
IDLE,
};
static constexpr int VERSION_CURRENT = 0;
schedule_record_t() = default;
schedule_record_t(record_type_t type, input_ordinal_t input, uint64_t start,
uint64_t stop, uint64_t time)
: type(type)
, key(input)
, start_instruction(start)
, value(start)
, stop_instruction(stop)
, timestamp(time)
{
Expand All @@ -1045,8 +1065,18 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
input_ordinal_t input = -1;
int version; // For record_type_t::VERSION.
} END_PACKED_STRUCTURE key;
// Input stream ordinal of starting point.
uint64_t start_instruction = 0;
// Type-dependent payload of a schedule record. Both members are uint64_t and
// share the same storage; which one is meaningful depends on the record type.
START_PACKED_STRUCTURE
union value {
// Default construction relies on start_instruction's default member
// initializer below.
value() = default;
// Constructs the payload from a starting ordinal, storing it in
// start_instruction.
value(uint64_t start)
: start_instruction(start)
{
}
// For record_type_t::IDLE, the duration in microseconds of the idling.
uint64_t idle_duration;
// Input stream ordinal of starting point, for non-IDLE types.
uint64_t start_instruction = 0;
} END_PACKED_STRUCTURE value;
// Input stream ordinal, exclusive. Max numeric value means continue until EOF.
uint64_t stop_instruction = 0;
// Timestamp in microseconds to keep context switches ordered.
Expand Down Expand Up @@ -1096,6 +1126,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
int64_t as_traced_cpuid = -1;
// Used for MAP_AS_PREVIOUSLY with live_replay_output_count_.
bool at_eof = false;
// Used for replaying wait periods.
uint64_t wait_start_time = 0;
};

// Called just once at initialization time to set the initial input-to-output
Expand Down Expand Up @@ -1168,6 +1200,12 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
scheduler_status_t
read_recorded_schedule();

uint64_t
get_time_micros();

uint64_t
get_output_time(output_ordinal_t output);

// The caller must hold the lock for the input.
stream_status_t
record_schedule_segment(
Expand All @@ -1193,7 +1231,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
// Finds the next input stream for the 'output_ordinal'-th output stream.
// No input_info_t lock can be held on entry.
stream_status_t
pick_next_input(output_ordinal_t output, bool in_wait_state);
pick_next_input(output_ordinal_t output, double block_time_factor);

// Helper for pick_next_input() for MAP_AS_PREVIOUSLY.
// No input_info_t lock can be held on entry.
Expand Down Expand Up @@ -1313,14 +1351,16 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
add_to_ready_queue(input_info_t *input);

// The input's lock must be held by the caller.
// Returns in block_time_factor a multiplier for how long the input should be
// considered blocked.
bool
syscall_incurs_switch(input_info_t *input);
syscall_incurs_switch(input_info_t *input, double &block_time_factor);

// sched_lock_ must be held by the caller.
// "for_output" is which output stream is looking for a new input; only an
// input which is able to run on that output will be selected.
input_info_t *
pop_from_ready_queue(output_ordinal_t for_output);
stream_status_t
pop_from_ready_queue(output_ordinal_t for_output, input_info_t *&new_input);

///
///////////////////////////////////////////////////////////////////////////

Expand Down
6 changes: 6 additions & 0 deletions clients/drcachesim/tests/scheduler_launcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ droption_t<bool> op_honor_stamps(DROPTION_SCOPE_ALL, "honor_stamps", true,
"Whether to honor recorded timestamps for ordering",
"Whether to honor recorded timestamps for ordering");

// Command-line option "block_time_scale" for the scheduler launcher: a
// floating-point scale factor (default 1.) for how long an input stays
// unscheduled after a blocking syscall; a higher value keeps inputs
// unscheduled for longer.
droption_t<double> op_block_time_scale(DROPTION_SCOPE_ALL, "block_time_scale", 1.,
"Input block time scale factor",
"A higher value here results in blocking syscalls "
"keeping inputs unscheduled for longer.");

#ifdef HAS_ZIP
droption_t<std::string> op_record_file(DROPTION_SCOPE_FRONTEND, "record_file", "",
"Path for storing record of schedule",
Expand Down Expand Up @@ -319,6 +324,7 @@ _tmain(int argc, const TCHAR *targv[])
sched_ops.quantum_duration = op_sched_quantum.get_value();
if (op_sched_time.get_value())
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
sched_ops.block_time_scale = op_block_time_scale.get_value();
#ifdef HAS_ZIP
std::unique_ptr<zipfile_ostream_t> record_zip;
std::unique_ptr<zipfile_istream_t> replay_zip;
Expand Down
Loading

0 comments on commit 110ca5e

Please sign in to comment.