Skip to content

Commit

Permalink
i#7157 syscall sched: Handle static injected syscall traces in scheduler
Browse files Browse the repository at this point in the history
Adds handling for statically injected kernel syscall traces in the scheduler.

Ensures that quantum context switches are not done in the middle of a statically-injected syscall trace.

Also ensures that voluntary context switches are delayed until after the syscall trace. This required fixing the bookkeeping logic that runs on the next user-space instruction.

We keep status quo on the scheduler behavior of showing the post-syscall markers before the switch.

Adds a unit test for statically injected kernel syscall trace handling by the scheduler.

Issue: #7157
  • Loading branch information
abhinav92003 committed Dec 18, 2024
1 parent 9931511 commit 544d58e
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 24 deletions.
62 changes: 39 additions & 23 deletions clients/drcachesim/scheduler/scheduler_dynamic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
// boundaries so we live with those being before the switch.
// XXX: Once we insert kernel traces, we may have to try harder
// to stop before the post-syscall records.
if (this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
if (!outputs_[output].in_syscall_code &&
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) {
// The switch request overrides any latency threshold.
need_new_input = true;
Expand Down Expand Up @@ -506,18 +507,26 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
}
if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
!outputs_[output].in_kernel_code) {
!outputs_[output].in_context_switch_code) {
++input->instrs_in_quantum;
if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n",
output, input->index);
preempt = true;
need_new_input = true;
input->instrs_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
if (outputs_[output].in_syscall_code) {
VPRINT(this, 4,
"next_record[%d]: input %d delaying context switch "
"after end of instr quantum due to syscall code\n",
output, input->index);

} else {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n",
output, input->index);
preempt = true;
need_new_input = true;
input->instrs_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
}
}
} else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) {
if (cur_time == 0 || cur_time < input->prev_time_in_quantum) {
Expand All @@ -535,14 +544,21 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
// in between (e.g., scatter/gather long sequence of reads/writes) by
// setting input->switching_pre_instruction.
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
VPRINT(this, 4,
"next_record[%d]: input %d hit end of time quantum after %" PRIu64
"\n",
output, input->index, input->time_spent_in_quantum);
preempt = true;
need_new_input = true;
input->time_spent_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
if (outputs_[output].in_syscall_code) {
VPRINT(this, 4,
"next_record[%d]: input %d delaying context switch after end of "
"time quantum after %" PRIu64 " due to syscall code\n",
output, input->index, input->time_spent_in_quantum);
} else {
VPRINT(this, 4,
"next_record[%d]: input %d hit end of time quantum after %" PRIu64
"\n",
output, input->index, input->time_spent_in_quantum);
preempt = true;
need_new_input = true;
input->time_spent_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
}
}
}
// For sched_type_t::DEPENDENCY_TIMESTAMPS: enforcing asked-for
Expand Down Expand Up @@ -574,16 +590,16 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::process_marker(
break;
case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START:
outputs_[output].in_context_switch_code = true;
ANNOTATE_FALLTHROUGH;
break;
case TRACE_MARKER_TYPE_SYSCALL_TRACE_START:
outputs_[output].in_kernel_code = true;
outputs_[output].in_syscall_code = true;
break;
case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END:
// We have to delay until the next record.
outputs_[output].hit_switch_code_end = true;
ANNOTATE_FALLTHROUGH;
break;
case TRACE_MARKER_TYPE_SYSCALL_TRACE_END:
outputs_[output].in_kernel_code = false;
outputs_[output].in_syscall_code = false;
break;
case TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH: {
if (!options_.honor_direct_switches)
Expand Down
2 changes: 1 addition & 1 deletion clients/drcachesim/scheduler/scheduler_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
// This is accessed by other outputs for stealing and rebalancing.
// Indirected so we can store it in our vector.
std::unique_ptr<std::atomic<bool>> active;
bool in_kernel_code = false;
bool in_syscall_code = false;
bool in_context_switch_code = false;
bool hit_switch_code_end = false;
// Used for time-based quanta.
Expand Down
178 changes: 178 additions & 0 deletions clients/drcachesim/tests/scheduler_unit_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,183 @@ test_synthetic()
}
}

// Unit test for scheduler handling of statically-injected kernel syscall
// trace sequences (delimited by TRACE_MARKER_TYPE_SYSCALL_TRACE_START/END):
// the scheduler must not perform a quantum or voluntary context switch in
// the middle of such a sequence, even when the sequence is longer than the
// quantum. Exercises both instruction-based and time-based quanta.
static void
test_synthetic_with_syscall_seq()
{
    std::cerr << "\n----------------\nTesting synthetic with syscall sequences\n";
    static constexpr int NUM_INPUTS = 7;
    static constexpr int NUM_OUTPUTS = 2;
    static constexpr int NUM_INSTRS = 9;
    static constexpr int QUANTUM_DURATION = 3;
    // We do not want to block for very long.
    static constexpr double BLOCK_SCALE = 0.01;
    static constexpr uint64_t BLOCK_THRESHOLD = 100;
    static constexpr memref_tid_t TID_BASE = 100;
    // Kernel-trace instr PCs are offset so they are distinguishable from the
    // user-space instr PCs (which start at 42).
    static constexpr uint64_t KERNEL_CODE_OFFSET = 123456;
    std::vector<trace_entry_t> inputs[NUM_INPUTS];
    for (int i = 0; i < NUM_INPUTS; i++) {
        memref_tid_t tid = TID_BASE + i;
        inputs[i].push_back(make_thread(tid));
        inputs[i].push_back(make_pid(1));
        inputs[i].push_back(make_version(TRACE_ENTRY_VERSION));
        inputs[i].push_back(make_timestamp(10)); // All the same time priority.
        for (int j = 0; j < NUM_INSTRS; j++) {
            inputs[i].push_back(make_instr(42 + j * 4));
            // Test accumulation of usage across voluntary switches.
            // Inputs A and B get a maybe-blocking syscall (latency 100, i.e.,
            // timestamp 120-20, above BLOCK_THRESHOLD scaled expectations)
            // after their 2nd instr.
            if ((i == 0 || i == 1) && j == 1) {
                inputs[i].push_back(make_timestamp(20));
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42));
                inputs[i].push_back(
                    make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0));
                inputs[i].push_back(make_timestamp(120));
            }
            // Test a syscall sequence starting at each offset within a quantum
            // of instrs: input i gets its injected sequence after user instr i,
            // for i in [0, QUANTUM_DURATION].
            if (i <= QUANTUM_DURATION && i == j) {
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 84));
                inputs[i].push_back(
                    make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_START, 84));
                // QUANTUM_DURATION+1 kernel instrs: deliberately longer than
                // one quantum so a preempt point lands inside the sequence.
                for (int k = 0; k <= QUANTUM_DURATION; ++k)
                    inputs[i].push_back(make_instr(KERNEL_CODE_OFFSET + k));
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_END, 84));
            }
        }
        inputs[i].push_back(make_exit(tid));
    }
    // A has a syscall sequence at [2,5], B has it at [3,6], C has it at [4,7],
    // D has it at [5,8].
    // The syscall sequence consists of 4 instrs which is greater than the
    // #instr quanta.
    // Total instrs in A, B, C, and D are 9 + 4 == 13, others have just 9.

    // Hardcoding here for the 2 outputs and 7 inputs.
    // We make assumptions on the scheduler's initial runqueue assignment
    // being round-robin, resulting in 4 on core0 (odd parity letters) and 3 on
    // core1 (even parity letters).
    // The dots are markers and thread exits.
    // A has a voluntary switch at its 6th instr (1st in that scheduling). Its
    // CPU usage persists to its next scheduling which has only 2 letters.
    // B has a voluntary switch at its 2nd instr, but it doesn't take because a
    // syscall sequence starts just then.
    // Since core0 has an extra input, core1 finishes
    // its runqueue first and then steals G from core0 (migration threshold is 0)
    // and finishes it off.
    static const char *const CORE0_SCHED_STRING =
        "..A..AAAA...CCC..CCCC...EEE..GGGA....CCCEEEGGGAACCC.EEE.AAAAA.";
    static const char *const CORE1_SCHED_STRING =
        "..BB......BBBB...DDD..FFFBBBD..DDDD.FFFBBBDDDFFF.B.DD.GGG.____";
    {
        // Test instruction quanta.
        std::vector<scheduler_t::input_workload_t> sched_inputs;
        for (int i = 0; i < NUM_INPUTS; i++) {
            std::vector<scheduler_t::input_reader_t> readers;
            readers.emplace_back(
                std::unique_ptr<mock_reader_t>(new mock_reader_t(inputs[i])),
                std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_BASE + i);
            sched_inputs.emplace_back(std::move(readers));
        }
        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                   scheduler_t::DEPENDENCY_IGNORE,
                                                   scheduler_t::SCHEDULER_DEFAULTS,
                                                   /*verbosity=*/4);
        sched_ops.quantum_duration_instrs = QUANTUM_DURATION;
        // This was tuned with a 100us threshold: so avoid scheduler.h defaults
        // changes from affecting our output.
        sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
        sched_ops.block_time_multiplier = BLOCK_SCALE;
        sched_ops.time_units_per_us = 1.;
        // Migration is measured in wall-clock-time for instr quanta
        // so avoid non-determinism by having no threshold.
        sched_ops.migration_threshold_us = 0;
        scheduler_t scheduler;
        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
            scheduler_t::STATUS_SUCCESS)
            assert(false);
        std::vector<std::string> sched_as_string =
            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE);
        for (int i = 0; i < NUM_OUTPUTS; i++) {
            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
        }
        // Check scheduler stats. # switches is the # of letter transitions; # preempts
        // is the instances where the same letter appears 3 times without another letter
        // appearing in between (and ignoring the last letter for an input: EOF doesn't
        // count as a preempt). # nops are the instances where the same input is picked
        // to run because nothing else is waiting.
        verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/11,
                               /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0,
                               /*switch_nop=*/1, /*preempts=*/9, /*direct_attempts=*/0,
                               /*direct_successes=*/0, /*migrations=*/1);
        verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/11,
                               /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0,
                               /*switch_nop=*/0, /*preempts=*/8, /*direct_attempts=*/0,
                               /*direct_successes=*/0, /*migrations=*/0);
        assert(scheduler.get_stream(0)->get_schedule_statistic(
                   memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 0);
        assert(scheduler.get_stream(1)->get_schedule_statistic(
                   memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 1);
#ifndef WIN32
        // XXX: Windows microseconds on test VMs are very coarse and stay the same
        // for long periods. Instruction quanta use wall-clock idle times, so
        // the result is extreme variations here. We try to adjust by handling
        // any schedule with singleton 'A' and 'B', but in some cases on Windows
        // we see the A and B delayed all the way to the very end where they
        // are adjacent to their own letters. We just give up on checking the
        // precise output for this test on Windows.
        if (sched_as_string[0] != CORE0_SCHED_STRING ||
            sched_as_string[1] != CORE1_SCHED_STRING) {
            // Fallback check: A and B must each appear somewhere as a singleton
            // (their post-voluntary-switch rescheduling), even if wall-clock
            // jitter moved the exact positions.
            bool found_single_A = false, found_single_B = false;
            for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) {
                for (size_t i = 1; i < sched_as_string[cpu].size() - 1; ++i) {
                    if (sched_as_string[cpu][i] == 'A' &&
                        sched_as_string[cpu][i - 1] != 'A' &&
                        sched_as_string[cpu][i + 1] != 'A')
                        found_single_A = true;
                    if (sched_as_string[cpu][i] == 'B' &&
                        sched_as_string[cpu][i - 1] != 'B' &&
                        sched_as_string[cpu][i + 1] != 'B')
                        found_single_B = true;
                }
            }
            assert(found_single_A && found_single_B);
        }
#endif
    }
    {
        // Test time quanta.
        std::vector<scheduler_t::input_workload_t> sched_inputs;
        for (int i = 0; i < NUM_INPUTS; i++) {
            std::vector<scheduler_t::input_reader_t> readers;
            readers.emplace_back(
                std::unique_ptr<mock_reader_t>(new mock_reader_t(inputs[i])),
                std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_BASE + i);
            sched_inputs.emplace_back(std::move(readers));
        }
        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                   scheduler_t::DEPENDENCY_IGNORE,
                                                   scheduler_t::SCHEDULER_DEFAULTS,
                                                   /*verbosity=*/4);
        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
        sched_ops.time_units_per_us = 1.;
        // This was tuned with a 100us threshold: so avoid scheduler.h defaults
        // changes from affecting our output.
        sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
        sched_ops.quantum_duration_us = QUANTUM_DURATION;
        sched_ops.block_time_multiplier = BLOCK_SCALE;
        sched_ops.migration_threshold_us = 0;
        scheduler_t scheduler;
        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
            scheduler_t::STATUS_SUCCESS)
            assert(false);
        // With time_units_per_us == 1 and simulated time supplied in lockstep,
        // the time-quantum schedule is expected to be deterministic and to
        // match the instr-quantum schedule strings exactly.
        std::vector<std::string> sched_as_string =
            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
        for (int i = 0; i < NUM_OUTPUTS; i++) {
            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
        }
        assert(sched_as_string[0] == CORE0_SCHED_STRING);
        assert(sched_as_string[1] == CORE1_SCHED_STRING);
    }
}

static void
test_synthetic_time_quanta()
{
Expand Down Expand Up @@ -6424,6 +6601,7 @@ test_main(int argc, const char *argv[])
test_only_threads();
test_real_file_queries_and_filters(argv[1]);
test_synthetic();
test_synthetic_with_syscall_seq();
test_synthetic_time_quanta();
test_synthetic_with_timestamps();
test_synthetic_with_priorities();
Expand Down

0 comments on commit 544d58e

Please sign in to comment.