Skip to content

Commit

Permalink
i#7157 syscall sched: Handle static injected syscall traces in scheduler
Browse files Browse the repository at this point in the history
Adds handling for statically injected kernel syscall traces in the scheduler.

Ensures that quantum context switches are not done in the middle of a statically-injected syscall trace.

Also ensures that voluntary context switches are delayed until after the syscall trace. This required fixing the bookkeeping logic that runs on the next user-space instruction.

We keep status quo on the scheduler behavior of showing the post-syscall markers before the switch.

Adds a unit test for statically injected kernel syscall trace handling by the scheduler.

Issue: #7157
  • Loading branch information
abhinav92003 committed Dec 18, 2024
1 parent 9931511 commit 544d58e
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 24 deletions.
62 changes: 39 additions & 23 deletions clients/drcachesim/scheduler/scheduler_dynamic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
// boundaries so we live with those being before the switch.
// XXX: Once we insert kernel traces, we may have to try harder
// to stop before the post-syscall records.
if (this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
if (!outputs_[output].in_syscall_code &&
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) {
// The switch request overrides any latency threshold.
need_new_input = true;
Expand Down Expand Up @@ -506,18 +507,26 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
}
if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
!outputs_[output].in_kernel_code) {
!outputs_[output].in_context_switch_code) {
++input->instrs_in_quantum;
if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n",
output, input->index);
preempt = true;
need_new_input = true;
input->instrs_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
if (outputs_[output].in_syscall_code) {
VPRINT(this, 4,
"next_record[%d]: input %d delaying context switch "
"after end of instr quantum due to syscall code\n",
output, input->index);

} else {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
VPRINT(this, 4, "next_record[%d]: input %d hit end of instr quantum\n",
output, input->index);
preempt = true;
need_new_input = true;
input->instrs_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
}
}
} else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) {
if (cur_time == 0 || cur_time < input->prev_time_in_quantum) {
Expand All @@ -535,14 +544,21 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
// in between (e.g., scatter/gather long sequence of reads/writes) by
// setting input->switching_pre_instruction.
this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
VPRINT(this, 4,
"next_record[%d]: input %d hit end of time quantum after %" PRIu64
"\n",
output, input->index, input->time_spent_in_quantum);
preempt = true;
need_new_input = true;
input->time_spent_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
if (outputs_[output].in_syscall_code) {
VPRINT(this, 4,
"next_record[%d]: input %d delaying context switch after end of "
"time quantum after %" PRIu64 " due to syscall code\n",
output, input->index, input->time_spent_in_quantum);
} else {
VPRINT(this, 4,
"next_record[%d]: input %d hit end of time quantum after %" PRIu64
"\n",
output, input->index, input->time_spent_in_quantum);
preempt = true;
need_new_input = true;
input->time_spent_in_quantum = 0;
++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS];
}
}
}
// For sched_type_t::DEPENDENCY_TIMESTAMPS: enforcing asked-for
Expand Down Expand Up @@ -574,16 +590,16 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::process_marker(
break;
case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START:
outputs_[output].in_context_switch_code = true;
ANNOTATE_FALLTHROUGH;
break;
case TRACE_MARKER_TYPE_SYSCALL_TRACE_START:
outputs_[output].in_kernel_code = true;
outputs_[output].in_syscall_code = true;
break;
case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END:
// We have to delay until the next record.
outputs_[output].hit_switch_code_end = true;
ANNOTATE_FALLTHROUGH;
break;
case TRACE_MARKER_TYPE_SYSCALL_TRACE_END:
outputs_[output].in_kernel_code = false;
outputs_[output].in_syscall_code = false;
break;
case TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH: {
if (!options_.honor_direct_switches)
Expand Down
2 changes: 1 addition & 1 deletion clients/drcachesim/scheduler/scheduler_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
// This is accessed by other outputs for stealing and rebalancing.
// Indirected so we can store it in our vector.
std::unique_ptr<std::atomic<bool>> active;
bool in_kernel_code = false;
bool in_syscall_code = false;
bool in_context_switch_code = false;
bool hit_switch_code_end = false;
// Used for time-based quanta.
Expand Down
178 changes: 178 additions & 0 deletions clients/drcachesim/tests/scheduler_unit_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,183 @@ test_synthetic()
}
}

// Unit test for scheduler handling of statically-injected kernel syscall
// trace sequences (delimited by TRACE_MARKER_TYPE_SYSCALL_TRACE_START/END):
// the scheduler must not perform a quantum or voluntary context switch in
// the middle of such a sequence, even when the sequence is longer than the
// quantum. Exercises both instruction-based and time-based quanta.
static void
test_synthetic_with_syscall_seq()
{
    std::cerr << "\n----------------\nTesting synthetic with syscall sequences\n";
    static constexpr int NUM_INPUTS = 7;
    static constexpr int NUM_OUTPUTS = 2;
    static constexpr int NUM_INSTRS = 9;
    static constexpr int QUANTUM_DURATION = 3;
    // We do not want to block for very long.
    static constexpr double BLOCK_SCALE = 0.01;
    static constexpr uint64_t BLOCK_THRESHOLD = 100;
    static constexpr memref_tid_t TID_BASE = 100;
    // Kernel-trace instr PCs are offset so they are distinguishable from the
    // user-space instr PCs (which start at 42).
    static constexpr uint64_t KERNEL_CODE_OFFSET = 123456;
    std::vector<trace_entry_t> inputs[NUM_INPUTS];
    for (int i = 0; i < NUM_INPUTS; i++) {
        memref_tid_t tid = TID_BASE + i;
        inputs[i].push_back(make_thread(tid));
        inputs[i].push_back(make_pid(1));
        inputs[i].push_back(make_version(TRACE_ENTRY_VERSION));
        inputs[i].push_back(make_timestamp(10)); // All the same time priority.
        for (int j = 0; j < NUM_INSTRS; j++) {
            inputs[i].push_back(make_instr(42 + j * 4));
            // Test accumulation of usage across voluntary switches.
            // Inputs A and B get a maybe-blocking syscall (latency 100, i.e.,
            // timestamp 120-20, above BLOCK_THRESHOLD scaled expectations)
            // after their 2nd instr.
            if ((i == 0 || i == 1) && j == 1) {
                inputs[i].push_back(make_timestamp(20));
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42));
                inputs[i].push_back(
                    make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0));
                inputs[i].push_back(make_timestamp(120));
            }
            // Test a syscall sequence starting at each offset within a quantum
            // of instrs: input i gets its injected sequence after user instr i,
            // for i in [0, QUANTUM_DURATION].
            if (i <= QUANTUM_DURATION && i == j) {
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 84));
                inputs[i].push_back(
                    make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_START, 84));
                // QUANTUM_DURATION+1 kernel instrs: deliberately longer than
                // one quantum so a preempt point lands inside the sequence.
                for (int k = 0; k <= QUANTUM_DURATION; ++k)
                    inputs[i].push_back(make_instr(KERNEL_CODE_OFFSET + k));
                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL_TRACE_END, 84));
            }
        }
        inputs[i].push_back(make_exit(tid));
    }
    // A has a syscall sequence at [2,5], B has it at [3,6], C has it at [4,7],
    // D has it at [5,8].
    // The syscall sequence consists of 4 instrs which is greater than the
    // #instr quanta.
    // Total instrs in A, B, C, and D are 9 + 4 == 13, others have just 9.

    // Hardcoding here for the 2 outputs and 7 inputs.
    // We make assumptions on the scheduler's initial runqueue assignment
    // being round-robin, resulting in 4 on core0 (odd parity letters) and 3 on
    // core1 (even parity letters).
    // The dots are markers and thread exits.
    // A has a voluntary switch at its 6th instr (1st in that scheduling). Its
    // CPU usage persists to its next scheduling which has only 2 letters.
    // B has a voluntary switch at its 2nd instr, but it doesn't take because a
    // syscall sequence starts just then.
    // Since core0 has an extra input, core1 finishes
    // its runqueue first and then steals G from core0 (migration threshold is 0)
    // and finishes it off.
    static const char *const CORE0_SCHED_STRING =
        "..A..AAAA...CCC..CCCC...EEE..GGGA....CCCEEEGGGAACCC.EEE.AAAAA.";
    static const char *const CORE1_SCHED_STRING =
        "..BB......BBBB...DDD..FFFBBBD..DDDD.FFFBBBDDDFFF.B.DD.GGG.____";
    {
        // Test instruction quanta.
        std::vector<scheduler_t::input_workload_t> sched_inputs;
        for (int i = 0; i < NUM_INPUTS; i++) {
            std::vector<scheduler_t::input_reader_t> readers;
            readers.emplace_back(
                std::unique_ptr<mock_reader_t>(new mock_reader_t(inputs[i])),
                std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_BASE + i);
            sched_inputs.emplace_back(std::move(readers));
        }
        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                   scheduler_t::DEPENDENCY_IGNORE,
                                                   scheduler_t::SCHEDULER_DEFAULTS,
                                                   /*verbosity=*/4);
        sched_ops.quantum_duration_instrs = QUANTUM_DURATION;
        // This was tuned with a 100us threshold: so avoid scheduler.h defaults
        // changes from affecting our output.
        sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
        sched_ops.block_time_multiplier = BLOCK_SCALE;
        sched_ops.time_units_per_us = 1.;
        // Migration is measured in wall-clock-time for instr quanta
        // so avoid non-determinism by having no threshold.
        sched_ops.migration_threshold_us = 0;
        scheduler_t scheduler;
        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
            scheduler_t::STATUS_SUCCESS)
            assert(false);
        std::vector<std::string> sched_as_string =
            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE);
        for (int i = 0; i < NUM_OUTPUTS; i++) {
            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
        }
        // Check scheduler stats. # switches is the # of letter transitions; # preempts
        // is the instances where the same letter appears 3 times without another letter
        // appearing in between (and ignoring the last letter for an input: EOF doesn't
        // count as a preempt). # nops are the instances where the same input is picked
        // to run because nothing else is waiting.
        verify_scheduler_stats(scheduler.get_stream(0), /*switch_input_to_input=*/11,
                               /*switch_input_to_idle=*/0, /*switch_idle_to_input=*/0,
                               /*switch_nop=*/1, /*preempts=*/9, /*direct_attempts=*/0,
                               /*direct_successes=*/0, /*migrations=*/1);
        verify_scheduler_stats(scheduler.get_stream(1), /*switch_input_to_input=*/11,
                               /*switch_input_to_idle=*/1, /*switch_idle_to_input=*/0,
                               /*switch_nop=*/0, /*preempts=*/8, /*direct_attempts=*/0,
                               /*direct_successes=*/0, /*migrations=*/0);
        assert(scheduler.get_stream(0)->get_schedule_statistic(
                   memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 0);
        assert(scheduler.get_stream(1)->get_schedule_statistic(
                   memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS) == 1);
#ifndef WIN32
        // XXX: Windows microseconds on test VMs are very coarse and stay the same
        // for long periods. Instruction quanta use wall-clock idle times, so
        // the result is extreme variations here. We try to adjust by handling
        // any schedule with singleton 'A' and 'B', but in some cases on Windows
        // we see the A and B delayed all the way to the very end where they
        // are adjacent to their own letters. We just give up on checking the
        // precise output for this test on Windows.
        if (sched_as_string[0] != CORE0_SCHED_STRING ||
            sched_as_string[1] != CORE1_SCHED_STRING) {
            // Fallback check: A and B must each appear somewhere as a singleton
            // (their post-voluntary-switch rescheduling), even if wall-clock
            // jitter moved the exact positions.
            bool found_single_A = false, found_single_B = false;
            for (int cpu = 0; cpu < NUM_OUTPUTS; ++cpu) {
                for (size_t i = 1; i < sched_as_string[cpu].size() - 1; ++i) {
                    if (sched_as_string[cpu][i] == 'A' &&
                        sched_as_string[cpu][i - 1] != 'A' &&
                        sched_as_string[cpu][i + 1] != 'A')
                        found_single_A = true;
                    if (sched_as_string[cpu][i] == 'B' &&
                        sched_as_string[cpu][i - 1] != 'B' &&
                        sched_as_string[cpu][i + 1] != 'B')
                        found_single_B = true;
                }
            }
            assert(found_single_A && found_single_B);
        }
#endif
    }
    {
        // Test time quanta.
        std::vector<scheduler_t::input_workload_t> sched_inputs;
        for (int i = 0; i < NUM_INPUTS; i++) {
            std::vector<scheduler_t::input_reader_t> readers;
            readers.emplace_back(
                std::unique_ptr<mock_reader_t>(new mock_reader_t(inputs[i])),
                std::unique_ptr<mock_reader_t>(new mock_reader_t()), TID_BASE + i);
            sched_inputs.emplace_back(std::move(readers));
        }
        scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                   scheduler_t::DEPENDENCY_IGNORE,
                                                   scheduler_t::SCHEDULER_DEFAULTS,
                                                   /*verbosity=*/4);
        sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
        sched_ops.time_units_per_us = 1.;
        // This was tuned with a 100us threshold: so avoid scheduler.h defaults
        // changes from affecting our output.
        sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
        sched_ops.quantum_duration_us = QUANTUM_DURATION;
        sched_ops.block_time_multiplier = BLOCK_SCALE;
        sched_ops.migration_threshold_us = 0;
        scheduler_t scheduler;
        if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
            scheduler_t::STATUS_SUCCESS)
            assert(false);
        // With time_units_per_us == 1 and simulated time supplied in lockstep,
        // the time-quantum schedule is expected to be deterministic and to
        // match the instr-quantum schedule strings exactly.
        std::vector<std::string> sched_as_string =
            run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true);
        for (int i = 0; i < NUM_OUTPUTS; i++) {
            std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
        }
        assert(sched_as_string[0] == CORE0_SCHED_STRING);
        assert(sched_as_string[1] == CORE1_SCHED_STRING);
    }
}

static void
test_synthetic_time_quanta()
{
Expand Down Expand Up @@ -6424,6 +6601,7 @@ test_main(int argc, const char *argv[])
test_only_threads();
test_real_file_queries_and_filters(argv[1]);
test_synthetic();
test_synthetic_with_syscall_seq();
test_synthetic_time_quanta();
test_synthetic_with_timestamps();
test_synthetic_with_priorities();
Expand Down

0 comments on commit 544d58e

Please sign in to comment.