From 278fc510a2adb237cd8983dd2ec112e83e6a7d12 Mon Sep 17 00:00:00 2001
From: Derek Bruening <bruening@google.com>
Date: Tue, 12 Dec 2023 12:39:45 -0500
Subject: [PATCH] i#5843 scheduler: Accumulate quanta across runs (#6502)

Changes the quantum accounting to match the real kernel: an input's
quantum usage now accumulates across executions when a prior execution
was terminated early due to a voluntary context switch.

Adds new tests, and updates existing tests to reflect the behavior change.
Scheduler unit test string changes were carefully vetted. E.g., for
test_synthetic_with_syscalls_multiple(): the output strings changed
because H's quantum accumulates and it hits a preempt in the middle of
its second HH sequence, which decrements B's quantum, causing B to
become available sooner.

Issue: #5843
---
 clients/drcachesim/scheduler/scheduler.cpp    | 40 ++++++++++++-----
 clients/drcachesim/scheduler/scheduler.h      |  3 +-
 .../drcachesim/tests/scheduler_unit_tests.cpp | 45 ++++++++++++++-----
 3 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index 344e0e313a1..393d42a40a9 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -1724,8 +1724,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
     if (prev_input == input)
         return STATUS_OK;
     std::lock_guard<std::mutex> lock(*inputs_[input].lock);
-    inputs_[input].instrs_in_quantum = 0;
-    inputs_[input].start_time_in_quantum = outputs_[output].cur_time;
+    inputs_[input].prev_time_in_quantum = outputs_[output].cur_time;
     if (options_.schedule_record_ostream != nullptr) {
         uint64_t instr_ord = inputs_[input].reader->get_instruction_ordinal();
         if (!inputs_[input].recorded_in_schedule && instr_ord == 1) {
@@ -2104,8 +2103,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
     auto lock = std::unique_lock<std::mutex>(*input->lock);
     // Since we do not ask for a start time, we have to check for the first record from
     // each input and set the time here.
-    if (input->start_time_in_quantum == 0)
-        input->start_time_in_quantum = cur_time;
+    if (input->prev_time_in_quantum == 0)
+        input->prev_time_in_quantum = cur_time;
     if (!outputs_[output].speculation_stack.empty()) {
         outputs_[output].prev_speculate_pc = outputs_[output].speculate_pc;
         error_string_ = outputs_[output].speculator.next_record(
@@ -2170,7 +2169,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                input->index, input->reader->get_instruction_ordinal());
         VDO(this, 5, print_record(record););
         bool need_new_input = false;
+        bool preempt = false;
         double block_time_factor = 0.;
+        uint64_t prev_time_in_quantum = 0;
         if (options_.mapping == MAP_AS_PREVIOUSLY) {
             assert(outputs_[output].record_index >= 0);
             if (outputs_[output].record_index >=
@@ -2275,28 +2276,32 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                     VPRINT(this, 4,
                            "next_record[%d]: input %d hit end of instr quantum\n", output,
                            input->index);
+                    preempt = !need_new_input;
                     need_new_input = true;
+                    input->instrs_in_quantum = 0;
                 }
             } else if (options_.quantum_unit == QUANTUM_TIME) {
-                if (cur_time == 0 || cur_time < input->start_time_in_quantum) {
+                if (cur_time == 0 || cur_time < input->prev_time_in_quantum) {
                     VPRINT(this, 1,
                            "next_record[%d]: invalid time %" PRIu64 " vs start %" PRIu64
                            "\n",
-                           output, cur_time, input->start_time_in_quantum);
+                           output, cur_time, input->prev_time_in_quantum);
                     return sched_type_t::STATUS_INVALID;
                 }
-                if (cur_time - input->start_time_in_quantum >=
-                        options_.quantum_duration &&
+                input->time_spent_in_quantum += cur_time - input->prev_time_in_quantum;
+                prev_time_in_quantum = input->prev_time_in_quantum;
+                input->prev_time_in_quantum = cur_time;
+                if (input->time_spent_in_quantum >= options_.quantum_duration &&
                     // We only switch on instruction boundaries.  We could possibly switch
                     // in between (e.g., scatter/gather long sequence of reads/writes) by
                     // setting input->switching_pre_instruction.
                     record_type_is_instr(record)) {
                     VPRINT(this, 4,
-                           "next_record[%d]: hit end of time quantum after %" PRIu64
-                           " (%" PRIu64 " - %" PRIu64 ")\n",
-                           output, cur_time - input->start_time_in_quantum, cur_time,
-                           input->start_time_in_quantum);
+                           "next_record[%d]: hit end of time quantum after %" PRIu64 "\n",
+                           output, input->time_spent_in_quantum);
+                    preempt = !need_new_input;
                     need_new_input = true;
+                    input->time_spent_in_quantum = 0;
                 }
             }
         }
@@ -2335,6 +2340,15 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                 lock.lock();
                 VPRINT(this, 5, "next_record_mid[%d]: switching from %d to %d\n", output,
                        prev_input, outputs_[output].cur_input);
+                if (!preempt) {
+                    if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
+                        record_type_is_instr(record)) {
+                        --inputs_[prev_input].instrs_in_quantum;
+                    } else if (options_.quantum_unit == QUANTUM_TIME) {
+                        inputs_[prev_input].time_spent_in_quantum -=
+                            (cur_time - prev_time_in_quantum);
+                    }
+                }
                 if (res == sched_type_t::STATUS_WAIT)
                     return res;
                 input = &inputs_[outputs_[output].cur_input];
@@ -2392,6 +2406,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal_t ou
     record = outinfo.last_record;
     input = &inputs_[outinfo.cur_input];
     std::lock_guard<std::mutex> lock(*input->lock);
+    VPRINT(this, 4, "next_record[%d]: unreading last record, from %d\n", output,
+           input->index);
     input->queue.push_back(outinfo.last_record);
     if (options_.quantum_unit == QUANTUM_INSTRUCTIONS && record_type_is_instr(record))
         --input->instrs_in_quantum;
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
index 490afeecc6b..ef218a9d350 100644
--- a/clients/drcachesim/scheduler/scheduler.h
+++ b/clients/drcachesim/scheduler/scheduler.h
@@ -1015,7 +1015,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // Used to switch before we've read the next instruction.
         bool switching_pre_instruction = false;
         // Used for time-based quanta.
-        uint64_t start_time_in_quantum = 0;
+        uint64_t prev_time_in_quantum = 0;
+        uint64_t time_spent_in_quantum = 0;
         // These fields model waiting at a blocking syscall.
         double block_time_factor = 0.;
         uint64_t blocked_start_time = 0; // For QUANTUM_TIME only.
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 445785772a1..d41d2119798 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -898,22 +898,38 @@ test_synthetic()
     static constexpr int NUM_OUTPUTS = 2;
     static constexpr int NUM_INSTRS = 9;
     static constexpr int QUANTUM_DURATION = 3;
+    static constexpr double BLOCK_SCALE = 0.1;
     static constexpr memref_tid_t TID_BASE = 100;
     std::vector<trace_entry_t> inputs[NUM_INPUTS];
     for (int i = 0; i < NUM_INPUTS; i++) {
         memref_tid_t tid = TID_BASE + i;
         inputs[i].push_back(make_thread(tid));
         inputs[i].push_back(make_pid(1));
-        for (int j = 0; j < NUM_INSTRS; j++)
+        inputs[i].push_back(make_version(TRACE_ENTRY_VERSION));
+        inputs[i].push_back(make_timestamp(10)); // All the same time priority.
+        for (int j = 0; j < NUM_INSTRS; j++) {
             inputs[i].push_back(make_instr(42 + j * 4));
+            // Test accumulation of usage across voluntary switches.
+            if ((i == 0 || i == 1) && j == 1) {
+                inputs[i].push_back(make_timestamp(20));
+                inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42));
+                inputs[i].push_back(
+                    make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0));
+                inputs[i].push_back(make_timestamp(120));
+            }
+        }
         inputs[i].push_back(make_exit(tid));
     }
     // Hardcoding here for the 2 outputs and 7 inputs.
     // We expect 3 letter sequences (our quantum) alternating every-other as each
-    // core alternates; with an odd number the 2nd core finishes early.
-    // The dots are thread exits.
-    static const char *const CORE0_SCHED_STRING = "AAACCCEEEGGGBBBDDDFFFAAA.CCC.EEE.GGG.";
-    static const char *const CORE1_SCHED_STRING = "BBBDDDFFFAAACCCEEEGGGBBB.DDD.FFF.____";
+    // core alternates. The dots are markers and thread exits.
+    // A and B have a voluntary switch after their 1st 2 letters, but we expect
+    // the usage to persist to their next scheduling which should only have
+    // a single letter.
+    static const char *const CORE0_SCHED_STRING =
+        "..AA......CCC..EEE..GGGEEEABGGGDDD.AAABBBAAA.___";
+    static const char *const CORE1_SCHED_STRING =
+        "..BB......DDD..FFFCCCDDDFFFCCC.EEE.FFF.GGG.BBB.";
     {
         // Test instruction quanta.
         std::vector<scheduler_t::input_workload_t> sched_inputs;
@@ -929,6 +945,8 @@ test_synthetic()
                                                    scheduler_t::SCHEDULER_DEFAULTS,
                                                    /*verbosity=*/3);
         sched_ops.quantum_duration = QUANTUM_DURATION;
+        // We do not want to block for very long.
+        sched_ops.block_time_scale = BLOCK_SCALE;
         scheduler_t scheduler;
         if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) !=
             scheduler_t::STATUS_SUCCESS)
@@ -957,6 +975,8 @@ test_synthetic()
                                                    /*verbosity=*/3);
         sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
         sched_ops.quantum_duration = QUANTUM_DURATION;
+        // QUANTUM_INSTRUCTIONS divides by the threshold so to match we multiply.
+        sched_ops.block_time_scale = sched_ops.blocking_switch_threshold * BLOCK_SCALE;
         scheduler_t scheduler;
         if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) !=
             scheduler_t::STATUS_SUCCESS)
@@ -1530,9 +1550,9 @@ test_synthetic_with_syscalls_multiple()
     // with the "." in run_lockstep_simulation().  The omitted "." markers also
     // explains why the two strings are different lengths.
     assert(sched_as_string[0] ==
-           "BHHHFFFJJJJJJJBEEHHHIIIFFFAAAHHHBAAAGGGAAABGGG__B___B___B");
+           "BHHHFFFJJJJJJJBEEHHHIIIBIIIEEDDDBAAAEEGGGBDDD___B___B___B___B");
     assert(sched_as_string[1] ==
-           "EECCCIIICCCJJFFFCCCBIIIEEDDDGGGDDDEEDDD____EB__________________________");
+           "EECCCIIICCCJJFFFCCCFFFAAAHHHGGGDDDAAAGGGE__________________________");
 }
 
 static void
@@ -1900,9 +1920,10 @@ test_synthetic_with_syscalls_idle()
     // The timestamps provide the ABCD ordering, but A's blocking syscall after its
     // 2nd instr makes it delayed for 3 full queue cycles of BCD BCD: A's duration
     // of 2 is decremented after the 1st (to 1) and 2nd (to 0) and A is finally
-    // schedulable after the 3rd.
+    // schedulable after the 3rd, when it just gets 1 instruction in before its
+    // (accumulated) count equals the quantum.
     assert(sched_as_string[0] ==
-           "..AA......BB.B..CC.C..DD.DBBBCCCDDDBBBCCCDDDAAABBB.CCC.DDD.AAAAAAA.");
+           "..AA......BB.B..CC.C..DD.DBBBCCCDDDBBBCCCDDDABBB.CCC.DDD.AAAAAAAAA.");
 }
 
 static void
@@ -3285,7 +3306,6 @@ test_inactive()
         // Ensure cpu0 now picks up the input that was on cpu1.
         // This is also the record we un-read earlier.
         check_next(stream0, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
-        check_next(stream0, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
         // End of quantum.
         check_next(stream0, scheduler_t::STATUS_OK, TID_A, TRACE_TYPE_INSTR);
         // Make cpu0 inactive and cpu1 active.
@@ -3296,6 +3316,7 @@ test_inactive()
         assert(status == scheduler_t::STATUS_OK);
         // Now cpu1 should finish things.
         check_next(stream1, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
+        check_next(stream1, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_INSTR);
         check_next(stream1, scheduler_t::STATUS_OK, TID_B, TRACE_TYPE_THREAD_EXIT);
         check_next(stream1, scheduler_t::STATUS_OK, TID_A, TRACE_TYPE_THREAD_EXIT);
         check_next(stream1, scheduler_t::STATUS_EOF);
@@ -3326,8 +3347,8 @@ test_inactive()
         for (int i = 0; i < NUM_OUTPUTS; i++) {
             std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
         }
-        assert(sched_as_string[0] == "..AABBA._");
-        assert(sched_as_string[1] == "..B---B.");
+        assert(sched_as_string[0] == "..AABA.__");
+        assert(sched_as_string[1] == "..B--BB.");
     }
 #endif // HAS_ZIP
 }