From dff98b7a4a121db71dcea8ce271a7d355a888589 Mon Sep 17 00:00:00 2001
From: Derek Bruening <bruening@google.com>
Date: Fri, 4 Oct 2024 14:31:43 -0400
Subject: [PATCH] i#6971: Use instr count instead of wallclock for simulated
 time (#7015)

When using the drmemtrace scheduler in an analyzer or other tool that
does not track simulated time with the default QUANTUM_INSTRUCTIONS, the
scheduler used to use wall-clock time to measure blocking-input and idle
time. Here we change that to use the instruction count plus the idle
count via a new idle counter.

The time_units_per_us and sched_time_units_per_us defaults are set to
1000, reflecting a 2gHz machine with IPC=0.5.
The old time_units_per_us=100 for wall clock was too low; to match it
with counts, we need a low sched_time_units_per_us: 500 is better than
1000, but that seems unrealisitc. Instead we can get the results we want
from our large traces by exiting earlier, since most of the unwanted
idle is still in seemingly unrepresentative regions at the end. We raise
exit_if_fraction_left from 0.05 to 0.1 here.

Using counters provides a more reproducible result across different runs
and machines.

Wall-clock time is still used to measure idle time on replay. Switching
to the idle count added here is left as separate work under #7023.
(Replay also uses wall-clock time to coordinate concurrent outputs
beyond shared input constraints; that will likely always remain.)

The new default values of the options were tested on larger traces and
found to produce a representative level of idle time.

This change means that the clock going backward problem (#6966) is no
longer seen in default runs. The analyzer still supports wall-clock with
the -sched_time option so a check to avoid underflow is added.

Fixes #6971
Fixes #6966
---
 clients/drcachesim/analyzer.cpp               |  2 +-
 clients/drcachesim/common/options.cpp         | 25 ++++++-------
 clients/drcachesim/scheduler/scheduler.cpp    | 35 +++++++++++++------
 clients/drcachesim/scheduler/scheduler.h      | 33 ++++++++++++-----
 .../drcachesim/tests/scheduler_unit_tests.cpp |  4 +--
 5 files changed, 66 insertions(+), 33 deletions(-)
diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp
index a4f860f4ec4..f2c2ddb5098 100644
--- a/clients/drcachesim/analyzer.cpp
+++ b/clients/drcachesim/analyzer.cpp
@@ -524,7 +524,7 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_serial(analyzer_worker_data_t &
     while (true) {
         RecordType record;
         // The current time is used for time quanta; for instr quanta, it's ignored and
-        // we pass 0.
+        // we pass 0 and let the scheduler use instruction + idle counts.
         uint64_t cur_micros = sched_by_time_ ? get_current_microseconds() : 0;
         typename sched_type_t::stream_status_t status =
             worker.stream->next_record(record, cur_micros);
diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp
index 10de4990314..5166a8bcd63 100644
--- a/clients/drcachesim/common/options.cpp
+++ b/clients/drcachesim/common/options.cpp
@@ -908,17 +908,17 @@ droption_t<int64_t>
     // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum.
     op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 10 * 1000 * 1000,
                      "Scheduling quantum",
-                     "Applies to -core_sharded and -core_serial. "
-                     "Scheduling quantum in instructions, unless -sched_time is set in "
-                     "which case this value is multiplied by -sched_time_per_us to "
-                     "produce a quantum in wall-clock microseconds.");
+                     "Applies to -core_sharded and -core_serial.  Scheduling quantum in "
+                     "instructions, unless -sched_time is set in which case this value "
+                     "is the quantum in simulated microseconds (equal to wall-clock "
+                     "microseconds multiplied by -sched_time_per_us).");
 
 droption_t<bool>
     op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false,
                   "Whether to use time for the scheduling quantum",
-                  "Applies to -core_sharded and -core_serial. "
-                  "Whether to use wall-clock time for the scheduling quantum, with a "
-                  "value equal to -sched_quantum in microseconds of wall-clock time.");
+                  "Applies to -core_sharded and -core_serial.  Whether to use wall-clock "
+                  "time (multiplied by -sched_time_per_us) for measuring idle time and "
+                  "for the scheduling quantum (see -sched_quantum).");
 
 droption_t<bool> op_sched_order_time(DROPTION_SCOPE_ALL, "sched_order_time", true,
                                      "Whether to honor recorded timestamps for ordering",
@@ -1016,11 +1016,12 @@ droption_t<bool> op_sched_infinite_timeouts(
     "(set to false).");
 
 droption_t<double> op_sched_time_units_per_us(
-    DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100.,
+    DROPTION_SCOPE_ALL, "sched_time_units_per_us", 1000.,
     "Time units per simulated microsecond",
-    "Time units (currently wall-clock time) per simulated microsecond.  This scales all "
-    "of the -sched_*_us values as it converts wall-clock time into the simulated "
-    "microseconds measured by those options.");
+    "Time units per simulated microsecond.  The units are either the instruction count "
+    "plus the idle count (the default) or if -sched_time is selected wall-clock "
+    "microseconds.  This option value scales all of the -sched_*_us values as it "
+    "converts time units into the simulated microseconds measured by those options.");
 
 droption_t<uint64_t> op_sched_migration_threshold_us(
     DROPTION_SCOPE_ALL, "sched_migration_threshold_us", 500,
@@ -1035,7 +1036,7 @@ droption_t<uint64_t> op_sched_rebalance_period_us(
     "to redistribute load.");
 
 droption_t<double> op_sched_exit_if_fraction_inputs_left(
-    DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.05,
+    DROPTION_SCOPE_FRONTEND, "sched_exit_if_fraction_inputs_left", 0.1,
     "Exit if non-EOF inputs left are <= this fraction of the total",
     "Applies to -core_sharded and -core_serial.  When an input reaches EOF, if the "
     "number of non-EOF inputs left as a fraction of the original inputs is equal to or "
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index 16fb96798bf..072f92c622f 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -2721,8 +2721,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
             if (res->blocked_time > 0 &&
                 // cur_time can be 0 at initialization time.
                 (cur_time == 0 ||
-                 // XXX i#6966: We have seen wall-clock time go backward, which
-                 // underflows here and then always unblocks the input.
+                 // Guard against time going backward (happens for wall-clock: i#6966).
+                 cur_time < res->blocked_start_time ||
                  cur_time - res->blocked_start_time < res->blocked_time)) {
                 VPRINT(this, 4, "pop queue: %d still blocked for %" PRIu64 "\n",
                        res->index,
@@ -2733,6 +2733,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
                 // This input is no longer blocked.
                 res->blocked_time = 0;
                 res->unscheduled = false;
+                VPRINT(this, 4, "pop queue: %d @ %" PRIu64 " no longer blocked\n",
+                       res->index, cur_time);
                 // We've found a candidate.  One final check if this is a migration.
                 bool found_candidate = false;
                 if (from_output == for_output)
@@ -2745,7 +2747,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
                            from_output, for_output, cur_time, res->last_run_time,
                            cur_time - res->last_run_time,
                            options_.migration_threshold_us);
-                    // Guard against time going backward, which happens: i#6966.
+                    // Guard against time going backward (happens for wall-clock: i#6966).
                     if (options_.migration_threshold_us == 0 || res->last_run_time == 0 ||
                         (cur_time > res->last_run_time &&
                          cur_time - res->last_run_time >=
@@ -2771,6 +2773,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue_hold_locks(
     if (res == nullptr && !blocked.empty()) {
         // Do not hand out EOF thinking we're done: we still have inputs blocked
         // on i/o, so just wait and retry.
+        if (for_output != INVALID_OUTPUT_ORDINAL)
+            ++outputs_[for_output].idle_count;
         status = STATUS_IDLE;
     }
     // Re-add the ones we skipped, but without changing their counters so we preserve
@@ -3039,6 +3043,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input_as_previously(
         outputs_[output].waiting = true;
         outputs_[output].wait_start_time = get_output_time(output);
         outputs_[output].record_index->fetch_add(1, std::memory_order_release);
+        ++outputs_[output].idle_count;
         return sched_type_t::STATUS_IDLE;
     }
     index = segment.key.input;
@@ -3729,17 +3734,25 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
     // do return an error on a time smaller than an input's current start time when we
     // check for quantum end.
     if (cur_time == 0) {
-        // It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of
-        // in get_output_time().  This also makes the two more similarly behaved with
-        // respect to blocking system calls.
-        // TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely
-        // with a counter-based time, weighted appropriately for STATUS_IDLE.
-        cur_time = get_time_micros();
+        if (options_.mapping == MAP_AS_PREVIOUSLY) {
+            // XXX i#7023: We should instead store the simulator's time (whether
+            // passed in or our instr-based formula below) in the records and do away
+            // with wall-clock time for idle measurement.  Either way, we should make
+            // it clear in the docs whether the user/simulator has to pass in the
+            // time on replay.
+            cur_time = get_time_micros();
+        } else {
+            // We add 1 to avoid an invalid value of 0.
+            cur_time = 1 + outputs_[output].stream->get_output_instruction_ordinal() +
+                outputs_[output].idle_count;
+        }
     }
     // Invalid values for cur_time are checked below.
     outputs_[output].cur_time->store(cur_time, std::memory_order_release);
-    if (!outputs_[output].active->load(std::memory_order_acquire))
+    if (!outputs_[output].active->load(std::memory_order_acquire)) {
+        ++outputs_[output].idle_count;
         return sched_type_t::STATUS_IDLE;
+    }
     if (outputs_[output].waiting) {
         if (options_.mapping == MAP_AS_PREVIOUSLY &&
             outputs_[output].wait_start_time > 0) {
@@ -3752,6 +3765,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                 VPRINT(this, 4,
                        "next_record[%d]: elapsed %" PRIu64 " < duration %" PRIu64 "\n",
                        output, now - outputs_[output].wait_start_time, duration);
+                // XXX i#7023: This should always be STATUS_IDLE, right?
                 return sched_type_t::STATUS_WAIT;
             } else
                 outputs_[output].wait_start_time = 0;
@@ -4266,6 +4280,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
     if (prev_input != INVALID_INPUT_ORDINAL)
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE];
     set_cur_input(output, INVALID_INPUT_ORDINAL);
+    ++outputs_[output].idle_count;
     return sched_type_t::STATUS_IDLE;
 }
 
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
index 84b0cbe31f1..2ea8b334154 100644
--- a/clients/drcachesim/scheduler/scheduler.h
+++ b/clients/drcachesim/scheduler/scheduler.h
@@ -739,8 +739,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
          * other parameters that are in microseconds (they all end in "_us": e.g.,
          * #quantum_duration_us) so that they operate on the right time scale for the
          * passed-in simulator time (or wall-clock microseconds if no time is passed).
+         * The default value is a rough estimate when no accurate simulated time is
+         * available: the instruction count is used in that case, and we use the
+         * instructions per microsecond for a 2GHz clock at 0.5 IPC as our default.
          */
-        double time_units_per_us = 100.;
+        double time_units_per_us = 1000.;
         /**
          * The scheduling quantum duration for preemption, in simulated microseconds,
          * for #QUANTUM_TIME.  This value is multiplied by #time_units_per_us to
@@ -815,7 +818,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
          * instruction count is not considered (as it is not available), use discretion
          * when raising this value on uneven inputs.
          */
-        double exit_if_fraction_inputs_left = 0.05;
+        double exit_if_fraction_inputs_left = 0.1;
         // When adding new options, also add to print_configuration().
     };
 
@@ -874,7 +877,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // diminished.
         /**
          * Advances to the next record in the stream.  Returns a status code on whether
-         * and how to continue.
+         * and how to continue.  Uses the instruction count plus idle count to this point
+         * as the time; use the variant that takes "cur_time" to instead provide a time.
          */
         virtual stream_status_t
         next_record(RecordType &record);
@@ -882,11 +886,12 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         /**
          * Advances to the next record in the stream.  Returns a status code on whether
          * and how to continue.  Supplies the current time for #QUANTUM_TIME.  The time
-         * should be considered to be the time prior to processing the returned record.
-         * The time is unitless but needs to be a globally consistent increasing value
-         * across all output streams.  #STATUS_INVALID is returned if 0 or a value smaller
-         * than the start time of the current input's quantum is passed in when
-         * #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified.
+         * should be considered to be the simulated time prior to processing the returned
+         * record.  The time's units can be chosen by the caller, with
+         * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.time_units_per_us
+         * providing the conversion to simulated microseconds.  #STATUS_INVALID is
+         * returned if 0 or a value smaller than the start time of the current input's
+         * quantum is passed in when #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified.
          */
         virtual stream_status_t
         next_record(RecordType &record, uint64_t cur_time);
@@ -996,6 +1001,15 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
                 return scheduler_->get_input_stream(ordinal_)->get_instruction_ordinal();
             return cur_instr_count_;
         }
+        /**
+         * Identical to get_instruction_ordinal() but ignores the
+         * #SCHEDULER_USE_INPUT_ORDINALS flag.
+         */
+        uint64_t
+        get_output_instruction_ordinal() const
+        {
+            return cur_instr_count_;
+        }
         /**
          * Returns a name for the current input stream feeding this output stream. For
          * stored offline traces, this is the base name of the trace on disk. For online
@@ -1683,6 +1697,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         // Exported statistics. Currently all integers and cast to double on export.
         std::vector<int64_t> stats =
             std::vector<int64_t>(memtrace_stream_t::SCHED_STAT_TYPE_COUNT);
+        // When no simulation time is passed to us, we use the idle count plus
+        // instruction count to measure time.
+        uint64_t idle_count = 0;
     };
 
     // Used for reading as-traced schedules.
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 21f8fba7a6c..e69d23a05ee 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -2245,7 +2245,7 @@ test_synthetic_with_syscalls_precise()
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                scheduler_t::DEPENDENCY_TIMESTAMPS,
                                                scheduler_t::SCHEDULER_DEFAULTS,
-                                               /*verbosity=*/4);
+                                               /*verbosity=*/3);
     sched_ops.blocking_switch_threshold = BLOCK_THRESHOLD;
     scheduler_t scheduler;
     if (scheduler.init(sched_inputs, 1, std::move(sched_ops)) !=
@@ -2349,7 +2349,7 @@ test_synthetic_with_syscalls_latencies()
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
                                                scheduler_t::DEPENDENCY_TIMESTAMPS,
                                                scheduler_t::SCHEDULER_DEFAULTS,
-                                               /*verbosity=*/4);
+                                               /*verbosity=*/3);
     // We use a mock time for a deterministic result.
     sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
     sched_ops.time_units_per_us = 1.;