diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index c35d4064100..9310abe2b02 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -262,6 +262,7 @@ analyzer_multi_t::init_dynamic_schedule() sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value(); sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value(); sched_ops.block_time_scale = op_sched_block_scale.get_value(); + sched_ops.block_time_max = op_sched_block_max_us.get_value(); #ifdef HAS_ZIP if (!op_record_file.get_value().empty()) { record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value())); diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 74adf921095..52f8660efb6 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -851,11 +851,18 @@ droption_t op_sched_blocking_switch_us( "maybe-blocking to incur a context switch. Applies to -core_sharded and " "-core_serial. "); -droption_t - op_sched_block_scale(DROPTION_SCOPE_ALL, "sched_block_scale", 1., - "Input block time scale factor", - "A higher value here results in blocking syscalls " - "keeping inputs unscheduled for longer."); +droption_t op_sched_block_scale( + DROPTION_SCOPE_ALL, "sched_block_scale", 1000., "Input block time scale factor", + "The scale applied to the microsecond latency of blocking system calls. A higher " + "value here results in blocking syscalls keeping inputs unscheduled for longer. 
" + "This should roughly equal the slowdown of instruction record processing versus the " + "original (untraced) application execution."); + +droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", + 25000000, + "Maximum blocked input time, in microseconds", + "The maximum blocked time, after scaling with " + "-sched_block_scale."); #ifdef HAS_ZIP droption_t op_record_file(DROPTION_SCOPE_FRONTEND, "record_file", "", "Path for storing record of schedule", @@ -875,8 +882,8 @@ droption_t // Schedule_stats options. droption_t - op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", 5000, - "A letter is printed every N instrs", + op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", + 500000, "A letter is printed every N instrs", "A letter is printed every N instrs or N waits"); droption_t op_syscall_template_file( diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 0a3a999ae55..dbf1c57ca47 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -194,6 +194,7 @@ extern dynamorio::droption::droption_t op_sched_order_time; extern dynamorio::droption::droption_t op_sched_syscall_switch_us; extern dynamorio::droption::droption_t op_sched_blocking_switch_us; extern dynamorio::droption::droption_t op_sched_block_scale; +extern dynamorio::droption::droption_t op_sched_block_max_us; #ifdef HAS_ZIP extern dynamorio::droption::droption_t op_record_file; extern dynamorio::droption::droption_t op_replay_file; diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 393d42a40a9..a75fd13eba2 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -1576,10 +1576,12 @@ scheduler_tmpl_t::add_to_ready_queue(input_info_t *input VPRINT( this, 4, "add_to_ready_queue (pre-size %zu): input %d priority %d timestamp delta %" PRIu64 - " block factor 
%3.2f time %" PRIu64 "\n", + " block time %" PRIu64 " start time %" PRIu64 "\n", ready_priority_.size(), input->index, input->priority, - input->reader->get_last_timestamp() - input->base_timestamp, - input->block_time_factor, input->blocked_start_time); + input->reader->get_last_timestamp() - input->base_timestamp, input->blocked_time, + input->blocked_start_time); + if (input->blocked_time > 0) + ++num_blocked_; input->queue_counter = ++ready_counter_; ready_priority_.push(input); } @@ -1593,6 +1595,7 @@ scheduler_tmpl_t::pop_from_ready_queue( std::set blocked; input_info_t *res = nullptr; sched_type_t::stream_status_t status = STATUS_OK; + uint64_t cur_time = (num_blocked_ > 0) ? get_output_time(for_output) : 0; while (!ready_priority_.empty()) { res = ready_priority_.top(); ready_priority_.pop(); @@ -1600,33 +1603,19 @@ scheduler_tmpl_t::pop_from_ready_queue( // For blocked inputs, as we don't have interrupts or other regular // control points we only check for being unblocked when an input // would be chosen to run. We thus keep blocked inputs in the ready queue. - if (options_.quantum_unit == QUANTUM_TIME) { - if (res->block_time_factor > 0 && - outputs_[for_output].cur_time - res->blocked_start_time < - options_.block_time_scale * res->block_time_factor) { - VPRINT( - this, 4, - "pop queue: %d still blocked for %4.1f (%3.2f * %3.2f - (%" PRIu64 - " - %" PRIu64 ")\n", - res->index, - options_.block_time_scale * res->block_time_factor - - (outputs_[for_output].cur_time - res->blocked_start_time), - options_.block_time_scale, res->block_time_factor, - outputs_[for_output].cur_time, res->blocked_start_time); - // We keep searching for a suitable input. - blocked.insert(res); - } else - break; - } else { - if (res->block_time_factor > 0) { - VPRINT(this, 4, "pop queue: %d still blocked for %4.1f\n", res->index, - res->block_time_factor); - --res->block_time_factor; - // We keep searching for a suitable input. 
- blocked.insert(res); - } else - break; + if (res->blocked_time > 0) { + assert(cur_time > 0); + --num_blocked_; } + if (res->blocked_time > 0 && + cur_time - res->blocked_start_time < res->blocked_time) { + VPRINT(this, 4, "pop queue: %d still blocked for %" PRIu64 "\n", + res->index, + res->blocked_time - (cur_time - res->blocked_start_time)); + // We keep searching for a suitable input. + blocked.insert(res); + } else + break; } else { // We keep searching for a suitable input. skipped.insert(res); @@ -1645,13 +1634,21 @@ scheduler_tmpl_t::pop_from_ready_queue( // Re-add the blocked ones to the back. for (input_info_t *save : blocked) add_to_ready_queue(save); + VDO(this, 1, { + static int heartbeat; + if (++heartbeat % 500 == 0) { + VPRINT(this, 1, "heartbeat[%d] %zd in queue; %d blocked => %d %d\n", + for_output, ready_priority_.size(), num_blocked_, + res == nullptr ? -1 : res->index, status); + } + }); if (res != nullptr) { VPRINT(this, 4, "pop_from_ready_queue[%d] (post-size %zu): input %d priority %d timestamp " "delta %" PRIu64 "\n", for_output, ready_priority_.size(), res->index, res->priority, res->reader->get_last_timestamp() - res->base_timestamp); - res->block_time_factor = 0.; + res->blocked_time = 0; } new_input = res; return status; @@ -1660,7 +1657,7 @@ scheduler_tmpl_t::pop_from_ready_queue( template bool scheduler_tmpl_t::syscall_incurs_switch(input_info_t *input, - double &block_time_factor) + uint64_t &blocked_time) { uint64_t post_time = input->reader->get_last_timestamp(); assert(input->processing_syscall || input->processing_maybe_blocking_syscall); @@ -1668,7 +1665,7 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in // This is a legacy trace that does not have timestamps bracketing syscalls. // We switch on every maybe-blocking syscall in this case and have a simplified // blocking model. 
- block_time_factor = 1.; + blocked_time = options_.blocking_switch_threshold; return input->processing_maybe_blocking_syscall; } assert(input->pre_syscall_timestamp > 0); @@ -1677,13 +1674,22 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in uint64_t threshold = input->processing_maybe_blocking_syscall ? options_.blocking_switch_threshold : options_.syscall_switch_threshold; - block_time_factor = static_cast(latency) / threshold; - if (options_.quantum_unit == QUANTUM_INSTRUCTIONS) - block_time_factor *= options_.block_time_scale; - VPRINT(this, 3, "input %d %ssyscall latency %" PRIu64 " => factor %4.1f\n", + blocked_time = + static_cast(static_cast(latency) * options_.block_time_scale); + if (blocked_time > options_.block_time_max) { + // We have a max to avoid outlier latencies that are already a second or + // more from scaling up to tens of minutes. We assume a cap is representative + // as the outliers likely were not part of key dependence chains. Without a + // cap the other threads all finish and the simulation waits for tens of + // minutes further for a couple of outliers. + blocked_time = options_.block_time_max; + } + VPRINT(this, 3, + "input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64 + "\n", input->index, input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency, - block_time_factor); + options_.block_time_scale, blocked_time); return latency >= threshold; } @@ -1877,7 +1883,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::pick_next_input(output_ordinal_t output, - double block_time_factor) + uint64_t blocked_time) { sched_type_t::stream_status_t res = sched_type_t::STATUS_OK; bool need_lock = @@ -1917,14 +1923,13 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu if (res != sched_type_t::STATUS_OK) return res; } else if (options_.mapping == MAP_TO_ANY_OUTPUT) { - if (block_time_factor > 0. 
&& prev_index != INVALID_INPUT_ORDINAL) { + if (blocked_time > 0 && prev_index != INVALID_INPUT_ORDINAL) { std::lock_guard lock(*inputs_[prev_index].lock); - if (inputs_[prev_index].block_time_factor == 0.) { - VPRINT(this, 2, "next_record[%d]: block time factor %3.2f\n", - output, block_time_factor); - inputs_[prev_index].block_time_factor = block_time_factor; - inputs_[prev_index].blocked_start_time = - outputs_[output].cur_time; + if (inputs_[prev_index].blocked_time == 0) { + VPRINT(this, 2, "next_record[%d]: blocked time %" PRIu64 "\n", + output, blocked_time); + inputs_[prev_index].blocked_time = blocked_time; + inputs_[prev_index].blocked_start_time = get_output_time(output); } else { // If we looped we could have the same prev_index. assert(iters > 1); @@ -1942,13 +1947,13 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu ready_priority_.erase(target); index = target->index; // Erase any remaining wait time for the target. - if (target->block_time_factor != 0.) { + if (target->blocked_time > 0) { VPRINT(this, 3, - "next_record[%d]: direct switch erasing block time " - "factor " - "%4.1f for input %d\n", - output, target->block_time_factor, target->index); - target->block_time_factor = 0.; + "next_record[%d]: direct switch erasing blocked time " + "for input %d\n", + output, target->index); + --num_blocked_; + target->blocked_time = 0; } } else { // TODO i#5843: If the target is running on another output, we @@ -1965,13 +1970,14 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } if (index != INVALID_INPUT_ORDINAL) { // We found a direct switch target above. - } else if (ready_queue_empty() && block_time_factor == 0.) 
{ + } else if (ready_queue_empty() && blocked_time == 0) { if (prev_index == INVALID_INPUT_ORDINAL) return eof_or_idle(output); - std::lock_guard lock(*inputs_[prev_index].lock); - if (inputs_[prev_index].at_eof) + auto lock = std::unique_lock(*inputs_[prev_index].lock); + if (inputs_[prev_index].at_eof) { + lock.unlock(); return eof_or_idle(output); - else + } else index = prev_index; // Go back to prior. } else { // Give up the input before we go to the queue so we can add @@ -1997,8 +2003,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu return status; } if (queue_next == nullptr) { - assert(block_time_factor == 0. || - prev_index == INVALID_INPUT_ORDINAL); + assert(blocked_time == 0 || prev_index == INVALID_INPUT_ORDINAL); return eof_or_idle(output); } index = queue_next->index; @@ -2070,6 +2075,12 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // We do not enforce a globally increasing time to avoid the synchronization cost; we // do return an error on a time smaller than an input's current start time when we // check for quantum end. + if (cur_time == 0) { + // It's more efficient for QUANTUM_TIME to get the time here instead of + // in get_output_time(). This also makes the two more similarly behaved + // with respect to blocking system calls. + cur_time = get_time_micros(); + } outputs_[output].cur_time = cur_time; // Invalid values are checked below. 
if (!outputs_[output].active) return sched_type_t::STATUS_IDLE; @@ -2170,7 +2181,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, VDO(this, 5, print_record(record);); bool need_new_input = false; bool preempt = false; - double block_time_factor = 0.; + uint64_t blocked_time = 0; uint64_t prev_time_in_quantum = 0; if (options_.mapping == MAP_AS_PREVIOUSLY) { assert(outputs_[output].record_index >= 0); @@ -2235,7 +2246,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->switch_to_input = it->second; } } else if (record_type_is_instr(record)) { - if (syscall_incurs_switch(input, block_time_factor)) { + if (syscall_incurs_switch(input, blocked_time)) { // Model as blocking and should switch to a different input. need_new_input = true; VPRINT(this, 3, @@ -2323,8 +2334,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, VPRINT(this, 5, "next_record[%d]: queuing candidate record\n", output); input->queue.push_back(record); lock.unlock(); - sched_type_t::stream_status_t res = - pick_next_input(output, block_time_factor); + sched_type_t::stream_status_t res = pick_next_input(output, blocked_time); if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_WAIT && res != sched_type_t::STATUS_SKIPPED) return res; @@ -2385,7 +2395,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, } break; } - VPRINT(this, 4, "next_record[%d]: from %d: ", output, input->index); + VPRINT(this, 4, "next_record[%d]: from %d @%" PRId64 ": ", output, input->index, + cur_time); VDO(this, 4, print_record(record);); outputs_[output].last_record = record; @@ -2493,6 +2504,7 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); return sched_type_t::STATUS_EOF; } else { + set_cur_input(output, INVALID_INPUT_ORDINAL); outputs_[output].waiting = true; return sched_type_t::STATUS_IDLE; } diff --git a/clients/drcachesim/scheduler/scheduler.h 
b/clients/drcachesim/scheduler/scheduler.h index ef218a9d350..b428480d654 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -524,16 +524,27 @@ template class scheduler_tmpl_t { uint64_t blocking_switch_threshold = 100; /** * Controls the amount of time inputs are considered blocked at a syscall whose - * latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. A - * "block time factor" is computed from the syscall latency divided by either - * #syscall_switch_threshold or #blocking_switch_threshold. This factor is - * multiplied by this field #block_time_scale to produce a final value. For - * #QUANTUM_TIME, that final value's amount of time, as reported by the time - * parameter to next_record(), must pass before the input is no longer considered - * blocked. For instruction quanta, that final value's count of scheduler - * selections must occur before the input is actually selected. + * latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. The + * syscall latency (in microseconds) is multiplied by this field to produce the + * blocked time. For #QUANTUM_TIME, that blocked time in the units reported by + * the time parameter to next_record() must pass before the input is no longer + * considered blocked. Since the system call latencies are in microseconds, this + * #block_time_scale should be set to the number of next_record() time units in + * one simulated microsecond. For #QUANTUM_INSTRUCTIONS, the blocked time in + * wall-clock microseconds must pass before the input is actually selected + * (wall-clock time is used as there is no reasonable alternative with no other + * uniform notion of time); thus, the #block_time_scale value here should equal + * the slowdown of the instruction record processing versus the original + * (untraced) application execution. The blocked time is clamped to a maximum + * value controlled by #block_time_max. 
*/ - double block_time_scale = 1.; + double block_time_scale = 1000.; + /** + * The maximum time, in microseconds, for an input to be considered blocked + * for any one system call. This is applied after multiplying by + * #block_time_scale. + */ + uint64_t block_time_max = 25000000; }; /** @@ -1018,8 +1029,9 @@ template class scheduler_tmpl_t { uint64_t prev_time_in_quantum = 0; uint64_t time_spent_in_quantum = 0; // These fields model waiting at a blocking syscall. - double block_time_factor = 0.; - uint64_t blocked_start_time = 0; // For QUANTUM_TIME only. + // The units are us for instr quanta and simulation time for time quanta. + uint64_t blocked_time = 0; + uint64_t blocked_start_time = 0; }; // Format for recording a schedule to disk. A separate sequence of these records @@ -1232,7 +1244,7 @@ template class scheduler_tmpl_t { // Finds the next input stream for the 'output_ordinal'-th output stream. // No input_info_t lock can be held on entry. stream_status_t - pick_next_input(output_ordinal_t output, double block_time_factor); + pick_next_input(output_ordinal_t output, uint64_t blocked_time); // Helper for pick_next_input() for MAP_AS_PREVIOUSLY. // No input_info_t lock can be held on entry. @@ -1354,7 +1366,7 @@ template class scheduler_tmpl_t { // The input's lock must be held by the caller. // Returns a multiplier for how long the input should be considered blocked. bool - syscall_incurs_switch(input_info_t *input, double &block_time_factor); + syscall_incurs_switch(input_info_t *input, uint64_t &blocked_time); // sched_lock_ must be held by the caller. // "for_output" is which output stream is looking for a new input; only an @@ -1385,6 +1397,8 @@ template class scheduler_tmpl_t { // timestamp in each workload in order to mix inputs from different workloads in the // same queue. FIFO ordering is used for same-priority entries. flexible_queue_t ready_priority_; + // Tracks the count of blocked inputs. Protected by sched_lock_. 
+ int num_blocked_ = 0; // Global ready queue counter used to provide FIFO for same-priority inputs. uint64_t ready_counter_ = 0; // Count of inputs not yet at eof. diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex index 8eeada43d63..c5c98e0e9c0 100644 --- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex +++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex @@ -3,10 +3,10 @@ Total counts: 4 cores 8 threads 638938 instructions - 5 total context switches - 0\.0078255 CSPKI \(context switches per 1000 instructions\) - 127788 instructions per context switch - 5 voluntary context switches + 6 total context switches + 0\.0093906 CSPKI \(context switches per 1000 instructions\) + 106490 instructions per context switch + 6 voluntary context switches 0 direct context switches 100\.00% voluntary switches 0\.00% direct switches @@ -15,7 +15,12 @@ Total counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #0 counts: . threads *[0-9]* instructions @@ -31,7 +36,12 @@ Core #0 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #1 counts: . 
threads *[0-9]* instructions @@ -47,7 +57,12 @@ Core #1 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #2 counts: . threads *[0-9]* instructions @@ -63,7 +78,12 @@ Core #2 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #3 counts: . threads *[0-9]* instructions @@ -79,7 +99,12 @@ Core #3 counts: 0 direct switch requests 0 waits *[0-9]* idles - *[0-9\.]*% cpu busy + *[0-9\.]*% cpu busy by record count + *[0-9]* cpu microseconds + *[0-9]* idle microseconds + *[0-9]* idle microseconds at last instr + *[0-9\.]*% cpu busy by time + *[0-9\.]*% cpu busy by time, ignoring idle past last instr Core #0 schedule: [A-Ha-h_]* Core #1 schedule: [A-Ha-h_]* Core #2 schedule: [A-Ha-h_]* diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp index 48177635375..86e22894ce1 100644 --- a/clients/drcachesim/tests/schedule_stats_test.cpp +++ b/clients/drcachesim/tests/schedule_stats_test.cpp @@ -196,6 +196,8 @@ test_basic_stats() assert(result.maybe_blocking_syscalls == 3); assert(result.direct_switch_requests == 2); assert(result.waits == 3); + assert(result.idle_microseconds == 0); + assert(result.cpu_microseconds > 0); return true; } @@ -216,6 +218,7 @@ test_idle() gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_B, TRACE_MARKER_TYPE_CORE_IDLE, 0), + // A switch from idle w/ no syscall is an involuntary switch. 
gen_instr(TID_B), gen_instr(TID_B), gen_instr(TID_B), @@ -229,6 +232,7 @@ test_idle() gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_IDLE, 0), + // A switch from idle w/ no syscall is an involuntary switch. gen_instr(TID_C), gen_instr(TID_C), // Wait. @@ -243,7 +247,7 @@ test_idle() }; auto result = run_schedule_stats(memrefs, tid2ord); assert(result.instrs == 13); - assert(result.total_switches == 3); + assert(result.total_switches == 5); assert(result.voluntary_switches == 0); assert(result.direct_switches == 0); assert(result.syscalls == 0); @@ -251,6 +255,11 @@ test_idle() assert(result.direct_switch_requests == 0); assert(result.waits == 3); assert(result.idles == 6); + // It is hard to test wall-clock time precise values so we have sanity checks. + assert(result.idle_microseconds > 0); + assert(result.idle_micros_at_last_instr > 0 && + result.idle_micros_at_last_instr <= result.idle_microseconds); + assert(result.cpu_microseconds > 0); return true; } diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index d41d2119798..b8155671696 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -815,6 +815,7 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti for (int i = 0; i < num_outputs; i++) outputs[i] = scheduler.get_stream(i); int num_eof = 0; + int64_t meta_records = 0; // Record the threads, one char each. std::vector sched_as_string(num_outputs); static constexpr char THREAD_LETTER_START = 'A'; @@ -833,8 +834,9 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti // instruction quanta. This is a per-output time which technically // violates the globally-increasing requirement, so this will not work // perfectly with i/o waits, but should work fine for basic tests. 
+ // We add the wait and idle records to make progress with idle time. status = outputs[i]->next_record( - memref, outputs[i]->get_instruction_ordinal() + 1); + memref, outputs[i]->get_instruction_ordinal() + 1 + meta_records); } else { status = outputs[i]->next_record(memref); } @@ -845,10 +847,12 @@ run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t ti } if (status == scheduler_t::STATUS_WAIT) { sched_as_string[i] += WAIT_SYMBOL; + ++meta_records; continue; } if (status == scheduler_t::STATUS_IDLE) { sched_as_string[i] += IDLE_SYMBOL; + ++meta_records; continue; } assert(status == scheduler_t::STATUS_OK); @@ -898,7 +902,8 @@ test_synthetic() static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INSTRS = 9; static constexpr int QUANTUM_DURATION = 3; - static constexpr double BLOCK_SCALE = 0.1; + // We do not want to block for very long. + static constexpr double BLOCK_SCALE = 0.01; static constexpr memref_tid_t TID_BASE = 100; std::vector inputs[NUM_INPUTS]; for (int i = 0; i < NUM_INPUTS; i++) { @@ -927,9 +932,10 @@ test_synthetic() // the usage to persist to their next scheduling which should only have // a single letter. static const char *const CORE0_SCHED_STRING = - "..AA......CCC..EEE..GGGEEEABGGGDDD.AAABBBAAA.___"; + "..AA......CCC..EEE..GGGDDDFFFBBBCCC.EEE.AAA.GGG."; static const char *const CORE1_SCHED_STRING = - "..BB......DDD..FFFCCCDDDFFFCCC.EEE.FFF.GGG.BBB."; + "..BB......DDD..FFFABCCCEEEAAAGGGDDD.FFF.BBB.____"; + { // Test instruction quanta. std::vector sched_inputs; @@ -945,7 +951,6 @@ test_synthetic() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = QUANTUM_DURATION; - // We do not want to block for very long. 
sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != @@ -975,8 +980,7 @@ test_synthetic() /*verbosity=*/3); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.quantum_duration = QUANTUM_DURATION; - // QUANTUM_INSTRUCTIONS divides by the threshold so to match we multiply. - sched_ops.block_time_scale = sched_ops.blocking_switch_threshold * BLOCK_SCALE; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) @@ -1002,6 +1006,8 @@ test_synthetic_time_quanta() static constexpr memref_tid_t TID_C = TID_A + 2; static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INPUTS = 3; + static constexpr int PRE_BLOCK_TIME = 20; + static constexpr int POST_BLOCK_TIME = 220; std::vector refs[NUM_INPUTS]; for (int i = 0; i < NUM_INPUTS; ++i) { refs[i].push_back(make_thread(TID_BASE + i)); @@ -1011,10 +1017,10 @@ test_synthetic_time_quanta() refs[i].push_back(make_instr(10)); refs[i].push_back(make_instr(30)); if (i == 0) { - refs[i].push_back(make_timestamp(20)); + refs[i].push_back(make_timestamp(PRE_BLOCK_TIME)); refs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); refs[i].push_back(make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - refs[i].push_back(make_timestamp(220)); + refs[i].push_back(make_timestamp(POST_BLOCK_TIME)); } refs[i].push_back(make_instr(50)); refs[i].push_back(make_exit(TID_BASE + i)); @@ -1037,7 +1043,8 @@ test_synthetic_time_quanta() /*verbosity=*/4); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.quantum_duration = 3; - sched_ops.block_time_scale = 5.; // Ensure it waits a while. + // Ensure it waits 10 steps. + sched_ops.block_time_scale = 10. 
/ (POST_BLOCK_TIME - PRE_BLOCK_TIME); zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != @@ -1454,6 +1461,8 @@ test_synthetic_with_syscalls_multiple() static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INSTRS = 9; static constexpr memref_tid_t TID_BASE = 100; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); std::vector sched_inputs; auto get_tid = [&](int workload_idx, int input_idx) { return TID_BASE + workload_idx * NUM_INPUTS_PER_WORKLOAD + input_idx; @@ -1485,8 +1494,8 @@ test_synthetic_with_syscalls_multiple() inputs.push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); inputs.push_back( make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - // Blocked for 5 100-threshold blocking-syscall units. - inputs.push_back(make_timestamp(stamp + 550)); + // Blocked for 10 time units with our BLOCK_SCALE. + inputs.push_back(make_timestamp(stamp + 10 + 10 * BLOCK_LATENCY)); } else { // Insert meta records to keep the locksteps lined up. inputs.push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, 0)); @@ -1529,13 +1538,17 @@ test_synthetic_with_syscalls_multiple() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = 3; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); // We omit the "." marker chars to keep the strings short enough to be readable. 
std::vector sched_as_string = run_lockstep_simulation( - scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/false, /*print_markers=*/false); + scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true, /*print_markers=*/false); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } @@ -1550,9 +1563,8 @@ test_synthetic_with_syscalls_multiple() // with the "." in run_lockstep_simulation(). The omitted "." markers also // explains why the two strings are different lengths. assert(sched_as_string[0] == - "BHHHFFFJJJJJJJBEEHHHIIIBIIIEEDDDBAAAEEGGGBDDD___B___B___B___B"); - assert(sched_as_string[1] == - "EECCCIIICCCJJFFFCCCFFFAAAHHHGGGDDDAAAGGGE__________________________"); + "BHHHFFFJJJJJJBEEHHHFFFBCCCEEIIIDDDBAAAGGGDDDB________B_______"); + assert(sched_as_string[1] == "EECCCIIICCCBEEJJJHHHIIIFFFEAAAGGGBDDDAAAGGG________B"); } static void @@ -1567,6 +1579,8 @@ test_synthetic_with_syscalls_single() static constexpr int NUM_OUTPUTS = 2; static constexpr int NUM_INSTRS = 9; static constexpr memref_tid_t TID_BASE = 100; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); std::vector sched_inputs; auto get_tid = [&](int workload_idx, int input_idx) { return TID_BASE + workload_idx * NUM_INPUTS_PER_WORKLOAD + input_idx; @@ -1591,15 +1605,14 @@ test_synthetic_with_syscalls_single() inputs.push_back(make_timestamp(stamp)); } inputs.push_back(make_instr(42 + instr_idx * 4)); - // Insert some blocking syscalls in the high-priority (see below) - // middle threads. + // Insert some blocking syscalls. if (instr_idx % 3 == 1) { inputs.push_back(make_timestamp(stamp + 10)); inputs.push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); inputs.push_back( make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - // Blocked for 3 100-threshold blocking-syscall units. - inputs.push_back(make_timestamp(stamp + 350)); + // Blocked for 3 time units. 
+ inputs.push_back(make_timestamp(stamp + 10 + 3 * BLOCK_LATENCY)); } else { // Insert meta records to keep the locksteps lined up. inputs.push_back(make_marker(TRACE_MARKER_TYPE_CPU_ID, 0)); @@ -1621,20 +1634,22 @@ test_synthetic_with_syscalls_single() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); sched_ops.quantum_duration = 3; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); std::vector sched_as_string = - run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // We expect an idle CPU every 3 instrs but starting at the 2nd (1-based % 3). - assert(sched_as_string[0] == - "..A....A....__A....A.....A....__A.....A....A....__A....."); - assert(sched_as_string[1] == - "________________________________________________________"); + assert(sched_as_string[0] == "..A....A....___________________________________A....."); + assert(sched_as_string[1] == "____________A....A.....A....__A.....A....A...._______"); } static bool @@ -1756,6 +1771,8 @@ test_synthetic_with_syscalls_latencies() static constexpr memref_tid_t TID_A = 42; static constexpr memref_tid_t TID_B = 99; static constexpr int SYSNUM = 202; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. 
/ (BLOCK_LATENCY); std::vector refs_A = { /* clang-format off */ make_thread(TID_A), @@ -1779,7 +1796,7 @@ test_synthetic_with_syscalls_latencies() make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), make_marker(TRACE_MARKER_TYPE_FUNC_ID, 100), make_marker(TRACE_MARKER_TYPE_FUNC_ARG, 42), - make_timestamp(1100), + make_timestamp(1000 + BLOCK_LATENCY), make_marker(TRACE_MARKER_TYPE_CPU_ID, 1), make_marker(TRACE_MARKER_TYPE_FUNC_ID, 100), make_marker(TRACE_MARKER_TYPE_FUNC_RETVAL, 0), @@ -1809,14 +1826,20 @@ test_synthetic_with_syscalls_latencies() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); + // We use a mock time for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, 1, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); auto *stream = scheduler.get_stream(0); memref_t memref; std::vector refs; - for (scheduler_t::stream_status_t status = stream->next_record(memref); - status != scheduler_t::STATUS_EOF; status = stream->next_record(memref)) { + int step = 0; + for (scheduler_t::stream_status_t status = stream->next_record(memref, ++step); + status != scheduler_t::STATUS_EOF; + status = stream->next_record(memref, ++step)) { if (status == scheduler_t::STATUS_WAIT) continue; assert(status == scheduler_t::STATUS_OK); @@ -1867,6 +1890,9 @@ test_synthetic_with_syscalls_idle() static constexpr int NUM_OUTPUTS = 1; static constexpr int NUM_INSTRS = 12; static constexpr memref_tid_t TID_BASE = 100; + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. 
/ (BLOCK_LATENCY); + static constexpr int BLOCK_UNITS = 27; std::vector sched_inputs; std::vector readers; for (int input_idx = 0; input_idx < NUM_INPUTS; input_idx++) { @@ -1886,15 +1912,17 @@ test_synthetic_with_syscalls_idle() inputs.push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); inputs.push_back( make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); - // Blocked for 2 100-threshold blocking-syscall units, but + // Blocked for BLOCK_UNITS time units with BLOCK_SCALE, but // after each queue rejection it should go to the back of // the queue and all the other inputs should be selected // before another retry. - inputs.push_back(make_timestamp(stamp + 210)); + inputs.push_back( + make_timestamp(stamp + 10 + BLOCK_UNITS * BLOCK_LATENCY)); } else { // Insert a timestamp to match the blocked input so the inputs // are all at equal priority in the queue. - inputs.push_back(make_timestamp(stamp + 210)); + inputs.push_back( + make_timestamp(stamp + 10 + BLOCK_UNITS * BLOCK_LATENCY)); } } } @@ -1908,20 +1936,23 @@ test_synthetic_with_syscalls_idle() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = 3; + // We use a mock time for a deterministic result. 
+ sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); std::vector sched_as_string = - run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } // The timestamps provide the ABCD ordering, but A's blocking syscall after its - // 2nd instr makes it delayed for 3 full queue cycles of BCD BCD: A's duration - // of 2 is decremented after the 1st (to 1) and 2nd (to 0) and A is finally - // schedulable after the 3rd, when it just gets 1 instruction in before its - // (accumulated) count equals the quantum. + // 2nd instr makes it delayed for 3 full queue cycles of BBBCCCDDD (27 instrs, + // which is BLOCK_UNITS): A's is finally schedulable after the 3rd, when it just gets + // 1 instruction in before its (accumulated) count equals the quantum. assert(sched_as_string[0] == "..AA......BB.B..CC.C..DD.DBBBCCCDDDBBBCCCDDDABBB.CCC.DDD.AAAAAAAAA."); } @@ -2191,7 +2222,7 @@ test_replay() scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, - /*verbosity=*/2); + /*verbosity=*/3); sched_ops.quantum_duration = QUANTUM_INSTRS; zipfile_ostream_t outfile(record_fname); @@ -3360,6 +3391,8 @@ test_direct_switch() // We have just 1 output to better control the order and avoid flakiness. static constexpr int NUM_OUTPUTS = 1; static constexpr int QUANTUM_DURATION = 100; // Never reached. + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. 
/ (BLOCK_LATENCY); static constexpr memref_tid_t TID_BASE = 100; static constexpr memref_tid_t TID_A = TID_BASE + 0; static constexpr memref_tid_t TID_B = TID_BASE + 1; @@ -3428,7 +3461,7 @@ test_direct_switch() // has significant blocked time left. But then after B is scheduled and finishes, // we still have to wait for C's block time so we see idle underscores: static const char *const CORE0_SCHED_STRING = - "...AA.........CC......A....BBBB.____________________C..."; + "...AA.........CC......A....BBBB.______________C..."; std::vector sched_inputs; sched_inputs.emplace_back(std::move(readers)); @@ -3437,12 +3470,16 @@ test_direct_switch() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_duration = QUANTUM_DURATION; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, sched_ops) != scheduler_t::STATUS_SUCCESS) assert(false); std::vector sched_as_string = - run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE); + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); for (int i = 0; i < NUM_OUTPUTS; i++) { std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; } @@ -3457,7 +3494,6 @@ test_main(int argc, const char *argv[]) // Avoid races with lazy drdecode init (b/279350357). 
dr_standalone_init(); - test_direct_switch(); test_serial(); test_parallel(); test_param_checks(); diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp index 67bfe69b0e5..f73fbcd874b 100644 --- a/clients/drcachesim/tools/schedule_stats.cpp +++ b/clients/drcachesim/tools/schedule_stats.cpp @@ -34,6 +34,13 @@ #include "schedule_stats.h" +#ifdef WINDOWS +# define WIN32_LEAN_AND_MEAN +# include +#else +# include +#endif + #include #include @@ -123,6 +130,14 @@ bool schedule_stats_t::parallel_shard_exit(void *shard_data) { // Nothing (we read the shard data in print_results). + per_shard_t *shard = reinterpret_cast(shard_data); + if (shard->prev_was_idle) { + shard->counters.idle_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } else if (!shard->prev_was_wait) { + shard->counters.cpu_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } return true; } @@ -133,6 +148,25 @@ schedule_stats_t::parallel_shard_error(void *shard_data) return per_shard->error; } +uint64_t +schedule_stats_t::get_current_microseconds() +{ +#ifdef UNIX + struct timeval time; + if (gettimeofday(&time, nullptr) != 0) + return 0; + return time.tv_sec * 1000000 + time.tv_usec; +#else + SYSTEMTIME sys_time; + GetSystemTime(&sys_time); + FILETIME file_time; + if (!SystemTimeToFileTime(&sys_time, &file_time)) + return 0; + return file_time.dwLowDateTime + + (static_cast(file_time.dwHighDateTime) << 32); +#endif +} + bool schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref) { @@ -163,8 +197,10 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref // Cache and reset here to ensure we reset on early return paths. 
bool was_wait = shard->prev_was_wait; bool was_idle = shard->prev_was_idle; + int64_t prev_input = shard->prev_input; shard->prev_was_wait = false; shard->prev_was_idle = false; + shard->prev_input = -1; if (memref.marker.type == TRACE_TYPE_MARKER && memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) { ++shard->counters.waits; @@ -187,6 +223,11 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref if (!was_idle) { shard->thread_sequence += IDLE_SYMBOL; shard->cur_segment_instrs = 0; + if (!was_wait && shard->segment_start_microseconds > 0) { + shard->counters.idle_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } + shard->segment_start_microseconds = get_current_microseconds(); } else { ++shard->cur_segment_instrs; if (shard->cur_segment_instrs == knob_print_every_) { @@ -197,7 +238,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref return true; } int64_t input = shard->stream->get_input_id(); - if (input != shard->prev_input) { + if (input != prev_input) { // We convert to letters which only works well for <=26 inputs. 
if (!shard->thread_sequence.empty()) { ++shard->counters.total_switches; @@ -209,6 +250,11 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref shard->thread_sequence += THREAD_LETTER_INITIAL_START + static_cast(input % 26); shard->cur_segment_instrs = 0; + if (!was_wait && !was_idle && shard->segment_start_microseconds > 0) { + shard->counters.cpu_microseconds += + get_current_microseconds() - shard->segment_start_microseconds; + } + shard->segment_start_microseconds = get_current_microseconds(); if (knob_verbose_ >= 2) { std::ostringstream line; line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9) @@ -226,11 +272,12 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref << " == thread " << memref.instr.tid << "\n"; std::cerr << line.str(); } - shard->prev_input = input; } + shard->prev_input = input; if (type_is_instr(memref.instr.type)) { ++shard->counters.instrs; ++shard->cur_segment_instrs; + shard->counters.idle_micros_at_last_instr = shard->counters.idle_microseconds; if (shard->cur_segment_instrs == knob_print_every_) { shard->thread_sequence += THREAD_LETTER_SUBSEQUENT_START + static_cast(input % 26); @@ -300,7 +347,22 @@ schedule_stats_t::print_counters(const counters_t &counters) std::cerr << std::setw(12) << std::setprecision(2) << 100 * (counters.instrs / static_cast(counters.instrs + counters.idles)) - << "% cpu busy\n"; + << "% cpu busy by record count\n"; + std::cerr << std::setw(12) << counters.cpu_microseconds << " cpu microseconds\n"; + std::cerr << std::setw(12) << counters.idle_microseconds << " idle microseconds\n"; + std::cerr << std::setw(12) << counters.idle_micros_at_last_instr + << " idle microseconds at last instr\n"; + std::cerr << std::setw(12) << std::setprecision(2) + << 100 * + (counters.cpu_microseconds / + static_cast(counters.cpu_microseconds + counters.idle_microseconds)) + << "% cpu busy by time\n"; + std::cerr << std::setw(12) << 
std::setprecision(2) + << 100 * + (counters.cpu_microseconds / + static_cast(counters.cpu_microseconds + + counters.idle_micros_at_last_instr)) + << "% cpu busy by time, ignoring idle past last instr\n"; } bool diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index 9d15ccc0638..f0e9425e39d 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -89,6 +89,9 @@ class schedule_stats_t : public analysis_tool_t { direct_switch_requests += rhs.direct_switch_requests; waits += rhs.waits; idles += rhs.idles; + idle_microseconds += rhs.idle_microseconds; + idle_micros_at_last_instr += rhs.idle_micros_at_last_instr; + cpu_microseconds += rhs.cpu_microseconds; for (const memref_tid_t tid : rhs.threads) { threads.insert(tid); } @@ -103,6 +106,9 @@ class schedule_stats_t : public analysis_tool_t { int64_t direct_switch_requests = 0; int64_t waits = 0; int64_t idles = 0; + uint64_t idle_microseconds = 0; + uint64_t idle_micros_at_last_instr = 0; + uint64_t cpu_microseconds = 0; std::unordered_set threads; }; counters_t @@ -124,11 +130,16 @@ class schedule_stats_t : public analysis_tool_t { uint64_t cur_segment_instrs = 0; bool prev_was_wait = false; bool prev_was_idle = false; + // Computing %-idle. + uint64_t segment_start_microseconds = 0; }; void print_counters(const counters_t &counters); + uint64_t + get_current_microseconds(); + uint64_t knob_print_every_ = 0; unsigned int knob_verbose_ = 0; // We use an ordered map to get our output in order. This table is not