From 5990405dde021a1a7d062cbfc71bb3fac9bacdc4 Mon Sep 17 00:00:00 2001
From: Derek Bruening <bruening@google.com>
Date: Thu, 16 Nov 2023 16:17:15 -0500
Subject: [PATCH] i#5843 scheduler: Only switch on long-latency syscalls
 (#6458)

Rather than context switching on every syscall labeled maybe-blocking,
the scheduler uses the now-available syscall latency to decide whether
the syscall should block and result in a context switch.

Adds two new command line options, -sched_syscall_switch_us (default
500us) and -sched_blocking_switch_us (default 100us), and corresponding
scheduler_t inputs, to control the latency thresholds. To avoid relying
too much on the maybe-blocking labels, we do consider a
very-high-latency syscall not marked as maybe-blocking to result in
a context switch.

Adds a new schedule_stats unit test.

Tested in a large proprietary app where this reduces the context switch
rate from ~100x too high down to ~10x too high. The next step of adding
i/o wait times should further improve the representativeness.

Issue: #5843
---
 clients/drcachesim/CMakeLists.txt             |  13 ++
 clients/drcachesim/analyzer_multi.cpp         |   2 +
 clients/drcachesim/common/options.cpp         |  13 ++
 clients/drcachesim/common/options.h           |   2 +
 clients/drcachesim/scheduler/scheduler.cpp    |  64 ++++--
 clients/drcachesim/scheduler/scheduler.h      |  24 +-
 .../tests/schedule_stats_nopreempt.templatex  |  20 +-
 .../drcachesim/tests/schedule_stats_test.cpp  | 213 ++++++++++++++++++
 .../drcachesim/tests/scheduler_unit_tests.cpp |   4 +-
 clients/drcachesim/tools/schedule_stats.cpp   |  25 +-
 clients/drcachesim/tools/schedule_stats.h     |   4 +-
 11 files changed, 349 insertions(+), 35 deletions(-)
 create mode 100644 clients/drcachesim/tests/schedule_stats_test.cpp

diff --git a/clients/drcachesim/CMakeLists.txt b/clients/drcachesim/CMakeLists.txt
index b482353a038..8b531541edd 100644
--- a/clients/drcachesim/CMakeLists.txt
+++ b/clients/drcachesim/CMakeLists.txt
@@ -748,6 +748,9 @@ add_subdirectory(tools/external)
 # We build larger executables here.  All tests are added in suite/tests/ except unit tests.
 # Be sure to give the targets qualified test names ("tool.drcache*...").
 
+# XXX: Try to add a macro add_drcachesim_test() to share common pieces
+# of these executables.
+
 if (BUILD_TESTS)
   add_executable(tool.reuse_distance.unit_tests tests/reuse_distance_test.cpp)
   target_link_libraries(tool.reuse_distance.unit_tests drmemtrace_reuse_distance
@@ -907,6 +910,16 @@ if (BUILD_TESTS)
     set_tests_properties(tool.drcachesim.invariant_checker_test PROPERTIES
       TIMEOUT ${test_seconds})
 
+    add_executable(tool.drcachesim.schedule_stats_test tests/schedule_stats_test.cpp)
+    configure_DynamoRIO_standalone(tool.drcachesim.schedule_stats_test)
+    add_win32_flags(tool.drcachesim.schedule_stats_test)
+    target_link_libraries(tool.drcachesim.schedule_stats_test drmemtrace_schedule_stats
+        drmemtrace_static drmemtrace_analyzer test_helpers)
+    add_test(NAME tool.drcachesim.schedule_stats_test
+             COMMAND tool.drcachesim.schedule_stats_test)
+    set_tests_properties(tool.drcachesim.schedule_stats_test PROPERTIES
+      TIMEOUT ${test_seconds})
+
     add_executable(tool.drcacheoff.view_test tests/view_test.cpp reader/file_reader.cpp)
     configure_DynamoRIO_standalone(tool.drcacheoff.view_test)
     add_win32_flags(tool.drcacheoff.view_test)
diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp
index 7c458ea881c..11ba9ef3b23 100644
--- a/clients/drcachesim/analyzer_multi.cpp
+++ b/clients/drcachesim/analyzer_multi.cpp
@@ -257,6 +257,8 @@ analyzer_multi_t::init_dynamic_schedule()
     sched_ops.quantum_duration = op_sched_quantum.get_value();
     if (op_sched_time.get_value())
         sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME;
+    sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value();
+    sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value();
 #ifdef HAS_ZIP
     if (!op_record_file.get_value().empty()) {
         record_schedule_zip_.reset(new zipfile_ostream_t(op_record_file.get_value()));
diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp
index af0496a9d07..71d7c5ac2f1 100644
--- a/clients/drcachesim/common/options.cpp
+++ b/clients/drcachesim/common/options.cpp
@@ -838,6 +838,19 @@ droption_t<bool> op_sched_order_time(DROPTION_SCOPE_ALL, "sched_order_time", tru
                                      "Applies to -core_sharded and -core_serial. "
                                      "Whether to honor recorded timestamps for ordering");
 
+droption_t<uint64_t> op_sched_syscall_switch_us(
+    DROPTION_SCOPE_ALL, "sched_syscall_switch_us", 500,
+    "Minimum latency to consider any syscall as incurring a context switch.",
+    "Minimum latency in timestamp units (us) to consider any syscall as incurring "
+    "a context switch.  Applies to -core_sharded and -core_serial. ");
+
+droption_t<uint64_t> op_sched_blocking_switch_us(
+    DROPTION_SCOPE_ALL, "sched_blocking_switch_us", 100,
+    "Minimum latency to consider a maybe-blocking syscall as incurring a context switch.",
+    "Minimum latency in timestamp units (us) to consider any syscall that is marked as "
+    "maybe-blocking to incur a context switch. Applies to -core_sharded and "
+    "-core_serial. ");
+
 #ifdef HAS_ZIP
 droption_t<std::string> op_record_file(DROPTION_SCOPE_FRONTEND, "record_file", "",
                                        "Path for storing record of schedule",
diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h
index 996cff01739..8deff88ea74 100644
--- a/clients/drcachesim/common/options.h
+++ b/clients/drcachesim/common/options.h
@@ -191,6 +191,8 @@ extern dynamorio::droption::droption_t<bool> op_core_serial;
 extern dynamorio::droption::droption_t<int64_t> op_sched_quantum;
 extern dynamorio::droption::droption_t<bool> op_sched_time;
 extern dynamorio::droption::droption_t<bool> op_sched_order_time;
+extern dynamorio::droption::droption_t<uint64_t> op_sched_syscall_switch_us;
+extern dynamorio::droption::droption_t<uint64_t> op_sched_blocking_switch_us;
 #ifdef HAS_ZIP
 extern dynamorio::droption::droption_t<std::string> op_record_file;
 extern dynamorio::droption::droption_t<std::string> op_replay_file;
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index 1404b3b7fa3..8a63171080b 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -1564,6 +1564,27 @@ scheduler_tmpl_t<RecordType, ReaderType>::pop_from_ready_queue(
     return res;
 }
 
+template <typename RecordType, typename ReaderType>
+bool
+scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *input)
+{
+    // Returns whether the just-completed syscall took long enough to model a switch.
+    uint64_t post_time = input->reader->get_last_timestamp();
+    assert(input->processing_syscall || input->processing_maybe_blocking_syscall);
+    if (input->reader->get_version() < TRACE_ENTRY_VERSION_FREQUENT_TIMESTAMPS) {
+        // Legacy trace with no timestamps bracketing syscalls: use the marker alone.
+        return input->processing_maybe_blocking_syscall;
+    }
+    // pre == post is legal (no timestamp between the marker and here): latency is 0.
+    assert(input->pre_syscall_timestamp > 0 && input->pre_syscall_timestamp <= post_time);
+    uint64_t latency = post_time - input->pre_syscall_timestamp;
+    VPRINT(this, 3, "input %d %ssyscall latency: %" PRIu64 "\n", input->index,
+           input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency);
+    return (input->processing_maybe_blocking_syscall &&
+            latency >= options_.blocking_switch_threshold) ||
+        latency >= options_.syscall_switch_threshold;
+}
+
 template <typename RecordType, typename ReaderType>
 typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
@@ -2006,11 +2027,16 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
         } else if (options_.mapping == MAP_TO_ANY_OUTPUT) {
             trace_marker_type_t marker_type;
             uintptr_t marker_value;
-            if (input->processing_blocking_syscall) {
+            // While regular traces typically always have a syscall marker when there's a
+            // maybe-blocking marker, some tests and synthetic traces have just the maybe
+            // so we check both.
+            if (input->processing_syscall || input->processing_maybe_blocking_syscall) {
                 // Wait until we're past all the markers associated with the syscall.
                 // XXX: We may prefer to stop before the return value marker for futex,
                 // or a kernel xfer marker, but our recorded format is on instr
                 // boundaries so we live with those being before the switch.
+                // XXX: Once we insert kernel traces, we may have to try harder
+                // to stop before the post-syscall records.
                 if (record_type_is_marker(record, marker_type, marker_value) &&
                     marker_type == TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH) {
                     memref_tid_t target_tid = marker_value;
@@ -2025,18 +2051,33 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                         input->switch_to_input = it->second;
                     }
                 } else if (record_type_is_instr(record)) {
-                    // Assume it will block and we should switch to a different input.
-                    need_new_input = true;
-                    in_wait_state = true;
-                    input->processing_blocking_syscall = false;
-                    VPRINT(this, 3, "next_record[%d]: hit blocking syscall in input %d\n",
-                           output, input->index);
+                    if (syscall_incurs_switch(input)) {
+                        // Model as blocking and should switch to a different input.
+                        need_new_input = true;
+                        in_wait_state = true;
+                        VPRINT(this, 3,
+                               "next_record[%d]: hit blocking syscall in input %d\n",
+                               output, input->index);
+                    }
+                    input->processing_syscall = false;
+                    input->processing_maybe_blocking_syscall = false;
+                    input->pre_syscall_timestamp = 0;
                 }
+            }
+            if (record_type_is_marker(record, marker_type, marker_value) &&
+                marker_type == TRACE_MARKER_TYPE_SYSCALL) {
+                input->processing_syscall = true;
+                input->pre_syscall_timestamp = input->reader->get_last_timestamp();
             } else if (record_type_is_marker(record, marker_type, marker_value) &&
                        marker_type == TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL) {
-                input->processing_blocking_syscall = true;
-            } else if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
-                       record_type_is_instr(record)) {
+                input->processing_maybe_blocking_syscall = true;
+                // Generally we should already have the timestamp from a just-prior
+                // syscall marker, but we support tests and other synthetic sequences
+                // with just a maybe-blocking.
+                input->pre_syscall_timestamp = input->reader->get_last_timestamp();
+            }
+            if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
+                record_type_is_instr(record)) {
                 ++input->instrs_in_quantum;
                 if (input->instrs_in_quantum > options_.quantum_duration) {
                     // We again prefer to switch to another input even if the current
@@ -2045,9 +2086,6 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                     need_new_input = true;
                 }
             } else if (options_.quantum_unit == QUANTUM_TIME) {
-                // The above if-else cases are all either for non-instrs or
-                // QUANTUM_INSTRUCTIONS, except the blocking syscall next instr which is
-                // already switching: so an else{} works here.
                 if (cur_time == 0 || cur_time < input->start_time_in_quantum) {
                     VPRINT(this, 1,
                            "next_record[%d]: invalid time %" PRIu64 " vs start %" PRIu64
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
index a0eaa63bf2e..046c57e3085 100644
--- a/clients/drcachesim/scheduler/scheduler.h
+++ b/clients/drcachesim/scheduler/scheduler.h
@@ -496,6 +496,20 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
          * of traced cores).
          */
         archive_istream_t *replay_as_traced_istream = nullptr;
+        /**
+         * Determines the minimum latency in the unit of the trace's timestamps
+         * (microseconds) for which any system call, including one without
+         * a #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker, will be treated as
+         * blocking and trigger a context switch.
+         */
+        uint64_t syscall_switch_threshold = 500;
+        /**
+         * Determines the minimum latency in the unit of the trace's timestamps
+         * (microseconds) for which a maybe-blocking system call (one with
+         * a #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker) will be treated as
+         * blocking and trigger a context switch.
+         */
+        uint64_t blocking_switch_threshold = 100;
     };
 
     /**
@@ -965,8 +979,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         bool order_by_timestamp = false;
         // Global ready queue counter used to provide FIFO for same-priority inputs.
         uint64_t queue_counter = 0;
-        // Used to switch on the instruction *after* a blocking syscall.
-        bool processing_blocking_syscall = false;
+        // Used to switch on the instruction *after* a long-latency syscall.
+        bool processing_syscall = false;
+        bool processing_maybe_blocking_syscall = false;
+        uint64_t pre_syscall_timestamp = 0;
         // Use for special kernel features where one thread specifies a target
         // thread to replace it.
         input_ordinal_t switch_to_input = INVALID_INPUT_ORDINAL;
@@ -1275,6 +1291,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     void
     add_to_ready_queue(input_info_t *input);
 
+    // The input's lock must be held by the caller.
+    bool
+    syscall_incurs_switch(input_info_t *input);
+
     // sched_lock_ must be held by the caller.
     // "for_output" is which output stream is looking for a new input; only an
     // input which is able to run on that output will be selected.
diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
index f94d9f610f6..9f1ef413e23 100644
--- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
+++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
@@ -29,7 +29,7 @@ Core #0 counts:
            0 direct switch requests
            0 waits
 Core #1 counts:
-           2 threads
+           . threads
       *[0-9]* instructions
            . total context switches
    0.0[0-9.]* CSPKI \(context switches per 1000 instructions\)
@@ -38,35 +38,35 @@ Core #1 counts:
            0 direct context switches
       100.00% voluntary switches
         0.00% direct switches
-          .. system calls
+         *[0-9]* system calls
            . maybe-blocking system calls
            0 direct switch requests
            0 waits
 Core #2 counts:
-           2 threads
+           . threads
       *[0-9]* instructions
-           1 total context switches
+           . total context switches
    0.0[0-9.]* CSPKI \(context switches per 1000 instructions\)
       *[0-9]* instructions per context switch
-           1 voluntary context switches
+           . voluntary context switches
            0 direct context switches
       100.00% voluntary switches
         0.00% direct switches
-          .. system calls
+         *[0-9]* system calls
            . maybe-blocking system calls
            0 direct switch requests
            0 waits
 Core #3 counts:
-           2 threads
+           . threads
       *[0-9]* instructions
-           1 total context switches
+           . total context switches
    0.0[0-9.]* CSPKI \(context switches per 1000 instructions\)
       *[0-9]* instructions per context switch
-           1 voluntary context switches
+           . voluntary context switches
            0 direct context switches
       100.00% voluntary switches
         0.00% direct switches
-          .. system calls
+         *[0-9]* system calls
            . maybe-blocking system calls
            0 direct switch requests
            0 waits
diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp
new file mode 100644
index 00000000000..897ffe46fbf
--- /dev/null
+++ b/clients/drcachesim/tests/schedule_stats_test.cpp
@@ -0,0 +1,213 @@
+/* **********************************************************
+ * Copyright (c) 2021-2023 Google, LLC  All rights reserved.
+ * **********************************************************/
+
+/*
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Google, Inc. nor the names of its contributors may be
+ *   used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, LLC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/* Unit tests for the schedule_stats_t analysis tool.
+ * These feed hand-built per-core memref sequences directly to the tool,
+ * bypassing the analyzer and scheduler, and check the resulting counters
+ * (switches, syscalls, direct switch requests, and waits).
+ */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <fstream>
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+
+#include "../tools/schedule_stats.h"
+#include "../common/memref.h"
+#include "memref_gen.h"
+
+namespace dynamorio {
+namespace drmemtrace {
+
+using ::dynamorio::drmemtrace::default_memtrace_stream_t;
+using ::dynamorio::drmemtrace::memref_t;
+using ::dynamorio::drmemtrace::memref_tid_t;
+using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_CORE_WAIT;
+using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH;
+using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL;
+using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_SYSCALL;
+
+// Feeds per-core memref vectors to schedule_stats_t in lockstep, bypassing the analyzer
+// and scheduler.  tid2ord supplies the input id that the mock stream reports per tid.
+static schedule_stats_t::counters_t
+run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs,
+                   const std::unordered_map<memref_tid_t, int64_t> &tid2ord)
+{
+    schedule_stats_t tool(/*print_every=*/1, /*verbosity=*/2);
+    // schedule_stats_t uses get_input_id() to identify switches, so we mock the stream.
+    class mock_stream_t : public default_memtrace_stream_t {
+    public:
+        void
+        set_input_id(int64_t input_id)
+        {
+            input_id_ = input_id;
+        }
+        int64_t
+        get_input_id() const override
+        {
+            return input_id_;
+        }
+        memtrace_stream_t *
+        get_input_interface() const override
+        {
+            return const_cast<mock_stream_t *>(this);
+        }
+
+    private:
+        int64_t input_id_ = 0;
+    };
+    struct per_core_t {
+        void *worker_data;
+        void *shard_data;
+        mock_stream_t stream;
+        bool finished = false;
+        size_t memref_idx = 0;
+    };
+    std::vector<per_core_t> per_core(memrefs.size());
+    for (int cpu = 0; cpu < static_cast<int>(memrefs.size()); ++cpu) {
+        per_core[cpu].worker_data = tool.parallel_worker_init(cpu);
+        per_core[cpu].shard_data = tool.parallel_shard_init_stream(
+            cpu, per_core[cpu].worker_data, &per_core[cpu].stream);
+    }
+    // Walk in lockstep until all are empty; every core's vector must be non-empty.
+    int num_finished = 0;
+    while (num_finished < static_cast<int>(memrefs.size())) {
+        for (size_t cpu = 0; cpu < memrefs.size(); ++cpu) {
+            if (per_core[cpu].finished)
+                continue;
+            memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
+            per_core[cpu].stream.set_input_id(tid2ord.at(memref.instr.tid));
+            bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
+            assert(res);
+            ++per_core[cpu].memref_idx;
+            if (per_core[cpu].memref_idx >= memrefs[cpu].size()) {
+                per_core[cpu].finished = true;
+                ++num_finished;
+            }
+        }
+    }
+    for (int cpu = 0; cpu < static_cast<int>(memrefs.size()); ++cpu) {
+        tool.parallel_shard_exit(per_core[cpu].shard_data);
+        tool.parallel_worker_exit(per_core[cpu].worker_data);
+    }
+    return tool.get_total_counts();
+}
+
+static bool
+test_basic_stats()
+{
+    static constexpr int64_t TID_A = 42;
+    static constexpr int64_t TID_B = 142;
+    static constexpr int64_t TID_C = 242;
+    std::unordered_map<memref_tid_t, int64_t> tid2ord;
+    tid2ord[TID_A] = 0;
+    tid2ord[TID_B] = 1;
+    tid2ord[TID_C] = 2;
+    std::vector<std::vector<memref_t>> memrefs = {
+        {
+            gen_instr(TID_A),
+            // Involuntary switch: no syscall preceded the change of input.
+            gen_instr(TID_B),
+            gen_marker(TID_B, TRACE_MARKER_TYPE_TIMESTAMP, 1100),
+            gen_marker(TID_B, TRACE_MARKER_TYPE_SYSCALL, 0),
+            gen_marker(TID_B, TRACE_MARKER_TYPE_TIMESTAMP, 1600),
+            // Voluntary switch, on a syscall with no maybe-blocking marker.
+            gen_instr(TID_A),
+            gen_instr(TID_A),
+            gen_instr(TID_A),
+            gen_marker(TID_A, TRACE_MARKER_TYPE_TIMESTAMP, 2100),
+            gen_marker(TID_A, TRACE_MARKER_TYPE_SYSCALL, 0),
+            gen_marker(TID_A, TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+            gen_marker(TID_A, TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_C),
+            gen_marker(TID_A, TRACE_MARKER_TYPE_TIMESTAMP, 2300),
+            // Direct switch: the next input is the requested target TID_C.
+            gen_instr(TID_C),
+            // No switch: 99us latency is below the default 100us blocking threshold.
+            gen_marker(TID_C, TRACE_MARKER_TYPE_TIMESTAMP, 2500),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_SYSCALL, 0),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_TIMESTAMP, 2599),
+            gen_instr(TID_C),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_TIMESTAMP, 3100),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_SYSCALL, 0),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_A),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_TIMESTAMP, 3300),
+            // Direct switch requested but failed: TID_C continues instead of TID_A.
+            gen_instr(TID_C),
+        },
+        {
+            gen_instr(TID_B),
+            // Involuntary switch.
+            gen_instr(TID_A),
+            // Involuntary switch.
+            gen_instr(TID_C),
+            gen_instr(TID_C),
+            gen_instr(TID_C),
+            // Wait: each core-wait marker below is counted as one wait.
+            gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0),
+            gen_marker(TID_C, TRACE_MARKER_TYPE_CORE_WAIT, 0),
+            // Involuntary switch.
+            gen_instr(TID_B),
+            gen_instr(TID_B),
+            gen_instr(TID_B),
+        },
+    };
+    auto result = run_schedule_stats(memrefs, tid2ord);
+    assert(result.instrs == 16);
+    assert(result.total_switches == 6);
+    assert(result.voluntary_switches == 2);
+    assert(result.direct_switches == 1);
+    assert(result.syscalls == 4);
+    assert(result.maybe_blocking_syscalls == 3);
+    assert(result.direct_switch_requests == 2);
+    assert(result.waits == 3);
+    return true;
+}
+
+int
+test_main(int argc, const char *argv[])
+{
+    const bool passed = test_basic_stats(); // The verdict is reported on stderr.
+    std::cerr << (passed ? "schedule_stats_test passed\n"
+                         : "schedule_stats_test FAILED\n");
+    if (!passed)
+        exit(1);
+    return 0;
+}
+
+} // namespace drmemtrace
+} // namespace dynamorio
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 3166ba79a68..2544305358b 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -1577,11 +1577,12 @@ test_synthetic_with_syscalls_precise()
         make_version(TRACE_ENTRY_VERSION),
         make_timestamp(20),
         make_instr(10),
+        make_timestamp(120),
         make_marker(TRACE_MARKER_TYPE_SYSCALL, SYSNUM),
         make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
         make_marker(TRACE_MARKER_TYPE_FUNC_ID, 100),
         make_marker(TRACE_MARKER_TYPE_FUNC_ARG, 42),
-        make_timestamp(50),
+        make_timestamp(250),
         make_marker(TRACE_MARKER_TYPE_CPU_ID, 1),
         make_marker(TRACE_MARKER_TYPE_FUNC_ID, 100),
         make_marker(TRACE_MARKER_TYPE_FUNC_RETVAL, 0),
@@ -1631,6 +1632,7 @@ test_synthetic_with_syscalls_precise()
         check_ref(refs, idx, TID_A, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_VERSION) &&
         check_ref(refs, idx, TID_A, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_TIMESTAMP) &&
         check_ref(refs, idx, TID_A, TRACE_TYPE_INSTR) &&
+        check_ref(refs, idx, TID_A, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_TIMESTAMP) &&
         check_ref(refs, idx, TID_A, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_SYSCALL) &&
         check_ref(refs, idx, TID_A, TRACE_TYPE_MARKER,
                   TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL) &&
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index 16103add7fa..ad12644a3b5 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -179,7 +179,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
         // We convert to letters which only works well for <=26 inputs.
         if (!shard->thread_sequence.empty()) {
             ++shard->counters.total_switches;
-            if (shard->saw_maybe_blocking || shard->saw_exit)
+            if (shard->saw_syscall || shard->saw_exit)
                 ++shard->counters.voluntary_switches;
             if (shard->direct_switch_target == memref.marker.tid)
                 ++shard->counters.direct_switches;
@@ -217,19 +217,20 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
             shard->cur_segment_instrs = 0;
         }
         shard->direct_switch_target = INVALID_THREAD_ID;
-        shard->saw_maybe_blocking = false;
+        shard->saw_syscall = false;
         shard->saw_exit = false;
     }
     if (memref.instr.tid != INVALID_THREAD_ID)
         shard->counters.threads.insert(memref.instr.tid);
     if (memref.marker.type == TRACE_TYPE_MARKER) {
-        if (memref.marker.marker_type == TRACE_MARKER_TYPE_SYSCALL)
+        if (memref.marker.marker_type == TRACE_MARKER_TYPE_SYSCALL) {
             ++shard->counters.syscalls;
-        if (memref.marker.marker_type == TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL) {
+            shard->saw_syscall = true;
+        } else if (memref.marker.marker_type ==
+                   TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL) {
             ++shard->counters.maybe_blocking_syscalls;
-            shard->saw_maybe_blocking = true;
-        }
-        if (memref.marker.marker_type == TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH) {
+            shard->saw_syscall = true;
+        } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH) {
             ++shard->counters.direct_switch_requests;
             shard->direct_switch_target = memref.marker.marker_value;
         }
@@ -296,5 +297,15 @@ schedule_stats_t::print_results()
     return true;
 }
 
+schedule_stats_t::counters_t
+schedule_stats_t::get_total_counts()
+{
+    // Folds the counters of every shard in shard_map_ into one aggregate via +=.
+    counters_t aggregate;
+    for (const auto &entry : shard_map_)
+        aggregate += entry.second->counters;
+    return aggregate;
+}
+
 } // namespace drmemtrace
 } // namespace dynamorio
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index 0afffa193b1..073d754b6e8 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -51,7 +51,7 @@ namespace drmemtrace {
 
 class schedule_stats_t : public analysis_tool_t {
 public:
-    schedule_stats_t(uint64_t print_every, unsigned int verbose);
+    schedule_stats_t(uint64_t print_every, unsigned int verbose = 0);
     ~schedule_stats_t() override;
     std::string
     initialize_stream(memtrace_stream_t *serial_stream) override;
@@ -114,7 +114,7 @@ class schedule_stats_t : public analysis_tool_t {
         counters_t counters;
         int64_t prev_input = -1;
         // These are cleared when an instruction is seen.
-        bool saw_maybe_blocking = false;
+        bool saw_syscall = false;
         memref_tid_t direct_switch_target = INVALID_THREAD_ID;
         bool saw_exit = false;
         // A representation of the thread interleavings.