diff --git a/api/docs/release.dox b/api/docs/release.dox
index 9e5a6d06697..a4a7d9f5049 100644
--- a/api/docs/release.dox
+++ b/api/docs/release.dox
@@ -269,6 +269,12 @@ Further non-compatibility-affecting changes include:
    the value of TRACE_MARKER_TYPE_ markers. This filter takes a list of
    <TRACE_MARKER_TYPE_,new_value> and changes every listed marker in the trace to its
    corresponding new_value.
+ - Added analysis_tool_t::preferred_shard_type() to the drmemtrace framework to
+   allow switching to core-sharded by default if all tools prefer that mode.
+ - For the drmemtrace framework, if only core-sharded-preferring tools are enabled
+   (these include cache and TLB simulators and the schedule_stats tool), -core_sharded or
+   -core_serial is automatically turned on for offline analysis to enable more
+   representative simulated software thread scheduling onto virtual cores.
 
 **************************************************
 <hr>
diff --git a/clients/drcachesim/analysis_tool.h b/clients/drcachesim/analysis_tool.h
index 24306cd7534..c5cae5f2b55 100644
--- a/clients/drcachesim/analysis_tool.h
+++ b/clients/drcachesim/analysis_tool.h
@@ -156,6 +156,19 @@ template <typename RecordType> class analysis_tool_tmpl_t {
     {
         return "";
     }
+    /**
+     * Identifies the preferred shard type for this analysis.  This only applies to
+     * offline analysis when the user does not specify a shard type for a run.  In that
+     * case, if every tool being run prefers #SHARD_BY_CORE, the framework uses that
+     * mode.  If tools disagree then an error is raised.  This is ignored if the user
+     * specifies a shard type via one of -core_sharded, -core_serial, -no_core_sharded,
+     * -no_core_serial, or -cpu_scheduling.  The default preference is #SHARD_BY_THREAD.
+     */
+    virtual shard_type_t
+    preferred_shard_type()
+    {
+        return SHARD_BY_THREAD;
+    }
     /** Returns whether the tool was created successfully. */
     virtual bool
     operator!()
diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp
index f2c2ddb5098..728648171fa 100644
--- a/clients/drcachesim/analyzer.cpp
+++ b/clients/drcachesim/analyzer.cpp
@@ -339,6 +339,19 @@ analyzer_tmpl_t<RecordType, ReaderType>::init_scheduler_common(
             uint64_t filetype = scheduler_.get_stream(i)->get_filetype();
             VPRINT(this, 2, "Worker %d filetype %" PRIx64 "\n", i, filetype);
             if (TESTANY(OFFLINE_FILE_TYPE_CORE_SHARDED, filetype)) {
+                if (i == 0 && shard_type_ == SHARD_BY_CORE) {
+                    // This is almost certainly user error.
+                    // Better to exit than risk user confusion.
+                    // XXX i#7045: Ideally this could be reported as an error by the
+                    // scheduler, and also detected early in analyzer_multi to auto-fix
+                    // (when no mode is specified: if the user specifies core-sharding
+                    // there could be config differences and this should be an error),
+                    // but neither is simple so today the user has to re-run.
+                    error_string_ =
+                        "Re-scheduling a core-sharded-on-disk trace is generally a "
+                        "mistake; re-run with -no_core_sharded.\n";
+                    return false;
+                }
                 shard_type_ = SHARD_BY_CORE;
             }
         }
diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp
index 47bc5ff0c58..521ae6bf488 100644
--- a/clients/drcachesim/analyzer_multi.cpp
+++ b/clients/drcachesim/analyzer_multi.cpp
@@ -462,6 +462,7 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::analyzer_multi_tmpl_t()
             if (!error.empty()) {
                 this->success_ = false;
                 this->error_string_ = "raw2trace failed: " + error;
+                return;
             }
         }
     }
@@ -473,8 +474,54 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::analyzer_multi_tmpl_t()
         return;
     }
 
+    bool sharding_specified = op_core_sharded.specified() || op_core_serial.specified() ||
+        // -cpu_scheduling implies thread-sharded.
+        op_cpu_scheduling.get_value();
+    // TODO i#7040: Add core-sharded support for online tools.
+    bool offline = !op_indir.get_value().empty() || !op_infile.get_value().empty();
+    if (offline && !sharding_specified) {
+        bool all_prefer_thread_sharded = true;
+        bool all_prefer_core_sharded = true;
+        for (int i = 0; i < this->num_tools_; ++i) {
+            if (this->tools_[i]->preferred_shard_type() == SHARD_BY_THREAD) {
+                all_prefer_core_sharded = false;
+            } else if (this->tools_[i]->preferred_shard_type() == SHARD_BY_CORE) {
+                all_prefer_thread_sharded = false;
+            }
+            if (this->parallel_ && !this->tools_[i]->parallel_shard_supported()) {
+                this->parallel_ = false;
+            }
+        }
+        if (all_prefer_core_sharded) {
+            // XXX i#6949: Ideally we could detect a core-sharded-on-disk input
+            // here and avoid this but that's not simple so currently we have a
+            // fatal error from the analyzer and the user must re-run with
+            // -no_core_sharded for such inputs.
+            if (this->parallel_) {
+                if (op_verbose.get_value() > 0)
+                    fprintf(stderr, "Enabling -core_sharded as all tools prefer it\n");
+                op_core_sharded.set_value(true);
+            } else {
+                if (op_verbose.get_value() > 0)
+                    fprintf(stderr, "Enabling -core_serial as all tools prefer it\n");
+                op_core_serial.set_value(true);
+            }
+        } else if (!all_prefer_thread_sharded) {
+            this->success_ = false;
+            this->error_string_ = "Selected tools differ in preferred sharding: please "
+                                  "re-run with -[no_]core_sharded or -[no_]core_serial";
+            return;
+        }
+    }
+
     typename sched_type_t::scheduler_options_t sched_ops;
     if (op_core_sharded.get_value() || op_core_serial.get_value()) {
+        if (!offline) {
+            // TODO i#7040: Add core-sharded support for online tools.
+            this->success_ = false;
+            this->error_string_ = "Core-sharded is not yet supported for online analysis";
+            return;
+        }
         if (op_core_serial.get_value()) {
             this->parallel_ = false;
         }
@@ -502,8 +549,10 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::analyzer_multi_tmpl_t()
             return;
         }
         if (!this->init_scheduler(tracedir, only_threads, only_shards,
-                                  op_verbose.get_value(), std::move(sched_ops)))
+                                  op_verbose.get_value(), std::move(sched_ops))) {
             this->success_ = false;
+            return;
+        }
     } else if (op_infile.get_value().empty()) {
         // XXX i#3323: Add parallel analysis support for online tools.
         this->parallel_ = false;
@@ -520,12 +569,15 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::analyzer_multi_tmpl_t()
         if (!this->init_scheduler(std::move(reader), std::move(end),
                                   op_verbose.get_value(), std::move(sched_ops))) {
             this->success_ = false;
+            return;
         }
     } else {
         // Legacy file.
         if (!this->init_scheduler(op_infile.get_value(), {}, {}, op_verbose.get_value(),
-                                  std::move(sched_ops)))
+                                  std::move(sched_ops))) {
             this->success_ = false;
+            return;
+        }
     }
     if (!init_analysis_tools()) {
         this->success_ = false;
diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp
index 8e98e08ed9d..3bd7d7c4a03 100644
--- a/clients/drcachesim/common/options.cpp
+++ b/clients/drcachesim/common/options.cpp
@@ -299,13 +299,19 @@ droption_t<std::string> op_v2p_file(
 droption_t<bool> op_cpu_scheduling(
     DROPTION_SCOPE_CLIENT, "cpu_scheduling", false,
     "Map threads to cores matching recorded cpu execution",
-    "By default, the simulator schedules threads to simulated cores in a static "
+    "By default for online analysis, the simulator schedules threads to simulated cores "
+    "in a static "
     "round-robin fashion.  This option causes the scheduler to instead use the recorded "
     "cpu that each thread executed on (at a granularity of the trace buffer size) "
     "for scheduling, mapping traced cpu's to cores and running each segment of each "
     "thread on the core that owns the recorded cpu for that segment. "
     "This option is not supported with -core_serial; use "
-    "-cpu_schedule_file with -core_serial instead.");
+    "-cpu_schedule_file with -core_serial instead.  For offline analysis, the "
+    "recommendation is to not recreate the as-traced schedule (as it is not accurate due "
+    "to overhead) and instead use a dynamic schedule via -core_serial.  If only "
+    "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB
+    ", " SCHEDULE_STATS
+    "), -core_sharded or -core_serial is automatically turned on for offline analysis.");
 
 droption_t<bytesize_t> op_max_trace_size(
     DROPTION_SCOPE_CLIENT, "max_trace_size", 0,
@@ -890,11 +896,16 @@ droption_t<int> op_kernel_trace_buffer_size_shift(
 // Core-oriented analysis.
 droption_t<bool> op_core_sharded(
     DROPTION_SCOPE_ALL, "core_sharded", false, "Analyze per-core in parallel.",
-    "By default, the input trace is analyzed in parallel across shards equal to "
-    "software threads.  This option instead schedules those threads onto virtual cores "
+    "By default, the sharding mode is determined by the preferred shard type of the "
+    "tools selected (unless overridden, the default preferred type is thread-sharded). "
+    "This option enables core-sharded, overriding tool defaults.  Core-sharded "
+    "analysis schedules the input software threads onto virtual cores "
     "and analyzes each core in parallel.  Thus, each shard consists of pieces from "
     "many software threads.  How the scheduling is performed is controlled by a set "
-    "of options with the prefix \"sched_\" along with -cores.");
+    "of options with the prefix \"sched_\" along with -cores.  If only "
+    "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB
+    ", " SCHEDULE_STATS ") and they all support parallel operation, -core_sharded is "
+    "automatically turned on for offline analysis.");
 
 droption_t<bool> op_core_serial(
     DROPTION_SCOPE_ALL, "core_serial", false, "Analyze per-core in serial.",
@@ -902,7 +913,10 @@ droption_t<bool> op_core_serial(
     "However, the resulting schedule is acted upon by a single analysis thread"
     "which walks the N cores in lockstep in round robin fashion. "
     "How the scheduling is performed is controlled by a set "
-    "of options with the prefix \"sched_\" along with -cores.");
+    "of options with the prefix \"sched_\" along with -cores.  If only "
+    "core-sharded-preferring tools are enabled (e.g., " CPU_CACHE ", " TLB
+    ", " SCHEDULE_STATS ") and not all of them support parallel operation, "
+    "-core_serial is automatically turned on for offline analysis.");
 
 droption_t<int64_t>
     // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum.
diff --git a/clients/drcachesim/docs/drcachesim.dox.in b/clients/drcachesim/docs/drcachesim.dox.in
index 5cc2818c7bd..82ecaead2a9 100644
--- a/clients/drcachesim/docs/drcachesim.dox.in
+++ b/clients/drcachesim/docs/drcachesim.dox.in
@@ -1292,8 +1292,8 @@ Neither simulator has a simple way to know which core any particular thread
 executed on for each of its instructions.  The tracer records which core a
 thread is on each time it writes out a full trace buffer, giving an
 approximation of the actual scheduling: but this is not representative
-due to overhead (see \ref sec_drcachesim_as_traced).  By default, these cache and TLB
-simulators ignore that
+due to overhead (see \ref sec_drcachesim_as_traced).  For online analysis, by default,
+these cache and TLB simulators ignore that
 information and schedule threads to simulated cores in a static round-robin
 fashion with load balancing to fill in gaps with new threads after threads
 exit.  The option "-cpu_scheduling" (see \ref sec_drcachesim_ops) can be
@@ -1301,13 +1301,15 @@ used to instead map each physical cpu to a simulated core and use the
 recorded cpu that each segment of thread execution occurred on to schedule
 execution following the "as traced" schedule, but as just noted this is not
 representative.  Instead, we recommend using offline traces and dynamic
-re-scheduling as explained in \ref sec_drcachesim_sched_dynamic using the
-`-core_serial` parameter.  Here is an example:
+re-scheduling in core-sharded mode as explained in \ref sec_drcachesim_sched_dynamic
+using the
+`-core_serial` parameter.  In offline mode, `-core_serial` is the default for
+these simulators.  Here is an example:
 
 \code
 $ bin64/drrun -t drmemtrace -offline -- ~/test/pi_estimator 8 20
 Estimation of pi is 3.141592653798125
-$ bin64/drrun -t drcachesim -core_serial -cores 3 -indir drmemtrace.pi_estimator.*.dir
+$ bin64/drrun -t drcachesim -cores 3 -indir drmemtrace.pi_estimator.*.dir
 Cache simulation results:
 Core #0 (traced CPU(s): #0)
   L1I0 (size=32768, assoc=8, block=64, LRU) stats:
@@ -1473,6 +1475,9 @@ The #dynamorio::drmemtrace::TRACE_MARKER_TYPE_TIMESTAMP and
 #dynamorio::drmemtrace::TRACE_MARKER_TYPE_CPU_ID markers are modified by the dynamic
 scheduler to reflect the new schedule.  The new timestamps maintain relative ordering
 but should not be relied upon to indicate accurate durations between events.
+When analyzing core-sharded-on-disk traces, `-no_core_sharded` must be passed when
+using core-sharded-preferring tools to avoid an error from the framework attempting
+to re-schedule the already-scheduled trace.
 
 Traces also include markers indicating disruptions in user mode control
 flow such as signal handler entry and exit.
@@ -1512,7 +1517,9 @@ the framework controls the iteration), to request the next trace
 record for each output on its own.  This scheduling is also available to any analysis tool
 when the input traces are sharded by core (see the `-core_sharded` and `-core_serial`
 and various `-sched_*` option documentation under \ref sec_drcachesim_ops as well as
-core-sharded notes when \ref sec_drcachesim_newtool).
+core-sharded notes when \ref sec_drcachesim_newtool), and in fact is the
+default when all tools prefer core-sharded operation via
+#dynamorio::drmemtrace::analysis_tool_t::preferred_shard_type().
 
 ********************
 \section sec_drcachesim_as_traced As-Traced Schedule Limitations
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index 7b3f885777c..18c1a88e6dc 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -3245,9 +3245,14 @@ scheduler_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t outpu
                                                           uint64_t blocked_time)
 {
     VDO(this, 1, {
-        static int global_heartbeat;
+        static int64_t global_heartbeat;
+        // 10K is too frequent for simple analyzer runs: it is too noisy with
+        // the new core-sharded-by-default for new users using defaults.
+        // 50K is a reasonable compromise.
+        // XXX: Add a runtime option to tweak this.
+        static constexpr int64_t GLOBAL_HEARTBEAT_CADENCE = 50000;
         // We are ok with races as the cadence is approximate.
-        if (++global_heartbeat % 10000 == 0) {
+        if (++global_heartbeat % GLOBAL_HEARTBEAT_CADENCE == 0) {
             print_queue_stats();
         }
     });
diff --git a/clients/drcachesim/simulator/cache_simulator.cpp b/clients/drcachesim/simulator/cache_simulator.cpp
index 0d5bc263007..e834b3b27ae 100644
--- a/clients/drcachesim/simulator/cache_simulator.cpp
+++ b/clients/drcachesim/simulator/cache_simulator.cpp
@@ -632,8 +632,7 @@ cache_simulator_t::print_results()
     std::cerr << "Cache simulation results:\n";
     // Print core and associated L1 cache stats first.
     for (unsigned int i = 0; i < knobs_.num_cores; i++) {
-        print_core(i);
-        if (shard_type_ == SHARD_BY_CORE || thread_ever_counts_[i] > 0) {
+        if (print_core(i)) {
             if (l1_icaches_[i] != l1_dcaches_[i]) {
                 std::cerr << "  " << l1_icaches_[i]->get_name() << " ("
                           << l1_icaches_[i]->get_description() << ") stats:" << std::endl;
diff --git a/clients/drcachesim/simulator/simulator.cpp b/clients/drcachesim/simulator/simulator.cpp
index 9199de6bad3..5c61006a7b3 100644
--- a/clients/drcachesim/simulator/simulator.cpp
+++ b/clients/drcachesim/simulator/simulator.cpp
@@ -311,18 +311,19 @@ simulator_t::handle_thread_exit(memref_tid_t tid)
     thread2core_.erase(tid);
 }
 
-void
+bool
 simulator_t::print_core(int core) const
 {
     if (!knob_cpu_scheduling_ && shard_type_ == SHARD_BY_THREAD) {
         std::cerr << "Core #" << core << " (" << thread_ever_counts_[core]
                   << " thread(s))" << std::endl;
+        return thread_ever_counts_[core] > 0;
     } else {
         std::cerr << "Core #" << core;
         if (shard_type_ == SHARD_BY_THREAD && cpu_counts_[core] == 0) {
             // We keep the "(s)" mainly to simplify test templates.
             std::cerr << " (0 traced CPU(s))" << std::endl;
-            return;
+            return false;
         }
         std::cerr << " (";
         if (shard_type_ == SHARD_BY_THREAD) // Always 1:1 for SHARD_BY_CORE.
@@ -338,6 +339,8 @@ simulator_t::print_core(int core) const
             }
         }
         std::cerr << ")" << std::endl;
+        // If anything ran on this core, need_comma will be true.
+        return need_comma;
     }
 }
 
diff --git a/clients/drcachesim/simulator/simulator.h b/clients/drcachesim/simulator/simulator.h
index 78e434a6517..c3359e265da 100644
--- a/clients/drcachesim/simulator/simulator.h
+++ b/clients/drcachesim/simulator/simulator.h
@@ -69,6 +69,13 @@ class simulator_t : public analysis_tool_t {
     std::string
     initialize_shard_type(shard_type_t shard_type) override;
 
+    shard_type_t
+    preferred_shard_type() override
+    {
+        // We prefer a dynamic schedule with more realistic thread interleavings.
+        return SHARD_BY_CORE;
+    }
+
     bool
     process_memref(const memref_t &memref) override;
 
@@ -83,7 +90,8 @@ class simulator_t : public analysis_tool_t {
                double warmup_fraction, uint64_t sim_refs, bool cpu_scheduling,
                bool use_physical, unsigned int verbose);
 
-    void
+    // Returns whether the core was ever non-empty.
+    bool
     print_core(int core) const;
 
     int
diff --git a/clients/drcachesim/simulator/tlb_simulator.cpp b/clients/drcachesim/simulator/tlb_simulator.cpp
index f5ac4ff9e1b..396a9c9cab7 100644
--- a/clients/drcachesim/simulator/tlb_simulator.cpp
+++ b/clients/drcachesim/simulator/tlb_simulator.cpp
@@ -264,8 +264,7 @@ tlb_simulator_t::print_results()
 {
     std::cerr << "TLB simulation results:\n";
     for (unsigned int i = 0; i < knobs_.num_cores; i++) {
-        print_core(i);
-        if (thread_ever_counts_[i] > 0) {
+        if (print_core(i)) {
             std::cerr << "  L1I stats:" << std::endl;
             itlbs_[i]->get_stats()->print_stats("    ");
             std::cerr << "  L1D stats:" << std::endl;
diff --git a/clients/drcachesim/tests/offline-burst_client.templatex b/clients/drcachesim/tests/offline-burst_client.templatex
index 3bc0005808f..cd0fb000c19 100644
--- a/clients/drcachesim/tests/offline-burst_client.templatex
+++ b/clients/drcachesim/tests/offline-burst_client.templatex
@@ -23,7 +23,7 @@ DynamoRIO statistics:
 .*
 all done
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -36,9 +36,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-3][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-burst_maps.templatex b/clients/drcachesim/tests/offline-burst_maps.templatex
index 50ddb5c76bb..a851855f7c3 100644
--- a/clients/drcachesim/tests/offline-burst_maps.templatex
+++ b/clients/drcachesim/tests/offline-burst_maps.templatex
@@ -11,7 +11,7 @@ pre-DR start
 pre-DR detach
 all done
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -24,9 +24,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-3][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-burst_noreach.templatex b/clients/drcachesim/tests/offline-burst_noreach.templatex
index 50ddb5c76bb..a851855f7c3 100644
--- a/clients/drcachesim/tests/offline-burst_noreach.templatex
+++ b/clients/drcachesim/tests/offline-burst_noreach.templatex
@@ -11,7 +11,7 @@ pre-DR start
 pre-DR detach
 all done
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -24,9 +24,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-3][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-burst_replace.templatex b/clients/drcachesim/tests/offline-burst_replace.templatex
index 6ef8c7388c0..3b6bc0c5ac3 100644
--- a/clients/drcachesim/tests/offline-burst_replace.templatex
+++ b/clients/drcachesim/tests/offline-burst_replace.templatex
@@ -19,7 +19,7 @@ close file .*
 close file .*
 all done
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -32,9 +32,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-3][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-burst_static.templatex b/clients/drcachesim/tests/offline-burst_static.templatex
index 1e1446cce3b..5fcc4fa8959 100644
--- a/clients/drcachesim/tests/offline-burst_static.templatex
+++ b/clients/drcachesim/tests/offline-burst_static.templatex
@@ -20,7 +20,7 @@ DynamoRIO statistics:
 .*
 all done
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -33,9 +33,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-3][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex b/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex
index 0c3712398cd..4ef96d3d3e9 100644
--- a/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex
+++ b/clients/drcachesim/tests/offline-filter-and-instr-only-trace.templatex
@@ -1,6 +1,6 @@
 Hello, world!
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -12,9 +12,9 @@ Core #0 \(1 thread\(s\)\)
     Misses:                              0
     Compulsory misses:                   *[0-9,\.]*
     Invalidations:                       0
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-filter.templatex b/clients/drcachesim/tests/offline-filter.templatex
index 326b785b35a..7db8cbdfaa1 100644
--- a/clients/drcachesim/tests/offline-filter.templatex
+++ b/clients/drcachesim/tests/offline-filter.templatex
@@ -1,6 +1,6 @@
 Hello, world!
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                   *[0-9]?[0-9][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-fork.templatex b/clients/drcachesim/tests/offline-fork.templatex
index d4a5a4b2402..f8991518514 100644
--- a/clients/drcachesim/tests/offline-fork.templatex
+++ b/clients/drcachesim/tests/offline-fork.templatex
@@ -3,7 +3,7 @@ parent waiting for child
 child is running under DynamoRIO
 child has exited
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -16,9 +16,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-9][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-instr-only-trace.templatex b/clients/drcachesim/tests/offline-instr-only-trace.templatex
index 0c3712398cd..4ef96d3d3e9 100644
--- a/clients/drcachesim/tests/offline-instr-only-trace.templatex
+++ b/clients/drcachesim/tests/offline-instr-only-trace.templatex
@@ -1,6 +1,6 @@
 Hello, world!
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -12,9 +12,9 @@ Core #0 \(1 thread\(s\)\)
     Misses:                              0
     Compulsory misses:                   *[0-9,\.]*
     Invalidations:                       0
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tests/offline-multiproc.templatex b/clients/drcachesim/tests/offline-multiproc.templatex
index 76b03014e8e..1555a744eaa 100644
--- a/clients/drcachesim/tests/offline-multiproc.templatex
+++ b/clients/drcachesim/tests/offline-multiproc.templatex
@@ -1,6 +1,6 @@
 all done
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                    *[0-9\.,]*
     Misses:                  *[0-9,\.]*
@@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:       *[0-9\.,]*
     Invalidations:           *0
 .*   Miss rate:              *[0-9]*[,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                    *[0-9\.,]*
     Misses:                  *[0-9\.,]*
diff --git a/clients/drcachesim/tests/offline-simple.templatex b/clients/drcachesim/tests/offline-simple.templatex
index c35d941b14b..3f1e8897be4 100644
--- a/clients/drcachesim/tests/offline-simple.templatex
+++ b/clients/drcachesim/tests/offline-simple.templatex
@@ -1,6 +1,6 @@
 Hello, world!
 Cache simulation results:
-Core #0 \(1 thread\(s\)\)
+Core #0 \(traced CPU\(s\): #0\)
   L1I0 .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
@@ -13,9 +13,9 @@ Core #0 \(1 thread\(s\)\)
     Compulsory misses:            *[0-9,\.]*
     Invalidations:                *0
 .*   Miss rate:                        [0-9][,\.]..%
-Core #1 \(0 thread\(s\)\)
-Core #2 \(0 thread\(s\)\)
-Core #3 \(0 thread\(s\)\)
+Core #1 \(traced CPU\(s\): \)
+Core #2 \(traced CPU\(s\): \)
+Core #3 \(traced CPU\(s\): \)
 LL .* stats:
     Hits:                         *[0-9,\.]*
     Misses:                       *[0-9,\.]*
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index db3e217b5d8..61e175a4c5e 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -65,6 +65,11 @@ class schedule_stats_t : public analysis_tool_t {
     print_results() override;
     bool
     parallel_shard_supported() override;
+    shard_type_t
+    preferred_shard_type() override
+    {
+        return SHARD_BY_CORE;
+    }
     void *
     parallel_shard_init_stream(int shard_index, void *worker_data,
                                memtrace_stream_t *stream) override;
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt
index e7939ddaac1..5bb52650edc 100644
--- a/suite/tests/CMakeLists.txt
+++ b/suite/tests/CMakeLists.txt
@@ -2131,7 +2131,8 @@ if (AARCH64 AND UNIX AND ZLIB_FOUND)
     prefix_cmd_if_necessary(drcachesim_path ON ${drcachesim_path})
     torunonly_api(tool.drcacheoff.tlb_simulator_v2p "${drcachesim_path}"
       "offline-tlb_simulator_v2p" ""
-      "-indir;${locdir};-tool;TLB;-alt_module_dir;${srcdir};-module_file;${locdir}/raw/modules.log;-v2p_file;${srcdir}/v2p.textproto;-use_physical"
+      # Do not use core-sharded scheduling, so we'll have a deterministic result.
+      "-indir;${locdir};-tool;TLB;-alt_module_dir;${srcdir};-module_file;${locdir}/raw/modules.log;-v2p_file;${srcdir}/v2p.textproto;-use_physical;-no_core_sharded"
       OFF OFF)
     set(tool.drcacheoff.tlb_simulator_v2p_basedir
       "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests")
@@ -3976,12 +3977,14 @@ if (BUILD_CLIENTS)
         set(tool.core_on_disk_rawtemp ON) # no preprocessor
 
         torunonly_simtool(core_on_disk_schedule ${ci_shared_app}
-          "-indir ${core_sharded_dir} -tool schedule_stats" "")
+          # Turn off the default core-sharded mode to avoid re-scheduling the trace.
+          "-indir ${core_sharded_dir} -tool schedule_stats -no_core_sharded" "")
         set(tool.core_on_disk_schedule_rawtemp ON) # no preprocessor
 
         # Test -only_shards on core-sharded-on-disk traces.
         torunonly_simtool(only_shards ${ci_shared_app}
-          "-indir ${core_sharded_dir} -tool schedule_stats -only_shards 2,3" "")
+          # Turn off the default core-sharded mode to avoid re-scheduling the trace.
+          "-indir ${core_sharded_dir} -tool schedule_stats -only_shards 2,3 -no_core_sharded" "")
         set(tool.core_on_disk_rawtemp ON) # no preprocessor
       endif ()
     endif ()
@@ -4100,10 +4103,10 @@ if (BUILD_CLIENTS)
       torunonly_drcacheoff(fork linux.fork "" "" "")
     endif ()
 
-    # Test reading a legacy pre-interleaved file.
+    # Test reading a legacy pre-interleaved file in thread-sharded mode.
     if (ZLIB_FOUND)
       torunonly_api(tool.drcacheoff.legacy "${drcachesim_path}" "offline-legacy.c" ""
-        "-infile;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/offline-legacy-trace.gz"
+        "-infile;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/offline-legacy-trace.gz;-no_core_sharded"
         OFF OFF)
       set(tool.drcacheoff.legacy_basedir
         "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests")
@@ -4139,16 +4142,20 @@ if (BUILD_CLIENTS)
     # Test reading a trace in sharded snappy-compressed files.
     if (libsnappy)
       # with a parallel tool (basic_counts)
-      torunonly_api(tool.drcacheoff.snappy "${drcachesim_path}" "offline-snappy.c" ""
+      torunonly_api(tool.drcacheoff.snappy_parallel "${drcachesim_path}" "offline-snappy.c" ""
         "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir;-tool;basic_counts"
         OFF OFF)
+      set(tool.drcacheoff.snappy_parallel_basedir
+        "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests")
+      set(tool.drcacheoff.snappy_parallel_expectbase "offline-snappy")
 
-      # with a legacy serial tool (full simulator)
-      torunonly_api(tool.drcacheoff.snappy "${drcachesim_path}" "offline-snappy-serial.c" ""
-        "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir"
+      # With a legacy serial tool (full simulator) in thread-sharded mode.
+      torunonly_api(tool.drcacheoff.snappy_serial "${drcachesim_path}" "offline-snappy-serial.c" ""
+        "-indir;${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/drmemtrace.chase-snappy.x64.tracedir;-no_core_sharded"
         OFF OFF)
-      set(tool.drcacheoff.snappy_basedir
+      set(tool.drcacheoff.snappy_serial_basedir
         "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests")
+      set(tool.drcacheoff.snappy_serial_expectbase "offline-snappy-serial")
     endif()
 
     if (UNIX) # Enable on Windows once i#1727 is fixed.
@@ -4441,15 +4448,16 @@ if (BUILD_CLIENTS)
         # Run with -trace_after_instrs to ensure we test the
         # drbbdup + rseq combo (i#5658, i#5659).
         "-trace_after_instrs 5K"
-        "@${test_mode_flag}@-test_mode_name@rseq_app" "")
+        # Run thread-sharded for the invariant checker.
+        "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "")
       # Test filtering.
       torunonly_drcacheoff(rseq-filter linux.rseq
         "-trace_after_instrs 5K -L0_filter"
-        "@${test_mode_flag}@-test_mode_name@rseq_app" "")
+        "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "")
       set(tool.drcacheoff.rseq-filter_expectbase "offline-rseq")
       torunonly_drcacheoff(rseq-dfilter linux.rseq
         "-trace_after_instrs 5K -L0D_filter"
-        "@${test_mode_flag}@-test_mode_name@rseq_app" "")
+        "@${test_mode_flag}@-test_mode_name@rseq_app@-no_core_sharded" "")
       set(tool.drcacheoff.rseq-dfilter_expectbase "offline-rseq")
     endif ()
 
@@ -4746,9 +4754,9 @@ if (BUILD_CLIENTS)
       # Run the record filter tool with a null filter.
       set(${testname}_postcmd2 "${CMAKE_COMMAND}@-E@make_directory@${outdir}")
       set(${testname}_postcmd3 ${launch_cmd})
-      # Run the analyzer on the result.
+      # Run the analyzer on the result.  Avoid double-core-sharding.
       set(${testname}_postcmd4
-        "${drcachesim_path}@-indir@${outdir}@-tool@${analyzer}")
+        "${drcachesim_path}@-indir@${outdir}@-tool@${analyzer}@-no_core_sharded")
     endmacro ()
 
     set(testname "tool.record_filter")
@@ -4884,7 +4892,7 @@ if (BUILD_CLIENTS)
       file(MAKE_DIRECTORY ${outdir})
       torunonly_api(tool.record_filter_as_traced "${drcachesim_path}"
         "record_filter_as_traced"
-        "" "-tool;schedule_stats;-indir;${outdir}" OFF OFF)
+        "" "-tool;schedule_stats;-no_core_sharded;-indir;${outdir}" OFF OFF)
       set(tool.record_filter_as_traced_runcmp "${CMAKE_CURRENT_SOURCE_DIR}/runmulti.cmake")
       set(tool.record_filter_as_traced_precmd
         "${drcachesim_path}@-tool@record_filter@-cpu_schedule_file@${sched_file}@-core_sharded@-cores@7@-indir@${trace_dir}@-outdir@${outdir}")
@@ -4901,7 +4909,7 @@ if (BUILD_CLIENTS)
       file(MAKE_DIRECTORY ${outdir})
       torunonly_api(tool.record_filter_start_idle "${drcachesim_path}"
         "record_filter_start_idle"
-        "" "-tool;schedule_stats;-indir;${outdir}" OFF OFF)
+        "" "-tool;schedule_stats;-indir;${outdir};-no_core_sharded" OFF OFF)
       set(tool.record_filter_start_idle_runcmp "${CMAKE_CURRENT_SOURCE_DIR}/runmulti.cmake")
       set(tool.record_filter_start_idle_precmd
         "${drcachesim_path}@-tool@record_filter@-cpu_schedule_file@${sched_file}@-core_sharded@-cores@4@-indir@${trace_dir}@-outdir@${outdir}")