diff --git a/.github/workflows/ci-aarchxx.yml b/.github/workflows/ci-aarchxx.yml
deleted file mode 100644
index 22b42ef0e34..00000000000
--- a/.github/workflows/ci-aarchxx.yml
+++ /dev/null
@@ -1,107 +0,0 @@
-# **********************************************************
-# Copyright (c) 2020-2023 Google, Inc.  All rights reserved.
-# Copyright (c) 2023 Arm Limited        All rights reserved.
-# **********************************************************
-
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-#   this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of Google, Inc. nor the names of its contributors may be
-#   used to endorse or promote products derived from this software without
-#   specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-# DAMAGE.
-
-# Github Actions workflow for aarch64 Continuous Integration testing.
-
-name: ci-aarchxx
-on:
-  # Run on pushes to master and on pull request changes, including from a
-  # forked repo with no "push" trigger, while avoiding duplicate triggers.
-  push:
-    branches:
-      - master
-  pull_request:
-    types: [opened, reopened, synchronize]
-  merge_group:
-
-  workflow_dispatch:
-
-jobs:
-  aarch64-precommit:
-    strategy:
-      fail-fast: false
-      matrix:
-        # This job will run in parallel.
-        os: [ubuntu-20-arm64, ubuntu-20-arm64-sve]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-
-      # Cancel any prior runs for a PR (but do not cancel master branch runs).
-      - name: Cancel previous runs
-        uses: n1hility/cancel-previous-runs@v2
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-        if: ${{ github.event_name == 'pull_request' }}
-
-      # We also need origin/master for pre-commit source file checks in runsuite.cmake.
-      # But fetching multiple branches isn't supported yet: actions/checkout#214
-      # Pending PR that adds this support actions/checkout#155
-      # TODO i#4549: When necessary support is available, remove/replace the
-      # workaround here and from every job in other Github Actions CI workflows.
-      - name: Fetch master
-        run: git fetch --no-tags --depth=1 origin master
-
-      - name: Create build directory
-        run: mkdir build
-
-      - name: Run Suite
-        working-directory: build
-        run: ../suite/runsuite_wrapper.pl travis
-        env:
-          CI_BRANCH: ${{ github.ref }}
-
-      - name: Send failure mail to dynamorio-devs
-        if: failure() && github.ref == 'refs/heads/master'
-        uses: dawidd6/action-send-mail@v2
-        with:
-          server_address: smtp.gmail.com
-          server_port: 465
-          username: ${{secrets.DYNAMORIO_NOTIFICATION_EMAIL_USERNAME}}
-          password: ${{secrets.DYNAMORIO_NOTIFICATION_EMAIL_PASSWORD}}
-          subject: |
-            [${{github.repository}}] ${{github.workflow}} FAILED
-            on ${{github.event_name}} at ${{github.ref}}
-          body: |
-            Github Actions CI workflow run FAILED!
-            Workflow: ${{github.workflow}}/x86-32
-            Repository: ${{github.repository}}
-            Branch ref: ${{github.ref}}
-            SHA: ${{github.sha}}
-            Triggering actor: ${{github.actor}}
-            Triggering event: ${{github.event_name}}
-            Run Id: ${{github.run_id}}
-            See more details on github.com/DynamoRIO/dynamorio/actions/runs/${{github.run_id}}
-          to: dynamorio-devs@googlegroups.com
-          from: Github Action CI
diff --git a/clients/drcachesim/common/utils.h b/clients/drcachesim/common/utils.h
index 21a25efbb17..87b5ce6f11f 100644
--- a/clients/drcachesim/common/utils.h
+++ b/clients/drcachesim/common/utils.h
@@ -57,6 +57,8 @@ namespace drmemtrace {
 
 // XXX: DR should export this
 #define INVALID_THREAD_ID 0
+// We avoid collisions with DR's INVALID_PROCESS_ID by using our own name.
+#define INVALID_PID (-1)
 
 // XXX: perhaps we should use a C++-ish stream approach instead
 // This cannot be named ERROR as that conflicts with Windows headers.
diff --git a/clients/drcachesim/reader/record_file_reader.h b/clients/drcachesim/reader/record_file_reader.h
index 33cf2df4001..a80cdeba581 100644
--- a/clients/drcachesim/reader/record_file_reader.h
+++ b/clients/drcachesim/reader/record_file_reader.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2022-2023 Google, Inc.  All rights reserved.
+ * Copyright (c) 2022-2024 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -126,7 +126,7 @@ class record_reader_t : public std::iterator<std::input_iterator_tag, trace_entr
     virtual ~record_reader_t()
     {
     }
-    bool
+    virtual bool
     init()
     {
         if (!open_input_file())
diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp
index 77767d300a2..bb5e72c9a99 100644
--- a/clients/drcachesim/scheduler/scheduler.cpp
+++ b/clients/drcachesim/scheduler/scheduler.cpp
@@ -207,6 +207,17 @@ scheduler_tmpl_t<memref_t, reader_t>::record_type_has_tid(memref_t record,
     return true;
 }
 
+template <>
+bool
+scheduler_tmpl_t<memref_t, reader_t>::record_type_has_pid(memref_t record,
+                                                          memref_pid_t &pid)
+{
+    // 'pid' is written only for a valid (non-sentinel) process id.
+    if (record.marker.pid != INVALID_PID)
+        pid = record.marker.pid;
+    return record.marker.pid != INVALID_PID;
+}
+
 template <>
 void
 scheduler_tmpl_t<memref_t, reader_t>::record_type_set_tid(memref_t &record,
@@ -222,6 +233,23 @@ scheduler_tmpl_t<memref_t, reader_t>::record_type_is_instr(memref_t record)
     return type_is_instr(record.instr.type);
 }
 
+template <>
+bool
+scheduler_tmpl_t<memref_t, reader_t>::record_type_is_encoding(memref_t record)
+{
+    // memref_t has no separate encoding records: encodings live in instr records.
+    return false;
+}
+
+template <>
+bool
+scheduler_tmpl_t<memref_t, reader_t>::record_type_is_instr_boundary(memref_t record,
+                                                                    memref_t prev_record)
+{
+    // prev_record is unused: with no encoding records, every instr is a boundary.
+    return record_type_is_instr(record);
+}
+
 template <>
 bool
 scheduler_tmpl_t<memref_t, reader_t>::record_type_is_marker(memref_t record,
@@ -302,6 +330,13 @@ scheduler_tmpl_t<memref_t, reader_t>::print_record(const memref_t &record)
     fprintf(stderr, "\n");
 }
 
+template <>
+void
+scheduler_tmpl_t<memref_t, reader_t>::insert_switch_tid_pid(input_info_t &input)
+{
+    // Nothing to insert: each memref_t record already carries its own tid and pid.
+}
+
 /******************************************************************************
  * Specializations for scheduler_tmpl_t<record_reader_t>, aka record_scheduler_t.
  */
@@ -343,6 +378,17 @@ scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_has_tid(
     return true;
 }
 
+template <>
+bool
+scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_has_pid(
+    trace_entry_t record, memref_pid_t &pid)
+{
+    // Only TRACE_TYPE_PID records carry a process id, stored in the addr field.
+    if (record.type == TRACE_TYPE_PID)
+        pid = static_cast<memref_pid_t>(record.addr);
+    return record.type == TRACE_TYPE_PID;
+}
+
 template <>
 void
 scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_set_tid(
@@ -361,6 +407,34 @@ scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_is_instr(
     return type_is_instr(static_cast<trace_type_t>(record.type));
 }
 
+template <>
+bool
+scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_is_encoding(
+    trace_entry_t record)
+{
+    return record.type == TRACE_TYPE_ENCODING;
+}
+
+template <>
+bool
+scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_is_instr_boundary(
+    trace_entry_t record, trace_entry_t prev_record)
+{
+    // Encodings stay with their instr: a record right after an encoding never counts.
+    return !record_type_is_encoding(prev_record) &&
+        (record_type_is_instr(record) || record_type_is_encoding(record));
+}
+
+template <>
+typename scheduler_tmpl_t<trace_entry_t, record_reader_t>::stream_status_t
+scheduler_tmpl_t<trace_entry_t, record_reader_t>::unread_last_record(
+    output_ordinal_t output, trace_entry_t &record, input_info_t *&input)
+{
+    // Unsupported for record_reader_t: undoing the instr count would need the
+    // prev-prev record for record_type_is_instr_boundary() (see the general version).
+    return STATUS_NOT_IMPLEMENTED;
+}
+
 template <>
 bool
 scheduler_tmpl_t<trace_entry_t, record_reader_t>::record_type_is_marker(
@@ -437,6 +511,27 @@ scheduler_tmpl_t<trace_entry_t, record_reader_t>::print_record(
             record.addr);
 }
 
+template <>
+void
+scheduler_tmpl_t<trace_entry_t, record_reader_t>::insert_switch_tid_pid(
+    input_info_t &input)
+{
+    // Synthesize explicit tid,pid records so reader_t will see the new context.
+    trace_entry_t tid_record;
+    tid_record.type = TRACE_TYPE_THREAD;
+    tid_record.size = 0;
+    tid_record.addr = static_cast<addr_t>(input.tid);
+
+    trace_entry_t pid_record;
+    pid_record.type = TRACE_TYPE_PID;
+    pid_record.size = 0;
+    pid_record.addr = static_cast<addr_t>(input.pid);
+
+    // push_front() reverses the order: pid goes in first so that tid ends up in front.
+    input.queue.push_front(pid_record);
+    input.queue.push_front(tid_record);
+}
+
 /***************************************************************************
  * Scheduled stream.
  */
@@ -1470,7 +1565,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::get_tid(output_ordinal_t output)
     int index = outputs_[output].cur_input;
     if (index < 0)
         return -1;
-    if (inputs_[index].is_combined_stream())
+    if (inputs_[index].is_combined_stream() ||
+        TESTANY(OFFLINE_FILE_TYPE_CORE_SHARDED, inputs_[index].reader->get_filetype()))
         return inputs_[index].last_record_tid;
     return inputs_[index].tid;
 }
@@ -2035,6 +2131,10 @@ scheduler_tmpl_t<RecordType, ReaderType>::set_cur_input(output_ordinal_t output,
         outputs_[output].stream->filetype_ = inputs_[input].reader->get_filetype();
     }
 
+    if (inputs_[input].pid != INVALID_PID) {
+        insert_switch_tid_pid(inputs_[input]);
+    }
+
     if (!switch_sequence_.empty() &&
         outputs_[output].stream->get_instruction_ordinal() > 0) {
         sched_type_t::switch_type_t switch_type = SWITCH_INVALID;
@@ -2583,7 +2683,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                     } else {
                         input->switch_to_input = it->second;
                     }
-                } else if (record_type_is_instr(record)) {
+                } else if (record_type_is_instr_boundary(record,
+                                                         outputs_[output].last_record)) {
                     if (syscall_incurs_switch(input, blocked_time)) {
                         // Model as blocking and should switch to a different input.
                         need_new_input = true;
@@ -2641,7 +2742,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                 }
             }
             if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
-                record_type_is_instr(record) && !outputs_[output].in_kernel_code) {
+                record_type_is_instr_boundary(record, outputs_[output].last_record) &&
+                !outputs_[output].in_kernel_code) {
                 ++input->instrs_in_quantum;
                 if (input->instrs_in_quantum > options_.quantum_duration) {
                     // We again prefer to switch to another input even if the current
@@ -2669,7 +2771,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                     // We only switch on instruction boundaries.  We could possibly switch
                     // in between (e.g., scatter/gather long sequence of reads/writes) by
                     // setting input->switching_pre_instruction.
-                    record_type_is_instr(record)) {
+                    record_type_is_instr_boundary(record, outputs_[output].last_record)) {
                     VPRINT(this, 4,
                            "next_record[%d]: hit end of time quantum after %" PRIu64 "\n",
                            output, input->time_spent_in_quantum);
@@ -2715,7 +2817,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
                        prev_input, outputs_[output].cur_input);
                 if (!preempt) {
                     if (options_.quantum_unit == QUANTUM_INSTRUCTIONS &&
-                        record_type_is_instr(record)) {
+                        record_type_is_instr_boundary(record,
+                                                      outputs_[output].last_record)) {
                         --inputs_[prev_input].instrs_in_quantum;
                     } else if (options_.quantum_unit == QUANTUM_TIME) {
                         inputs_[prev_input].time_spent_in_quantum -=
@@ -2764,6 +2867,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
 
     outputs_[output].last_record = record;
     record_type_has_tid(record, input->last_record_tid);
+    record_type_has_pid(record, input->pid);
     return sched_type_t::STATUS_OK;
 }
 
@@ -2784,6 +2888,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal_t ou
     VPRINT(this, 4, "next_record[%d]: unreading last record, from %d\n", output,
            input->index);
     input->queue.push_back(outinfo.last_record);
+    // XXX: This should be record_type_is_instr_boundary() but we don't have the pre-prev
+    // record.  For now we don't support unread_last_record() for record_reader_t,
+    // enforced in a specialization of unread_last_record().
     if (options_.quantum_unit == QUANTUM_INSTRUCTIONS && record_type_is_instr(record))
         --input->instrs_in_quantum;
     outinfo.last_record = create_invalid_record();
diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h
index 4e6ea8271c2..049935d0e58 100644
--- a/clients/drcachesim/scheduler/scheduler.h
+++ b/clients/drcachesim/scheduler/scheduler.h
@@ -1132,6 +1132,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
             : lock(new std::mutex)
         {
         }
+        // Returns whether the stream mixes threads (online analysis mode) yet
+        // wants to treat them as separate shards (so not core-sharded-on-disk).
         bool
         is_combined_stream()
         {
@@ -1152,6 +1154,8 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
         int workload = -1;
         // If left invalid, this is a combined stream (online analysis mode).
         memref_tid_t tid = INVALID_THREAD_ID;
+        memref_pid_t pid = INVALID_PID;
+        // Used for combined streams and core-sharded-on-disk inputs.
         memref_tid_t last_record_tid = INVALID_THREAD_ID;
         // If non-empty these records should be returned before incrementing the reader.
         // This is used for read-ahead and inserting synthetic records.
@@ -1437,6 +1441,10 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     bool
     record_type_has_tid(RecordType record, memref_tid_t &tid);
 
+    // If the given record has a process id field, returns true and the value.
+    bool
+    record_type_has_pid(RecordType record, memref_pid_t &pid);
+
     // For trace_entry_t, only sets the tid for record types that have it.
     void
     record_type_set_tid(RecordType &record, memref_tid_t tid);
@@ -1456,6 +1464,12 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     bool
     record_type_is_invalid(RecordType record);
 
+    bool
+    record_type_is_encoding(RecordType record);
+
+    bool
+    record_type_is_instr_boundary(RecordType record, RecordType prev_record);
+
     // Creates the marker we insert between regions of interest.
     RecordType
     create_region_separator_marker(memref_tid_t tid, uintptr_t value);
@@ -1467,6 +1481,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
     RecordType
     create_invalid_record();
 
+    // If necessary, inserts context switch info on the incoming pid+tid.
+    // The lock for 'input' is held by the caller.
+    void
+    insert_switch_tid_pid(input_info_t &input);
+
     // Used for diagnostics: prints record fields to stderr.
     void
     print_record(const RecordType &record);
diff --git a/clients/drcachesim/tests/core_on_disk.templatex b/clients/drcachesim/tests/core_on_disk.templatex
index 3b9203f203d..9d45ae880df 100644
--- a/clients/drcachesim/tests/core_on_disk.templatex
+++ b/clients/drcachesim/tests/core_on_disk.templatex
@@ -3,27 +3,19 @@ Total counts:
 .*
            8 total threads
 .*
-Core [0-5] counts:
+Core [0-3] counts:
 .*
            2 threads
 .*
-Core [0-5] counts:
+Core [0-3] counts:
 .*
            2 threads
 .*
-Core [0-5] counts:
+Core [0-3] counts:
 .*
            2 threads
 .*
-Core [0-5] counts:
+Core [0-3] counts:
 .*
-           1 threads
-.*
-Core [0-5] counts:
-.*
-           1 threads
-.*
-Core [0-5] counts:
-.*
-           1 threads
+           2 threads
 .*
diff --git a/clients/drcachesim/tests/core_on_disk_schedule.templatex b/clients/drcachesim/tests/core_on_disk_schedule.templatex
new file mode 100644
index 00000000000..df62b6f68ef
--- /dev/null
+++ b/clients/drcachesim/tests/core_on_disk_schedule.templatex
@@ -0,0 +1,5 @@
+.*
+Core #0 schedule: (FJ_|EK_|GI|CHC_C__)
+Core #1 schedule: (FJ_|EK_|GI|CHC_C__)
+Core #2 schedule: (FJ_|EK_|GI|CHC_C__)
+Core #3 schedule: (FJ_|EK_|GI|CHC_C__)
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.0.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.0.trace.zip
new file mode 100644
index 00000000000..690c07dca85
Binary files /dev/null and b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.0.trace.zip differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.1.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.1.trace.zip
new file mode 100644
index 00000000000..9291d0be7be
Binary files /dev/null and b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.1.trace.zip differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.2.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.2.trace.zip
new file mode 100644
index 00000000000..f4ce5fbbfcf
Binary files /dev/null and b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.2.trace.zip differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.3.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.3.trace.zip
new file mode 100644
index 00000000000..9b144143e03
Binary files /dev/null and b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.core.3.trace.zip differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257596.5136.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257596.5136.trace.zip
deleted file mode 100644
index 1a2de7d9aa3..00000000000
Binary files a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257596.5136.trace.zip and /dev/null differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257598.2655.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257598.2655.trace.zip
deleted file mode 100644
index e2c996f3493..00000000000
Binary files a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257598.2655.trace.zip and /dev/null differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257599.2824.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257599.2824.trace.zip
deleted file mode 100644
index 3fda1bf3c86..00000000000
Binary files a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257599.2824.trace.zip and /dev/null differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257600.9375.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257600.9375.trace.zip
deleted file mode 100644
index 9c36953f97b..00000000000
Binary files a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257600.9375.trace.zip and /dev/null differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257601.8161.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257601.8161.trace.zip
deleted file mode 100644
index 61e682e43fa..00000000000
Binary files a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257601.8161.trace.zip and /dev/null differ
diff --git a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257604.1983.trace.zip b/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257604.1983.trace.zip
deleted file mode 100644
index ca76bc2b329..00000000000
Binary files a/clients/drcachesim/tests/drmemtrace.threadsig-core-sharded.x64.tracedir/drmemtrace.threadsig.1257604.1983.trace.zip and /dev/null differ
diff --git a/clients/drcachesim/tests/mock_reader.h b/clients/drcachesim/tests/mock_reader.h
index e17b2831a0f..e08d77b7e03 100644
--- a/clients/drcachesim/tests/mock_reader.h
+++ b/clients/drcachesim/tests/mock_reader.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2016-2023 Google, Inc.  All rights reserved.
+ * Copyright (c) 2016-2024 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -83,6 +83,54 @@ class mock_reader_t : public reader_t {
     int index_ = -1;
 };
 
+// A mock record_reader_t that replays an in-memory vector of trace_entry_t records.
+class mock_record_reader_t : public record_reader_t {
+public:
+    mock_record_reader_t() = default;
+    explicit mock_record_reader_t(const std::vector<trace_entry_t> &trace)
+        : trace_(trace)
+    {
+        verbosity_ = 3;
+    }
+    bool
+    init() override
+    {
+        eof_ = false;
+        ++*this;
+        return true;
+    }
+    bool
+    read_next_entry() override
+    {
+        ++index_;
+        if (index_ >= static_cast<int>(trace_.size())) {
+            eof_ = true;
+            return false;
+        }
+        cur_entry_ = trace_[index_];
+        return true;
+    }
+    std::string
+    get_stream_name() const override
+    {
+        return "";
+    }
+    bool
+    open_single_file(const std::string &input_path) override
+    {
+        return false;
+    }
+    bool
+    open_input_file() override
+    {
+        return false;
+    }
+
+private:
+    std::vector<trace_entry_t> trace_;
+    int index_ = -1;
+};
+
 static inline trace_entry_t
 make_memref(addr_t addr, trace_type_t type = TRACE_TYPE_READ, unsigned short size = 1)
 {
@@ -177,6 +225,16 @@ make_marker(trace_marker_type_t type, uintptr_t value)
     return entry;
 }
 
+static inline trace_entry_t
+make_encoding(unsigned short size, addr_t encoding)
+{
+    trace_entry_t encoding_entry;
+    encoding_entry.addr = encoding;
+    encoding_entry.size = size;
+    encoding_entry.type = TRACE_TYPE_ENCODING;
+    return encoding_entry;
+}
+
 } // namespace drmemtrace
 } // namespace dynamorio
 
diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp
index 760dc7cab04..7a18fbf0c9e 100644
--- a/clients/drcachesim/tests/schedule_stats_test.cpp
+++ b/clients/drcachesim/tests/schedule_stats_test.cpp
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2021-2023 Google, LLC  All rights reserved.
+ * Copyright (c) 2021-2024 Google, LLC  All rights reserved.
  * **********************************************************/
 
 /*
@@ -63,36 +63,15 @@ using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_SYSCALL;
 // Bypasses the analyzer and scheduler for a controlled test sequence.
 // Alternates the per-core memref vectors in lockstep.
 static schedule_stats_t::counters_t
-run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs,
-                   const std::unordered_map<memref_tid_t, int64_t> &tid2ord)
+run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
 {
-    schedule_stats_t tool(/*print_every=*/1, /*verbosity=*/2);
-    // schedule_stats_t uses get_input_id() to identify switches.
-    class mock_stream_t : public default_memtrace_stream_t {
-    public:
-        void
-        set_input_id(int64_t input_id)
-        {
-            input_id_ = input_id;
-        }
-        int64_t
-        get_input_id() const override
-        {
-            return input_id_;
-        }
-        memtrace_stream_t *
-        get_input_interface() const override
-        {
-            return const_cast<mock_stream_t *>(this);
-        }
-
-    private:
-        int64_t input_id_ = 0;
-    };
+    // At verbosity 2+ we'd need to subclass default_memtrace_stream_t
+    // and provide a non-null get_input_interface() (point at "this").
+    schedule_stats_t tool(/*print_every=*/1, /*verbosity=*/1);
     struct per_core_t {
         void *worker_data;
         void *shard_data;
-        mock_stream_t stream;
+        default_memtrace_stream_t stream;
         bool finished = false;
         size_t memref_idx = 0;
     };
@@ -109,7 +88,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs,
             if (per_core[cpu].finished)
                 continue;
             memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
-            per_core[cpu].stream.set_input_id(tid2ord.at(memref.instr.tid));
+            per_core[cpu].stream.set_tid(memref.instr.tid);
             bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
             assert(res);
             ++per_core[cpu].memref_idx;
@@ -132,10 +111,6 @@ test_basic_stats()
     static constexpr int64_t TID_A = 42;
     static constexpr int64_t TID_B = 142;
     static constexpr int64_t TID_C = 242;
-    std::unordered_map<memref_tid_t, int64_t> tid2ord;
-    tid2ord[TID_A] = 0;
-    tid2ord[TID_B] = 1;
-    tid2ord[TID_C] = 2;
     std::vector<std::vector<memref_t>> memrefs = {
         {
             gen_instr(TID_A),
@@ -187,7 +162,7 @@ test_basic_stats()
             gen_instr(TID_B),
         },
     };
-    auto result = run_schedule_stats(memrefs, tid2ord);
+    auto result = run_schedule_stats(memrefs);
     assert(result.instrs == 16);
     assert(result.total_switches == 6);
     assert(result.voluntary_switches == 2);
@@ -210,10 +185,6 @@ test_idle()
     static constexpr int64_t TID_A = 42;
     static constexpr int64_t TID_B = 142;
     static constexpr int64_t TID_C = 242;
-    std::unordered_map<memref_tid_t, int64_t> tid2ord;
-    tid2ord[TID_A] = 0;
-    tid2ord[TID_B] = 1;
-    tid2ord[TID_C] = 2;
     std::vector<std::vector<memref_t>> memrefs = {
         {
             gen_instr(TID_B),
@@ -248,7 +219,7 @@ test_idle()
             gen_instr(TID_A),
         },
     };
-    auto result = run_schedule_stats(memrefs, tid2ord);
+    auto result = run_schedule_stats(memrefs);
     assert(result.instrs == 13);
     assert(result.total_switches == 5);
     assert(result.voluntary_switches == 0);
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index d18bcd0253a..5801bce250c 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -3911,6 +3911,136 @@ test_random_schedule()
     }
 }
 
+static void
+test_record_scheduler()
+{
+    // Test record_scheduler_t switches, which operate differently:
+    // they have to deal with encoding records preceding instructions,
+    // and they have to insert tid,pid records.
+    std::cerr << "\n----------------\nTesting record_scheduler_t\n";
+    static constexpr memref_tid_t TID_A = 42;
+    static constexpr memref_tid_t TID_B = TID_A + 1;
+    static constexpr memref_tid_t PID_A = 142;
+    static constexpr memref_tid_t PID_B = PID_A + 1;
+    static constexpr int NUM_OUTPUTS = 1;
+    static constexpr addr_t ENCODING_SIZE = 2;
+    static constexpr addr_t ENCODING_IGNORE = 0xfeed;
+    std::vector<trace_entry_t> refs_A = {
+        /* clang-format off */
+        make_thread(TID_A),
+        make_pid(PID_A),
+        make_version(TRACE_ENTRY_VERSION),
+        make_timestamp(10),
+        make_encoding(ENCODING_SIZE, ENCODING_IGNORE),
+        make_instr(10),
+        make_timestamp(20),
+        make_marker(TRACE_MARKER_TYPE_SYSCALL, 42),
+        make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0),
+        make_timestamp(120),
+        make_encoding(ENCODING_SIZE, ENCODING_IGNORE),
+        make_instr(30),
+        make_encoding(ENCODING_SIZE, ENCODING_IGNORE),
+        make_instr(50),
+        make_exit(TID_A),
+        /* clang-format on */
+    };
+    std::vector<trace_entry_t> refs_B = {
+        /* clang-format off */
+        make_thread(TID_B),
+        make_pid(PID_B),
+        make_version(TRACE_ENTRY_VERSION),
+        make_timestamp(20),
+        make_encoding(ENCODING_SIZE, ENCODING_IGNORE),
+        make_instr(20),
+        make_encoding(ENCODING_SIZE, ENCODING_IGNORE),
+        make_instr(40),
+        make_encoding(ENCODING_SIZE, ENCODING_IGNORE),
+        make_instr(60),
+        // No encoding for repeated instr.
+        make_instr(20),
+        make_exit(TID_B),
+        /* clang-format on */
+    };
+    std::vector<record_scheduler_t::input_reader_t> readers;
+    readers.emplace_back(
+        std::unique_ptr<mock_record_reader_t>(new mock_record_reader_t(refs_A)),
+        std::unique_ptr<mock_record_reader_t>(new mock_record_reader_t()), TID_A);
+    readers.emplace_back(
+        std::unique_ptr<mock_record_reader_t>(new mock_record_reader_t(refs_B)),
+        std::unique_ptr<mock_record_reader_t>(new mock_record_reader_t()), TID_B);
+    record_scheduler_t scheduler;
+    std::vector<record_scheduler_t::input_workload_t> sched_inputs;
+    sched_inputs.emplace_back(std::move(readers));
+    record_scheduler_t::scheduler_options_t sched_ops(
+        record_scheduler_t::MAP_TO_ANY_OUTPUT, record_scheduler_t::DEPENDENCY_IGNORE,
+        record_scheduler_t::SCHEDULER_DEFAULTS,
+        /*verbosity=*/4);
+    sched_ops.quantum_duration = 2;
+    sched_ops.block_time_scale = 0.001; // Do not stay blocked.
+    if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) !=
+        record_scheduler_t::STATUS_SUCCESS)
+        assert(false);
+    auto *stream0 = scheduler.get_stream(0);
+    auto check_next = [](record_scheduler_t::stream_t *stream,
+                         record_scheduler_t::stream_status_t expect_status,
+                         trace_type_t expect_type = TRACE_TYPE_MARKER,
+                         addr_t expect_addr = 0) {
+        trace_entry_t record;
+        record_scheduler_t::stream_status_t status = stream->next_record(record);
+        assert(status == expect_status);
+        if (status == record_scheduler_t::STATUS_OK) {
+            if (record.type != expect_type) {
+                std::cerr << "Expected type " << expect_type << " != " << record.type
+                          << "\n";
+                assert(false);
+            }
+            if (expect_addr != 0 && record.addr != expect_addr) {
+                std::cerr << "Expected addr " << expect_addr << " != " << record.addr
+                          << "\n";
+                assert(false);
+            }
+        }
+    };
+    // Advance cpu0 on TID_A to its 1st context switch.
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD, TID_A);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_PID, PID_A);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_ENCODING);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    // Ensure the context switch is *before* the encoding.
+    // Advance cpu0 on TID_B to its 1st context switch.
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD, TID_B);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_PID, PID_B);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_MARKER);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_ENCODING);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_ENCODING);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    // Ensure the switch is *before* the encoding.
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD, TID_A);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_PID, PID_A);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_ENCODING);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD, TID_B);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_PID, PID_B);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_ENCODING);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD_EXIT);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD, TID_A);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_PID, PID_A);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_ENCODING);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_INSTR);
+    check_next(stream0, record_scheduler_t::STATUS_OK, TRACE_TYPE_THREAD_EXIT);
+    check_next(stream0, record_scheduler_t::STATUS_EOF);
+}
+
 int
 test_main(int argc, const char *argv[])
 {
@@ -3947,6 +4077,7 @@ test_main(int argc, const char *argv[])
     test_direct_switch();
     test_kernel_switch_sequences();
     test_random_schedule();
+    test_record_scheduler();
 
     dr_standalone_exit();
     return 0;
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index b26fffb59b2..26afe37696c 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -126,6 +126,7 @@ schedule_stats_t::parallel_shard_init_stream(int shard_index, void *worker_data,
     std::lock_guard<std::mutex> guard(shard_map_mutex_);
     per_shard->stream = stream;
     per_shard->core = stream->get_output_cpuid();
+    per_shard->filetype = static_cast<intptr_t>(stream->get_filetype());
     shard_map_[shard_index] = per_shard;
     return reinterpret_cast<void *>(per_shard);
 }
@@ -166,12 +167,13 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
     static constexpr char WAIT_SYMBOL = '-';
     static constexpr char IDLE_SYMBOL = '_';
     per_shard_t *shard = reinterpret_cast<per_shard_t *>(shard_data);
+    int64_t input_id = shard->stream->get_input_id();
     if (knob_verbose_ >= 4) {
         std::ostringstream line;
         line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9)
              << shard->stream->get_record_ordinal() << " refs, " << std::setw(9)
              << shard->stream->get_instruction_ordinal() << " instrs: input "
-             << std::setw(4) << shard->stream->get_input_id() << " @" << std::setw(9)
+             << std::setw(4) << input_id << " @" << std::setw(9)
              << shard->stream->get_input_interface()->get_record_ordinal() << " refs, "
              << std::setw(9)
              << shard->stream->get_input_interface()->get_instruction_ordinal()
@@ -188,10 +190,12 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
     // Cache and reset here to ensure we reset on early return paths.
     bool was_wait = shard->prev_was_wait;
     bool was_idle = shard->prev_was_idle;
-    int64_t prev_input = shard->prev_input;
+    int64_t prev_workload_id = shard->prev_workload_id;
+    int64_t prev_tid = shard->prev_tid;
     shard->prev_was_wait = false;
     shard->prev_was_idle = false;
-    shard->prev_input = -1;
+    shard->prev_workload_id = -1;
+    shard->prev_tid = -1;
     if (memref.marker.type == TRACE_TYPE_MARKER &&
         memref.marker.marker_type == TRACE_MARKER_TYPE_CORE_WAIT) {
         ++shard->counters.waits;
@@ -228,8 +232,16 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
         }
         return true;
     }
-    int64_t input = shard->stream->get_input_id();
-    if (input != prev_input) {
+    // We use <workload,tid> to detect switches (instead of input_id) to handle
+    // core-sharded-on-disk.  However, we still prefer the input_id ordinal
+    // for the letters.
+    int64_t workload_id = shard->stream->get_workload_id();
+    int64_t tid = shard->stream->get_tid();
+    int64_t letter_ord =
+        (TESTANY(OFFLINE_FILE_TYPE_CORE_SHARDED, shard->filetype) || input_id < 0)
+        ? tid
+        : input_id;
+    if (workload_id != prev_workload_id || tid != prev_tid) {
         // We convert to letters which only works well for <=26 inputs.
         if (!shard->thread_sequence.empty()) {
             ++shard->counters.total_switches;
@@ -239,7 +251,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
                 ++shard->counters.direct_switches;
         }
         shard->thread_sequence +=
-            THREAD_LETTER_INITIAL_START + static_cast<char>(input % 26);
+            THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
         shard->cur_segment_instrs = 0;
         if (!was_wait && !was_idle && shard->segment_start_microseconds > 0) {
             shard->counters.cpu_microseconds +=
@@ -251,7 +263,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
             line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9)
                  << shard->stream->get_record_ordinal() << " refs, " << std::setw(9)
                  << shard->stream->get_instruction_ordinal() << " instrs: input "
-                 << std::setw(4) << input << " @" << std::setw(9)
+                 << std::setw(4) << input_id << " @" << std::setw(9)
                  << shard->stream->get_input_interface()->get_record_ordinal()
                  << " refs, " << std::setw(9)
                  << shard->stream->get_input_interface()->get_instruction_ordinal()
@@ -264,14 +276,15 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
             std::cerr << line.str();
         }
     }
-    shard->prev_input = input;
+    shard->prev_workload_id = workload_id;
+    shard->prev_tid = tid;
     if (type_is_instr(memref.instr.type)) {
         ++shard->counters.instrs;
         ++shard->cur_segment_instrs;
         shard->counters.idle_micros_at_last_instr = shard->counters.idle_microseconds;
         if (shard->cur_segment_instrs == knob_print_every_) {
             shard->thread_sequence +=
-                THREAD_LETTER_SUBSEQUENT_START + static_cast<char>(input % 26);
+                THREAD_LETTER_SUBSEQUENT_START + static_cast<char>(letter_ord % 26);
             shard->cur_segment_instrs = 0;
         }
         shard->direct_switch_target = INVALID_THREAD_ID;
@@ -291,6 +304,8 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
         } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH) {
             ++shard->counters.direct_switch_requests;
             shard->direct_switch_target = memref.marker.marker_value;
+        } else if (memref.marker.marker_type == TRACE_MARKER_TYPE_FILETYPE) {
+            shard->filetype = static_cast<intptr_t>(memref.marker.marker_value);
         }
     } else if (memref.exit.type == TRACE_TYPE_THREAD_EXIT)
         shard->saw_exit = true;
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index dcf00ab069d..0dfb4fda350 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2023 Google, Inc.  All rights reserved.
+ * Copyright (c) 2023-2024 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -120,7 +120,8 @@ class schedule_stats_t : public analysis_tool_t {
         memtrace_stream_t *stream = nullptr;
         int64_t core = 0; // We target core-sharded.
         counters_t counters;
-        int64_t prev_input = -1;
+        int64_t prev_workload_id = -1;
+        int64_t prev_tid = -1;
         // These are cleared when an instruction is seen.
         bool saw_syscall = false;
         memref_tid_t direct_switch_target = INVALID_THREAD_ID;
@@ -132,6 +133,7 @@ class schedule_stats_t : public analysis_tool_t {
         bool prev_was_idle = false;
         // Computing %-idle.
         uint64_t segment_start_microseconds = 0;
+        intptr_t filetype = 0;
     };
 
     void
diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c
index 8e0d6fb7371..63a0ec57f2c 100644
--- a/core/ir/aarch64/instr.c
+++ b/core/ir/aarch64/instr.c
@@ -34,19 +34,21 @@
 #include "../globals.h"
 #include "instr.h"
 #include "decode.h"
-
+#include "encode_api.h"
 #include "opcode_names.h"
 
+/* XXX i#6690: currently only A64 is supported for instruction encoding.
+ * We want to add support for A64 decoding and synthetic ISA encoding as well.
+ * XXX i#1684: move this function to core/ir/instr_shared.c once we can support
+ * all architectures in the same build of DR.
+ */
 bool
 instr_set_isa_mode(instr_t *instr, dr_isa_mode_t mode)
 {
-    return (mode == DR_ISA_ARM_A64);
-}
-
-dr_isa_mode_t
-instr_get_isa_mode(instr_t *instr)
-{
-    return DR_ISA_ARM_A64;
+    if (mode != DR_ISA_ARM_A64)
+        return false;
+    instr->isa_mode = DR_ISA_ARM_A64;
+    return true;
 }
 
 int
diff --git a/core/ir/arm/instr.c b/core/ir/arm/instr.c
index 7097f19045e..08c7e90014f 100644
--- a/core/ir/arm/instr.c
+++ b/core/ir/arm/instr.c
@@ -32,28 +32,24 @@
 
 #include "../globals.h"
 #include "instr.h"
+#include "encode_api.h"
 #include "decode.h"
 
-/* FIXME i#1551: add A64 and Thumb support throughout */
-
+/* XXX i#6690: currently only A32 and Thumb are supported for instruction encoding.
+ * We want to add support for A32 and Thumb decoding and synthetic ISA encoding as well.
+ * XXX i#1684: move this function to core/ir/instr_shared.c once we can support
+ * all architectures in the same build of DR.
+ */
 bool
 instr_set_isa_mode(instr_t *instr, dr_isa_mode_t mode)
 {
-    if (mode == DR_ISA_ARM_THUMB)
-        instr->flags |= INSTR_THUMB_MODE;
-    else if (mode == DR_ISA_ARM_A32)
-        instr->flags &= ~INSTR_THUMB_MODE;
-    else
+    if (mode != DR_ISA_ARM_THUMB && mode != DR_ISA_ARM_A32) {
         return false;
+    }
+    instr->isa_mode = mode;
     return true;
 }
 
-dr_isa_mode_t
-instr_get_isa_mode(instr_t *instr)
-{
-    return TEST(INSTR_THUMB_MODE, instr->flags) ? DR_ISA_ARM_THUMB : DR_ISA_ARM_A32;
-}
-
 int
 instr_length_arch(dcontext_t *dcontext, instr_t *instr)
 {
diff --git a/core/ir/instr.h b/core/ir/instr.h
index b26e83017cb..76dc3a02c82 100644
--- a/core/ir/instr.h
+++ b/core/ir/instr.h
@@ -202,19 +202,8 @@ enum {
     INSTR_DO_NOT_EMIT = 0x10000000,
     /* PR 251479: re-relativization support: is instr->rip_rel_pos valid? */
     INSTR_RIP_REL_VALID = 0x20000000,
-#ifdef X86
-    /* PR 278329: each instr stores its own mode */
-    INSTR_X86_MODE = 0x40000000,
-#elif defined(ARM)
-    /* We assume we don't need to distinguish A64 from A32 as you cannot swap
-     * between them in user mode.  Thus we only need one flag.
-     * XXX: we might want more power for drdecode, though the global isa_mode
-     * should be sufficient there.
-     */
-    INSTR_THUMB_MODE = 0x40000000,
-#endif
     /* PR 267260: distinguish our own mangling from client-added instrs */
-    INSTR_OUR_MANGLING = 0x80000000,
+    INSTR_OUR_MANGLING = 0x40000000,
 };
 
 #define DR_TUPLE_TYPE_BITS 4
diff --git a/core/ir/instr_api.h b/core/ir/instr_api.h
index 5b6017d4a88..11f05e0c6eb 100644
--- a/core/ir/instr_api.h
+++ b/core/ir/instr_api.h
@@ -310,6 +310,12 @@ struct _instr_t {
     byte num_dsts;
     byte num_srcs;
 
+    /* Instruction ISA mode to support multiple architectures in the same build of DR
+     * (xref i#6698 i#1684).
+     * This field holds values of type #dr_isa_mode_t.
+     */
+    byte isa_mode;
+
     union {
         struct {
             /* for efficiency everyone has a 1st src opnd, since we often just
diff --git a/core/ir/instr_shared.c b/core/ir/instr_shared.c
index 30f0f4b1aea..a3943249feb 100644
--- a/core/ir/instr_shared.c
+++ b/core/ir/instr_shared.c
@@ -84,16 +84,19 @@
 instr_t *
 instr_create(void *drcontext)
 {
+    bool is_instr_isa_mode_set = false;
     dcontext_t *dcontext = (dcontext_t *)drcontext;
     instr_t *instr = (instr_t *)heap_alloc(dcontext, sizeof(instr_t) HEAPACCT(ACCT_IR));
     /* everything initializes to 0, even flags, to indicate
      * an uninitialized instruction */
     memset((void *)instr, 0, sizeof(instr_t));
 #if defined(X86) && defined(X64)
-    instr_set_isa_mode(instr, X64_CACHE_MODE_DC(dcontext) ? DR_ISA_AMD64 : DR_ISA_IA32);
-#elif defined(ARM)
-    instr_set_isa_mode(instr, dr_get_isa_mode(dcontext));
+    is_instr_isa_mode_set = instr_set_isa_mode(
+        instr, X64_CACHE_MODE_DC(dcontext) ? DR_ISA_AMD64 : DR_ISA_IA32);
+#else
+    is_instr_isa_mode_set = instr_set_isa_mode(instr, dr_get_isa_mode(dcontext));
 #endif
+    CLIENT_ASSERT(is_instr_isa_mode_set, "setting instruction ISA mode unsuccessful");
     return instr;
 }
 
@@ -442,6 +445,12 @@ private_instr_encode(dcontext_t *dcontext, instr_t *instr, bool always_cache)
     return len;
 }
 
+dr_isa_mode_t
+instr_get_isa_mode(instr_t *instr)
+{
+    return (dr_isa_mode_t)instr->isa_mode;
+}
+
 #define inlined_instr_get_opcode(instr)                                           \
     (IF_DEBUG_(CLIENT_ASSERT(sizeof(*instr) == sizeof(instr_t), "invalid type"))( \
         ((instr)->opcode == OP_UNDECODED)                                         \
diff --git a/core/ir/riscv64/instr.c b/core/ir/riscv64/instr.c
index 2a28ceece39..e35550ce62f 100644
--- a/core/ir/riscv64/instr.c
+++ b/core/ir/riscv64/instr.c
@@ -32,17 +32,20 @@
 
 #include "../globals.h"
 #include "instr.h"
+#include "encode_api.h"
 
+/* XXX i#6690: currently only RISCV64 is supported for instruction encoding.
+ * We want to add support for RISCV64 decoding and synthetic ISA encoding as well.
+ * XXX i#1684: move this function to core/ir/instr_shared.c once we can support
+ * all architectures in the same build of DR.
+ */
 bool
 instr_set_isa_mode(instr_t *instr, dr_isa_mode_t mode)
 {
-    return (mode == DR_ISA_RV64IMAFDC);
-}
-
-dr_isa_mode_t
-instr_get_isa_mode(instr_t *instr)
-{
-    return DR_ISA_RV64IMAFDC;
+    if (mode != DR_ISA_RV64IMAFDC)
+        return false;
+    instr->isa_mode = DR_ISA_RV64IMAFDC;
+    return true;
 }
 
 int
diff --git a/core/ir/x86/instr.c b/core/ir/x86/instr.c
index 9db609e7e81..b9cf49eea69 100644
--- a/core/ir/x86/instr.c
+++ b/core/ir/x86/instr.c
@@ -40,6 +40,7 @@
 #include "instr.h"
 #include "decode.h"
 #include "decode_private.h"
+#include "encode_api.h"
 #include "instr_create_shared.h"
 
 #ifdef X64
@@ -51,9 +52,9 @@ void
 instr_set_x86_mode(instr_t *instr, bool x86)
 {
     if (x86)
-        instr->flags |= INSTR_X86_MODE;
+        instr->isa_mode = DR_ISA_IA32;
     else
-        instr->flags &= ~INSTR_X86_MODE;
+        instr->isa_mode = DR_ISA_AMD64;
 }
 
 /*
@@ -63,37 +64,29 @@ instr_set_x86_mode(instr_t *instr, bool x86)
 bool
 instr_get_x86_mode(instr_t *instr)
 {
-    return TEST(INSTR_X86_MODE, instr->flags);
+    return instr->isa_mode == DR_ISA_IA32;
 }
 #endif
 
+/* XXX i#6690: currently only x86 and x64 are supported for instruction encoding.
+ * We want to add support for x86 and x64 decoding and synthetic ISA encoding as well.
+ * XXX i#1684: move this function to core/ir/instr_shared.c once we can support
+ * all architectures in the same build of DR.
+ */
 bool
 instr_set_isa_mode(instr_t *instr, dr_isa_mode_t mode)
 {
 #ifdef X64
-    if (mode == DR_ISA_IA32)
-        instr_set_x86_mode(instr, true);
-    else if (mode == DR_ISA_AMD64)
-        instr_set_x86_mode(instr, false);
-    else
+    if (mode != DR_ISA_IA32 && mode != DR_ISA_AMD64)
         return false;
 #else
     if (mode != DR_ISA_IA32)
         return false;
 #endif
+    instr->isa_mode = mode;
     return true;
 }
 
-dr_isa_mode_t
-instr_get_isa_mode(instr_t *instr)
-{
-#ifdef X64
-    return TEST(INSTR_X86_MODE, instr->flags) ? DR_ISA_IA32 : DR_ISA_AMD64;
-#else
-    return DR_ISA_IA32;
-#endif
-}
-
 int
 instr_length_arch(dcontext_t *dcontext, instr_t *instr)
 {
diff --git a/core/lib/dr_tools.h b/core/lib/dr_tools.h
index 164ef552943..57563a4f383 100644
--- a/core/lib/dr_tools.h
+++ b/core/lib/dr_tools.h
@@ -48,9 +48,21 @@ DR_API
  * \warning This context cannot be used as the drcontext for a thread
  * running under DR control!  It is only for standalone programs that
  * wish to use DR as a library of disassembly, etc. routines.
+ * \warning This context is not fully thread-safe as it stores some state
+ * (such as #dr_isa_mode_t and other fields related to AArch32 encoding
+ * and decoding) that is global and may be prone to data races.
+ * For example, having different threads use dr_set_isa_mode() to set
+ * different ISA modes at the same time can result in a data race.
+ * Furthermore, encoding and decoding of AArch32 instructions in parallel
+ * may also result in a data race.
+ * Code that uses a standalone DR context across multiple threads should
+ * implement its own lock/unlock mechanism to avoid such data races
+ * when using dr_set_isa_mode() or encoding/decoding AArch32 instructions.
  * \return NULL on failure, such as running on an unsupported operating
  * system version.
  */
+/* TODO i#6690: Add better multi-thread standalone decoding support.
+ */
 void *
 dr_standalone_init(void);
 
@@ -66,6 +78,9 @@ dr_standalone_exit(void);
 /**
  * Use this dcontext for use with the standalone static decoder library.
  * Pass it whenever a decoding-related API routine asks for a context.
+ * Note that this GLOBAL_DCONTEXT is returned by dr_standalone_init();
+ * beware of its limitations (especially about thread-safety) described
+ * there.
  */
 #    define GLOBAL_DCONTEXT ((void *)-1)
 #endif
diff --git a/core/unix/loader.c b/core/unix/loader.c
index e7e0aca2cf1..0e29bd5fca6 100644
--- a/core/unix/loader.c
+++ b/core/unix/loader.c
@@ -743,15 +743,21 @@ privload_os_finalize(privmod_t *privmod)
         SYSLOG_INTERNAL_WARNING("glibc 2.34+ i#5437 workaround failed: missed glro");
         return;
     }
+    int GLRO_dl_tls_static_size_OFFS;
+    int GLRO_dl_tls_static_align_OFFS;
 #    ifdef X64
-    const int GLRO_dl_tls_static_size_OFFS = 0x2a8;
-    const int GLRO_dl_tls_static_align_OFFS = 0x2b0;
+    // The offsets changed between 2.38 and 2.39.
+    if (ver[2] == '3' && ver[3] < '9') {
+        GLRO_dl_tls_static_size_OFFS = 0x2a8;
+        GLRO_dl_tls_static_align_OFFS = 0x2b0;
+    } else {
+        GLRO_dl_tls_static_size_OFFS = 0x2c8;
+        GLRO_dl_tls_static_align_OFFS = 0x2d0;
+    }
 #    else
     // The offsets changed between 2.35 and 2.36.
-    const int GLRO_dl_tls_static_size_OFFS =
-        (ver[2] == '3' && ver[3] == '5') ? 0x328 : 0x31c;
-    const int GLRO_dl_tls_static_align_OFFS =
-        (ver[2] == '3' && ver[3] == '5') ? 0x32c : 0x320;
+    GLRO_dl_tls_static_size_OFFS = (ver[2] == '3' && ver[3] == '5') ? 0x328 : 0x31c;
+    GLRO_dl_tls_static_align_OFFS = (ver[2] == '3' && ver[3] == '5') ? 0x32c : 0x320;
 #    endif
     size_t val = 4096, written;
     if (!safe_write_ex(glro + GLRO_dl_tls_static_size_OFFS, sizeof(val), &val,
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt
index 362997e98cb..b943771bf31 100644
--- a/suite/tests/CMakeLists.txt
+++ b/suite/tests/CMakeLists.txt
@@ -3879,6 +3879,10 @@ if (BUILD_CLIENTS)
         torunonly_simtool(core_on_disk ${ci_shared_app}
           "-indir ${core_sharded_dir} -simulator_type basic_counts" "")
         set(tool.core_on_disk_rawtemp ON) # no preprocessor
+
+        torunonly_simtool(core_on_disk_schedule ${ci_shared_app}
+          "-indir ${core_sharded_dir} -simulator_type schedule_stats" "")
+        set(tool.core_on_disk_schedule_rawtemp ON) # no preprocessor
       endif ()
     endif ()