Skip to content

Commit

Permalink
i#6760 AArch64: Use smaller data types for SVE P and FFR registers (#…
Browse files Browse the repository at this point in the history
…6774)

PR #6757 fixed the way we read/write SVE register slots but
unfortunately it is now broken on systems with 128-bit vector length.

Both SVE vector and predicate registers use dr_simd_t slots which is a
64-byte type meant to store up to 512-bit vector registers. SVE
predicate registers are always 1/8 the size of the vector register so
for 512-bit vector length systems we only really need 64 / 8 = 8 bytes
to store predicate registers.

The ldr/str instructions we use to read and write the predicate register
slots have a base+offset memory operand where the offset is a value in
the range [-256, 255] scaled based by predicate register length. We read
and write the registers by setting the base address to the address of
the first slot, and setting the offset to n * sizeof(dr_simd_t) for each
register Pn.
For systems with 128-bit vector length, this means the predicate
registers are 16 / 8 = 2 bytes so the maximum offset we can reach is 2 *
255 = 510 bytes. This means on 128-bit VL systems we can only reach the
first 8 predicate registers (8 * sizeof(dr_simd_t) = 512).

By changing the predicate register and FFR slots to use a new type
dr_svep_t which is 1/8 the size of dr_simd_t we can fix this bug and
save space.

dr_svep_t is currently 8 bytes to correspond to 64 byte vectors, but
even if we extend DynamoRIO to support the maximum SVE vector length of
2048-bits (256 bytes) dr_svep_t will only need to be increased to 256 /
8 = 32 bytes so the maximum offset (15 * 32 = 480 bytes) will always be
in range.

As this changes the size of the predicate register and FFR slots, this
changes the size of the dr_mcontext_t structure and breaks backwards
compatibility with earlier versions of DynamoRIO so the version number
is increased to 10.90.

Issues: #6760, #5365
Fixes: #6760
  • Loading branch information
jackgallagher-arm authored Apr 15, 2024
1 parent 6af99e9 commit f45eeba
Show file tree
Hide file tree
Showing 15 changed files with 193 additions and 116 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ jobs:
# We only use a non-zero build # when making multiple manual builds in one day.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
fi
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/ci-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ jobs:
# We only use a non-zero build # when making multiple manual builds in one day.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
fi
Expand Down Expand Up @@ -195,7 +195,7 @@ jobs:
# XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
fi
Expand Down Expand Up @@ -283,7 +283,7 @@ jobs:
# XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
fi
Expand Down Expand Up @@ -371,7 +371,7 @@ jobs:
# XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
fi
Expand Down Expand Up @@ -451,7 +451,7 @@ jobs:
# XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER=10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))
export VERSION_NUMBER=10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
fi
Expand Down Expand Up @@ -536,7 +536,7 @@ jobs:
# XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt.
run: |
if test -z "${{ github.event.inputs.version }}"; then
export VERSION_NUMBER="10.0.$((`git log -n 1 --format=%ct` / (60*60*24)))"
export VERSION_NUMBER="10.90.$((`git log -n 1 --format=%ct` / (60*60*24)))"
export PREFIX="cronbuild-"
else
export VERSION_NUMBER=${{ github.event.inputs.version }}
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn")

# N.B.: When updating this, update all the default versions in ci-package.yml
# and ci-docs.yml. We should find a way to share (xref i#1565).
set(VERSION_NUMBER_DEFAULT "10.0.${VERSION_NUMBER_PATCHLEVEL}")
set(VERSION_NUMBER_DEFAULT "10.90.${VERSION_NUMBER_PATCHLEVEL}")
# do not store the default VERSION_NUMBER in the cache to prevent a stale one
# from preventing future version updates in a pre-existing build dir
set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default")
Expand Down
3 changes: 3 additions & 0 deletions api/docs/release.dox
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ changes:
users sub-classing analyzer_tmpl_t).
- Converted #dynamorio::drmemtrace::analysis_tool_tmpl_t::interval_state_snapshot_t
into a class with all its data members marked private with public accessor functions.
- Changed the type of the AArch64 #dr_mcontext_t members svep and ffr to #dr_svep_t.
This breaks binary compatibility with clients that were built against versions of
DynamoRIO before this change.

Further non-compatibility-affecting changes include:
- Added DWARF-5 support to the drsyms library by linking in 4 static libraries
Expand Down
4 changes: 2 additions & 2 deletions core/arch/aarch64/aarch64.asm
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ START_FILE
#endif

/* sizeof(priv_mcontext_t) rounded up to a multiple of 16 */
#define PRIV_MCONTEXT_SIZE 3424
#define PRIV_MCONTEXT_SIZE 2480

/* offsetof(spill_state_t, r0) */
#define spill_state_r0_OFFSET 0
Expand All @@ -69,7 +69,7 @@ START_FILE
/* offsetof(priv_mcontext_t, simd) */
#define simd_OFFSET (16 * ARG_SZ*2 + 32)
/* offsetof(dcontext_t, dstack) */
#define dstack_OFFSET 0xda8
#define dstack_OFFSET 0x9f8
/* offsetof(dcontext_t, is_exiting) */
#define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ)
/* offsetof(struct tlsdesc_t, arg) */
Expand Down
96 changes: 43 additions & 53 deletions core/arch/aarch64/emit_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -576,19 +576,18 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
}
if (proc_has_feature(FEATURE_SVE)) {
for (i = 0; i < 32; i++) {
/* ldr z(i), [x1, #(i mul vl)]
/* ldr z(i), [x1, #(i * sizeof(dr_simd_t))]
* From the SVE manual:
* "Load a vector register from a memory address generated by a
* 64-bit scalar base, plus an immediate offset in the range -256
* to 255 which is multiplied by the current vector register size
* in bytes."
*/
APP(ilist,
INSTR_CREATE_ldr(
dcontext, opnd_create_reg(DR_REG_Z0 + i),
opnd_create_base_disp(
DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
opnd_size_from_bytes(proc_get_vector_length_bytes()))));
INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_Z0 + i),
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
i * sizeof(dr_simd_t),
OPSZ_SVE_VECLEN_BYTES)));
}
/* add x1, x(dcxt), #(offset svep) */
APP(ilist,
Expand All @@ -599,40 +598,36 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
* register for FFR load below, then restored from svep afterwards.
*/
for (i = 0; i < 15; i++) {
/* ldr p(i), [x1, #(i mul vl)] */
/* ldr p(i), [x1, #(i * sizeof(dr_svep_t))] */
APP(ilist,
INSTR_CREATE_ldr(
dcontext, opnd_create_reg(DR_REG_P0 + i),
opnd_create_base_disp(
DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P0 + i),
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
i * sizeof(dr_svep_t),
OPSZ_SVE_PREDLEN_BYTES)));
}
/* There is no load instruction for the first-fault register (FFR). Use
* a temporary predicate register to load:
* add x2, x(dcxt), #(offset ffr)
* ldr p15, [x2, #(ffr)]
* ldr p15, [x2, #0]
* wrffr p15.b
* ldr p15, [x1, #(15 mul vl)]
* ldr p15, [x1, #(15 * sizeof(dr_svep_t)]
*/
APP(ilist,
XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2),
opnd_create_reg(REG_DCXT),
OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr))));
APP(ilist,
INSTR_CREATE_ldr(
dcontext, opnd_create_reg(DR_REG_P15),
opnd_create_base_disp(
DR_REG_X2, DR_REG_NULL, 0, 0,
opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15),
opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0,
OPSZ_SVE_PREDLEN_BYTES)));
APP(ilist,
INSTR_CREATE_wrffr_sve(dcontext,
opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1)));
APP(ilist,
INSTR_CREATE_ldr(
dcontext, opnd_create_reg(DR_REG_P15),
opnd_create_base_disp(
DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t),
opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15),
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
15 * sizeof(dr_svep_t),
OPSZ_SVE_PREDLEN_BYTES)));
}
}

Expand Down Expand Up @@ -791,41 +786,39 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
}
if (proc_has_feature(FEATURE_SVE)) {
for (i = 0; i < 32; i++) {
/* str z(i), [x1, #(i mul vl)]
/* str z(i), [x1, #(i * sizeof(dr_simd_t))]
* "Store a vector register to a memory address generated by a
* 64-bit scalar base, plus an immediate offset in the range -256
* to 255 which is multiplied by the current vector register size
* in bytes."
*/
APP(ilist,
INSTR_CREATE_str(
dcontext,
opnd_create_base_disp(
DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
opnd_size_from_bytes(proc_get_vector_length_bytes())),
opnd_create_reg(DR_REG_Z0 + i)));
INSTR_CREATE_str(dcontext,
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
i * sizeof(dr_simd_t),
OPSZ_SVE_VECLEN_BYTES),
opnd_create_reg(DR_REG_Z0 + i)));
}
/* add x1, x(dcxt), #(off) */
APP(ilist,
XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
opnd_create_reg(REG_DCXT),
OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep))));
for (i = 0; i < 16; i++) {
/* str p(i), [x1, #(i mul vl)] */
/* str p(i), [x1, #(i * sizeof(dr_svep_t))] */
APP(ilist,
INSTR_CREATE_str(
dcontext,
opnd_create_base_disp(
DR_REG_X1, DR_REG_NULL, 0, i * sizeof(dr_simd_t),
opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)),
opnd_create_reg(DR_REG_P0 + i)));
INSTR_CREATE_str(dcontext,
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
i * sizeof(dr_svep_t),
OPSZ_SVE_PREDLEN_BYTES),
opnd_create_reg(DR_REG_P0 + i)));
}
/* There is no store instruction for the first-fault register (FFR). Use
* a temporary predicate register to store:
* rdffr p15.b
* add x2, x(dcxt), #(offset ffr)
* str p15, [x2, #(ffr)]
* ldr p15, [x1, #(15 mul vl)]
* rdffr p15.b ; Read FFR to P15
* add x2, x(dcxt), #(offset ffr) ; Calculate FFR dcxt offset
* str p15, [x2, #0] ; Save FFR to dcxt
* ldr p15, [x1, #(15 * sizeof(dr_svep_t))] ; Restore app P15 value from dcxt
*/
APP(ilist,
INSTR_CREATE_rdffr_sve(dcontext,
Expand All @@ -835,18 +828,15 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
opnd_create_reg(REG_DCXT),
OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr))));
APP(ilist,
INSTR_CREATE_str(
dcontext,
opnd_create_base_disp(
DR_REG_X2, DR_REG_NULL, 0, 0,
opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)),
opnd_create_reg(DR_REG_P15)));
INSTR_CREATE_str(dcontext,
opnd_create_base_disp(DR_REG_X2, DR_REG_NULL, 0, 0,
OPSZ_SVE_PREDLEN_BYTES),
opnd_create_reg(DR_REG_P15)));
APP(ilist,
INSTR_CREATE_ldr(
dcontext, opnd_create_reg(DR_REG_P15),
opnd_create_base_disp(
DR_REG_X1, DR_REG_NULL, 0, 15 * sizeof(dr_simd_t),
opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_P15),
opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0,
15 * sizeof(dr_svep_t),
OPSZ_SVE_PREDLEN_BYTES)));
}
}

Expand Down
17 changes: 13 additions & 4 deletions core/arch/aarchxx/mangle.c
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
DR_REG_Q0, SIMD_REG_TYPE);
}

dstack_offs += MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t);
dstack_offs += (MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) +
(MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) +
(MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t));

/* Restore the registers we used. */
/* ldp x0, x1, [sp] */
Expand All @@ -577,6 +579,10 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,
INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_X2),
opnd_create_base_disp(DR_REG_SP, DR_REG_NULL, 0,
REG_OFFSET(DR_REG_X2), OPSZ_8)));

/* Make dstack_offs 16-byte aligned. */
dstack_offs = ALIGN_FORWARD(dstack_offs, get_ABI_stack_alignment());

#else
/* vstmdb always does writeback */
PRE(ilist, instr,
Expand Down Expand Up @@ -655,9 +661,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci,

/* Make dstack_offs 8-byte algined, as we only accounted for 17 4-byte slots. */
dstack_offs += XSP_SZ;
#endif
ASSERT(cci->skip_save_flags || cci->num_simd_skip != 0 || cci->num_regs_skip != 0 ||
dstack_offs == (uint)get_clean_call_switch_stack_size());
#endif
return dstack_offs;
}

Expand All @@ -678,8 +684,11 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist
XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X0),
opnd_create_reg(DR_REG_SP)));

current_offs =
get_clean_call_switch_stack_size() - (MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t));
current_offs = ALIGN_BACKWARD(get_clean_call_switch_stack_size() -
(MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t)) -
(MCXT_NUM_SVEP_SLOTS * sizeof(dr_svep_t)) -
(MCXT_NUM_FFR_SLOTS * sizeof(dr_ffr_t)),
16);

/* add x0, x0, current_offs */
PRE(ilist, instr,
Expand Down
45 changes: 37 additions & 8 deletions core/arch/arch.c
Original file line number Diff line number Diff line change
Expand Up @@ -3825,21 +3825,50 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml)
}
#elif defined(AARCHXX)
{
int i, j;
# ifdef AARCH64
int words = proc_has_feature(FEATURE_SVE) ? 16 : 4;
const uint vector_length_bytes =
(proc_has_feature(FEATURE_SVE) ? proc_get_vector_length_bytes()
: opnd_size_in_bytes(reg_get_size(DR_REG_Q0)));
const uint u32_count = vector_length_bytes / sizeof(uint);
/* XXX: should be proc_num_simd_saved(). */
const uint num_simd_regs = MCXT_NUM_SIMD_SVE_SLOTS;
const char *reg_prefix = proc_has_feature(FEATURE_SVE) ? "z" : "q";
# else
int words = 4;
# endif
const uint u32_count =
sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0]);
/* XXX: should be proc_num_simd_saved(). */
for (i = 0; i < proc_num_simd_registers(); i++) {
print_file(f, dump_xml ? "\t\tqd= \"0x" : "\tq%-3d= 0x", i);
for (j = 0; j < words; j++) {
const uint num_simd_regs = proc_num_simd_registers();
const char *reg_prefix = "q";
# endif
ASSERT(u32_count <=
sizeof(context->simd[0].u32) / sizeof(context->simd[0].u32[0]));
for (uint i = 0; i < num_simd_regs; i++) {
print_file(f, dump_xml ? "\t\t%s%d= \"0x" : "\t%s%-3d= 0x", reg_prefix, i);
for (uint j = 0; j < u32_count; j++) {
print_file(f, "%08x ", context->simd[i].u32[j]);
}
print_file(f, dump_xml ? "\"\n" : "\n");
}
/* TODO i#5365: SVE predicate registers and FFR dump. */
# ifdef AARCH64
if (proc_has_feature(FEATURE_SVE)) {
/* Dump SVE P registers */
const uint pred_u16_count =
(proc_get_vector_length_bytes() / 8) / sizeof(ushort);
for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) {
print_file(f, dump_xml ? "\t\tp%d= \"0x" : "\tp%-3d= 0x", i);
for (size_t j = 0; j < pred_u16_count; j++) {
print_file(f, "%04x ", context->svep[i].u16[j]);
}
print_file(f, dump_xml ? "\"\n" : "\n");
}
/* Dump SVE FFR register */
print_file(f, dump_xml ? "\t\tffr= \"0x" : "\tffr = 0x");
for (size_t j = 0; j < pred_u16_count; j++) {
print_file(f, "%04x ", context->ffr.u16[j]);
}
print_file(f, dump_xml ? "\"\n" : "\n");
}
# endif
}
#endif

Expand Down
9 changes: 7 additions & 2 deletions core/arch/arm/arm.asm
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,22 @@ DECL_EXTERN(initstack_mutex)

#ifdef X64
# define MCXT_NUM_SIMD_SLOTS 32
# define SIMD_REG_SIZE 16
# define SIMD_REG_SIZE 64
# define MCXT_NUM_PRED_SLOTS 17 /* P regs + FFR */
# define PRED_REG_SIZE 8
# define NUM_GPR_SLOTS 33 /* incl flags */
# define GPR_REG_SIZE 8
#else
# define MCXT_NUM_SIMD_SLOTS 16
# define SIMD_REG_SIZE 16
# define MCXT_NUM_PRED_SLOTS 0
# define PRED_REG_SIZE 0
# define NUM_GPR_SLOTS 17 /* incl flags */
# define GPR_REG_SIZE 4
#endif
#define PRE_SIMD_PADDING 0
#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE)
#define PRIV_MCXT_SIMD_SIZE (PRE_SIMD_PADDING + MCXT_NUM_SIMD_SLOTS*SIMD_REG_SIZE \
+ MCXT_NUM_PRED_SLOTS*PRED_REG_SIZE)
#define PRIV_MCXT_SIZE (NUM_GPR_SLOTS*GPR_REG_SIZE + PRIV_MCXT_SIMD_SIZE)
#define PRIV_MCXT_SP_FROM_SIMD (-(4*GPR_REG_SIZE)) /* flags, pc, lr, then sp */
#define PRIV_MCXT_PC_FROM_SIMD (-(2*GPR_REG_SIZE)) /* flags, then pc */
Expand Down
Loading

0 comments on commit f45eeba

Please sign in to comment.