From 40e2edb66f859ee87323b1f344ce29a20a1eadb9 Mon Sep 17 00:00:00 2001 From: Abhinav Anil Sharma Date: Mon, 20 Nov 2023 10:12:14 -0500 Subject: [PATCH] i#6466: Fix raw2trace missing encoding error for filtered traces (#6465) Fixes a corner case for filtered traces in the raw2trace delayed branch logic that affects how encodings are written. Adds an increment missing for zero sized pc entries for the ordinal into the decode_pcs array. This caused the wrong decode_pc to be recorded as emitted, which caused the "missing encoding" error when encoding for that decode_pc was not really emitted later. This affects CTI that read memory and may generate in some cases only a zero-sized PC entry (if the instr is i-filtered out but its memref isn't). Adds a raw2trace_unit_test for filtered traces that fails without this fix. Fixes: #6466 --- .../drcachesim/tests/raw2trace_unit_tests.cpp | 151 +++++++++++++++++- clients/drcachesim/tracer/raw2trace.cpp | 27 ++-- 2 files changed, 162 insertions(+), 16 deletions(-) diff --git a/clients/drcachesim/tests/raw2trace_unit_tests.cpp b/clients/drcachesim/tests/raw2trace_unit_tests.cpp index 2c6a7006ce2..889dce30b7b 100644 --- a/clients/drcachesim/tests/raw2trace_unit_tests.cpp +++ b/clients/drcachesim/tests/raw2trace_unit_tests.cpp @@ -169,13 +169,13 @@ class archive_ostream_test_t : public archive_ostream_t { }; offline_entry_t -make_header(int version = OFFLINE_FILE_VERSION) +make_header(int version = OFFLINE_FILE_VERSION, uint64_t additional_file_types = 0) { offline_entry_t entry; entry.extended.type = OFFLINE_TYPE_EXTENDED; entry.extended.ext = OFFLINE_EXT_TYPE_HEADER; entry.extended.valueA = OFFLINE_FILE_TYPE_DEFAULT | OFFLINE_FILE_TYPE_ENCODINGS | - OFFLINE_FILE_TYPE_SYSCALL_NUMBERS; + OFFLINE_FILE_TYPE_SYSCALL_NUMBERS | additional_file_types; entry.extended.valueB = version; return entry; } @@ -288,7 +288,7 @@ check_entry(std::vector &entries, int &idx, unsigned short expect int expected_size, addr_t expected_addr = 0) { if (expected_type != entries[idx].type || - (expected_size > 0 && + (expected_size >= 0 && static_cast(expected_size) != entries[idx].size) || (expected_addr > 0 && expected_addr != entries[idx].addr)) { std::cerr << "Entry " << idx << " has type " << entries[idx].type << " and size " @@ -2738,6 +2738,149 @@ test_is_maybe_blocking_syscall(void *drcontext) return true; } +bool +test_ifiltered(void *drcontext) +{ +#if defined(X86) || defined(ARM) + std::cerr << "\n===============\nTesting ifiltered trace\n"; + // Our synthetic test first constructs a list of instructions to be encoded into + // a buffer for decoding by raw2trace. + instrlist_t *ilist = instrlist_create(drcontext); + // raw2trace doesn't like offsets of 0 so we shift with a nop. + instr_t *nop = XINST_CREATE_nop(drcontext); + instr_t *move1 = + XINST_CREATE_move(drcontext, opnd_create_reg(REG1), opnd_create_reg(REG2)); + instr_t *move2 = + XINST_CREATE_move(drcontext, opnd_create_reg(REG1), opnd_create_reg(REG2)); + instr_t *jcc = + XINST_CREATE_jump_cond(drcontext, DR_PRED_EQ, opnd_create_instr(move1)); + // Control flow in the test assumes that memaddr stores address to jcc. + instr_t *jmp = + XINST_CREATE_jump_mem(drcontext, opnd_create_mem_instr(jcc, 0, OPSZ_PTR)); + instr_t *move3 = + XINST_CREATE_move(drcontext, opnd_create_reg(REG1), opnd_create_reg(REG2)); + instrlist_append(ilist, nop); + instrlist_append(ilist, move1); + instrlist_append(ilist, jmp); + instrlist_append(ilist, jcc); + instrlist_append(ilist, move2); + instrlist_append(ilist, move3); + size_t offs_nop = 0; + size_t offs_move1 = offs_nop + instr_length(drcontext, nop); + size_t offs_jmp = offs_move1 + instr_length(drcontext, move1); + int jmp_length = instr_length(drcontext, jmp); + size_t offs_jcc = offs_jmp + jmp_length; + size_t offs_move2 = offs_jcc + instr_length(drcontext, jcc); + size_t offs_move3 = offs_move2 + instr_length(drcontext, move2); + + // Now we synthesize our raw trace itself, including a valid header sequence. + std::vector raw; + raw.push_back(make_header(OFFLINE_FILE_VERSION, OFFLINE_FILE_TYPE_IFILTERED)); + raw.push_back(make_tid()); + raw.push_back(make_pid()); + raw.push_back(make_line_size()); + // First instance of the jmp instr is filtered out but its memref is not filtered + // out (indicated by the zero sized block), so no encoding will be emitted and + // it will not count towards the chunk instr count. But this will still be + // accumulated as a delayed branch. + raw.push_back(make_block(offs_jmp, 0)); + raw.push_back(make_memref(42)); + // Second accumulated delayed branch. + raw.push_back(make_block(offs_jcc, 1)); + // At this point, the jmp and jcc are accumulated as delayed branches. + // When writing the delayed branches, we want to make sure we correctly track + // the index into decode_pcs. If we don't increment the index at ifiltered + // instrs, the decode pc of jmp will be accidentally used when recording the + // encoding emitted for jcc. This will cause the jmp encoding to not be emitted + // in the next entry because raw2trace incorrectly tracked that it had + // already emitted it. + raw.push_back(make_block(offs_move1, 1)); + // Second instance of the jmp instr is not filtered out. Its encoding must be + // emitted by raw2trace, or else the reader (in memref_counter_t) will + // complain about a missing encoding. + raw.push_back(make_block(offs_jmp, 1)); + // The memref is also not filtered out. We have a separate pc entry with + // zero instr count just before the memref. + raw.push_back(make_block(offs_jmp, 0)); + raw.push_back(make_memref(42)); + raw.push_back(make_block(offs_jcc, 1)); + raw.push_back(make_block(offs_move2, 1)); + // End of first chunk. + raw.push_back(make_block(offs_move3, 1)); + raw.push_back(make_exit()); + + static const int CHUNK_INSTR_COUNT = 5; + std::vector entries; + if (!run_raw2trace(drcontext, raw, ilist, entries, nullptr, CHUNK_INSTR_COUNT)) + return false; + int idx = 0; + return ( + check_entry(entries, idx, TRACE_TYPE_HEADER, -1) && + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_VERSION) && + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_FILETYPE) && + check_entry(entries, idx, TRACE_TYPE_THREAD, -1) && + check_entry(entries, idx, TRACE_TYPE_PID, -1) && + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CACHE_LINE_SIZE) && + check_entry(entries, idx, TRACE_TYPE_MARKER, + TRACE_MARKER_TYPE_CHUNK_INSTR_COUNT) && + // jmp + // No encoding for the i-filtered instr with 0-instr count. + check_entry(entries, idx, TRACE_TYPE_INSTR_INDIRECT_JUMP, 0) && + check_entry(entries, idx, TRACE_TYPE_READ, -1) && + // jcc + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && +# ifdef X86_32 + // An extra encoding entry is needed. + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && +# endif + // Since we cannot infer branch targets accurately for i-filtered traces, this + // has the generic conditional jump type (instead of the more specific + // TRACE_TYPE_INSTR_TAKEN_JUMP type). + check_entry(entries, idx, TRACE_TYPE_INSTR_CONDITIONAL_JUMP, -1) && + // move1 + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && + check_entry(entries, idx, TRACE_TYPE_INSTR, -1) && + // jmp + // This has an encoding because the previous dynamic instance was actually + // i-filtered. + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && +# ifdef X86_32 + // An extra encoding entry is needed. + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && +# endif + // In filtered traces, we have one pc entry for the instr itself (if the + // instruction is not i-filtered out) which has the instr length, and another + // zero-length pc entry before each of the instr's memrefs (if the memref + // is not d-filtered out). + check_entry(entries, idx, TRACE_TYPE_INSTR_INDIRECT_JUMP, jmp_length) && + check_entry(entries, idx, TRACE_TYPE_INSTR_INDIRECT_JUMP, 0) && + check_entry(entries, idx, TRACE_TYPE_READ, -1) && + // jcc. No encoding because it has already been emitted above. + // Since we cannot infer branch targets accurately for i-filtered traces, this + // has the generic conditional jump type (instead of the more specific + // TRACE_TYPE_INSTR_UNTAKEN_JUMP type). + check_entry(entries, idx, TRACE_TYPE_INSTR_CONDITIONAL_JUMP, -1) && + // move2 + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && + check_entry(entries, idx, TRACE_TYPE_INSTR, -1) && + // Chunk ends since we've seen exactly 5 instrs. + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CHUNK_FOOTER) && + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_RECORD_ORDINAL) && + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_TIMESTAMP) && + check_entry(entries, idx, TRACE_TYPE_MARKER, TRACE_MARKER_TYPE_CPU_ID) && + // move3 + check_entry(entries, idx, TRACE_TYPE_ENCODING, -1) && + check_entry(entries, idx, TRACE_TYPE_INSTR, -1) && + check_entry(entries, idx, TRACE_TYPE_THREAD_EXIT, -1) && + check_entry(entries, idx, TRACE_TYPE_FOOTER, -1)); +#else + // This test requires a CTI (so that it gets accumulated as a delayed branch) that + // also reads from memory (so that it's possible to have a case with a zero-sized PC + // entry in the raw trace). AArch64 does not have such an instr. + return true; +#endif +} + int test_main(int argc, const char *argv[]) { @@ -2756,7 +2899,7 @@ test_main(int argc, const char *argv[]) !test_midrseq_end(drcontext) || !test_xfer_modoffs(drcontext) || !test_xfer_absolute(drcontext) || !test_branch_decoration(drcontext) || !test_stats_timestamp_instr_count(drcontext) || - !test_is_maybe_blocking_syscall(drcontext)) + !test_is_maybe_blocking_syscall(drcontext) || !test_ifiltered(drcontext)) return 1; return 0; } diff --git a/clients/drcachesim/tracer/raw2trace.cpp b/clients/drcachesim/tracer/raw2trace.cpp index b1375ef6a94..c6dd6439d93 100644 --- a/clients/drcachesim/tracer/raw2trace.cpp +++ b/clients/drcachesim/tracer/raw2trace.cpp @@ -3161,19 +3161,19 @@ raw2trace_t::write(raw2trace_thread_data_t *tdata, const trace_entry_t *start, start = it; DEBUG_ASSERT(tdata->cur_chunk_instr_count == 0); } - if (type_is_instr(static_cast(it->type)) && - // Do not count PC-only i-filtered instrs. - it->size > 0) { - accumulate_to_statistic(tdata, - RAW2TRACE_STAT_FINAL_TRACE_INSTRUCTION_COUNT, 1); - ++tdata->cur_chunk_instr_count; + if (type_is_instr(static_cast(it->type))) { ++instr_ordinal; - if (TESTANY(OFFLINE_FILE_TYPE_ENCODINGS, tdata->file_type) && - // We don't want encodings for the PC-only i-filtered entries. - it->size > 0 && instr_ordinal >= static_cast(decode_pcs_size)) { - tdata->error = "decode_pcs is missing entries for written " - "instructions"; - return false; + // Do not count PC-only i-filtered instrs. + if (it->size > 0) { + accumulate_to_statistic( + tdata, RAW2TRACE_STAT_FINAL_TRACE_INSTRUCTION_COUNT, 1); + ++tdata->cur_chunk_instr_count; + if (TESTANY(OFFLINE_FILE_TYPE_ENCODINGS, tdata->file_type) && + instr_ordinal >= static_cast(decode_pcs_size)) { + tdata->error = "decode_pcs is missing entries for written " + "instructions"; + return false; + } } } // Check for missing encodings after possibly opening a new chunk. @@ -3199,6 +3199,9 @@ raw2trace_t::write(raw2trace_thread_data_t *tdata, const trace_entry_t *start, // Check whether this instr's encoding has already been emitted // due to multiple instances of the same delayed branch (the encoding // cache was cleared in open_new_chunk()). + // XXX: Do we need to delay PC-only (i-filtered) instrs (the ones + // with it->size == 0)? We're anyway skipping over those entries here + // so maybe we could avoid adding them to decode_pcs. (record_encoding_emitted(tdata, *(decode_pcs + instr_ordinal))) { // Write any data we were waiting until post-loop to write. if (it > start &&